配置 pip 源
更新 pip:pip install --upgrade pip
设置清华源:pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
requests
用代码模拟向浏览器发送请求
#返回的是json格式,分页查询
# Fetch Douban TV recommendation groups; the endpoint returns JSON and the
# notes describe it as a paged query (10 entries per page).
import requests
import json

for start in range(0, 100, 10):
    # NOTE(review): `start` is never passed to the request, so every
    # iteration fetches the exact same page — confirm the endpoint's paging
    # parameter and wire `start` into the query string.
    res = requests.get(
        url="https://movie.douban.com/j/tv/recommend_groups",
        headers={
            # Fixed typo "Mozilla/05.0" -> "Mozilla/5.0" so the UA matches
            # the other requests in this file.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
        },
    )
    res.encoding = "utf-8"
    print(res.text)

    # Parse the JSON text into a dict.
    data_dict = json.loads(res.text)
    for ele in data_dict['groups']:
        name = ele["name"]
        picture = ele['picture']
        url = ele['url']
        print(name, picture, '#######', url)
    print(data_dict)
# Scrape Douban's "high score" movie list from its JSON search endpoint and
# print each title with its detail-page URL.
import requests
import json

response = requests.get(
    url="https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&page_limit=50&page_start=0",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
)
response.encoding = "utf-8"

# The payload is a JSON object whose "subjects" key holds the movie entries.
movies = json.loads(response.text)
for subject in movies['subjects']:
    print(subject["title"], subject['url'])
# Sample output:
# 肖申克的救赎 https://movie.douban.com/subject/1292052/
# 爱乐之城 https://movie.douban.com/subject/25934014/
# 疯狂动物城 https://movie.douban.com/subject/25662329/
bs4(解析html格式文件)
#pip install BeautifulSoup4
#爬取汽车之家的新闻,图片
#https://www.autohome.com.cn/news/
# Scrape the Autohome news page (https://www.autohome.com.cn/news/) and print
# each article's headline, summary paragraph, and image URL.
import requests
import json
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

response = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
)
# Autohome serves gb2312-encoded pages; most other sites use utf-8.
response.encoding = "gb2312"

soup = BeautifulSoup(response.text, features="html.parser")
# Locate the <div class="article-wrapper"> container, then walk every <li>
# inside it (find returns the first match, find_all returns all matches).
wrapper = soup.find(name='div', attrs={"class": "article-wrapper"})
for item in wrapper.find_all(name='li'):
    headline = item.find(name="h3")
    if not headline:
        # Skip list items that are not article cards.
        continue
    # .text extracts the tag's inner text.
    print(headline.text)
    # Summary lives in the <p> tag.
    summary = item.find(name="p")
    print(summary.text)
    # Image URL comes from the <img> tag's attribute dict.
    image = item.find(name="img")
    print(image.attrs['src'])
# Exercise: scrape the Autohome editors panel and print each editor's name,
# department, position, and profile link.
import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

res = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
)
# Autohome serves gb2312-encoded pages; most other sites use utf-8.
res.encoding = "gb2312"
soup = BeautifulSoup(res.text, features="html.parser")

# Editor cards live under <div class="editor-wrap">, one per <li>.
data = soup.find(name='div', attrs={"class": "editor-wrap"})
for i in data.find_all(name='li'):
    aa = i.find(name="div", attrs={"class": "editorname"})
    bb = i.find(name="div", attrs={"class": "dept"})
    cc = i.find(name="div", attrs={"class": "position"})
    link = i.find(name="a")
    # Guard against decorative <li> entries that lack these fields: the
    # original code called .text / ["href"] unconditionally and raised
    # AttributeError/TypeError on any non-editor list item.
    if not (aa and bb and cc and link):
        continue
    print(aa.text, bb.text, cc.text, link["href"])
# Fetch the China Unicom mall's mobile zone, print each product title, and
# download every product image into the working directory.
import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

# Renamed from `res` so the image downloads below don't shadow the page
# response object.
page = requests.get(
    url="https://mall.10010.com/bj/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
)
page.encoding = "utf-8"
soup = BeautifulSoup(page.text, features="html.parser")

# Product cards live under <div class="mobileZone">, one per <li>.
zone = soup.find(name='div', attrs={"class": "mobileZone"})
for li in zone.find_all(name='li'):
    title_tag = li.find("p")
    if title_tag is None:
        continue
    title = title_tag.text
    for idx, img_tag in enumerate(li.find_all(name='img')):
        img_url = img_tag['src']
        # NOTE(review): if src is protocol-relative ("//host/..."), this GET
        # will fail — confirm against the live page and prefix "https:" if so.
        img_res = requests.get(url=img_url)
        # Include the image index in the filename: the original saved every
        # image of a product as "<title>.jpg", so each download overwrote
        # the previous one.
        filename = "{}_{}.jpg".format(title, idx)
        with open(filename, 'wb') as f:
            f.write(img_res.content)
    print(title)
本文含有隐藏内容,请 开通VIP 后查看