配置pip源
更新pip pip install --upgrade pip
下载清华源 pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
requests
用代码模拟向浏览器发送请求
# Fetch Douban's TV "recommend groups" endpoint page by page (the API
# returns JSON) and print each group's name, picture and URL.
#
# Bugs fixed versus the original:
#   * The loop variable was never used, so the "paged query" downloaded
#     the same first page 10 times; the offset is now sent as a query
#     parameter.
#   * User-Agent typo "Mozilla/05.0" corrected to "Mozilla/5.0".
import json

import requests

for start in range(0, 100, 10):
    res = requests.get(
        url="https://movie.douban.com/j/tv/recommend_groups",
        # NOTE(review): assumes the endpoint pages via a `start` query
        # parameter, like douban's other JSON APIs — TODO confirm the
        # actual parameter name against the live site.
        params={"start": start},
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
        },
    )
    res.encoding = "utf-8"
    print(res.text)

    # Convert the JSON payload into a dict and walk the groups.
    data_dict = json.loads(res.text)
    for ele in data_dict["groups"]:
        name = ele["name"]
        picture = ele["picture"]
        url = ele["url"]
        print(name, picture, "#######", url)
    print(data_dict)
# Scrape Douban's "high score" movie list (JSON API) and print each
# movie's title and detail-page URL.
import requests
import json

# tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86 is the URL-encoded "豆瓣高分".
API_URL = (
    "https://movie.douban.com/j/search_subjects"
    "?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86"
    "&page_limit=50&page_start=0"
)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
}

response = requests.get(url=API_URL, headers=HEADERS)
response.encoding = "utf-8"

payload = json.loads(response.text)
for subject in payload["subjects"]:
    print(subject["title"], subject["url"])

# Sample output:
#   肖申克的救赎 https://movie.douban.com/subject/1292052/
#   爱乐之城 https://movie.douban.com/subject/25934014/
#   万物生灵:2023圣诞特别集 https://movie.douban.com/subject/35729996/
#   疯狂动物城 https://movie.douban.com/subject/25662329/
#   我不是药神 https://movie.douban.com/subject/26752088/
bs4(解析html格式文件)
# pip install BeautifulSoup4
# Scrape the AutoHome news page (https://www.autohome.com.cn/news/) and
# print each article's headline, summary text and thumbnail image URL.
#
# Fixes versus the original: removed the unused `import json`, and added
# guards so a `<li>` missing its <p> summary or <img> thumbnail is
# skipped instead of raising AttributeError on None.
import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

res = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
)
# AutoHome serves gb2312; most other sites use utf-8.
res.encoding = "gb2312"

soup = BeautifulSoup(res.text, features="html.parser")
# The article list lives in <div class="article-wrapper">.
data = soup.find(name="div", attrs={"class": "article-wrapper"})
for item in data.find_all(name="li"):
    headline = item.find(name="h3")
    if not headline:
        # Not an article entry (separators, ads) — skip it.
        continue
    # .text extracts the tag's text content.
    print(headline.text)

    summary = item.find(name="p")
    if summary:
        print(summary.text)

    # Image URL comes from the <img> tag's attribute dict.
    img = item.find(name="img")
    if img:
        print(img.attrs["src"])
# Practice: scrape the AutoHome news page's editors panel and print each
# editor's name, department, position and profile link.
import requests
import json
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

resp = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
)
# AutoHome pages are gb2312-encoded (most sites use utf-8).
resp.encoding = "gb2312"

page = BeautifulSoup(resp.text, features="html.parser")
# The editors panel lives in <div class="editor-wrap">.
wrap = page.find(name="div", attrs={"class": "editor-wrap"})
for entry in wrap.find_all(name="li"):
    # Pull the three labelled <div>s in display order.
    fields = [
        entry.find(name="div", attrs={"class": cls}).text
        for cls in ("editorname", "dept", "position")
    ]
    link = entry.find(name="a")["href"]
    print(*fields, link)
# Scrape the China Unicom mall phone zone (https://mall.10010.com/bj/):
# print each product's title and download its images to the working dir.
#
# Bugs fixed versus the original:
#   * Every image of a product was written to the same "<title>.jpg"
#     path, so later images overwrote earlier ones — images now get an
#     indexed file name.
#   * The inner `requests.get` rebound `res`, clobbering the page
#     response — the image response has its own variable.
#   * Product titles may contain characters illegal in file names; they
#     are sanitized before use.
import re

import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

page_res = requests.get(
    url="https://mall.10010.com/bj/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
)
page_res.encoding = "utf-8"

soup = BeautifulSoup(page_res.text, features="html.parser")
# The phone listing lives in <div class="mobileZone">.
zone = soup.find(name="div", attrs={"class": "mobileZone"})
for item in zone.find_all(name="li"):
    title_tag = item.find("p")
    if title_tag is None:
        # Not a product entry — skip it.
        continue
    title = title_tag.text
    # Replace characters that are illegal in Windows/POSIX file names.
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", title).strip()

    for idx, img_tag in enumerate(item.find_all(name="img")):
        # NOTE(review): assumes `src` is an absolute URL — if the site
        # uses protocol-relative "//..." paths, a scheme must be added.
        img_res = requests.get(url=img_tag["src"])
        filename = f"{safe_title}.jpg" if idx == 0 else f"{safe_title}_{idx}.jpg"
        with open(filename, "wb") as f:
            f.write(img_res.content)

    print(title)
本文含有隐藏内容,请 开通VIP 后查看