python之第三方模块

发布于:2024-03-14 ⋅ 阅读:(53) ⋅ 点赞:(0)

配置pip源

更新pip:pip install --upgrade pip

配置清华源:pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

requests

用代码模拟浏览器向服务器发送请求

# Fetch Douban's TV recommendation groups (the endpoint returns JSON) and
# print each group's name, picture and URL.
import json

import requests

# NOTE(review): the loop index is never sent to the server, so every iteration
# requests exactly the same URL. If real pagination is wanted, confirm the
# endpoint's paging parameter and pass it via `params=`.
for i in range(0, 100, 10):
    res = requests.get(
        url="https://movie.douban.com/j/tv/recommend_groups",
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
        },
        # Without a timeout, requests may wait forever on a stalled connection.
        timeout=10,
    )
    # Fail early with a clear HTTP error instead of a confusing JSON/KeyError.
    res.raise_for_status()
    res.encoding = "utf-8"
    print(res.text)

    # Convert the JSON text into a Python dict.
    data_dict = json.loads(res.text)
    for ele in data_dict['groups']:
        name = ele["name"]
        picture = ele['picture']
        url = ele['url']
        print(name, picture, '#######', url)

    print(data_dict)
# Scrape Douban's "high score" movie list and print each title with its URL.
import json

import requests

res = requests.get(
    url="https://movie.douban.com/j/search_subjects",
    # Let requests build and percent-encode the query string; this yields the
    # same UTF-8 encoding of the tag as the hand-written %E8%B1%86... form.
    params={
        "type": "movie",
        "tag": "豆瓣高分",
        "page_limit": 50,
        "page_start": 0,
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
    # Without a timeout, requests may wait forever on a stalled connection.
    timeout=10,
)
# Fail early with a clear HTTP error instead of a confusing JSON/KeyError.
res.raise_for_status()
res.encoding = "utf-8"
data_dict = json.loads(res.text)
for ele in data_dict['subjects']:
    title = ele["title"]
    url = ele['url']
    print(title, url)
# Sample output recorded by the original author:
#肖申克的救赎 https://movie.douban.com/subject/1292052/
#爱乐之城 https://movie.douban.com/subject/25934014/
#万物生灵:2023圣诞特别集 https://movie.douban.com/subject/35729996/
#疯狂动物城 https://movie.douban.com/subject/25662329/
#我不是药神 https://movie.douban.com/subject/26752088/

bs4(解析html格式文件)

# pip install BeautifulSoup4
# Scrape the Autohome news page and print each article's headline, summary
# paragraph and image URL.
# https://www.autohome.com.cn/news/
import requests
import json
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

res = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
    # Without a timeout, requests may wait forever on a stalled connection.
    timeout=10,
)
res.raise_for_status()
# Per the original author, Autohome serves gb2312 while most sites use utf-8.
res.encoding = "gb2312"
soup = BeautifulSoup(res.text, features="html.parser")

# find() returns the first <div class="article-wrapper"> (or None);
# find_all() returns every matching descendant.
data = soup.find(name='div', attrs={"class": "article-wrapper"})
# If the page layout changed and the wrapper is missing, iterate over nothing
# instead of crashing with AttributeError on None.
li_list_node = data.find_all(name='li') if data is not None else []

for i in li_list_node:
    aa = i.find(name="h3")
    if aa is None:
        # <li> entries without a headline are not articles; skip them.
        continue
    # .text yields the tag's text content.
    print(aa.text)

    # Summary paragraph; not every item is guaranteed to have one.
    p = i.find(name="p")
    if p is not None:
        print(p.text)

    # Tag attributes are exposed as a dict via .attrs; read the image's 'src'.
    img = i.find(name="img")
    if img is not None and img.get("src"):
        print(img.attrs['src'])
# Exercise: scrape the editor list on the Autohome news page and print each
# editor's name, department, position and profile link.
import requests
import json
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

res = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    },
    # Without a timeout, requests may wait forever on a stalled connection.
    timeout=10,
)
res.raise_for_status()
# Per the original author, Autohome serves gb2312 while most sites use utf-8.
res.encoding = "gb2312"
soup = BeautifulSoup(res.text, features="html.parser")

# find() returns the first <div class="editor-wrap"> (or None);
# find_all() returns every matching descendant.
data = soup.find(name='div', attrs={"class": "editor-wrap"})
# If the wrapper is missing, iterate over nothing instead of crashing.
li_list_node = data.find_all(name='li') if data is not None else []

for i in li_list_node:
    aa = i.find(name="div", attrs={"class": "editorname"})
    bb = i.find(name="div", attrs={"class": "dept"})
    cc = i.find(name="div", attrs={"class": "position"})
    link = i.find(name="a")
    # Skip list items that lack any of the expected fields rather than
    # raising AttributeError/TypeError on a None lookup.
    if any(tag is None for tag in (aa, bb, cc, link)):
        continue
    # .get() avoids a KeyError if the anchor has no href attribute.
    dd = link.get("href", "")
    print(aa.text, bb.text, cc.text, dd)
# Scrape the China Unicom mall page: print the <p> text of each product (the
# original author describes it as the price) and download the product images
# into the current directory.
import json
import re
from urllib.parse import urljoin

import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

PAGE_URL = "https://mall.10010.com/bj/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
}

res = requests.get(
    url=PAGE_URL,
    headers=HEADERS,
    # Without a timeout, requests may wait forever on a stalled connection.
    timeout=10,
)
res.raise_for_status()
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, features="html.parser")

# find() returns the first <div class="mobileZone"> (or None);
# find_all() returns every matching descendant.
data = soup.find(name='div', attrs={"class": "mobileZone"})
# If the wrapper is missing, iterate over nothing instead of crashing.
li_list_node = data.find_all(name='li') if data is not None else []

for i in li_list_node:
    img_tags = i.find_all(name='img')
    aa = i.find("p")
    if aa is None:
        continue
    title = aa.text
    # The title becomes a file name, so replace characters that are illegal in
    # file names (path separators, wildcards, line breaks, ...).
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]+', "_", title).strip() or "untitled"

    for index, img_tag in enumerate(img_tags):
        src = img_tag.get('src')
        if not src:
            # Some <img> tags may carry no usable src attribute.
            continue
        # Resolve relative and protocol-relative ("//host/...") sources
        # against the page URL so requests gets an absolute URL.
        img_url = urljoin(PAGE_URL, src)
        # Use a separate name so the page response `res` is not overwritten.
        img_res = requests.get(url=img_url, headers=HEADERS, timeout=10)
        if not img_res.ok:
            # Do not save an HTTP error page under a .jpg name.
            continue
        # The first image keeps the plain "<title>.jpg" name; later images of
        # the same item get an index suffix so they do not overwrite it.
        suffix = "" if index == 0 else "_{}".format(index)
        name1 = "{}{}.jpg".format(safe_title, suffix)
        with open(name1, 'wb') as f:
            f.write(img_res.content)

    print(aa.text)

本文含有隐藏内容,请 开通VIP 后查看