爬虫-解析数据

发布于:2024-04-24 ⋅ 阅读:(28) ⋅ 点赞:(0)

import requests
#pip install beautifulsoup4
from bs4 import BeautifulSoup
from lxml import etree
from pdb import set_trace
class ddooo:
    """Download the ddooo.com search-results page and parse it two ways.

    Constructing an instance immediately downloads the page and runs the
    BeautifulSoup parser on it. The downloaded HTML is kept on the instance
    as ``源代码`` and the parsed result as ``解析结果``.
    """

    def __init__(self):
        # Keep what the constructor fetches and parses instead of throwing
        # it away, so the work done here can be reused by the caller.
        self.源代码 = self.爬取源代码()
        self.解析结果 = self.用bs解析数据(self.源代码)

    def 用xpath解析数据(self, res):
        """Collect the link targets of result items 1..20 using XPath.

        Args:
            res: HTML source of the search-results page (for example the
                value returned by ``爬取源代码``).

        Returns:
            A list with 20 entries, one per ``<li>`` position 1 to 20 under
            the element whose id is ``dnrj``. Each entry is the list that
            ``xpath`` returned for ``div[1]/p/a/@href`` at that position,
            so an entry is empty when nothing matched.
        """
        html = etree.HTML(res)  # parse the raw HTML into an element tree
        # Replace ``@href`` with ``text()`` in the expression to collect the
        # link text instead of the link target.
        return [
            html.xpath(f'//*[@id="dnrj"]/ul/li[{i}]/div[1]/p/a/@href')
            for i in range(1, 21)
        ]

    def 用bs解析数据(self, res):
        """Collect the text of every ``class="pic"`` element using BeautifulSoup.

        Args:
            res: HTML source of the search-results page (for example the
                value returned by ``爬取源代码``).

        Returns:
            A list holding the ``.text`` of each matching tag, in the order
            ``find_all`` returned them.
        """
        html = BeautifulSoup(res, 'lxml')  # parse with the lxml backend
        # ``attrs`` filters on the class attribute only, whatever the tag
        # name is. Pass a tag name as the first argument, e.g.
        # ``find_all('a', attrs={"class": "pic"})``, to narrow the match.
        matches = html.find_all(attrs={"class": "pic"})
        # Use ``tag.get('href')`` here instead of ``tag.text`` to collect
        # link targets rather than text.
        return [tag.text for tag in matches]

    def 爬取源代码(self, timeout=10):
        """Download the search-results page for the keyword ``qq``.

        Args:
            timeout: Seconds to wait on the server before giving up.
                Without a timeout ``requests`` may block indefinitely on an
                unresponsive server.

        Returns:
            The response body decoded as text.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the timeout expires.
        """
        # Request headers as sent by a desktop Chrome 123 browser.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "sec-ch-ua": "\"Google Chrome\";v=\"123\", \"Not:A-Brand\";v=\"8\", \"Chromium\";v=\"123\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        # NOTE(review): these cookies look like values captured from a
        # browser session; confirm whether the site actually requires them.
        cookies = {
            "UM_distinctid": "18eb34b98a52f9-064823b28d2454-26001a51-1fa400-18eb34b98a6989",
            "CNZZDATA1281127966": "1317709386-1712405060-%7C1713836543"
        }
        url = "https://search.ddooo.com/search.html"
        params = {
            "wd": "qq"  # search keyword
        }
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=timeout,
        )
        return response.text
if __name__ == '__main__':
    # Script entry point: build the scraper, download the page once more,
    # then run both parsers over that same HTML.
    scraper = ddooo()
    page_source = scraper.爬取源代码()
    xpath_result = scraper.用xpath解析数据(page_source)
    bs_result = scraper.用bs解析数据(page_source)
    # Print the XPath result, an empty line, then the BeautifulSoup result.
    print(xpath_result, '', bs_result, sep='\n')