import requests
#pip install beautifulsoup4
from bs4 import BeautifulSoup
from lxml import etree
from pdb import set_trace
class ddooo:
    """Scraper for the search.ddooo.com result page of the query ``wd=qq``.

    The class downloads the page HTML and offers two independent ways of
    extracting data from it: XPath (lxml) and BeautifulSoup.

    Attributes:
        source: HTML text downloaded when the instance was constructed.
        bs_result: Result of running the BeautifulSoup parser on ``source``.
    """

    # Seconds to wait for the server; without a timeout ``requests`` may
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Download the page once and parse it with BeautifulSoup.

        The downloaded HTML and the parsed result are kept on the instance
        so the work done here is not thrown away.
        """
        self.source = self.爬取源代码()
        self.bs_result = self.用bs解析数据(self.source)

    def 用xpath解析数据(self, res: str, count: int = 20) -> list:
        """Extract link targets from the result list using XPath.

        For each position ``1..count`` the ``href`` of
        ``//*[@id="dnrj"]/ul/li[i]/div[1]/p/a`` is queried.

        Args:
            res: HTML source text of the page.
            count: Number of ``li`` positions to query (default 20).

        Returns:
            A list with ``count`` entries; each entry is the list returned
            by ``xpath`` for that position (empty when nothing matches).
        """
        html = etree.HTML(res)  # parse the source into an element tree
        # Replace ``@href`` with ``text()`` to get the link text instead.
        return [
            html.xpath(f'//*[@id="dnrj"]/ul/li[{i}]/div[1]/p/a/@href')
            for i in range(1, count + 1)
        ]

    def 用bs解析数据(self, res: str) -> list:
        """Extract the text of every element carrying class ``pic``.

        Args:
            res: HTML source text of the page.

        Returns:
            A list with the ``.text`` of each matched element, in document
            order.
        """
        html = BeautifulSoup(res, 'lxml')  # parse the source with the lxml backend
        # Usage note: the tag name can be restricted as well, e.g.
        #   html.find_all("a", attrs={"class": "pic"})
        # and ``tag.get('href')`` reads an attribute instead of the text.
        return [tag.text for tag in html.find_all(attrs={"class": "pic"})]

    def 爬取源代码(self) -> str:
        """Download the search page and return its HTML as text.

        Returns:
            The response body decoded by ``requests``.

        Raises:
            requests.RequestException: On connection problems, timeout, or
                an HTTP error status (4xx/5xx).
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "sec-ch-ua": "\"Google Chrome\";v=\"123\", \"Not:A-Brand\";v=\"8\", \"Chromium\";v=\"123\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        cookies = {
            "UM_distinctid": "18eb34b98a52f9-064823b28d2454-26001a51-1fa400-18eb34b98a6989",
            "CNZZDATA1281127966": "1317709386-1712405060-%7C1713836543"
        }
        url = "https://search.ddooo.com/search.html"
        params = {
            "wd": "qq"
        }
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of parsing an error page as results.
        response.raise_for_status()
        return response.text
if __name__=='__main__':
    # Build the scraper, download the search page, then run both parsers
    # over the same HTML text.
    scraper = ddooo()
    page_source = scraper.爬取源代码()
    xpath_result = scraper.用xpath解析数据(page_source)
    bs_result = scraper.用bs解析数据(page_source)
    # Print the XPath result, an empty line, then the BeautifulSoup result.
    print(xpath_result, "", bs_result, sep="\n")