# Case 1: scraping a page that is served via an HTTP GET request
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Target chapter page to scrape (Biquge novel site).
    url = 'https://www.biqg.cc/book/6909/11.html'
    # Browser-like User-Agent so the site does not reject the request.
    header = {"User-Agent":
                  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"}
    req = requests.get(url=url, headers=header)
    req.encoding = 'utf-8'  # decode the response as UTF-8 to avoid mojibake
    html = req.text
    bes = BeautifulSoup(html, "lxml")
    # Chapter text lives in <div id="chaptercontent" class="Readarea ReadAjax_content">.
    texts = bes.find("div", id="chaptercontent", class_="Readarea ReadAjax_content")
    # BUG FIX: the original split the text on "\xa0" * 4 and then immediately
    # overwrote that result with a split on "\u3000" * 2 (the ideographic-space
    # paragraph separator). Only the second split was ever used; keep it alone.
    texts_list = texts.text.split("\u3000" * 2)
    # Write one paragraph per line. BUG FIX: pass encoding="utf-8" explicitly —
    # on Windows the default codepage (e.g. cp936/gbk) cannot encode characters
    # such as "\xa0" and would raise UnicodeEncodeError mid-write.
    with open("D:/novel.txt", "w", encoding="utf-8") as file:
        for line in texts_list:
            file.write(line + "\n")
# Case 2: scraping an API endpoint that requires an HTTP POST request
import requests
import json
import pandas as pd
from tqdm import tqdm
userNames = []        # reviewer nicknames
commentDetails = []   # review body text
commentTimes = []     # publish-type tag attached to each review
total_pages = 1       # number of result pages to fetch

for pagen in tqdm(range(total_pages), desc='爬取进度', unit='页'):
    # Request payload — the "Payload" pane under the Network tab in dev tools.
    payload = {
        "arg": {
            "channelType": 7,
            # NOTE(review): "collapseTpte" looks like a typo for "collapseType",
            # but it is kept byte-identical — it is what the live API was
            # observed to accept; confirm before "fixing".
            "collapseTpte": 1,
            "commentTagId": 0,
            "pageIndex": pagen,   # zero-based page number
            "pageSize": 10,       # reviews returned per page
            "resourceId": 230,    # id of the resource whose reviews we fetch
            "resourceType": 11,
            # NOTE(review): "sourseType" likewise looks like a typo for
            # "sourceType"; kept as captured from the real request.
            "sourseType": 1,
            "sortType": 3,
            "starType": 0
        },
        "head": {
            "cid": "09031081213865125571",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "auth": "",
            "xsid": "",
            "extension": []
        }
    }
    # POST endpoint (the request URL shown in the Network tab).
    postUrl = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
    # BUG FIX: json=payload both serializes the body AND sends the
    # "Content-Type: application/json" header, which the original
    # data=json.dumps(payload) call omitted. A timeout keeps the crawler
    # from hanging forever on a stalled connection.
    response = requests.post(postUrl, json=payload, timeout=30)
    html_1 = json.loads(response.text)  # parsed JSON response body

    # Past the last page the API may omit 'items' (or even 'result') —
    # guard both instead of raising KeyError.
    result = html_1.get("result") or {}
    if 'items' in result:
        for item in result["items"]:
            # Skip placeholder entries and reviews lacking a user nickname.
            if item is not None and 'userInfo' in item and 'userNick' in item['userInfo']:
                userNames.append(item['userInfo']['userNick'])
                commentDetails.append(item['content'])
                commentTimes.append(item['publishTypeTag'])