urllib:
Python's built-in HTTP request library
4 modules:
request module: the most basic HTTP request module
error module: the exception handling module
parse module: a utility module providing URL handling functions
robotparser module: parses robots.txt
Most basic request:
sent with urlopen()
GET request:
a single parameter can be concatenated directly onto the URL
there may be more than one parameter
multiple parameters: urllib.parse.urlencode(params)
take a base URL and a dict params of query parameters
use urllib.parse.urlencode() to encode the parameters into a query string
POST request:
add the data argument (optional); see the sketch below
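A minimal sketch of both styles before the full examples below (httpbin.org is only an assumed test endpoint):
import urllib.parse
import urllib.request

# GET: encode the parameters and append them to the URL
params = {"wd": "keyword", "pn": "10"}
get_url = "https://httpbin.org/get?" + urllib.parse.urlencode(params)
print(urllib.request.urlopen(get_url).status)

# POST: pass the encoded parameters as bytes via the data argument
data = urllib.parse.urlencode({"hello": "world"}).encode("utf-8")
print(urllib.request.urlopen("https://httpbin.org/post", data=data).status)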
Fetch a Baidu page
#!/usr/bin/env python3
import urllib.request

def load_baidu_data():
    url = 'http://www.baidu.com'
    response = urllib.request.urlopen(url)
    data = response.read()
    str_data = data.decode('utf-8')
    with open("bd.html", "w", encoding='utf-8') as f:
        f.write(str_data)

load_baidu_data()
GET request with the parameter concatenated onto the URL
Code: search Baidu for 柯南 (Conan)
#!/usr/bin/env python3
import urllib.request
import urllib.parse
import string

def load_baidu_data():
    url = 'http://www.baidu.com/s?wd='
    # url + search keyword
    name = "柯南"
    final_url = url + name
    # the URL contains Chinese characters, so it must be percent-encoded
    encode_url = urllib.parse.quote(final_url, safe=string.printable)
    # print(encode_url)
    response = urllib.request.urlopen(encode_url)
    data = response.read()
    # convert the bytes returned by read() into a string
    str_data = data.decode('utf-8')
    # print(str_data)
    with open("baidu-kenan.html", "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()
GET request with multiple concatenated parameters
Code: search Baidu for 柯南, page 9; as in the previous step, first inspect the URL
#!/usr/bin/env python3
import urllib.request
import urllib.parse

def load_baidu_data():
    url = 'http://www.baidu.com/s?'
    # query parameters to append to the URL
    params = {
        "wd": "柯南",
        "pn": "80"
    }
    query_str = urllib.parse.urlencode(params)
    final_url = url + query_str
    print(final_url)
    response = urllib.request.urlopen(final_url)
    data = response.read()
    # convert the bytes returned by read() into a string
    str_data = data.decode('utf-8')
    # print(str_data)
    with open("baidu-kenan-pn80.html", "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()
Success.
POST request (data, timeout):
#!/usr/bin/env python
import urllib.request
import urllib.parse
import urllib.error

# 1. define the url (pick a URL of your own that accepts POST)
url = 'http://www.baidu.com/post'
# the form data to send
data = {
    'hello': 'world',
    'name': 'kenan'
}
# the data must be encoded
data_encode = urllib.parse.urlencode(data).encode("utf-8")
# encode("utf-8"): str -> bytes
# decode("utf-8"): bytes -> str
try:
    response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("Connection timed out!")
User-Agent
Custom User-Agent
#!/usr/bin/env python
import urllib.request
import urllib.parse
import urllib.error

# 1. define the url
url = 'http://www.baidu.com/post'
# 2. build a custom Request that carries a User-Agent header
header = {
    "User-Agent": "Mozilla/5.0 (Linux; U; Android 11; zh-CN; 2112123AC Build/RKQ1.200826.002) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/100.0.4896.58 Quark/6.2.2.246 Mobile Safari/537.36"
}
req = urllib.request.Request(url=url, headers=header, method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
Random User-Agent
#!/usr/bin/env python
import urllib.request
import urllib.parse
import urllib.error
import random

def user_agent():
    url = 'http://www.baidu.com/post'
    # a pool of user-agent strings
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 12; ELS-AN00 Build/HUAWEIELS-AN00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/107.0.5304.141 Mobile Safari/537.36 XWEB/5075 MMWEBSDK/20230504 MMWEBID/9308 MicroMessenger/8.0.37.2380(0x2800253D) WeChat/arm64 Weixin NetType/5G Language/zh_CN ABI/arm64 MiniProgramEnv/android",
        "Mozilla/5.0 (iPhone; CPU iPhone OS............ile/15E148 MicroMessenger/8.0.34(0x18002234) NetType/4G Language/zh_CN",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; hu; rv:1.8.0.11) Gecko/20070312 Firefox/1.5.0.1120",
        "Mozilla/5.0 (Macintosh; Int............ecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67",
        "Mozilla/5.0 (X1............7.6) Gecko/20050318 Firefox/1.0.2",
        "Mozilla/5.0 (Windows; U; Win............o) Chrome/100.0.4896.58 Safari/537.36 UCBrowser/15.4.8.1238"
    ]
    # pick one user-agent at random for each request
    random_user_agent = random.choice(user_agent_list)
    header = {
        "User-Agent": random_user_agent
    }
    req = urllib.request.Request(url=url, headers=header, method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))

user_agent()
handler:
HTTPDefaultErrorHandler: handles HTTP response errors; errors are raised as HTTPError exceptions.
HTTPRedirectHandler: handles redirects.
HTTPCookieProcessor: handles cookies.
ProxyHandler: sets a proxy; by default no proxy is set.
HTTPPasswordMgr: manages passwords; it maintains a table of usernames and passwords.
HTTPBasicAuthHandler: manages authentication; if a link requires authentication when opened, this handler can deal with it.
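These handlers are combined by passing them to build_opener(); a minimal sketch chaining a cookie handler with a (here empty, i.e. no-proxy) proxy handler, an assumed combination chosen only for illustration:
import http.cookiejar
import urllib.request

# any number of handlers can be passed to build_opener()
cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
proxy_handler = urllib.request.ProxyHandler({})   # empty mapping = no proxy
opener = urllib.request.build_opener(cookie_handler, proxy_handler)

# optionally install it globally so a plain urlopen() goes through it as well
urllib.request.install_opener(opener)
response = opener.open('http://www.baidu.com')
print(response.status)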
Custom opener
#!/usr/bin/env python
import urllib.request
import urllib.parse
import urllib.error

def handler_open():
    url = 'http://www.baidu.com/get'
    # build our own opener
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.read().decode("utf-8"))

handler_open()
Custom IP proxy
#!/usr/bin/env python
import urllib.request
import urllib.parse
import urllib.error

def handler_open():
    try:
        url = 'http://httpbin.org/get'
        # proxy settings: scheme -> proxy ip and port
        proxy = {
            "http": "http://192.168.6.6:8888"
        }
        # create the proxy handler
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # build our own opener
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)

handler_open()
Random IP proxy
#!/usr/bin/env python
import urllib.request
import urllib.parse
import urllib.error
import random

def proxy_ip():
    url = 'https://www.kuaidaili.com/testproxy'
    # a pool of proxy IPs
    ip_list = [
        "http://183.161.45.66:17114",
        "http://119.41.198.172:18350",
        "http://27.191.60.244:15982",
        "http://27.215.237.221:20983",
    ]
    # pick one proxy at random for each request
    proxy = random.choice(ip_list)
    print(proxy)
    try:
        # create the proxy handler
        proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
        # build our own opener
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)

proxy_ip()
Authentication:
HTTPBasicAuthHandler (handles basic HTTP authentication)
HTTPPasswordMgrWithDefaultRealm (usually appears together with the auth handler)
# create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# register the target url, username and password
password_mgr.add_password(None, url, username, password)
The first argument is None, which means the default realm.
To add credentials for a different realm, replace None with that realm name.
Example site: https://ssr3.scrape.center
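For illustration, a minimal sketch of registering credentials under both the default realm and an explicit one (the 'Restricted Area' realm and the example.com entry are hypothetical):
import urllib.request

password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# None = default realm: used for whatever realm the server reports for this URL
password_mgr.add_password(None, 'https://ssr3.scrape.center/', 'admin', 'admin')
# explicit realm: only used when the server's WWW-Authenticate realm matches
password_mgr.add_password('Restricted Area', 'https://example.com/', 'user', 'secret')

handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(handler)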
Basic auth login
#!/usr/bin/env python
import urllib.request

def auth_login():
    url = 'https://ssr3.scrape.center/'
    # username and password
    username = 'admin'
    password = 'admin'
    # create a password manager
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    # register the target url, username and password
    password_mgr.add_password(None, url, username, password)
    # create a basic auth handler and hand it the password manager
    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    # build the opener for the request
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    # response = urllib.request.urlopen(url)
    print(response.read().decode('utf-8'))

auth_login()
Authentication via the Authorization header
#!/usr/bin/env python
import urllib.request

def auth_login():
    url = 'https://ssr3.scrape.center/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
        'Authorization': 'Basic YWRtaW46YWRtaW4='
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

auth_login()
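The header value 'Basic YWRtaW46YWRtaW4=' is simply the base64 encoding of 'admin:admin'; a minimal sketch of building it programmatically instead of hard-coding it:
import base64
import urllib.request

username, password = 'admin', 'admin'
# base64-encode "username:password" to build the Basic auth token
token = base64.b64encode(f'{username}:{password}'.encode('utf-8')).decode('ascii')
headers = {'Authorization': f'Basic {token}'}   # -> 'Basic YWRtaW46YWRtaW4='

req = urllib.request.Request('https://ssr3.scrape.center/', headers=headers)
print(urllib.request.urlopen(req).status)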
Reading and writing cookies with cookiejar:
cookies are handled through the cookie-related handler (HTTPCookieProcessor)
Writing:
MozillaCookieJar:
saves cookies in the Mozilla-browser cookies file format
#!/usr/bin/env python
import urllib.request
import http.cookiejar

# cookie = http.cookiejar.CookieJar()
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
#
# for item in cookie:
#     print(item)

'''save the cookies printed above into a file'''
filename = 'ccc.txt'
cookie = http.cookiejar.MozillaCookieJar(filename=filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
Reading:
LWPCookieJar
saves cookies in the libwww-perl (LWP) file format.
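The load example below assumes ccc.txt is in LWP format; a minimal sketch of writing the file that way first (same flow as the MozillaCookieJar example, only the jar class differs):
import http.cookiejar
import urllib.request

# save ccc.txt in LWP format so LWPCookieJar.load() below can read it
cookie = http.cookiejar.LWPCookieJar(filename='ccc.txt')
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)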
#!/usr/bin/env python
import urllib.request
import http.cookiejar

filename = 'ccc.txt'
# note: LWPCookieJar can only load an LWP-format file; if ccc.txt was saved
# with MozillaCookieJar above, use MozillaCookieJar here instead
cookie = http.cookiejar.LWPCookieJar()
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
Logging in to a page with cookies
#!/usr/bin/env python
import urllib.request
import http.cookiejar
import urllib.parse

# 1. log in to the site with a username and password
# login url
url = 'https://www.yaozh.com/login'
# login parameters
login_data = {
    "type": "0",
    "username": "ppkke007",
    "pwd": "Iceropq13315",
    "pc_vcode": "",
    "country": "86_zh-CN",
    "mobile": "",
    "vcode": "",
    "pincode": "",
    "formhash": "CEA7846B38",
    "backurl": "https%253A%252F%252Fwww.yaozh.com%252F"
}
# send the login request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
}
# encode the form data
login_str = urllib.parse.urlencode(login_data).encode('utf-8')
req = urllib.request.Request(url=url, headers=headers, data=login_str)
# if the login succeeds, the cookiejar stores the cookies automatically
opener.open(req)
# 2. visit a member page with the stored cookies
login_url = "https://www.yaozh.com/member/"
req2 = urllib.request.Request(login_url, headers=headers)
response = opener.open(req2)
# response = urllib.request.urlopen(login_url)
# print(response.read().decode('utf-8'))
data = response.read()
with open('cookie2.html', "wb") as f:
    f.write(data)
URLError
urllib's error module:
URLError inherits from OSError
except error.URLError as e:
    print(e.reason)
prints the reason for the error
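A minimal runnable sketch of catching URLError and printing e.reason (the unresolvable host name is hypothetical, chosen only to trigger the error):
import urllib.error
import urllib.request

try:
    # a host that should not resolve, used here just to trigger URLError
    urllib.request.urlopen('https://nonexistent.example.invalid/')
except urllib.error.URLError as e:
    # e.reason explains what went wrong (e.g. a DNS resolution failure)
    print(e.reason)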
HTTPError:
a subclass of URLError dedicated to HTTP error responses (e.g. 404)
#!/usr/bin/env python
import urllib.request
from urllib.error import URLError, HTTPError
import socket

# try:
#     url = 'https://www.baidu.com'
#     response = urllib.request.urlopen(url=url, timeout=0.01)
# except URLError as e:
#     print(e.reason)
#     if isinstance(e.reason, socket.timeout):
#         print("Time out!!")

try:
    url = 'https://ssr3.scrape.center/asdasd'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
# except URLError as e:
#     print(e.reason)
except HTTPError as e:
    print("http error:", e)
URL structure
urlparse
a module in the Python standard library for parsing and manipulating URLs. A standard URL has the form:
scheme://netloc/path;params?query#fragment
scheme (protocol): http or https
netloc (network location): the host
path: the resource path
params: parameters carried after ";"
query: query parameters
fragment: in-page navigation anchor
urlparse & urlunparse
#!/usr/bin/env python
from urllib.parse import urlparse, urlunparse

url = 'http://www.baidu.com/index.html;user?id=0#comment'
result = urlparse(url=url)
# print(type(result), result)
print(result.scheme)
print(result.netloc)
print(result.path)
print(result.params)
print(result.query)
print(result.fragment)

# scheme='https' is only a default (this URL already has a scheme);
# allow_fragments=False keeps '#comment' as part of the query instead of splitting it off
result = urlparse(url=url, scheme='https', allow_fragments=False)
print(result.scheme)
print(result.fragment)

# urlunparse rebuilds a URL from the 6 components (scheme, netloc, path, params, query, fragment)
data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=0', 'comment']
url = urlunparse(data)
print(url)
# http://www.baidu.com/index.html;user?id=0#comment
urlsplit & urlunsplit
unlike urlparse, urlsplit does not split out the params component (";params" stays in the path), so it returns 5 parts instead of 6
the result is a tuple-like object, so the parts can also be accessed by index
#!/usr/bin/env python
from urllib.parse import urlsplit, urlunsplit

# urlsplit
url = 'http://www.baidu.com/index.html;user?id=0#comment'
result = urlsplit(url)
print(result.scheme)
print(result[0])        # same as result.scheme, accessed by index
print(result.netloc)
print(result.path)
print(result.query)
print(result.fragment)

# urlunsplit takes exactly 5 components (scheme, netloc, path, query, fragment)
data = ('http', 'www.baidu.com', 'index.html', 'id=0', 'comment')
print(urlunsplit(data))
urljoin
resolves a relative URL against a base URL into an absolute URL
base: the base URL, usually an absolute URL
url: the relative URL
#!/usr/bin/env python
from urllib.parse import urljoin

# urljoin
base_url = 'https://www.baidu.com'
relative_url = '/path/to/xxxx'
url = urljoin(base_url, relative_url)
print(url)

print(urljoin('https://www.baidu.com', '/FAQ.html'))
print(urljoin('https://www.baidu.com', 'http://www.taobao.com'))
print(urljoin('https://www.baidu.com/admin.html', 'http://www.taobao.com'))
print(urljoin('https://www.baidu.com?wd=aaa', '?user=1#comment'))
print(urljoin('https://www.baidu.com#comment', '?user=1'))
Conclusion: base_url supplies three things: scheme, netloc and path. If any of them is missing from the new link, it is filled in from the base; if the new link already has it, the new link's value wins. The params, query and fragment of base_url have no effect.
parse_qs & parse_qsl
parse_qs
parses a GET query string into a dict
parse_qsl
parses the parameters into a list of tuples
#!/usr/bin/env python
from urllib.parse import parse_qs, parse_qsl

# parse_qs
query = 'name=kenan&age=16'
print(parse_qs(query))    # {'name': ['kenan'], 'age': ['16']}

# parse_qsl
print(parse_qsl(query))   # [('name', 'kenan'), ('age', '16')]
quote & unquote
quote
URL-encodes a string
unquote
URL-decodes a string
#!/usr/bin/env python
from urllib.parse import quote, unquote

keyword = "柯南"
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)

# unquote
url_1 = unquote(url)
print(url_1)
robots protocol
the crawler protocol
tells crawlers and search engines which pages may be crawled and which may not
robots.txt (usually sits in the site root)
robotparser module
used to parse robots.txt
#!/usr/bin/env python
from urllib.robotparser import RobotFileParser

# create a RobotFileParser object for parsing robots.txt
robot_parser = RobotFileParser()
robot_parser.set_url('https://www.zhihu.com/robots.txt')
# fetch and parse robots.txt
robot_parser.read()
# check whether a specific url may be crawled
user_agent = "BaiduSpider"
check_url = 'https://www.zhihu.com/'
# can_fetch
if robot_parser.can_fetch(user_agent, check_url):
    print("This url can be crawled")
else:
    print("This url cannot be crawled")