Getting Started with Scrapy
#Installing Scrapy#
·pip install scrapy -i https://pypi.tuna.tsinghua.edu.cn/simple
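The files below (items.py, a spider under spiders/, and pipelines.py) assume a standard Scrapy project named www_job_com, matching the imports used later. A minimal sketch of creating that skeleton (the generated spider file can be renamed afterwards):
scrapy startproject www_job_com
cd www_job_com
scrapy genspider job51 we.51job.com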
Item fields to scrape: items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class WwwJobComItem(scrapy.Item):
    position_id = scrapy.Field()
    position_name = scrapy.Field()
    position_lables = scrapy.Field()
    work_year = scrapy.Field()
    salary = scrapy.Field()
    avg_salary = scrapy.Field()
    city = scrapy.Field()
    education = scrapy.Field()
    company_name = scrapy.Field()
    industry_field = scrapy.Field()
    finance_stage = scrapy.Field()
    company_size = scrapy.Field()
    time = scrapy.Field()
    updated_at = scrapy.Field()
    platform = scrapy.Field()
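The Field declarations only register the allowed keys; the item itself behaves like a dict. A quick illustrative check (the values are made up):
from www_job_com.items import WwwJobComItem
item = WwwJobComItem()
item['position_name'] = 'Qt Developer'
item['salary'] = '15K-20K'
print(dict(item))  # {'position_name': 'Qt Developer', 'salary': '15K-20K'}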
Parsing the data: spiders/job51_spider.py
# -*- coding: utf-8 -*-
import scrapy
import time
from www_job_com.items import WwwJobComItem
from selenium import webdriver  # Selenium renders the JavaScript-driven pages that plain Scrapy requests cannot
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# API endpoint: https://we.51job.com/api/job/search-pc?api_key=51job&timestamp=1718105974&keyword=Qt&searchType=2&function=&industry=&jobArea=110300&jobArea2=&landmark=&metro=&salary=&workYear=&degree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum=2&requestId=8c6f4cdeb1385105b3c84db814bc7595&keywordType=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&u_atoken=2411093d2a078aa2e11f4df75d89aeff&u_asession=01IsIw5W-9QS3NOvPfl6e1xUfA4Y8C-LJU-U9CHzLiUOB6TpeLOQWw5EkuO1PWxknVJB-YY_UqRErInTL5mMzm-GyPlBJUEqctiaTooWaXr7I&u_asig=05Ju4pRRMDauGGVqy-c_gQE7FxLMbfJy3d6_JVVRbfidgjAIPcV24TMCbBdIrj3Co6tYmluDj39MSETBif5VN8PvzNFigoJzvVoa7xCkt2V_-u18NE04XeCMJAIByPlmMn3ZYHbhtgBFlMnpbO7EXVKUtfscfboS4VrslPF88R_4fBzhvSc0Kr8URjOX9Xe4tkaFz52ZL0wSSeBMFpVEI75YxuDdwJdvTs3wo1QVyrNHhVQ0CgfAckO5j-QEuxuIrnewtyftWbfuxhWtuQjX92YSbo21IoqKumRO3QXK1wAo8MI7Z-80-lKwjxRoAdqb7gF-n1f-CA6GZoq9ddPTuuQA&u_aref=cD8WFSKdmXy1U7TQlHrzti34Ko0%3D
class Job51Spider(scrapy.Spider):
    name = 'job51'
    allowed_domains = ['we.51job.com']
    start_urls = ['https://we.51job.com']
    positionUrl = ''
    curPage = 0
    headers = {}

    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome()  # or another browser's WebDriver

    def start_requests(self):
        return [self.next_request()]
    def parse(self, response):
        self.driver.get(response.url)
        # Wait for the dynamically rendered job cards to appear
        WebDriverWait(self.driver, 20).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'joblist-item'))
        )
        print("request -> " + response.url)
        # html = response.body.decode('utf-8')  # unused: the data is read from the Selenium-rendered DOM instead
        # job_list = response.css('div.joblist-item')
        job_list = self.driver.find_elements(By.CLASS_NAME, 'joblist-item')
        if len(job_list) > 1:
            print("51job Nums:" + str(len(job_list)))
            i = 1
            for job in job_list:
                item = WwwJobComItem()
                str_time = ''  # job.css('span::text').extract_first().strip()
                if str_time == "发布时间":  # skip a header row if one is ever picked up
                    continue
                else:
                    item['position_id'] = i
                    i = i + 1
                # Look elements up inside the current card, not the whole page,
                # so every item gets its own values rather than the first card's
                item["position_name"] = job.find_element(By.CLASS_NAME, 'jname').text
                salary = job.find_element(By.CLASS_NAME, 'sal').text
                if salary.find("万/月") > -1:
                    # e.g. "1.5-2万/月" -> "15.0K-20.0K"
                    salary = salary.replace("万/月", "").split("-")
                    item["salary"] = str(float(salary[0]) * 10) + "K-" + str(float(salary[1]) * 10) + "K"
                    item["avg_salary"] = (float(salary[0]) * 10 + float(salary[1]) * 10) / 2
                elif salary.find("万/年") > -1:
                    # convert the annual figure to K per month
                    salary = salary.replace("万/年", "").split("-")
                    item["salary"] = str(float(salary[0]) * 10 / 12) + "K-" + str(float(salary[1]) * 10 / 12) + "K"
                    item["avg_salary"] = (float(salary[0]) * 10 / 12 + float(salary[1]) * 10 / 12) / 2
                elif salary.find("元/天") > -1:
                    # daily-rate postings are skipped
                    continue
                else:
                    item["salary"] = salary
                    item["avg_salary"] = ''
                item['city'] = job.find_element(By.CLASS_NAME, 'area').text
                item['work_year'] = ""
                item['education'] = ""
                item['company_name'] = job.find_element(By.CLASS_NAME, 'cname').text
                item['industry_field'] = job.find_elements(By.CLASS_NAME, 'dc')[0].text  # industry
                item['finance_stage'] = job.find_elements(By.CLASS_NAME, 'dc')[1].text   # funding stage
                item['company_size'] = job.find_elements(By.CLASS_NAME, 'dc')[2].text    # company size
                item['position_lables'] = job.find_element(By.CLASS_NAME, 'tags').text.replace('\n', ',')
                item['time'] = ""  # str_time
                item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['platform'] = "51job"
                yield item
            yield self.next_request()
    # Build the request for the next results page
    def next_request(self):
        self.curPage += 1
        self.positionUrl = "https://we.51job.com/pc/search?jobArea=110300&keyword=Qt&searchType=2&keywordType="
        # self.positionUrl = "http://search.51job.com/list/170200,000000,0000,00,9,99,php,2," + str(self.curPage) + ".html"
        print("51job page:" + str(self.curPage))
        time.sleep(10)
        # The search URL never changes, so dont_filter keeps the duplicate filter from dropping follow-up pages
        return scrapy.http.FormRequest(self.positionUrl,
                                       headers=self.headers,
                                       callback=self.parse,
                                       dont_filter=True)

    def closed(self, reason):
        # Shut the Selenium browser down when the spider finishes
        self.driver.quit()
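To make the salary conversion above concrete: 1 万 (10,000 CNY) equals 10 K, so a monthly listing such as "1.5-2万/月" becomes 15K-20K with an average of 17.5, while an annual figure is first converted to K and then spread over 12 months. A standalone sketch of the monthly case (the sample string is illustrative):
def convert_monthly_wan(s):
    # "1.5-2万/月" -> ("15.0K-20.0K", 17.5)
    lo, hi = (float(x) for x in s.replace("万/月", "").split("-"))
    return f"{lo * 10}K-{hi * 10}K", (lo * 10 + hi * 10) / 2

print(convert_monthly_wan("1.5-2万/月"))  # ('15.0K-20.0K', 17.5)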
Saving the data with a pipeline: pipelines.py
# -*- coding: utf-8 -*-
import os
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from twisted.enterprise import adbapi
class WwwJobComPipeline(object):
    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        return cls(dbpool)

    def __init__(self, dbpool):
        self.dbpool = dbpool

    def process_item_saveto_txt_zhaopin(self, item, spider):
        # file_path = os.path.join(self.files_dir, item['file_name'])
        file_path = os.path.join("", "zhaopintxt.txt")
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write("\n" + str(item["position_id"]) + "\n" + item["city"] + "\n" + item["salary"] + "\n" +
                    item["company_name"] + "\n" + item["company_size"] + "\n" + item["position_name"] + "\n" +
                    item["work_year"] + "\n")
        return item

    def process_item_saveto_txt_job51(self, item, spider):
        # file_path = os.path.join(self.files_dir, item['file_name'])
        file_path = os.path.join("", "zhaopintxt.txt")
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write("\n" + str(item["position_id"]) + "\n" + item["position_name"] + "\n" + item["salary"] + "\n" +
                    item["city"] + "\n" + item["company_name"] + "\n" + item["industry_field"] + "\n" +
                    item["finance_stage"] + "\n" + item["company_size"] + "\n" + item["position_lables"] + "\n" +
                    item["updated_at"])
        return item

    def process_item(self, item, spider):
        self.process_item_saveto_txt_job51(item, spider)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item
    def _conditional_insert(self, tx, item):
        # Insert only if this (position_id, platform) pair is not already in the table
        sql = "select * from jobs where position_id=%s and platform=%s"
        position_id = (item["position_id"], item["platform"])
        result = tx.execute(sql, position_id)
        if result == 0:
            sql = "insert into jobs(position_id,position_name,position_lables,work_year,salary,city,education,company_name,industry_field,finance_stage,company_size,updated_at,`time`,platform,avg_salary) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            params = (
                item["position_id"], item["position_name"], item["position_lables"], item["work_year"], item["salary"],
                item["city"], item["education"], item["company_name"], item["industry_field"],
                item["finance_stage"], item["company_size"], item["updated_at"], item["time"],
                item["platform"], item["avg_salary"])
            tx.execute(sql, params)

    def _handle_error(self, failure, item, spider):
        print('_handle_error')
        print(item)
        print(failure)
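For this pipeline to run, the project's settings.py must register it and define the MYSQL_* keys that from_settings reads; the jobs table named in the INSERT must already exist in that database. A minimal sketch with placeholder connection values:
# settings.py (excerpt) -- placeholder credentials, adjust to your environment
ITEM_PIPELINES = {
    'www_job_com.pipelines.WwwJobComPipeline': 300,
}
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'jobs_db'       # placeholder database name
MYSQL_USER = 'root'
MYSQL_PASSWD = 'your_password'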
Debugging the spider
# Create a run.py in the same directory as items.py with the following content, then run that script to debug the spider:
from scrapy import cmdline
name = r'C:\Users\Administrator\Downloads\source\www_job_com-master\www_job_com\spiders\job51_spider.py'
cmd = 'scrapy runspider {0}'.format(name)
cmdline.execute(cmd.split())
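If the code sits inside a regular Scrapy project, the usual alternative is to run the spider by name from the project directory, optionally exporting items to a feed file:
scrapy crawl job51 -o jobs.json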
Selenium for JavaScript-rendered pages
Installing Selenium
pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple
Using Selenium in Python to scrape page data
Basic example: open a page and read its title:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Path to chromedriver.exe
driver_path = r"C:\path\to\chromedriver.exe"
# Create a WebDriver instance for Chrome (Selenium 4 style: pass the driver path via Service)
driver = webdriver.Chrome(service=Service(driver_path))
# Open the target site
driver.get("https://www.example.com")
# Read the page title
page_title = driver.title
print("Page Title:", page_title)
# Close the browser
driver.quit()
Waiting for dynamically loaded content before reading it:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver_path = r"C:\path\to\chromedriver.exe"
driver = webdriver.Chrome(service=Service(driver_path))
driver.get("https://www.example.com/dynamic-content")
# Wait up to 10 seconds for the element to appear
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "joblist-item"))
)
# Work with the located elements...
job_list = driver.find_elements(By.CLASS_NAME, 'joblist-item')  # list of WebElements
jname = driver.find_element(By.CLASS_NAME, 'jname').text        # string
driver.quit()
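The spider above mixes the two layers: Scrapy schedules requests while Selenium renders them. One convenient pattern, sketched here rather than taken from the original code, is to hand the rendered HTML back to a Scrapy Selector (before quitting the driver) so the same CSS selectors work as on a normal response; the selector string below is an assumption based on the class names used earlier:
from scrapy import Selector

sel = Selector(text=driver.page_source)  # rendered DOM from Selenium
names = sel.css('div.joblist-item .jname::text').getall()  # list of job titles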