Learning Web Crawling with Python Scrapy

Published: 2024-06-18

Getting Started with Scrapy

Installing Scrapy

pip install scrapy -i https://pypi.tuna.tsinghua.edu.cn/simple
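
The steps below assume a Scrapy project already exists. If you are starting from scratch, the project skeleton (items.py, spiders/, pipelines.py, settings.py) can be generated with Scrapy's own commands; the project name www_job_com matches the imports used in the code later in this post:

scrapy startproject www_job_com
cd www_job_com
scrapy genspider job51 we.51job.com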

Scrapy getting-started tutorial

Define the data items to scrape in items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class WwwJobComItem(scrapy.Item):
    position_id = scrapy.Field()
    position_name = scrapy.Field()
    position_lables = scrapy.Field()
    work_year = scrapy.Field()
    salary = scrapy.Field()
    avg_salary = scrapy.Field()
    city = scrapy.Field()
    education = scrapy.Field()
    company_name = scrapy.Field()
    industry_field = scrapy.Field()
    finance_stage = scrapy.Field()
    company_size = scrapy.Field()
    time = scrapy.Field()
    updated_at = scrapy.Field()
    platform = scrapy.Field()
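
A scrapy.Item behaves like a dict restricted to the declared fields, which is how the spider below fills it. A minimal sketch (the field values here are made up purely for illustration):

from www_job_com.items import WwwJobComItem

item = WwwJobComItem()
item["position_name"] = "Qt Developer"  # hypothetical value
item["salary"] = "15K-20K"              # hypothetical value
print(dict(item))  # Items convert to plain dicts; assigning an undeclared key raises KeyError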

Parse the data in spiders/51jobspider.py

# -*- coding: utf-8 -*-
import scrapy
import time
from www_job_com.items import WwwJobComItem
from selenium import webdriver  # Selenium renders JavaScript-driven pages
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# API endpoint (example request captured in the browser; the u_* tokens are session-specific):
# https://we.51job.com/api/job/search-pc?api_key=51job&timestamp=1718105974&keyword=Qt&searchType=2&function=&industry=&jobArea=110300&jobArea2=&landmark=&metro=&salary=&workYear=&degree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum=2&requestId=8c6f4cdeb1385105b3c84db814bc7595&keywordType=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&u_atoken=2411093d2a078aa2e11f4df75d89aeff&u_asession=01IsIw5W-9QS3NOvPfl6e1xUfA4Y8C-LJU-U9CHzLiUOB6TpeLOQWw5EkuO1PWxknVJB-YY_UqRErInTL5mMzm-GyPlBJUEqctiaTooWaXr7I&u_asig=05Ju4pRRMDauGGVqy-c_gQE7FxLMbfJy3d6_JVVRbfidgjAIPcV24TMCbBdIrj3Co6tYmluDj39MSETBif5VN8PvzNFigoJzvVoa7xCkt2V_-u18NE04XeCMJAIByPlmMn3ZYHbhtgBFlMnpbO7EXVKUtfscfboS4VrslPF88R_4fBzhvSc0Kr8URjOX9Xe4tkaFz52ZL0wSSeBMFpVEI75YxuDdwJdvTs3wo1QVyrNHhVQ0CgfAckO5j-QEuxuIrnewtyftWbfuxhWtuQjX92YSbo21IoqKumRO3QXK1wAo8MI7Z-80-lKwjxRoAdqb7gF-n1f-CA6GZoq9ddPTuuQA&u_aref=cD8WFSKdmXy1U7TQlHrzti34Ko0%3D
class Job51Spider(scrapy.Spider):
    name = 'job51'
    allowed_domains = ['we.51job.com']
    start_urls = ['https://we.51job.com']
    positionUrl = ''
    curPage = 0
    headers = {}

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()  # or another browser's driver

    def start_requests(self):
        return [self.next_request()]

    def parse(self, response):
        self.driver.get(response.url)
        # wait for the dynamically rendered job list to load
        WebDriverWait(self.driver, 20).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'joblist-item'))
        )

        print("request -> " + response.url)
        html = response.body.decode('utf-8')
        # job_list = response.css('div.joblist-item')
        job_list = self.driver.find_elements(By.CLASS_NAME, 'joblist-item')
        # job_list = self.driver.find_elements_by_css_selector('.joblist-item')
        if (len(job_list) > 1):
            print("51job Nums:" + str(len(job_list)))
            i=1
            for job in job_list:
                item = WwwJobComItem()
                str_time = ''#job.css('span::text').extract_first().strip()
                if (str_time == "发布时间"):
                    continue
                else:
                    item['position_id'] = i #job.css('p.t1 > input::attr(value)').extract_first().strip()
                    i = i+1
                    item["position_name"] = job.find_element(By.CLASS_NAME, 'jname').text  # scope lookups to this job card, not the whole page
                    salary = job.find_element(By.CLASS_NAME, 'sal').text
                    if (salary.find("万/月-") > -1):
                        salary = salary.replace("万/月", "").split("-")
                        item["salary"] = str(float(salary[0]) * 10) + "K-" + str(float(salary[1]) * 10) + "K"
                        item["avg_salary"] = (float(salary[0]) * 10 + float(salary[1]) * 10) / 2
                    elif (salary.find("万/年-") > -1):
                        salary = salary.replace("万/年", "").split("-")
                        item["salary"] = str(float(salary[0]) / 12) + "K-" + str(float(salary[1]) / 12) + "K"
                        item["avg_salary"] = (float(salary[0]) / 12 + float(salary[1]) / 12) / 2
                    elif (salary.find("元/天-") > -1):
                        continue
                    else:
                        #salary = salary.replace("千/月", "").split("-")
                        item["salary"] = salary#salary[0] + "K-" + salary[1] + "K"
                        item["avg_salary"] = ''# (float(salary[0]) + float(salary[1])) / 2
                    item['city'] = job.find_element(By.CLASS_NAME, 'area').text
                    item['work_year'] = ""
                    item['education'] = ""
                    item['company_name'] = job.find_element(By.CLASS_NAME, 'cname').text
                    dc = job.find_elements(By.CLASS_NAME, 'dc')  # company meta spans inside this card
                    item['industry_field'] = dc[0].text  # industry
                    item['finance_stage'] = dc[1].text   # funding stage
                    item['company_size'] = dc[2].text
                    # listlabels = job.css('div.joblist-item-mid > div.tags > div.tag ::text').extract()
                    # position_labels = ','.join(listlabels)
                    item['position_lables'] = job.find_element(By.CLASS_NAME, 'tags').text.replace('\n', ',')  # job tags joined with commas
                    item['time'] = ""#str_time
                    item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    item['platform'] = "51job"
                    yield item
            yield self.next_request()

    # issue the request for the next results page
    def next_request(self):
        self.curPage += 1
        # carry the page number in the URL (parameter name as in the search API above); otherwise every request fetches the same first page
        self.positionUrl = "https://we.51job.com/pc/search?jobArea=110300&keyword=Qt&searchType=2&keywordType=&pageNum=" + str(self.curPage)
        # self.positionUrl = "http://search.51job.com/list/170200,000000,0000,00,9,99,php,2," + str(self.curPage) + ".html"
        print("51job page:" + str(self.curPage))
        time.sleep(10)  # crude throttling; note this blocks the whole crawler
        return scrapy.Request(self.positionUrl,
                              headers=self.headers,
                              callback=self.parse)
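
The spider opens a Chrome instance in __init__ but never shuts it down. A small addition to the Job51Spider class (a sketch; Scrapy calls closed() automatically when the spider finishes) releases the browser:

    def closed(self, reason):
        # called by Scrapy once the spider finishes; release the Selenium browser
        self.driver.quit()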

Save the data with an item pipeline in pipelines.py

# -*- coding: utf-8 -*-
import os

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from twisted.enterprise import adbapi


class WwwJobComPipeline(object):
    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        return cls(dbpool)

    def __init__(self, dbpool):
        self.dbpool = dbpool

    def process_item_saveto_txt_zhaopin(self, item, spider):
        # file_path = os.path.join(self.files_dir, item['file_name'])
        file_path = os.path.join("","zhaopintxt.txt")
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write("\n"+str(item["position_id"])+"\n"+item["city"]+"\n"+item["salary"]+"\n"+item["company_name"]+"\n"+item["company_size"] +"\n"+item["position_name"]+"\n"+item["work_year"]+"\n" )
        return item
    def process_item_saveto_txt_job51(self, item, spider):
        # file_path = os.path.join(self.files_dir, item['file_name'])
        file_path = os.path.join("","zhaopintxt.txt")
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write("\n"+str(item["position_id"])+"\n"+item["position_name"]+"\n"+item["salary"]+"\n"+item["city"]+"\n"+
                    item["company_name"] +"\n"+item["industry_field"]+"\n"+item["finance_stage"]+"\n"+
                    item["company_size"]+"\n"+item["position_lables"]+"\n"+item["updated_at"]
                    )
        return item

    def process_item(self, item, spider):
        self.process_item_saveto_txt_job51(item, spider)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item

    def _conditional_insert(self, tx, item):
        # print item['name']
        sql = "select * from jobs where position_id=%s and platform=%s"
        position_id = (item["position_id"], item["platform"])
        result = tx.execute(sql, position_id)
        if (result == 0):
            sql = "insert into jobs(position_id,position_name,position_lables,work_year,salary,city,education,company_name,industry_field,finance_stage,company_size,updated_at,`time`,platform,avg_salary) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            params = (
                item["position_id"], item["position_name"], item["position_lables"], item["work_year"], item["salary"],
                item["city"], item["education"], item["company_name"], item["industry_field"],
                item["finance_stage"], item["company_size"], item["updated_at"], item["time"],
                item["platform"], item["avg_salary"])
            tx.execute(sql, params)

    def _handle_error(self, failure, item, spider):
        print('_handle_error')
        print(item)
        print(failure)
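
The pipeline reads its MySQL connection details from the project settings and must be enabled there. A sketch of the relevant settings.py entries (host, database name and credentials are placeholders for your own environment):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'www_job_com.pipelines.WwwJobComPipeline': 300,
}

MYSQL_HOST = '127.0.0.1'        # placeholder
MYSQL_DBNAME = 'jobs_db'        # placeholder
MYSQL_USER = 'root'             # placeholder
MYSQL_PASSWD = 'your_password'  # placeholder

ROBOTSTXT_OBEY = False   # the site's robots.txt may otherwise block the crawl
DOWNLOAD_DELAY = 2       # be polite and reduce the chance of being blocked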

Debugging the crawler code

# Create a run.py file in the same directory as items.py with the following content, then run this script to debug the spider:
from scrapy import cmdline


name = r'C:\Users\Administrator\Downloads\source\www_job_com-master\www_job_com\spiders\job51_spider.py'
cmd = 'scrapy runspider {0}'.format(name)
cmdline.execute(cmd.split())
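
Running the spider through run.py like this lets an IDE attach breakpoints. From inside the project directory it can equally be started by name (a sketch; job51 is the name attribute defined in the spider above):

from scrapy import cmdline

cmdline.execute('scrapy crawl job51'.split())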

Selenium for JavaScript-rendered pages

Installing Selenium

pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple
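
Selenium drives a real browser, so a matching driver (chromedriver for Chrome) is also needed. From Selenium 4.6 onward the bundled Selenium Manager can locate or download a suitable driver automatically, so a minimal session needs no explicit path (a sketch):

from selenium import webdriver

driver = webdriver.Chrome()  # Selenium Manager resolves a matching chromedriver
driver.get("https://www.example.com")
print(driver.title)
driver.quit()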

Hands-on: scraping web page data with Selenium in Python

Web-scraping code:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# path to chromedriver.exe (with Selenium 4 the path is passed via a Service object)
driver_path = r"C:\path\to\chromedriver.exe"
# create a WebDriver instance using the Chrome browser
driver = webdriver.Chrome(service=Service(driver_path))
# open the target site
driver.get("https://www.example.com")
# read the page title
page_title = driver.title
print("Page Title:", page_title)
# close the browser
driver.quit()

Waiting for content that is rendered by JavaScript after the initial page load:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver_path = r"C:\path\to\chromedriver.exe"
driver = webdriver.Chrome(service=Service(driver_path))

driver.get("https://www.example.com/dynamic-content")

# wait up to 10 seconds for the specified element to appear
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "joblist-item"))
)

# work with the element(s)...
job_list = driver.find_elements(By.CLASS_NAME, 'joblist-item')  # list of matching elements
jname = driver.find_element(By.CLASS_NAME, 'jname').text        # string

driver.quit()
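
For unattended crawling the browser window is usually unnecessary; Chrome can run headless via ChromeOptions (a sketch; the flag spelling below is the one used by recent Chrome versions):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")         # run Chrome without a visible window
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)
driver.get("https://www.example.com")
print(driver.title)
driver.quit()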

