Introduction: using Zhongguancun Online (ZOL) as an example, crawl mobile phone information and persist it asynchronously to MySQL and MongoDB (for learning purposes only; do not use commercially).
1. Create the Scrapy project (Scrapy_test_spider)
1. scrapy startproject Scrapy_test_spider
2. cd Scrapy_test_spider
3. scrapy genspider zgc_spider xxx.com
2. Project directory structure
3. Initial settings.py configuration
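After the commands above, the generated project should look roughly like this (the standard layout produced by scrapy startproject, plus the spider file created by genspider):

Scrapy_test_spider/
├── scrapy.cfg
└── Scrapy_test_spider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── zgc_spider.py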
# User-Agent pool
USER_AGENT_LIST = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
from random import choice
USER_AGENT = choice(USER_AGENT_LIST)  # random UA (picked once when settings are loaded)
ROBOTSTXT_OBEY = False  # do not obey robots.txt
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 5  # limit concurrency to 5
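Note that USER_AGENT = choice(USER_AGENT_LIST) is evaluated only once, when the settings module is loaded, so every request in a run shares the same UA. If you want a different UA per request, a small downloader middleware is the usual approach. Below is a minimal sketch; the class name RandomUserAgentMiddleware and the priority 543 are only illustrative and not part of the original project:

# middlewares.py (sketch)
from random import choice

class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # pick a fresh User-Agent for every outgoing request
        request.headers['User-Agent'] = choice(spider.settings.get('USER_AGENT_LIST'))

# settings.py (sketch)
# DOWNLOADER_MIDDLEWARES = {
#     'Scrapy_test_spider.middlewares.RandomUserAgentMiddleware': 543,
# }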
4. Define the item fields
- For example, we want to collect each phone's title, price, score, and number of comments.
import scrapy


class ScrapyTestSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()          # title
    price = scrapy.Field()          # price
    score = scrapy.Field()          # score
    comment_total = scrapy.Field()  # total number of comments
5. Design the MySQL table structure
6. Write the spider code
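A table matching the four item fields can be created as in the sketch below. The column types are assumptions, and the connection parameters mirror the settings in step 8; adjust both to your environment.

# create_table.py (sketch; column types are assumptions)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', db='my_test', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS zgc_data (
            id INT PRIMARY KEY AUTO_INCREMENT,
            title VARCHAR(255) NOT NULL,   -- phone title
            price DECIMAL(10, 2),          -- price
            score FLOAT,                   -- rating score
            comment_total INT              -- number of comments
        ) CHARSET=utf8;
    """)
conn.commit()
conn.close()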
import scrapy
from Scrapy_test_spider.items import ScrapyTestSpiderItem


class ZgcSpiderSpider(scrapy.Spider):
    name = 'zgc_spider'
    allowed_domains = ['detail.zol.com.cn']
    start_urls = ['https://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html']
    count = 5  # limit how many extra pages to follow

    def parse(self, response):
        li_list = response.css('#J_PicMode li')
        for li in li_list:
            # create a new item per entry so pending asynchronous inserts
            # are not affected when the next entry overwrites the fields
            item = ScrapyTestSpiderItem()
            price = li.css('div.price-row > span.price.price-normal > b.price-type::text').get()
            title = li.css('li > h3 > a::text').get()
            score = li.css('div.comment-row > span.score::text').get()
            comment_total = li.css('div.comment-row > a.comment-num::text').get()
            # skip entries with missing fields (ads, placeholder items, etc.)
            if not price or not title or not score or not comment_total:
                continue
            item['title'] = title
            item['price'] = float(price)
            item['score'] = float(score)
            item['comment_total'] = int(comment_total.replace('人点评', ''))  # strip the "people reviewed" suffix
            yield item

        # pagination
        next_page = response.css('a.next::attr(href)').get()
        if next_page and self.count:
            print(response.urljoin(next_page))
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
            self.count -= 1


if __name__ == '__main__':
    from scrapy.cmdline import execute
    execute('scrapy crawl zgc_spider'.split())
7. Pipeline: asynchronously persist to MySQL
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class ScrapyTestSpiderPipelineMySql(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.table = settings.get('MYSQL_TABLE')
        self._sql = None

    @classmethod
    def from_crawler(cls, crawler):
        params = {
            'host': crawler.settings.get('MYSQL_HOST'),
            'port': crawler.settings.get('MYSQL_PORT'),
            'user': crawler.settings.get('MYSQL_USER'),
            'password': crawler.settings.get('MYSQL_PWD'),
            'db': crawler.settings.get('MYSQL_DB'),
            'charset': 'utf8'
        }
        # adbapi wraps pymysql in a Twisted-managed connection pool
        dbpool = adbapi.ConnectionPool('pymysql', **params)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(
            self.insert_data_to_mysql,
            item,
            spider
        )
        # errback fires if the insert fails
        query.addErrback(self.handle_error, item)
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = f"""
                insert into {self.table}(title, price, score, comment_total)
                VALUES (%s, %s, %s, %s)
            """
        return self._sql

    def insert_data_to_mysql(self, cursor, item, spider):
        params = (item['title'], item['price'], item['score'], item['comment_total'])
        cursor.execute(self.sql, params)
        print('Insert succeeded!')

    def handle_error(self, failure, item):
        print(f'Insert failed! --------------------{failure}')

    def close_spider(self, spider):
        self.dbpool.close()
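adbapi.ConnectionPool keeps a pool of pymysql connections and runs each runInteraction call on Twisted's thread pool, returning a Deferred immediately, so the blocking insert never stalls the crawler's event loop; the errback attached in process_item fires if that insert raises.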
8. Add the following to settings.py
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PWD = '123456'
MYSQL_DB = 'my_test'
MYSQL_TABLE = 'zgc_data'
ITEM_PIPELINES = {
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMySql': 300,
}
9. Run the spider
if __name__ == '__main__':
    from scrapy.cmdline import execute
    execute('scrapy crawl zgc_spider'.split())
- Items are written to MySQL asynchronously, so the crawl stays very fast.
10. Asynchronous persistence to MongoDB
Pipeline code:
import pymongo
from copy import deepcopy
from twisted.internet import reactor, defer


class ScrapyTestSpiderPipelineMongo(object):
    def __init__(self, mongo_uri, mongo_db, mongo_coll):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongo_col = mongo_coll

    @classmethod
    def from_crawler(cls, crawler):
        mongo_uri = crawler.settings.get('MONGO_URI')
        mongo_db = crawler.settings.get('MONGO_DB')
        mongo_coll = crawler.settings.get('MONGO_COL')
        return cls(mongo_uri, mongo_db, mongo_coll)

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.mongodb = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        # deep-copy the item so later mutations don't affect the pending insert
        item = deepcopy(item)
        defer_out = defer.Deferred()
        # run the blocking insert on Twisted's thread pool
        reactor.callInThread(self._insert, item, defer_out, spider)
        yield defer_out
        defer.returnValue(item)

    def _insert(self, item, defer_out, spider):
        # insert_one replaces the deprecated insert() in modern pymongo
        self.mongodb[self.mongo_col].insert_one(dict(item))
        # hand the result back to the reactor thread
        reactor.callFromThread(defer_out.callback, item)
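The same effect can be achieved with less boilerplate via twisted.internet.threads.deferToThread, which handles the callInThread / Deferred handoff for you; Scrapy waits on a Deferred returned from process_item. A sketch of an equivalent method, assuming the same class and attributes as above:

# inside ScrapyTestSpiderPipelineMongo (sketch)
from copy import deepcopy
from twisted.internet.threads import deferToThread

def process_item(self, item, spider):
    item = deepcopy(item)
    # run the blocking insert on Twisted's thread pool
    d = deferToThread(self.mongodb[self.mongo_col].insert_one, dict(item))
    d.addCallback(lambda _: item)  # pass the item on to any later pipeline
    return d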
Add to settings.py:
MONGO_URI = '127.0.0.1:27017'  # the pipeline reads MONGO_URI, so the setting name must match
MONGO_DB = 'my_test'
MONGO_COL = 'zgc_data'

ITEM_PIPELINES = {
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMongo': 300,
}
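If you want both the MySQL and the MongoDB pipelines active in the same run, register them in a single ITEM_PIPELINES dict with different priorities (lower numbers run first); the 300/301 split below is just one choice:

ITEM_PIPELINES = {
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMySql': 300,
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMongo': 301,
}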
Run the crawler:
Data in MongoDB:
All of the above is shared for learning purposes. There may be shortcomings, and there are certainly more elegant ways to write some of this; feel free to leave a comment and discuss!