Python爬虫_某宝网案例

发布于:2023-01-20 ⋅ 阅读:(21) ⋅ 点赞:(0) ⋅ 评论:(0)

Python爬虫_某宝网案例

一、导入第三方库,确定 url,定义 headers(把爬虫请求伪装成浏览器请求)

import requests
# Taobao search-result page to request (the query string is kept exactly as
# copied from the browser address bar).
url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'

# Request headers copied from a browser session so that the request looks like
# it was sent by a regular browser rather than by a script.
headers = {
    # Session cookie captured from a logged-in browser session.
    'cookie': 'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
    'referer': 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
    # Browser client hints and fetch metadata, as sent by the browser.
    'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
}

注:选中从浏览器复制来的请求头文本后,按 Ctrl+R 打开替换,启用正则表达式模式,将 (.*?):(.*) 全部替换为 '$1':'$2', (注意末尾的逗号,字典的每一项之间需要用逗号分隔),这样就给每个字段的键和值都加上了单引号。
替换过后,务必将键和值首尾多余的空格删除,否则发送请求时会报错。

二、版本一完整代码(数据保存至CSV文件)

import re
import json
import pprint
import requests
import csv # 写入csv文件中

# Version one: fetch one Taobao search-result page, extract the item list from
# the JSON embedded in the page, and save selected fields to taobao.csv.

url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'
headers = {
    'cookie':'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
    'referer':'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
    'sec-ch-ua':'"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
    'sec-ch-ua-mobile':'?0',
    'sec-ch-ua-platform':'"Windows"',
    'sec-fetch-dest':'document',
    'sec-fetch-mode':'navigate',
    'sec-fetch-site':'same-origin',
    'sec-fetch-user':'?1',
    'upgrade-insecure-requests':'1',
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
}

# Fetch and parse BEFORE opening the output file, so that a failed request or
# an unexpected page does not leave a truncated taobao.csv behind.
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()

# The item data is embedded in the page as a JavaScript assignment:
#   g_page_config = {...};
matches = re.findall('g_page_config = (.*);', response.text)
if not matches:
    # Fail with an explicit message instead of a bare IndexError.
    raise RuntimeError(
        'g_page_config was not found in the response; the page layout may '
        'have changed or the request may not have been accepted'
    )
json_data = json.loads(matches[0])  # JSON text -> Python dict

# The items live under 'mods' -> 'itemlist' -> 'data' -> 'auctions'.
data = json_data['mods']['itemlist']['data']['auctions']

# 'utf-8-sig' is available on every platform and can encode any character.
# The previous 'ANSI' name is an alias of the Windows-only 'mbcs' codec and
# raises LookupError elsewhere. The BOM lets spreadsheet tools detect UTF-8.
with open('taobao.csv', 'w', encoding='utf-8-sig', newline='') as csv_file:
    # Define the header row
    csvwriter = csv.DictWriter(
        csv_file,
        fieldnames=['标题','价格','店铺','购买人数','地点','商品详情页','店铺链接','图片链接'],
    )
    # Write the header row
    csvwriter.writeheader()

    for item in data:
        # 'row' rather than 'dict', so the builtin is not shadowed.
        row = {
            '标题' : item['raw_title'],
            '价格' : item['view_price'],
            '店铺' : item['nick'],
            '购买人数' : item['view_sales'],
            '地点' : item['item_loc'],
            '商品详情页' : 'https:' + item['detail_url'],
            '店铺链接' : item['shopLink'],
            '图片链接' : 'https:' + item['pic_url']
        }
        csvwriter.writerow(row)  # write one item to the CSV file
        print(row)

三、版本二完整代码(数据保存至sqlite3数据库)

import re
import json
import pprint
import requests
import csv # 写入csv文件中
import sqlite3  #进行SQLite数据库操作

dbpath = 'taobao.db'
def getdata() :
    """Scrape one Taobao search-result page and store its items in SQLite.

    Calls ``init_db(dbpath)``, downloads the search page, extracts the JSON
    assigned to ``g_page_config`` in the page source, and inserts one row per
    item into the ``taobao`` table of the database at ``dbpath``.

    All rows are written in a single transaction: either every item of the
    page is stored, or (on error) none is.

    Raises:
        requests.RequestException: the request failed, timed out, or returned
            an HTTP error status.
        RuntimeError: ``g_page_config`` was not found in the response.
        KeyError: the parsed JSON lacks one of the expected keys.
        sqlite3.Error: the database operation failed.
    """
    init_db(dbpath)

    url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'
    headers = {
        'cookie':'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
        'referer':'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
        'sec-ch-ua':'"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"',
        'sec-fetch-dest':'document',
        'sec-fetch-mode':'navigate',
        'sec-fetch-site':'same-origin',
        'sec-fetch-user':'?1',
        'upgrade-insecure-requests':'1',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url,headers=headers,timeout=10)
    response.raise_for_status()

    # The item data is embedded in the page as: g_page_config = {...};
    matches = re.findall('g_page_config = (.*);',response.text)
    if not matches:
        # Fail with an explicit message instead of a bare IndexError.
        raise RuntimeError(
            'g_page_config was not found in the response; the page layout may '
            'have changed or the request may not have been accepted'
        )
    json_data = json.loads(matches[0]) # JSON text -> Python dict
    # The items live under 'mods' -> 'itemlist' -> 'data' -> 'auctions'.
    data = json_data['mods']['itemlist']['data']['auctions']

    # Use "?" placeholders instead of string formatting: scraped text (titles,
    # shop names) may contain quotes that would break or alter the statement.
    # NOTE: 'viewprie' (sic) matches the column name used when the table is
    # created, so it must not be "corrected" here alone.
    sql = '''
        insert into taobao(
            rawtitle,viewprie,nick,viewsales,itemloc,detailurl,shoplink,picurl)
            values(?,?,?,?,?,?,?,?)'''
    rows = [
        (
            value['raw_title'],
            value['view_price'],
            value['nick'],
            value['view_sales'],
            value['item_loc'],
            value['detail_url'],
            value['shopLink'],
            value['pic_url'],
        )
        for value in data
    ]

    conn = sqlite3.connect(dbpath)
    try:
        # "with conn" commits on success and rolls back on an exception.
        with conn:
            conn.executemany(sql, rows)
    finally:
        # The connection context manager does not close the connection.
        conn.close()

# Initialise the database (create the table if needed)
def init_db(dbpath) :
    """Create the ``taobao`` table in the SQLite database at *dbpath*.

    The database file is created by SQLite if it does not exist. The call is
    idempotent: when the table already exists nothing is changed, so the
    script can be run more than once against the same database file.

    Args:
        dbpath: path of the SQLite database file.

    Raises:
        sqlite3.Error: the database could not be opened or the statement
            failed.
    """
    # "if not exists" avoids "table taobao already exists" on a second run.
    # NOTE: 'viewprie' (sic) is kept as-is because the insert statement in
    # getdata() uses the same column name.
    sql = '''
       create table if not exists taobao(
           id integer primary key autoincrement,
           rawtitle varchar,
           viewprie numeric,
           nick varchar,
           viewsales varchar,
           itemloc varchar,
           detailurl text,
           shoplink text,
           picurl text
       ) 
    '''
    conn = sqlite3.connect(dbpath)
    try:
        # "with conn" commits on success and rolls back on an exception.
        with conn:
            conn.execute(sql)
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()

# Script entry: run the scrape-and-store routine, then print a completion
# message once it has returned without raising.
getdata()
print("保存完成!")

在这里插入图片描述


网站公告

欢迎关注微信公众号

今日签到

点亮在社区的每一天
签到