Scraping 89ip proxies and Douban movies


1 Scraping 89ip proxies
2 Scraping Douban movies

1 Scraping 89ip proxies

import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


class SpiderIP:
    def __init__(self):
        # target page: the free proxy list
        self.tag_url = "https://www.89ip.cn/"

        self.headers = {
            "User-Agent": UserAgent().random
        }

    def spider_index_response(self):
        response = requests.get(url=self.tag_url, headers=self.headers)
        return response.text

    def create_soup(self):
        return BeautifulSoup(self.spider_index_response(), 'lxml')

    def spider_ip_port(self):
        soup = self.create_soup()
        tr_list = soup.select('div.layui-row.layui-col-space15 > div.layui-col-md8 > div > div.layui-form > table > tbody > tr')
        data_list = []
        for tr in tr_list:
            td_list = tr.find_all("td")
            ip = td_list[0].text.strip()
            port = td_list[1].text.strip()
            store = td_list[3].text.strip()  # ISP / location column
            # format requests expects: {"http": "http://IP:PORT"}
            data_list.append({"store": store, "proxies": {
                "http": f"http://{ip}:{port}"
            }})
        return data_list

    def __check_proxy(self, proxies):
        # Probe httpbin.org through the proxy; True means the proxy relays traffic.
        try:
            response = requests.get("http://httpbin.org/get", headers=self.headers, proxies=proxies, timeout=2)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                print(response.text)  # httpbin echoes the request it received
                return True
            print("Request failed:", response.status_code)
            return False
        except RequestException:
            # covers ProxyError, Timeout, ConnectionError, ...
            return False

    def test_ip(self):
        data_list = self.spider_ip_port()
        for index, data in enumerate(data_list, start=1):
            store = data.get("store")
            proxies = data.get("proxies")
            print(f"Entry {index}, ISP: {store}")
            # __check_proxy returns a boolean, so the result can be tested directly
            if self.__check_proxy(proxies=proxies):
                print("This proxy works")
            else:
                print("Dead, discarded")

    def main(self):
        self.test_ip()


if __name__ == '__main__':
    s = SpiderIP()
    s.main()
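
A minimal follow-up sketch: the proxy dicts returned by spider_ip_port() are already in the shape requests expects, so the ones that pass the probe can be collected and reused. It assumes the script above is saved as spider_ip.py; the module name and the example target URL are illustrative, not part of the original.

import requests
from requests.exceptions import RequestException

from spider_ip import SpiderIP  # hypothetical module name for the script above

working = []
for data in SpiderIP().spider_ip_port():
    proxies = data["proxies"]
    try:
        # same probe as in __check_proxy: httpbin answers only if the proxy relays
        if requests.get("http://httpbin.org/get", proxies=proxies, timeout=2).status_code == 200:
            working.append(proxies)
    except RequestException:
        continue

print(f"{len(working)} of the scraped proxies are usable")
if working:
    # route a real request through the first surviving proxy
    response = requests.get("http://www.example.com", proxies=working[0], timeout=5)
    print(response.status_code)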

2 Scraping Douban movies

import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup


class SpiderBase:
    def __init__(self):
        self.tag_url_list = []
        self.headers = {
            "User-Agent": UserAgent().random
        }


class SpiderTopSoup(SpiderBase):
    def __init__(self):
        super().__init__()
        self.tag_url_list = self.__create_tag_url_list()

    def __create_tag_url_list(self):
        # Top 250 is paginated 25 per page: ?start=0, 25, ..., 225
        # (start=0 serves the same page as the bare top250 URL)
        return [f"https://movie.douban.com/top250?start={i * 25}" for i in range(10)]

    def __create_soup(self, page_text):
        return BeautifulSoup(page_text, 'lxml')

    def __spider_detail_data(self, soup):
        data_list = []
        div_list = soup.find_all("div", class_="item")
        for div in div_list:
            pic_div = div.find("div", class_="pic")
            # poster image URL
            img_url = pic_div.a.img.get("src")
            # rank on the Top 250 list
            level = pic_div.em.text
            # link to the detail page
            detail_url = pic_div.a.get("href")

            bd_a_span_list = div.find("div", class_="info").find("div", class_="hd").a.find_all("span")

            try:
                title = bd_a_span_list[0].text
            except IndexError:
                title = ""
            try:
                title_eg = bd_a_span_list[1].text
            except IndexError:
                title_eg = ""
            try:
                title_desc = bd_a_span_list[2].text
            except IndexError:
                title_desc = ""

            bd_div = div.find("div", class_="info").find("div", class_="bd")

            # first line: director and cast; second line: year / country / genre
            action, publish_date = [data.replace("\xa0", "").strip() for data in bd_div.p.text.strip().split("\n")]

            # rating score and number of ratings
            span_list = bd_div.find("div", class_="star").find_all("span")

            score = span_list[1].text
            comment_num = span_list[-1].text[0:-3]  # strip the trailing "人评价"

            # one-line quote (some movies have none)
            try:
                quote = bd_div.find("p", class_="quote").span.text
            except AttributeError:
                quote = ""

            data_list.append({
                "title": title,
                "title_eg": title_eg,
                "title_desc": title_desc,
                "img_url": img_url,
                "level": level,
                "detail_url": detail_url,
                "action": action,
                "publish_date": publish_date,
                "score": score,
                "comment_num": comment_num,
                "quote": quote,
            })
            print(data_list[-1])  # log the movie just parsed
        return data_list

    def spider_index_data(self, tag_url):
        response = requests.get(url=tag_url, headers=self.headers)
        soup = self.__create_soup(page_text=response.text)
        return self.__spider_detail_data(soup=soup)

    def main(self):
        data_list_all = []
        for tag_url in self.tag_url_list:
            data_list = self.spider_index_data(tag_url=tag_url)
            data_list_all.extend(data_list)

        print(len(data_list_all))


if __name__ == '__main__':
    s = SpiderTopSoup()
    s.main()
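
main() here only prints how many movies were collected. One way to persist the rows, sketched under the assumption that a save_to_csv helper is added to this script (the helper and the output file name are not in the original):

import csv

FIELDS = ["title", "title_eg", "title_desc", "img_url", "level", "detail_url",
          "action", "publish_date", "score", "comment_num", "quote"]

def save_to_csv(data_list_all, path="douban_top250.csv"):
    # one movie per row; utf-8-sig so spreadsheet tools detect the encoding
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        writer.writeheader()
        writer.writerows(data_list_all)

main() would then call save_to_csv(data_list_all) after the loop instead of only printing the length.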


Version 2 (recommended)

import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# display labels for each scraped field
movie_dict = {
    "title": 'Title',
    "title_eg": 'English title',
    "title_desc": 'Other titles',
    "img_url": 'Poster URL',
    "level": 'Rank',
    "detail_url": 'Detail page URL',
    "action": 'Director and cast',
    "publish_date": 'Year / country / genre',
    "score": 'Score',
    "comment_num": 'Number of ratings',
    "quote": 'Quote',
}

class SpiderBase:
    def __init__(self):
        self.tag_url_list = []
        self.headers = {
            "User-Agent": UserAgent().random
        }


class SpiderTopSoup(SpiderBase):
    def __init__(self):
        super().__init__()
        self.tag_url_list = self.__create_tag_url_list()

    def __create_tag_url_list(self):
        # Top 250 is paginated 25 per page: ?start=0, 25, ..., 225
        # (start=0 serves the same page as the bare top250 URL)
        return [f"https://movie.douban.com/top250?start={i * 25}" for i in range(10)]

    def __create_soup(self, page_text):
        return BeautifulSoup(page_text, 'lxml')

    def __spider_detail_data(self, soup):
        data_list = []
        div_list = soup.find_all("div", class_="item")
        for div in div_list:
            pic_div = div.find("div", class_="pic")
            # poster image URL
            img_url = pic_div.a.img.get("src")
            # rank on the Top 250 list
            level = pic_div.em.text
            # link to the detail page
            detail_url = pic_div.a.get("href")

            bd_a_span_list = div.find("div", class_="info").find("div", class_="hd").a.find_all("span")

            try:
                title = bd_a_span_list[0].text
            except IndexError:
                title = ""
            try:
                title_eg = bd_a_span_list[1].text
            except IndexError:
                title_eg = ""
            try:
                title_desc = bd_a_span_list[2].text
            except IndexError:
                title_desc = ""

            bd_div = div.find("div", class_="info").find("div", class_="bd")

            # first line: director and cast; second line: year / country / genre
            action, publish_date = [data.replace("\xa0", "").strip() for data in bd_div.p.text.strip().split("\n")]

            # rating score and number of ratings
            span_list = bd_div.find("div", class_="star").find_all("span")

            score = span_list[1].text
            comment_num = span_list[-1].text[0:-3]  # strip the trailing "人评价"

            # one-line quote (some movies have none)
            try:
                quote = bd_div.find("p", class_="quote").span.text
            except AttributeError:
                quote = ""

            data_dict = {
                "title": title,
                "title_eg": title_eg,
                "title_desc": title_desc,
                "img_url": img_url,
                "level": level,
                "detail_url": detail_url,
                "action": action,
                "publish_date": publish_date,
                "score": score,
                "comment_num": comment_num,
                "quote": quote,
            }

            # render each field as "Label: value" using movie_dict as the label map
            lines = [f"{label}: {data_dict[key]}" for key, label in movie_dict.items()]
            data_list.extend(lines)
            print("\n".join(lines))  # log the movie just parsed
        return data_list

    def spider_index_data(self, tag_url):
        response = requests.get(url=tag_url, headers=self.headers)
        soup = self.__create_soup(page_text=response.text)
        return self.__spider_detail_data(soup=soup)

    def main(self):
        data_list_all = []
        for tag_url in self.tag_url_list:
            data_list = self.spider_index_data(tag_url=tag_url)
            data_list_all.extend(data_list)

        print(len(data_list_all))


if __name__ == '__main__':
    s = SpiderTopSoup()
    s.main()
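
Both versions request all ten pages back to back, and Douban may throttle clients that hit it too quickly. A hedged variation that spaces the page requests out; the subclass, the module name spider_top250, and the 1-second pause are all assumptions, not part of the original:

import time

from spider_top250 import SpiderTopSoup  # hypothetical module name for the script above

class PoliteSpiderTopSoup(SpiderTopSoup):
    def main(self):
        data_list_all = []
        for tag_url in self.tag_url_list:
            data_list_all.extend(self.spider_index_data(tag_url=tag_url))
            time.sleep(1)  # assumed pause between pages; tune as needed
        print(len(data_list_all))

if __name__ == '__main__':
    PoliteSpiderTopSoup().main()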