ThinkPHP爬虫驱动系统设计

发布于:2025-07-05 ⋅ 阅读:(13) ⋅ 点赞:(0)

核心思路分析

基于 ThinkPHP 的容器机制与反射,我们可以通过继承 think\Manager,构建一个与框架内置缓存、日志等驱动管理方式一致的爬虫驱动系统。

1. 数据库结构设计

爬虫驱动配置表

-- Registry of crawl drivers, one row per (site, driver) pair.
-- CrawlDriverManager reads driver_class to decide which PHP class to instantiate
-- and passes the decoded driver_config JSON to that class's constructor.
-- Only rows with status = 1 are selected by the manager's queries.
-- NOTE(review): the manager also looks rows up by driver_name alone; the only
-- secondary index leads with site_id, so that lookup is presumably not index-backed.
-- Confirm whether a separate index on driver_name is wanted once the table grows.
CREATE TABLE `crawl_drivers` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `site_id` int(11) NOT NULL COMMENT '站点ID',
    `site_name` varchar(100) NOT NULL COMMENT '站点名称',
    `driver_name` varchar(100) NOT NULL COMMENT '驱动名称(hdmoli)',
    `driver_class` varchar(255) NOT NULL COMMENT '驱动类路径',
    `driver_config` text COMMENT '驱动配置(JSON)',
    `status` tinyint(1) DEFAULT 1 COMMENT '状态',
    `created_at` timestamp DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`id`),
    -- A driver name may repeat across sites, but not within one site.
    UNIQUE KEY `uk_site_driver` (`site_id`, `driver_name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

初始化数据

-- Seed row: registers the Hdmoli driver for site 1 with a 30 s timeout and 3 retries.
-- The doubled backslashes are string-literal escapes (assuming MySQL's default
-- backslash-escape mode), so the stored driver_class is the plain namespaced class name.
INSERT INTO `crawl_drivers` (`site_id`, `site_name`, `driver_name`, `driver_class`, `driver_config`) VALUES
(1, 'Hdmoli', 'hdmoli', 'app\\services\\docksite\\drivers\\HdmoliDriver', '{"timeout": 30, "retry": 3}');

2. 爬虫驱动管理器

<?php
namespace app\services\docksite;

use think\Manager;
use ReflectionClass;
use InvalidArgumentException;

/**
 * Resolves crawl drivers from the `crawl_drivers` table.
 *
 * Each row names a driver, the class that implements it and a JSON config.
 * Instances are cached in the inherited $drivers array so a driver is built
 * at most once per manager instance.
 */
class CrawlDriverManager extends Manager
{
    /**
     * Namespace prefix of the bundled driver classes.
     *
     * Not consulted by createDriver() in this class, which takes the fully
     * qualified class name from the database row instead.
     *
     * @var string
     */
    protected $namespace = 'app\\services\\docksite\\drivers\\';

    /**
     * Resolve a driver by its registered name.
     *
     * @param string|null $name Value of `crawl_drivers.driver_name`; falls back to getDefaultDriver().
     * @return CrawlDriverInterface
     * @throws InvalidArgumentException When no name is given and there is no default,
     *                                  or when the driver cannot be created.
     */
    public function driver(?string $name = null)
    {
        $name = $name ?: $this->getDefaultDriver();

        if (is_null($name)) {
            throw new InvalidArgumentException('无法解析爬虫驱动');
        }

        return $this->drivers[$name] = $this->getDriver($name);
    }

    /**
     * Build the driver registered under the given name.
     *
     * @param string $name Value of `crawl_drivers.driver_name`.
     * @return CrawlDriverInterface
     * @throws InvalidArgumentException When no enabled row exists for the name or the row is unusable.
     */
    protected function createDriver(string $name)
    {
        $driverInfo = $this->getDriverConfig($name);

        if (!$driverInfo) {
            throw new InvalidArgumentException("驱动 [{$name}] 不存在");
        }

        return $this->buildDriver($driverInfo);
    }

    /**
     * Fetch the enabled `crawl_drivers` row for a driver name.
     *
     * @param string $name Value of `crawl_drivers.driver_name`.
     * @return mixed The matching row, or a falsy value when there is none.
     */
    protected function getDriverConfig(string $name)
    {
        return $this->app->db->name('crawl_drivers')
            ->where('driver_name', $name)
            ->where('status', 1)
            ->find();
    }

    /**
     * Resolve the driver configured for a site.
     *
     * The driver is built from the row fetched here rather than looked up again
     * by name: the table's unique key is (site_id, driver_name), so a second
     * name-only query could return another site's class and config. For the
     * same reason the instance is cached under a key that includes the site id.
     *
     * NOTE(review): if a site has several enabled rows, which one find() returns
     * is left to the database — confirm whether an explicit ordering is wanted.
     *
     * @param int $siteId Value of `crawl_drivers.site_id`.
     * @return CrawlDriverInterface
     * @throws InvalidArgumentException When the site has no enabled driver or the row is unusable.
     */
    public function getBySiteId(int $siteId)
    {
        $driverInfo = $this->app->db->name('crawl_drivers')
            ->where('site_id', $siteId)
            ->where('status', 1)
            ->find();

        if (!$driverInfo) {
            throw new InvalidArgumentException("站点 [{$siteId}] 未配置爬虫驱动");
        }

        $cacheKey = $driverInfo['driver_name'] . '@site:' . $siteId;

        if (!isset($this->drivers[$cacheKey])) {
            $this->drivers[$cacheKey] = $this->buildDriver($driverInfo);
        }

        return $this->drivers[$cacheKey];
    }

    /**
     * There is no default driver; callers must always name one.
     *
     * @return null
     */
    public function getDefaultDriver()
    {
        return null;
    }

    /**
     * Validate a `crawl_drivers` row and instantiate the class it names.
     *
     * @param mixed $driverInfo Row with at least `driver_class` and optionally `driver_config`.
     * @return CrawlDriverInterface
     * @throws InvalidArgumentException When the class is missing, is not a usable
     *                                  CrawlDriverInterface implementation, or the config is malformed.
     */
    private function buildDriver($driverInfo)
    {
        $driverClass = $driverInfo['driver_class'];

        if (!class_exists($driverClass)) {
            throw new InvalidArgumentException("驱动类 [{$driverClass}] 不存在");
        }

        $reflect = new ReflectionClass($driverClass);

        if (!$reflect->implementsInterface(CrawlDriverInterface::class)) {
            throw new InvalidArgumentException("驱动类必须实现 CrawlDriverInterface 接口");
        }

        // An abstract class can satisfy the interface check but cannot be constructed.
        if (!$reflect->isInstantiable()) {
            throw new InvalidArgumentException("驱动类 [{$driverClass}] 无法实例化");
        }

        $config = $this->decodeConfig($driverInfo['driver_config'] ?? null, $driverClass);

        // Instantiate through the container so constructor dependencies can be injected.
        return $this->app->invokeClass($driverClass, [$config]);
    }

    /**
     * Turn the stored `driver_config` value into an array.
     *
     * A NULL or blank column means "no overrides". Anything that does not decode
     * to an array is rejected here, because drivers declare `array $config` and
     * would otherwise fail with an unrelated TypeError.
     *
     * @param mixed  $rawConfig   Column value as returned by the query.
     * @param string $driverClass Class name, used only in the error message.
     * @return array
     * @throws InvalidArgumentException When the value is not a JSON object/array.
     */
    private function decodeConfig($rawConfig, string $driverClass): array
    {
        if (is_array($rawConfig)) {
            return $rawConfig;
        }

        if ($rawConfig === null || trim((string) $rawConfig) === '') {
            return [];
        }

        $config = json_decode((string) $rawConfig, true);

        if (!is_array($config)) {
            throw new InvalidArgumentException("驱动类 [{$driverClass}] 的配置不是合法的 JSON 对象");
        }

        return $config;
    }
}

3. 爬虫驱动接口

<?php
namespace app\services\docksite;

/**
 * Contract every site-specific crawl driver must fulfil.
 *
 * CrawlDriverManager refuses to instantiate a class that does not implement it.
 */
interface CrawlDriverInterface
{
    /**
     * Fetch the entries of a list page.
     *
     * @param string $url    Address of the list page to crawl.
     * @param int    $siteId Identifier of the site the page belongs to.
     * @return array Extracted entries.
     */
    public function getListData(string $url, int $siteId): array;
    
    /**
     * Fetch the entries updated today.
     *
     * @param string $url    Address of the page listing today's updates.
     * @param int    $siteId Identifier of the site the page belongs to.
     * @return array Extracted entries.
     */
    public function getTodayData(string $url, int $siteId): array;
    
    /**
     * Fetch the data of a detail page.
     *
     * @param string $url    Address of the detail page to crawl.
     * @param int    $siteId Identifier of the site the page belongs to.
     * @return array Extracted data.
     */
    public function getDetailData(string $url, int $siteId): array;
    
    /**
     * Fetch all film/TV data of a site.
     *
     * Note the argument order: the site id comes first here, unlike the other methods.
     *
     * @param int    $siteId  Identifier of the site to crawl.
     * @param string $siteUrl Address of the site.
     * @return bool Outcome flag; its exact meaning is left to the implementation.
     */
    public function getAllFilms(int $siteId, string $siteUrl): bool;
}

4. Hdmoli驱动实现

<?php
namespace app\services\docksite\drivers;

use QL\QueryList;
use app\services\docksite\CrawlDriverInterface;
use app\services\docksite\shequ\BaseService;

/**
 * Crawl driver for the Hdmoli site.
 *
 * Each public method pairs a set of QueryList extraction rules with a page
 * type (list, today's updates, detail) and tags every extracted row with the
 * site id.
 */
class HdmoliDriver extends BaseService implements CrawlDriverInterface
{
    /**
     * Effective settings: built-in defaults overlaid with the supplied config.
     *
     * NOTE(review): nothing in this class reads these values yet; makeRequest()
     * does not forward timeout, retry or user_agent — confirm whether it should.
     *
     * @var array
     */
    protected $config;

    /**
     * @param array $config Overrides for the default timeout, retry and user_agent settings.
     */
    public function __construct(array $config = [])
    {
        // NOTE(review): the parent constructor is not called — confirm BaseService needs none.
        $this->config = array_merge([
            'timeout' => 30,
            'retry' => 3,
            'user_agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        ], $config);
    }

    /**
     * Fetch the entries of a list page.
     *
     * @param string $url    Address of the list page.
     * @param int    $siteId Site id stamped onto every row.
     * @return array Rows with `href`, `title` and `site_id`.
     */
    public function getListData(string $url, int $siteId): array
    {
        return $this->crawlData($url, $this->getListRules(), $siteId);
    }

    /**
     * Fetch the entries updated today.
     *
     * @param string $url    Address of the page listing today's updates.
     * @param int    $siteId Site id stamped onto every row.
     * @return array Rows with `href`, `title` and `site_id`.
     */
    public function getTodayData(string $url, int $siteId): array
    {
        return $this->crawlTodayData($url, $this->getTodayRules(), $siteId);
    }

    /**
     * Fetch the data of a detail page.
     *
     * @param string $url    Address of the detail page.
     * @param int    $siteId Site id stamped onto every row.
     * @return array Rows with `title`, `content` and `site_id`.
     */
    public function getDetailData(string $url, int $siteId): array
    {
        return $this->crawlData($url, $this->getDetailRules(), $siteId);
    }

    /**
     * Fetch all film/TV data of a site.
     *
     * TODO: port the original getFilms logic. This is currently a stub that
     * reports success without crawling anything.
     *
     * @param int    $siteId  Identifier of the site.
     * @param string $siteUrl Address of the site.
     * @return bool Always true for now.
     */
    public function getAllFilms(int $siteId, string $siteUrl): bool
    {
        return true;
    }

    /**
     * Download a page and extract rows from it with the given rules.
     *
     * @param string $url    Page to download.
     * @param array  $rules  Array with `selectors` (QueryList rules) and `range` (row selector).
     * @param int    $siteId Site id stamped onto every row.
     * @return array Extracted rows; empty when the response body is empty.
     */
    protected function crawlData(string $url, array $rules, int $siteId): array
    {
        return $this->extract($url, $rules, $siteId);
    }

    /**
     * Variant of crawlData() for the "today" page.
     *
     * Every `div.module-item` is first wrapped in a `div.goods-item` element so
     * that the range selector from getTodayRules() has one container per entry.
     *
     * @param string $url    Page to download.
     * @param array  $rules  Array with `selectors` (QueryList rules) and `range` (row selector).
     * @param int    $siteId Site id stamped onto every row.
     * @return array Extracted rows; empty when the response body is empty.
     */
    protected function crawlTodayData(string $url, array $rules, int $siteId): array
    {
        return $this->extract($url, $rules, $siteId, static function (QueryList $ql): void {
            $ql->find('div.module-item')->wrap("<div class='goods-item'></div>");
        });
    }

    /**
     * Download a page and return its body.
     *
     * @param string $url Page to download.
     * @return string Response body as returned by the inherited curl_request() helper.
     */
    protected function makeRequest(string $url): string
    {
        // Delegates to the existing curl_request helper; replace here to change the HTTP client.
        return static::curl_request($url);
    }

    /**
     * Extraction rules for list pages.
     *
     * @return array Array with `selectors` and `range`.
     */
    protected function getListRules(): array
    {
        return [
            'selectors' => [
                'href' => ['a.stui-vodlist__thumb', 'href'],
                'title' => ['a.stui-vodlist__thumb', 'attr(title)']
            ],
            'range' => 'ul.stui-vodlist>li'
        ];
    }

    /**
     * Extraction rules for the "today" page.
     *
     * The range targets the `div.goods-item` wrappers that crawlTodayData() inserts.
     *
     * @return array Array with `selectors` and `range`.
     */
    protected function getTodayRules(): array
    {
        return [
            'selectors' => [
                'href' => ['a.module-card-item-poster', 'href'],
                'title' => ['div.module-card-item-title>a>strong', 'text']
            ],
            'range' => 'div.goods-item'
        ];
    }

    /**
     * Extraction rules for detail pages.
     *
     * @return array Array with `selectors` and `range`.
     */
    protected function getDetailRules(): array
    {
        return [
            'selectors' => [
                'title' => ['.title', 'text'],
                'content' => ['.content', 'text']
            ],
            'range' => '.detail-container'
        ];
    }

    /**
     * Shared extraction pipeline behind crawlData() and crawlTodayData().
     *
     * A fresh QueryList object is used per call instead of the process-wide
     * singleton, so DOM edits made by $prepare and the loaded document cannot
     * leak into a later crawl, and the object is released once the rows have
     * been copied out.
     *
     * @param string        $url     Page to download.
     * @param array         $rules   Array with `selectors` and `range`.
     * @param int           $siteId  Site id stamped onto every row.
     * @param callable|null $prepare Optional hook that may rewrite the document before extraction.
     * @return array Extracted rows; empty when the response body is empty.
     */
    private function extract(string $url, array $rules, int $siteId, ?callable $prepare = null): array
    {
        $html = $this->makeRequest($url);

        // Nothing to parse: a failed or empty download yields no rows.
        if (trim($html) === '') {
            return [];
        }

        $ql = QueryList::html($html);

        try {
            if ($prepare !== null) {
                $prepare($ql);
            }

            return $ql->rules($rules['selectors'])
                ->range($rules['range'])
                ->query()
                ->getData(static function ($item) use ($siteId) {
                    $item['site_id'] = $siteId;
                    return $item;
                })
                ->all();
        } finally {
            $ql->destruct();
        }
    }
}

5. 重构后的HdmoliService

<?php
namespace app\services\docksite\shequ;

use app\services\docksite\CrawlDriverManager;

/**
 * Static facade kept for existing callers; every call is forwarded to the
 * crawl driver configured for the given site.
 */
class HdmoliService extends BaseService
{
    /**
     * Fetch list-page data through the site's driver.
     *
     * @param mixed $url    Address of the list page.
     * @param mixed $siteId Identifier of the site.
     * @return array
     */
    public static function getAsyncData($url, $siteId)
    {
        return self::driverFor($siteId)->getListData($url, $siteId);
    }

    /**
     * Fetch today's updates through the site's driver.
     *
     * @param mixed $url    Address of the page listing today's updates.
     * @param mixed $siteId Identifier of the site.
     * @return array
     */
    public static function getAsyncTodayData($url, $siteId)
    {
        return self::driverFor($siteId)->getTodayData($url, $siteId);
    }

    /**
     * Fetch all film/TV data through the site's driver.
     *
     * @param mixed $siteId  Identifier of the site.
     * @param mixed $siteUrl Address of the site.
     * @return bool
     */
    public static function getFilms($siteId, $siteUrl)
    {
        return self::driverFor($siteId)->getAllFilms($siteId, $siteUrl);
    }

    /**
     * Obtain the manager from the container and ask it for the site's driver.
     *
     * @param mixed $siteId Identifier of the site.
     * @return \app\services\docksite\CrawlDriverInterface
     */
    private static function driverFor($siteId)
    {
        /** @var CrawlDriverManager $manager */
        $manager = app()->make(CrawlDriverManager::class);

        return $manager->getBySiteId($siteId);
    }
}

6. 服务注册

在 app/provider.php 中注册容器绑定(ThinkPHP 6 的容器绑定文件位于 app 目录,而不是 config 目录):

return [
    // ... other bindings
    // Entries are alias => class pairs; a bare class name would be bound under
    // a numeric index and could not be resolved by any meaningful identifier.
    // With this alias the manager is available as app('crawl_driver').
    'crawl_driver' => \app\services\docksite\CrawlDriverManager::class,
];

优势总结

  1. 解耦合: 爬虫逻辑与具体实现分离
  2. 可扩展: 新增站点只需实现接口并配置数据库
  3. 动态配置: 通过数据库动态管理驱动配置
  4. 依赖注入: 利用 ThinkPHP 容器自动注入依赖
  5. 反射机制: 运行时动态加载和验证驱动类
  6. 统一管理: 类似框架内置的缓存、数据库等管理方式

这种设计充分利用了您代码仓库中的 ThinkPHP 框架特性,实现了高度可配置和可扩展的爬虫驱动系统。