This project uses the easyswoole spider component to scrape blog articles on a schedule. It is for learning and technical exchange only, not for real-world use; if it causes any problems, please contact me promptly.
easyswoole spider component address: click to view
Project plan
Build a scheduled crawler that periodically fetches articles from a well-known developer's blog (the blog URL can be seen in the code) and publishes them to my own blog. After each crawl, the newest article is recorded to a file, so the next crawl only needs to fetch articles up to the one recorded last time.
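Before touching the framework, the core idea above can be summed up in a short sketch. This is not the component's API, just a plain-PHP illustration; fetchTitles is a hypothetical callable that returns article titles from newest to oldest:
<?php
// Sketch of the incremental crawl: stop at the title recorded by the previous run,
// then record this run's newest title as the next stop marker.
function crawlNewTitles(string $logFile, callable $fetchTitles): array
{
    $lastTitle = is_file($logFile) ? file_get_contents($logFile) : '';
    $newTitles = [];
    foreach ($fetchTitles() as $title) {
        if ($title === $lastTitle) {
            break; // everything from here on was already crawled last time
        }
        $newTitles[] = $title;
    }
    if (!empty($newTitles)) {
        file_put_contents($logFile, $newTitles[0]); // newest title becomes the next stop marker
    }
    return $newTitles;
}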
1. Project setup
Install the easyswoole framework
composer require easyswoole/easyswoole=3.x
php vendor/easyswoole/easyswoole/bin/easyswoole install
Install the spider component
composer require easyswoole/spider
2. Official component usage notes
Quick start
Take Baidu search as an example: for each search keyword, crawl specific data from the first few pages of results.
Product
<?php
namespace App\Spider;
use EasySwoole\HttpClient\HttpClient;
use EasySwoole\Spider\Config\ProductConfig;
use EasySwoole\Spider\Hole\ProductAbstract;
use EasySwoole\Spider\ProductResult;
use QL\QueryList;
use EasySwoole\FastCache\Cache;
class ProductTest extends ProductAbstract
{
public function product():ProductResult
{
// TODO: Implement product() method.
// Fetch the page for this job's URL
$httpClient = new HttpClient($this->productConfig->getUrl());
$httpClient->setHeader('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36');
$body = $httpClient->get()->getBody();
// First grab the <a> tag content of each search result
$rules = [
'search_result' => ['.c-container .t', 'text', 'a']
];
$searchResult = QueryList::rules($rules)->html($body)->query()->getData();
$data = [];
foreach ($searchResult as $result) {
$item = [
'href' => QueryList::html($result['search_result'])->find('a')->attr('href'),
'text' => QueryList::html($result['search_result'])->find('a')->text()
];
$data[] = $item;
}
$productJobOtherInfo = $this->productConfig->getOtherInfo();
// Next batch of jobs
$productJobConfigs = [];
if ($productJobOtherInfo['page'] === 1) {
for($i=1;$i<5;$i++) {
$pn = $i*10;
$productJobConfig = [
'url' => "https://www.baidu.com/s?wd={$productJobOtherInfo['word']}&pn={$pn}",
'otherInfo' => [
'word' => $productJobOtherInfo['word'],
'page' => $i+1
]
];
$productJobConfigs[] = $productJobConfig;
}
$word = Cache::getInstance()->deQueue('SEARCH_WORDS'); // pop the next keyword using the same queue key as the dispatch code
if (!empty($word)) {
$productJobConfigs[] = [
'url' => "https://www.baidu.com/s?wd={$word}&pn=0",
'otherInfo' => [
'word' => $word,
'page' => 1
]
];
}
}
$result = new ProductResult();
$result->setProductJobConfigs($productJobConfigs)->setConsumeData($data);
return $result;
}
}
Consume
<?php
namespace App\Spider;
use EasySwoole\Spider\ConsumeJob;
use EasySwoole\Spider\Hole\ConsumeAbstract;
class ConsumeTest extends ConsumeAbstract
{
public function consume()
{
// TODO: Implement consume() method.
$data = $this->getJobData();
$items = '';
foreach ($data as $item) {
$items .= implode("\t", $item)."\n";
}
file_put_contents('baidu.txt', $items, FILE_APPEND);
}
}
Register the spider component
public static function mainServerCreate(EventRegister $register)
{
$spiderConfig = [
'product' => ProductTest::class, // required
'consume' => ConsumeTest::class, // required
'queueType' => SpiderConfig::QUEUE_TYPE_FAST_CACHE, // queue type; the default fast-cache does not support distributed crawling, use SpiderConfig::QUEUE_TYPE_REDIS or implement your own queue if you need that
'queue' => 'custom queue; not needed when using the built-in one', // custom communication queue
'queueConfig' => 'custom queue config; currently only needed for SpiderConfig::QUEUE_TYPE_REDIS',
'maxCurrency' => 128 // maximum coroutine concurrency (per machine)
];
SpiderServer::getInstance()
->setSpiderConfig($spiderConfig)
->attachProcess(ServerManager::getInstance()->getSwooleServer());
}
Dispatch a job
$words = [
'php',
'java',
'go'
];
foreach ($words as $word) {
Cache::getInstance()->enQueue('SEARCH_WORDS', $word);
}
$wd = Cache::getInstance()->deQueue('SEARCH_WORDS');
SpiderClient::getInstance()->addJob(
'https://www.baidu.com/s?wd=php&pn=0',
[
'page' => 1,
'word' => $wd
]
);
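The URL and the otherInfo array passed to SpiderClient::addJob() become the job's ProductConfig: inside product() they are read back with $this->productConfig->getUrl() and $this->productConfig->getOtherInfo(), which is how each job knows its keyword and page number.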
Below is my own implementation, in two versions.
bat1.0
After reading the official easyswoole introduction, some readers may still be unsure where job dispatch actually starts, or may run into problems installing the component and running the example. Follow along below to build your own crawler.
If you see an https-related error, refer to my earlier article on fixing the "Enable openssl support, require openssl library" error when installing Swoole.
Assuming the framework and the spider component are installed, your directory structure should look like this:
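The relevant files, inferred from the class namespaces used below, are roughly:
App/Crontab/TaskOne.php
App/HttpController/Index.php
App/Spider/ConsumeTest.php
App/Spider/ProductTest.php
EasySwooleEvent.php (project root)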
App/Spider/ProductTest.php is the class that produces (publishes) the crawl jobs
App/Spider/ConsumeTest.php is the class where we process the crawled results
Let's walk through the files involved in the order the program runs.
1. EasySwooleEvent.php registers the spider component and the other components
<?php
namespace EasySwoole\EasySwoole;
use App\Crontab\TaskOne;
use App\Spider\ConsumeTest;
use App\Spider\ProductTest;
use EasySwoole\EasySwoole\Crontab\Crontab;
use EasySwoole\EasySwoole\Swoole\EventRegister;
use EasySwoole\EasySwoole\AbstractInterface\Event;
use EasySwoole\Http\Request;
use EasySwoole\Http\Response;
use EasySwoole\ORM\Db\Connection;
use EasySwoole\ORM\DbManager;
use EasySwoole\Spider\Config\SpiderConfig;
use EasySwoole\Spider\SpiderServer;
class EasySwooleEvent implements Event
{
public static function initialize()
{
// TODO: Implement initialize() method.
date_default_timezone_set('Asia/Shanghai');
//Load the MYSQL config from dev.php or produce.php and register the MySQL connection pool
$config = new \EasySwoole\ORM\Db\Config(Config::getInstance()->getConf('MYSQL'));
$config->setMaxObjectNum(20); //maximum number of pooled connections
DbManager::getInstance()->addConnection(new Connection($config));
}
public static function mainServerCreate(EventRegister $register)
{
// TODO: Implement mainServerCreate() method.
//Register the spider component
$spiderConfig = [
'product' => ProductTest::class, // required
'consume' => ConsumeTest::class, // required
'queueType' => SpiderConfig::QUEUE_TYPE_FAST_CACHE, // queue type; the default fast-cache does not support distributed crawling, use SpiderConfig::QUEUE_TYPE_REDIS or implement your own queue if you need that
//'queue' => 'custom queue; not needed when using the built-in one', // custom communication queue
//'queueConfig' => 'custom queue config; currently only needed for SpiderConfig::QUEUE_TYPE_REDIS',
'maxCurrency' => 128 // maximum coroutine concurrency (per machine)
];
SpiderServer::getInstance()
->setSpiderConfig($spiderConfig)
->attachProcess(ServerManager::getInstance()->getSwooleServer());
//Add a cron task that periodically pulls new articles from the baijunyao blog
Crontab::getInstance()->addTask(TaskOne::class);
}
public static function onRequest(Request $request, Response $response): bool
{
// TODO: Implement onRequest() method.
return true;
}
public static function afterRequest(Request $request, Response $response): void
{
// TODO: Implement afterRequest() method.
}
}
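For reference, here is a minimal sketch of the MYSQL block that initialize() reads from dev.php (or produce.php). The field names are assumed to match \EasySwoole\ORM\Db\Config and the values are placeholders, so adjust them to your environment:
<?php
// dev.php (excerpt): only the MYSQL section used above is shown
return [
    'MYSQL' => [
        'host'     => '127.0.0.1',
        'port'     => 3306,
        'user'     => 'root',
        'password' => 'your_password',
        'database' => 'your_blog_db',
        'timeout'  => 5,
        'charset'  => 'utf8mb4',
    ],
];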
2. App/HttpController/Index.php — the spider method was my initial entry point for testing: once the server is running, visiting the spider action in the browser runs the crawl once.
<?php
namespace App\HttpController;
use App\Model\Metas;
use EasySwoole\FastCache\Cache;
use EasySwoole\Http\AbstractInterface\Controller;
use EasySwoole\Spider\SpiderClient;
class Index extends Controller
{
public function test(){
$metasModel = new Metas();
$list = $metasModel->where('type', 'category', '=')->indexBy('mid');
$title = "Docker 入门教程(七)Dockerfile";
$mid =0;
foreach($list as $key=>$value){
$title = strtolower($title);
if (strpos($title, strtolower($value['name'])) !== false) {
$mid = $key;
break ;
}
}
var_dump($mid);
}
public function index()
{
$words = [
'php',
'java',
'go'
];
foreach ($words as $word) {
Cache::getInstance()->enQueue('SEARCH_WORDS', $word);
}
$wd = Cache::getInstance()->deQueue('SEARCH_WORDS');
SpiderClient::getInstance()->addJob(
'https://www.baidu.com/s?wd=php&pn=0',
[
'page' => 1,
'word' => $wd
]
);
// $file = EASYSWOOLE_ROOT.'/vendor/easyswoole/easyswoole/src/Resource/Http/welcome.html';
// if(!is_file($file)){
// $file = EASYSWOOLE_ROOT.'/src/Resource/Http/welcome.html';
// }
// $this->response()->write(file_get_contents($file));
}
protected function actionNotFound(?string $action)
{
$this->response()->withStatus(404);
$file = EASYSWOOLE_ROOT.'/vendor/easyswoole/easyswoole/src/Resource/Http/404.html';
if(!is_file($file)){
$file = EASYSWOOLE_ROOT.'/src/Resource/Http/404.html';
}
$this->response()->write(file_get_contents($file));
}
public function spider(){
//Read the most recently recorded article title
$dir = EASYSWOOLE_ROOT . "/public";
$file = $dir . "/title.log";
if (!is_dir($dir)) {
mkdir($dir, 0777, true);
}
if (!file_exists($file)) {
$title = "";
} else {
$title = file_get_contents($file);
}
SpiderClient::getInstance()->addJob(
'https://baijunyao.com/?page=0',
[
'page' => 1,
'title' => $title,
'first_title' => '',
]
);
}
}
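With easyswoole's default routing and default HTTP port (9501), visiting a URL such as http://127.0.0.1:9501/index/spider triggers one crawl; adjust the host and port to your own configuration.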
3. App/Spider/ProductTest.php fetches the articles and sends the assembled data to the Consume class
<?php
namespace App\Spider;
use EasySwoole\FastCache\Cache;
use EasySwoole\HttpClient\HttpClient;
use EasySwoole\Spider\Hole\ProductAbstract;
use EasySwoole\Spider\ProductResult;
use QL\QueryList;
class ProductTest extends ProductAbstract
{
public function product(): ProductResult
{
// TODO: Implement product() method.
//Fetch the page data for this job
$preUrl = "https://baijunyao.com";
//URL of the current job
$httpClient = new HttpClient($this->productConfig->getUrl());
$httpClient->setHeader('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36');
$body = $httpClient->get()->getBody();
$rules = [
'title' => ['h3 .b-oa-title', 'text'],
'href' => ['h3 .b-oa-title', 'href'],
'img' => ['.row .bjy-lazyload', 'data-src']
];
$searchResult = QueryList::rules($rules)->html($body)->range(".b-one-article")->query()->getData()->all();
$data = [];
foreach ($searchResult as $result) {
//Fetch the article content and images from the article link
$httpClient2 = new HttpClient($result['href']);
$httpClient2->setHeader('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36');
$body = $httpClient2->get()->getBody();
$eles = QueryList::html($body)->find('.js-content');
$eles->find("p:last")->remove();
$content = $eles->html();
//preg_replace("/^\/upload\//", $preUrl . "/upload/", $content);
$item = [
'href' => $result['href'],
'text' => $result['title'],
'img' => $preUrl . $result['img'],
'content' => str_replace("/uploads/article/", $preUrl . "/uploads/article/", $content),
];
$data[] = $item;
}
$productJobOtherInfo = $this->productConfig->getOtherInfo();
//Next batch of jobs
$productJobConfigs = [];
//The latest article title recorded by the previous crawl
$title = $productJobOtherInfo['title'];
$page = $productJobOtherInfo['page']; //page number
$firstTitle = $productJobOtherInfo['first_title']; //title of the first (newest) article of this crawl
//Once the previously recorded article is reached, stop crawling further pages.
$ifGetLogTitle = false;
//Data to send to the consumer; initialize 'list' so the count() checks below are always safe
$consumeData = ['list' => []];
foreach($data as $key => $value){
if ($value['text'] == $title) {
$ifGetLogTitle = true;
if ($key ==0 ) $consumeData['list'] = [];
break;
} else {
$consumeData['list'][] = $value;
}
}
if ($page == 1) {
if(count($consumeData['list']) > 0 ) {
//There are new articles to fetch
//On the first page, record the newest article title
$firstTitle = $consumeData['list'][0]['text'];
if (!$ifGetLogTitle) {
//Queue the next page as a follow-up job
$productJobConfigs[] = [
'url' => 'https://baijunyao.com/?page=' . ($page+1),
'otherInfo' => [
'page' => $page+1,
'first_title' => $firstTitle,
'title' => $title,
]
];
}
$consumeData['first_title'] = $firstTitle;
}
} else {
if (count($consumeData['list']) > 0 ) {
//Not the first page
if (!$ifGetLogTitle) {
//Queue the next page as a follow-up job
$productJobConfigs[] = [
'url' => 'https://baijunyao.com/?page=' . ($page+1),
'otherInfo' => [
'page' => $page+1,
'title' => $title,
'first_title' => $firstTitle,
]
];
}
$consumeData['first_title'] = $firstTitle;
}
}
$result = new ProductResult();
$result->setProductJobConfigs($productJobConfigs)->setConsumeData($consumeData);
return $result;
}
}
4. App/Spider/ConsumeTest.php saves the crawled article data into my database
<?php
namespace App\Spider;
use App\Model\Contents;
use App\Model\Metas;
use App\Model\Relationships;
use EasySwoole\ORM\DbManager;
use EasySwoole\Spider\Hole\ConsumeAbstract;
class ConsumeTest extends ConsumeAbstract
{
public function consume()
{
// TODO: Implement consume() method.
$data = $this->getJobData();
$list = $data['list'];
$insertData = [];
if (count($list) >0 ) {
foreach($list as $value) {
$insertData[] = [
'title' => $value['text'],
'text' => $value['content'],
'authorId' => 1,
'type' => 'post',
'status' => 'publish',
'allowComment' => 1,
'allowPing' =>1,
'allowFeed' => 1,
];
}
if (count($insertData) > 0 ) {
//Fetch all category metas
$metasModel = new Metas();
$metas = $metasModel->where('type', 'category', '=')->all();
//Insert everything inside one transaction
try {
DbManager::getInstance()->startTransaction();
foreach($insertData as $value) {
$mid = 1;
$title = strtolower($value['title']);
foreach ($metas as $v) {
if (strpos($title, strtolower($v['name'])) !== false) {
$mid = $v['mid'];
break ;
}
}
$model = new Contents();
$model2 = new Relationships();
$id = $model->data($value)->save();
$relation = [
'cid' => $id,
'mid' => $mid,
];
$model->update(['slug' => $id]);
$model2->data($relation)->save();
}
//Commit only if every insert succeeded
DbManager::getInstance()->commit();
//Record the newest article title so the next crawl knows where to stop
if (!empty($data['first_title'])) {
file_put_contents(EASYSWOOLE_ROOT . '/public/title.log', $data['first_title']);
}
} catch (\Throwable $t) {
DbManager::getInstance()->rollback();
file_put_contents(EASYSWOOLE_ROOT . '/public/err.log', $t->getMessage());
}
}
}
}
}
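The App\Model classes used above (Contents, Metas, Relationships) are not shown in this post. Below is a minimal sketch of one of them with easyswoole/orm; the table name is an assumption, so point it at your own blog's tables, and the other two models follow the same pattern in their own files:
<?php
// App/Model/Contents.php (sketch)
namespace App\Model;
use EasySwoole\ORM\AbstractModel;
class Contents extends AbstractModel
{
    // Table name is assumed here; change it to match your database
    protected $tableName = 'contents';
}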
Since this is a crawler, I shouldn't have to run it by hand every time, so I turned to easyswoole's scheduled tasks, which gives us the bat2.0 version below.
bat2.0
1. A cron task is registered in EasySwooleEvent.php; the task itself is implemented in App/Crontab/TaskOne.php
<?php
namespace App\Crontab;
use EasySwoole\EasySwoole\Crontab\AbstractCronTask;
use EasySwoole\Spider\SpiderClient;
class TaskOne extends AbstractCronTask
{
public static function getRule(): string
{
// TODO: Implement getRule() method.
return '*/2 * * * *';
}
public static function getTaskName(): string
{
// TODO: Implement getTaskName() method.
return 'taskOne';
}
function run(int $taskId, int $workerIndex)
{
// TODO: Implement run() method.
// Scheduled task logic
//Read the most recently recorded article title
$dir = EASYSWOOLE_ROOT . "/public";
$file = $dir . "/title.log";
if (!is_dir($dir)) {
mkdir($dir, 0777, true);
}
if (!file_exists($file)) {
$title = "";
} else {
$title = file_get_contents($file);
}
SpiderClient::getInstance()->addJob(
'https://baijunyao.com/?page=0',
[
'page' => 1,
'title' => $title,
'first_title' => '',
]
);
}
function onException(\Throwable $throwable, int $taskId, int $workerIndex)
{
// TODO: Implement onException() method.
echo $throwable->getMessage();
}
}
A crawl now runs every two minutes.
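The schedule is controlled entirely by getRule(), which takes a standard crontab expression. For example, to crawl once an hour instead, the method could return:
public static function getRule(): string
{
    // Run at minute 0 of every hour instead of every two minutes
    return '0 * * * *';
}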