Hyperf + QueryList 爬取三次元网站,仅供交流学习,请勿用于商业用途。
安装及使用
cosplay网站爬虫
QuerylistController.php
<?php
declare(strict_types=1);
/**
* This file is part of Hyperf.
*
* @link https://www.hyperf.io
* @document https://hyperf.wiki
* @contact group@hyperf.io
* @license https://github.com/hyperf/hyperf/blob/master/LICENSE
*/
namespace App\Controller;
use QL\QueryList;
use Hyperf\DbConnection\Db;
/**
 * Controller shell for the QueryList-based crawler.
 *
 * The body is a placeholder in this listing: it contains no methods of its own.
 */
class QuerylistController extends AbstractController
{
// Methods go here.
}
次元岛
/**
 * Crawl photo galleries from ciyuandao.com and store them in the "news" table.
 *
 * List pages 12 down to 1 are fetched in turn. For every list entry the detail
 * page is downloaded, its first ".padding10" element is taken as the article
 * body (with ".font14" and "h1" nodes removed), and entries whose body is
 * non-empty are bulk-inserted, one insert per list page. The result of each
 * insert is printed with var_dump().
 *
 * @return int Elapsed wall-clock time of the whole run, in seconds.
 */
public function cosplaycyd()
{
    $startedAt = time();

    // Field extraction rules applied inside every list slice.
    $fieldRules = [
        // Article title
        'title' => ['p>a', 'text'],
        // Relative link to the detail page
        'link' => ['p>a', 'href'],
        // Thumbnail URL
        'covermore' => ['a>img', 'src'],
    ];
    // Selector that cuts the list page into one slice per gallery.
    $sliceSelector = 'ul .fleft';

    $page = 12;
    while ($page >= 1) {
        $entries = QueryList::get("http://ciyuandao.com/photo/list/0-5-" . $page)
            ->rules($fieldRules)
            ->range($sliceSelector)
            ->query()
            ->getData();

        $rows = [];
        foreach ($entries as $entry) {
            // The list page only carries site-relative links.
            $detailPage = QueryList::get('http://ciyuandao.com' . $entry['link']);

            // The link is only needed for the fetch; it is not stored.
            unset($entry['link']);
            $entry['cover'] = $entry['covermore'];
            $entry['newstime'] = "";
            $entry['author'] = "次元岛";
            $entry['newstype'] = 31;
            $entry['create_time'] = time();
            $entry['update_time'] = time();

            // Article body: first ".padding10" block without its caption/heading nodes.
            $body = $detailPage->find('.padding10:eq(0)');
            $body->find('.font14,h1')->remove();
            $entry['content'] = $body->html();

            if (empty($entry['content'])) {
                continue;
            }
            $rows[] = $entry;
        }

        var_dump(Db::table("news")->insert($rows));
        $page--;
    }

    return time() - $startedAt;
}
获取数据:
推次元
/**
 * Crawl cosplay galleries from t2cy.com and store them in the "news" table.
 *
 * List pages 12 down to 1 are fetched in turn (page 1 is "index.html", the
 * others "index_N.html"). For every list entry the detail page is downloaded
 * and the ".cy_cosCon .tc" element is used as the article body. Lazy-load
 * "data-loadsrc" attributes are rewritten to absolute "src" attributes and a
 * no-referrer meta tag is prepended to the stored HTML. Entries are
 * bulk-inserted once per list page and the insert result is printed with
 * var_dump().
 *
 * Entries whose detail page yields no body HTML are skipped. The emptiness
 * check is done on the raw HTML, before the meta tag is prepended; checking
 * afterwards would always pass.
 *
 * @return int Elapsed wall-clock time of the whole run, in seconds.
 */
public function cosplaytcy(){
    $time = time();

    // Field extraction rules applied inside every list slice.
    $rules = [
        // Article title
        'title' => ['h3>a', 'text'],
        // Link to the detail page
        'link' => ['h3>a', 'href'],
        // Thumbnail (lazy-loaded, so the real URL is in data-loadsrc)
        'covermore' => ['.showImg>a>img', 'data-loadsrc'],
    ];
    // Selector that cuts the list page into one slice per gallery.
    $range = '.cy2-coslist li';

    for ($i = 12; $i >= 1; $i--) {
        // The first list page has no numeric suffix.
        if ($i === 1) {
            $url = "https://t2cy.com/acg/cos/index.html";
        } else {
            $url = "https://t2cy.com/acg/cos/index_" . $i . '.html';
        }

        $rt = QueryList::get($url)->rules($rules)->range($range)->query()->getData();
        $re = [];
        foreach ($rt as $r) {
            // Strip a leading slash so a root-relative href does not produce "//" in the URL.
            $rs = QueryList::get('https://t2cy.com/' . ltrim((string) $r['link'], '/'));

            $r['cover'] = $r['covermore'] = "https://t2cy.com" . $r['covermore'];
            $r['newstime'] = "";
            $r['author'] = "推次元";
            $r['newstype'] = 31;
            $r['create_time'] = time();
            $r['update_time'] = time();
            // The link is only needed for the fetch; it is not stored.
            unset($r['link']);

            $eles = $rs->find('.cy_cosCon .tc');
            $eles->find('img')->removeAttr('alt');

            // Check the raw HTML first: once the meta tag is prepended the
            // content can never be empty, so the check must happen here.
            $html = $eles->html();
            if (empty($html)) {
                continue;
            }

            // Turn lazy-load attributes into absolute image sources.
            $html = str_replace('data-loadsrc="', 'src="https://t2cy.com', $html);
            $r['content'] = '<meta name="referrer" content="no-referrer">' . $html;
            $re[] = $r;
        }

        $res = Db::table("news")->insert($re);
        var_dump($res);
    }

    return time() - $time;
}
获取数据:
萌图志
/**
 * Crawl galleries from 96acg.com and store them in the "news" table.
 *
 * List pages 50 down to 21 are fetched in turn. For every list entry the
 * detail page is downloaded and the ".article-content" element is used as the
 * article body after removing ".article-social", ".open-message" and "p"
 * nodes and stripping "alt"/"title" attributes from "noscript" and "img"
 * nodes. In the resulting HTML, lazy-load "data-original" attributes are
 * rewritten to "src" and "noscript" tags are renamed to "p"; a no-referrer
 * meta tag is then prepended. Entries are bulk-inserted once per list page and
 * the insert result is printed with var_dump().
 *
 * Entries whose detail page yields no body HTML are skipped. The emptiness
 * check is done on the raw HTML, before the meta tag is prepended; checking
 * afterwards would always pass.
 *
 * @return void
 */
public function cosplaymtz(){
    // Field extraction rules applied inside every list slice.
    $rules = [
        // Article title
        'title' => ['.card-item>h3>a', 'text'],
        // Link to the detail page
        'link' => ['.card-item>h3>a', 'href'],
        // Thumbnail (lazy-loaded, so the real URL is in data-original)
        'covermore' => ['.card-item>.focus>a>img', 'data-original'],
    ];
    // Selector that cuts the list page into one slice per gallery.
    $range = '.cardlist .span_1_of_4';

    for ($i = 50; $i >= 21; $i--) {
        $url = "http://96acg.com/san/page/" . $i;
        $rt = QueryList::get($url)->rules($rules)->range($range)->query()->getData();
        $re = [];
        foreach ($rt as $r) {
            $rs = QueryList::get($r['link']);

            $r['cover'] = $r['covermore'];
            $r['newstime'] = "";
            $r['author'] = "萌图志";
            $r['newstype'] = 31;
            $r['create_time'] = time();
            $r['update_time'] = time();
            // The link is only needed for the fetch; it is not stored.
            unset($r['link']);

            $eles = $rs->find('.article-content');
            $eles->find('.article-social,.open-message,p')->remove();
            $eles->find('noscript,img')->removeAttr('alt');
            $eles->find('noscript,img')->removeAttr('title');

            // Check the raw HTML first: once the meta tag is prepended the
            // content can never be empty, so the check must happen here.
            $html = $eles->html();
            if (empty($html)) {
                continue;
            }

            // Apply both rewrites to the same string. Restarting from
            // $eles->html() for each step would discard the earlier one.
            $html = str_replace('data-original="', 'src="', $html);
            $html = str_replace('noscript', 'p', $html);

            $r['content'] = '<meta name="referrer" content="no-referrer">' . $html;
            $re[] = $r;
        }

        $res = Db::table("news")->insert($re);
        var_dump($res);
    }
}
获取数据:
绝对领域-jk制服
/**
 * Crawl the "jk制服" tag of jdlingyu.com and store the posts in the "news" table.
 *
 * Tag pages 12 down to 1 are fetched in turn; the current page number is
 * echoed before each fetch. For every list entry the detail page is
 * downloaded and its ".entry-content" element, with all "a" nodes removed,
 * is used as the article body. Entries with a non-empty body are
 * bulk-inserted once per tag page and the insert result is printed with
 * var_dump().
 *
 * @return void
 */
public function jkjdly()
{
    // Field extraction rules applied inside every list slice.
    $fieldRules = [
        // Article title
        'title' => ['.item-in>.post-info>h2>a', 'text'],
        // Link to the detail page
        'link' => ['.item-in>.post-info>h2>a', 'href'],
        // Thumbnail (lazy-loaded, so the real URL is in data-src)
        'covermore' => ['.item-in>.post-module-thumb>a>img', 'data-src'],
    ];
    // Selector that cuts the list page into one slice per post.
    $sliceSelector = '.b2_gap .post-list-item';

    $page = 12;
    while ($page >= 1) {
        echo $page;

        $listUrl = "https://www.jdlingyu.com/tag/jk%e5%88%b6%e6%9c%8d/page/" . $page;
        $entries = QueryList::get($listUrl)
            ->rules($fieldRules)
            ->range($sliceSelector)
            ->query()
            ->getData();

        $rows = [];
        foreach ($entries as $entry) {
            $detailPage = QueryList::get($entry['link']);

            // The link is only needed for the fetch; it is not stored.
            unset($entry['link']);
            $entry['cover'] = $entry['covermore'];
            $entry['newstime'] = "";
            $entry['author'] = "绝对领域-jk制服";
            $entry['newstype'] = 31;
            $entry['create_time'] = time();
            $entry['update_time'] = time();

            // Article body without any anchor elements.
            $body = $detailPage->find('.entry-content');
            $body->find('a')->remove();
            $entry['content'] = $body->html();

            if (empty($entry['content'])) {
                continue;
            }
            $rows[] = $entry;
        }

        var_dump(Db::table("news")->insert($rows));
        $page--;
    }
}
获取数据:
绝对领域的内容结构比较统一,其他标签也可以用相同的逻辑抓取。例如抓取“绝对领域 cos 正片”时,只需更换抓取链接即可。
衍生品: