hyperf+Querylist爬取三次元网站

hyperf+Querylist爬取三次元网站，仅供交流学习，请勿用于商业用途。

安装及使用

Querylist中文文档

cosplay网站爬虫

QuerylistController.php

<?php

declare(strict_types=1);
/**
 * This file is part of Hyperf.
 *
 * @link     https://www.hyperf.io
 * @document https://hyperf.wiki
 * @contact  group@hyperf.io
 * @license  https://github.com/hyperf/hyperf/blob/master/LICENSE
 */
namespace App\Controller;
use QL\QueryList;
use Hyperf\DbConnection\Db;
/**
 * Controller skeleton for the QueryList-based crawler actions.
 *
 * The class body is a placeholder: crawler methods are meant to be added inside it.
 */
class QuerylistController extends AbstractController
{
    // Methods go here.
}

次元岛

/**
 * Crawl ciyuandao.com photo listing pages 12 down to 1 and store the articles in "news".
 *
 * Each list item's detail page is fetched, the first `.padding10` container is taken as
 * the article body (with `.font14` and `h1` nodes stripped), and all rows gathered from
 * one listing page are inserted as a single batch. Items without body HTML are skipped.
 *
 * @return int Elapsed seconds between the start and the end of the crawl.
 */
public function cosplaycyd(){
    $startedAt = time();

    // Extraction rules applied to every list item matched by the slice selector.
    $fieldRules = [
        'title'     => ['p>a', 'text'],  // article title
        'link'      => ['p>a', 'href'],  // detail link (the host is prepended before fetching)
        'covermore' => ['a>img', 'src'], // thumbnail
    ];
    $itemSelector = 'ul .fleft';

    foreach (range(12, 1) as $page) {
        $listing = QueryList::get("http://ciyuandao.com/photo/list/0-5-".$page)
            ->rules($fieldRules)
            ->range($itemSelector)
            ->query()
            ->getData();

        $rows = [];
        foreach ($listing as $item) {
            $detailPage = QueryList::get('http://ciyuandao.com'.$item['link']);

            // "link" is only needed for the fetch above; it is not stored.
            unset($item['link']);
            $row = array_merge($item, [
                'cover'       => $item['covermore'],
                'newstime'    => "",
                'author'      => "次元岛",
                'newstype'    => 31,
                'create_time' => time(),
                'update_time' => time(),
            ]);

            $body = $detailPage->find('.padding10:eq(0)');
            $body->find('.font14,h1')->remove();
            $row['content'] = $body->html();

            if (empty($row['content'])) {
                continue;
            }
            $rows[] = $row;
        }

        $inserted = Db::table("news")->insert($rows);
        var_dump($inserted);
    }

    return time() - $startedAt;
}

获取数据:

推次元

/**
 * Crawl t2cy.com cosplay listing pages 12 down to 1 and store the articles in "news".
 *
 * For every list item the detail page is fetched, `data-loadsrc` attributes inside
 * `.cy_cosCon .tc` are rewritten to absolute `src` URLs, and a no-referrer meta tag is
 * prepended to the stored HTML. Rows from one listing page are inserted as one batch.
 *
 * Items whose article body is empty are skipped. The emptiness check runs on the raw
 * body HTML, before the meta tag is prepended; checking afterwards could never fail.
 *
 * @return int Elapsed seconds between the start and the end of the crawl.
 */
public function cosplaytcy(){
    $time = time();

    // Extraction rules applied to every list item matched by $range.
    $rules = [
        'title'     => ['h3>a', 'text'],                       // article title
        'link'      => ['h3>a', 'href'],                       // detail link
        'covermore' => ['.showImg>a>img', 'data-loadsrc'],     // thumbnail path
    ];
    $range = '.cy2-coslist li';

    for ($i = 12; $i >= 1; $i--) {
        // The first page has no numeric suffix.
        $url = $i === 1
            ? "https://t2cy.com/acg/cos/index.html"
            : "https://t2cy.com/acg/cos/index_".$i.'.html';

        $rt = QueryList::get($url)->rules($rules)->range($range)->query()->getData();
        $re = [];

        foreach ($rt as $r) {
            // ltrim avoids "https://t2cy.com//..." when the scraped href is root-relative.
            $rs = QueryList::get('https://t2cy.com/'.ltrim((string) $r['link'], '/'));
            unset($r['link']); // only needed for the fetch; not stored

            $r['cover'] = $r['covermore'] = "https://t2cy.com".$r['covermore'];
            $r['newstime'] = "";
            $r['author'] = "推次元";
            $r['newstype'] = 31;
            $r['create_time'] = time();
            $r['update_time'] = time();

            $eles = $rs->find('.cy_cosCon .tc');
            $eles->find('img')->removeAttr('alt');

            // Cast so a missing container yields '' rather than a non-string for str_replace.
            $html = (string) $eles->html();
            if (trim($html) === '') {
                continue;
            }

            $html = str_replace('data-loadsrc="', 'src="https://t2cy.com', $html);
            $r['content'] = '<meta name="referrer" content="no-referrer">'.$html;
            $re[] = $r;
        }

        $res = Db::table("news")->insert($re);
        var_dump($res);
    }

    return time() - $time;
}

获取数据:

萌图志

/**
 * Crawl 96acg.com listing pages 50 down to 21 and store the articles in "news".
 *
 * For every list item the detail page is fetched and `.article-content` is taken as the
 * article body. Social/message widgets and `p` nodes are removed, `alt`/`title`
 * attributes are stripped from `noscript` and `img` nodes, `data-original` attributes
 * are rewritten to `src`, and `noscript` is replaced by `p`. A no-referrer meta tag is
 * prepended to the stored HTML. Rows from one listing page are inserted as one batch.
 *
 * Items whose article body is empty are skipped. The emptiness check runs on the raw
 * body HTML, before the meta tag is prepended; checking afterwards could never fail.
 *
 * @return void
 */
public function cosplaymtz(){
    // Extraction rules applied to every list item matched by $range.
    $rules = [
        'title'     => ['.card-item>h3>a', 'text'],                        // article title
        'link'      => ['.card-item>h3>a', 'href'],                        // detail link
        'covermore' => ['.card-item>.focus>a>img', 'data-original'],       // thumbnail
    ];
    $range = '.cardlist .span_1_of_4';

    for ($i = 50; $i >= 21; $i--) {
        $url = "http://96acg.com/san/page/".$i;
        $rt = QueryList::get($url)->rules($rules)->range($range)->query()->getData();
        $re = [];

        foreach ($rt as $r) {
            $rs = QueryList::get($r['link']);
            unset($r['link']); // only needed for the fetch; not stored

            $r['cover'] = $r['covermore'];
            $r['newstime'] = "";
            $r['author'] = "萌图志";
            $r['newstype'] = 31;
            $r['create_time'] = time();
            $r['update_time'] = time();

            $eles = $rs->find('.article-content');
            $eles->find('.article-social,.open-message,p')->remove();
            $eles->find('noscript,img')->removeAttr('alt');
            $eles->find('noscript,img')->removeAttr('title');

            // Cast so a missing container yields '' rather than a non-string for str_replace.
            $html = (string) $eles->html();
            if (trim($html) === '') {
                continue;
            }

            // Each rewrite feeds the next one. Restarting from $eles->html() for every
            // step would keep only the last replacement and drop the src rewrite.
            $html = str_replace('data-original="', 'src="', $html);
            $html = str_replace('noscript', 'p', $html);

            $r['content'] = '<meta name="referrer" content="no-referrer">'.$html;
            $re[] = $r;
        }

        $res = Db::table("news")->insert($re);
        var_dump($res);
    }
}

获取数据:

绝对领域-jk制服

/**
 * Crawl pages 12 down to 1 of the jdlingyu.com "jk制服" tag and store the articles in "news".
 *
 * Each list item's detail page is fetched, `.entry-content` is taken as the article body
 * with all `a` elements removed, and the rows gathered from one listing page are inserted
 * as a single batch. Items without body HTML are skipped. The current page number is
 * echoed before each listing request.
 *
 * @return void
 */
public function jkjdly(){
    // Extraction rules applied to every list item matched by the slice selector.
    $fieldRules = [
        'title'     => ['.item-in>.post-info>h2>a', 'text'],               // article title
        'link'      => ['.item-in>.post-info>h2>a', 'href'],               // detail link
        'covermore' => ['.item-in>.post-module-thumb>a>img', 'data-src'],  // thumbnail
    ];
    $itemSelector = '.b2_gap .post-list-item';

    $page = 12;
    while ($page >= 1) {
        echo($page);

        $listing = QueryList::get("https://www.jdlingyu.com/tag/jk%e5%88%b6%e6%9c%8d/page/".$page)
            ->rules($fieldRules)
            ->range($itemSelector)
            ->query()
            ->getData();

        $rows = [];
        foreach ($listing as $item) {
            $detailPage = QueryList::get($item['link']);

            // "link" is only needed for the fetch above; it is not stored.
            unset($item['link']);
            $row = array_merge($item, [
                'cover'       => $item['covermore'],
                'newstime'    => "",
                'author'      => "绝对领域-jk制服",
                'newstype'    => 31,
                'create_time' => time(),
                'update_time' => time(),
            ]);

            $body = $detailPage->find('.entry-content');
            $body->find('a')->remove();
            $row['content'] = $body->html();

            if (empty($row['content'])) {
                continue;
            }
            $rows[] = $row;
        }

        $inserted = Db::table("news")->insert($rows);
        var_dump($inserted);

        $page--;
    }
}

获取数据:

绝对领域的内容结构比较统一，其他标签也可以用相同的逻辑抓取。例如抓取“绝对领域cos正片”时，只需换一下抓取链接即可。

衍生品:

爬虫小马甲–三次元图

0 评论
内联反馈
查看所有评论