tp6+Querylist新闻爬虫

tp6+QueryList 新闻爬虫，仅供交流学习，请勿用于商业用途。

安装及使用

Querylist中文文档

新闻爬虫

Querylistx.php

<?php
namespace app\composer\controller;

use app\BaseController;
use think\facade\View;
use QL\QueryList;
use GuzzleHttp\Psr7\Response;
use think\facade\Db;

use QL\Ext\PhantomJs;

class Querylistx extends BaseController
{
       // Methods (the crawler actions go here)
}

腾讯新闻

/**
 * Crawl one Tencent News (new.qq.com) channel and store the articles in the `news` table.
 *
 * The channel is selected by the `key` request parameter (integer 0-14, defaults to 0
 * when missing or empty). The listing page and every article page are rendered through
 * the PhantomJs plugin. Only articles whose first in-body image URL contains
 * "newsapp_bt" are kept. Collected rows are inserted in reverse listing order and
 * "ok" is echoed when the insert reports a truthy result.
 *
 * An unknown or non-integer `key` echoes "invalid key" and returns without crawling.
 */
public function tengxun(){
    // Request key => [URL path under new.qq.com, author label, newstype id].
    $channels = [
        0  => ['ch/ent/', '腾讯娱乐', 9],
        1  => ['ch/tech/', '腾讯科技', 10],
        2  => ['ch/antip/', '腾讯抗疫', 24],
        3  => ['tag/82542', '腾讯nba', 8],
        4  => ['ch/fashion/', '腾讯时尚', 7],
        5  => ['ch/games/', '腾讯游戏', 2],
        6  => ['ch/visit/', '腾讯旅游', 11],
        7  => ['ch/comic/', '腾讯动漫', 16],
        8  => ['ch/life/', '腾讯生活', 13],
        9  => ['ch/finance/', '腾讯经济', 20],
        10 => ['', '腾讯要闻', 6],
        11 => ['ch/history/', '腾讯历史', 14],
        12 => ['ch/cul/', '腾讯文化', 17],
        13 => ['ch/baby/', '腾讯育儿', 15],
        14 => ['ch/photo/', '腾讯图片', 19],
    ];

    // Validate the key up front so $type/$newstype can never be undefined.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    }
    $key = filter_var($key, FILTER_VALIDATE_INT);
    if ($key === false || !isset($channels[$key])) {
        echo 'invalid key';
        return;
    }
    [$path, $type, $newstype] = $channels[$key];

    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class, '/usr/local/bin/phantomjs');

    $rules = [
        // Article title
        'title' => ['h3>a', 'text'],
        // Article link
        'link' => ['h3>a', 'href'],
    ];
    $range = ".list li";
    $rt = $ql->browser("https://new.qq.com/" . $path)->rules($rules)->range($range)->query()->getData();

    $re = [];
    foreach ($rt as $r) {
        if (empty($r['link'])) {
            continue;
        }

        $rs = $ql->browser($r['link']);
        $cover = $rs->find(".content-article>p>img:eq(0)")->attr('src');

        // Keep only articles whose first body image URL contains "newsapp_bt".
        if (empty($cover) || strpos($cover, 'newsapp_bt') === false) {
            continue;
        }

        // Add a scheme only to protocol-relative URLs ("//host/..."); leave absolute URLs alone.
        if (strncmp($cover, '//', 2) === 0) {
            $cover = "http:" . $cover;
        }

        $now = time();
        $r['covermore'] = $cover;
        $r['cover'] = $cover;
        $r['author'] = $type;
        $r['newstype'] = $newstype;
        $r['create_time'] = $now;
        $r['update_time'] = $now;
        // The crawl time is stored as the publication time.
        $r['newstime'] = date("Y-m-d H:i:s");

        // Strip links, the #Status node and video wrappers from the article body.
        $eles = $rs->find('.content-article');
        $eles->find('#Status,a,.videoPlayerWrap')->remove();
        $r['content'] = str_replace(
            'src="//inews.gtimg.com/',
            'src="http://inews.gtimg.com/',
            (string) $eles->html()
        );

        $re[] = $r;
    }

    if (empty($re)) {
        return;
    }

    $res = Db::name('news')->strict(false)->insertAll(array_reverse($re));
    if ($res) {
        echo "ok";
    }
}

数据如下图

网易新闻

/**
 * Crawl one NetEase (163.com) channel and store the articles in the `news` table.
 *
 * The channel is selected by the `key` request parameter (integer 0-9, defaults to 0
 * when missing or empty). Pages are rendered through the PhantomJs plugin. Articles
 * with an empty link or an empty body are skipped. Collected rows are inserted in
 * reverse listing order and "ok" is echoed when the insert reports a truthy result.
 *
 * An unknown or non-integer `key` echoes "invalid key" and returns without crawling.
 */
public function wangyi(){
    set_time_limit(300);

    // Request key => [host and path, author label, newstype id].
    $channels = [
        0 => ['ent.163.com/', '网易娱乐', 9],
        1 => ['travel.163.com/', '网易旅游', 11],
        2 => ['edu.163.com/', '网易教育', 14],
        3 => ['sports.163.com/nba/', '网易nba', 8],
        4 => ['baby.163.com/', '网易亲子', 15],
        5 => ['fashion.163.com/', '网易时尚', 7],
        6 => ['news.163.com/', '网易要闻', 6],
        7 => ['tech.163.com/', '网易科技', 10],
        8 => ['money.163.com/', '网易财经', 20],
        9 => ['art.163.com/', '网易艺术', 13],
    ];

    // Validate the key up front so $type/$newstype can never be undefined.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    }
    $key = filter_var($key, FILTER_VALIDATE_INT);
    if ($key === false || !isset($channels[$key])) {
        echo 'invalid key';
        return;
    }
    [$host, $type, $newstype] = $channels[$key];

    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class, '/usr/local/bin/phantomjs');

    $rules = [
        // Article title
        'title' => ['.news_title>h3>a', 'text'],
        // Article link
        'link' => ['.news_title>h3>a', 'href'],
        // Thumbnail
        'covermore' => ['a>img', 'src'],
    ];
    $range = ".ndi_main .news_article";
    $rt = $ql->browser("https://" . $host)->rules($rules)->range($range)->removeHead()->query()->getData();

    $re = [];
    foreach ($rt as $r) {
        if (empty($r['link'])) {
            continue;
        }

        $rs = $ql->browser($r['link']);

        $now = time();
        $r['author'] = $type;
        $r['newstype'] = $newstype;
        $r['create_time'] = $now;
        $r['update_time'] = $now;
        $r['cover'] = $r['covermore'];

        // Take the first 19 bytes of the time/source line. Trim first so leading
        // whitespace in the extracted text cannot shift the slice.
        // NOTE(review): presumably the line starts with "Y-m-d H:i:s" — confirm against the page.
        $info = trim((string) $rs->find(".post_time_source")->text());
        $r['newstime'] = substr($info, 0, 19);

        // Strip links and the listed ad/related nodes from the article body.
        $eles = $rs->find('#endText');
        $eles->find('a,.gg200x300,.cDGray,.related_special')->remove();
        $r['content'] = $eles->html();

        if (empty($r['content'])) {
            continue;
        }
        $re[] = $r;
    }

    if (empty($re)) {
        return;
    }

    $res = Db::name('news')->strict(false)->insertAll(array_reverse($re));
    if ($res) {
        echo "ok";
    }
}

数据如下图

澎湃新闻

/**
 * The Paper (m.thepaper.cn) crawler.
 *
 * Fetches one channel listing, selected by the `key` request parameter (integer 0-4,
 * defaults to 0 when missing or empty), then fetches each article with a plain
 * QueryList::get() request. The collected rows are dumped with var_dump() followed
 * by "ok"; this method does not write to the database.
 *
 * An unknown or non-integer `key` echoes "invalid key" and returns without crawling.
 */
public function pengpai(){
    // Request key => [channel path, newstype id].
    $channels = [
        0 => ['channel_25953', 13], // life
        1 => ['channel_25951', 20], // finance
        2 => ['channel_25952', 14], // ideas
        3 => ['channel_25950', 6],  // current affairs
        4 => ['channel_90077', 24], // anti-epidemic
    ];

    // Validate the key up front so $newstype can never be undefined.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    }
    $key = filter_var($key, FILTER_VALIDATE_INT);
    if ($key === false || !isset($channels[$key])) {
        echo 'invalid key';
        return;
    }
    [$channel, $newstype] = $channels[$key];

    $url = "https://m.thepaper.cn/";

    // Listing extraction rules
    $rules = [
        // Article title
        'title' => ['.list_item_title>span>a', 'text'],
        // Article link (relative to $url)
        'link' => ['.list_item_thumb', 'href'],
        // Thumbnail
        'covermore' => ['.list_item_thumb>img', 'src'],
    ];
    $range = '.list_item';
    $rt = QueryList::get($url . $channel)->rules($rules)->range($range)->query()->getData();

    $re = [];
    foreach ($rt as $r) {
        // Without a link there is no article page to fetch.
        if (empty($r['link'])) {
            continue;
        }

        $rs = QueryList::get($url . $r['link']);

        $info = trim((string) $rs->find(".newsdetail_header .date")->text());

        // First 16 bytes of the header line; fall back to the crawl time when empty.
        // NOTE(review): presumably the line starts with "Y-m-d H:i" — confirm against the page.
        $r['newstime'] = substr($info, 0, 16);
        if (empty($r['newstime'])) {
            $r['newstime'] = date("Y-m-d H:i:s");
        }

        // Locate the source marker with a byte offset so it agrees with substr(), and
        // compare against false explicitly so a match at offset 0 still counts as found.
        $marker = strpos($info, '来源:');
        $author = '';
        if ($marker !== false) {
            // NOTE(review): the marker itself is 7 bytes; the original offset of 10 skips
            // 3 more bytes — kept as-is, confirm what those bytes are on the live page.
            $author = trim((string) substr($info, $marker + 10));
        }
        $r['author'] = $author !== '' ? $author : '互联网';

        $now = time();
        $r['newstype'] = $newstype;
        $r['create_time'] = $now;
        $r['update_time'] = $now;
        $r['cover'] = $r['covermore'];

        // Strip extra info blocks, hidden text, links and videos from the article body.
        $eles = $rs->find('.newsdetail_content');
        $eles->find('.news_infor_extra,.hide_word,a,video')->remove();
        $body = (string) $eles->html();

        // Check the body before adding the prefix; once prefixed the content is never empty.
        if ($body === '') {
            continue;
        }
        // The no-referrer meta tag is prepended to the stored HTML. The stray literal "."
        // that used to follow the tag has been removed.
        $r['content'] = '<meta name="referrer" content="no-referrer">' . $body;

        $re[] = $r;
    }

    // Debug output only. To persist instead, insert $re into the `news` table, e.g.
    // Db::name('news')->strict(false)->insertAll(array_reverse($re)).
    echo("<pre>");
    var_dump($re);
    echo("ok");
}

数据如下图

搜狐新闻

/**
 * Crawl a Sohu listing page and insert each article into the `news` table.
 *
 * The channel is selected by the `key` request parameter (integer 0-2, defaults to 0
 * when missing or empty). Pages are rendered through the PhantomJs plugin. Each article
 * with a non-empty body is inserted immediately, one row at a time, in listing order.
 * "ok" is echoed at the end regardless of how many rows were inserted.
 *
 * An unknown or non-integer `key` echoes "invalid key" and returns without crawling.
 */
public function souhu(){
    // Request key => [query string appended to the base URL, author label, newstype id].
    // NOTE(review): all three entries are query strings on https://yule.sohu.com/ —
    // confirm they really lead to different channels.
    $channels = [
        0 => ['?spm=smpc.home.top-nav.13.1588229653251AKEu1Rp', '搜狐娱乐', 9],
        1 => ['?spm=smpc.fb-sports-home.top-nav.6.1588747723680bgdSsuH', '搜狐旅游', 11],
        2 => ['?spm=smpc.acg-home.header.22.1588748238587euHYma9', '搜狐搞笑', 19],
    ];

    // Validate the key up front so $type/$newstype can never be undefined.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    }
    $key = filter_var($key, FILTER_VALIDATE_INT);
    if ($key === false || !isset($channels[$key])) {
        echo 'invalid key';
        return;
    }
    [$query, $type, $newstype] = $channels[$key];

    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class, '/usr/local/bin/phantomjs');

    $rules = [
        // Article title
        'title' => ['.feed-attr-click>h3>a', 'text'],
        // Article link
        'link' => ['.feed-attr-click>h3>a', 'href'],
        // Thumbnail
        'covermore' => ['.feed-click-pic>a>img', 'src'],
    ];
    $range = ".feed-page .feed-one-pic";
    $rt = $ql->browser("https://yule.sohu.com/" . $query)->rules($rules)->range($range)->query()->getData();

    // Add a scheme only to protocol-relative URLs ("//host/..."); leave anything else untouched.
    $absolute = function ($link) {
        $link = (string) $link;
        return strncmp($link, '//', 2) === 0 ? "http:" . $link : $link;
    };

    foreach ($rt as $r) {
        if (empty($r['link'])) {
            continue;
        }

        $rs = $ql->browser($absolute($r['link']));

        $now = time();
        $r['author'] = $type;
        $r['newstype'] = $newstype;
        $r['create_time'] = $now;
        $r['update_time'] = $now;
        $r['covermore'] = $absolute($r['covermore']);
        $r['cover'] = $r['covermore'];
        $r['newstime'] = $rs->find("#news-time")->text();

        // Strip links from the article body.
        $eles = $rs->find('.article');
        $eles->find('a')->remove();
        $r['content'] = $eles->html();

        if (empty($r['content'])) {
            continue;
        }

        // Rows are written one at a time, so earlier rows stay stored if a later page fails.
        Db::name('news')->strict(false)->insert($r);
    }

    echo("ok");
}

数据如下图

微博

/**
 * Crawl the Weibo category-10007 feed and store the posts in the `news` table.
 *
 * The listing and every post page are rendered through the PhantomJs plugin with a
 * hard-coded cookie header, a 10000 ms timeout and a 3 s render delay. Up to three
 * picture URLs are taken per post; the first becomes `cover` and all three are joined
 * with "··" into `covermore`. Collected rows are inserted in reverse listing order and
 * "ok" is echoed when the insert reports a truthy result.
 *
 * SECURITY NOTE(review): the cookie strings below are real session credentials checked
 * into source. They should be moved to configuration or the environment and rotated.
 */
public function weibo(){
    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class, '/usr/local/bin/phantomjs');

    $rules = [
        // Post title
        'title' => ['h3', 'text'],
        // Post link: an empty selector reads the attribute from the range element itself
        'link' => ['', 'href'],
        // Thumbnails (first three pictures)
        'cover1' => ['.pic:eq(0)>img', 'src'],
        'cover2' => ['.pic:eq(1)>img', 'src'],
        'cover3' => ['.pic:eq(2)>img', 'src'],
    ];
    $range = ".pt_ul .UG_list_a";

    $rt = $ql->browser(function (\JonnyW\PhantomJs\Http\RequestInterface $request) {
        $request->setMethod('GET');
        $request->setUrl('https://weibo.com/?category=10007');
        $request->addHeader('cookie',"SINAGLOBAL=9018490019652.922.1575514453154; YF-V5-G0=3751b8b40efecee990eab49e8d3b3354; login_sid_t=868855ad3490b248b0a125f4d48262e4; cross_origin_proto=SSL; Ugrow-G0=9ec894e3c5cc0435786b4ee8ec8a55cc; _s_tentry=www.baidu.com; UOR=ent.sina.com.cn,widget.weibo.com,www.baidu.com; Apache=5650114746322.545.1588820121589; ULV=1588820121605:6:2:2:5650114746322.545.1588820121589:1588760906109; WBtopGlobal_register_version=fd6b3a12bb72ffed; wb_view_log=1920*10801%26375*8123; WBStorage=42212210b087ca50|undefined; SCF=AlPNfaV5l1TF42Ncqk9m7LxHwLHoK9rgCFrfuQ9PhI1N7hFsXbQkYet6VcoZ_lSwECH2Za2ak4SJQ2BG_LTs5GY.; SUB=_2A25zt9P3DeRhGeFN6FAW8CfJyjyIHXVQxUI_rDV8PUJbmtAKLRfFkW9NQFTqHhxacMl-vwYBQXw19BcXXs1RYjy_; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWJq5cpPcO3I.v0E3RVov7o5JpX5K2hUgL.FoM0e0zNeh.feK52dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNe0eES054SK27; SUHB=0jB4BFsy-COek7; SSOLoginState=1588831143; un=13785132215; wvr=6; YF-Page-G0=4358a4493c1ebf8ed493ef9c46f04cae|1588831146|1588831146; wb_view_log_7332709510=375*8123; webim_unReadCount=%7B%22time%22%3A1588831511070%2C%22dm_pub_total%22%3A1%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A44%2C%22msgbox%22%3A0%7D");
        $request->setTimeout(10000); // 10 seconds
        $request->setDelay(3); // 3 seconds
        return $request;
    })->rules($rules)->range($range)->query()->getData();

    echo("<pre>");

    // Initialise the result list so an empty listing cannot reach array_reverse() undefined.
    $re = [];
    foreach ($rt as $r) {
        // Without a link there is no post page to fetch.
        if (empty($r['link'])) {
            continue;
        }

        $rs = $ql->browser(function (\JonnyW\PhantomJs\Http\RequestInterface $request) use ($r) {
            // Add a scheme only to protocol-relative links ("//host/...").
            $link = (string) $r['link'];
            if (strncmp($link, '//', 2) === 0) {
                $link = 'https:' . $link;
            }

            $request->setMethod('GET');
            $request->setUrl($link);
            $request->addHeader('cookie',"SINAGLOBAL=9018490019652.922.1575514453154; un=13785132215; wb_view_log=1920*10801; YF-V5-G0=b1b8bc404aec69668ba2d36ae39dd980; Ugrow-G0=5c7144e56a57a456abed1d1511ad79e8; _s_tentry=test4.lianruanjt.cn; appkey=; Apache=8202356956297.876.1588907993326; ULV=1588907993331:7:3:3:8202356956297.876.1588907993326:1588820121605; login_sid_t=8741e09cd9ec34ab931c0b82854cee24; cross_origin_proto=SSL; SCF=AlPNfaV5l1TF42Ncqk9m7LxHwLHoK9rgCFrfuQ9PhI1NeicYiTXU02Vc1tk4m1uyhxxDrY0hxnreJv6fjOXFa1Y.; SUHB=0ebdoTqL9NaLts; wb_view_log_7332709510=375*8123%261920*10801; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWJq5cpPcO3I.v0E3RVov7o5JpVF020S0nfeKeEeo2f; SUB=_2AkMp6F_EdcPxrARSm_oSzGLnbY1H-jyaPTYyAn7uJhMyAxj_7gYhqSVutBF-XFIymCDYFdY9jkDYn_2symLFvEqJ; UOR=ent.sina.com.cn,widget.weibo.com,test4.lianruanjt.cn; webim_unReadCount=%7B%22time%22%3A1588908329416%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D; YF-Page-G0=2f0b518c8f18c7993f214275690d6fdf|1588918390|1588918390");
            $request->setTimeout(10000); // 10 seconds
            $request->setDelay(3); // 3 seconds
            return $request;
        });

        $now = time();
        $r['author'] = "微博美女";
        $r['newstype'] = 19;
        $r['create_time'] = $now;
        $r['update_time'] = $now;
        $r['cover'] = $r['cover1'];
        $r['covermore'] = $r['cover1'].'··'.$r['cover2'].'··'.$r['cover3'];
        $r['newstime'] = $rs->find(".WB_from>a")->attr('title');

        // Take the media list items and drop the <i> nodes inside them.
        $eles = $rs->find('.WB_media_a>li');
        $eles->find('i')->remove();

        // Make image URLs absolute, swap the orj360 size path for mw690, and prepend
        // the no-referrer meta tag to the stored HTML.
        $html = str_replace('src="//wx', 'src="https://wx', (string) $eles->html());
        $html = str_replace('.sinaimg.cn/orj360', '.sinaimg.cn/mw690', $html);
        $r['content'] = '<meta name="referrer" content="no-referrer">' . $html;

        $re[] = $r;
    }

    if (empty($re)) {
        return;
    }

    $res = Db::name('news')->strict(false)->insertAll(array_reverse($re));
    if ($res) {
        echo "ok";
    }
}

数据如下图

0 评论
内联反馈
查看所有评论