TP6 + QueryList 新闻爬虫,仅供交流学习,请勿用于商业用途
安装及使用
新闻爬虫
Querylistx.php
<?php
namespace app\composer\controller;
use app\BaseController;
use think\facade\View;
use QL\QueryList;
use GuzzleHttp\Psr7\Response;
use think\facade\Db;
use QL\Ext\PhantomJs;
/**
 * Controller skeleton for the news crawler actions.
 *
 * The class body is intentionally empty here; the crawler methods shown in the
 * following sections of this document are meant to be placed inside it.
 */
class Querylistx extends BaseController
{
// Crawler action methods go here.
}
腾讯新闻
/**
 * Crawl one Tencent News (new.qq.com) channel and bulk-insert the articles via Db::name('news').
 *
 * Request parameter `key` (optional, defaults to 0) selects the channel by index
 * from the $channels table below. An unknown or non-integer key prints
 * "invalid key" and stops instead of running with undefined channel data.
 *
 * Prints "ok" when the bulk insert reports at least one affected row.
 */
public function tengxun(){
    $ql = QueryList::getInstance();
    // Register the PhantomJs plugin; pages are fetched through $ql->browser().
    $ql->use(PhantomJs::class,'/usr/local/bin/phantomjs');

    $url = "https://new.qq.com/";

    // key => [channel path, author label stored with each article, newstype id]
    $channels = [
        0  => ["ch/ent/",     "腾讯娱乐", 9],
        1  => ["ch/tech/",    "腾讯科技", 10],
        2  => ["ch/antip/",   "腾讯抗疫", 24],
        3  => ["tag/82542",   "腾讯nba",  8],
        4  => ["ch/fashion/", "腾讯时尚", 7],
        5  => ["ch/games/",   "腾讯游戏", 2],
        6  => ["ch/visit/",   "腾讯旅游", 11],
        7  => ["ch/comic/",   "腾讯动漫", 16],
        8  => ["ch/life/",    "腾讯生活", 13],
        9  => ["ch/finance/", "腾讯经济", 20],
        10 => ["",            "腾讯要闻", 6],   // empty path: the site front page
        11 => ["ch/history/", "腾讯历史", 14],
        12 => ["ch/cul/",     "腾讯文化", 17],
        13 => ["ch/baby/",    "腾讯育儿", 15],
        14 => ["ch/photo/",   "腾讯图片", 19],
    ];

    // Validate the channel index before using it as an array offset.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    } else {
        $key = filter_var($key, FILTER_VALIDATE_INT);
    }
    if ($key === false || !isset($channels[$key])) {
        echo "invalid key";
        return;
    }
    [$path, $type, $newstype] = $channels[$key];

    $rules = [
        // article title
        'title' => ['h3>a','text'],
        // article link
        'link' => ['h3>a','href'],
    ];
    $range = ".list li";
    $rt = $ql->browser($url.$path)->rules($rules)->range($range)->query()->getData();

    $re = [];
    foreach ($rt as $r){
        if (empty($r['link'])) {
            continue;
        }
        // Load the article page to pull the cover image and the body.
        $rs = $ql->browser($r['link']);
        $r['covermore'] = $rs->find(".content-article>p>img:eq(0)")->attr('src');

        // Only articles whose first body image URL contains "newsapp_bt" are kept.
        if (empty($r['covermore']) || strpos($r['covermore'],'newsapp_bt') === false) {
            continue;
        }

        $r['author'] = $type;
        $r['newstype'] = $newstype;
        $r['create_time'] = time();
        $r['update_time'] = time();
        // The image src is prefixed with a scheme (assumes it is protocol-relative — TODO confirm).
        $r['cover'] = $r['covermore'] = "http:".$r['covermore'];
        // The crawl time is stored as the publication time.
        $r['newstime'] = date("Y-m-d H:i:s");

        $eles = $rs->find('.content-article');
        // Strip links, the #Status element and video player wrappers from the body.
        $eles->find('#Status,a,.videoPlayerWrap')->remove();
        $r['content'] = str_replace('src="//inews.gtimg.com/','src="http://inews.gtimg.com/',$eles->html());

        $re[] = $r;
    }

    // Nothing collected: skip the database call entirely.
    if (empty($re)) {
        return;
    }

    // Rows are inserted in reverse list order.
    $res = Db::name('news')->strict(false)->insertAll(array_reverse($re));
    if ($res) {
        echo "ok";
    }
}
数据如下图
网易新闻
/**
 * Crawl one NetEase (163.com) channel and bulk-insert the articles via Db::name('news').
 *
 * Request parameter `key` (optional, defaults to 0) selects the channel by index
 * from the $channels table below. An unknown or non-integer key prints
 * "invalid key" and stops instead of running with undefined channel data.
 *
 * Prints "ok" when the bulk insert reports at least one affected row.
 */
public function wangyi(){
    // Every article triggers its own browser fetch, so raise the script time limit.
    set_time_limit(300);

    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class,'/usr/local/bin/phantomjs');

    $url = "https://";

    // key => [channel host/path, author label stored with each article, newstype id]
    $channels = [
        0 => ["ent.163.com/",        "网易娱乐", 9],
        1 => ["travel.163.com/",     "网易旅游", 11],
        2 => ["edu.163.com/",        "网易教育", 14],
        3 => ["sports.163.com/nba/", "网易nba",  8],
        4 => ["baby.163.com/",       "网易亲子", 15],
        5 => ["fashion.163.com/",    "网易时尚", 7],
        6 => ["news.163.com/",       "网易要闻", 6],
        7 => ["tech.163.com/",       "网易科技", 10],
        8 => ["money.163.com/",      "网易财经", 20],
        9 => ["art.163.com/",        "网易艺术", 13],
    ];

    // Validate the channel index before using it as an array offset.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    } else {
        $key = filter_var($key, FILTER_VALIDATE_INT);
    }
    if ($key === false || !isset($channels[$key])) {
        echo "invalid key";
        return;
    }
    [$path, $type, $newstype] = $channels[$key];

    $rules = [
        // article title
        'title' => ['.news_title>h3>a','text'],
        // article link
        'link' => ['.news_title>h3>a','href'],
        // thumbnail
        'covermore' => ['a>img','src'],
    ];
    $range = ".ndi_main .news_article";
    $rt = $ql->browser($url.$path)->rules($rules)->range($range)->removeHead()->query()->getData();

    $re = [];
    foreach ($rt as $r){
        if (empty($r['link'])) {
            continue;
        }
        // Load the article page to read its timestamp and body.
        $rs = $ql->browser($r['link']);

        $r['author'] = $type;
        $r['newstype'] = $newstype;
        $r['create_time'] = time();
        $r['update_time'] = time();
        $r['cover'] = $r['covermore'];

        // First 19 bytes of the header text (presumably "Y-m-d H:i:s" — verify against the page).
        $info = $rs->find(".post_time_source")->text();
        $r['newstime'] = substr($info,0,19);

        $eles = $rs->find('#endText');
        // Strip links and the listed ad / related-content elements from the body.
        $eles->find('a,.gg200x300,.cDGray,.related_special')->remove();
        $r['content'] = $eles->html();

        // Articles without a body are skipped.
        if (empty($r['content'])) {
            continue;
        }
        $re[] = $r;
    }

    // Nothing collected: skip the database call entirely.
    if (empty($re)) {
        return;
    }

    // Rows are inserted in reverse list order.
    $res = Db::name('news')->strict(false)->insertAll(array_reverse($re));
    if ($res) {
        echo "ok";
    }
}
数据如下图
澎湃新闻
/**
 * The Paper (m.thepaper.cn) crawler.
 *
 * Request parameter `key` (optional, defaults to 0) selects the channel by index
 * from the $channels table below. An unknown or non-integer key prints
 * "invalid key" and stops.
 *
 * This action does not write to the database: it dumps the collected rows
 * with var_dump() and prints "ok".
 */
public function pengpai(){
    $url = "https://m.thepaper.cn/";

    // key => [channel path, newstype id]
    $channels = [
        0 => ["channel_25953", 13], // life
        1 => ["channel_25951", 20], // finance
        2 => ["channel_25952", 14], // ideas
        3 => ["channel_25950", 6],  // current affairs
        4 => ["channel_90077", 24], // anti-epidemic
    ];

    // Validate the channel index before using it as an array offset.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    } else {
        $key = filter_var($key, FILTER_VALIDATE_INT);
    }
    if ($key === false || !isset($channels[$key])) {
        echo "invalid key";
        return;
    }
    [$path, $newstype] = $channels[$key];

    // List-page extraction rules
    $rules = [
        // article title
        'title' => ['.list_item_title>span>a','text'],
        // article link (relative to $url)
        'link' => ['.list_item_thumb','href'],
        // thumbnail
        'covermore' => ['.list_item_thumb>img','src'],
    ];
    $range = '.list_item';
    $rt = QueryList::get($url.$path)->rules($rules)->range($range)->query()->getData();

    $sourceMarker = '来源:';
    $re = [];
    foreach ($rt as $r){
        // Without a link there is no article page to fetch.
        if (empty($r['link'])) {
            continue;
        }
        $rs = QueryList::get($url.$r['link']);

        $info = $rs->find(".newsdetail_header .date")->text();
        // First 16 bytes of the header text (presumably "Y-m-d H:i" — verify against the page).
        $r['newstime'] = substr($info,0,16);
        if (empty($r['newstime'])) {
            $r['newstime'] = date("Y-m-d H:i:s");
        }

        // Author is whatever follows the source marker. Character-based functions
        // are used throughout so multi-byte text is never cut mid-character, and
        // a match at offset 0 is distinguished from "not found".
        $author = '';
        $markerPos = mb_strpos($info, $sourceMarker);
        if ($markerPos !== false) {
            $author = trim(mb_substr($info, $markerPos + mb_strlen($sourceMarker)));
        }
        $r['author'] = $author !== '' ? $author : '互联网';

        $r['newstype'] = $newstype;
        $r['create_time'] = time();
        $r['update_time'] = time();
        $r['cover'] = $r['covermore'];

        $eles = $rs->find('.newsdetail_content');
        // Strip extra info blocks, hidden text, links and videos from the body.
        $eles->find('.news_infor_extra,.hide_word,a,video')->remove();
        $html = $eles->html();

        // Check the body BEFORE adding the prefix; otherwise the result is never empty.
        if (empty($html)) {
            continue;
        }
        // The no-referrer meta tag is prepended to the stored body
        // (presumably so hot-linked images load — TODO confirm).
        $r['content'] = '<meta name="referrer" content="no-referrer">'.$html;

        $re[] = $r;
    }

    // Debug output only. To persist instead, use:
    // Db::name('news')->strict(false)->insertAll(array_reverse($re));
    echo("<pre>");
    var_dump($re);
    echo("ok");
}
数据如下图
搜狐新闻
/**
 * Crawl one Sohu feed page (yule.sohu.com) and insert each article via Db::name('news').
 *
 * Request parameter `key` (optional, defaults to 0) selects the feed by index
 * from the $channels table below. An unknown or non-integer key prints
 * "invalid key" and stops instead of running with undefined channel data.
 *
 * Rows are inserted one at a time inside the loop; "ok" is printed at the end
 * regardless of how many rows were stored.
 */
public function souhu(){
    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class,'/usr/local/bin/phantomjs');

    $url = "https://yule.sohu.com/";

    // key => [query string appended to $url, author label stored with each article, newstype id]
    $channels = [
        0 => ["?spm=smpc.home.top-nav.13.1588229653251AKEu1Rp",          "搜狐娱乐", 9],
        1 => ["?spm=smpc.fb-sports-home.top-nav.6.1588747723680bgdSsuH", "搜狐旅游", 11],
        2 => ["?spm=smpc.acg-home.header.22.1588748238587euHYma9",       "搜狐搞笑", 19],
    ];

    // Validate the channel index before using it as an array offset.
    $key = input('key');
    if ($key === null || $key === '') {
        $key = 0;
    } else {
        $key = filter_var($key, FILTER_VALIDATE_INT);
    }
    if ($key === false || !isset($channels[$key])) {
        echo "invalid key";
        return;
    }
    [$path, $type, $newstype] = $channels[$key];

    $rules = [
        // article title
        'title' => ['.feed-attr-click>h3>a','text'],
        // article link
        'link' => ['.feed-attr-click>h3>a','href'],
        // thumbnail
        'covermore' => ['.feed-click-pic>a>img','src'],
    ];
    $range = ".feed-page .feed-one-pic";
    $rt = $ql->browser($url.$path)->rules($rules)->range($range)->query()->getData();

    foreach ($rt as $r){
        if (empty($r['link'])) {
            continue;
        }
        // Link and thumbnail are prefixed with a scheme
        // (assumes both are protocol-relative — TODO confirm).
        $rs = $ql->browser("http:".$r['link']);

        $r['author'] = $type;
        $r['newstype'] = $newstype;
        $r['create_time'] = time();
        $r['update_time'] = time();
        $r['cover'] = $r['covermore'] = "http:".$r['covermore'];
        $r['newstime'] = $rs->find("#news-time")->text();

        $eles = $rs->find('.article');
        // Strip links from the article body.
        $eles->find('a')->remove();
        $r['content'] = $eles->html();

        // Articles without a body are skipped.
        if (empty($r['content'])) {
            continue;
        }
        Db::name('news')->strict(false)->insert($r);
    }

    echo("ok");
}
数据如下图
微博
/**
 * Crawl the Weibo category page (category=10007) and bulk-insert the posts via Db::name('news').
 *
 * Both the list page and every post page are fetched through PhantomJs with a
 * hard-coded session cookie. Prints "<pre>" up front and "ok" when the bulk
 * insert reports at least one affected row.
 *
 * NOTE(review): the cookie strings below embed session tokens and an account
 * identifier directly in source. They are kept unchanged here to preserve
 * behaviour, but should be moved to configuration and rotated.
 */
public function weibo(){
    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class,'/usr/local/bin/phantomjs');

    $rules = [
        // post title
        'title' => ['h3','text'],
        // post link (empty selector: read href from the range element itself)
        'link' => ['','href'],
        // up to three thumbnails
        'cover1' => ['.pic:eq(0)>img','src'],
        'cover2' => ['.pic:eq(1)>img','src'],
        'cover3' => ['.pic:eq(2)>img','src'],
    ];
    $range = ".pt_ul .UG_list_a";

    $rt = $ql->browser(function (\JonnyW\PhantomJs\Http\RequestInterface $req){
        $req->setMethod('GET');
        $req->setUrl('https://weibo.com/?category=10007');
        $req->addHeader('cookie',"SINAGLOBAL=9018490019652.922.1575514453154; YF-V5-G0=3751b8b40efecee990eab49e8d3b3354; login_sid_t=868855ad3490b248b0a125f4d48262e4; cross_origin_proto=SSL; Ugrow-G0=9ec894e3c5cc0435786b4ee8ec8a55cc; _s_tentry=www.baidu.com; UOR=ent.sina.com.cn,widget.weibo.com,www.baidu.com; Apache=5650114746322.545.1588820121589; ULV=1588820121605:6:2:2:5650114746322.545.1588820121589:1588760906109; WBtopGlobal_register_version=fd6b3a12bb72ffed; wb_view_log=1920*10801%26375*8123; WBStorage=42212210b087ca50|undefined; SCF=AlPNfaV5l1TF42Ncqk9m7LxHwLHoK9rgCFrfuQ9PhI1N7hFsXbQkYet6VcoZ_lSwECH2Za2ak4SJQ2BG_LTs5GY.; SUB=_2A25zt9P3DeRhGeFN6FAW8CfJyjyIHXVQxUI_rDV8PUJbmtAKLRfFkW9NQFTqHhxacMl-vwYBQXw19BcXXs1RYjy_; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWJq5cpPcO3I.v0E3RVov7o5JpX5K2hUgL.FoM0e0zNeh.feK52dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNe0eES054SK27; SUHB=0jB4BFsy-COek7; SSOLoginState=1588831143; un=13785132215; wvr=6; YF-Page-G0=4358a4493c1ebf8ed493ef9c46f04cae|1588831146|1588831146; wb_view_log_7332709510=375*8123; webim_unReadCount=%7B%22time%22%3A1588831511070%2C%22dm_pub_total%22%3A1%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A44%2C%22msgbox%22%3A0%7D");
        $req->setTimeout(10000); // 10 seconds
        $req->setDelay(3); // 3 seconds
        return $req;
    })->rules($rules)->range($range)->query()->getData();

    echo("<pre>");

    // Initialise the batch so an empty crawl result never reaches array_reverse() undefined.
    $re = [];
    foreach ($rt as $r){
        // Without a link there is no post page to fetch.
        if (empty($r['link'])) {
            continue;
        }

        $rs = $ql->browser(function (\JonnyW\PhantomJs\Http\RequestInterface $ra) use($r){
            $ra->setMethod('GET');
            // A scheme is prepended to the list-page link
            // (assumes it is protocol-relative — TODO confirm).
            $ra->setUrl('https:'.$r['link']);
            $ra->addHeader('cookie',"SINAGLOBAL=9018490019652.922.1575514453154; un=13785132215; wb_view_log=1920*10801; YF-V5-G0=b1b8bc404aec69668ba2d36ae39dd980; Ugrow-G0=5c7144e56a57a456abed1d1511ad79e8; _s_tentry=test4.lianruanjt.cn; appkey=; Apache=8202356956297.876.1588907993326; ULV=1588907993331:7:3:3:8202356956297.876.1588907993326:1588820121605; login_sid_t=8741e09cd9ec34ab931c0b82854cee24; cross_origin_proto=SSL; SCF=AlPNfaV5l1TF42Ncqk9m7LxHwLHoK9rgCFrfuQ9PhI1NeicYiTXU02Vc1tk4m1uyhxxDrY0hxnreJv6fjOXFa1Y.; SUHB=0ebdoTqL9NaLts; wb_view_log_7332709510=375*8123%261920*10801; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWJq5cpPcO3I.v0E3RVov7o5JpVF020S0nfeKeEeo2f; SUB=_2AkMp6F_EdcPxrARSm_oSzGLnbY1H-jyaPTYyAn7uJhMyAxj_7gYhqSVutBF-XFIymCDYFdY9jkDYn_2symLFvEqJ; UOR=ent.sina.com.cn,widget.weibo.com,test4.lianruanjt.cn; webim_unReadCount=%7B%22time%22%3A1588908329416%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D; YF-Page-G0=2f0b518c8f18c7993f214275690d6fdf|1588918390|1588918390");
            $ra->setTimeout(10000); // 10 seconds
            $ra->setDelay(3); // 3 seconds
            return $ra;
        });

        $r['author'] = "微博美女";
        $r['newstype'] = 19;
        $r['create_time'] = time();
        $r['update_time'] = time();
        $r['cover'] = $r['cover1'];
        // All three thumbnails are stored in one field, joined by "··".
        $r['covermore'] = $r['cover1'].'··'.$r['cover2'].'··'.$r['cover3'];
        $r['newstime'] = $rs->find(".WB_from>a")->attr('title');

        $eles = $rs->find('.WB_media_a>li');
        $eles->find('i')->remove();
        // Give protocol-relative image URLs an https scheme.
        $r['content'] = str_replace('src="//wx','src="https://wx',$eles->html());
        // Swap the orj360 image variant for mw690 (presumably a larger rendition — verify),
        // and prepend a no-referrer meta tag to the stored body.
        $r['content'] = '<meta name="referrer" content="no-referrer">'.str_replace('.sinaimg.cn/orj360','.sinaimg.cn/mw690',$r['content']);

        $re[] = $r;
    }

    // Nothing collected: skip the database call entirely.
    if (empty($re)) {
        return;
    }

    // Rows are inserted in reverse list order.
    $res = Db::name('news')->strict(false)->insertAll(array_reverse($re));
    if ($res) {
        echo "ok";
    }
}
数据如下图