小伙伴们可以先看《免费查询快递物流》这篇文章,其中详细介绍了用 PHP 常规方式爬取物流信息的思路。本文在上一篇文章的基础上加入了协程,属于升级版。之前还写过一篇《hyperf协程大批量匹配快递物流信息》的文章,其中也介绍了协程的执行速度相比传统方式有质的飞跃。这里我们用 hyperf 协程来免费查询快递物流。
php7.2+swoole4+hyperf2.0
Hyperf\config\autoload\server.php配置为
<?php
declare(strict_types=1);
/**
* This file is part of Hyperf.
*
* @link https://www.hyperf.io
* @document https://hyperf.wiki
* @contact group@hyperf.io
* @license https://github.com/hyperf/hyperf/blob/master/LICENSE
*/
use Hyperf\Server\Server;
use Hyperf\Server\SwooleEvent;
// Swoole/Hyperf server configuration returned to the framework.
return [
// Swoole server running mode.
'mode' => SWOOLE_PROCESS,
// List of servers to start; only a single HTTP server is declared here.
'servers' => [
[
'name' => 'http',
'type' => Server::SERVER_HTTP,
// Listen on all interfaces, TCP port 8080.
'host' => '0.0.0.0',
'port' => 8080,
'sock_type' => SWOOLE_SOCK_TCP,
'callbacks' => [
// Incoming requests are handed to Hyperf's HTTP server onRequest handler.
SwooleEvent::ON_REQUEST => [Hyperf\HttpServer\Server::class, 'onRequest'],
],
],
],
// Settings passed through to the Swoole server.
'settings' => [
'enable_coroutine' => true,
// One worker per CPU core as reported by swoole_cpu_num().
'worker_num' => swoole_cpu_num(),
'pid_file' => BASE_PATH . '/runtime/hyperf.pid',
'open_tcp_nodelay' => true,
'max_coroutine' => 100000,
'open_http2_protocol' => true,
'max_request' => 100000,
// Both buffers are 2 * 1024 * 1024 bytes (2 MiB).
'socket_buffer_size' => 2 * 1024 * 1024,
'buffer_output_size' => 2 * 1024 * 1024,
// Coroutine runtime hooks; the cURL hook is OR-ed in explicitly so the controller's curl_* calls are hooked.
// NOTE(review): whether SWOOLE_HOOK_ALL already contains SWOOLE_HOOK_CURL depends on the Swoole version - confirm.
'hook_flags' => SWOOLE_HOOK_ALL | SWOOLE_HOOK_CURL,
// Run the server as a background daemon.
'daemonize' => true,
],
// Server-level lifecycle callbacks wired to Hyperf's bootstrap classes.
'callbacks' => [
SwooleEvent::ON_WORKER_START => [Hyperf\Framework\Bootstrap\WorkerStartCallback::class, 'onWorkerStart'],
SwooleEvent::ON_PIPE_MESSAGE => [Hyperf\Framework\Bootstrap\PipeMessageCallback::class, 'onPipeMessage'],
SwooleEvent::ON_WORKER_EXIT => [Hyperf\Framework\Bootstrap\WorkerExitCallback::class, 'onWorkerExit'],
],
];
Hyperf\config\routes.php 配置为:
// Register GET /getkdstatus2 and dispatch it to KuaidiController::getkdstatus2.
Router::get('/getkdstatus2', 'App\Controller\KuaidiController::getkdstatus2');
Hyperf\app\Controller\KuaidiController.php
<?php
declare(strict_types=1);
/**
* This file is part of Hyperf.
*
* @link https://www.hyperf.io
* @document https://hyperf.wiki
* @contact group@hyperf.io
* @license https://github.com/hyperf/hyperf/blob/master/LICENSE
*/
namespace App\Controller;
use QL\QueryList;
use Hyperf\DbConnection\Db;
use Hyperf\HttpServer\Contract\RequestInterface;
/**
 * Refreshes express (courier) tracking status for rows of the `yunorders`
 * table by querying Baidu's express API from coroutines.
 */
class KuaidiController extends AbstractController
{
    /**
     * Refresh the `current` tracking status of every order with status 3
     * whose `current` column equals the `current` request input.
     *
     * Orders are processed in batches of 100, one coroutine per order. Each
     * batch is awaited before the next one starts: launching thousands of
     * coroutines at once exhausted the PHP memory limit
     * ("Allowed memory size of 268435456 bytes exhausted").
     *
     * @param RequestInterface $request Request carrying the optional `current` filter.
     *
     * @return string 'ok' when all batches were processed, or an 'error: ...'
     *                message when the tokenV2 could not be obtained.
     */
    public function getkdstatus2(RequestInterface $request)
    {
        $current = $request->input('current', '未获取');
        $limit = 100;

        // Count in SQL instead of loading every row just to count it.
        $total = Db::table('yunorders')
            ->where('status', 3)
            ->where('current', $current)
            ->count();
        $pages = intval(ceil($total / $limit));

        // Walk the pages from last to first: updated rows may stop matching
        // the `current` filter, which only shifts rows located *after* the
        // page being processed. This requires a stable order, hence orderBy.
        for ($p = $pages - 1; $p >= 0; $p--) {
            $orders = Db::table('yunorders')
                ->where('status', 3)
                ->where('current', $current)
                ->orderBy('id')
                ->offset($p * $limit)
                ->limit($limit)
                ->get(['id', 'expressCode']);

            if ($orders->isEmpty()) {
                continue;
            }

            $info = $this->getinfo();
            if (empty($info['tokenV2'])) {
                // Without a token every API call would fail; stop early.
                return 'error: tokenV2 not found';
            }
            $tokenV2 = (string) $info['tokenV2'];

            $wg = new \Hyperf\Utils\WaitGroup();
            foreach ($orders as $order) {
                // Rows may be objects or arrays depending on the fetch mode.
                $order = (array) $order;
                $wg->add(1);
                co(function () use ($order, $tokenV2, $wg) {
                    try {
                        $this->refreshOrder($order, $tokenV2);
                    } finally {
                        // Always release the wait group, otherwise a failing
                        // coroutine would make wait() block forever.
                        $wg->done();
                    }
                });
            }
            // Wait for the whole batch before starting the next one.
            $wg->wait();
        }

        return 'ok';
    }

    /**
     * Build a random string by shuffling an alphanumeric alphabet.
     *
     * @param int $length Number of characters to return (at most 62, the alphabet size).
     *
     * @return string The first $length characters of the shuffled alphabet.
     */
    public function getrandstr($length)
    {
        $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890';

        return substr(str_shuffle($alphabet), 0, $length);
    }

    /**
     * Fetch Baidu's search result page for "快递" and extract the tokenV2
     * needed by the express API.
     *
     * @return array{tokenV2: string|null} The token, or null when the request
     *                                     failed or no token was found.
     */
    public function getinfo()
    {
        // The `tn` parameter identifies the referring site of the search;
        // requests are unstable without it.
        $url = 'https://www.baidu.com/s?tn=02003390_43_hao_pg&ie=utf-8&wd=%E5%BF%AB%E9%80%92';
        $header = [
            "Host:www.baidu.com",
            "Content-Type:application/x-www-form-urlencoded",
            "Connection: keep-alive",
            'Referer:http://www.baidu.com',
            // A realistic User-Agent is essential: Baidu refuses to serve the
            // page when the user agent is not emulated well enough.
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
            // If tokenV2 can no longer be fetched, open the URL above in a
            // browser and paste a fresh cookie here.
            // NOTE(review): hard-coded session cookie; consider moving it to configuration.
            'cookie:BIDUPSID=99230F5F14C63D007473C3D9F3787EA9; PSTM=1615876171; __yjs_duid=1_f42a699a56a29eb0b3b574aaf9ff6b971618551519092; BAIDUID=5AFE833F73F7A99737622161E381961A:FG=1; H_WISE_SIDS=110085_127969_174434_179348_184716_188333_188742_189755_190625_194085_196427_197242_197471_197711_199023_199568_201193_203310_203504_203880_203882_203885_204123_204713_204715_204717_204720_204817_204859_204902_205218_205414_205420_205424_205909_206927_206929_207234_207573_207716_207830_208065_208310_208721_209160_209394_209512_209568_209748_210127_210580_210669_210732_210736_210852_210890_210892_210895_210907_211059_211062_211113_211172_211180_211208_211296_211301_211350_211414_211442_211457_211580_211737_211754_211783_211985_212177_212295_212416_212532_212618_212778_212913_212924_212962_212967_213003_213036_213051_213059_213069_213140; MSA_WH=414_896; sugstore=1; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BD_UPN=12314753; MCITY=-150:; BAIDUID_BFESS=5AFE833F73F7A99737622161E381961A:FG=1; BA_HECTOR=2520252h012l81a5201h9945p15; ZFY=HW4om4BEMLRXyqfICzhl6K9WVCw:A49nZ4LGe7Sy7XSg:C; BDRCVFR[n9IS1zhFc9f]=mk3SLVN4HKm; delPer=0; PSINO=1; H_PS_PSSID=31254_26350; BDRCVFR[Ter2S3H5o_D]=mk3SLVN4HKm; BD_CK_SAM=1; H_PS_645EC=15a7eRjMVUdlLYA1lxLlzLU8A6RFQ20GGNI/PxnbcG8ev4RWhCEGU93iZ1E+YTdrFv47rdm9ywHP; baikeVisitId=b5175df9-f1c6-4b85-9607-ed08055147d8; COOKIE_SESSION=619_0_6_6_3_0_1_0_6_0_0_0_43_0_0_0_1653636800_0_1653960303|9#3743_18_1653547610|9',
        ];

        $content = $this->httpGet($url, $header, [
            // Include response headers in the returned content.
            CURLOPT_HEADER => 1,
            // NOTE(review): certificate verification is disabled here; enable
            // it if a CA bundle is available in the runtime environment.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            // Same value as the previous literal 4.
            // NOTE(review): TLS 1.0 is obsolete; confirm whether this pin is still needed.
            CURLOPT_SSLVERSION => CURL_SSLVERSION_TLSv1_0,
        ]);

        // Extract tokenV2 from the first `tokenV2=...\"` occurrence.
        if ($content === null || preg_match('/tokenV2=(.*?)"/i', $content, $match) !== 1) {
            return ['tokenV2' => null];
        }

        return ['tokenV2' => $match[1]];
    }

    /**
     * Query the express API for one order and store the returned status.
     *
     * @param array  $order   Row with the keys `id` and `expressCode`.
     * @param string $tokenV2 Token obtained from getinfo().
     */
    private function refreshOrder(array $order, string $tokenV2): void
    {
        $url = 'https://express.baidu.com/express/api/express?tokenV2=' . $tokenV2
            . '&nu=' . rawurlencode((string) $order['expressCode']);

        // Each request carries a freshly generated random BAIDUID cookie.
        $baiduid = ucfirst(md5($this->getrandstr(6) . rand(10000, 99999)));
        $header = [
            "Host:express.baidu.com",
            "Content-Type:application/x-www-form-urlencoded",
            "Connection: keep-alive",
            'Referer:http://www.baidu.com',
            // A realistic User-Agent is essential, see getinfo().
            'User-Agent: Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
            'Cookie:' . "BAIDUID=" . $baiduid . ":FG=1;",
        ];

        $content = $this->httpGet($url, $header);
        if ($content === null) {
            return;
        }

        $res = json_decode($content, true);
        // Skip responses that are not JSON or carry no tracking status.
        if (! isset($res['data']['info']['current'])) {
            return;
        }

        Db::table('yunorders')->where('id', $order['id'])->update([
            'updated_at' => date("Y-m-d H:i:s"),
            'current' => $res['data']['info']['current'],
        ]);
    }

    /**
     * Perform a GET request with cURL and return the response body.
     *
     * @param string $url     Target URL.
     * @param array  $header  Raw header lines for CURLOPT_HTTPHEADER.
     * @param array  $options Extra cURL options, keyed by CURLOPT_* constant.
     *
     * @return string|null The response, or null when the transfer failed
     *                     (the cURL error is echoed).
     */
    private function httpGet(string $url, array $header, array $options = []): ?string
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        foreach ($options as $option => $value) {
            curl_setopt($ch, $option, $value);
        }

        $content = curl_exec($ch);
        // Strict comparison: an empty body is a valid response, not a failure.
        if ($content === false) {
            echo "error:" . curl_error($ch);
            curl_close($ch);

            return null;
        }
        curl_close($ch);

        return (string) $content;
    }
}
普通流程下 php curl 一秒一个,爬取二十个就用了二十秒;加了协程后,1000 单大概只需 3 秒左右。上面加了一个等待协程(WaitGroup),如果不加这个,一下执行几千几万条,会出现 Allowed memory size of 268435456 bytes exhausted 内存溢出,所以我们一次只执行一百个协程,等这一百个协程执行成功后,再执行下一批一百个协程。
qweerr
asdasd
d
zzzzzzzzzzz
dsdasdas
qwe