PHP扩展下载:https://github.com/krakjoe/pthreads
PHP手册文档:http://php.net/manual/zh/book.pthreads.php
在安装好扩展之后,就可以运用多线程了,下面贴个通过搜索结果抓取百度网盘内容的代码:
代码如下:
<?php
include 'include/CurlLoad.class.php'; // 引入读取库
/**
 * Collect Baidu Pan resource details for every link found on a search-result
 * page, fetching each link in its own worker thread.
 *
 * @param string $url URL of the search-result page to crawl
 * @return array|null On success an array with the keys 'res' (list of the
 *         non-empty per-link results; the key is absent when no link
 *         produced one), 'pages' (pagination data from BaiduSRLinksGet, or
 *         null when none was returned) and 'status' ("1"). Returns null
 *         when BaiduSRLinksGet yields nothing usable.
 */
function vget($url) {
    $ret = BaiduSRLinksGet ( $url, 1 ); // resolve the result links to their real addresses
    if (! is_array ( $ret ) || ! array_key_exists ( "links", $ret )) {
        return null;
    }

    $infos = array ();
    // Initialised up front so the collection loop below is safe when the
    // link list is empty.
    $thread_array = array ();
    foreach ( $ret ['links'] as $i => $link ) { // one worker thread per link
        $thread_array [$i] = new baidu_thread_run ( $link );
        $thread_array [$i]->start ();
    }

    foreach ( $thread_array as $thread ) {
        // join() blocks until the worker has finished, so no polling of
        // isRunning() is needed before calling it.
        if ($thread->join ()) {
            $temp = $thread->data;
            if ($temp != null)
                $infos ['res'] [] = $temp;
        }
    }

    // 'pages' is only present when pagination links were found.
    $infos ['pages'] = isset ( $ret ['pages'] ) ? $ret ['pages'] : null;
    $infos ['status'] = "1";
    return $infos;
}
/**
 * Extract the result links and the pagination links from a Baidu
 * search-result page.
 *
 * @param string $url
 *        	URL of the search-result page
 * @param int $format
 *        	0 (default) returns the links exactly as found in the page;
 *        	1 replaces each link with the value of its "Location" response
 *        	header when that header is present
 * @return array|null array with the key 'links' and, when pagination links
 *         were found, 'pages' (index pn / 10 => base64-encoded link tail);
 *         null when the page could not be loaded or no result link matched
 */
function BaiduSRLinksGet($url, $format = 0) {
    $html = CurlLoad::HtmlGet ( $url ); // load the page
    if ($html == null)
        return null;
    try {
        // Named group "links" captures the value of every "url":"..."} entry.
        preg_match_all ( '/"url":"(?<links>.*)"}/', $html, $rets );
        // The named key always exists once the pattern compiles, so an empty
        // list (not a missing key) is what signals "nothing found".
        if (empty ( $rets ['links'] ))
            return null;

        $ret = array ();
        if ($format == 1) {
            foreach ( $rets ['links'] as $i => $link ) {
                $headr_temp = CurlLoad::Get_Headers ( $link, 1 ); // look up the real address via the response headers
                // Fall back to the original link for this entry only, so
                // addresses already resolved for earlier entries are kept.
                if (is_array ( $headr_temp ) && array_key_exists ( "Location", $headr_temp ))
                    $ret ['links'] [$i] = $headr_temp ['Location'];
                else
                    $ret ['links'] [$i] = $link;
            }
        } else
            $ret ['links'] = $rets ['links'];

        // Named group "url" captures the query-string tail of each pagination link.
        preg_match_all ( '/href="?\/s\?wd=site%3Apan.baidu.com%20(?<url>.+?)&ie=utf-8">/', $html, $out );
        unset ( $out ['url'] [0] );
        // NOTE(review): count() is taken after the unset, so the last matched
        // link is never visited - confirm that skipping it is intended.
        $number = count ( $out ['url'] );
        for($i = 1; $i < $number; $i ++) {
            // Skip links without a page offset instead of reading a missing match.
            if (! preg_match ( '/&pn=(\d+)/', $out ['url'] [$i], $temp ))
                continue;
            $ret ['pages'] [( int ) ($temp [1] / 10)] = base64_encode ( $out ['url'] [$i] );
        }
        return $ret;
    } catch ( Exception $e ) {
        WriteLog ( $e );
        return null;
    }
}
/**
 * Read the resource details shown on a Baidu Pan share page.
 *
 * @param string $url
 *        	URL of the share page
 * @return array|null array with the keys 'name', 'size', 'user', 'date',
 *         'number' (taken from the first match in the page) and 'link'
 *         (the given $url); null when the page could not be loaded or the
 *         expected text was not found
 */
function PanInfoGet($url) {
    $html = CurlLoad::HtmlGet ( $url ); // load the page
    if ($html == null)
        return null;
    try {
        // Each field is a named group so it can be read back by name below.
        $pattern = '/文件名:(?<name>.*) 文件大小:(?<size>.*) 分享者:(?<user>.*) 分享时间:(?<date>.*) 下载次数:(?<number>[0-9]+)/';
        // preg_match stops at the first match, which is the only one used.
        if (! preg_match ( $pattern, $html, $ret ))
            return null;
        return array (
            'name' => $ret ['name'],
            'size' => $ret ['size'],
            'user' => $ret ['user'],
            'date' => $ret ['date'],
            'number' => $ret ['number'],
            'link' => $url
        );
    } catch ( Exception $e ) {
        WriteLog ( $e );
        return null;
    }
}
/**
 * Append one timestamped warning line to ../error.log (relative to the
 * current working directory).
 *
 * @param mixed $str message to log; it is converted to a string by
 *        concatenation, so an Exception object is written via __toString()
 * @return void
 */
function WriteLog($str) {
    $line = "Warning:" . date ( "Y/m/d H:i:s" ) . ":" . $str . "\r\n";
    $file = fopen ( "../error.log", "a+" );
    if ($file === false) {
        // Logging is best-effort: an unwritable log must not break the caller.
        return;
    }
    fwrite ( $file, $line );
    fclose ( $file );
}
/**
 * Worker thread that fetches the details of one share page.
 *
 * The URL passed to the constructor is handed to PanInfoGet() when the
 * thread runs; the result is stored in $data.
 *
 * @author MuXi
 */
class baidu_thread_run extends Thread {
    // Address this worker fetches.
    public $url;
    // Result of PanInfoGet() for $url; stays unset when $url is empty.
    public $data;

    public function __construct($url) {
        $this->url = $url;
    }

    // Thread entry point.
    public function run() {
        $target = $this->url;
        if (! $target) {
            return;
        }
        $this->data = PanInfoGet ( $target );
    }
}
?>
|