• 坚守岗位守护绿城美丽与整洁 2018-12-05
  • 采集

    采集

    PHP通过pthreads扩展实现真正的多线程采集

    孤魂 发表了文章 • 0 个评论 • 2005 次浏览 • 2015-12-25 09:11 • 来自相关话题

    最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了。如果你使用Phpstudy的套件的话,需要注意以下几点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。
    <?php
    // Remove PHP's execution time limit (0 = unlimited) so a long crawl is not aborted.
    set_time_limit(0);

    /**
     * Thread subclass that fetches one URL.
     *
     * run() passes the stored URL to model_http_curl_get() and keeps the
     * returned value in $data for the code that later reads it.
     */
    class new_thread_run extends Thread
    {
        /** @var mixed URL handed to the constructor. */
        public $url;

        /** @var mixed Value returned by model_http_curl_get(); only set when run() performs a fetch. */
        public $data;

        /**
         * @param mixed $url URL this worker should fetch.
         */
        public function __construct($url)
        {
            $this->url = $url;
        }

        /**
         * Thread body: fetch the URL unless it is empty/falsy.
         */
        public function run()
        {
            $target = $this->url;
            if (!$target) {
                // Nothing to fetch; $data is left untouched.
                return;
            }

            $this->data = model_http_curl_get($target);
        }
    }
    /**
     * Fetch every URL in $urls_array and return the results keyed like the input.
     *
     * When a class named `Thread` exists, one new_thread_run worker is started
     * per URL and the results are collected after joining each worker.
     * Otherwise the URLs are fetched one after another with
     * model_http_curl_get().
     *
     * @param array $urls_array URLs to fetch; the array keys are preserved.
     * @return array Input key => value produced by model_http_curl_get(). In
     *               threaded mode, entries whose join() is falsy are omitted.
     *               Empty input yields an empty array.
     */
    function model_thread_result_get($urls_array)
    {
        // Initialise up front so empty input returns array() instead of
        // triggering an undefined-variable warning and returning null.
        $variable_data = array();

        if (!class_exists('Thread')) {
            // No Thread class available: sequential fallback.
            foreach ($urls_array as $key => $value) {
                $variable_data[$key] = model_http_curl_get($value);
            }
            return $variable_data;
        }

        // Start every worker first so the requests can overlap.
        $thread_array = array();
        foreach ($urls_array as $key => $value) {
            $thread_array[$key] = new new_thread_run($value);
            $thread_array[$key]->start();
        }

        foreach ($thread_array as $key => $thread) {
            // join() itself waits for the worker to finish, so polling
            // isRunning() in a sleep loop beforehand is unnecessary.
            if ($thread->join()) {
                $variable_data[$key] = $thread->data;
            }
        }

        return $variable_data;
    }
    /**
     * Issue a cURL request for $url and hand back the result of curl_exec().
     *
     * The transfer is configured to return its result instead of printing it,
     * to time out after 20 seconds, to send a fixed MSIE 7 user agent, and to
     * skip TLS peer and host verification.
     *
     * @param string $url Address to request.
     * @return mixed Whatever curl_exec() returns for this handle.
     */
    function model_http_curl_get($url)
    {
        $handle = curl_init();

        // All options are independent of each other, so they are set in one call.
        curl_setopt_array($handle, array(
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_TIMEOUT        => 20,
            CURLOPT_USERAGENT      => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_URL            => $url,
        ));

        $response = curl_exec($handle);
        curl_close($handle);

        return $response;
    }

    // Practical example: build 50 search URLs, fetch them all, and print the elapsed time.
    $urls_array = array();
    for ($i = 0; $i < 50; $i++) {
        // Append to the list; a plain assignment would overwrite the variable
        // each iteration and leave a single string instead of 50 URLs.
        // The URL carries an explicit scheme rather than the scheme-less "//host" form.
        $urls_array[] = "https://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
    }
    $t = microtime(true);
    $result = model_thread_result_get($urls_array);
    $e = microtime(true);
    // Elapsed wall-clock seconds for the whole batch.
    echo "多线程:" . ($e - $t) . "\n";
    ?>参考链接:
    http://www.thinkphp.cn/topic/22676.html http://zyan.cc/pthreads/ 查看全部
    最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了。如果你使用Phpstudy的套件的话,需要注意以下几点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。
    <?php
    // Remove PHP's execution time limit (0 = unlimited) so a long crawl is not aborted.
    set_time_limit(0);

    /**
     * Thread subclass that fetches one URL.
     *
     * run() passes the stored URL to model_http_curl_get() and keeps the
     * returned value in $data for the code that later reads it.
     */
    class new_thread_run extends Thread
    {
        /** @var mixed URL handed to the constructor. */
        public $url;

        /** @var mixed Value returned by model_http_curl_get(); only set when run() performs a fetch. */
        public $data;

        /**
         * @param mixed $url URL this worker should fetch.
         */
        public function __construct($url)
        {
            $this->url = $url;
        }

        /**
         * Thread body: fetch the URL unless it is empty/falsy.
         */
        public function run()
        {
            $target = $this->url;
            if (!$target) {
                // Nothing to fetch; $data is left untouched.
                return;
            }

            $this->data = model_http_curl_get($target);
        }
    }
    /**
     * Fetch every URL in $urls_array and return the results keyed like the input.
     *
     * When a class named `Thread` exists, one new_thread_run worker is started
     * per URL and the results are collected after joining each worker.
     * Otherwise the URLs are fetched one after another with
     * model_http_curl_get().
     *
     * @param array $urls_array URLs to fetch; the array keys are preserved.
     * @return array Input key => value produced by model_http_curl_get(). In
     *               threaded mode, entries whose join() is falsy are omitted.
     *               Empty input yields an empty array.
     */
    function model_thread_result_get($urls_array)
    {
        // Initialise up front so empty input returns array() instead of
        // triggering an undefined-variable warning and returning null.
        $variable_data = array();

        if (!class_exists('Thread')) {
            // No Thread class available: sequential fallback.
            foreach ($urls_array as $key => $value) {
                $variable_data[$key] = model_http_curl_get($value);
            }
            return $variable_data;
        }

        // Start every worker first so the requests can overlap.
        $thread_array = array();
        foreach ($urls_array as $key => $value) {
            $thread_array[$key] = new new_thread_run($value);
            $thread_array[$key]->start();
        }

        foreach ($thread_array as $key => $thread) {
            // join() itself waits for the worker to finish, so polling
            // isRunning() in a sleep loop beforehand is unnecessary.
            if ($thread->join()) {
                $variable_data[$key] = $thread->data;
            }
        }

        return $variable_data;
    }
    /**
     * Issue a cURL request for $url and hand back the result of curl_exec().
     *
     * The transfer is configured to return its result instead of printing it,
     * to time out after 20 seconds, to send a fixed MSIE 7 user agent, and to
     * skip TLS peer and host verification.
     *
     * @param string $url Address to request.
     * @return mixed Whatever curl_exec() returns for this handle.
     */
    function model_http_curl_get($url)
    {
        $handle = curl_init();

        // All options are independent of each other, so they are set in one call.
        curl_setopt_array($handle, array(
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_TIMEOUT        => 20,
            CURLOPT_USERAGENT      => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_URL            => $url,
        ));

        $response = curl_exec($handle);
        curl_close($handle);

        return $response;
    }

    // Practical example: build 50 search URLs, fetch them all, and print the elapsed time.
    $urls_array = array();
    for ($i = 0; $i < 50; $i++) {
        // Append to the list; a plain assignment would overwrite the variable
        // each iteration and leave a single string instead of 50 URLs.
        // The URL carries an explicit scheme rather than the scheme-less "//host" form.
        $urls_array[] = "https://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
    }
    $t = microtime(true);
    $result = model_thread_result_get($urls_array);
    $e = microtime(true);
    // Elapsed wall-clock seconds for the whole batch.
    echo "多线程:" . ($e - $t) . "\n";
    ?>
    参考链接:

    PHP通过pthreads扩展实现真正的多线程采集

    孤魂 发表了文章 • 0 个评论 • 2005 次浏览 • 2015-12-25 09:11 • 来自相关话题

    最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了。如果你使用Phpstudy的套件的话,需要注意以下几点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。
    <?php
    // Remove PHP's execution time limit (0 = unlimited) so a long crawl is not aborted.
    set_time_limit(0);

    /**
     * Thread subclass that fetches one URL.
     *
     * run() passes the stored URL to model_http_curl_get() and keeps the
     * returned value in $data for the code that later reads it.
     */
    class new_thread_run extends Thread
    {
        /** @var mixed URL handed to the constructor. */
        public $url;

        /** @var mixed Value returned by model_http_curl_get(); only set when run() performs a fetch. */
        public $data;

        /**
         * @param mixed $url URL this worker should fetch.
         */
        public function __construct($url)
        {
            $this->url = $url;
        }

        /**
         * Thread body: fetch the URL unless it is empty/falsy.
         */
        public function run()
        {
            $target = $this->url;
            if (!$target) {
                // Nothing to fetch; $data is left untouched.
                return;
            }

            $this->data = model_http_curl_get($target);
        }
    }
    /**
     * Fetch every URL in $urls_array and return the results keyed like the input.
     *
     * When a class named `Thread` exists, one new_thread_run worker is started
     * per URL and the results are collected after joining each worker.
     * Otherwise the URLs are fetched one after another with
     * model_http_curl_get().
     *
     * @param array $urls_array URLs to fetch; the array keys are preserved.
     * @return array Input key => value produced by model_http_curl_get(). In
     *               threaded mode, entries whose join() is falsy are omitted.
     *               Empty input yields an empty array.
     */
    function model_thread_result_get($urls_array)
    {
        // Initialise up front so empty input returns array() instead of
        // triggering an undefined-variable warning and returning null.
        $variable_data = array();

        if (!class_exists('Thread')) {
            // No Thread class available: sequential fallback.
            foreach ($urls_array as $key => $value) {
                $variable_data[$key] = model_http_curl_get($value);
            }
            return $variable_data;
        }

        // Start every worker first so the requests can overlap.
        $thread_array = array();
        foreach ($urls_array as $key => $value) {
            $thread_array[$key] = new new_thread_run($value);
            $thread_array[$key]->start();
        }

        foreach ($thread_array as $key => $thread) {
            // join() itself waits for the worker to finish, so polling
            // isRunning() in a sleep loop beforehand is unnecessary.
            if ($thread->join()) {
                $variable_data[$key] = $thread->data;
            }
        }

        return $variable_data;
    }
    /**
     * Issue a cURL request for $url and hand back the result of curl_exec().
     *
     * The transfer is configured to return its result instead of printing it,
     * to time out after 20 seconds, to send a fixed MSIE 7 user agent, and to
     * skip TLS peer and host verification.
     *
     * @param string $url Address to request.
     * @return mixed Whatever curl_exec() returns for this handle.
     */
    function model_http_curl_get($url)
    {
        $handle = curl_init();

        // All options are independent of each other, so they are set in one call.
        curl_setopt_array($handle, array(
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_TIMEOUT        => 20,
            CURLOPT_USERAGENT      => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_URL            => $url,
        ));

        $response = curl_exec($handle);
        curl_close($handle);

        return $response;
    }

    // Practical example: build 50 search URLs, fetch them all, and print the elapsed time.
    $urls_array = array();
    for ($i = 0; $i < 50; $i++) {
        // Append to the list; a plain assignment would overwrite the variable
        // each iteration and leave a single string instead of 50 URLs.
        // The URL carries an explicit scheme rather than the scheme-less "//host" form.
        $urls_array[] = "https://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
    }
    $t = microtime(true);
    $result = model_thread_result_get($urls_array);
    $e = microtime(true);
    // Elapsed wall-clock seconds for the whole batch.
    echo "多线程:" . ($e - $t) . "\n";
    ?>参考链接:
    http://www.thinkphp.cn/topic/22676.html http://zyan.cc/pthreads/ 查看全部
    最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了。如果你使用Phpstudy的套件的话,需要注意以下几点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。
    <?php
    // Remove PHP's execution time limit (0 = unlimited) so a long crawl is not aborted.
    set_time_limit(0);

    /**
     * Thread subclass that fetches one URL.
     *
     * run() passes the stored URL to model_http_curl_get() and keeps the
     * returned value in $data for the code that later reads it.
     */
    class new_thread_run extends Thread
    {
        /** @var mixed URL handed to the constructor. */
        public $url;

        /** @var mixed Value returned by model_http_curl_get(); only set when run() performs a fetch. */
        public $data;

        /**
         * @param mixed $url URL this worker should fetch.
         */
        public function __construct($url)
        {
            $this->url = $url;
        }

        /**
         * Thread body: fetch the URL unless it is empty/falsy.
         */
        public function run()
        {
            $target = $this->url;
            if (!$target) {
                // Nothing to fetch; $data is left untouched.
                return;
            }

            $this->data = model_http_curl_get($target);
        }
    }
    /**
     * Fetch every URL in $urls_array and return the results keyed like the input.
     *
     * When a class named `Thread` exists, one new_thread_run worker is started
     * per URL and the results are collected after joining each worker.
     * Otherwise the URLs are fetched one after another with
     * model_http_curl_get().
     *
     * @param array $urls_array URLs to fetch; the array keys are preserved.
     * @return array Input key => value produced by model_http_curl_get(). In
     *               threaded mode, entries whose join() is falsy are omitted.
     *               Empty input yields an empty array.
     */
    function model_thread_result_get($urls_array)
    {
        // Initialise up front so empty input returns array() instead of
        // triggering an undefined-variable warning and returning null.
        $variable_data = array();

        if (!class_exists('Thread')) {
            // No Thread class available: sequential fallback.
            foreach ($urls_array as $key => $value) {
                $variable_data[$key] = model_http_curl_get($value);
            }
            return $variable_data;
        }

        // Start every worker first so the requests can overlap.
        $thread_array = array();
        foreach ($urls_array as $key => $value) {
            $thread_array[$key] = new new_thread_run($value);
            $thread_array[$key]->start();
        }

        foreach ($thread_array as $key => $thread) {
            // join() itself waits for the worker to finish, so polling
            // isRunning() in a sleep loop beforehand is unnecessary.
            if ($thread->join()) {
                $variable_data[$key] = $thread->data;
            }
        }

        return $variable_data;
    }
    /**
     * Issue a cURL request for $url and hand back the result of curl_exec().
     *
     * The transfer is configured to return its result instead of printing it,
     * to time out after 20 seconds, to send a fixed MSIE 7 user agent, and to
     * skip TLS peer and host verification.
     *
     * @param string $url Address to request.
     * @return mixed Whatever curl_exec() returns for this handle.
     */
    function model_http_curl_get($url)
    {
        $handle = curl_init();

        // All options are independent of each other, so they are set in one call.
        curl_setopt_array($handle, array(
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_TIMEOUT        => 20,
            CURLOPT_USERAGENT      => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_URL            => $url,
        ));

        $response = curl_exec($handle);
        curl_close($handle);

        return $response;
    }

    // Practical example: build 50 search URLs, fetch them all, and print the elapsed time.
    $urls_array = array();
    for ($i = 0; $i < 50; $i++) {
        // Append to the list; a plain assignment would overwrite the variable
        // each iteration and leave a single string instead of 50 URLs.
        // The URL carries an explicit scheme rather than the scheme-less "//host" form.
        $urls_array[] = "https://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
    }
    $t = microtime(true);
    $result = model_thread_result_get($urls_array);
    $e = microtime(true);
    // Elapsed wall-clock seconds for the whole batch.
    echo "多线程:" . ($e - $t) . "\n";
    ?>
    参考链接:

  • 坚守岗位守护绿城美丽与整洁 2018-12-05