php 抓取新浪新闻的程序代码

作者：袖梨 2022-06-24

首先，需要下载第三方库 simple_html_dom，具体的下载方式和使用方法可以参考《simple_html_dom的使用》。

运行环境需要支持 file_get_contents() 函数和 curl 扩展（保存为脚本时，请在代码开头加上 <?php 标记），具体代码如下：

// Step 1: download the listing page and collect the article URLs to scrape.
// NOTE: the opening "<?php" tag is missing from this listing; add it when
// saving the code as a standalone script.
include_once('simple_html_dom.php');

// NOTE(review): $url (the listing page to crawl) is never assigned anywhere in
// this snippet, so it has to be defined before this point.
if (!isset($url) || $url === '') {
    exit('Please set $url to the listing page before running this script.');
}

// Fetch the listing page; RETURNTRANSFER makes curl_exec() return the body
// instead of printing it.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$output = curl_exec($ch);
if ($output === false) {
    // Read the error before closing the handle, then stop: there is nothing to parse.
    $error = curl_error($ch);
    curl_close($ch);
    exit('Failed to download ' . $url . ': ' . $error);
}
curl_close($ch);

$html = new simple_html_dom();
$html->load($output);

// Despite its name, $images holds the article URLs found on the listing page.
$images = array();
$arr = array();

foreach ($html->find('li  a') as $element) {
    // Keep only links shaped like http://tech.sina.com.cn/it/YYYY-M-D/<digits>.shtml.
    // The published pattern had lost its backslashes ("[d]" matches the letter
    // "d", not a digit) and left the dots unescaped, so it could never match.
    if (preg_match('#^http://tech\.sina\.com\.cn/it/\d{4}-\d{1,2}-\d{1,2}/\d+\.shtml$#i', $element->href)) {
        $images[] = $element->href;
    }
}

// Drop duplicate links; sort() also re-indexes the array from 0, which the
// index-based loop over $images relies on.
$images = array_unique($images);
sort($images);

  // Step 2: visit every collected article URL and extract its data.
  //
  // NOTE(review): the published listing is truncated at this point. Everything
  // between the loop condition and the "->load($data)" call, including the
  // function header, was lost. The loop body, the function name get_news(),
  // its parameters and the default picture directory are a reconstruction and
  // should be confirmed against the intended design.
  if (!isset($dirname)) {
      $dirname = './images/';
  }

  // Parsed articles, one entry per successfully downloaded page.
  $news = array();
  $total = count($images);
  for ($i = 0; $i < $total; $i++) {
      $article = get_news($images[$i], $dirname);
      if (!empty($article)) {
          $news[] = $article;
      }
  }

  /**
   * Download one article page and extract its title, body and picture.
   *
   * @param string $url     Article URL to download.
   * @param string $dirname Directory the article pictures are saved into; it is
   *                        created on demand.
   * @return array Empty when the page could not be downloaded. Otherwise
   *               'content' is always set, while 'title', 'alt' and 'img' are
   *               set only when the matching element was found (and, for the
   *               picture, saved). With several pictures the last saved one wins.
   */
  function get_news($url, $dirname = './images/')
  {
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, false);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
      $data = curl_exec($ch);
      curl_close($ch);
      if ($data === false) {
          return array();
      }

      $html = new simple_html_dom();
      $html->load($data);

      $arr = array();

      foreach ($html->find('h1#artibodyTitle') as $element) {
          // The title is converted from GBK to UTF-8; if the conversion fails,
          // keep the raw text rather than storing false.
          // NOTE(review): body text and alt text are not converted - confirm
          // whether they should be.
          $title = @iconv('gbk', 'utf-8', $element->innertext);
          $arr['title'] = ($title === false) ? $element->innertext : $title;
      }

      // Concatenate the HTML of every paragraph in the article body.
      $str = '';
      foreach ($html->find('div#artibody p') as $element) {
          $str .= $element;
      }
      $arr['content'] = $str;

      // Make sure directory and file name are joined by exactly one slash.
      $dirname = rtrim($dirname, '/\\') . '/';

      foreach ($html->find('div.img_wrapper img') as $element) {
          $src = $element->src;
          // Only fetch remote http(s) pictures; src comes from an untrusted page
          // and must not be allowed to name a local path or another stream wrapper.
          if (!is_string($src) || !preg_match('#^https?://#i', $src)) {
              continue;
          }

          // Download once and inspect the bytes in memory, instead of fetching
          // the same URL a second time just to read its dimensions.
          $data = file_get_contents($src);
          if ($data === false) {
              continue;
          }
          $info = getimagesizefromstring($data);
          if ($info === false) {
              continue;
          }

          // A dedicated $ext variable is used so the article body in $str is
          // never mistaken for a file extension.
          switch ($info[2]) {
              case IMAGETYPE_GIF:
                  $ext = 'gif';
                  break;
              case IMAGETYPE_JPEG:
                  $ext = 'jpg';
                  break;
              case IMAGETYPE_PNG:
                  $ext = 'png';
                  break;
              default:
                  // "continue 2" is required: a bare "continue" inside switch
                  // acts like "break", so the picture would not be skipped.
                  continue 2;
          }

          if (!is_dir($dirname) && !mkdir($dirname, 0777, true)) {
              // Without a target directory no picture can be stored.
              break;
          }

          $filename = time() . rand(1, 999999) . '.' . $ext;
          if (file_put_contents($dirname . $filename, $data) === false) {
              continue;
          }

          // Record alt text and path together so they describe the same picture.
          $arr['alt'] = $element->alt;
          $arr['img'] = $dirname . $filename;
      }

      // Release the parser's internal references; the extracted values are
      // plain strings by now.
      $html->clear();

      return $arr;
  }

?>

如果大家有更好的建议，欢迎提出来。

php 抓取新浪新闻的程序代码

相关文章

精彩推荐