PHP爬取文章(可作为扩展类直接使用)
开门见山地说,这份代码是我在GitHub上拿的,但由于那个项目年代比较久远,微信文章页面的一些规则变了,原代码已经不能用了,所以我改了部分代码,使它达到可以直接使用的程度。
功能:根据链接,爬取文章的文字和图片(图片下载到本地),以html形式保存。
直接贴代码,就一个文件,可以直接用:
<?php
namespace WxCrawler;
/**
 * Article crawler.
 *
 * Downloads an article page, extracts the variables embedded in the page's
 * inline JavaScript (title, digest, cover, ...), isolates the article body
 * (the element with id="js_content"), downloads every lazily-loaded image
 * (data-src attribute) into a local directory and rewrites the image tags
 * to point at the local copies.
 *
 * Requires the GD extension (imagecreatefromstring / imagejpeg) and
 * allow_url_fopen for the HTTP requests.
 */
class WxCrawler
{
    /**
     * Regex that locates the article body container. It keys on the
     * id="js_content" attribute only, so changes in attribute order,
     * spacing or extra attributes (e.g. an inline style) do not break it.
     * Note: the non-greedy body capture stops at the first closing div.
     *
     * @var string
     */
    private $wxContentDiv = '/<div\b[^>]*\bid="js_content"[^>]*>(.*?)<\/div>/s';

    /**
     * Extra attribute text appended after every rewritten image src
     * (for example an inline style). Empty by default.
     *
     * @var string
     */
    private $imageStyle = '';

    /**
     * Directory (with trailing slash) where downloaded images are stored and
     * which is used as the prefix of the rewritten src attributes.
     *
     * @var string
     */
    private $imagePath = 'article/';

    /**
     * Referer header sent when downloading images.
     * NOTE(review): the original value was truncated to "www.qq/"; restored
     * to a full URL — confirm that the image host accepts it.
     *
     * @var string
     */
    private $imageReferer = 'https://www.qq.com/';

    /**
     * Crawl one article.
     *
     * @param  string $url Article URL.
     * @return array  Keys: author, copyright_stat, date, title, digest,
     *                content_url, cover, wechatname, content_html (body with
     *                local images), content_text (body with img tags removed).
     * @throws \RuntimeException When the page or an image cannot be fetched,
     *                           or the article body is not found.
     */
    public function crawByUrl($url)
    {
        $page = $this->_get($url);
        $basicInfo = $this->articleBasicInfo($page);
        list($contentHtml, $contentText) = $this->contentHandle($page);

        return array_merge($basicInfo, [
            'content_html' => $contentHtml,
            'content_text' => $contentText,
        ]);
    }

    /**
     * Fetch the raw page source.
     *
     * @param  string $url
     * @return string
     * @throws \RuntimeException When the request fails.
     */
    private function _get($url)
    {
        $page = file_get_contents($url);
        if ($page === false) {
            // Fail here instead of passing false on to the regex stages.
            throw new \RuntimeException('Failed to fetch article: ' . $url);
        }

        return $page;
    }

    /**
     * Extract the article body from the page source and localise its images.
     *
     * @param  string $content Full page source.
     * @return array  [html with local images, html with img tags removed]
     * @throws \RuntimeException When the body container is not present.
     */
    private function contentHandle($content)
    {
        if (preg_match($this->wxContentDiv, $content, $bodyMatch) !== 1) {
            // Previously echo + exit(); an exception lets the caller decide.
            throw new \RuntimeException('⽂章不存在');
        }
        $html = $bodyMatch[0];

        // Drop an inline style that hides the body container, if present
        // (only the container's own opening tag is touched).
        $html = preg_replace(
            '/^(<div\b[^>]*?)\s*style="[^"]*visibility:\s*hidden[^"]*"/i',
            '$1',
            $html,
            1
        );

        // Remove embedded iframes; the "s" flag covers iframes spanning lines.
        $html = preg_replace('/<iframe\b.*?<\/iframe>/is', '', $html);

        // Replace each lazy-load data-src with a src pointing at a local copy.
        $imagePath = $this->imagePath;
        $html = preg_replace_callback('/data-src="(.*?)"/', function ($match) use ($imagePath) {
            // Attribute values are HTML-encoded (e.g. &amp; in query strings).
            $remoteUrl = html_entity_decode($match[1], ENT_QUOTES);

            return 'src="' . $imagePath . $this->getImg($remoteUrl) . '" ' . $this->imageStyle;
        }, $html);

        // Wrapper element for the article body.
        $html = '<div >' . $html . '</div>';

        // Text variant: same markup with every img tag removed.
        $text = preg_replace('/<img.*?>/s', '', $html);

        return [$html, $text];
    }

    /**
     * Read the article's basic information from the JavaScript variables
     * embedded in the page source.
     *
     * 'author' and 'copyright_stat' are always returned as empty strings:
     * the markup-based extraction for them stopped working (noted
     * 2020-04-03) and was disabled.
     *
     * @param  string $content Full page source.
     * @return array  Keys: author, copyright_stat, date, title, digest,
     *                content_url, cover, wechatname. Missing values are ''.
     */
    private function articleBasicInfo($content)
    {
        // JavaScript variable name => key in the returned array.
        $fieldMap = [
            'ct'          => 'date',        // publish time
            'msg_title'   => 'title',       // title
            'msg_desc'    => 'digest',      // description
            'msg_link'    => 'content_url', // article link
            'msg_cdn_url' => 'cover',       // cover image link
            'nickname'    => 'wechatname',  // account name
        ];

        $basicInfo = [
            'author'         => '',
            'copyright_stat' => '',
        ];

        foreach ($fieldMap as $jsVar => $key) {
            $isTitle = ($jsVar === 'msg_title');
            // The title is written as <expr>.html(false); the other
            // variables are plain double-quoted strings.
            $pattern = $isTitle
                ? '/ var ' . $jsVar . ' = (.*?)\.html\(false\);/s'
                : '/ var ' . $jsVar . ' = "(.*?)";/s';

            $value = '';
            if (preg_match($pattern, $content, $match) === 1) {
                $value = $match[1];
                if ($isTitle) {
                    // The captured expression includes its own quote marks.
                    $value = $this->stripJsQuotes($value);
                }
                $value = $this->htmlTransform($value);
            }
            $basicInfo[$key] = $value;
        }

        return $basicInfo;
    }

    /**
     * Remove one pair of matching surrounding quotes from a captured
     * JavaScript string literal; other input is returned trimmed.
     *
     * @param  string $expr
     * @return string
     */
    private function stripJsQuotes($expr)
    {
        $expr = trim($expr);
        if (preg_match('/^([\'"])(.*)\1$/s', $expr, $match) === 1) {
            return $match[2];
        }

        return $expr;
    }

    /**
     * Decode the HTML entities that appear in the embedded variables and
     * drop backslashes.
     *
     * The replacements run in the listed order: "&amp;" is decoded first and
     * any "amp;" left over from double-encoded input is then removed.
     *
     * @param  string $string
     * @return string
     */
    private function htmlTransform($string)
    {
        return str_replace(
            ['&quot;', '&amp;', 'amp;', '&lt;', '&gt;', '&nbsp;', '\\'],
            ['"',      '&',     '',     '<',    '>',    ' ',      ''],
            $string
        );
    }

    /**
     * Download one image and store it as a JPEG in the image directory.
     *
     * The file name is derived from the URL, so the same image always maps
     * to the same file and two images never collide. Every image is
     * re-encoded as JPEG, whatever its source format.
     *
     * @param  string $url Remote image URL.
     * @return string File name (without directory) of the stored image.
     * @throws \RuntimeException When the download, decoding or write fails.
     */
    private function getImg($url)
    {
        $context = stream_context_create([
            'http' => [
                'header' => 'Referer: ' . $this->imageReferer,
            ],
        ]);

        $bytes = file_get_contents($url, false, $context);
        if ($bytes === false) {
            throw new \RuntimeException('Failed to download image: ' . $url);
        }

        $image = imagecreatefromstring($bytes);
        if ($image === false) {
            throw new \RuntimeException('Unsupported or corrupt image: ' . $url);
        }

        // Re-check after mkdir so a concurrent creation is not an error.
        if (!is_dir($this->imagePath)
            && !mkdir($this->imagePath, 0777, true)
            && !is_dir($this->imagePath)
        ) {
            throw new \RuntimeException('Cannot create directory: ' . $this->imagePath);
        }

        $fileName = sha1($url) . '.jpg';
        if (!imagejpeg($image, $this->imagePath . $fileName)) {
            throw new \RuntimeException('Cannot write image: ' . $this->imagePath . $fileName);
        }

        return $fileName;
    }
}
// Usage example: crawl one article and print its HTML body.
// The URL needs a scheme and full host; without them file_get_contents()
// would treat the string as a local file path.
$url = 'https://mp.weixin.qq.com/s/4gwonJ3m0wd-kwTA3SmU-g';
$crawler = new WxCrawler();
$content = $crawler->crawByUrl($url);
echo $content['content_html'];
通过改变$url,即可改变要爬取的文章链接。