123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229 |
- <?php
- namespace App\Service;
- use App\Log;
- use QL\QueryList;
/**
 * Scrapes a WeChat public-account article page.
 *
 * crawByUrl() downloads the page, extracts basic metadata (title, digest, source link,
 * cover) and returns the article body with its images, backgrounds and video covers
 * rewritten. CSS background images are downloaded and re-hosted through OssServices.
 */
class WxCrawler
{
    /**
     * Regex capturing the inner HTML of the article body container.
     *
     * NOTE(review): the lazy capture ends at the first closing div tag, so a body that
     * contains nested div elements is cut off at the first inner one.
     */
    private $wxContentDiv = '/<div class="rich_media_content.*?">(.*?)<\/div>/s';

    /** Inline style attribute added to rewritten img tags and video covers. */
    private $imageStyle = 'style="height: auto !important;visibility: visible !important;"';

    /**
     * Read the raw contents of $url.
     *
     * NOTE(review): $url is passed to file_get_contents() unvalidated, so local paths and
     * every registered stream wrapper are accepted. Restrict the scheme to http/https if
     * the URL can come from an untrusted caller.
     *
     * @param string $url
     * @return false|string page source, or false when the read fails
     */
    private function _get($url)
    {
        return file_get_contents($url);
    }

    /**
     * Crawl one article.
     *
     * @param string $url article URL
     * @return array keys "title", "digest", "content_url", "cover" (strings, '' when absent)
     *               and "content" (rewritten body HTML, or [] when no body was found)
     */
    public function crawByUrl($url)
    {
        $content = $this->_get($url);
        if ($content === false) {
            // A failed download yields the same "nothing found" result as an empty page.
            $content = '';
        }

        $basicInfo = $this->articleBasicInfo($content);
        $contentHtml = $this->contentHandle($content);

        return array_merge($basicInfo, ['content' => $contentHtml]);
    }

    /**
     * Extract and rewrite the article body from the page source.
     *
     * @param string $content full page source
     * @return string|array rewritten body wrapped in a rich_media_content div, or an empty
     *                      array when the body container is not found (kept for callers
     *                      that already rely on that value)
     */
    private function contentHandle($content)
    {
        if (preg_match($this->wxContentDiv, (string) $content, $bodyMatch) !== 1) {
            return [];
        }
        $html = $bodyMatch[1];

        // Remove the inline style that hides elements.
        $html = str_replace('style="visibility: hidden;"', '', $html);

        // Strip voice widgets; the "s" flag lets an element that spans lines match too.
        $html = $this->regexReplace('/<mpvoice(.*?)<\/mpvoice>/is', '', $html);

        $style = $this->imageStyle;

        // Images: keep the first *src attribute (usually data-src) with its URL decoded,
        // then add the forced style and a real src. The opening quote is captured and
        // reused so single-quoted attributes stay well formed.
        $html = $this->regexReplace(
            '/(<img.*?src=)([\'"])(.*?)\2(.*?\/?>)/',
            function ($m) use ($style) {
                $quoted = $m[2] . urldecode(htmlspecialchars_decode($m[3])) . $m[2];

                return $m[1] . $quoted . ' ' . $style . ' src=' . $quoted . ' ' . $m[4];
            },
            $html
        );

        // CSS backgrounds are downloaded and re-hosted through getImg().
        foreach (['background-image', 'background'] as $property) {
            $html = $this->regexReplace(
                '/' . $property . ': url\((.*?)\)/',
                function ($m) use ($property) {
                    return $property . ': url(' . $this->getImg($m[1]) . ') ';
                },
                $html
            );
        }

        // Video covers: decode the URL in place and add the forced style.
        $html = $this->regexReplace(
            '/data-cover="(.*?)"/',
            function ($m) use ($style) {
                return 'data-cover="' . urldecode(htmlspecialchars_decode($m[1])) . '" ' . $style;
            },
            $html
        );

        // Re-wrap the body in a container carrying the article layout style.
        return '<div class="rich_media_content" style="max-width: 677px;margin-left: auto;margin-right: auto;">'
            . $html . '</div>';
    }

    /**
     * Run preg_replace() / preg_replace_callback() and keep the subject when PCRE fails.
     *
     * Both functions return null on an engine error (for example a backtrack-limit hit);
     * returning the untouched subject avoids losing the whole article in that case.
     *
     * @param string          $pattern
     * @param string|\Closure $replacement replacement string, or a callback receiving the match
     * @param string          $subject
     * @return string
     */
    private function regexReplace($pattern, $replacement, $subject)
    {
        if ($replacement instanceof \Closure) {
            $result = preg_replace_callback($pattern, $replacement, $subject);
        } else {
            $result = preg_replace($pattern, $replacement, $subject);
        }

        return $result === null ? $subject : $result;
    }

    /**
     * Extract basic article information from the page source.
     *
     * Title, description and image are read from the og: meta tags; the source link is
     * read from the inline "var msg_source_url = '...';" script assignment.
     *
     * @param string $content full page source
     * @return array keys "title", "digest", "content_url", "cover"; '' when a value is missing
     */
    private function articleBasicInfo($content)
    {
        // Page field name => key used in the result.
        $fields = [
            'title' => 'title',
            'description' => 'digest',
            'msg_source_url' => 'content_url',
            'image' => 'cover',
        ];

        $basicInfo = [];
        foreach ($fields as $source => $target) {
            if ($source === 'msg_source_url') {
                $pattern = '/var ' . $source . ' = \'(.*?)\';/s';
            } else {
                // Explicit delimiters: the bare "<meta ... />" form only worked because
                // PCRE treats a leading "<" and trailing ">" as a delimiter pair.
                $pattern = '/<meta property="og:' . $source . '" content="(.*?)" \/>/';
            }

            $found = preg_match($pattern, (string) $content, $m) === 1 && $m[1] !== '';
            $basicInfo[$target] = $found ? $this->htmlTransform($m[1]) : '';
        }

        return $basicInfo;
    }

    /**
     * Decode the HTML entities and script escapes found in extracted metadata.
     *
     * Replacements run strictly in the listed order. "&" appears in the inline script as
     * "\x26amp;", so the leftover "amp;" is removed and the escaped "\x26" becomes "&".
     * Only the escaped form is replaced, so text that merely contains "x26" is untouched.
     *
     * @param string $string
     * @return string
     */
    private function htmlTransform($string)
    {
        $search = ['&quot;', '&amp;', 'amp;', '&lt;', '&gt;', '&nbsp;', "\xC2\xA0", '\\x26', '\\'];
        $replace = ['"', '&', '', '<', '>', ' ', ' ', '&', ''];

        return str_replace($search, $replace, $string);
    }

    /**
     * Download an image, convert it locally and re-host it through OssServices.
     *
     * Any failure (download, decode, conversion, missing extension) falls back to the
     * cleaned-up original URL. URLs containing "mpvideo" are returned without a download.
     *
     * NOTE(review): non-GIF raster images are always re-encoded as JPEG while the file
     * extension follows the detected type; confirm whether that is intended.
     * NOTE(review): SVG from the crawled page is handed to ImageMagick; make sure the
     * server's ImageMagick policy is hardened for untrusted input.
     *
     * @param string $url possibly HTML-escaped and quoted image URL
     * @return string re-hosted URL, or the cleaned-up original URL on failure
     */
    private function getImg($url)
    {
        $url = htmlspecialchars_decode($url);
        $url = trim($url, "'");
        $url = trim($url, '"');

        if (strpos($url, 'mpvideo') !== false) {
            return $url;
        }

        // The image is requested with a fixed qq.com Referer header.
        $context = stream_context_create([
            'http' => [
                'header' => 'Referer: http://www.qq.com/',
            ],
        ]);
        $bytes = file_get_contents(urldecode($url), false, $context);
        if ($bytes === false || $bytes === '') {
            return $url;
        }

        $dir = '/mnt/queryList/public/upload/';
        if (!file_exists($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            return $url;
        }

        $isSvg = strpos($url, 'wx_fmt=svg') !== false;
        $isGif = strpos($url, 'wx_fmt=gif') !== false;

        if ($isSvg) {
            // getimagesize() reports no usable type for SVG, so rasterise to PNG.
            $fileType = 'png';
        } else {
            // Detect the type from the bytes already downloaded instead of fetching again.
            $fileType = self::suffixFromImageInfo(getimagesizefromstring($bytes));
            if ($fileType === false) {
                $fileType = $isGif ? 'gif' : 'jpeg';
            }
        }

        // uniqid() with extra entropy avoids the collisions time() . rand() allowed.
        $filePath = $dir . str_replace('.', '', uniqid('', true)) . '.' . $fileType;

        try {
            if ($isSvg || $isGif) {
                if (!class_exists('Imagick')) {
                    return $url;
                }
                $im = new \Imagick();
                if ($isSvg) {
                    // Prepend an XML declaration only when the document lacks one.
                    $svg = strpos(ltrim($bytes), '<?xml') === 0
                        ? $bytes
                        : '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . $bytes;
                    $im->readImageBlob($svg);
                    $im->setImageFormat($fileType);
                    $geometry = $im->getImageGeometry();
                    $im->resizeImage(
                        $geometry['width'],
                        $geometry['height'],
                        \Imagick::FILTER_LANCZOS,
                        1,
                        false
                    );
                    $im->writeImage($filePath);
                } else {
                    // writeImages() with adjoin keeps all frames in one file.
                    $im->readImageBlob($bytes);
                    $im->writeImages($filePath, true);
                }
                $im->clear();
            } else {
                if (!function_exists('imagecreatefromstring')) {
                    return $url;
                }
                $image = imagecreatefromstring($bytes);
                if ($image === false) {
                    return $url;
                }
                imagejpeg($image, $filePath);
                imagedestroy($image);
            }
        } catch (\Exception $e) {
            if (file_exists($filePath)) {
                unlink($filePath);
            }

            return $url;
        }

        if (!file_exists($filePath)) {
            return $url;
        }

        try {
            // Upload to Aliyun OSS.
            $ossClient = new OssServices('weixiaomeng');
            $ossFile = $ossClient->upload($fileType, $filePath, 'upload/weixiaomeng/' . date('Y-m-d'));
        } finally {
            // Remove the temp file even when the upload throws.
            if (file_exists($filePath)) {
                unlink($filePath);
            }
        }

        $ossUrl = isset($ossFile['oss-request-url']) ? $ossFile['oss-request-url'] : '';

        return empty($ossUrl) ? $url : $ossUrl;
    }

    /**
     * Detect an image's type from its MIME type.
     *
     * @param string $url path or URL readable by getimagesize()
     * @return string|false MIME subtype (e.g. "jpeg", "png", "gif"), or false when the
     *                      image cannot be identified
     */
    public static function getImgType($url)
    {
        return self::suffixFromImageInfo(getimagesize($url));
    }

    /**
     * Turn a getimagesize()-style result into a MIME subtype.
     *
     * @param array|false $info
     * @return string|false
     */
    private static function suffixFromImageInfo($info)
    {
        if (!is_array($info) || empty($info['mime'])) {
            return false;
        }
        $parts = explode('/', $info['mime']);

        return isset($parts[1]) && $parts[1] !== '' ? $parts[1] : false;
    }
}
|