猎户素材采集系统

WxCrawler.php 9.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. <?php
  2. namespace App\Service;
  3. use App\Log;
  4. use QL\QueryList;
  /**
   * Crawls a WeChat article page and returns its metadata (title, digest,
   * source link, cover) together with a cleaned-up copy of the article body HTML.
   */
  class WxCrawler
  {
      // Regex capturing the inner HTML of the WeChat article body container.
      // NOTE(review): the lazy capture ends at the first closing div tag, so a
      // div nested inside the article body truncates the captured HTML.
      private $wxContentDiv = '/<div class="rich_media_content.*?">(.*?)<\/div>/s';

      // Inline style appended to images and video covers so they are visible.
      private $imageStyle = 'style="height: auto !important;visibility: visible !important;"';

      /**
       * Download the raw page source.
       *
       * @param string $url
       * @return false|string page source, or false when the download fails
       */
      private function _get($url)
      {
          return file_get_contents($url);
      }

      /**
       * Crawl one article.
       *
       * @param string $url article URL
       * @return array keys: title, digest, content_url, cover, content
       */
      public function crawByUrl($url)
      {
          $content = $this->_get($url);
          if ($content === false) {
              // A failed download is handled as an empty page: every metadata
              // field comes back empty and 'content' is an empty array.
              $content = '';
          }

          $basicInfo = $this->articleBasicInfo($content);
          $basicInfo['content'] = $this->contentHandle($content);

          return $basicInfo;
      }

      /**
       * Extract the article body from the page source and normalise it:
       * un-hide hidden nodes, drop voice widgets, give every image a usable
       * src attribute, re-host CSS background images and wrap the result in
       * a styled container.
       *
       * @param string $content full page source
       * @return string|array processed HTML, or an empty array when the page
       *                      has no article body container
       */
      private function contentHandle($content)
      {
          if (preg_match($this->wxContentDiv, $content, $bodyMatch) !== 1) {
              return [];
          }
          $content_html = $bodyMatch[1];

          // Remove the "hidden" marker WeChat puts on lazily rendered nodes.
          $content_html = str_replace('style="visibility: hidden;"', '', $content_html);

          // Drop voice widgets; the "s" flag lets a tag that spans several lines match too.
          $withoutVoice = preg_replace('/<mpvoice(.*?)<\/mpvoice>/is', '', $content_html);
          if ($withoutVoice !== null) {
              $content_html = $withoutVoice;
          }

          // Images: the pattern also matches the "src=" inside "data-src=", so the
          // decoded URL is kept in the matched attribute and repeated in a real
          // src attribute. $matches[2] still carries its closing quote, which
          // terminates both attributes.
          $imageStyle = $this->imageStyle;
          $content_html = $this->replaceWithCallback(
              '/(<img\b[^>]*?src=[\'"])(.*?[\'"])(.*?>)/s',
              function ($matches) use ($imageStyle) {
                  $src = urldecode(htmlspecialchars_decode($matches[2]));
                  return $matches[1] . $src . ' ' . $imageStyle . ' src="' . $src . ' ' . $matches[3];
              },
              $content_html
          );

          // CSS backgrounds ("background-image: url(...)" and "background: url(...)")
          // are downloaded and re-hosted through getImg().
          $crawler = $this;
          $content_html = $this->replaceWithCallback(
              '/(background(?:-image)?): url\((.*?)\)/',
              function ($matches) use ($crawler) {
                  return $matches[1] . ': url(' . $crawler->rehostImage($matches[2]) . ') ';
              },
              $content_html
          );

          // Video covers: decode the URL and append the visibility style.
          $content_html = $this->replaceWithCallback(
              '/data-cover="(.*?)"/',
              function ($matches) use ($imageStyle) {
                  return 'data-cover="' . urldecode(htmlspecialchars_decode($matches[1])) . '" ' . $imageStyle;
              },
              $content_html
          );

          // Wrap the body in a container with the WeChat layout style.
          return '<div class="rich_media_content" style="max-width: 677px;margin-left: auto;margin-right: auto;">'
              . $content_html
              . '</div>';
      }

      /**
       * preg_replace_callback() that keeps the subject unchanged when PCRE fails
       * (for example on a backtrack-limit error) instead of propagating null and
       * silently wiping the article body.
       *
       * @param string   $pattern
       * @param callable $callback
       * @param string   $subject
       * @return string
       */
      private function replaceWithCallback($pattern, $callback, $subject)
      {
          $result = preg_replace_callback($pattern, $callback, $subject);

          return $result === null ? $subject : $result;
      }

      /**
       * Public bridge so closures can reach getImg() without relying on $this binding.
       *
       * @internal used by contentHandle() only
       * @param string $url
       * @return string
       */
      public function rehostImage($url)
      {
          return $this->getImg($url);
      }

      /**
       * Read the basic article information from the page source.
       *
       * title, description and image come from the og:* meta tags; the source
       * link comes from the "var msg_source_url = '...'" script assignment.
       *
       * @param string $content full page source
       * @return array keys: title, digest, content_url, cover ('' when not found)
       */
      private function articleBasicInfo($content)
      {
          // name in the page source => key in the result
          $fields = [
              'title' => 'title',
              'description' => 'digest',
              'msg_source_url' => 'content_url',
              'image' => 'cover',
          ];

          $basicInfo = [];
          foreach ($fields as $sourceName => $resultKey) {
              if ($sourceName === 'msg_source_url') {
                  $pattern = '/var ' . $sourceName . ' = \'(.*?)\';/s';
              } else {
                  // Explicit "/" delimiters: without them PCRE treated the tag's own
                  // angle brackets as the delimiter pair.
                  $pattern = '/<meta property="og:' . $sourceName . '" content="(.*?)" \/>/';
              }

              $value = '';
              if (preg_match($pattern, $content, $matches) === 1 && $matches[1] !== '') {
                  $value = $this->htmlTransform($matches[1]);
              }
              $basicInfo[$resultKey] = $value;
          }

          return $basicInfo;
      }

      /**
       * Decode the HTML entities and JavaScript escapes found in metadata values.
       *
       * @param string $string
       * @return string
       */
      private function htmlTransform($string)
      {
          $string = str_replace('&quot;', '"', $string);
          $string = str_replace('&amp;', '&', $string);
          // Left-over "amp;" from double-encoded ampersands (&amp;amp; or \x26amp;).
          $string = str_replace('amp;', '', $string);
          $string = str_replace('&lt;', '<', $string);
          $string = str_replace('&gt;', '>', $string);
          $string = str_replace('&nbsp;', ' ', $string);
          // Decode the JavaScript hex escape for "&" while its backslash is still
          // present, so a plain "x26" inside ordinary text or a URL stays intact.
          $string = str_replace('\x26', '&', $string);
          $string = str_replace('\\', '', $string);

          return $string;
      }

      /**
       * Download an image, convert it to a local temporary file, upload that
       * file to OSS and return the hosted URL.
       *
       * Falls back to the (decoded, unquoted) input URL whenever the image
       * cannot be downloaded, recognised, converted or uploaded. URLs
       * containing "mpvideo" are returned untouched.
       *
       * @param string $url image URL, possibly HTML-escaped and wrapped in quotes
       * @return string
       */
      private function getImg($url)
      {
          $url = htmlspecialchars_decode($url);
          $url = trim($url, "'");
          $url = trim($url, '"');

          if ($url === '' || strpos($url, 'mpvideo') !== false) {
              return $url;
          }

          // The image host is requested with a qq.com Referer header.
          $context = stream_context_create([
              'http' => [
                  'header' => 'Referer: http://www.qq.com/',
              ],
          ]);
          $fileContents = file_get_contents(urldecode($url), false, $context);
          if ($fileContents === false || $fileContents === '') {
              return $url;
          }

          $isSvg = strpos($url, 'wx_fmt=svg') !== false;
          $isGif = strpos($url, 'wx_fmt=gif') !== false;

          // Detect the type from the bytes already downloaded instead of fetching
          // the URL again. getimagesize does not recognise SVG, so SVG sources are
          // rasterised to PNG.
          $fileType = $isSvg ? 'png' : self::suffixFromImageInfo(getimagesizefromstring($fileContents));
          if ($fileType === false) {
              return $url;
          }

          // Every branch below writes into this directory, so create it up front.
          $path = '/mnt/queryList/public/upload/';
          if (!file_exists($path)) {
              mkdir($path, 0777, true);
          }
          $filePath = $path . time() . rand(0, 99999) . '.' . $fileType;

          if ($isSvg) {
              $svg = $fileContents;
              // Only add an XML declaration when the document does not have one.
              if (strncmp(ltrim($svg), '<?xml', 5) !== 0) {
                  $svg = '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . $svg;
              }
              $im = new \Imagick();
              $im->readImageBlob($svg);
              $im->setImageFormat($fileType);
              $geometry = $im->getImageGeometry(); // width and height of the source image
              $im->resizeImage($geometry['width'], $geometry['height'], \Imagick::FILTER_LANCZOS, 1, false);
              $im->writeImage($filePath);
              $im->clear();
              $im->destroy();
          } elseif ($isGif) {
              // writeImages(..., true) writes all frames of the GIF into one file.
              $im = new \Imagick();
              $im->readImageBlob($fileContents);
              $im->writeImages($filePath, true);
              $im->clear();
              $im->destroy();
          } else {
              $image = imagecreatefromstring($fileContents);
              if ($image === false) {
                  return $url;
              }
              // NOTE(review): the file is always encoded as JPEG, whatever
              // extension was detected above - confirm this is intended.
              imagejpeg($image, $filePath);
          }

          if (!file_exists($filePath)) {
              return $url;
          }

          // Upload to Aliyun OSS; the temporary file is removed even if the upload throws.
          try {
              $ossClient = new OssServices('weixiaomeng');
              $ossFile = $ossClient->upload($fileType, $filePath, 'upload/weixiaomeng/' . date('Y-m-d'));
          } catch (\Exception $e) {
              unlink($filePath);
              throw $e;
          }
          unlink($filePath);

          $ossUrl = isset($ossFile['oss-request-url']) ? $ossFile['oss-request-url'] : '';

          return empty($ossUrl) ? $url : $ossUrl;
      }

      /**
       * Get the image type of a file or URL.
       *
       * @param string $url path or URL readable by getimagesize()
       * @return string|false sub-type of the MIME type (e.g. "jpeg", "png", "gif"),
       *                      or false when the image cannot be read or recognised
       */
      public static function getImgType($url)
      {
          return self::suffixFromImageInfo(getimagesize($url));
      }

      /**
       * Turn a getimagesize()-style result into the MIME sub-type.
       *
       * @param array|false $info
       * @return string|false
       */
      private static function suffixFromImageInfo($info)
      {
          if (!is_array($info) || empty($info['mime'])) {
              return false;
          }
          $parts = explode('/', $info['mime']);

          return isset($parts[1]) ? $parts[1] : false;
      }
  }