猎户素材采集系统

WxCrawler.php 9.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. <?php
  2. namespace App\Service;
  3. use App\Log;
  4. use QL\QueryList;
  5. class WxCrawler
  6. {
  7. //微信内容div正则
  8. private $wxContentDiv = '/<div class="rich_media_content(\s*)(js_underline_content){0,}(\s*)(autoTypeSetting24psection){0,}(\s*)"
  9. id="js_content" style="visibility: hidden;">(.*?)<\/div>/s';
  10. //微信图片样式
  11. private $imageStyle = 'style="height: auto !important;visibility: visible !important;"';
  12. /**
  13. * 爬取内容
  14. * @param $url
  15. * @return false|string
  16. */
  17. private function _get($url)
  18. {
  19. return file_get_contents($url);
  20. }
  21. public function crawByUrl($url)
  22. {
  23. $content = $this->_get($url);
  24. $basicInfo = $this->articleBasicInfo($content);
  25. $content_html = $this->contentHandle($content);
  26. return array_merge($basicInfo,['content' => $content_html]);
  27. }
  28. /**
  29. * 处理微信文章源码
  30. * @param $content
  31. * @return [带图html文本,无图html文本]
  32. */
  33. private function contentHandle($content)
  34. {
  35. $content_html_pattern = $this->wxContentDiv;
  36. preg_match_all($content_html_pattern, $content, $html_matches);
  37. if(empty(array_filter($html_matches))) {
  38. return [];
  39. }
  40. $content_html = $html_matches[0][0];
  41. //去除掉hidden隐藏
  42. $content_html = str_replace('style="visibility: hidden;"','',$content_html);
  43. //过滤掉iframe
  44. // $content_html = preg_replace('/<iframe(.*?)<\/iframe>/','',$content_html);
  45. // 过滤掉voice
  46. $content_html = preg_replace('/<mpvoice(.*?)<\/mpvoice>/','',$content_html);
  47. // 处理图片
  48. // Log::logInfo('图片处理开始'.date('Y-m-d H:i:s'), [], 'getDetailByLink');
  49. $content_html = preg_replace_callback('/(<img.*?src=[\'|\"])(.*?(?:[\'|\"]))(.*?[\/]?>)/', function($matches) {
  50. // return $matches[1] . $this->getImg($matches[2]).'" '.$this->imageStyle . $matches[3];
  51. return $matches[1] . urldecode(htmlspecialchars_decode($matches[2])) . " " . $this->imageStyle . ' src="' . urldecode(htmlspecialchars_decode($matches[2])) . ' ' .$matches[3];
  52. }, $content_html);
  53. // Log::logInfo('图片处理结束'.date('Y-m-d H:i:s'), [], 'getDetailByLink');
  54. $content_html = preg_replace_callback('/background-image: url\((.*?)\)/', function($matches) {
  55. return 'background-image: url(' . $this->getImg($matches[1]).') ';
  56. // return 'background-image: url(' . urldecode(htmlspecialchars_decode(($matches[1]))).') ';
  57. }, $content_html);
  58. $content_html = preg_replace_callback('/background: url\((.*?)\)/', function($matches) {
  59. return 'background: url(' . $this->getImg($matches[1]).') ';
  60. // return 'background-image: url(' . urldecode(htmlspecialchars_decode(($matches[1]))).') ';
  61. }, $content_html);
  62. // 处理视频封面
  63. $content_html = preg_replace_callback('/data-cover="(.*?)"/', function($matches) {
  64. // return 'data-cover="' . $this->getImg($matches[1]).'" '.$this->imageStyle;
  65. return 'data-cover="' . urldecode(htmlspecialchars_decode(($matches[1]))). '" ' .$this->imageStyle;
  66. }, $content_html);
  67. // 处理音频
  68. // $content_html = preg_replace_callback('/voice_encode_fileid="(.*?)"/', function($matches) {
  69. // return 'src="' . config('link.voice_media').$matches[1] .'" ';
  70. // }, $content_html);
  71. // $content_html = preg_replace_callback('/<mpvoice .*?(\t|\r\n|\n|\s)*<\/mpvoice>/is', function($matches) {
  72. // $mpVoice = urldecode($matches[0]);
  73. // preg_match('/voice_encode_fileid="(.*?)"/', $mpVoice, $voiceSrc);
  74. // preg_match('/name="(.*?)"/', $mpVoice, $voiceTitle);
  75. // $title = isset($voiceTitle[1]) ? $voiceTitle[1] : '-';
  76. // preg_match('/src="(.*?)"/', $mpVoice, $voiceLength);
  77. // $voiceLength = htmlspecialchars_decode($voiceLength[1]);
  78. // $info = parse_url($voiceLength);
  79. // parse_str($info['query'], $outPut);
  80. // $playLength = isset($outPut['play_length']) ? $outPut['play_length'] : '00:00';
  81. // $voiceHtmlNew = '<div class="audio" style="background: #f7f7f7;position: relative;border-radius: 8px;padding: 20px;line-height: 1.4;">
  82. // <img class="icon" style="width: 30px;height: 30px;display: inline-block;vertical-align: middle;" src="./voice.svg" alt="">
  83. // <div class="info" style="width: calc(100% - 40px);display: inline-block;vertical-align: middle;padding-left: 10px;box-sizing: border-box;" voice_encode_fileid="'. $voiceSrc[1] .'">
  84. // <h3 style="margin: 0;padding: 0;display: block;font-weight: 700;font-size: 17px;color: rgba(0,0,0,0.9);overflow: hidden;text-overflow: ellipsis;display: -webkit-box;-webkit-box-orient: vertical;-webkit-line-clamp: 2;word-wrap: break-word;-webkit-hyphens: auto;">'. $playLength .'</h3>
  85. // <span class="desc" style="display: block;font-size: 12px;color: rgba(0,0,0,0.5);width: auto;overflow: hidden;text-overflow: ellipsis;white-space: nowrap;word-wrap: normal;padding: 4px 72px 4px 0;">' .$title. '</span>
  86. // </div>
  87. // </div>';
  88. // return $voiceHtmlNew;
  89. // }, $content_html);
  90. //添加微信样式
  91. // var_dump(htmlspecialchars_decode($content_html));die;
  92. $content_html = '<div class="rich_media_content" style="max-width: 677px;margin-left: auto;margin-right: auto;">'.$content_html. '</div>';
  93. return $content_html;
  94. }
  95. /**
  96. * 获取文章的基本信息
  97. * @param $content
  98. * @return $basicInfo
  99. */
  100. private function articleBasicInfo($content)
  101. {
  102. $item = [
  103. 'title' => 'title', //标题
  104. 'description' => 'digest', //描述
  105. 'msg_source_url' => 'content_url', //文章链接
  106. 'image' => 'cover', //封面图片链接
  107. ];
  108. $basicInfo = [];
  109. foreach ($item as $k => $v) {
  110. if($k == 'msg_source_url')
  111. $pattern = '/var '.$k.' = \'(.*?)\';/s';
  112. else
  113. $pattern = '<meta property="og:'.$k.'" content="(.*?)" />';
  114. preg_match_all($pattern,$content,$matches);
  115. if(array_key_exists(1, $matches) && !empty($matches[1][0])){
  116. $basicInfo[$v] = $this->htmlTransform($matches[1][0]);
  117. }else{
  118. $basicInfo[$v] = '';
  119. }
  120. }
  121. return $basicInfo;
  122. }
  123. /**
  124. * 特殊字符转换
  125. * @param $string
  126. * @return $string
  127. */
  128. private function htmlTransform($string)
  129. {
  130. $string = str_replace('&quot;','"',$string);
  131. $string = str_replace('&amp;','&',$string);
  132. $string = str_replace('amp;','',$string);
  133. $string = str_replace('&lt;','<',$string);
  134. $string = str_replace('&gt;','>',$string);
  135. $string = str_replace('&nbsp;',' ',$string);
  136. $string = str_replace("\\", '',$string);
  137. $string = str_replace("x26", '&',$string);
  138. return $string;
  139. }
  140. /**
  141. * @param $url
  142. * @return string
  143. */
  144. private function getImg($url){
  145. $url = htmlspecialchars_decode($url);
  146. $url = trim($url, "'");
  147. $url = trim($url, '"');
  148. $refer = "http://www.qq.com/";
  149. $opt = [
  150. 'http'=>[
  151. 'header'=>"Referer: " . $refer
  152. ]
  153. ];
  154. if(strstr($url, 'mpvideo')) return $url;
  155. $context = stream_context_create($opt);
  156. //接受数据流
  157. $ossUrl = '';
  158. $file_contents = file_get_contents(urldecode($url),false, $context);
  159. $path = '/mnt/queryList/public/upload/';
  160. $fileType = self::getImgType($url);
  161. $fileName = time().rand(0,99999) . '.'.$fileType;
  162. $filePath = $path . $fileName;
  163. if(strstr($url, 'wx_fmt=svg') !== false) {
  164. $im = new \Imagick($url);
  165. $svg = '<?xml version="1.0" encoding="UTF-8" standalone="no"?>'.$file_contents;
  166. $im->readImageBlob($svg);
  167. $im->setImageFormat($fileType);
  168. $srcImage = $im->getImageGeometry(); //获取源图片宽和高
  169. $im->resizeImage($srcImage['width'], $srcImage['height'], \imagick::FILTER_LANCZOS, 1, false);
  170. $im->writeImage($filePath);
  171. $im->clear();
  172. $im->destroy();
  173. } else {
  174. if(strstr($url, 'wx_fmt=gif') === false) {
  175. $imageSteam = Imagecreatefromstring($file_contents);
  176. if(!file_exists($path))
  177. mkdir($path,0777,true);
  178. //生成新图片
  179. imagejpeg($imageSteam, $filePath);
  180. } else {
  181. $im = new \Imagick($url);
  182. $im->writeImages($filePath, true);
  183. $im->clear();
  184. $im->destroy();
  185. }
  186. }
  187. if(file_exists($filePath)) {
  188. $ossClient = new OssServices('weixiaomeng');
  189. // 上传阿里云
  190. $ossFile = $ossClient->upload($fileType, $filePath, 'upload/weixiaomeng/' . date("Y-m-d", time()));
  191. unlink($filePath);
  192. $ossUrl = $ossFile['oss-request-url'];
  193. }
  194. return empty($ossUrl) ? $url : $ossUrl;
  195. }
  196. /*
  197. * 获取图片类型
  198. * */
  199. public static function getImgType($url)
  200. {
  201. $info = getimagesize($url);
  202. $suffix = false;
  203. if($mime = $info['mime']){
  204. $suffix = explode('/',$mime)[1];
  205. }
  206. return $suffix;
  207. }
  208. }