猎户素材采集系统

ArticleService.php 1.6KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. <?php
  2. namespace App\Service;
  3. use QL\QueryList;
  4. use App\Log;
  /**
   * Fetches article details (title, content, ...) for a given link.
   *
   * Links hosted on mp.weixin.qq.com are delegated to WxCrawler; every other
   * host is scraped generically with QueryList.
   */
  class ArticleService
  {
      /**
       * Fetch the article behind a link and write the result into $data.
       *
       * On success $data always contains the keys title, digest, content_url,
       * cover and content; keys the crawler does not supply keep their
       * defaults (empty strings, and the link itself for content_url).
       *
       * @param string $link Absolute http(s) URL of the article.
       * @param mixed  $data Output parameter; only written on success.
       *
       * @return int 0 on success, 1001 when the link is not a usable http(s) URL.
       */
      public static function getDetailByLink($link, &$data)
      {
          $linkInfo = is_string($link) ? parse_url($link) : false;
          if (!is_array($linkInfo)) {
              return 1001;
          }

          // parse_url() keeps the original letter case, so normalise before comparing.
          $host = isset($linkInfo['host']) ? strtolower($linkInfo['host']) : '';
          $scheme = isset($linkInfo['scheme']) ? strtolower($linkInfo['scheme']) : '';

          // Only http(s) may be fetched: the link ends up in file_get_contents(),
          // which would otherwise happily open file://, php:// and similar wrappers.
          if (empty($host) || !in_array($scheme, ['http', 'https'], true)) {
              return 1001;
          }

          $basicInfo = [
              'title' => '',
              'digest' => '',
              'content_url' => $link,
              'cover' => '',
              'content' => '',
          ];

          switch ($host) {
              case 'mp.weixin.qq.com':
                  $detail = self::weChatLink($link);
                  break;
              default:
                  $detail = self::newsDetail($link);
                  break;
          }

          // NOTE(review): WxCrawler is not visible here; guard in case it reports
          // failure with a non-array value, which array_merge() would reject.
          if (!is_array($detail)) {
              $detail = [];
          }

          $data = array_merge($basicInfo, $detail);

          return 0;
      }

      /**
       * Crawl a WeChat official-account article through WxCrawler.
       *
       * @param string $link Article URL.
       *
       * @return mixed Whatever WxCrawler::crawByUrl() returns (presumably an
       *               array of article fields - confirm against WxCrawler).
       */
      public static function weChatLink($link)
      {
          $crawler = new WxCrawler();

          return $crawler->crawByUrl($link);
      }

      /**
       * Generic news-page scraper.
       *
       * The content is the inner HTML of the parent of the first <p> element
       * found in the page; the title is the text of the <title> element.
       *
       * NOTE: the page is parsed as downloaded; no charset conversion
       * (e.g. GBK to UTF-8) is applied.
       *
       * @param string $link Page URL.
       *
       * @return array Array with the keys 'content' and 'title'; both are
       *               empty strings when the page could not be downloaded.
       */
      public static function newsDetail($link)
      {
          $html = file_get_contents($link);
          if ($html === false) {
              // Download failed: do not hand `false` to the HTML parser.
              return [
                  'content' => '',
                  'title' => '',
              ];
          }

          $ql = QueryList::html($html);

          // Inner HTML of the element that wraps the first paragraph.
          $content = $ql->find('p:first')->parent()->html();
          $title = $ql->find('title')->text();

          return [
              'content' => $content,
              'title' => $title,
          ];
      }
  }