店播爬取Python脚本

dy_user_info.py 2.6KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. import time
  2. import threading
  3. import json
  4. from rds_model.rds_user_live_request_list import RdsUserLiveRequestList
  5. from libs.dy_user_live_info import DyUserLiveInfo
  6. from log.print_log import PrintLog
  7. def scrape():
  8. rds_user_info_list = RdsUserLiveRequestList()
  9. while True:
  10. try:
  11. uid = rds.get_request_param()
  12. if uid is None:
  13. time.sleep(2)
  14. continue
  15. uid = str(uid)
  16. # 若获取到的店播用户ID非纯数字,跳出循环
  17. if not uid.isdigit():
  18. continue
  19. PrintLog.print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ' + uid + ' 开始抓取用户信息')
  20. res = DyUserLiveInfo.get_data(uid=uid)
  21. if (res == '') or (res is None):
  22. rds_user_info_list.record_score(0)
  23. rds_user_info_list.push_request_param(uid)
  24. PrintLog.print(
  25. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '数据获取失败!响应数据为空!' + '\n'
  26. + uid + '\n'
  27. )
  28. continue
  29. response_json = json.loads(res)
  30. if response_json.get('data').get('user') is None:
  31. rds_user_info_list.record_score(0)
  32. rds_user_info_list.push_request_param(uid)
  33. PrintLog.print(
  34. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '数据获取失败!达人数据为空!' + '\n'
  35. + uid + '\n'
  36. + res
  37. )
  38. continue
  39. data = json.dumps({
  40. "data": response_json.get('data'),
  41. "extra": {
  42. 'uid': uid
  43. }
  44. })
  45. print('抓取成功')
  46. rds_user_info_list.record_score(1)
  47. rds_user_info_list.push_data_list(data)
  48. except Exception as e:
  49. print('抓取失败')
  50. rds_user_info_list.record_score(0)
  51. rds_user_info_list.push_request_param(uid)
  52. PrintLog.print(time.strftime("%H:%M:%S", time.localtime()) + ' ' + uid + '数据异常:' + str(e))
  53. time.sleep(0.1)
  54. if __name__ == "__main__":
  55. print("主方法开始执行")
  56. rds = RdsUserLiveRequestList()
  57. print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ' + ' 开始执行,待爬取弹幕直播队列长度:' + str(rds.get_user_live_monitor_len()))
  58. for i in range(1, 100):
  59. task = threading.Thread(target=scrape, name=i)
  60. task.start() # 准备就绪,等待cpu执行