Shop livestream scraping Python script

douyin_video_scraper_web.py 8.9KB

# -*- coding: utf-8 -*-
import json, time, sys, threading, warnings
import requests, execjs  # used directly below; may also be re-exported by web_dy
from urllib import parse
from urllib.parse import quote
from rds_model.db_redis import DbRedis
from log.print_log import PrintLog
from libs.proxy import Proxy
from web_dy import *
from rds_model.rds_user_video_list import RdsUserVideoList
from web_cookie import Cookie

start_time = time.time()

def set_score(flag):
    # Update the "total@@@success@@@fail" counters kept in Redis.
    rds = RdsUserVideoList()
    if flag == 'success':
        data_score = rds.get_score()
        if data_score is None:
            data_score = '1@@@1@@@0'
        else:
            data_score = data_score.split('@@@')
            total, success, fail = int(data_score[0]), int(data_score[1]), int(data_score[2])
            success = success + 1
            data_score = str(total) + '@@@' + str(success) + '@@@' + str(fail)
        rds.record_score(data_score)
    elif flag == 'fail':
        data_score = rds.get_score()
        if data_score is None:
            data_score = '1@@@0@@@1'
        else:
            data_score = data_score.split('@@@')
            total, success, fail = int(data_score[0]), int(data_score[1]), int(data_score[2])
            fail = fail + 1
            data_score = str(total) + '@@@' + str(success) + '@@@' + str(fail)
        rds.record_score(data_score)
    elif flag == 'all':
        data_score = rds.get_score()
        if data_score is None:
            data_score = '1@@@0@@@0'
        else:
            data_score = data_score.split('@@@')
            total, success, fail = int(data_score[0]), int(data_score[1]), int(data_score[2])
            total = total + 1
            data_score = str(total) + '@@@' + str(success) + '@@@' + str(fail)
        rds.record_score(data_score)
    return None

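# The score kept in Redis is a single "total@@@success@@@fail" string. A minimal sketch of
# reading it back (illustration only, assuming the same '@@@' convention used above):
#
#     raw = RdsUserVideoList().get_score() or '0@@@0@@@0'
#     total, success, fail = (int(x) for x in raw.split('@@@'))
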
def get_signature(url=None, method='_signature'):
    # Evaluate the local signature.js (via PyExecJS) to compute the request signature.
    with open('/mnt/shop_live_scraper/signature.js', 'r', encoding='utf-8') as f:
        b = f.read()
    c = execjs.compile(b)
    # url = url.replace('%28', '(').replace('%29', ')').replace('%2C', ',')
    d = c.call(method, url.replace('\n', ''))
    return d

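# get_signature() returns the anti-crawl "_signature" value for a given request URL.
# Illustrative call (hypothetical URL, not taken from the script):
#
#     sig = get_signature('https://www.douyin.com/aweme/v1/web/aweme/post/?aid=6383&count=20')
#     signed_url = url + '&_signature=' + quote(sig)
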
def get_user_videos(sec_user_id, max_cursor=0, count=20):
    # ua, ck = get_ua_ck('get_user_videos')
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    url = 'https://www.douyin.com/aweme/v1/web/aweme/post/?'
    param = {
        'device_platform': 'webapp',
        'aid': '6383',
        'channel': 'channel_pc_web',
        'sec_user_id': sec_user_id,
        'max_cursor': str(max_cursor),
        'count': str(count),
        'publish_video_strategy_type': '2',
        'version_code': '160100',
        'version_name': '16.1.0',
        'cookie_enabled': 'true',
        'screen_width': '1920',
        'screen_height': '1080',
        'browser_language': 'zh-CN',
        'browser_platform': 'Win32',
        'browser_name': 'Mozilla',
        'browser_version': ua.replace('Mozilla/', ''),
        'browser_online': 'true',
    }
    url = url + parse.urlencode(param)

    # Sign the request URL and append the signature as a query parameter.
    _signature = get_signature(url)
    url += '&_signature=' + quote(_signature)

    ck = Cookie.get()
    if ck is None:
        print('获取cookie失败')
        return None

    headers = {
        "authority": "www.douyin.com",
        "method": "GET",
        "path": str(url).replace('https://www.douyin.com', ''),
        "scheme": "https",
        "accept": "application/json, text/plain, */*",
        # "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": ck,
        "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
        "user-agent": ua,
        "withcredentials": "true",
    }

    retry = 0
    response_json = None
    while True:
        # Give up on this cookie after 20 failed attempts.
        if retry > 20:
            Cookie.del_cookie(ck)
            break
        retry += 1

        # Each attempt goes through a fresh proxy from the pool.
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8
            )
            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
                response_json = response.json()
                if response_json.get('aweme_list') is not None:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' 数据获取成功!'
                        + '\n'
                        + str(sec_user_id)
                    )
                    break
                else:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' 数据获取失败!'
                        + '\n'
                        + str(sec_user_id)
                        + '\n'
                        + response.text
                        + Proxy.proxy_info
                    )
            else:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' 爬取http连接失败!'
                    + str(response.status_code)
                    + '\n'
                    + Proxy.proxy_info
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                )
                time.sleep(1)
        except requests.exceptions.ProxyError as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 代理过期!'
                + str(e)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' ConnectTimeout!'
                + str(e)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except Exception as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 请求抛出异常!'
                + str(e)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
    return response_json

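# Illustrative use of get_user_videos() (hypothetical sec_user_id; the field names shown are
# typical of the Douyin web API response and may differ):
#
#     resp = get_user_videos('MS4wLjABAAAA_example_sec_uid', max_cursor=0, count=20)
#     if resp and resp.get('aweme_list'):
#         for aweme in resp['aweme_list']:
#             print(aweme.get('aweme_id'), aweme.get('desc'))
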
def scrape():
    # Pop one sec_user_id from the Redis request queue and fetch its video list.
    rds = RdsUserVideoList()
    user_info = rds.get_request_param()
    if user_info is None:
        return None
    sec_user_id = str(user_info)

    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + str(sec_user_id)
    )

    try:
        videos = get_user_videos(sec_user_id=sec_user_id, max_cursor=0, count=20)
        if videos is None:
            # rds.push_request_id(sec_user_id)
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 数据获取失败!响应数据为空!'
                + '\n'
                + str(sec_user_id)
                + '\n'
            )
            sys.exit(0)

        if isinstance(videos, dict):
            awemes = videos.get('aweme_list')
        else:
            # print(videos)
            awemes = None

        if awemes:
            set_score('success')
            data = str(sec_user_id) + '@@@' + json.dumps(videos)
            rds.push_data_list(data)
        else:
            set_score('fail')
    except Exception as e:
        set_score('fail')
        # rds.push_request_id(sec_user_id)
        print(
            time.strftime("%H:%M:%S", time.localtime())
            + ' '
            + str(sec_user_id)
            + '数据异常:'
            + str(e)
        )
        sys.exit(0)

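# scrape() pushes each result onto the Redis data list as "<sec_user_id>@@@<json>". A
# downstream consumer could split it like this (sketch, assuming that exact format):
#
#     sec_user_id, _, payload = data.partition('@@@')
#     videos = json.loads(payload)
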
if __name__ == '__main__':
    print("主方法开始执行")

    # Number of parallel worker threads, taken from the first CLI argument.
    threading_count = int(sys.argv[1])

    rds = RdsUserVideoList()
    warnings.filterwarnings("ignore")

    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' 开始执行,用户队列长度:'
        + str(rds.get_len())
    )

    while True:
        sys.stdout.flush()

        # Count active worker threads, excluding the main thread.
        active_count = threading.active_count() - 1
        increment = threading_count - active_count

        # Top up the pool with new worker threads until the target count is reached.
        while increment > 0:
            sys.stdout.flush()
            # scrape()
            task = threading.Thread(target=scrape, args=())
            task.start()  # ready to run; waits for the CPU to schedule it
            increment = increment - 1

        # Stop after one hour of wall-clock time.
        current_time = time.time()
        if current_time - start_time > 3600:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 主方法执行终止'
            )
            sys.exit(0)

        time.sleep(1)
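
# The script takes the worker thread count as its only command-line argument, e.g.
# (illustrative invocation):
#
#     python douyin_video_scraper_web.py 8
#
# It stops on its own once it has been running for about an hour (the start_time check above).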