Shop Livestream Scraping Python Script

douyin_video_scraper_web.py 11KB
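The script drains a Redis queue of Douyin `sec_user_id` values, fetches each user's latest posts from the web endpoint `/aweme/v1/web/aweme/post/` (signing every request with `signature.js` and routing it through rotating proxies and a cookie pool), and pushes the raw JSON back into Redis. It keeps a pool of worker threads sized by its command-line argument and exits after one hour.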

# -*- coding: utf-8 -*-
import json
import sys
import threading
import time
import warnings

# These names are otherwise pulled in via web_dy's wildcard import;
# importing them explicitly keeps the dependencies visible.
import random
import execjs
import requests
from urllib import parse
from urllib.parse import quote

from libs.proxy import Proxy
from log.print_log import PrintLog
from rds_model.db_redis import DbRedis
from rds_model.rds_user_video_list import RdsUserVideoList
from web_cookie import Cookie
from web_dy import *

start_time = int(time.time())
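
# The scrape counter below is persisted in Redis as a single string of the
# form "total@@@success@@@fail"; for example "120@@@95@@@24" records 120
# counted requests, 95 successes and 24 failures.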
def set_score(flag):
    """Bump one field of the scrape counter stored in Redis ('success', 'fail' or 'all')."""
    rds = RdsUserVideoList()
    data_score = rds.get_score()
    if data_score is None:
        # No counter yet: start at one total request plus the given outcome.
        total, success, fail = 1, 0, 0
        if flag == 'success':
            success = 1
        elif flag == 'fail':
            fail = 1
    else:
        parts = data_score.split('@@@')
        total, success, fail = int(parts[0]), int(parts[1]), int(parts[2])
        if flag == 'success':
            success += 1
        elif flag == 'fail':
            fail += 1
        elif flag == 'all':
            total += 1
    rds.record_score(str(total) + '@@@' + str(success) + '@@@' + str(fail))
    return None
def get_signature(url=None, method='_signature'):
    """Compute the _signature query parameter by running Douyin's JS in PyExecJS."""
    with open('/mnt/shop_live_scraper_shen/signature.js', 'r', encoding='utf-8') as f:
        js_source = f.read()
    ctx = execjs.compile(js_source)
    # Newlines would break the JS call, so strip them from the URL first.
    return ctx.call(method, url.replace('\n', ''))
def get_ua_ck():
    """Return a random desktop user agent and a bootstrap ttwid cookie."""
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
    ]
    ck_list = [
        'ttwid=1%7CTVzdM0P0u-8dtsmh6c-EaQEtBoTSOs_MG85FAg07AbA%7C1631502013%7C66442d8594de8e93ad18b73f3dfe0c94ed864c3d932824bcde9918b5be172321; passport_csrf_token=866923f1a32045fd82e47053158402a2',
        'ttwid=1%7CGPDDu9-w3RGs2Pcd0wRlvLYoktpDt-v8LP5ZMyb1NBM%7C1630319594%7Cffb8de47e6da87dcfd76349b5ad34aa1f9b9d4332261a3a8436b932a893366c1; passport_csrf_token=79284b8777a7a54f3066cefef9af539e',
        'ttwid=1%7CGsfqc7NpdOg4N-U-VX7Q77KsWjVTZ7gxLNifsisj8YE%7C1631618570%7Cafbb13a27fd1c2d7a8245454b1e0d7cd654d80848a320933a25d9ef77638c18c; passport_csrf_token=84911c9af94040a99cc10416bd27533d',
        'ttwid=1%7C82FGr05YUOReYUB301ao_erqOQ3ilbXZdEy0tkMsdXY%7C1631863641%7C1dcebe643a96f00841a3b490db60de886bfe07ff3d276e509717abc4e1681ba6; passport_csrf_token=494ae3fffe00328101fd40e050ce49db',
        'ttwid=1%7CwfnX3T9LY4_60iGoQNzyqYe5ahILFeRxfMuZ1pdgXf8%7C1632724192%7Cb613fddc0b533d5578dad4d5f9290705fdc6432aa854d492f4761d164dd3fdd5; passport_csrf_token=4a8afba333103b033e537003b72ee91b'
    ]
    return random.choice(ua_list), random.choice(ck_list)
def get_user_videos(sec_user_id, max_cursor=0, count=20):
    # Only the UA is used here; the request cookie comes from the Cookie pool below.
    ua, _ = get_ua_ck()
    url = 'https://www.douyin.com/aweme/v1/web/aweme/post/?'
    param = {
        'device_platform': 'webapp',
        'aid': '6383',
        'channel': 'channel_pc_web',
        'sec_user_id': sec_user_id,
        'max_cursor': str(max_cursor),
        'count': str(count),
        'publish_video_strategy_type': '2',
        'version_code': '170400',
        'version_name': '17.4.0',
        'cookie_enabled': 'true',
        'screen_width': '1920',
        'screen_height': '1080',
        'browser_language': 'zh-CN',
        'browser_platform': 'Win32',
        'browser_name': 'Mozilla',
        'browser_version': ua.replace('Mozilla/', ''),
        'browser_online': 'true',
        'source': 'channel_pc_web'
    }
    url = url + parse.urlencode(param)
    # Sign the full query string and append the signature.
    _signature = get_signature(url)
    url += '&_signature=' + quote(_signature)
    ck = Cookie.get()
    if ck is None:
        print('Failed to obtain a cookie')
        return None
    headers = {
        "authority": "www.douyin.com",
        "method": "GET",
        "path": str(url).replace('https://www.douyin.com', ''),
        "scheme": "https",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": ck,
        "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
        "user-agent": ua,
        "withcredentials": "true",
        "sec-ch-ua": '"Google Chrome";v="93", " Not;A Brand";v="99", "Chromium";v="93"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin"
    }
    retry = 0
    response_json = None
    while True:
        # Retire the cookie after 20 failed attempts and give up on this user.
        if retry > 20:
            Cookie.del_cookie(ck)
            break
        retry += 1
        proxy = Proxy.rola_get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8
            )
            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
                response_json = response.json()
                if response_json.get('aweme_list') is not None:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Data fetched successfully!'
                        + '\n'
                        + str(sec_user_id)
                    )
                    break
                else:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Data fetch failed!'
                        + '\n'
                        + str(sec_user_id)
                        + '\n'
                        + response.text
                        + Proxy.proxy_info
                    )
            else:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' HTTP request failed! '
                    + str(response.status_code)
                    + '\n'
                    + Proxy.proxy_info
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                    + 'Response: ' + str(response)
                    + '\n'
                )
                time.sleep(1)
        except requests.exceptions.ProxyError as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Proxy expired! '
                + str(e)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
            # Drop the dead proxy so it is not handed out again.
            Proxy.rola_del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' ConnectTimeout! '
                + str(e)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.rola_del_proxy(proxy)
        except Exception as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Request raised an exception! '
                + str(e)
                + '\n'
                + str(e.__traceback__.tb_lineno)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
    return response_json
def scrape():
    rds = RdsUserVideoList()
    while True:
        # Pull the next sec_user_id from the Redis request queue.
        user_info = rds.get_request_param()
        if user_info is None:
            time.sleep(60)
            continue
        sec_user_id = str(user_info)
        print(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            + ' '
            + str(sec_user_id)
        )
        try:
            videos = get_user_videos(sec_user_id=sec_user_id, max_cursor=0, count=1)
            if videos is None:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' Data fetch failed: response was empty!'
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                )
            if isinstance(videos, dict):
                awemes = videos.get('aweme_list')
            else:
                awemes = None
            if awemes:
                set_score('success')
                # Store the result as "<sec_user_id>@@@<raw JSON>".
                data = str(sec_user_id) + '@@@' + json.dumps(videos)
                rds.push_data_list(data)
            else:
                set_score('fail')
        except Exception as e:
            set_score('fail')
            # Requeue the user so another worker can retry it.
            rds.push_request_id(sec_user_id)
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' '
                + str(sec_user_id)
                + ' data error: '
                + str(e)
            )
        # Workers retire after one hour; SystemExit here only ends this thread.
        if time.time() - start_time > 3600:
            sys.exit(0)
if __name__ == '__main__':
    print("Main started")
    # Number of parallel worker threads, from the command line.
    threading_count = int(sys.argv[1])
    rds = RdsUserVideoList()
    warnings.filterwarnings("ignore")
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' Starting; user queue length: '
        + str(rds.get_len())
    )
    while True:
        sys.stdout.flush()
        # The whole process exits after one hour.
        if time.time() - start_time > 3600:
            sys.exit(0)
        # Active worker count, excluding the main thread.
        active_count = threading.active_count() - 1
        increment = threading_count - active_count
        # Top the pool back up, at most one new thread per second.
        if increment > 0:
            task = threading.Thread(target=scrape, args=())
            task.start()
        time.sleep(1)
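
Run it with the worker-thread count as the only argument, e.g. `python douyin_video_scraper_web.py 8` keeps eight scraper threads alive for one hour. For a one-off smoke test outside the Redis pipeline, the fetch function can also be called directly; a minimal sketch, assuming the local modules (`web_dy`, `web_cookie`, `libs.proxy`, `rds_model`) and `signature.js` are in place, with a placeholder sec_user_id:

from douyin_video_scraper_web import get_user_videos

# 'MS4wLjABAAAA_EXAMPLE' is a made-up placeholder, not a real account id.
result = get_user_videos(sec_user_id='MS4wLjABAAAA_EXAMPLE', max_cursor=0, count=1)
if result and result.get('aweme_list'):
    print('fetched', len(result['aweme_list']), 'video(s)')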