Shop livestream scraping Python script

douyin_video_scraper_web.py 11KB

# -*- coding: utf-8 -*-
import json
import random
import sys
import threading
import time
import warnings
from urllib import parse
from urllib.parse import quote

import execjs
import requests

from libs.proxy import Proxy
from log.print_log import PrintLog
from rds_model.db_redis import DbRedis
from rds_model.rds_user_video_list import RdsUserVideoList
from web_cookie import Cookie
from web_dy import *

start_time = time.time()
# Maintain a running 'total@@@success@@@fail' counter string in Redis:
# 'success' and 'fail' bump their respective counts, 'all' bumps the total.
def set_score(flag):
    rds = RdsUserVideoList()
    if flag == 'success':
        data_score = rds.get_score()
        if data_score is None:
            data_score = '1@@@1@@@0'
        else:
            data_score = data_score.split('@@@')
            total, success, fail = int(data_score[0]), int(data_score[1]), int(data_score[2])
            success = success + 1
            data_score = str(total) + '@@@' + str(success) + '@@@' + str(fail)
        rds.record_score(data_score)
    elif flag == 'fail':
        data_score = rds.get_score()
        if data_score is None:
            data_score = '1@@@0@@@1'
        else:
            data_score = data_score.split('@@@')
            total, success, fail = int(data_score[0]), int(data_score[1]), int(data_score[2])
            fail = fail + 1
            data_score = str(total) + '@@@' + str(success) + '@@@' + str(fail)
        rds.record_score(data_score)
    elif flag == 'all':
        data_score = rds.get_score()
        if data_score is None:
            data_score = '1@@@0@@@0'
        else:
            data_score = data_score.split('@@@')
            total, success, fail = int(data_score[0]), int(data_score[1]), int(data_score[2])
            total = total + 1
            data_score = str(total) + '@@@' + str(success) + '@@@' + str(fail)
        rds.record_score(data_score)
    return None
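
# Example of how the counter string evolves, starting from an empty Redis key:
#   set_score('all')      -> '1@@@0@@@0'
#   set_score('success')  -> '1@@@1@@@0'
#   set_score('fail')     -> '1@@@1@@@1'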

# Compute the anti-crawl '_signature' query parameter by evaluating a local
# JavaScript file with execjs.
def get_signature(url=None, method='_signature'):
    with open('/mnt/shop_live_scraper/signature.js', 'r', encoding='utf-8') as f:
        b = f.read()
    c = execjs.compile(b)
    # url = url.replace('%28', '(').replace('%29', ')').replace('%2C', ',')
    d = c.call(method, url.replace('\n', ''))
    return d
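
# Usage sketch: signature.js is expected to expose a function named
# '_signature' that takes the un-signed request URL (this mirrors the call
# made in get_user_videos below):
#   sig = get_signature('https://www.douyin.com/aweme/v1/web/aweme/post/?aid=6383')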

# Return a random (User-Agent, cookie) pair from hard-coded pools.
def get_ua_ck():
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
    ]
    ck_list = [
        'ttwid=1%7CTVzdM0P0u-8dtsmh6c-EaQEtBoTSOs_MG85FAg07AbA%7C1631502013%7C66442d8594de8e93ad18b73f3dfe0c94ed864c3d932824bcde9918b5be172321; passport_csrf_token=866923f1a32045fd82e47053158402a2',
        'ttwid=1%7CGPDDu9-w3RGs2Pcd0wRlvLYoktpDt-v8LP5ZMyb1NBM%7C1630319594%7Cffb8de47e6da87dcfd76349b5ad34aa1f9b9d4332261a3a8436b932a893366c1; passport_csrf_token=79284b8777a7a54f3066cefef9af539e',
        'ttwid=1%7CGsfqc7NpdOg4N-U-VX7Q77KsWjVTZ7gxLNifsisj8YE%7C1631618570%7Cafbb13a27fd1c2d7a8245454b1e0d7cd654d80848a320933a25d9ef77638c18c; passport_csrf_token=84911c9af94040a99cc10416bd27533d',
        'ttwid=1%7C82FGr05YUOReYUB301ao_erqOQ3ilbXZdEy0tkMsdXY%7C1631863641%7C1dcebe643a96f00841a3b490db60de886bfe07ff3d276e509717abc4e1681ba6; passport_csrf_token=494ae3fffe00328101fd40e050ce49db',
        'ttwid=1%7CwfnX3T9LY4_60iGoQNzyqYe5ahILFeRxfMuZ1pdgXf8%7C1632724192%7Cb613fddc0b533d5578dad4d5f9290705fdc6432aa854d492f4761d164dd3fdd5; passport_csrf_token=4a8afba333103b033e537003b72ee91b'
    ]
    return random.choice(ua_list), random.choice(ck_list)

# Fetch one page of a user's posted videos from the Douyin web API.
def get_user_videos(sec_user_id, max_cursor=0, count=20):
    # Only the random UA is used here; the cookie comes from the Cookie pool below.
    ua, _ = get_ua_ck()
    url = 'https://www.douyin.com/aweme/v1/web/aweme/post/?'
    param = {
        'device_platform': 'webapp',
        'aid': '6383',
        'channel': 'channel_pc_web',
        'sec_user_id': sec_user_id,
        'max_cursor': str(max_cursor),
        'count': str(count),
        'publish_video_strategy_type': '2',
        'version_code': '170400',
        'version_name': '17.4.0',
        'cookie_enabled': 'true',
        'screen_width': '1920',
        'screen_height': '1080',
        'browser_language': 'zh-CN',
        'browser_platform': 'Win32',
        'browser_name': 'Mozilla',
        'browser_version': ua.replace('Mozilla/', ''),
        'browser_online': 'true',
        'source': 'channel_pc_web'
    }
    url = url + parse.urlencode(param)
    # Sign the assembled query string and append the signature.
    _signature = get_signature(url)
    url += '&_signature=' + quote(_signature)
    ck = Cookie.get()
    if ck is None:
        print('Failed to get a cookie')
        return None
    headers = {
        "authority": "www.douyin.com",
        "method": "GET",
        "path": str(url).replace('https://www.douyin.com', ''),
        "scheme": "https",
        "accept": "application/json, text/plain, */*",
        # "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": ck,
        "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
        "user-agent": ua,
        "withcredentials": "true",
        "sec-ch-ua": '"Google Chrome";v="93", " Not;A Brand";v="99", "Chromium";v="93"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin"
    }
    # Retry through rotating proxies; give up (and drop the cookie, which is
    # presumably stale) after 20 attempts.
    retry = 0
    response_json = None
    while True:
        if retry > 20:
            Cookie.del_cookie(ck)
            break
        retry += 1
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8
            )
            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
                response_json = response.json()
                if response_json.get('aweme_list') is not None:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Data fetched successfully!'
                        + '\n'
                        + str(sec_user_id)
                    )
                    break
                else:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Data fetch failed!'
                        + '\n'
                        + str(sec_user_id)
                        + '\n'
                        + response.text
                        + Proxy.proxy_info
                    )
            else:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' HTTP request failed! '
                    + str(response.status_code)
                    + '\n'
                    + Proxy.proxy_info
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                    + 'Response: ' + str(response)
                    + '\n'
                )
                time.sleep(1)
        except requests.exceptions.ProxyError as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Proxy expired! '
                + str(e)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
            # Discard the dead proxy so it is not handed out again.
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' ConnectTimeout! '
                + str(e)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except Exception as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Request raised an exception! '
                + str(e)
                + '\n'
                + str(e.__traceback__.tb_lineno)
                + '\n'
                + str(sec_user_id)
                + '\n'
                + Proxy.proxy_info
            )
    return response_json
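
# A minimal pagination sketch built on get_user_videos(). It assumes the
# response JSON carries 'has_more' and 'max_cursor' fields alongside
# 'aweme_list', which this endpoint usually returns (an assumption, not
# something verified in this file):
def get_all_user_videos(sec_user_id):
    cursor, items = 0, []
    while True:
        page = get_user_videos(sec_user_id, max_cursor=cursor, count=20)
        if not page or not page.get('aweme_list'):
            break
        items.extend(page['aweme_list'])
        if not page.get('has_more'):
            break
        cursor = page.get('max_cursor', cursor)
    return items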

# Pop one sec_user_id from the Redis request queue, fetch its videos, and
# push the result onto the data list.
def scrape():
    rds = RdsUserVideoList()
    user_info = rds.get_request_param()
    if user_info is None:
        return None
    sec_user_id = str(user_info)
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + str(sec_user_id)
    )
    try:
        videos = get_user_videos(sec_user_id=sec_user_id, max_cursor=0, count=50)
        if videos is None:
            # rds.push_request_id(sec_user_id)
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Data fetch failed! Empty response!'
                + '\n'
                + str(sec_user_id)
                + '\n'
            )
            # In a worker thread, sys.exit only raises SystemExit and ends this thread.
            sys.exit(0)
        if isinstance(videos, dict):
            awemes = videos.get('aweme_list')
        else:
            awemes = None
        if awemes:
            set_score('success')
            data = str(sec_user_id) + '@@@' + json.dumps(videos)
            rds.push_data_list(data)
        else:
            set_score('fail')
    except Exception as e:
        set_score('fail')
        # rds.push_request_id(sec_user_id)
        print(
            time.strftime("%H:%M:%S", time.localtime())
            + ' '
            + str(sec_user_id)
            + ' data error: '
            + str(e)
        )
        sys.exit(0)
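
# Entries pushed by scrape() have the form 'sec_user_id@@@<json>'. A
# hypothetical downstream consumer (the pop method name is assumed, not
# part of the code shown here):
#   raw = RdsUserVideoList().pop_data_list()
#   sec_user_id, _, payload = raw.partition('@@@')
#   videos = json.loads(payload)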

if __name__ == '__main__':
    print('Main method started')
    # Number of parallel worker threads, from the first CLI argument
    threading_count = int(sys.argv[1])
    rds = RdsUserVideoList()
    warnings.filterwarnings("ignore")
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' Starting; user queue length: '
        + str(rds.get_len())
    )
    while True:
        sys.stdout.flush()
        # Subtract the main thread from the active count
        active_count = threading.active_count() - 1
        increment = threading_count - active_count
        # Top the pool back up to the configured thread count.
        while increment > 0:
            sys.stdout.flush()
            task = threading.Thread(target=scrape, args=())
            task.start()  # ready; waits to be scheduled
            increment = increment - 1
        current_time = time.time()
        # Stop after one hour; presumably restarted by an external supervisor.
        if current_time - start_time > 3600:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Main method terminated'
            )
            sys.exit(0)
        time.sleep(1)
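
# Expected invocation (the single argument is the worker-thread count);
# since the script exits after an hour, it is presumably kept alive by an
# external scheduler such as cron or systemd:
#   python douyin_video_scraper_web.py 8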