店播爬取Python脚本

douyin_live_lottery_scraper.py 9.8KB


  1. from rds_model.rds_room_lottery_request_list import RdsRoomLotteryRequestList
  2. import time
  3. import json
  4. import sys
  5. import threading
  6. import random
  7. import urllib
  8. import requests
  9. from rds_model.db_redis import DbRedis
  10. from log.print_log import PrintLog
  11. from libs.Xg04 import X_Gorgon
  12. from libs.proxy import Proxy
# Wall-clock time captured when the module is loaded; the main loop compares
# against it to stop the script once more than 3600 seconds have elapsed.
start_time = time.time()
  14. def get_random(i, random_type=1):
  15. if random_type == 1:
  16. return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
  17. elif random_type == 8:
  18. seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  19. sa = []
  20. for i in range(i):
  21. sa.append(random.choice(seed))
  22. salt = ''.join(sa)
  23. return salt
  24. else:
  25. seed = "1234567890abcde"
  26. sa = []
  27. for i in range(i):
  28. sa.append(random.choice(seed))
  29. salt = ''.join(sa)
  30. return salt
  31. def get_random_brand_type():
  32. brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
  33. return brand_type
  34. def get_mc():
  35. def a():
  36. seed = "1234567890ABCDEF"
  37. sa = []
  38. for i in range(2):
  39. sa.append(random.choice(seed))
  40. salt = ''.join(sa)
  41. return salt
  42. k = ''
  43. for i in range(6):
  44. k += a() + ':'
  45. return k[:-1]
  46. def get_trace():
  47. trace_list = [
  48. '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
  49. '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
  50. '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
  51. '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
  52. '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
  53. '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
  54. '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
  55. '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
  56. ]
  57. return random.choice(trace_list)
  58. def get_live_lottery_data(room_id, result):
  59. domain = 'webcast5-normal-ipv6-lf.amemv.com'
  60. url = 'https://' + domain + '/webcast/lottery/melon/check_user_right/?'
  61. rticket = str(int(time.time() * 1000))
  62. mc = get_mc
  63. udid = '8604' + get_random(11)
  64. trace_id = get_trace()
  65. # openudid = '3b22' + str(udid.uuid4())[-12:]
  66. device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
  67. query = {
  68. "room_id" : str(room_id),
  69. "webcast_sdk_version" : "1690",
  70. "webcast_language" : "zh",
  71. "webcast_locale" :"zh_CN",
  72. "webcast_gps_access" : "1",
  73. "os_api" : "23",
  74. "device_type" : device_type,
  75. "ssmix" : "a",
  76. "manifest_version_code" : "120801",
  77. "dpi" : "640",
  78. "app_name" : "aweme",
  79. "version_name" : "12.8.0",
  80. "ts" : int(time.time()),
  81. "cpu_support64" : "true",
  82. "storage_type" : "0",
  83. "app_type" : "normal",
  84. "appTheme" : "dark",
  85. "ac" : "wifi",
  86. "host_abi" : "armeabi-v7a",
  87. "update_version_code" : "12809900",
  88. "channel" : "wandoujia_douyinnew_1128",
  89. "_rticket" : rticket,
  90. # "_rticket" : "1629688012123",
  91. "device_platform" : "android",
  92. "iid" : str(iid),
  93. "version_code" : "120800",
  94. "mac_address" : mc,
  95. #"mac_address" : "FC%3ADB%3AB3%3A56%3ABD%3AFD",
  96. "cdid" : "6c96979e-c729-419c-9516-3a85a7338d0c",
  97. "openudid" : str(openudid),
  98. "device_id" : str(device_id),
  99. "resolution" : "1440*2560",
  100. "os_version" : "6.0.1",
  101. "language" : "zh",
  102. "device_brand":device_brand,
  103. "aid" : "1128"
  104. }
  105. query_params = urllib.parse.urlencode(query)
  106. url = url + query_params
  107. body = ''
  108. xGorgon = X_Gorgon(query_params, body)
  109. userAgent = userAgent = f'com.ss.android.ugc.aweme/1208000 (Linux; U; Android 5.1.1; zh_CN; {device_type}; Build/LMY47V; Cronet/58.0.2991.0)'
  110. headers = {
  111. 'Host': domain,
  112. 'Connection': 'keep-alive',
  113. 'Cache-Control': 'max-age=0',
  114. 'Upgrade-Insecure-Requests': '1',
  115. 'User-Agent': userAgent,
  116. # 'accept-encoding': 'gzip, deflate',
  117. # "x-SS-REQ-TICKET": rticket,
  118. "x-gorgon": xGorgon.get('X-Gorgon'),
  119. "x-khronos": xGorgon.get('X-Khronos'),
  120. 'passport-sdk-version' : '17',
  121. 'sdk-version' : '2',
  122. 'x-ss-dp' : '1128',
  123. 'x-tt-trace-id' : trace_id
  124. }
  125. retry = 0
  126. response_json = None
  127. while True:
  128. if retry > 5:
  129. break
  130. retry += 1
  131. proxy = Proxy.get()
  132. proxies = {
  133. "http": "http://" + proxy,
  134. "https": "http://" + proxy
  135. }
  136. try:
  137. response = requests.get(
  138. url,
  139. headers=headers,
  140. proxies=proxies,
  141. timeout=8
  142. )
  143. if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
  144. response_json = response.json()
  145. if (response_json.get('data') is not None):
  146. print(
  147. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  148. + ' 数据获取成功!'
  149. + '\n'
  150. + room_id
  151. + '\n'
  152. )
  153. break
  154. else:
  155. print(
  156. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  157. + ' 数据获取失败!'
  158. + '\n'
  159. + room_id
  160. + '\n'
  161. + response.text
  162. + Proxy.proxy_info
  163. )
  164. else:
  165. print(
  166. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  167. + ' 爬取http连接失败!'
  168. + str(response.status_code)
  169. + '\n'
  170. + Proxy.proxy_info
  171. + '\n'
  172. + room_id
  173. + '\n'
  174. + '爬取结果:' + str(response)
  175. + '\n'
  176. )
  177. time.sleep(1)
  178. except requests.exceptions.ProxyError as e:
  179. print(
  180. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  181. + ' 代理过期!'
  182. + str(e)
  183. + '\n'
  184. + room_id
  185. + '\n'
  186. + Proxy.proxy_info
  187. )
  188. Proxy.del_proxy(proxy)
  189. pass
  190. except requests.exceptions.ConnectTimeout as e:
  191. print(
  192. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  193. + ' ConnectTimeout!'
  194. + str(e)
  195. + '\n'
  196. + room_id
  197. + '\n'
  198. + Proxy.proxy_info
  199. )
  200. Proxy.del_proxy(proxy)
  201. pass
  202. except Exception as e:
  203. print(
  204. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  205. + ' 请求抛出异常!'
  206. + str(e)
  207. + '\n'
  208. + room_id
  209. + '\n'
  210. + Proxy.proxy_info
  211. )
  212. pass
  213. return response_json
  214. def scrape():
  215. rds_list = RdsRoomLotteryRequestList()
  216. room_info = rds.get_request_param()
  217. if room_info is None:
  218. return None
  219. room_info = json.loads(room_info)
  220. room_id = room_info.get('room_id')
  221. room_id = str(room_id)
  222. print(
  223. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  224. + ' '
  225. + str(room_id)
  226. )
  227. key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
  228. rdid = DbRedis.connect().rpop(key)
  229. if rdid:
  230. result = rdid.split('@@@')
  231. else:
  232. return None
  233. DbRedis.connect().lpush(key, rdid)
  234. try:
  235. response_json = get_live_lottery_data(room_id, result)
  236. if response_json is None:
  237. # rds_list.record_score(0)
  238. # rds_list.push_request_id(room_id)
  239. print(
  240. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  241. + ' 数据获取失败!响应数据为空!'
  242. + '\n'
  243. + room_id
  244. + '\n'
  245. )
  246. sys.exit(0)
  247. data = json.dumps({
  248. "data": response_json.get('data'),
  249. "extra": {
  250. 'room_id': room_id
  251. }
  252. })
  253. # rds_list.record_score(1)
  254. rds_list.push_data_list(data)
  255. except Exception as e:
  256. # rds_list.record_score(0)
  257. rds_list.push_request_id(room_id)
  258. print(
  259. time.strftime("%H:%M:%S", time.localtime())
  260. + ' '
  261. + room_id
  262. + '数据异常:'
  263. + str(e)
  264. )
  265. sys.exit(0)
  266. if __name__ == "__main__":
  267. print("主方法开始执行")
  268. # 并行线程数
  269. threading_count = int(sys.argv[1])
  270. rds = RdsRoomLotteryRequestList()
  271. print(
  272. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  273. + ' '
  274. + ' 开始执行,更新直播队列长度:'
  275. + str(rds.get_len())
  276. )
  277. while True:
  278. sys.stdout.flush()
  279. # 减去主线程
  280. active_count = threading.active_count() - 1
  281. increment = threading_count - active_count
  282. while increment > 0:
  283. sys.stdout.flush()
  284. # scrape()
  285. task = threading.Thread(target=scrape, args=())
  286. task.start() # 准备就绪, 等待cpu执行
  287. increment = increment - 1
  288. current_time = time.time()
  289. if current_time - start_time > 3600:
  290. print(
  291. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  292. + ' 主方法执行终止'
  293. )
  294. sys.exit(0)
  295. time.sleep(1)