# 店播爬取Python脚本 (Python script for scraping shop live-stream data)
#
# douyin_live_lottery_scraper.py 11KB
import json
import random
import sys
import threading
import time
import urllib
import urllib.parse

import requests

from libs.proxy import Proxy
from libs.Xg04 import X_Gorgon
from log.print_log import PrintLog
from rds_model.db_redis import DbRedis
from rds_model.rds_room_lottery_request_list import RdsRoomLotteryRequestList
# Wall-clock time captured at module load; the __main__ loop compares against
# it and exits once more than 3600 seconds have elapsed.
start_time = time.time()
  14. def get_random(i, random_type=1):
  15. if random_type == 1:
  16. return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
  17. elif random_type == 8:
  18. seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  19. sa = []
  20. for i in range(i):
  21. sa.append(random.choice(seed))
  22. salt = ''.join(sa)
  23. return salt
  24. else:
  25. seed = "1234567890abcde"
  26. sa = []
  27. for i in range(i):
  28. sa.append(random.choice(seed))
  29. salt = ''.join(sa)
  30. return salt
  31. def get_random_brand_type():
  32. brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
  33. return brand_type
  34. def get_random_trace():
  35. random_one = 'c' + get_random(15)
  36. random_two = 'ae6' + get_random(9) + '0468'
  37. trace_id = '00-' + str(random_one) + str(random_two) + '-' + str(random_one) + '-01'
  38. return trace_id
  39. def get_mc():
  40. def a():
  41. seed = "1234567890ABCDEF"
  42. sa = []
  43. for i in range(2):
  44. sa.append(random.choice(seed))
  45. salt = ''.join(sa)
  46. return salt
  47. k = ''
  48. for i in range(6):
  49. k += a() + ':'
  50. return k[:-1]
  51. def get_trace():
  52. trace_list = [
  53. '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
  54. '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
  55. '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
  56. '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
  57. '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
  58. '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
  59. '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
  60. '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
  61. '00-6eb478cf09b7fcc0ae6e437c9e160468-6eb478cf09b7fcc0-01',
  62. '00-6eb4752709b7fcc0ae69f22235260468-6eb4752709b7fcc0-01',
  63. '00-6eb4469109b7fcc0ae6ba6f250a90468-6eb4469109b7fcc0-01',
  64. '00-6eb43ea209b7fcc0ae66a34128bd0468-6eb43ea209b7fcc0-01',
  65. ]
  66. return random.choice(trace_list)
  67. def get_live_lottery_data(room_id, result):
  68. domain = 'webcast5-normal-ipv6-lf.amemv.com'
  69. url = 'https://' + domain + '/webcast/lottery/melon/check_user_right/?'
  70. rticket = str(int(time.time() * 1000))
  71. mc = get_mc
  72. udid = '8604' + get_random(11)
  73. trace_id = get_trace()
  74. # trace_id = get_random_trace()
  75. # openudid = '3b22' + str(udid.uuid4())[-12:]
  76. device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
  77. query = {
  78. "room_id" : str(room_id),
  79. "webcast_sdk_version" : "1690",
  80. "webcast_language" : "zh",
  81. "webcast_locale" :"zh_CN",
  82. "webcast_gps_access" : "1",
  83. "os_api" : "23",
  84. "device_type" : device_type,
  85. "ssmix" : "a",
  86. "manifest_version_code" : "120801",
  87. "dpi" : "640",
  88. "app_name" : "aweme",
  89. "version_name" : "12.8.0",
  90. "ts" : int(time.time()),
  91. "cpu_support64" : "true",
  92. "storage_type" : "0",
  93. "app_type" : "normal",
  94. "appTheme" : "dark",
  95. "ac" : "wifi",
  96. "host_abi" : "armeabi-v7a",
  97. "update_version_code" : "12809900",
  98. "channel" : "wandoujia_douyinnew_1128",
  99. "_rticket" : rticket,
  100. # "_rticket" : "1629688012123",
  101. "device_platform" : "android",
  102. "iid" : str(iid),
  103. "version_code" : "120800",
  104. "mac_address" : mc,
  105. #"mac_address" : "FC%3ADB%3AB3%3A56%3ABD%3AFD",
  106. "cdid" : "6c96979e-c729-419c-9516-3a85a7338d0c",
  107. "openudid" : str(openudid),
  108. "device_id" : str(device_id),
  109. "resolution" : "1440*2560",
  110. "os_version" : "6.0.1",
  111. "language" : "zh",
  112. "device_brand":device_brand,
  113. "aid" : "1128"
  114. }
  115. query_params = urllib.parse.urlencode(query)
  116. url = url + query_params
  117. body = ''
  118. xGorgon = X_Gorgon(query_params, body)
  119. userAgent = userAgent = f'com.ss.android.ugc.aweme/1208000 (Linux; U; Android 5.1.1; zh_CN; {device_type}; Build/LMY47V; Cronet/58.0.2991.0)'
  120. headers = {
  121. 'Host': domain,
  122. 'Connection': 'keep-alive',
  123. 'Cache-Control': 'max-age=0',
  124. 'Upgrade-Insecure-Requests': '1',
  125. 'User-Agent': userAgent,
  126. # 'accept-encoding': 'gzip, deflate',
  127. # "x-SS-REQ-TICKET": rticket,
  128. "x-gorgon": xGorgon.get('X-Gorgon'),
  129. "x-khronos": xGorgon.get('X-Khronos'),
  130. 'passport-sdk-version' : '17',
  131. 'sdk-version' : '2',
  132. 'x-ss-dp' : '1128',
  133. 'x-tt-trace-id' : trace_id
  134. }
  135. retry = 0
  136. response_json = None
  137. while True:
  138. if retry > 3:
  139. break
  140. retry += 1
  141. proxy = Proxy.dailiyun_get()
  142. proxies = {
  143. "http": "http://" + proxy,
  144. "https": "http://" + proxy
  145. }
  146. try:
  147. response = requests.get(
  148. url,
  149. headers=headers,
  150. proxies=proxies,
  151. timeout=8
  152. )
  153. if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
  154. response_json = response.json()
  155. if (response_json.get('data') is not None):
  156. print(
  157. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  158. + ' 数据获取成功!'
  159. + '\n'
  160. + room_id
  161. + '\n'
  162. )
  163. break
  164. else:
  165. print(
  166. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  167. + ' 数据获取失败!'
  168. + '\n'
  169. + room_id
  170. + '\n'
  171. + response.text
  172. + Proxy.proxy_info
  173. )
  174. else:
  175. print(
  176. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  177. + ' 爬取http连接失败!'
  178. + str(response.status_code)
  179. + '\n'
  180. + Proxy.proxy_info
  181. + '\n'
  182. + room_id + 'trace_id:' + trace_id
  183. + '\n'
  184. + '爬取结果:' + str(response)
  185. + '\n'
  186. )
  187. time.sleep(1)
  188. except requests.exceptions.ProxyError as e:
  189. print(
  190. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  191. + ' 代理过期!'
  192. + str(e)
  193. + '\n'
  194. + room_id
  195. + '\n'
  196. + Proxy.proxy_info
  197. )
  198. Proxy.dailiyun_del_proxy(proxy)
  199. pass
  200. except requests.exceptions.ConnectTimeout as e:
  201. print(
  202. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  203. + ' ConnectTimeout!'
  204. + str(e)
  205. + '\n'
  206. + room_id
  207. + '\n'
  208. + Proxy.proxy_info
  209. )
  210. Proxy.dailiyun_del_proxy(proxy)
  211. pass
  212. except Exception as e:
  213. print(
  214. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  215. + ' 请求抛出异常!'
  216. + str(e)
  217. + '\n'
  218. + room_id
  219. + '\n'
  220. + Proxy.proxy_info
  221. )
  222. pass
  223. return response_json
  224. def scrape():
  225. rds_list = RdsRoomLotteryRequestList()
  226. while(True):
  227. room_info = rds.get_request_param()
  228. if room_info is None:
  229. return None
  230. room_info = json.loads(room_info)
  231. room_id = room_info.get('room_id')
  232. room_id = str(room_id)
  233. print(
  234. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  235. + ' '
  236. + str(room_id)
  237. )
  238. key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
  239. rdid = DbRedis.connect().rpop(key)
  240. if rdid:
  241. result = rdid.split('@@@')
  242. else:
  243. return None
  244. DbRedis.connect().lpush(key, rdid)
  245. try:
  246. response_json = get_live_lottery_data(room_id, result)
  247. if response_json is None:
  248. # rds_list.record_score(0)
  249. # rds_list.push_request_id(room_id)
  250. print(
  251. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  252. + ' 数据获取失败!响应数据为空!'
  253. + '\n'
  254. + room_id
  255. + '\n'
  256. )
  257. # sys.exit(0)
  258. data = json.dumps({
  259. "data": response_json.get('data'),
  260. "extra": {
  261. 'room_id': room_id
  262. }
  263. })
  264. # rds_list.record_score(1)
  265. rds_list.push_data_list(data)
  266. except Exception as e:
  267. # rds_list.record_score(0)
  268. rds_list.push_request_id(room_id)
  269. print(
  270. time.strftime("%H:%M:%S", time.localtime())
  271. + ' '
  272. + room_id
  273. + '数据异常:'
  274. + str(e)
  275. )
  276. # sys.exit(0)
  277. if __name__ == "__main__":
  278. print("主方法开始执行")
  279. # 并行线程数
  280. threading_count = int(sys.argv[1])
  281. rds = RdsRoomLotteryRequestList()
  282. print(
  283. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  284. + ' '
  285. + ' 开始执行,更新直播队列长度:'
  286. + str(rds.get_len())
  287. )
  288. while True:
  289. sys.stdout.flush()
  290. # 减去主线程
  291. active_count = threading.active_count() - 1
  292. increment = threading_count - active_count
  293. if increment > 0:
  294. sys.stdout.flush()
  295. # scrape()
  296. task = threading.Thread(target=scrape, args=())
  297. task.start() # 准备就绪, 等待cpu执行
  298. # increment = increment - 1
  299. current_time = time.time()
  300. if current_time - start_time > 3600:
  301. print(
  302. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  303. + ' 主方法执行终止'
  304. )
  305. sys.exit(0)
  306. time.sleep(1)