# 店播爬取Python脚本 (Python script for scraping shop live-stream data)
#
# douyin_live_lottery_scraper.py (12KB)

import json
import random
import sys
import threading
import time
import urllib
import urllib.parse

import requests

from libs.proxy import Proxy
from libs.Xg04 import X_Gorgon
from log.print_log import PrintLog
from rds_model.db_redis import DbRedis
from rds_model.rds_room_lottery_request_list import RdsRoomLotteryRequestList
  13. start_time = int(time.time())
  14. def get_random(i, random_type=1):
  15. if random_type == 1:
  16. return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
  17. elif random_type == 8:
  18. seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  19. sa = []
  20. for i in range(i):
  21. sa.append(random.choice(seed))
  22. salt = ''.join(sa)
  23. return salt
  24. else:
  25. seed = "1234567890abcde"
  26. sa = []
  27. for i in range(i):
  28. sa.append(random.choice(seed))
  29. salt = ''.join(sa)
  30. return salt
  31. def get_random_brand_type():
  32. brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
  33. return brand_type
  34. def get_random_trace():
  35. random_one = 'c' + get_random(15)
  36. random_two = 'ae6' + get_random(9) + '0468'
  37. trace_id = '00-' + str(random_one) + str(random_two) + '-' + str(random_one) + '-01'
  38. return trace_id
  39. def get_mc():
  40. def a():
  41. seed = "1234567890ABCDEF"
  42. sa = []
  43. for i in range(2):
  44. sa.append(random.choice(seed))
  45. salt = ''.join(sa)
  46. return salt
  47. k = ''
  48. for i in range(6):
  49. k += a() + ':'
  50. return k[:-1]
  51. def get_trace():
  52. trace_list = [
  53. '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
  54. '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
  55. '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
  56. '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
  57. '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
  58. '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
  59. '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
  60. '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
  61. '00-6eb478cf09b7fcc0ae6e437c9e160468-6eb478cf09b7fcc0-01',
  62. '00-6eb4752709b7fcc0ae69f22235260468-6eb4752709b7fcc0-01',
  63. '00-6eb4469109b7fcc0ae6ba6f250a90468-6eb4469109b7fcc0-01',
  64. '00-6eb43ea209b7fcc0ae66a34128bd0468-6eb43ea209b7fcc0-01',
  65. '00-98d3c43df91c019bdf0d508485048c4c-98d3c43df91c019b-01',
  66. '00-a77c64227e911ee70811198281710139-a77c64227e911ee7-01',
  67. '00-354016da2d2a9c3b31f771521b8c3316-354016da2d2a9c3b-01',
  68. ]
  69. return random.choice(trace_list)
  70. def get_live_lottery_data(room_id):
  71. domain = 'webcast5-normal-ipv6-lf.amemv.com'
  72. url = 'https://' + domain + '/webcast/lottery/melon/check_user_right/?'
  73. retry = 0
  74. response_json = None
  75. while True:
  76. if retry > 30:
  77. break
  78. # key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
  79. # rdid = DbRedis.connect().rpop(key)
  80. # if rdid:
  81. # result = rdid.split('@@@')
  82. # DbRedis.connect().lpush(key, rdid)
  83. # else:
  84. # time.sleep(1)
  85. # continue
  86. # # return None
  87. key = "DOUYIN_REGISTER_QUEUE"
  88. register_device = DbRedis.connect().rpop(key)
  89. if register_device:
  90. result = json.loads(register_device)
  91. DbRedis.connect().lpush(key, register_device)
  92. else:
  93. time.sleep(0.1)
  94. continue
  95. retry += 1
  96. rticket = str(int(time.time() * 1000))
  97. mc = get_mc
  98. udid = '8604' + get_random(11)
  99. trace_id = get_trace()
  100. # trace_id = get_random_trace()
  101. # openudid = '3b22' + str(udid.uuid4())[-12:]
  102. # device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
  103. device_id, iid, udid, openudid, cookie = result['device_id'], result['iid'], result['uuid'], result['openudid'], result['cookie']
  104. query = {
  105. "room_id" : str(room_id),
  106. "webcast_sdk_version" : "1690",
  107. "webcast_language" : "zh",
  108. "webcast_locale" :"zh_CN",
  109. "webcast_gps_access" : "1",
  110. "os_api" : "23",
  111. "device_type" : "HSF-FL00",
  112. "ssmix" : "a",
  113. "manifest_version_code" : "120801",
  114. "dpi" : "640",
  115. "app_name" : "aweme",
  116. "version_name" : "12.8.0",
  117. "ts" : str(int(time.time())),
  118. "cpu_support64" : "true",
  119. "storage_type" : "0",
  120. "app_type" : "normal",
  121. "appTheme" : "dark",
  122. "ac" : "wifi",
  123. "host_abi" : "armeabi-v7a",
  124. "update_version_code" : "12809900",
  125. "channel" : "wandoujia_douyinnew_1128",
  126. "_rticket" : rticket,
  127. # "_rticket" : "1629688012123",
  128. "device_platform" : "android",
  129. "iid" : str(iid),
  130. "version_code" : "120800",
  131. "mac_address" : mc,
  132. #"mac_address" : "FC%3ADB%3AB3%3A56%3ABD%3AFD",
  133. "cdid" : "6c96979e-c729-419c-9516-3a85a7338d0c",
  134. "openudid" : str(openudid),
  135. "device_id" : str(device_id),
  136. "resolution" : "1440*2560",
  137. "os_version" : "6.0.1",
  138. "language" : "zh",
  139. "device_brand" : "HUAWEI",
  140. "aid" : "1128"
  141. }
  142. query_params = urllib.parse.urlencode(query)
  143. url = url + query_params
  144. body = ""
  145. xGorgon = X_Gorgon(query_params, body)
  146. userAgent = "com.ss.android.ugc.aweme/1208000 (Linux; U; Android 5.1.1; zh_CN; HSF-FL00; Build/LMY47V; Cronet/58.0.2991.0)"
  147. headers = {
  148. "Host" : domain,
  149. "Connection" : "keep-alive",
  150. "Cache-Control" : "max-age=0",
  151. "Upgrade-Insecure-Requests" : "1",
  152. "User-Agent" : userAgent,
  153. # 'accept-encoding': 'gzip, deflate',
  154. # "x-SS-REQ-TICKET": rticket,
  155. "x-gorgon" : xGorgon.get('X-Gorgon'),
  156. "x-khronos" : xGorgon.get('X-Khronos'),
  157. "passport-sdk-version" : "17",
  158. "sdk-version" : "2",
  159. "x-ss-dp" : "1128",
  160. "x-tt-trace-id" : trace_id,
  161. "cookie" : cookie
  162. }
  163. proxy = Proxy.get()
  164. proxies = {
  165. "http": "http://" + proxy,
  166. "https": "http://" + proxy
  167. }
  168. try:
  169. response = requests.get(
  170. url,
  171. headers=headers,
  172. proxies=proxies,
  173. timeout=8
  174. )
  175. if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
  176. response_json = response.json()
  177. if (response_json.get('data') is not None):
  178. print(
  179. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  180. + ' 数据获取成功!'
  181. + '\n'
  182. + room_id + 'trace_id:' + trace_id
  183. + '\n'
  184. )
  185. break
  186. else:
  187. print(
  188. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  189. + ' 数据获取失败!'
  190. + '\n'
  191. + room_id + 'trace_id:' + trace_id
  192. + '\n'
  193. + response.text
  194. + Proxy.proxy_info
  195. )
  196. else:
  197. print(
  198. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  199. + ' 爬取http连接失败!'
  200. + str(response.status_code)
  201. + '\n'
  202. + Proxy.proxy_info
  203. + '\n'
  204. + room_id + 'trace_id:' + trace_id
  205. + '\n'
  206. + '爬取结果:' + str(response)
  207. + '\n'
  208. )
  209. time.sleep(1)
  210. except requests.exceptions.ProxyError as e:
  211. print(
  212. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  213. + ' 代理过期!'
  214. + str(e)
  215. + '\n'
  216. + room_id + 'trace_id:' + trace_id
  217. + '\n'
  218. + Proxy.proxy_info
  219. )
  220. Proxy.del_proxy(proxy)
  221. pass
  222. except requests.exceptions.ConnectTimeout as e:
  223. print(
  224. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  225. + ' ConnectTimeout!'
  226. + str(e)
  227. + '\n'
  228. + room_id + 'trace_id:' + trace_id
  229. + '\n'
  230. + Proxy.proxy_info
  231. )
  232. Proxy.del_proxy(proxy)
  233. pass
  234. except Exception as e:
  235. print(
  236. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  237. + ' 请求抛出异常!'
  238. + str(e)
  239. + '\n'
  240. + room_id + 'trace_id:' + trace_id
  241. + '\n'
  242. + Proxy.proxy_info
  243. )
  244. pass
  245. return response_json
  246. def scrape():
  247. rds_list = RdsRoomLotteryRequestList()
  248. while True:
  249. room_info = rds.get_request_param()
  250. if room_info is None:
  251. time.sleep(1)
  252. continue
  253. # return None
  254. room_id = str(room_info)
  255. print(
  256. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  257. + ' '
  258. + str(room_id)
  259. )
  260. try:
  261. response_json = get_live_lottery_data(room_id)
  262. if response_json is None:
  263. # rds_list.record_score(0)
  264. # rds_list.push_request_id(room_id)
  265. print(
  266. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  267. + ' 数据获取失败!响应数据为空!'
  268. + '\n'
  269. + room_id
  270. + '\n'
  271. )
  272. # sys.exit(0)
  273. else:
  274. data = json.dumps({
  275. "data": response_json.get('data'),
  276. "extra": {
  277. 'room_id': room_id
  278. }
  279. })
  280. # rds_list.record_score(1)
  281. rds_list.push_data_list(data)
  282. except Exception as e:
  283. # rds_list.record_score(0)
  284. rds_list.push_request_id(room_id)
  285. print(
  286. time.strftime("%H:%M:%S", time.localtime())
  287. + ' '
  288. + room_id
  289. + '数据异常:'
  290. + str(e)
  291. )
  292. # sys.exit(0)
  293. if __name__ == "__main__":
  294. print("主方法开始执行")
  295. # 并行线程数
  296. threading_count = int(sys.argv[1])
  297. num = int(sys.argv[2])
  298. rds = RdsRoomLotteryRequestList()
  299. print(
  300. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  301. + ' '
  302. + ' 开始执行,更新直播队列长度:'
  303. + str(rds.get_len())
  304. )
  305. while True:
  306. sys.stdout.flush()
  307. # 减去主线程
  308. active_count = threading.active_count() - 1
  309. increment = threading_count - active_count
  310. if increment > 0:
  311. sys.stdout.flush()
  312. task = threading.Thread(target=scrape, args=())
  313. task.start() # 准备就绪, 等待cpu执行
  314. # increment = increment - 1
  315. # current_time = time.time()
  316. # if current_time - start_time > 3600:
  317. # print(
  318. # time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  319. # + ' 主方法执行终止'
  320. # )
  321. # sys.exit(0)
  322. time.sleep(1)