Store-livestream scraping Python script

douyin_hourly_ranklist_scraper.py

from rds_model.rds_hourly_rank_list import RdsDouyinHourlyRankList
import time
import json
import sys
import threading
import random
import urllib.parse
import requests
from rds_model.db_redis import DbRedis
from log.print_log import PrintLog
from libs.Xg04 import X_Gorgon
from libs.proxy import Proxy
from libs.mysql_user_living import *

start_time = time.time()
def get_random(i, random_type=1):
    # random_type 1: an i-digit number; 8: i uppercase letters; otherwise i characters from a hex-like seed
    if random_type == 1:
        return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
    elif random_type == 8:
        seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        sa = []
        for _ in range(i):
            sa.append(random.choice(seed))
        salt = ''.join(sa)
        return salt
    else:
        seed = "1234567890abcde"
        sa = []
        for _ in range(i):
            sa.append(random.choice(seed))
        salt = ''.join(sa)
        return salt


def get_random_brand_type():
    brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
    return brand_type


def get_mc():
    # Build a random MAC-address-like string, e.g. "1A:2B:3C:4D:5E:6F"
    def a():
        seed = "1234567890ABCDEF"
        sa = []
        for i in range(2):
            sa.append(random.choice(seed))
        salt = ''.join(sa)
        return salt
    k = ''
    for i in range(6):
        k += a() + ':'
    return k[:-1]
def get_whole_station_rank_data(room_id, sec_anchor_id, anchor_id):
    # Whole-station hourly rank list (rank_type 12)
    domain = 'webcast5-normal-c-lq.amemv.com'
    url = 'https://' + domain + '/webcast/ranklist/hour/?'
    rticket = str(int(time.time() * 1000))
    ts = int(time.time())
    mc = get_mc()
    udid = '8604' + get_random(11)
    query = {
        "manifest_version_code": "110001",
        "dpi": "480",
        "app_name": "aweme",
        "version_name": "11.0.0",
        "ts": ts,
        "cpu_support64": "true",
        "app_type": "normal",
        "ac": "wifi",
        "host_abi": "armeabi-v7a",
        "channel": "wandoujia_aweme_feisuo",
        "device_platform": "android",
        "iid": "3932281687270606",
        "version_code": "110000",
        "cdid": "1d06013c-ff0b-427b-b1ab-6700259c15c6",
        "openudid": "291f3ce2efe59345",
        "hour_info": "0",
        "room_id": room_id,
        "rank_type": "12",
        "sec_anchor_id": sec_anchor_id,
        "anchor_id": anchor_id,
        "webcast_sdk_version": "1510",
        "webcast_language": "zh",
        "webcast_locale": "zh_CN",
        "os_api": "23",
        "device_type": "HUAWEI MLA-AL10",
        "ssmix": "a",
        "update_version_code": "11009900",
        "device_id": "49388718822",
        "resolution": "1080*1800",
        "os_version": "6.0",
        "language": "zh",
        "device_brand": "HUAWEI",
        "_rticket": rticket,
        "aid": "1128"
    }
    query_params = urllib.parse.urlencode(query)
    url = url + query_params
    body = ''
    xGorgon = X_Gorgon(query_params, body)
    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
        random.randint(1, 10))
    headers = {
        'Host': domain,
        'Connection': 'keep-alive',
        'User-Agent': userAgent,
        'accept-encoding': 'gzip, deflate',
        "x-gorgon": xGorgon.get('X-Gorgon'),
        "x-khronos": xGorgon.get('X-Khronos'),
        'sdk-version': '2',
        'x-ss-dp': '1128',
        'x-tt-trace-id': '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
    }
    retry = 0
    response_json = None
    while True:
        if retry > 10:
            break
        retry += 1
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8
            )
            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
                response_json = response.json()
                if response_json.get('data') is not None:
                    break
                else:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Data fetch failed!'
                        + '\n'
                        + str(room_id)
                        + '\n'
                        + response.text
                        + Proxy.proxy_info
                    )
            else:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' HTTP request failed! '
                    + str(response.status_code)
                    + '\n'
                    + Proxy.proxy_info
                    + '\n'
                    + str(room_id)
                    + '\n'
                )
                time.sleep(1)
        except requests.exceptions.ProxyError as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Proxy expired! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' ConnectTimeout! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except Exception as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Request raised an exception! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
    return response_json
def get_commerce_rank_list_data(room_id, sec_anchor_id, anchor_id):
    # Commerce (e-commerce) hourly rank list (rank_type 31)
    domain = 'webcast5-normal-c-lq.amemv.com'
    url = 'https://' + domain + '/webcast/ranklist/hour/?'
    rticket = str(int(time.time() * 1000))
    mc = get_mc()
    udid = '8604' + get_random(11)
    # openudid = '3b22' + str(udid.uuid4())[-12:]
    ts = int(time.time())
    query = {
        "style": "3",
        "hour_info": "0",
        "room_id": room_id,
        "rank_type": "31",
        "sec_anchor_id": sec_anchor_id,
        "webcast_sdk_version": "1710",
        "webcast_language": "zh",
        "webcast_locale": "zh_CN",
        "webcast_gps_access": "2",
        "os_api": "23",
        "device_type": "HUAWEI+MLA-AL10",
        "ssmix": "a",
        "manifest_version_code": "130001",
        "dpi": "480",
        "app_name": "aweme",
        "version_name": "13.0.0",
        "ts": ts,
        "cpu_support64": "true",
        "storage_type": "0",
        "app_type": "normal",
        "appTheme": "dark",
        "ac": "wifi",
        "host_abi": "armeabi-v7a",
        "update_version_code": "13009900",
        "channel": "tengxun_new",
        "_rticket": rticket,
        "device_platform": "android",
        "iid": "2876750595379005",
        "version_code": "130000",
        "mac_address": mc,
        "cdid": "81542dc6-2aca-4ff6-ac58-d94179e9d3e6",
        "openudid": "291f3ce2efe59345",
        "device_id": "49388718822",
        "resolution": "1080*1800",
        "os_version": "6.0",
        "language": "zh",
        "device_brand": "HUAWEI",
        "aid": "1128"
    }
    query_params = urllib.parse.urlencode(query)
    url = url + query_params
    body = ''
    xGorgon = X_Gorgon(query_params, body)
    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
        random.randint(1, 10))
    headers = {
        'Host': domain,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'User-Agent': userAgent,
        'accept-encoding': 'gzip, deflate',
        "x-gorgon": xGorgon.get('X-Gorgon'),
        "x-khronos": xGorgon.get('X-Khronos'),
        'passport-sdk-version': '18',
        'sdk-version': '2',
        'x-ss-dp': '1128',
        'x-tt-trace-id': '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
    }
    retry = 0
    response_json = None
    while True:
        if retry > 10:
            break
        retry += 1
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8
            )
            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
                response_json = response.json()
                if response_json.get('data') is not None:
                    break
                else:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Data fetch failed!'
                        + '\n'
                        + str(room_id)
                        + '\n'
                        + response.text
                        + Proxy.proxy_info
                    )
            else:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' HTTP request failed! '
                    + str(response.status_code)
                    + '\n'
                    + Proxy.proxy_info
                    + '\n'
                    + str(room_id)
                    + '\n'
                )
                time.sleep(1)
        except requests.exceptions.ProxyError as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Proxy expired! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' ConnectTimeout! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except Exception as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Request raised an exception! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
    return response_json
def get_popularity_rank_list_data(room_id, sec_anchor_id, anchor_id):
    # Popularity hourly rank list
    domain = 'webcast5-normal-c-lq.amemv.com'
    url = 'https://' + domain + '/webcast/ranklist/hour/?'
    rticket = str(int(time.time() * 1000))
    mc = get_mc()
    udid = '8604' + get_random(11)
    ts = int(time.time())
    # openudid = '3b22' + str(udid.uuid4())[-12:]
    query = {
        "anchor_id": anchor_id,
        "room_id": room_id,
        "sec_anchor_id": sec_anchor_id,
        "sec_user_id": "null",
        "webcast_sdk_version": "2010",
        "webcast_language": "zh",
        "webcast_locale": "zh_CN",
        "webcast_gps_access": "2",
        "current_network_quality_info": "%7B%22http_rtt%22%3A110%2C%22tcp_rtt%22%3A90%2C%22quic_rtt%22%3A90%2C%22downstream_throughput_kbps%22%3A8185%2C%22video_download_speed%22%3A411%2C%22quic_receive_loss_rate%22%3A-1%2C%22quic_send_loss_rate%22%3A-1%2C%22net_effective_connection_type%22%3A5%7D",
        "os_api": "23",
        "device_type": "HUAWEI+MLA-AL10",
        "ssmix": "a",
        "manifest_version_code": "160001",
        "dpi": "480",
        "app_name": "aweme",
        "version_name": "16.0.0",
        "ts": ts,
        "cpu_support64": "true",
        "app_type": "normal",
        "appTheme": "dark",
        "ac": "wifi",
        "host_abi": "armeabi-v7a",
        "update_version_code": "16009900",
        "channel": "wandoujia_lesi_1128_0507",
        "_rticket": rticket,
        "device_platform": "android",
        "iid": "273107070769192",
        "version_code": "160000",
        "cdid": "09e904ed-66a9-4e89-9661-afae7f61e6c5",
        "openudid": "291f3ce2efe59345",
        "device_id": "49388718822",
        "resolution": "1080*1800",
        "os_version": "6.0",
        "language": "zh",
        "device_brand": "HUAWEI",
        "aid": "1128",
        "minor_status": "0"
    }
    query_params = urllib.parse.urlencode(query)
    url = url + query_params
    body = ''
    xGorgon = X_Gorgon(query_params, body)
    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
        random.randint(1, 10))
    headers = {
        'Host': domain,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': userAgent,
        'accept-encoding': 'gzip, deflate',
        "x-gorgon": xGorgon.get('X-Gorgon'),
        "x-khronos": xGorgon.get('X-Khronos'),
        'passport-sdk-version': '18',
        'sdk-version': '2',
        'x-ss-dp': '1128',
        'x-tt-trace-id': '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
    }
    retry = 0
    response_json = None
    while True:
        if retry > 10:
            break
        retry += 1
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8
            )
            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
                response_json = response.json()
                if response_json.get('data') is not None:
                    break
                else:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Data fetch failed!'
                        + '\n'
                        + str(room_id)
                        + '\n'
                        + response.text
                        + Proxy.proxy_info
                    )
            else:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' HTTP request failed! '
                    + str(response.status_code)
                    + '\n'
                    + Proxy.proxy_info
                    + '\n'
                    + str(room_id)
                    + '\n'
                )
                time.sleep(1)
        except requests.exceptions.ProxyError as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Proxy expired! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' ConnectTimeout! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
            Proxy.del_proxy(proxy)
        except Exception as e:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Request raised an exception! '
                + str(e)
                + '\n'
                + str(room_id)
                + '\n'
                + Proxy.proxy_info
            )
    return response_json
def scrape(room_id, sec_anchor_id, anchor_id):
    # Fetch all three hourly rank lists for one live room and push the raw responses into Redis
    rds_list = RdsDouyinHourlyRankList()
    time.sleep(0.1)
    try:
        # whole-station hourly rank list
        whole_station_response_json = get_whole_station_rank_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id)
        # commerce hourly rank list
        commerce_response_json = get_commerce_rank_list_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id)
        # popularity hourly rank list
        popularity_response_json = get_popularity_rank_list_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id)

        if whole_station_response_json is None:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Whole-station hourly rank fetch failed! Response is empty!'
            )
            sys.exit(0)
        else:
            data = json.dumps({
                "data": whole_station_response_json.get('data'),
                "extra": {}
            })
            rds_list.push_whole_station_data_list(data)

        if commerce_response_json is None:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Commerce hourly rank fetch failed! Response is empty!'
            )
            sys.exit(0)
        else:
            data = json.dumps({
                "data": commerce_response_json.get('data'),
                "extra": {}
            })
            rds_list.push_commerce_data_list(data)

        if popularity_response_json is None:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Popularity hourly rank fetch failed! Response is empty!'
            )
            sys.exit(0)
        else:
            data = json.dumps({
                "data": popularity_response_json.get('data'),
                "extra": {}
            })
            rds_list.push_popularity_data_list(data)
    except Exception as e:
        print(
            time.strftime("%H:%M:%S", time.localtime())
            + ' '
            + 'Data exception: '
            + str(e)
        )
        sys.exit(0)
if __name__ == "__main__":
    print("Main method starting")
    # number of parallel worker threads
    threading_count = int(sys.argv[1])
    rds = RdsDouyinHourlyRankList()
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' Starting; live update queue length: '
        + str(rds.get_len())
    )
    while True:
        sys.stdout.flush()
        # exclude the main thread
        active_count = threading.active_count() - 1
        increment = threading_count - active_count
        while increment > 0:
            sys.stdout.flush()
            room_info = MysqlUserLiving().get_living_info()
            room_id = False
            if room_info is None:
                time.sleep(60)
                continue
            room_id, anchor_id, sec_anchor_id = room_info
            task = threading.Thread(target=scrape, args=(room_id, sec_anchor_id, anchor_id,))
            task.start()  # ready; waits for the CPU to schedule it
            increment = increment - 1
        current_time = time.time()
        if current_time - start_time > 300:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Main method terminated'
            )
            sys.exit(0)
        time.sleep(0.01)
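
A minimal usage sketch, assuming the rds_model, log, and libs packages (and the Redis/MySQL/proxy configuration they rely on) are importable: the main block reads the parallel thread count from the first command-line argument, so a typical invocation would be (10 is an arbitrary example value):

    python douyin_hourly_ranklist_scraper.py 10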