# 店播爬取 Python 脚本 — Douyin store-livestream shop-promotion scraper
# douyin_user_shop_promotion_on.py
import json
import random
import sys
import threading
import time
import urllib
import urllib.parse  # explicit: urllib.parse is used below and `import urllib` alone does not guarantee it

import requests

from libs.Xg04 import X_Gorgon
from libs.mysql_user_living import *
from libs.proxy import Proxy
from rds_model.db_redis import DbRedis
from rds_model.rds_user_shop_promotion_list import RdsUserShopPromotionList
# Wall-clock time captured at import; the __main__ loop exits roughly
# one hour (3600 s) after this point.
start_time = time.time()
  14. def get_random(i, random_type=1):
  15. if random_type == 1:
  16. return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
  17. elif random_type == 8:
  18. seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  19. sa = []
  20. for i in range(i):
  21. sa.append(random.choice(seed))
  22. salt = ''.join(sa)
  23. return salt
  24. else:
  25. seed = "1234567890abcde"
  26. sa = []
  27. for i in range(i):
  28. sa.append(random.choice(seed))
  29. salt = ''.join(sa)
  30. return salt
  31. def get_random_brand_type():
  32. brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
  33. return brand_type
  34. def get_mc():
  35. def a():
  36. seed = "1234567890ABCDEF"
  37. sa = []
  38. for i in range(2):
  39. sa.append(random.choice(seed))
  40. salt = ''.join(sa)
  41. return salt
  42. k = ''
  43. for i in range(6):
  44. k += a() + ':'
  45. return k[:-1]
  46. def get_trace():
  47. trace_list = [
  48. '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
  49. '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
  50. '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
  51. '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
  52. '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
  53. '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
  54. '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
  55. '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
  56. '00-63eb56f009b7fcc0ae600208011d0468-63eb56f009b7fcc0-01'
  57. ]
  58. return random.choice(trace_list)
  59. def get_user_shop_promotion_on_shelf_data(user_id, sec_user_id, result):
  60. domain = 'api3-normal-c-lq.amemv.com'
  61. # domain = 'aweme.snssdk.com'
  62. url = 'https://' + domain + '/aweme/v1/promotion/user/promotion/list/?'
  63. rticket = str(int(time.time() * 1000))
  64. mc = get_mc
  65. udid = '8604' + get_random(11)
  66. trace_id = get_trace()
  67. # openudid = '3b22' + str(udid.uuid4())[-12:]
  68. device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
  69. query = {
  70. "user_id" : "2066709087257544",
  71. #"user_id" : str(user_id),
  72. "sec_user_id" : "MS4wLjABAAAASJ-Q7Mjt6chYEEju720IdOKArNsreGnDi8ADd3E64SsbAeqoXRQex9mCDG7u5DaY",
  73. #"sec_user_id" : str(sec_user_id),
  74. "cursor" : "0",
  75. "count" : "1000",
  76. "column_id" : "0",
  77. "goods_type" : "0",
  78. "shop_version" : "1",
  79. "os_api" : "23",
  80. "device_type" : "HUAWEI+MLA-AL10",
  81. # "device_type" : device_type,
  82. "ssmix" : "a",
  83. "manifest_version_code" : "130001",
  84. "dpi" : "480",
  85. "app_name" : "aweme",
  86. "version_name" : "13.0.0",
  87. "ts" : "1633763939",
  88. # "ts" : int(time.time()),
  89. "cpu_support64" : "true",
  90. "storage_type" : "0",
  91. "app_type" : "normal",
  92. "appTheme" : "dark",
  93. "ac" : "wifi",
  94. "host_abi" : "armeabi-v7a",
  95. "update_version_code" : "13009900",
  96. "channel" : "tengxun_new",
  97. "_rticket" : "1633763939688",
  98. # "_rticket" : rticket,
  99. "device_platform" : "android",
  100. "iid" : "1961967166955757",
  101. # "iid" : str(iid),
  102. "version_code" : "130000",
  103. # "mac_address" : mc,
  104. "mac_address" : "50%3A01%3AD9%3A21%3AED%3AC2",
  105. "cdid" : "ab363b15-99db-4ef5-bfb1-834a8e564105",
  106. "openudid" : "291f3ce2efe59345",
  107. # "openudid" : str(openudid),
  108. "device_id" : "49388718822",
  109. # "device_id" : str(device_id),
  110. "resolution" : "1080*1800",
  111. "os_version" : "6.0",
  112. "language" : "zh",
  113. # "device_brand":device_brand,
  114. "device_brand" : "HUAWEI",
  115. "aid" : "1128"
  116. }
  117. query_params = urllib.parse.urlencode(query)
  118. url = url + query_params
  119. body = ''
  120. xGorgon = X_Gorgon(query_params, body)
  121. # userAgent = f'aweme.snssdk.com/130001 (Linux; U; Android 6.0; zh_CN; {device_type}; Build/{device_type}; Cronet/TTNetVersion:414feb46 2020-09-08 QuicVersion:7aee791b 2020-06-05)'
  122. userAgent = 'com.ss.android.ugc.aweme/130001 (Linux; U; Android 6.0; zh_CN; HUAWEI+MLA-AL10; Build/HUAWEI+MLA-AL10; Cronet/TTNetVersion:414feb46 2020-09-08 QuicVersion:7aee791b 2020-06-05)'
  123. headers = {
  124. 'Host': domain,
  125. 'Connection': 'keep-alive',
  126. 'passport-sdk-version' : '18',
  127. 'sdk-version' : '2',
  128. 'X-SS-DP' : '1128',
  129. # 'x-tt-trace-id' : trace_id,
  130. # 'Upgrade-Insecure-Requests': '1',
  131. 'User-Agent': userAgent,
  132. # 'accept-encoding': 'gzip, deflate',
  133. # "x-SS-REQ-TICKET": rticket,
  134. "x-Gorgon": xGorgon.get('X-Gorgon'),
  135. "x-Khronos": xGorgon.get('X-Khronos'),
  136. # "cookie" : cookie,
  137. "accept": "application/json, text/plain, */*",
  138. }
  139. retry = 0
  140. response_json = None
  141. while True:
  142. if retry > 5:
  143. break
  144. retry += 1
  145. proxy = Proxy.dailiyun_get()
  146. proxies = {
  147. "http": "http://" + proxy,
  148. "https": "http://" + proxy
  149. }
  150. try:
  151. response = requests.get(
  152. url,
  153. headers=headers,
  154. proxies=proxies,
  155. timeout=8
  156. )
  157. if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
  158. response_json = response.json()
  159. if (response_json.get('promotions') is not None):
  160. print(
  161. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  162. + ' 数据获取成功!'
  163. + '\n'
  164. + user_id
  165. + '\n'
  166. )
  167. break
  168. else:
  169. print(
  170. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  171. + ' 数据获取失败!'
  172. + '\n'
  173. + user_id
  174. + '\n'
  175. + response.text
  176. + Proxy.proxy_info
  177. + '\n'
  178. )
  179. else:
  180. print(
  181. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  182. + ' 爬取http连接失败!'
  183. #+ str(response.status_code)
  184. + str(url)
  185. + '\n'
  186. + str(headers)
  187. + '\n'
  188. + Proxy.proxy_info
  189. + '\n'
  190. + user_id
  191. + '\n'
  192. + '爬取结果:' + str(response)
  193. + '\n'
  194. )
  195. time.sleep(1)
  196. except requests.exceptions.ProxyError as e:
  197. print(
  198. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  199. + ' 代理过期!'
  200. + str(e)
  201. + '\n'
  202. + user_id
  203. + '\n'
  204. + Proxy.proxy_info
  205. + '\n'
  206. )
  207. Proxy.dailiyun_del_proxy(proxy)
  208. pass
  209. except requests.exceptions.ConnectTimeout as e:
  210. print(
  211. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  212. + ' ConnectTimeout!'
  213. + str(e)
  214. + '\n'
  215. + user_id
  216. + '\n'
  217. + Proxy.proxy_info
  218. + '\n'
  219. )
  220. Proxy.dailiyun_del_proxy(proxy)
  221. pass
  222. except Exception as e:
  223. print(
  224. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  225. + ' 请求抛出异常!'
  226. + str(e)
  227. + '\n'
  228. + user_id
  229. + '\n'
  230. + Proxy.proxy_info
  231. + '\n'
  232. )
  233. pass
  234. return response_json
  235. def scrape():
  236. rds_list = RdsUserShopPromotionList()
  237. room_info = rds.get_on_shelf_request_param()
  238. if room_info is None:
  239. return None
  240. # room_info = json.loads(room_info)
  241. room_info = room_info.split('@')
  242. room_id = room_info[0]
  243. user_id = room_info[1]
  244. room_id = str(room_id)
  245. user_id = str(user_id)
  246. # 根据用户id获取sec_uid
  247. user_info = MysqlUserLiving().get_user_info(user_id)
  248. sec_user_id, = user_info
  249. print(
  250. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  251. + ' user_id:' + str(user_id) + ' sec_uid:' + str(sec_user_id)
  252. )
  253. key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
  254. rdid = DbRedis.connect().rpop(key)
  255. if rdid:
  256. result = rdid.split('@@@')
  257. else:
  258. return None
  259. DbRedis.connect().lpush(key, rdid)
  260. try:
  261. response_json = get_user_shop_promotion_on_shelf_data(user_id=user_id, sec_user_id=sec_user_id, result=result)
  262. if response_json is None:
  263. # rds_list.record_score(0)
  264. # rds_list.push_request_id(room_id)
  265. print(
  266. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  267. + ' 数据获取失败!响应数据为空!'
  268. + '\n'
  269. + user_id
  270. + '\n'
  271. )
  272. sys.exit(0)
  273. data = json.dumps({
  274. "data": response_json.get('promotions'),
  275. "extra": {
  276. 'room_id' : room_id,
  277. 'user_id' : user_id,
  278. }
  279. })
  280. # rds_list.record_score(1)
  281. rds_list.push_on_shelf_data_list(data)
  282. except Exception as e:
  283. # rds_list.record_score(0)
  284. rds_list.push_request_id_on_shelf(room_id + '@' + user_id)
  285. print(
  286. time.strftime("%H:%M:%S", time.localtime())
  287. + ' '
  288. + user_id
  289. + '数据异常:'
  290. + str(e)
  291. )
  292. sys.exit(0)
if __name__ == "__main__":
    print("主方法开始执行")
    # Number of parallel worker threads (argv[1]).
    threading_count = int(sys.argv[1])
    # argv[2] is parsed but not referenced anywhere else in this script.
    num = int(sys.argv[2])
    rds = RdsUserShopPromotionList()
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' 开始执行,更新直播队列长度:'
        + str(rds.get_on_shelf_len())
    )
    while True:
        sys.stdout.flush()
        # Subtract the main thread itself from the live-thread count.
        active_count = threading.active_count() - 1
        # Top up the worker pool to `threading_count` threads.
        increment = threading_count - active_count
        while increment > 0:
            sys.stdout.flush()
            # scrape()
            task = threading.Thread(target=scrape, args=())
            task.start()  # ready to run; waits for the CPU to schedule it
            increment = increment - 1
        current_time = time.time()
        # Terminate roughly one hour after start_time was captured at import.
        if current_time - start_time > 3600:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 主方法执行终止'
            )
            sys.exit(0)
        time.sleep(1)