# 店播爬取Python脚本 (store-livestream scraping Python script)
# Original file name: douyin_comment_scraper.py

import hashlib
import json
import random
import sys
import time
import urllib
import urllib.parse
import uuid

import requests

from libs.aesgzip import tt_encrypt
from libs.proxy import Proxy
from log.print_log import PrintLog
from rds_model.db_redis import DbRedis
from xlog03 import *
  13. def get_mc():
  14. def a():
  15. seed = "1234567890ABCDEF"
  16. sa = []
  17. for i in range(2):
  18. sa.append(random.choice(seed))
  19. salt = ''.join(sa)
  20. return salt
  21. k = ''
  22. for i in range(6):
  23. k += a() + ':'
  24. return k[:-1]
  25. def get_random(i, random_type=1):
  26. if random_type == 1:
  27. return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
  28. elif random_type == 8:
  29. seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  30. sa = []
  31. for i in range(i):
  32. sa.append(random.choice(seed))
  33. salt = ''.join(sa)
  34. return salt
  35. else:
  36. seed = "1234567890abcde"
  37. sa = []
  38. for i in range(i):
  39. sa.append(random.choice(seed))
  40. salt = ''.join(sa)
  41. return salt
  42. V1 = '8'
  43. V2 = '4'
  44. V3 = '8'
  45. V4 = '4'
  46. channel = 'update'
  47. device_type = 'RKK-YZ00'
  48. device_brand = 'HUAWEI'
  49. #print(channel, device_type)
  50. class DouYinApi:
  51. USER_AGENT = f'com.ss.android.ugc.aweme/{V1}{V2}0 (Linux; U; Android 5.1.1; zh_CN; {device_type}; Build/LMY47V; Cronet/58.0.2991.0)'
  52. COMMON_DEVICE_PARAMS = {
  53. 'retry_type': 'no_retry',
  54. 'ac': '4g',
  55. 'channel': channel,
  56. 'aid': '1128',
  57. 'app_name': 'aweme',
  58. 'version_code': f'{V1}{V2}0',
  59. 'version_name': f'{V1}.{V2}.0',
  60. 'device_platform': 'android',
  61. 'ssmix': 'a',
  62. 'device_type': device_type,
  63. 'device_brand': device_brand,
  64. 'language': 'zh',
  65. 'os_api': '26',
  66. 'os_version': '8.0.0',
  67. 'manifest_version_code': f'{V1}{V2}0',
  68. 'resolution': '720*1280',
  69. 'dpi': '320',
  70. 'update_version_code': f'{V1}{V2}02',
  71. 'mcc_mnc': '46000'
  72. }
  73. PROXY = {}
  74. def __init__(self, sessionid, proxies):
  75. """
  76. :param cid: client id
  77. """
  78. self.proxies = proxies
  79. self.__cid = ''
  80. self.__device_id = ''
  81. self.__iid = ''
  82. self.__uuid = ''
  83. self.__openudid = ''
  84. self.__user_agent = ''
  85. self.__device_params = {}
  86. self.__cookie = {
  87. }
  88. def common_params_update(self):
  89. common_params = {
  90. 'click_reason': '0',
  91. 'retry_type': 'no_retry',
  92. 'ac': '4g',
  93. 'channel': channel,
  94. 'aid': '1128',
  95. 'app_name': 'aweme',
  96. 'version_code': f'{V3}{V4}0',
  97. 'version_name': f'{V3}.{V4}.0',
  98. 'device_platform': 'android',
  99. 'ssmix': 'a',
  100. 'device_type': device_type,
  101. 'device_brand': device_brand,
  102. 'language': 'zh',
  103. 'os_api': '26',
  104. 'os_version': '8.0.0',
  105. 'manifest_version_code': f'{V3}{V4}0',
  106. 'resolution': '720*1280',
  107. 'dpi': '320',
  108. 'update_version_code': f'{V3}{V4}02',
  109. 'mcc_mnc': '46000'
  110. }
  111. self.__device_params.update(common_params)
  112. def init_device_ids(self, device_id, iid, udid, openudid, cc=None):
  113. self.__device_id = device_id
  114. self.__iid = iid
  115. self.__uuid = udid
  116. self.__openudid = openudid
  117. self.__user_agent = f'com.ss.android.ugc.aweme/{V1}{V2}0 (Linux; U; Android 5.1.1; zh_CN; {device_type}; Build/LMY47V; Cronet/58.0.2991.0)'
  118. device_ids = {
  119. 'uuid': udid,
  120. 'openudid': openudid,
  121. 'version_code': f'{V1}{V2}0',
  122. 'version_name': f'{V1}.{V2}.0',
  123. 'device_type': device_type,
  124. 'device_brand': device_brand,
  125. 'manifest_version_code': f'{V1}{V2}0',
  126. 'update_version_code': f'{V1}{V2}02'
  127. }
  128. if device_id and iid:
  129. device_ids.update({
  130. 'device_id': device_id,
  131. 'iid': iid,
  132. })
  133. self.__device_params = self.COMMON_DEVICE_PARAMS.copy()
  134. self.__device_params.update(device_ids)
  135. if cc:
  136. self.__cookie.update(cc)
  137. def __get_encrypted_device_info(self, device_id, openudid, udid, clientudid, serial_number, mac, iid):
  138. register_info = {
  139. "magic_tag": "ss_app_log",
  140. "header": {
  141. "display_name": "抖音短视频",
  142. "update_version_code": int(self.COMMON_DEVICE_PARAMS['update_version_code']),
  143. "manifest_version_code": int(self.COMMON_DEVICE_PARAMS['manifest_version_code']),
  144. "aid": 1128,
  145. "channel": self.COMMON_DEVICE_PARAMS['channel'],
  146. "package": "com.ss.android.ugc.aweme",
  147. "app_version": self.COMMON_DEVICE_PARAMS['version_name'],
  148. "version_code": int(self.COMMON_DEVICE_PARAMS['version_code']),
  149. "sdk_version": "2.7.5.8",
  150. "os": "Android",
  151. "os_version": self.COMMON_DEVICE_PARAMS['os_version'],
  152. "os_api": self.COMMON_DEVICE_PARAMS['os_api'],
  153. "device_model": self.COMMON_DEVICE_PARAMS['device_type'],
  154. "device_brand": self.COMMON_DEVICE_PARAMS['device_brand'],
  155. "device_manufacturer": self.COMMON_DEVICE_PARAMS['device_brand'],
  156. "cpu_abi": "armeabi-v7a",
  157. "build_serial": serial_number,
  158. "release_build": "2132ca7_20190320",
  159. "density_dpi": self.COMMON_DEVICE_PARAMS['dpi'],
  160. "display_density": "xhdpi",
  161. "resolution": "1280x720",
  162. "language": "zh",
  163. "mc": mac,
  164. "timezone": 8,
  165. "access": "4G",
  166. "not_request_sender": 0,
  167. "rom": "MIUI-9.11.7",
  168. "rom_version": "miui_V11_9.11.7",
  169. "openudid": str(openudid),
  170. "udid": str(udid),
  171. "clientudid": str(clientudid),
  172. "serial_number": str(serial_number),
  173. "sim_serial_number": [
  174. ],
  175. "region": "CN",
  176. "tz_name": "Asia/Shanghai",
  177. "tz_offset": 28800
  178. },
  179. "_gen_time": str(round(time.time() * 1000))
  180. }
  181. if device_id:
  182. register_info['header']['device_id'] = str(device_id)
  183. if iid:
  184. register_info['header']['iid'] = str(iid)
  185. register_info['header']['push_sdk'] = '[1, 2, 6, 7, 8, 9]'
  186. return tt_encrypt((json.dumps(register_info)))
  187. def register_device(self):
  188. try:
  189. self.common_params_update()
  190. udid = '8604' + get_random(11)
  191. serial_number = str(uuid.uuid4())[-12:]
  192. openudid = '3b22' + str(uuid.uuid4())[-12:]
  193. clientudid = str(uuid.uuid4())
  194. mc = get_mc()
  195. params = {
  196. 'uuid': udid,
  197. 'openudid': openudid,
  198. '_rticket': str(int(round(time.time() * 1000)))
  199. }
  200. params.update(self.COMMON_DEVICE_PARAMS)
  201. device_register_url = 'https://log.snssdk.com/service/2/device_register/?' + urllib.parse.urlencode(params)
  202. headers = {
  203. 'User-Agent': DouYinApi.USER_AGENT
  204. }
  205. d = self.__get_encrypted_device_info(None, openudid, udid, clientudid, serial_number, mc, iid=None)
  206. if self.proxies:
  207. resp = requests.post(device_register_url,
  208. data=d, proxies=self.proxies,
  209. headers=headers, verify=False,timeout=10)
  210. else:
  211. resp = requests.post(device_register_url,
  212. data=d,
  213. headers=headers, verify=False,timeout=10)
  214. cookie = resp.cookies.get_dict()
  215. if len(cookie) != 0:
  216. self.__cookie.update(cookie)
  217. resp = resp.json()
  218. ids = {
  219. 'new_user': resp['new_user'],
  220. 'device_id': str(resp['device_id']),
  221. 'iid': str(resp['install_id']),
  222. 'uuid': udid,
  223. 'openudid': openudid,
  224. 'serial_number': serial_number,
  225. 'clientudid': clientudid,
  226. 'mc': mc,
  227. 'cookie': urllib.parse.urlencode(self.__cookie)
  228. }
  229. return ids
  230. except Exception as e:
  231. print(e)
  232. return None
  233. def __add_other_params(self, douyin_url, params=None):
  234. if params is None:
  235. params = {}
  236. if not douyin_url.__contains__('?'):
  237. douyin_url = douyin_url + '?'
  238. common_params = urllib.parse.urlencode(self.__device_params)
  239. if douyin_url.endswith('?') or douyin_url.endswith('&'):
  240. douyin_url = douyin_url + common_params
  241. else:
  242. douyin_url = douyin_url + '&' + common_params
  243. if len(params) > 0:
  244. douyin_url = douyin_url + '&' + urllib.parse.urlencode(params)
  245. douyin_url = douyin_url + "&_rticket=" + str(int(round(time.time() * 1000))) + "&ts=" + str(int(time.time()))
  246. return douyin_url
  247. def get_video_comment_list(self, aweme_id, cursor, count):
  248. self.common_params_update()
  249. params = {
  250. 'aweme_id': aweme_id,
  251. 'cursor': str(cursor),
  252. 'count': str(count)
  253. }
  254. douyin_url = 'https://aweme.snssdk.com/aweme/v2/comment/list/'
  255. return self.__http_get(douyin_url, params)
  256. def __http_get(self, url, query_params=None):
  257. if query_params is None:
  258. query_params = {}
  259. url = self.__add_other_params(url, query_params)
  260. sign = self.__get_sign(url)
  261. headers = self.__get_headers(sign)
  262. if self.proxies:
  263. resp = requests.get(url, headers=headers, cookies=self.__cookie, proxies=self.proxies,
  264. verify=False, timeout=8)
  265. else:
  266. resp = requests.get(url, headers=headers, cookies=self.__cookie, verify=False)
  267. cookie = resp.cookies.get_dict()
  268. if len(cookie) != 0:
  269. self.__cookie.update(cookie)
  270. return resp.json()
  271. def __get_sign(self, url, form_params=None):
  272. stub = ''
  273. if form_params:
  274. a = urllib.parse.urlencode(form_params)
  275. stub = hashlib.md5(a.encode('utf-8')).hexdigest()
  276. ts = int(time.time())
  277. ppp = url[url.index('?') + 1:]
  278. s = getXGon(ppp, stub, urllib.parse.urlencode(self.__cookie))
  279. gorgon = xGorgon(ts, strToByte(s))
  280. sign = {
  281. 'X-Khronos': str(ts),
  282. 'X-Gorgon': gorgon,
  283. 'X-Pods': ''
  284. }
  285. if stub:
  286. sign.update({
  287. 'X-SS-STUB': stub.upper()
  288. })
  289. return sign
  290. def __get_headers(self, sign=None):
  291. if sign is None:
  292. sign = {}
  293. headers = {
  294. 'User-Agent': self.__user_agent,
  295. 'X-SS-REQ-TICKET': str(round(time.time() * 1000)),
  296. }
  297. headers.update(sign)
  298. return headers
  299. if __name__ == '__main__':
  300. import warnings
  301. warnings.filterwarnings("ignore")
  302. start_time = time.time()
  303. while True:
  304. server_time = int(time.time())
  305. current_time = time.time()
  306. if current_time - start_time > 300:
  307. print(
  308. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  309. + ' 主方法执行终止'
  310. )
  311. sys.exit(0)
  312. # key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221C'
  313. # rdid = DbRedis.douyin_connect().rpop(key)
  314. # if rdid:
  315. # result = rdid.split('@@@')
  316. # if int(result[5])+360>server_time:
  317. # DbRedis.douyin_connect().lpush(key,rdid)
  318. # else:
  319. # continue
  320. # print(result)
  321. # else:
  322. # time.sleep(2)
  323. # continue
  324. proxy = Proxy.get()
  325. proxies = {
  326. "http": "http://" + proxy,
  327. "https": "http://" + proxy
  328. }
  329. douApi = DouYinApi('', proxies)
  330. result = douApi.register_device()
  331. if result is None:
  332. time.sleep(1)
  333. continue
  334. errorn = 0
  335. for index in range(100):
  336. try:
  337. ukey = 'BrandLiveData.DouyinUserVideoComment'
  338. aweme_u = DbRedis.douyin_connect().rpop(ukey)
  339. if aweme_u is None:
  340. time.sleep(10)
  341. continue
  342. awemes = aweme_u.split('@@@')
  343. aweme_id, uid, re_times = awemes[0], awemes[1], int(awemes[2])
  344. # device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], \
  345. # result[2], result[4], result[8], result[9], result[10], result[11]
  346. device_id, iid, udid, openudid, cookie = result['device_id'], result['iid'], result['uuid'], result['openudid'], result['cookie']
  347. # douApi.init_device_ids(device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand)
  348. douApi.init_device_ids(device_id, iid, udid, openudid)
  349. comment = douApi.get_video_comment_list(aweme_id, 0, 20)
  350. comments = comment.get('comments')
  351. stime = time.strftime("%H:%M:%S", time.localtime())
  352. if comments:
  353. PrintLog.print(stime+" 成功"+str(index)+' '+ str(uid)+' '+ str(aweme_id))
  354. print(stime+" 成功"+str(index))
  355. dkey = 'BrandLiveData.DouyinUserVideoCommentResponsePython'
  356. data = str(uid) + '@@@' + json.dumps(comment)
  357. DbRedis.douyin_connect().lpush(dkey, data)
  358. else:
  359. PrintLog.print(stime+" 失败"+str(index)+' '+ str(uid)+' '+ str(aweme_id))
  360. print(stime+" 失败"+str(index))
  361. errorn = errorn + 1
  362. if re_times<1:
  363. re_times = re_times + 1
  364. aweme_u_new = str(aweme_id) + '@@@' + str(uid) + '@@@' + str(re_times)
  365. DbRedis.douyin_connect().lpush(ukey, aweme_u_new)
  366. if errorn>2:
  367. break
  368. except Exception as e:
  369. print(e)
  370. break