店播爬取Python脚本

douyin_video_scraper.py 15KB

452 lines
  1. import requests
  2. import random
  3. import uuid
  4. import json
  5. import time
  6. import urllib
  7. import sys
  8. from libs.aesgzip import tt_encrypt
  9. from rds_model.db_redis import DbRedis
  10. from log.print_log import PrintLog
  11. from libs.proxy import Proxy
  12. from xlog03 import *
  13. def get_mc():
  14. def a():
  15. seed = "1234567890ABCDEF"
  16. sa = []
  17. for i in range(2):
  18. sa.append(random.choice(seed))
  19. salt = ''.join(sa)
  20. return salt
  21. k = ''
  22. for i in range(6):
  23. k += a() + ':'
  24. return k[:-1]
  25. def get_random(i, random_type=1):
  26. if random_type == 1:
  27. return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
  28. elif random_type == 8:
  29. seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  30. sa = []
  31. for i in range(i):
  32. sa.append(random.choice(seed))
  33. salt = ''.join(sa)
  34. return salt
  35. else:
  36. seed = "1234567890abcde"
  37. sa = []
  38. for i in range(i):
  39. sa.append(random.choice(seed))
  40. salt = ''.join(sa)
  41. return salt
  42. def get_random_brand_type():
  43. brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
  44. return brand_type
  45. V1 = '8'
  46. V2 = '4'
  47. V3 = '8'
  48. V4 = '4'
  49. channel = 'update'
  50. device_type = 'RKK-YZ00'
  51. device_brand = 'HUAWEI'
  52. class DouYinApi:
  53. USER_AGENT = f'com.ss.android.ugc.aweme/{V1}{V2}0 (Linux; U; Android 5.1.1; zh_CN; {device_type}; Build/LMY47V; Cronet/58.0.2991.0)'
  54. COMMON_DEVICE_PARAMS = {
  55. 'retry_type': 'no_retry',
  56. 'ac': '4g',
  57. 'channel': channel,
  58. 'aid': '1128',
  59. 'app_name': 'aweme',
  60. 'version_code': f'{V1}{V2}0',
  61. 'version_name': f'{V1}.{V2}.0',
  62. 'device_platform': 'android',
  63. 'ssmix': 'a',
  64. 'device_type': device_type,
  65. 'device_brand': device_brand,
  66. 'language': 'zh',
  67. 'os_api': '26',
  68. 'os_version': '8.0.0',
  69. 'manifest_version_code': f'{V1}{V2}0',
  70. 'resolution': '720*1280',
  71. 'dpi': '320',
  72. 'update_version_code': f'{V1}{V2}02',
  73. 'mcc_mnc': '46000'
  74. }
  75. PROXY = {}
  76. def __init__(self, sessionid, proxies):
  77. """
  78. :param cid: client id
  79. """
  80. self.proxies = proxies
  81. self.__cid = ''
  82. self.__device_id = ''
  83. self.__iid = ''
  84. self.__uuid = ''
  85. self.__openudid = ''
  86. self.__user_agent = ''
  87. self.__device_params = {}
  88. self.__cookie = {
  89. }
  90. def common_params_update(self):
  91. common_params = {
  92. 'click_reason': '0',
  93. 'retry_type': 'no_retry',
  94. 'ac': '4g',
  95. 'channel': channel,
  96. 'aid': '1128',
  97. 'app_name': 'aweme',
  98. 'version_code': f'{V3}{V4}0',
  99. 'version_name': f'{V3}.{V4}.0',
  100. 'device_platform': 'android',
  101. 'ssmix': 'a',
  102. 'device_type': device_type,
  103. 'device_brand': device_brand,
  104. 'language': 'zh',
  105. 'os_api': '26',
  106. 'os_version': '8.0.0',
  107. 'manifest_version_code': f'{V3}{V4}0',
  108. 'resolution': '720*1280',
  109. 'dpi': '320',
  110. 'update_version_code': f'{V3}{V4}02',
  111. 'mcc_mnc': '46000'
  112. }
  113. self.__device_params.update(common_params)
  114. def init_device_ids(self, device_id, iid, udid, openudid, cc=None):
  115. self.__device_id = device_id
  116. self.__iid = iid
  117. self.__uuid = udid
  118. self.__openudid = openudid
  119. self.__user_agent = f'com.ss.android.ugc.aweme/{V1}{V2}0 (Linux; U; Android 5.1.1; zh_CN; {device_type}; Build/LMY47V; Cronet/58.0.2991.0)'
  120. device_ids = {
  121. 'uuid': udid,
  122. 'openudid': openudid,
  123. 'version_code': f'{V1}{V2}0',
  124. 'version_name': f'{V1}.{V2}.0',
  125. 'device_type': device_type,
  126. 'device_brand': device_brand,
  127. 'manifest_version_code': f'{V1}{V2}0',
  128. 'update_version_code': f'{V1}{V2}02'
  129. }
  130. if device_id and iid:
  131. device_ids.update({
  132. 'device_id': device_id,
  133. 'iid': iid,
  134. })
  135. self.__device_params = self.COMMON_DEVICE_PARAMS.copy()
  136. self.__device_params.update(device_ids)
  137. if cc:
  138. self.__cookie.update(cc)
  139. def __get_encrypted_device_info(self, device_id, openudid, udid, clientudid, serial_number, mac, iid):
  140. register_info = {
  141. "magic_tag": "ss_app_log",
  142. "header": {
  143. "display_name": "抖音短视频",
  144. "update_version_code": int(self.COMMON_DEVICE_PARAMS['update_version_code']),
  145. "manifest_version_code": int(self.COMMON_DEVICE_PARAMS['manifest_version_code']),
  146. "aid": 1128,
  147. "channel": self.COMMON_DEVICE_PARAMS['channel'],
  148. "package": "com.ss.android.ugc.aweme",
  149. "app_version": self.COMMON_DEVICE_PARAMS['version_name'],
  150. "version_code": int(self.COMMON_DEVICE_PARAMS['version_code']),
  151. "sdk_version": "2.7.5.8",
  152. "os": "Android",
  153. "os_version": self.COMMON_DEVICE_PARAMS['os_version'],
  154. "os_api": self.COMMON_DEVICE_PARAMS['os_api'],
  155. "device_model": self.COMMON_DEVICE_PARAMS['device_type'],
  156. "device_brand": self.COMMON_DEVICE_PARAMS['device_brand'],
  157. "device_manufacturer": self.COMMON_DEVICE_PARAMS['device_brand'],
  158. "cpu_abi": "armeabi-v7a",
  159. "build_serial": serial_number,
  160. "release_build": "2132ca7_20190320",
  161. "density_dpi": self.COMMON_DEVICE_PARAMS['dpi'],
  162. "display_density": "xhdpi",
  163. "resolution": "1280x720",
  164. "language": "zh",
  165. "mc": mac,
  166. "timezone": 8,
  167. "access": "4G",
  168. "not_request_sender": 0,
  169. "rom": "MIUI-9.11.7",
  170. "rom_version": "miui_V11_9.11.7",
  171. "openudid": str(openudid),
  172. "udid": str(udid),
  173. "clientudid": str(clientudid),
  174. "serial_number": str(serial_number),
  175. "sim_serial_number": [
  176. ],
  177. "region": "CN",
  178. "tz_name": "Asia/Shanghai",
  179. "tz_offset": 28800
  180. },
  181. "_gen_time": str(round(time.time() * 1000))
  182. }
  183. if device_id:
  184. register_info['header']['device_id'] = str(device_id)
  185. if iid:
  186. register_info['header']['iid'] = str(iid)
  187. register_info['header']['push_sdk'] = '[1, 2, 6, 7, 8, 9]'
  188. return tt_encrypt((json.dumps(register_info)))
  189. def register_device(self):
  190. try:
  191. self.common_params_update()
  192. udid = '8604' + get_random(11)
  193. serial_number = str(uuid.uuid4())[-12:]
  194. openudid = '3b22' + str(uuid.uuid4())[-12:]
  195. clientudid = str(uuid.uuid4())
  196. mc = get_mc()
  197. params = {
  198. 'uuid': udid,
  199. 'openudid': openudid,
  200. '_rticket': str(int(round(time.time() * 1000)))
  201. }
  202. params.update(self.COMMON_DEVICE_PARAMS)
  203. device_register_url = 'https://log.snssdk.com/service/2/device_register/?' + urllib.parse.urlencode(params)
  204. headers = {
  205. 'User-Agent': DouYinApi.USER_AGENT
  206. }
  207. d = self.__get_encrypted_device_info(None, openudid, udid, clientudid, serial_number, mc, iid=None)
  208. if self.proxies:
  209. resp = requests.post(device_register_url,
  210. data=d, proxies=self.proxies,
  211. headers=headers, verify=False,timeout=10)
  212. else:
  213. resp = requests.post(device_register_url,
  214. data=d,
  215. headers=headers, verify=False,timeout=10)
  216. cookie = resp.cookies.get_dict()
  217. if len(cookie) != 0:
  218. self.__cookie.update(cookie)
  219. if resp is None:
  220. return None
  221. resp = resp.json()
  222. ids = {
  223. 'new_user': resp['new_user'],
  224. 'device_id': str(resp['device_id']),
  225. 'iid': str(resp['install_id']),
  226. 'uuid': udid,
  227. 'openudid': openudid,
  228. 'serial_number': serial_number,
  229. 'clientudid': clientudid,
  230. 'mc': mc,
  231. 'cookie': urllib.parse.urlencode(self.__cookie)
  232. }
  233. return ids
  234. except Exception as e:
  235. print(e)
  236. return None
  237. def __add_other_params(self, douyin_url, params=None):
  238. if params is None:
  239. params = {}
  240. if not douyin_url.__contains__('?'):
  241. douyin_url = douyin_url + '?'
  242. common_params = urllib.parse.urlencode(self.__device_params)
  243. if douyin_url.endswith('?') or douyin_url.endswith('&'):
  244. douyin_url = douyin_url + common_params
  245. else:
  246. douyin_url = douyin_url + '&' + common_params
  247. if len(params) > 0:
  248. douyin_url = douyin_url + '&' + urllib.parse.urlencode(params)
  249. douyin_url = douyin_url + "&_rticket=" + str(int(round(time.time() * 1000))) + "&ts=" + str(int(time.time()))
  250. return douyin_url
  251. def get_user_post(self, user_id, max_cursor, count):
  252. """获取用户作品
  253. :param user_id: 用户ID
  254. :param max_cursor: 用于分页,第1页是0,后1页是上1页请求的时候返回的max_cursor
  255. :param count: 返回视频的条数
  256. :return:
  257. """
  258. self.common_params_update()
  259. params = {
  260. 'user_id': str(user_id),
  261. 'max_cursor': str(max_cursor),
  262. 'count': str(count)
  263. }
  264. douyin_url = 'https://aweme.snssdk.com/aweme/v1/aweme/post/'
  265. return self.__http_get(douyin_url, params)
  266. def __http_get(self, url, query_params=None):
  267. if query_params is None:
  268. query_params = {}
  269. url = self.__add_other_params(url, query_params)
  270. sign = self.__get_sign(url, query_params)
  271. headers = self.__get_headers(sign)
  272. if self.proxies:
  273. resp = requests.get(url, headers=headers, cookies=self.__cookie, proxies=self.proxies,
  274. verify=False, timeout=8)
  275. else:
  276. resp = requests.get(url, headers=headers, cookies=self.__cookie, verify=False)
  277. cookie = resp.cookies.get_dict()
  278. if len(cookie) != 0:
  279. self.__cookie.update(cookie)
  280. return resp.json()
  281. def __get_sign(self, url, form_params=None):
  282. stub = ''
  283. if form_params:
  284. a = urllib.parse.urlencode(form_params)
  285. stub = hashlib.md5(a.encode('utf-8')).hexdigest()
  286. ts = int(time.time())
  287. ppp = url[url.index('?') + 1:]
  288. s = getXGon(ppp, stub, urllib.parse.urlencode(self.__cookie))
  289. gorgon = xGorgon(ts, strToByte(s))
  290. sign = {
  291. 'X-Khronos': str(ts),
  292. 'X-Gorgon': gorgon,
  293. 'X-Pods': ''
  294. }
  295. #print(gorgon)
  296. if stub:
  297. sign.update({
  298. 'X-SS-STUB': stub.upper()
  299. })
  300. return sign
  301. def __get_headers(self, sign=None):
  302. if sign is None:
  303. sign = {}
  304. headers = {
  305. 'User-Agent': self.__user_agent,
  306. 'X-SS-REQ-TICKET': str(round(time.time() * 1000)),
  307. }
  308. headers.update(sign)
  309. return headers
  310. if __name__ == '__main__':
  311. import warnings
  312. start_time = time.time()
  313. warnings.filterwarnings("ignore")
  314. while True:
  315. server_time = int(time.time())
  316. current_time = time.time()
  317. if current_time - start_time > 300:
  318. print(
  319. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  320. + ' 主方法执行终止'
  321. )
  322. sys.exit(0)
  323. # key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
  324. # rdid = DbRedis.douyin_connect().rpop(key)
  325. # if rdid:
  326. # result = rdid.split('@@@')
  327. # else:
  328. # time.sleep(2)
  329. # continue
  330. proxy = Proxy.get()
  331. proxies = {
  332. "http": "http://" + proxy,
  333. "https": "http://" + proxy
  334. }
  335. douApi = DouYinApi('', proxies)
  336. result = douApi.register_device()
  337. if result is None:
  338. time.sleep(1)
  339. continue
  340. errorn = 0
  341. suc_f = 0
  342. for index in range(1000):
  343. try:
  344. ukey = 'BrandLiveData.DouyinUserVideo'
  345. users = DbRedis.douyin_connect().rpop(ukey)
  346. if users is None:
  347. time.sleep(10)
  348. continue
  349. user = json.loads(users)
  350. user_id = user.get('uid')
  351. re_times = int(user.get('re_times')) + 1
  352. user.update({
  353. 're_times':re_times
  354. })
  355. users = json.dumps(user)
  356. # device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
  357. device_id, iid, udid, openudid, cookie = result['device_id'], result['iid'], result['uuid'], \
  358. result['openudid'], result['cookie']
  359. if device_id == '0':
  360. print('device失败')
  361. break
  362. douApi.init_device_ids(device_id, iid, udid, openudid)
  363. videos = douApi.get_user_post(user_id, 0, 20)
  364. awemes = videos.get('aweme_list')
  365. stime = time.strftime("%H:%M:%S", time.localtime())
  366. if awemes:
  367. PrintLog.print(stime+" 成功"+str(index)+" "+str(user_id)+" V: "+str(V1+V2+'0')+" "+str(device_type))
  368. print(stime+" 成功"+str(index))
  369. dkey = 'BrandLiveData.DouyinUserVideoResponsePython'
  370. data = str(user_id) + '@@@' + json.dumps(videos)
  371. DbRedis.douyin_connect().lpush(dkey, data)
  372. else:
  373. PrintLog.print(stime+" 失败"+str(index)+' '+ str(user_id))
  374. print(stime+" 失败"+str(index))
  375. errorn = errorn + 1
  376. if re_times<1:
  377. ukey = 'BrandLiveData.DouyinUserVideo'
  378. DbRedis.douyin_connect().lpush(ukey, users)
  379. if errorn>2:
  380. break
  381. except Exception as e:
  382. print(
  383. time.strftime("%H:%M:%S", time.localtime())
  384. + ' 请求抛出异常!行号:'
  385. + str(e.__traceback__.tb_lineno)
  386. + ' 错误:'
  387. + str(e)
  388. + "\n"
  389. )
  390. break