# 店播爬取Python脚本 — Python script for scraping Douyin store-livestream data
#
# web_dy.py (10KB)
import json
import os
import random
import uuid
from urllib import parse
from urllib.parse import quote

import execjs
import requests

# from libs.proxy import Proxy
# -- coding: utf-8 --**
  7. class WebDouYin:
  8. def __init__(self):
  9. self.proxies = {
  10. }
  11. def get_signature(self,url=None,method='_signature'):
  12. dirname, filename = os.path.split(os.path.abspath(__file__))
  13. with open(dirname+ '/signature.js', 'r', encoding='utf-8') as f:
  14. b = f.read()
  15. c = execjs.compile(b)
  16. # url=url.replace('%28','(').replace('%29',')').replace('%2C',',')
  17. d = c.call(method, url.replace('\n',''))
  18. # print('_signature',d)
  19. return d
  20. def get_ck(self, proxy=None):
  21. requests.packages.urllib3.disable_warnings()
  22. headers = {
  23. 'authority': 'www.douyin.com',
  24. 'method': 'GET',
  25. 'path': '/',
  26. 'scheme': 'https',
  27. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  28. 'accept-encoding': 'gzip, deflate, br',
  29. 'accept-language': 'zh-CN,zh;q=0.9',
  30. 'upgrade-insecure-requests': '1',
  31. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
  32. }
  33. # res = requests.get('https://www.douyin.com/', headers=headers,verify=False, proxies=proxy,timeout=8)
  34. res = requests.get("https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ", headers=headers,verify=False, proxies=proxy,timeout=8)
  35. cookie_dict=res.cookies.get_dict()
  36. ck=''
  37. if cookie_dict:
  38. for k,v in cookie_dict.items():
  39. ck+='%s=%s; '%(k,v)
  40. ck=ck[:-2]
  41. else:
  42. return None
  43. ck+='; passport_csrf_token='+str(uuid.uuid4()).replace('-','')
  44. # print(ck)
  45. return ck
  46. def get_ua_ck(self,type_name=None):
  47. ua_list=[
  48. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  49. "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
  50. "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
  51. ]
  52. ck_list=[
  53. 'ttwid=1%7CTVzdM0P0u-8dtsmh6c-EaQEtBoTSOs_MG85FAg07AbA%7C1631502013%7C66442d8594de8e93ad18b73f3dfe0c94ed864c3d932824bcde9918b5be172321; passport_csrf_token=866923f1a32045fd82e47053158402a2',
  54. 'ttwid=1%7CGPDDu9-w3RGs2Pcd0wRlvLYoktpDt-v8LP5ZMyb1NBM%7C1630319594%7Cffb8de47e6da87dcfd76349b5ad34aa1f9b9d4332261a3a8436b932a893366c1; passport_csrf_token=79284b8777a7a54f3066cefef9af539e',
  55. 'ttwid=1%7CGsfqc7NpdOg4N-U-VX7Q77KsWjVTZ7gxLNifsisj8YE%7C1631618570%7Cafbb13a27fd1c2d7a8245454b1e0d7cd654d80848a320933a25d9ef77638c18c; passport_csrf_token=84911c9af94040a99cc10416bd27533d',
  56. 'ttwid=1%7C82FGr05YUOReYUB301ao_erqOQ3ilbXZdEy0tkMsdXY%7C1631863641%7C1dcebe643a96f00841a3b490db60de886bfe07ff3d276e509717abc4e1681ba6; passport_csrf_token=494ae3fffe00328101fd40e050ce49db',
  57. 'ttwid=1%7CwfnX3T9LY4_60iGoQNzyqYe5ahILFeRxfMuZ1pdgXf8%7C1632724192%7Cb613fddc0b533d5578dad4d5f9290705fdc6432aa854d492f4761d164dd3fdd5; passport_csrf_token=4a8afba333103b033e537003b72ee91b',
  58. 'ttwid=1%7CJZaclhj5Tgq6GB2KqL_6iqSu-BXCe9nC6-nJcjMEq24%7C1631262845%7Caf3e3acea147096036503a0238eccf3f30bbf31ac39d1812d69031aea79d3760; passport_csrf_token_default=b5e15d90e956f6b148bd3301ce6372f3; passport_csrf_token=b5e15d90e956f6b148bd3301ce6372f3; _tea_utm_cache_6383=undefined; __ac_signature=_02B4Z6wo00f01rm3RmAAAIDBLWEdbNgZ9Vq5k0LAAM9GpUBscdxSeV9NlDZq3ATFdnICqNTCV8cWCBBshe4Q9933w4I3vFIC2-ptR-6iU1HM7ufqE3zM2UJOe-WBklhX6bsTmlQaXggHi-rU16; FOLLOW_YELLOW_POINT_USER=MS4wLjABAAAAwcPuAI04fw8wfirPJqOv0hESAX-C0JbpI8JGzlgiX4s; FOLLOW_YELLOW_POINT_STATUE_INFO=1%2F1632903705967; tt_scid=ebxRLAmEaT7aTKjBzBzpqgKSCESII30busIfS0zONRZ6RFQ5Gmc52w58czEt3VQwd79e; s_v_web_id=verify_c5299c865f78fab640e630fad1f812cc; _tea_utm_cache_2018=undefined; odin_tt=c68622b8715a3aa3bfe289438613de2ffc711f9ceff12da4274f374e0c55268dce2f26fff0e7e3f1e0c645fe6491ee932a7e4b52a9be128e15ff8d49ace7822c; n_mh=1eNiy1CE3tbbhhZrqHZ8hb46OcXZmYiO3OSMVDwg0OE; sso_auth_status=aca31d09b431af5fb60642f0869a9112; sso_auth_status_ss=aca31d09b431af5fb60642f0869a9112; sso_uid_tt=d03690992f052ea087d48027b943a927; sso_uid_tt_ss=d03690992f052ea087d48027b943a927; toutiao_sso_user=cb6f6e70b5384aa94013e6cf5decb883; toutiao_sso_user_ss=cb6f6e70b5384aa94013e6cf5decb883; d_ticket=80ed5c329ed10dc1bb91b58dbecd3ac5bf128; passport_auth_status_ss=3be526de4218e101e6b5136365c332f8%2C18b527f1a55ff3bb63d05496e6ee56f5; sid_guard=b69c6f12202d7069d80061d32981f573%7C1632902665%7C5184000%7CSun%2C+28-Nov-2021+08%3A04%3A25+GMT; uid_tt=b536b77fe52631203eb01326194a96a1; uid_tt_ss=b536b77fe52631203eb01326194a96a1; sid_tt=b69c6f12202d7069d80061d32981f573; sessionid=b69c6f12202d7069d80061d32981f573; sessionid_ss=b69c6f12202d7069d80061d32981f573; sid_ucp_v1=1.0.0-KDU1YTg0M2Q5NTNhNmM2ZDI2NTVmM2Q5Y2E3NTUyM2E2ZTM4YzRhZWMKFQj1za_V-AIQibzQigYY7zE4AkDxBxoCaGwiIGI2OWM2ZjEyMjAyZDcwNjlkODAwNjFkMzI5ODFmNTcz; 
ssid_ucp_v1=1.0.0-KDU1YTg0M2Q5NTNhNmM2ZDI2NTVmM2Q5Y2E3NTUyM2E2ZTM4YzRhZWMKFQj1za_V-AIQibzQigYY7zE4AkDxBxoCaGwiIGI2OWM2ZjEyMjAyZDcwNjlkODAwNjFkMzI5ODFmNTcz; passport_auth_status=3be526de4218e101e6b5136365c332f8%2C18b527f1a55ff3bb63d05496e6ee56f5; msToken=Uk1zRW_2ZV5xpYN_5QTApoG2-zrYuL-qb4Y3bnuZRwm5LCauYqzp8sNlXMRlqDmwxpirVVBtovX0vano-KRPs_Xnssp6JayMGeWrxzmeV3Kw_jeD2dCZZah_0dRt; msToken=LH6B0ufdfs8Z66KLcA_ox0ZUoEnlgydp9-o2nelv2KxYzoWS6gB3QPLuJYIpcXhK0A6161RECrSy6-mECAz0GgCPrRtU-BdRzeILTOwDwu7FqV4S9p6OyJAA9fri'
  59. ]
  60. return random.choice(ua_list),random.choice(ck_list)
  61. def response(self,url,headers,proxy,data=None):
  62. # -- coding: utf-8 --**
  63. try:
  64. requests.packages.urllib3.disable_warnings()
  65. if data:
  66. res = requests.post(url, headers=headers,verify=False, proxies=proxy,data=data, timeout=8)
  67. else:
  68. res=requests.get(url,headers=headers,verify=False,proxies=proxy,timeout=8)
  69. # print(res)
  70. json_data=json.loads(res.content.decode())
  71. except Exception as e:
  72. json_data = "请求失败" + str(e)
  73. print(json_data)
  74. return json_data
  75. def get_user_videos(self,sec_user_id,max_cursor=0,count=10,proxy=None,cookie=None):
  76. ua,ck=self.get_ua_ck('get_user_videos')
  77. if cookie:
  78. ck = cookie
  79. url='https://www.douyin.com/aweme/v1/web/aweme/post/?'
  80. param={
  81. 'device_platform': 'webapp',
  82. 'aid': '6383',
  83. 'channel': 'channel_pc_web',
  84. 'sec_user_id': sec_user_id,
  85. 'max_cursor': str(max_cursor),
  86. 'count': str(count),
  87. 'publish_video_strategy_type': '2',
  88. 'version_code': '160100',
  89. 'version_name': '16.1.0',
  90. 'cookie_enabled': 'true',
  91. 'screen_width': '1920',
  92. 'screen_height': '1080',
  93. 'browser_language': 'zh-CN',
  94. 'browser_platform': 'Win32',
  95. 'browser_name': 'Mozilla',
  96. 'browser_version':ua.replace('Mozilla/',''),
  97. 'browser_online': 'true',
  98. }
  99. url = url + parse.urlencode(param)
  100. _signature = self.get_signature(url)
  101. url+='&_signature='+quote(_signature)
  102. headers = {
  103. "authority": "www.douyin.com",
  104. "method": "GET",
  105. "path": str(url).replace('https://www.douyin.com',''),
  106. "scheme": "https",
  107. "accept": "application/json, text/plain, */*",
  108. # "accept-encoding": "gzip, deflate, br",
  109. "accept-language": "zh-CN,zh;q=0.9",
  110. "cookie": ck,
  111. "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
  112. "user-agent":ua,
  113. "withcredentials": "true",
  114. }
  115. if ck:headers['cookie']=ck
  116. json_data=self.response(url=url,headers=headers,proxy=proxy)
  117. return json_data
  118. # print(json_data)
  119. def get_user_info(self,sec_user_id,proxy=None):
  120. ua,ck=self.get_ua_ck()
  121. # 随机获取一个cookie
  122. url='https://www.douyin.com/aweme/v1/web/user/profile/other/?'
  123. param={
  124. 'device_platform': 'webapp',
  125. 'aid': '6383',
  126. 'channel': 'channel_pc_web',
  127. 'publish_video_strategy_type': '2',
  128. 'source': 'channel_pc_web',
  129. 'sec_user_id': sec_user_id,
  130. 'version_code': '160100',
  131. 'version_name': '16.1.0',
  132. 'cookie_enabled': 'true',
  133. 'screen_width': '1920',
  134. 'screen_height': '1080',
  135. 'browser_language': 'zh-CN',
  136. 'browser_platform': 'Win32',
  137. 'browser_name': 'Mozilla',
  138. 'browser_version':ua.replace('Mozilla/',''),
  139. 'browser_online': 'true',
  140. }
  141. url = url + parse.urlencode(param)
  142. _signature = self.get_signature(url)
  143. url+='&_signature='+quote(_signature)
  144. headers = {
  145. "authority": "www.douyin.com",
  146. "method": "GET",
  147. "path": str(url).replace('https://www.douyin.com',''),
  148. "scheme": "https",
  149. "accept": "application/json, text/plain, */*",
  150. "accept-language": "zh-CN,zh;q=0.9",
  151. "cookie": ck,
  152. "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
  153. "user-agent":ua,
  154. "withcredentials": "true",
  155. }
  156. if ck:headers['cookie']=ck
  157. json_data=self.response(url=url,headers=headers,proxy=proxy)
  158. return json_data
  159. if __name__ == '__main__':
  160. webdy=WebDouYin()
  161. # res = webdy.get_user_videos('MS4wLjABAAAAqLPgx-hHf27EqGEtRQ6YyuQQTmikB5CBO1jXy61yhWKujGd8KO5G8V2vdcLQJAym')
  162. #print(res)
  163. # res =webdy.get_user_info('MS4wLjABAAAA2sPiyVAbQc3FsbJJeuyvZkuLjmPRpfKgCvuf41TdjII')
  164. # print(res)
  165. # proxy = Proxy.get()
  166. # print(proxy)
  167. # proxies = {
  168. # "http": "http://" + proxy,
  169. # "https": "http://" + proxy
  170. # }
  171. ck = webdy.get_ck()
  172. print(ck)