店播爬取Python脚本

web_dy.py 8.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
import json
import random
import uuid
from urllib import parse
from urllib.parse import quote

import execjs
import requests

# from libs.proxy import Proxy
# -- coding: utf-8 --**
  6. class WebDouYin:
  7. def __init__(self):
  8. self.proxies = {
  9. }
  10. def get_signature(self,url=None,method='_signature'):
  11. with open('/mnt/shop_live_scraper/signature.js', 'r', encoding='utf-8') as f:
  12. b = f.read()
  13. c = execjs.compile(b)
  14. # url=url.replace('%28','(').replace('%29',')').replace('%2C',',')
  15. d = c.call(method, url.replace('\n',''))
  16. # print('_signature',d)
  17. return d
  18. def get_ck(self, proxy=None):
  19. requests.packages.urllib3.disable_warnings()
  20. headers = {
  21. 'authority': 'www.douyin.com',
  22. 'method': 'GET',
  23. 'path': '/',
  24. 'scheme': 'https',
  25. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  26. 'accept-encoding': 'gzip, deflate, br',
  27. 'accept-language': 'zh-CN,zh;q=0.9',
  28. 'upgrade-insecure-requests': '1',
  29. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
  30. }
  31. # res = requests.get('https://www.douyin.com/', headers=headers,verify=False, proxies=proxy,timeout=8)
  32. res = requests.get("https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ", headers=headers,verify=False, proxies=proxy,timeout=8)
  33. cookie_dict=res.cookies.get_dict()
  34. ck=''
  35. if cookie_dict:
  36. for k,v in cookie_dict.items():
  37. ck+='%s=%s; '%(k,v)
  38. ck=ck[:-2]
  39. else:
  40. return None
  41. ck+='; passport_csrf_token='+str(uuid.uuid4()).replace('-','')
  42. # print(ck)
  43. return ck
  44. def get_ua_ck(self,type_name=None):
  45. ua_list=[
  46. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  47. "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
  48. "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
  49. ]
  50. ck_list=[
  51. 'ttwid=1%7CTVzdM0P0u-8dtsmh6c-EaQEtBoTSOs_MG85FAg07AbA%7C1631502013%7C66442d8594de8e93ad18b73f3dfe0c94ed864c3d932824bcde9918b5be172321; passport_csrf_token=866923f1a32045fd82e47053158402a2',
  52. 'ttwid=1%7CGPDDu9-w3RGs2Pcd0wRlvLYoktpDt-v8LP5ZMyb1NBM%7C1630319594%7Cffb8de47e6da87dcfd76349b5ad34aa1f9b9d4332261a3a8436b932a893366c1; passport_csrf_token=79284b8777a7a54f3066cefef9af539e',
  53. 'ttwid=1%7CGsfqc7NpdOg4N-U-VX7Q77KsWjVTZ7gxLNifsisj8YE%7C1631618570%7Cafbb13a27fd1c2d7a8245454b1e0d7cd654d80848a320933a25d9ef77638c18c; passport_csrf_token=84911c9af94040a99cc10416bd27533d',
  54. 'ttwid=1%7C82FGr05YUOReYUB301ao_erqOQ3ilbXZdEy0tkMsdXY%7C1631863641%7C1dcebe643a96f00841a3b490db60de886bfe07ff3d276e509717abc4e1681ba6; passport_csrf_token=494ae3fffe00328101fd40e050ce49db',
  55. 'ttwid=1%7CwfnX3T9LY4_60iGoQNzyqYe5ahILFeRxfMuZ1pdgXf8%7C1632724192%7Cb613fddc0b533d5578dad4d5f9290705fdc6432aa854d492f4761d164dd3fdd5; passport_csrf_token=4a8afba333103b033e537003b72ee91b'
  56. ]
  57. return random.choice(ua_list),random.choice(ck_list)
  58. def response(self,url,headers,proxy,data=None):
  59. # -- coding: utf-8 --**
  60. try:
  61. requests.packages.urllib3.disable_warnings()
  62. if data:
  63. res = requests.post(url, headers=headers,verify=False, proxies=proxy,data=data, timeout=8)
  64. else:
  65. res=requests.get(url,headers=headers,verify=False,proxies=proxy,timeout=8)
  66. # print(res)
  67. json_data=json.loads(res.content.decode())
  68. except Exception as e:
  69. json_data = "请求失败" + str(e)
  70. print(json_data)
  71. return json_data
  72. def get_user_videos(self,sec_user_id,max_cursor=0,count=10,proxy=None,cookie=None):
  73. ua,ck=self.get_ua_ck('get_user_videos')
  74. if cookie:
  75. ck = cookie
  76. url='https://www.douyin.com/aweme/v1/web/aweme/post/?'
  77. param={
  78. 'device_platform': 'webapp',
  79. 'aid': '6383',
  80. 'channel': 'channel_pc_web',
  81. 'sec_user_id': sec_user_id,
  82. 'max_cursor': str(max_cursor),
  83. 'count': str(count),
  84. 'publish_video_strategy_type': '2',
  85. 'version_code': '160100',
  86. 'version_name': '16.1.0',
  87. 'cookie_enabled': 'true',
  88. 'screen_width': '1920',
  89. 'screen_height': '1080',
  90. 'browser_language': 'zh-CN',
  91. 'browser_platform': 'Win32',
  92. 'browser_name': 'Mozilla',
  93. 'browser_version':ua.replace('Mozilla/',''),
  94. 'browser_online': 'true',
  95. }
  96. url = url + parse.urlencode(param)
  97. _signature = self.get_signature(url)
  98. url+='&_signature='+quote(_signature)
  99. headers = {
  100. "authority": "www.douyin.com",
  101. "method": "GET",
  102. "path": str(url).replace('https://www.douyin.com',''),
  103. "scheme": "https",
  104. "accept": "application/json, text/plain, */*",
  105. # "accept-encoding": "gzip, deflate, br",
  106. "accept-language": "zh-CN,zh;q=0.9",
  107. "cookie": ck,
  108. "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
  109. "user-agent":ua,
  110. "withcredentials": "true",
  111. }
  112. if ck:headers['cookie']=ck
  113. json_data=self.response(url=url,headers=headers,proxy=proxy)
  114. return json_data
  115. # print(json_data)
  116. def get_user_info(self,sec_user_id,proxy=None):
  117. ua,ck=self.get_ua_ck()
  118. # 随机获取一个cookie
  119. url='https://www.douyin.com/aweme/v1/web/user/profile/other/?'
  120. param={
  121. 'device_platform': 'webapp',
  122. 'aid': '6383',
  123. 'channel': 'channel_pc_web',
  124. 'publish_video_strategy_type': '2',
  125. 'source': 'channel_pc_web',
  126. 'sec_user_id': sec_user_id,
  127. 'version_code': '160100',
  128. 'version_name': '16.1.0',
  129. 'cookie_enabled': 'true',
  130. 'screen_width': '1920',
  131. 'screen_height': '1080',
  132. 'browser_language': 'zh-CN',
  133. 'browser_platform': 'Win32',
  134. 'browser_name': 'Mozilla',
  135. 'browser_version':ua.replace('Mozilla/',''),
  136. 'browser_online': 'true',
  137. }
  138. url = url + parse.urlencode(param)
  139. _signature = self.get_signature(url)
  140. url+='&_signature='+quote(_signature)
  141. headers = {
  142. "authority": "www.douyin.com",
  143. "method": "GET",
  144. "path": str(url).replace('https://www.douyin.com',''),
  145. "scheme": "https",
  146. "accept": "application/json, text/plain, */*",
  147. "accept-language": "zh-CN,zh;q=0.9",
  148. "cookie": ck,
  149. "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
  150. "user-agent":ua,
  151. "withcredentials": "true",
  152. }
  153. if ck:headers['cookie']=ck
  154. json_data=self.response(url=url,headers=headers,proxy=proxy)
  155. return json_data
  156. if __name__ == '__main__':
  157. webdy=WebDouYin()
  158. res = webdy.get_user_videos('MS4wLjABAAAAqLPgx-hHf27EqGEtRQ6YyuQQTmikB5CBO1jXy61yhWKujGd8KO5G8V2vdcLQJAym')
  159. print(res)
  160. # webdy.get_user_info('MS4wLjABAAAAC2euvL-0qMZyd80aNwZa-wX5KXuz_r7YVNHSBOogfVg')
  161. # proxy = Proxy.get()
  162. # print(proxy)
  163. # proxies = {
  164. # "http": "http://" + proxy,
  165. # "https": "http://" + proxy
  166. # }
  167. # ck = webdy.get_ck()
  168. # print(ck)