店播爬取Python脚本

web_cookie.py 8.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. import requests,json,random,uuid,sys,time,threading,datetime
  2. from libs.proxy import Proxy
  3. from rds_model.db_redis import DbRedis
  4. # -*- coding: utf-8 -*-
  5. start_time = time.time()
  6. class Cookie:
  7. redis = DbRedis.douyin_connect()
  8. cookie_info = ''
  9. @staticmethod
  10. def get():
  11. while True:
  12. cookie = Cookie.random_cookie()
  13. if cookie is not None:
  14. return cookie
  15. print('未拿到有效的代理')
  16. time.sleep(1)
  17. @staticmethod
  18. def random_cookie():
  19. key = 'CookieHash'
  20. cookie_dict = Cookie.redis.hgetall(key)
  21. if (cookie_dict is None) or (len(cookie_dict) == 0):
  22. return
  23. cookie_list = list(cookie_dict)
  24. now = int(time.time())
  25. while True:
  26. cookie = random.choice(cookie_list)#Cookie.choice_cookie()
  27. if cookie is None:
  28. return
  29. cookie_info = cookie_dict.get(cookie)
  30. if cookie_info is None:
  31. continue
  32. create_at = int(time.mktime(time.strptime(cookie_info, "%Y-%m-%d %H:%M:%S")))
  33. # 删除过期的代理
  34. if now - create_at >= 7200:
  35. Cookie.redis.hdel(key, cookie)
  36. cookie_list.remove(cookie)
  37. continue
  38. return cookie
  39. @staticmethod
  40. def del_cookie(cookie):
  41. cookie_info = Cookie.redis.hget('CookieHash', cookie)
  42. if cookie_info is None:
  43. return
  44. create_at = int(time.mktime(time.strptime(cookie_info, "%Y-%m-%d %H:%M:%S")))
  45. now = int(time.time())
  46. if create_at - now >= 7200:
  47. return
  48. # 删除失效的Cookie
  49. print('删除失效Cookie:' + cookie)
  50. def set(cookie):
  51. Cookie.redis.hset('CookieHash', cookie, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
  52. def get_url(self):
  53. url_list = [
  54. "https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ",
  55. "https://www.douyin.com/user/MS4wLjABAAAA3Gq3QsbpkmIchOTXXF51Fy3Dyb0xF7rLvI3QEQjoYwo",
  56. "https://www.douyin.com/user/MS4wLjABAAAAO6GOEmNyHKo8Kd1IdS95J884BAFTpR8eRDOBIAb2d7VNGWFBzhR2odpulkKJFV95",
  57. "https://www.douyin.com/user/MS4wLjABAAAAc6xH1Jxur09z-Oy5M9IVpckmAyPlQg5uj_B8fFrTIiNv5B-XyH6G9RCQt3qLsZZU",
  58. "https://www.douyin.com/user/MS4wLjABAAAA5v-bl3BibVonLiHoZWW173nKKy_yn2DACHErPbOoo3g",
  59. "https://www.douyin.com/user/MS4wLjABAAAATjfBnSaWkl6ZJiznrCzkt7-l_7pkD4pLkKyPkLtD2VU",
  60. "https://www.douyin.com/user/MS4wLjABAAAAZ-hqjDujmsKhlhuNV1R4OPrIWJ0XmhFnuJmy9h3u4VkovGqJycoOVGaCj8uqwQiJ",
  61. "https://www.douyin.com/user/MS4wLjABAAAAoMNLc-_Vx_TDrJQvpGEtgZfpR99JDKf6n23mXpsnMGg",
  62. "https://www.douyin.com/user/MS4wLjABAAAAMhl868Pj7GIBYNVX46kjLGS_eiprGHaDHe5ffqi91_s",
  63. "https://www.douyin.com/user/MS4wLjABAAAAR8ow3aH-TjB2c4TJFqtDvFPhmd3TBFyHF1zMCLd39rdO45zpMZXAIvdwsQ4_7gw6"
  64. ]
  65. return random.choice(url_list)
  66. def get_ck(self):
  67. requests.packages.urllib3.disable_warnings()
  68. headers = {
  69. 'authority': 'www.douyin.com',
  70. 'method': 'GET',
  71. 'path': '/',
  72. 'scheme': 'https',
  73. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  74. 'accept-encoding': 'gzip, deflate, br',
  75. 'accept-language': 'zh-CN,zh;q=0.9',
  76. 'upgrade-insecure-requests': '1',
  77. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
  78. }
  79. retry = 0
  80. ck = None
  81. # 屏蔽验证https证书提示
  82. requests.packages.urllib3.disable_warnings()
  83. url = self.get_url()
  84. while True:
  85. if retry > 10:
  86. break
  87. retry += 1
  88. proxy = Proxy.get()
  89. proxies = {
  90. "http": "http://" + proxy,
  91. "https": "http://" + proxy
  92. }
  93. try:
  94. response = requests.get(
  95. url,
  96. headers=headers,
  97. proxies=proxies,
  98. timeout=8,
  99. verify=False
  100. )
  101. cookie_dict=response.cookies.get_dict()
  102. ck=''
  103. if cookie_dict:
  104. for k,v in cookie_dict.items():
  105. ck+='%s=%s; '%(k,v)
  106. ck=ck[:-2]
  107. ck+='; passport_csrf_token='+str(uuid.uuid4()).replace('-','')
  108. num = ck.count('ttwid')
  109. if num>0:
  110. Cookie.set(ck)
  111. print(
  112. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  113. + ' 数据获取成功!'
  114. + '\n'
  115. + ck
  116. + '\n'
  117. + Proxy.proxy_info
  118. )
  119. break
  120. else:
  121. print(
  122. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  123. + ' 数据获取失败!'
  124. + '\n'
  125. + ck
  126. + '\n'
  127. + Proxy.proxy_info
  128. + str(cookie_dict)
  129. )
  130. break
  131. else:
  132. print(
  133. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  134. + ' 爬取http连接失败!'
  135. + str(response.status_code)
  136. + '\n'
  137. + Proxy.proxy_info
  138. )
  139. time.sleep(1)
  140. except requests.exceptions.ProxyError as e:
  141. print(
  142. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  143. + ' 代理过期!'
  144. + str(e)
  145. + Proxy.proxy_info
  146. )
  147. Proxy.del_proxy(proxy)
  148. pass
  149. except requests.exceptions.ConnectTimeout as e:
  150. print(
  151. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  152. + ' ConnectTimeout!'
  153. + str(e)
  154. + '\n'
  155. + Proxy.proxy_info
  156. )
  157. Proxy.del_proxy(proxy)
  158. pass
  159. except Exception as e:
  160. print(
  161. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  162. + ' 请求抛出异常!'
  163. + str(e)
  164. + '\n'
  165. + Proxy.proxy_info
  166. )
  167. pass
  168. return ck
  169. if __name__ == "__main__":
  170. print("主方法开始执行")
  171. # 并行线程数
  172. threading_count = int(sys.argv[1])
  173. print(
  174. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  175. + ' '
  176. + ' 开始执行'
  177. )
  178. while True:
  179. sys.stdout.flush()
  180. # 减去主线程
  181. # active_count = threading.active_count() - 1
  182. # increment = threading_count - active_count
  183. # 获取cookie长度
  184. key = 'CookieHash'
  185. cookie_dict = Cookie.redis.hgetall(key)
  186. if len(cookie_dict) >= 1000:
  187. time.sleep(60)
  188. break
  189. cookie = Cookie()
  190. # while increment > 0:
  191. # sys.stdout.flush()
  192. cookie.get_ck()
  193. # task = threading.Thread(target=cookie.get_ck,)
  194. # task.start() # 准备就绪, 等待cpu执行
  195. # increment = increment - 1
  196. current_time = time.time()
  197. if current_time - start_time > 3600:
  198. print(
  199. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  200. + ' 主方法执行终止'
  201. )
  202. sys.exit(0)
  203. time.sleep(1)