店播爬取Python脚本

web_cookie.py 8.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. import requests,json,random,uuid,sys,time,threading,datetime
  2. from libs.proxy import Proxy
  3. from rds_model.db_redis import DbRedis
  4. # -- coding: utf-8 --**
  5. start_time = time.time()  # epoch seconds at script start; the __main__ loop exits once more than 3600 s have elapsed since this value
  6. class Cookie:
  7. redis = DbRedis.connect()
  8. cookie_info = ''
  9. @staticmethod
  10. def get():
  11. while True:
  12. cookie = Cookie.random_cookie()
  13. if cookie is not None:
  14. return cookie
  15. print('未拿到有效的代理')
  16. time.sleep(1)
  17. @staticmethod
  18. def random_cookie():
  19. key = 'CookieHash'
  20. cookie_dict = Cookie.redis.hgetall(key)
  21. if (cookie_dict is None) or (len(cookie_dict) == 0):
  22. return
  23. cookie_list = list(cookie_dict)
  24. now = int(time.time())
  25. while True:
  26. cookie = random.choice(cookie_list)#Cookie.choice_cookie()
  27. if cookie is None:
  28. return
  29. cookie_info = cookie_dict.get(cookie)
  30. if cookie_info is None:
  31. continue
  32. create_at = int(time.mktime(time.strptime(cookie_info, "%Y-%m-%d %H:%M:%S")))
  33. # 删除过期的代理
  34. if now - create_at >= 7200:
  35. Cookie.redis.hdel(key, cookie)
  36. cookie_list.remove(cookie)
  37. continue
  38. return cookie
  39. @staticmethod
  40. def del_cookie(cookie):
  41. cookie_info = Cookie.redis.hget('CookieHash', cookie)
  42. if cookie_info is None:
  43. return
  44. create_at = int(time.mktime(time.strptime(cookie_info, "%Y-%m-%d %H:%M:%S")))
  45. now = int(time.time())
  46. if create_at - now >= 7200:
  47. return
  48. # 删除失效的Cookie
  49. print('删除失效Cookie:' + cookie)
  50. def set(cookie):
  51. Cookie.redis.hset('CookieHash', cookie, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
  52. def get_url(self):
  53. url_list = [
  54. "https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ",
  55. "https://www.douyin.com/user/MS4wLjABAAAA3Gq3QsbpkmIchOTXXF51Fy3Dyb0xF7rLvI3QEQjoYwo",
  56. "https://www.douyin.com/user/MS4wLjABAAAAO6GOEmNyHKo8Kd1IdS95J884BAFTpR8eRDOBIAb2d7VNGWFBzhR2odpulkKJFV95",
  57. "https://www.douyin.com/user/MS4wLjABAAAAc6xH1Jxur09z-Oy5M9IVpckmAyPlQg5uj_B8fFrTIiNv5B-XyH6G9RCQt3qLsZZU",
  58. "https://www.douyin.com/user/MS4wLjABAAAA5v-bl3BibVonLiHoZWW173nKKy_yn2DACHErPbOoo3g",
  59. "https://www.douyin.com/user/MS4wLjABAAAATjfBnSaWkl6ZJiznrCzkt7-l_7pkD4pLkKyPkLtD2VU",
  60. "https://www.douyin.com/user/MS4wLjABAAAAZ-hqjDujmsKhlhuNV1R4OPrIWJ0XmhFnuJmy9h3u4VkovGqJycoOVGaCj8uqwQiJ",
  61. "https://www.douyin.com/user/MS4wLjABAAAAoMNLc-_Vx_TDrJQvpGEtgZfpR99JDKf6n23mXpsnMGg",
  62. "https://www.douyin.com/user/MS4wLjABAAAAMhl868Pj7GIBYNVX46kjLGS_eiprGHaDHe5ffqi91_s",
  63. "https://www.douyin.com/user/MS4wLjABAAAAR8ow3aH-TjB2c4TJFqtDvFPhmd3TBFyHF1zMCLd39rdO45zpMZXAIvdwsQ4_7gw6"
  64. ]
  65. return random.choice(url_list)
  66. def get_ck(self):
  67. requests.packages.urllib3.disable_warnings()
  68. headers = {
  69. 'authority': 'www.douyin.com',
  70. 'method': 'GET',
  71. 'path': '/',
  72. 'scheme': 'https',
  73. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  74. 'accept-encoding': 'gzip, deflate, br',
  75. 'accept-language': 'zh-CN,zh;q=0.9',
  76. 'upgrade-insecure-requests': '1',
  77. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
  78. }
  79. retry = 0
  80. ck = None
  81. # 屏蔽验证https证书提示
  82. requests.packages.urllib3.disable_warnings()
  83. url = self.get_url()
  84. while True:
  85. if retry > 10:
  86. break
  87. retry += 1
  88. # proxy = Proxy.get()
  89. proxy = Proxy.rola_get()
  90. proxies = {
  91. "http": "http://" + proxy,
  92. "https": "http://" + proxy
  93. }
  94. try:
  95. response = requests.get(
  96. url,
  97. headers=headers,
  98. proxies=proxies,
  99. timeout=8,
  100. verify=False
  101. )
  102. cookie_dict=response.cookies.get_dict()
  103. ck=''
  104. if cookie_dict:
  105. for k,v in cookie_dict.items():
  106. ck+='%s=%s; '%(k,v)
  107. ck=ck[:-2]
  108. ck+='; passport_csrf_token='+str(uuid.uuid4()).replace('-','')
  109. # num = ck.count('ttwid')
  110. # if num>0:
  111. if ck:
  112. Cookie.set(ck)
  113. print(
  114. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  115. + ' 数据获取成功!'
  116. + '\n'
  117. + ck
  118. + '\n'
  119. + Proxy.proxy_info
  120. )
  121. break
  122. else:
  123. print(
  124. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  125. + ' 数据获取失败!'
  126. + '\n'
  127. + ck
  128. + '\n'
  129. + Proxy.proxy_info
  130. + str(cookie_dict)
  131. )
  132. break
  133. else:
  134. print(
  135. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  136. + ' 爬取http连接失败!'
  137. + str(response.status_code)
  138. + '\n'
  139. + Proxy.proxy_info
  140. )
  141. time.sleep(1)
  142. except requests.exceptions.ProxyError as e:
  143. print(
  144. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  145. + ' 代理过期!'
  146. + str(e)
  147. + Proxy.proxy_info
  148. )
  149. # Proxy.del_proxy(proxy)
  150. Proxy.rola_del_proxy(proxy)
  151. pass
  152. except requests.exceptions.ConnectTimeout as e:
  153. print(
  154. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  155. + ' ConnectTimeout!'
  156. + str(e)
  157. + '\n'
  158. + Proxy.proxy_info
  159. )
  160. # Proxy.del_proxy(proxy)
  161. Proxy.rola_del_proxy(proxy)
  162. pass
  163. except Exception as e:
  164. print(
  165. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  166. + ' 请求抛出异常!'
  167. + str(e)
  168. + '\n'
  169. + Proxy.proxy_info
  170. )
  171. pass
  172. return ck
  173. if __name__ == "__main__":
  174. print("主方法开始执行")
  175. # 并行线程数
  176. threading_count = int(sys.argv[1])
  177. print(
  178. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  179. + ' '
  180. + ' 开始执行'
  181. )
  182. while True:
  183. sys.stdout.flush()
  184. # 减去主线程
  185. # active_count = threading.active_count() - 1
  186. # increment = threading_count - active_count
  187. # 获取cookie长度
  188. key = 'CookieHash'
  189. cookie_dict = Cookie.redis.hgetall(key)
  190. if len(cookie_dict) >= 1000:
  191. time.sleep(60)
  192. break
  193. cookie = Cookie()
  194. # while increment > 0:
  195. # sys.stdout.flush()
  196. cookie.get_ck()
  197. # task = threading.Thread(target=cookie.get_ck,)
  198. # task.start() # 准备就绪, 等待cpu执行
  199. # increment = increment - 1
  200. current_time = time.time()
  201. if current_time - start_time > 3600:
  202. print(
  203. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  204. + ' 主方法执行终止'
  205. )
  206. sys.exit(0)
  207. time.sleep(1)