Shop-livestream crawler Python script

web_cookie.py 6.4KB

# -*- coding: utf-8 -*-
import requests, json, random, uuid, sys, time, threading, datetime

from libs.proxy import Proxy
from rds_model.db_redis import DbRedis

start_time = time.time()
class Cookie:
    redis = DbRedis.douyin_connect()
    cookie_info = ''

    @staticmethod
    def get():
        # Block until a usable cookie is available in the pool
        while True:
            cookie = Cookie.random_cookie()
            if cookie is not None:
                return cookie
            print('No valid cookie available yet')
            time.sleep(1)
    @staticmethod
    def random_cookie():
        key = 'CookieHash'
        cookie_dict = Cookie.redis.hgetall(key)
        if (cookie_dict is None) or (len(cookie_dict) == 0):
            return
        cookie_list = list(cookie_dict)
        now = int(time.time())
        while True:
            # Guard against an exhausted pool (all entries expired and removed)
            if not cookie_list:
                return
            cookie = random.choice(cookie_list)  # Cookie.choice_cookie()
            if cookie is None:
                return
            cookie_info = cookie_dict.get(cookie)
            if cookie_info is None:
                continue
            create_at = int(time.mktime(time.strptime(cookie_info, "%Y-%m-%d %H:%M:%S")))
            # Drop cookies older than one hour
            if now - create_at >= 3600:
                Cookie.redis.hdel(key, cookie)
                cookie_list.remove(cookie)
                continue
            return cookie
    @staticmethod
    def del_cookie(cookie):
        cookie_info = Cookie.redis.hget('CookieHash', cookie)
        if cookie_info is None:
            return
        create_at = int(time.mktime(time.strptime(cookie_info, "%Y-%m-%d %H:%M:%S")))
        now = int(time.time())
        # Already expired; random_cookie() will clean it up
        if now - create_at >= 3600:
            return
        # Remove the invalidated cookie
        print('Deleting invalid cookie: ' + cookie)
        Cookie.redis.hdel('CookieHash', cookie)

    @staticmethod
    def set(cookie):
        Cookie.redis.hset('CookieHash', cookie, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    def get_ck(self):
        headers = {
            'authority': 'www.douyin.com',
            'method': 'GET',
            'path': '/',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        }
        retry = 0
        ck = None
        # Suppress HTTPS certificate verification warnings
        requests.packages.urllib3.disable_warnings()
        while True:
            if retry > 10:
                break
            retry += 1
            proxy = Proxy.get()
            proxies = {
                "http": "http://" + proxy,
                "https": "http://" + proxy
            }
            try:
                response = requests.get(
                    'https://www.douyin.com/',
                    headers=headers,
                    proxies=proxies,
                    timeout=8,
                    verify=False
                )
                cookie_dict = response.cookies.get_dict()
                ck = ''
                if cookie_dict:
                    # Flatten the cookie jar into a "k=v; k=v" header string
                    for k, v in cookie_dict.items():
                        ck += '%s=%s; ' % (k, v)
                    ck = ck[:-2]
                    ck += '; passport_csrf_token=' + str(uuid.uuid4()).replace('-', '')
                    Cookie.set(ck)
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' Cookie fetched successfully!'
                        + '\n'
                        + ck
                        + '\n'
                        + Proxy.proxy_info
                    )
                    break
                else:
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        + ' HTTP request failed! '
                        + str(response.status_code)
                        + '\n'
                        + Proxy.proxy_info
                    )
                    time.sleep(1)
            except requests.exceptions.ProxyError as e:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' Proxy expired! '
                    + str(e)
                    + Proxy.proxy_info
                )
                Proxy.del_proxy(proxy)
            except requests.exceptions.ConnectTimeout as e:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' ConnectTimeout! '
                    + str(e)
                    + '\n'
                    + Proxy.proxy_info
                )
                Proxy.del_proxy(proxy)
            except Exception as e:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' Request raised an exception! '
                    + str(e)
                    + '\n'
                    + Proxy.proxy_info
                )
        return ck
if __name__ == "__main__":
    print("Main method starting")
    # Number of parallel threads
    threading_count = int(sys.argv[1])
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' Execution started'
    )
    while True:
        sys.stdout.flush()
        # Subtract the main thread
        # active_count = threading.active_count() - 1
        # increment = threading_count - active_count
        # Stop filling the pool once it holds enough cookies
        key = 'CookieHash'
        cookie_dict = Cookie.redis.hgetall(key)
        if len(cookie_dict) >= 1000:
            time.sleep(60)
            break
        cookie = Cookie()
        # while increment > 0:
        #     sys.stdout.flush()
        cookie.get_ck()
        #     task = threading.Thread(target=cookie.get_ck,)
        #     task.start()  # ready, waiting for the CPU to schedule it
        #     increment = increment - 1
        current_time = time.time()
        if current_time - start_time > 3600:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' Main method terminated'
            )
            sys.exit(0)
        time.sleep(1)
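
A minimal usage sketch, assuming the libs.proxy and rds_model.db_redis helper modules are importable and both the proxy pool and the Douyin Redis instance are reachable. The script itself is normally launched with one positional argument (the intended thread count read from sys.argv[1], though the threaded path is commented out) and loops until the CookieHash pool reaches 1000 entries or an hour has passed; the snippet below only exercises the Cookie class directly and is illustrative, not part of the original script:

    # Hypothetical standalone use of the Cookie class from web_cookie.py;
    # the module and method names come from the script above, the rest is illustrative.
    from web_cookie import Cookie

    fresh = Cookie().get_ck()   # fetch cookies from douyin.com through a proxy and store them in Redis
    if fresh is None:
        print('all retries exhausted')
    pooled = Cookie.get()       # draw a random, non-expired cookie from the CookieHash pool
    print(pooled)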