# -*- coding: utf-8 -*-
"""Harvest Douyin web cookies through rotating proxies and pool them in Redis.

Run as a script, the module keeps requesting a Douyin profile page through a
proxy, turns the cookies of the response into a ``Cookie`` header string and
stores that string in the Redis hash ``CookieHash`` together with the time it
was stored.
"""
import datetime
import json
import random
import sys
import threading
import time
import uuid

import requests

from libs.proxy import Proxy
from rds_model.db_redis import DbRedis

# Wall-clock time at import; the main loop uses it to stop after one hour.
start_time = time.time()

# Timestamp layout used both for the values stored in Redis and for log lines.
_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _now_str():
    """Return the current local time formatted with ``_TIME_FORMAT``."""
    return time.strftime(_TIME_FORMAT, time.localtime())


class Cookie:
    """Pool of cookie strings kept in the Redis hash ``CookieHash``.

    Hash fields are the cookie strings themselves; each value is the local
    time at which the cookie was stored, formatted as ``%Y-%m-%d %H:%M:%S``.
    """

    # Shared connection, opened once when the class body is executed.
    redis = DbRedis.connect()
    # Not read anywhere in this file; kept so existing users of the attribute
    # keep working.
    cookie_info = ''

    # Name of the Redis hash that holds the pool.
    HASH_KEY = 'CookieHash'
    # Cookies at least this many seconds old are discarded when picked.
    MAX_AGE_SECONDS = 7200
    # Upper bound on HTTP attempts per get_ck() call (the original loop ran
    # while ``retry <= 10``, i.e. eleven times).
    MAX_ATTEMPTS = 11

    # Profile pages requested to obtain fresh cookies.
    _PROFILE_URLS = (
        "https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ",
        "https://www.douyin.com/user/MS4wLjABAAAA3Gq3QsbpkmIchOTXXF51Fy3Dyb0xF7rLvI3QEQjoYwo",
        "https://www.douyin.com/user/MS4wLjABAAAAO6GOEmNyHKo8Kd1IdS95J884BAFTpR8eRDOBIAb2d7VNGWFBzhR2odpulkKJFV95",
        "https://www.douyin.com/user/MS4wLjABAAAAc6xH1Jxur09z-Oy5M9IVpckmAyPlQg5uj_B8fFrTIiNv5B-XyH6G9RCQt3qLsZZU",
        "https://www.douyin.com/user/MS4wLjABAAAA5v-bl3BibVonLiHoZWW173nKKy_yn2DACHErPbOoo3g",
        "https://www.douyin.com/user/MS4wLjABAAAATjfBnSaWkl6ZJiznrCzkt7-l_7pkD4pLkKyPkLtD2VU",
        "https://www.douyin.com/user/MS4wLjABAAAAZ-hqjDujmsKhlhuNV1R4OPrIWJ0XmhFnuJmy9h3u4VkovGqJycoOVGaCj8uqwQiJ",
        "https://www.douyin.com/user/MS4wLjABAAAAoMNLc-_Vx_TDrJQvpGEtgZfpR99JDKf6n23mXpsnMGg",
        "https://www.douyin.com/user/MS4wLjABAAAAMhl868Pj7GIBYNVX46kjLGS_eiprGHaDHe5ffqi91_s",
        "https://www.douyin.com/user/MS4wLjABAAAAR8ow3aH-TjB2c4TJFqtDvFPhmd3TBFyHF1zMCLd39rdO45zpMZXAIvdwsQ4_7gw6",
    )

    @staticmethod
    def get():
        """Block until a non-expired cookie is available and return it.

        Polls :meth:`random_cookie` once per second until it yields a cookie.
        """
        while True:
            cookie = Cookie.random_cookie()
            if cookie is not None:
                return cookie
            # NOTE(review): the message talks about a proxy although this is
            # the cookie pool; left unchanged because it is runtime output.
            print('未拿到有效的代理')
            time.sleep(1)

    @staticmethod
    def random_cookie():
        """Return a random non-expired cookie, or ``None`` if there is none.

        Cookies found to be ``MAX_AGE_SECONDS`` old or older are removed from
        Redis as a side effect.
        """
        cookie_dict = Cookie.redis.hgetall(Cookie.HASH_KEY)
        if not cookie_dict:
            return None

        candidates = list(cookie_dict)
        now = int(time.time())
        # Stop once every candidate has been discarded; calling
        # random.choice() on an empty list would raise IndexError.
        while candidates:
            cookie = random.choice(candidates)
            stored_at = cookie_dict.get(cookie)
            if stored_at is None:
                candidates.remove(cookie)
                continue
            create_at = int(time.mktime(time.strptime(stored_at, _TIME_FORMAT)))
            if now - create_at >= Cookie.MAX_AGE_SECONDS:
                # Drop the expired cookie from the pool.
                Cookie.redis.hdel(Cookie.HASH_KEY, cookie)
                candidates.remove(cookie)
                continue
            return cookie
        return None

    @staticmethod
    def del_cookie(cookie):
        """Remove ``cookie`` from the pool if it is stored there.

        Does nothing when the cookie is not present in the hash.
        """
        if Cookie.redis.hget(Cookie.HASH_KEY, cookie) is None:
            return
        # Delete the invalid cookie. The earlier version only printed the
        # message below and never issued the delete.
        Cookie.redis.hdel(Cookie.HASH_KEY, cookie)
        print('删除失效Cookie:' + cookie)

    @staticmethod
    def set(cookie):
        """Store ``cookie`` in the pool, stamped with the current local time."""
        Cookie.redis.hset(Cookie.HASH_KEY, cookie, _now_str())

    def get_url(self):
        """Return one of the hard-coded profile URLs, chosen at random."""
        return random.choice(self._PROFILE_URLS)

    @staticmethod
    def _build_cookie_string(cookie_dict):
        """Join ``cookie_dict`` into a header string and append a random
        ``passport_csrf_token`` (32 hex characters)."""
        pairs = '; '.join('%s=%s' % (k, v) for k, v in cookie_dict.items())
        return pairs + '; passport_csrf_token=' + uuid.uuid4().hex

    def get_ck(self):
        """Fetch a profile page through a proxy and pool the cookies it sets.

        Makes up to ``MAX_ATTEMPTS`` requests, taking a new proxy from
        ``Proxy.rola_get()`` for each one. The first response that carries
        cookies is converted to a cookie string, stored via :meth:`set` and
        returned. Proxies that fail with ``ProxyError`` or ``ConnectTimeout``
        are handed to ``Proxy.rola_del_proxy``.

        Returns:
            The stored cookie string on success. Otherwise ``''`` if at least
            one response arrived without cookies, or ``None`` if every attempt
            raised.
        """
        # Requests are made with verify=False; silence the resulting
        # certificate warnings.
        requests.packages.urllib3.disable_warnings()
        headers = {
            'authority': 'www.douyin.com',
            'method': 'GET',
            'path': '/',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        }
        ck = None
        url = self.get_url()

        for _ in range(self.MAX_ATTEMPTS):
            proxy = Proxy.rola_get()
            proxies = {
                "http": "http://" + proxy,
                "https": "http://" + proxy,
            }
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies=proxies,
                    timeout=8,
                    verify=False,
                )
                cookie_dict = response.cookies.get_dict()
                ck = ''
                if cookie_dict:
                    # The string is never empty here (the CSRF token is
                    # always appended), so this is the success path.
                    ck = self._build_cookie_string(cookie_dict)
                    Cookie.set(ck)
                    print(f'{_now_str()} 数据获取成功!\n{ck}\n{Proxy.proxy_info}')
                    break
                print(
                    f'{_now_str()} 爬取http连接失败!'
                    f'{response.status_code}\n{Proxy.proxy_info}'
                )
                time.sleep(1)
            except requests.exceptions.ProxyError as e:
                print(f'{_now_str()} 代理过期!{e}{Proxy.proxy_info}')
                Proxy.rola_del_proxy(proxy)
            except requests.exceptions.ConnectTimeout as e:
                print(f'{_now_str()} ConnectTimeout!{e}\n{Proxy.proxy_info}')
                Proxy.rola_del_proxy(proxy)
            except Exception as e:
                # Best effort: log any other failure and try the next proxy.
                print(f'{_now_str()} 请求抛出异常!{e}\n{Proxy.proxy_info}')
        return ck


if __name__ == "__main__":
    print("主方法开始执行")
    # Number of parallel threads, taken from the first CLI argument. It is
    # parsed but not used below: cookies are fetched serially.
    threading_count = int(sys.argv[1])
    print(_now_str() + ' ' + ' 开始执行')

    while True:
        sys.stdout.flush()
        # Size of the cookie pool.
        key = 'CookieHash'
        cookie_dict = Cookie.redis.hgetall(key)
        if len(cookie_dict) >= 1000:
            # Pool is full: wait a minute, then leave the loop.
            time.sleep(60)
            break

        cookie = Cookie()
        cookie.get_ck()

        current_time = time.time()
        if current_time - start_time > 3600:
            print(_now_str() + ' 主方法执行终止')
            sys.exit(0)
        time.sleep(1)