123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255 |
import datetime
import json
import random
import sys
import threading
import time
import uuid

import requests

from libs.proxy import Proxy
from rds_model.db_redis import DbRedis
# -*- coding: utf-8 -*-
# Epoch timestamp (seconds) taken when the module is loaded.  The script's
# main loop compares the current time against it to stop after one hour.
start_time = time.time()
class Cookie:
    """Pool of Douyin web cookies kept in the Redis hash ``CookieHash``.

    Each hash field is a complete cookie header string; its value is the
    local time at which the cookie was stored, formatted as
    ``%Y-%m-%d %H:%M:%S``.  Cookies older than two hours are treated as
    expired.
    """

    # Redis client shared by every method; created when the class is defined.
    redis = DbRedis.connect()
    # Not referenced by any method visible in this class.
    cookie_info = ''

    _HASH_KEY = 'CookieHash'
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    # Cookies at least this many seconds old are considered expired.
    _MAX_AGE_SECONDS = 7200
    # The original retry counter allowed 11 requests before giving up.
    _MAX_ATTEMPTS = 11

    # Public profile pages requested to make the site issue fresh cookies.
    _PROFILE_URLS = (
        "https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ",
        "https://www.douyin.com/user/MS4wLjABAAAA3Gq3QsbpkmIchOTXXF51Fy3Dyb0xF7rLvI3QEQjoYwo",
        "https://www.douyin.com/user/MS4wLjABAAAAO6GOEmNyHKo8Kd1IdS95J884BAFTpR8eRDOBIAb2d7VNGWFBzhR2odpulkKJFV95",
        "https://www.douyin.com/user/MS4wLjABAAAAc6xH1Jxur09z-Oy5M9IVpckmAyPlQg5uj_B8fFrTIiNv5B-XyH6G9RCQt3qLsZZU",
        "https://www.douyin.com/user/MS4wLjABAAAA5v-bl3BibVonLiHoZWW173nKKy_yn2DACHErPbOoo3g",
        "https://www.douyin.com/user/MS4wLjABAAAATjfBnSaWkl6ZJiznrCzkt7-l_7pkD4pLkKyPkLtD2VU",
        "https://www.douyin.com/user/MS4wLjABAAAAZ-hqjDujmsKhlhuNV1R4OPrIWJ0XmhFnuJmy9h3u4VkovGqJycoOVGaCj8uqwQiJ",
        "https://www.douyin.com/user/MS4wLjABAAAAoMNLc-_Vx_TDrJQvpGEtgZfpR99JDKf6n23mXpsnMGg",
        "https://www.douyin.com/user/MS4wLjABAAAAMhl868Pj7GIBYNVX46kjLGS_eiprGHaDHe5ffqi91_s",
        "https://www.douyin.com/user/MS4wLjABAAAAR8ow3aH-TjB2c4TJFqtDvFPhmd3TBFyHF1zMCLd39rdO45zpMZXAIvdwsQ4_7gw6",
    )

    @staticmethod
    def _timestamp():
        """Return the current local time formatted for log lines and storage."""
        return time.strftime(Cookie._TIME_FORMAT, time.localtime())

    @staticmethod
    def _stored_epoch(stamp):
        """Convert a stored ``%Y-%m-%d %H:%M:%S`` local-time string to epoch seconds."""
        return int(time.mktime(time.strptime(stamp, Cookie._TIME_FORMAT)))

    @staticmethod
    def _format_cookie(cookie_dict):
        """Join cookie name/value pairs into a header string.

        A random 32-hex-digit ``passport_csrf_token`` is appended after the
        cookies taken from ``cookie_dict``.
        """
        pairs = '; '.join('%s=%s' % (name, value) for name, value in cookie_dict.items())
        return pairs + '; passport_csrf_token=' + uuid.uuid4().hex

    @staticmethod
    def get():
        """Block until a non-expired cookie is available and return it.

        Polls ``random_cookie`` once per second while the pool has nothing
        usable.
        """
        while True:
            cookie = Cookie.random_cookie()
            if cookie is not None:
                return cookie
            # The message text says "proxy"; it is kept verbatim so existing
            # log searches keep matching.
            print('未拿到有效的代理')
            time.sleep(1)

    @staticmethod
    def random_cookie():
        """Return a random non-expired cookie, or ``None`` if there is none.

        Expired cookies encountered while sampling are removed from Redis.
        """
        cookie_dict = Cookie.redis.hgetall(Cookie._HASH_KEY)
        if not cookie_dict:
            return None

        candidates = list(cookie_dict)
        now = int(time.time())
        # Stop once every candidate has been discarded: random.choice() raises
        # IndexError on an empty list.
        while candidates:
            cookie = random.choice(candidates)
            stamp = cookie_dict.get(cookie)
            if stamp is None:
                # Drop the entry locally so it cannot be drawn forever.
                candidates.remove(cookie)
                continue
            if now - Cookie._stored_epoch(stamp) >= Cookie._MAX_AGE_SECONDS:
                # Purge the expired cookie from the pool.
                Cookie.redis.hdel(Cookie._HASH_KEY, cookie)
                candidates.remove(cookie)
                continue
            return cookie
        return None

    @staticmethod
    def del_cookie(cookie):
        """Remove ``cookie`` from the pool because it no longer works.

        Does nothing when the cookie is not present in the hash.
        """
        stamp = Cookie.redis.hget(Cookie._HASH_KEY, cookie)
        if stamp is None:
            return

        created_at = Cookie._stored_epoch(stamp)
        now = int(time.time())
        # NOTE(review): ``created_at - now`` is only >= 7200 for a timestamp at
        # least two hours in the future, so this guard practically never
        # fires.  The operands may be swapped; it is kept as-is because the
        # intended policy is not visible here - confirm with the owner.
        if created_at - now >= Cookie._MAX_AGE_SECONDS:
            return
        # Actually delete the invalid cookie; previously the deletion was only
        # logged and the entry stayed in Redis.
        Cookie.redis.hdel(Cookie._HASH_KEY, cookie)
        print('删除失效Cookie:' + cookie)

    @staticmethod
    def set(cookie):
        """Store ``cookie`` in the pool, stamped with the current local time."""
        Cookie.redis.hset(Cookie._HASH_KEY, cookie, Cookie._timestamp())

    def get_url(self):
        """Return one of the known profile URLs, chosen at random."""
        return random.choice(Cookie._PROFILE_URLS)

    def get_ck(self):
        """Fetch a profile page through a proxy and store the cookies it sets.

        Makes up to 11 requests to one randomly chosen URL, taking a new
        proxy from ``Proxy.rola_get()`` for each request.  The first response
        that carries cookies is turned into a cookie string, saved with
        ``Cookie.set`` and returned.

        Returns:
            The cookie string on success.  If every attempt fails, the last
            intermediate value is returned: ``None`` when no request
            completed, ``''`` when a response arrived without cookies.
        """
        # Suppress the warnings urllib3 emits for verify=False requests.
        requests.packages.urllib3.disable_warnings()
        headers = {
            'authority': 'www.douyin.com',
            'method': 'GET',
            'path': '/',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        }
        url = self.get_url()
        ck = None

        for _ in range(Cookie._MAX_ATTEMPTS):
            # presumably a "host:port" string - verify against libs.proxy
            proxy = Proxy.rola_get()
            proxies = {
                "http": "http://" + proxy,
                "https": "http://" + proxy,
            }
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies=proxies,
                    timeout=8,
                    verify=False,
                )
                cookie_dict = response.cookies.get_dict()
                ck = ''
                if not cookie_dict:
                    print(
                        Cookie._timestamp()
                        + ' 爬取http连接失败!'
                        + str(response.status_code)
                        + '\n'
                        + Proxy.proxy_info
                    )
                    time.sleep(1)
                    continue

                # The string is never empty here because the CSRF token is
                # always appended, so the former "data fetch failed" branch
                # was unreachable and has been dropped.
                ck = Cookie._format_cookie(cookie_dict)
                Cookie.set(ck)
                print(
                    Cookie._timestamp()
                    + ' 数据获取成功!'
                    + '\n'
                    + ck
                    + '\n'
                    + Proxy.proxy_info
                )
                break
            except requests.exceptions.ProxyError as e:
                print(
                    Cookie._timestamp()
                    + ' 代理过期!'
                    + str(e)
                    + Proxy.proxy_info
                )
                # The proxy is unusable; remove it from the proxy pool.
                Proxy.rola_del_proxy(proxy)
            except requests.exceptions.ConnectTimeout as e:
                print(
                    Cookie._timestamp()
                    + ' ConnectTimeout!'
                    + str(e)
                    + '\n'
                    + Proxy.proxy_info
                )
                Proxy.rola_del_proxy(proxy)
            except Exception as e:
                # Best effort: log any other failure and try the next proxy.
                print(
                    Cookie._timestamp()
                    + ' 请求抛出异常!'
                    + str(e)
                    + '\n'
                    + Proxy.proxy_info
                )
        return ck
def main():
    """Top up the ``CookieHash`` pool until it is full or an hour has passed.

    Fetches one cookie per second.  When the hash already holds 1000 or more
    entries, waits 60 seconds and returns.  Once more than 3600 seconds have
    elapsed since ``start_time``, exits the process with status 0.

    Command line: an optional integer, the number of parallel threads.
    """
    print("主方法开始执行")
    # Number of parallel threads.  The threaded mode is disabled, so the
    # value is parsed for command-line compatibility but not used; a missing
    # argument no longer raises IndexError.
    threading_count = int(sys.argv[1]) if len(sys.argv) > 1 else 1
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' 开始执行'
    )
    while True:
        sys.stdout.flush()

        # Check how many cookies the pool currently holds.
        cookie_dict = Cookie.redis.hgetall('CookieHash')
        if len(cookie_dict) >= 1000:
            time.sleep(60)
            break

        Cookie().get_ck()

        if time.time() - start_time > 3600:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 主方法执行终止'
            )
            sys.exit(0)
        time.sleep(1)


if __name__ == "__main__":
    main()
|