123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302 |
- import json,time,sys,threading,warnings
- from rds_model.db_redis import DbRedis
- from log.print_log import PrintLog
- from libs.proxy import Proxy
- from web_dy import *
- from rds_model.rds_user_video_list import RdsUserVideoList
- from web_cookie import Cookie
- start_time = time.time()
# -*- coding: utf-8 -*-  (informational only: Python honours this declaration only on line 1 or 2)
def set_score(flag):
    """Bump one counter of the scrape score stored through ``RdsUserVideoList``.

    The score is persisted as one string of the form
    ``"<total>@@@<success>@@@<fail>"``.  ``flag`` selects the counter to
    increment:

    * ``'success'`` -> the success counter
    * ``'fail'``    -> the fail counter
    * ``'all'``     -> the total counter

    When no score is stored yet (``get_score()`` returns ``None``) an initial
    string is written instead: total is 1, the selected counter is 1 and the
    remaining counters are 0.  Any other ``flag`` value leaves the stored
    score untouched.

    NOTE(review): the read-modify-write below is not atomic.  This function is
    called from several worker threads, so increments may be lost unless the
    storage layer serialises access -- confirm.

    Args:
        flag: ``'success'``, ``'fail'`` or ``'all'``.

    Returns:
        Always ``None``.

    Raises:
        IndexError, ValueError: if the stored score does not contain three
            integer fields separated by ``@@@``.
    """
    # flag -> (position of the counter to bump, value written when nothing is stored)
    rules = {
        'success': (1, '1@@@1@@@0'),
        'fail': (2, '1@@@0@@@1'),
        'all': (0, '1@@@0@@@0'),
    }

    rds = RdsUserVideoList()
    if flag not in rules:
        return None
    position, initial_score = rules[flag]

    stored = rds.get_score()
    if stored is None:
        new_score = initial_score
    else:
        fields = stored.split('@@@')
        # Index explicitly so a short/corrupt value still raises IndexError.
        counters = [int(fields[i]) for i in range(3)]
        counters[position] += 1
        new_score = '@@@'.join(str(value) for value in counters)

    rds.record_score(new_score)
    return None
def get_signature(url=None, method='_signature'):
    """Compute a request signature by running the bundled JavaScript signer.

    Reads ``/mnt/shop_live_scraper/signature.js``, compiles it with ``execjs``
    and calls the JavaScript function named ``method`` with ``url`` (line
    breaks removed) as its only argument.

    NOTE(review): ``execjs`` is not imported by name in this file; it is
    presumably re-exported by ``from web_dy import *`` -- confirm.

    Args:
        url: The URL to sign.  Required, even though the signature keeps the
            historical ``None`` default.
        method: Name of the JavaScript function to invoke.

    Returns:
        Whatever the JavaScript function returns.

    Raises:
        ValueError: if ``url`` is ``None``.
        OSError: if the signature script cannot be read.
    """
    # Fail fast with a clear message instead of an AttributeError raised only
    # after the script has been read and compiled.
    if url is None:
        raise ValueError('get_signature() requires a url to sign')

    with open('/mnt/shop_live_scraper/signature.js', 'r', encoding='utf-8') as f:
        script_source = f.read()

    signer = execjs.compile(script_source)
    return signer.call(method, url.replace('\n', ''))
def _now_str():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS`` for log lines."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def get_user_videos(sec_user_id, max_cursor=0, count=20, max_retry=20, timeout=8):
    """Fetch one page of a user's posted videos from the Douyin web API.

    Builds the signed ``/aweme/v1/web/aweme/post/`` URL, then requests it
    through a fresh proxy from ``Proxy.get()`` on every attempt until a
    response containing ``aweme_list`` is received or the attempts run out.
    When all attempts fail the cookie that was used is discarded with
    ``Cookie.del_cookie``.

    Args:
        sec_user_id: The ``sec_user_id`` query value identifying the user.
        max_cursor: Paging cursor sent as ``max_cursor``.
        count: Page size sent as ``count``.
        max_retry: Number of retries after the first attempt, i.e. at most
            ``max_retry + 1`` requests are made (21 with the default).
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        * ``None`` if no cookie is available or no attempt ever produced a
          JSON body.
        * The decoded JSON of the successful response (has ``aweme_list``).
        * If every attempt failed, the JSON of the most recent HTTP-200
          response that lacked ``aweme_list`` -- callers must therefore still
          check for ``aweme_list`` themselves.
    """
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

    param = {
        'device_platform': 'webapp',
        'aid': '6383',
        'channel': 'channel_pc_web',
        'sec_user_id': sec_user_id,
        'max_cursor': str(max_cursor),
        'count': str(count),
        'publish_video_strategy_type': '2',
        'version_code': '160100',
        'version_name': '16.1.0',
        'cookie_enabled': 'true',
        'screen_width': '1920',
        'screen_height': '1080',
        'browser_language': 'zh-CN',
        'browser_platform': 'Win32',
        'browser_name': 'Mozilla',
        'browser_version': ua.replace('Mozilla/', ''),
        'browser_online': 'true',
    }
    url = 'https://www.douyin.com/aweme/v1/web/aweme/post/?' + parse.urlencode(param)

    # The signature is computed over the unsigned URL and appended last.
    _signature = get_signature(url)
    url += '&_signature=' + quote(_signature)

    ck = Cookie.get()
    if ck is None:
        print('获取cookie失败')
        return None

    headers = {
        "authority": "www.douyin.com",
        "method": "GET",
        "path": str(url).replace('https://www.douyin.com', ''),
        "scheme": "https",
        "accept": "application/json, text/plain, */*",
        # NOTE(review): "accept-encoding" was commented out in the original
        # and is deliberately still not sent; the reason is not visible here.
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": ck,
        "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
        "user-agent": ua,
        "withcredentials": "true",
    }

    def report_error(label, error):
        # Shared layout of the three exception log lines.  str() keeps the
        # handler itself from raising if proxy_info is not a string.
        print(
            _now_str()
            + label
            + str(error)
            + '\n'
            + str(sec_user_id)
            + '\n'
            + str(Proxy.proxy_info)
        )

    response_json = None
    for _attempt in range(max_retry + 1):
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy,
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=timeout,
            )
            if response.status_code == 200 and response.text:
                response_json = response.json()
                if response_json.get('aweme_list') is not None:
                    print(_now_str() + ' 数据获取成功!' + '\n' + str(sec_user_id))
                    break
                print(
                    _now_str()
                    + ' 数据获取失败!'
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                    + response.text
                    + str(Proxy.proxy_info)
                )
            else:
                print(
                    _now_str()
                    + ' 爬取http连接失败!'
                    + str(response.status_code)
                    + '\n'
                    + str(Proxy.proxy_info)
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                )
            # Back off briefly after any completed but unsuccessful response.
            time.sleep(1)
        except requests.exceptions.ProxyError as e:
            report_error(' 代理过期!', e)
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            report_error(' ConnectTimeout!', e)
            Proxy.del_proxy(proxy)
        except Exception as e:
            # Best effort: any other failure (bad JSON, read timeout, ...)
            # is logged and the request is retried with a new proxy.
            report_error(' 请求抛出异常!', e)
    else:
        # Every attempt failed: treat the cookie as unusable.
        Cookie.del_cookie(ck)

    return response_json
def scrape():
    """Worker task: fetch one queued user's video list and store the result.

    Takes one request parameter from ``RdsUserVideoList.get_request_param()``
    and uses it as the ``sec_user_id`` for ``get_user_videos``.  On a
    non-empty ``aweme_list`` the success score is bumped and
    ``"<sec_user_id>@@@<json>"`` is pushed with ``push_data_list``; an empty
    or missing list, or any exception, bumps the fail score instead.

    The function ends with a plain ``return`` rather than ``sys.exit(0)``:
    inside a thread both merely end the worker, but ``sys.exit`` would stop
    the whole process if ``scrape()`` were ever called directly.

    Returns:
        Always ``None``.
    """
    rds = RdsUserVideoList()
    user_info = rds.get_request_param()
    if user_info is None:
        # Nothing queued.
        return None

    sec_user_id = str(user_info)
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + sec_user_id
    )

    try:
        videos = get_user_videos(sec_user_id=sec_user_id, max_cursor=0, count=20)

        if videos is None:
            # No response body at all; this path does not touch the score.
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 数据获取失败!响应数据为空!'
                + '\n'
                + sec_user_id
                + '\n'
            )
            return None

        awemes = videos.get('aweme_list') if isinstance(videos, dict) else None
        if awemes:
            set_score('success')
            rds.push_data_list(sec_user_id + '@@@' + json.dumps(videos))
        else:
            set_score('fail')
    except Exception as e:
        set_score('fail')
        print(
            time.strftime("%H:%M:%S", time.localtime())
            + ' '
            + sec_user_id
            + '数据异常:'
            + str(e)
        )
    return None
if __name__ == '__main__':
    # Entry point: keep a fixed number of ``scrape`` worker threads running
    # and stop spawning new ones once the script has run for an hour.
    print("主方法开始执行")

    # Target number of concurrent worker threads, from the first CLI argument.
    try:
        threading_count = int(sys.argv[1])
    except (IndexError, ValueError):
        # Exit with a usage hint instead of an IndexError/ValueError traceback.
        sys.exit('usage: python ' + sys.argv[0] + ' <thread_count>')

    rds = RdsUserVideoList()
    warnings.filterwarnings("ignore")
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' 开始执行,用户队列长度:'
        + str(rds.get_len())
    )

    while True:
        sys.stdout.flush()

        # Running workers, not counting the main thread.
        active_count = threading.active_count() - 1

        # Top the pool back up; range() of a non-positive number spawns nothing.
        for _ in range(threading_count - active_count):
            sys.stdout.flush()
            threading.Thread(target=scrape, args=()).start()

        # start_time is set at import time near the top of this file.
        if time.time() - start_time > 3600:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 主方法执行终止'
            )
            sys.exit(0)
        time.sleep(1)
|