# Scraper script: fetches the whole-station, commerce and popularity hourly
# rank lists for rooms returned by MysqlUserLiving and pushes each result
# through RdsDouyinHourlyRankList.
import json
import random
import sys
import threading
import time
import urllib
import urllib.parse

import requests

from libs.Xg04 import X_Gorgon
from libs.proxy import Proxy
from log.print_log import PrintLog
from rds_model.db_redis import DbRedis
from rds_model.rds_hourly_rank_list import RdsDouyinHourlyRankList
# Kept last, as in the original, so any names it exports still take precedence.
from libs.mysql_user_living import *
# Wall-clock time captured when the module is loaded; the __main__ loop
# compares against it and terminates once more than 300 seconds have passed.
start_time = time.time()
def get_random(i, random_type=1):
    """Return a random string of length ``i``.

    Args:
        i: Number of characters to generate.
        random_type: Selects the alphabet. ``1`` (the default) produces a
            decimal number with exactly ``i`` digits and no leading zero;
            ``8`` produces uppercase ASCII letters; any other value draws
            from ``"1234567890abcde"``.

    Returns:
        The generated string, or ``""`` when ``i`` is less than 1.
    """
    # Without this guard the numeric branch evaluated 10 ** -1 (a float) for
    # i == 0 and crashed inside randint, while the other branches already
    # returned an empty string for the same input.
    if i < 1:
        return ''

    if random_type == 1:
        return str(random.randint(10 ** (i - 1), 10 ** i - 1))

    if random_type == 8:
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    else:
        alphabet = "1234567890abcde"
    return ''.join(random.choice(alphabet) for _ in range(i))
def get_random_brand_type():
    """Return a random string shaped like ``AAA-BB00``.

    The result is three random uppercase letters, a dash, two more random
    uppercase letters and the literal suffix ``00``; the letters come from
    ``get_random`` with ``random_type=8``.
    """
    prefix = get_random(3, random_type=8)
    suffix = get_random(2, random_type=8)
    return prefix + '-' + suffix + '00'
def get_mc():
    """Return a random MAC-address-like string, e.g. ``"3A:0F:9C:11:B2:7E"``.

    The result has six colon-separated groups, each made of two characters
    drawn from ``"1234567890ABCDEF"``.
    """
    hex_chars = "1234567890ABCDEF"
    # join() replaces the original "+= ... ':'" accumulation followed by
    # slicing off the trailing separator.
    return ':'.join(
        random.choice(hex_chars) + random.choice(hex_chars)
        for _ in range(6)
    )
def _timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def _fetch_hourly_rank(query, room_id, pre_agent_headers=None, post_sign_headers=None):
    """Send one signed hourly-rank request through a proxy, with retries.

    Shared implementation behind the three public ``get_*_rank*_data``
    functions, which differ only in their query parameters and a few headers.

    Args:
        query: Mapping of query-string parameters, in the order they should
            appear in the URL (the X-Gorgon signature is computed over the
            encoded string).
        room_id: Room identifier, used only for log output here.
        pre_agent_headers: Optional headers inserted between ``Connection``
            and ``User-Agent``.
        post_sign_headers: Optional headers inserted between ``x-khronos``
            and ``sdk-version``.

    Returns:
        The parsed JSON body of the last response that could be parsed, or
        ``None`` if no attempt produced one.
        NOTE(review): when every attempt returns JSON without a ``data``
        field, that last payload is still returned (as the original code
        did) rather than ``None`` -- confirm callers expect this.
    """
    domain = 'webcast5-normal-c-lq.amemv.com'
    query_params = urllib.parse.urlencode(query)
    url = 'https://' + domain + '/webcast/ranklist/hour/?' + query_params
    signature = X_Gorgon(query_params, '')
    user_agent = 'okhttp/3.{}.{}.{}'.format(
        random.randint(0, 10), random.randint(0, 10), random.randint(1, 10)
    )

    # Headers are assembled in stages so each caller keeps the exact header
    # order of its original hand-written dict.
    headers = {'Host': domain, 'Connection': 'keep-alive'}
    headers.update(pre_agent_headers or {})
    headers.update({
        'User-Agent': user_agent,
        'accept-encoding': 'gzip, deflate',
        'x-gorgon': signature.get('X-Gorgon'),
        'x-khronos': signature.get('X-Khronos'),
    })
    headers.update(post_sign_headers or {})
    headers.update({
        'sdk-version': '2',
        'x-ss-dp': '1128',
        'x-tt-trace-id': '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
    })

    # str() keeps logging from raising TypeError when room_id is not a string
    # (previously that could escape from inside the exception handlers).
    room = str(room_id)
    response_json = None

    # The original loop broke once its counter exceeded 10, i.e. 11 attempts.
    for _attempt in range(11):
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy,
        }
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8,
            )
            if response.status_code == 200 and response.text:
                response_json = response.json()
                if response_json.get('data') is not None:
                    break
                print(
                    _timestamp()
                    + ' 数据获取失败!'
                    + '\n'
                    + room
                    + '\n'
                    + response.text
                    + str(Proxy.proxy_info)
                )
            else:
                print(
                    _timestamp()
                    + ' 爬取http连接失败!'
                    + str(response.status_code)
                    + '\n'
                    + str(Proxy.proxy_info)
                    + '\n'
                    + room
                    + '\n'
                )
                # NOTE(review): indentation was lost in the source; the pause
                # is assumed to belong to this HTTP-failure branch -- confirm.
                time.sleep(1)
        except requests.exceptions.ProxyError as e:
            print(
                _timestamp()
                + ' 代理过期!'
                + str(e)
                + '\n'
                + room
                + '\n'
                + str(Proxy.proxy_info)
            )
            # Drop the proxy that failed before the next attempt.
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            print(
                _timestamp()
                + ' ConnectTimeout!'
                + str(e)
                + '\n'
                + room
                + '\n'
                + str(Proxy.proxy_info)
            )
            Proxy.del_proxy(proxy)
        except Exception as e:
            # Best effort: log any other failure (including a non-JSON body)
            # and retry with a fresh proxy.
            print(
                _timestamp()
                + ' 请求抛出异常!'
                + str(e)
                + '\n'
                + room
                + '\n'
                + str(Proxy.proxy_info)
            )
    return response_json


def get_whole_station_rank_data(room_id, sec_anchor_id, anchor_id):
    """Fetch the whole-station hourly rank list (``rank_type`` 12) for a room.

    Args:
        room_id: Room identifier sent as ``room_id``.
        sec_anchor_id: Value sent as ``sec_anchor_id``.
        anchor_id: Value sent as ``anchor_id``.

    Returns:
        The parsed JSON response, or ``None`` if no response could be parsed.
    """
    rticket = str(int(time.time() * 1000))
    ts = int(time.time())
    # The original literal repeated "cdid" and "openudid" with identical
    # values; the duplicates are dropped, leaving key order and values as-is.
    query = {
        "manifest_version_code": "110001",
        "dpi": "480",
        "app_name": "aweme",
        "version_name": "11.0.0",
        "ts": ts,
        "cpu_support64": "true",
        "app_type": "normal",
        "ac": "wifi",
        "host_abi": "armeabi-v7a",
        "channel": "wandoujia_aweme_feisuo",
        "device_platform": "android",
        "iid": "3932281687270606",
        "version_code": "110000",
        "cdid": "1d06013c-ff0b-427b-b1ab-6700259c15c6",
        "openudid": "291f3ce2efe59345",
        "hour_info": "0",
        "room_id": room_id,
        "rank_type": "12",
        "sec_anchor_id": sec_anchor_id,
        "anchor_id": anchor_id,
        "webcast_sdk_version": "1510",
        "webcast_language": "zh",
        "webcast_locale": "zh_CN",
        "os_api": "23",
        "device_type": "HUAWEI MLA-AL10",
        "ssmix": "a",
        "update_version_code": "11009900",
        "device_id": '49388718822',
        "resolution": "1080*1800",
        "os_version": "6.0",
        "language": "zh",
        "device_brand": "HUAWEI",
        "_rticket": rticket,
        "aid": "1128",
    }
    return _fetch_hourly_rank(query, room_id)


def get_commerce_rank_list_data(room_id, sec_anchor_id, anchor_id):
    """Fetch the commerce hourly rank list (``rank_type`` 31) for a room.

    Args:
        room_id: Room identifier sent as ``room_id``.
        sec_anchor_id: Value sent as ``sec_anchor_id``.
        anchor_id: Accepted for signature parity with the sibling fetchers;
            this request does not send it.

    Returns:
        The parsed JSON response, or ``None`` if no response could be parsed.
    """
    rticket = str(int(time.time() * 1000))
    ts = int(time.time())
    query = {
        "style": "3",
        "hour_info": "0",
        "room_id": room_id,
        "rank_type": "31",
        "sec_anchor_id": sec_anchor_id,
        "webcast_sdk_version": "1710",
        "webcast_language": "zh",
        "webcast_locale": "zh_CN",
        "webcast_gps_access": "2",
        "os_api": "23",
        # NOTE(review): the "+" looks like a pre-encoded space and will be
        # percent-encoded again by urlencode -- left unchanged, confirm.
        "device_type": "HUAWEI+MLA-AL10",
        "ssmix": "a",
        "manifest_version_code": "130001",
        "dpi": "480",
        "app_name": "aweme",
        "version_name": "13.0.0",
        "ts": ts,
        "cpu_support64": "true",
        "storage_type": "0",
        "app_type": "normal",
        "appTheme": "dark",
        "ac": "wifi",
        "host_abi": "armeabi-v7a",
        "update_version_code": "13009900",
        "channel": "tengxun_new",
        "_rticket": rticket,
        "device_platform": "android",
        "iid": "2876750595379005",
        "version_code": "130000",
        # Fixed: the original passed the get_mc function object itself, which
        # urlencode serialised as "<function get_mc at 0x...>".
        "mac_address": get_mc(),
        "cdid": "81542dc6-2aca-4ff6-ac58-d94179e9d3e6",
        "openudid": "291f3ce2efe59345",
        "device_id": "49388718822",
        "resolution": "1080*1800",
        "os_version": "6.0",
        "language": "zh",
        "device_brand": "HUAWEI",
        "aid": "1128",
    }
    return _fetch_hourly_rank(
        query,
        room_id,
        pre_agent_headers={'Cache-Control': 'max-age=0'},
        post_sign_headers={'passport-sdk-version': '18'},
    )


def get_popularity_rank_list_data(room_id, sec_anchor_id, anchor_id):
    """Fetch the popularity hourly rank list for a room.

    Args:
        room_id: Room identifier sent as ``room_id``.
        sec_anchor_id: Value sent as ``sec_anchor_id``.
        anchor_id: Value sent as ``anchor_id``.

    Returns:
        The parsed JSON response, or ``None`` if no response could be parsed.
    """
    rticket = str(int(time.time() * 1000))
    ts = int(time.time())
    query = {
        "anchor_id": anchor_id,
        "room_id": room_id,
        "sec_anchor_id": sec_anchor_id,
        "sec_user_id": "null",
        "webcast_sdk_version": "2010",
        "webcast_language": "zh",
        "webcast_locale": "zh_CN",
        "webcast_gps_access": "2",
        # NOTE(review): this value is already percent-encoded and gets encoded
        # a second time by urlencode -- left unchanged, confirm intent.
        "current_network_quality_info": "%7B%22http_rtt%22%3A110%2C%22tcp_rtt%22%3A90%2C%22quic_rtt%22%3A90%2C%22downstream_throughput_kbps%22%3A8185%2C%22video_download_speed%22%3A411%2C%22quic_receive_loss_rate%22%3A-1%2C%22quic_send_loss_rate%22%3A-1%2C%22net_effective_connection_type%22%3A5%7D",
        "os_api": "23",
        "device_type": "HUAWEI+MLA-AL10",
        "ssmix": "a",
        "manifest_version_code": "160001",
        "dpi": "480",
        "app_name": "aweme",
        "version_name": "16.0.0",
        "ts": ts,
        "cpu_support64": "true",
        "app_type": "normal",
        "appTheme": "dark",
        "ac": "wifi",
        "host_abi": "armeabi-v7a",
        "update_version_code": "16009900",
        "channel": "wandoujia_lesi_1128_0507",
        "_rticket": rticket,
        "device_platform": "android",
        "iid": "273107070769192",
        "version_code": "160000",
        "cdid": "09e904ed-66a9-4e89-9661-afae7f61e6c5",
        "openudid": "291f3ce2efe59345",
        "device_id": "49388718822",
        "resolution": "1080*1800",
        "os_version": "6.0",
        "language": "zh",
        "device_brand": "HUAWEI",
        "aid": "1128",
        "minor_status": "0",
    }
    return _fetch_hourly_rank(
        query,
        room_id,
        pre_agent_headers={
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
        },
        post_sign_headers={'passport-sdk-version': '18'},
    )
def scrape(room_id,sec_anchor_id,anchor_id):
    """Fetch the three hourly rank lists for one room and push each result.

    Thread target used by the ``__main__`` loop. Each list is handled
    independently: a list that cannot be fetched (or raises) is logged and
    skipped, and the remaining lists are still processed. Previously the
    first missing list called ``sys.exit(0)``, discarding lists that had
    already been fetched successfully.

    Args:
        room_id: Room identifier forwarded to the fetchers.
        sec_anchor_id: Forwarded to the fetchers.
        anchor_id: Forwarded to the fetchers.

    Returns:
        None. Each result is serialised as ``{"data": ..., "extra": {}}``
        and handed to the matching ``RdsDouyinHourlyRankList.push_*`` method.
    """
    rds_list = RdsDouyinHourlyRankList()
    time.sleep(0.1)

    # (log label, fetcher, push method): whole-station, commerce, popularity.
    rank_jobs = (
        ('全站小时榜', get_whole_station_rank_data, rds_list.push_whole_station_data_list),
        ('带货小时榜', get_commerce_rank_list_data, rds_list.push_commerce_data_list),
        ('人气小时榜', get_popularity_rank_list_data, rds_list.push_popularity_data_list),
    )

    for label, fetch, push in rank_jobs:
        try:
            response_json = fetch(
                room_id=room_id,
                sec_anchor_id=sec_anchor_id,
                anchor_id=anchor_id,
            )
            if response_json is None:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' '
                    + label
                    + '数据获取失败!响应数据为空!'
                )
                continue
            push(json.dumps({
                "data": response_json.get('data'),
                "extra": {}
            }))
        except Exception as e:
            # Top-level boundary of the worker thread: log and move on to the
            # next list instead of terminating the thread via sys.exit().
            print(
                time.strftime("%H:%M:%S", time.localtime())
                + ' '
                + '数据异常:'
                + str(e)
            )
if __name__ == "__main__":
    # Entry point: keeps up to sys.argv[1] scrape() threads running and exits
    # once more than 300 seconds have passed since start_time.
    print("主方法开始执行")
    # Number of worker threads to run in parallel.
    threading_count = int(sys.argv[1])
    rds = RdsDouyinHourlyRankList()
    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' 开始执行,更新直播队列长度:'
        + str(rds.get_len())
    )
    while True:
        sys.stdout.flush()

        # Exclude the main thread from the count of running workers.
        active_count = threading.active_count() - 1

        # How many new workers are needed to reach the requested parallelism.
        increment = threading_count - active_count

        while increment > 0:
            sys.stdout.flush()
            room_info = MysqlUserLiving().get_living_info()
            if room_info is None:
                # Nothing to scrape right now. Break (instead of the original
                # `continue`) so the run-time limit below is still checked;
                # otherwise this loop could sleep forever past the deadline.
                time.sleep(60)
                break

            room_id, anchor_id, sec_anchor_id = room_info

            task = threading.Thread(target=scrape, args=(room_id, sec_anchor_id, anchor_id,))
            task.start()  # ready; waits to be scheduled
            increment -= 1

        if time.time() - start_time > 300:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 主方法执行终止'
            )
            sys.exit(0)
        time.sleep(0.01)
|