Update scripts

chenzhiyuan 3 years ago
commit 361c5eb0f8

+ 10 - 152
douyin_user_scraper.py

@@ -6,10 +6,7 @@ import uuid
 import json
 from libs.aesgzip import tt_encrypt
 from xlog03 import *
-import sys
 from rds_model.db_redis import DbRedis
-from log.print_log import PrintLog
-from urllib.request import ProxyHandler,build_opener
 
 def get_mc():
     def a():
@@ -44,42 +41,11 @@ def get_random(i, random_type=1):
         salt = ''.join(sa)
         return salt
 
-
-def get_random_brand_type():
-    brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
-    return brand_type
-
-def get_scraper_proxy(proxy_key):
-    proxy_dict = redisModel.hgetall(proxy_key)
-    if (proxy_dict is None) or (len(proxy_dict) == 0):
-        return
-    proxy_list = list(proxy_dict)
-
-    now = int(time.time())
-    while True:
-        proxy = random.choice(proxy_list)
-        if proxy is None:
-            return
-        proxy_info = proxy_dict.get(proxy)
-        if proxy_info is None:
-            continue
-        proxy_info = json.loads(proxy_info)
-        expire_at = int(proxy_info.get('min_expired_at'))
-        # drop expired proxies
-        if expire_at <= now:
-            redisModel.hdel(proxy_key, proxy)
-            proxy_list.remove(proxy)
-            continue
-
-        return proxy
-
 V1 = '6'
 V2 = '8'
 channel = 'huawei'
 device_type = 'RKK-YZ00'
 device_brand = 'HUAWEI'
-# redisModel = DbRedis.douyin_connect()
-redisModel = DbRedis.connect()
 
 print(channel, device_type)
 
@@ -273,18 +239,6 @@ class DouYinApi:
         douyin_url = douyin_url + "&_rticket=" + str(int(round(time.time() * 1000))) + "&ts=" + str(int(time.time()))
         return douyin_url
 
-    def get_user_info(self, user_id):
-        """Fetch a user's profile info
-        :param user_id: user ID
-        :return:
-        """
-        params = {
-            'user_id': user_id
-        }
-
-        douyin_url = 'https://aweme-eagle.snssdk.com/aweme/v1/user/?'
-        return self.__http_get(douyin_url, params)
-
     def __http_get(self, url, query_params=None):
         if query_params is None:
             query_params = {}
@@ -339,110 +293,14 @@ class DouYinApi:
         headers.update(sign)
         return headers
 
+    def get_user_info(self, user_id):
+        """Fetch a user's profile info
+        :param user_id: user ID
+        :return:
+        """
+        params = {
+            'user_id': user_id
+        }
 
-
-if __name__ == '__main__':
-    import warnings
-    warnings.filterwarnings("ignore")
-    param = str(sys.argv[1])
-
-    rkey = 'DOUYIN_USER_REGISTER_QUEUE'
-    pKey = 'IpProxyHashDaiLiYun'
-    uKey = 'DOUYIN_USER_SCRAPER_QUEUE'
-    sKey = 'DOUYIN_USER_INTODB_QUEUE'
-
-    cur_time = int(time.time())
-    if param == 'register':
-        print('=========== Register ===========')
-        PrintLog.print('=========== Register ===========')
-        while True:
-            now_time = int(time.time())
-            if (now_time - cur_time) > 270:
-                break
-
-            rltime = time.strftime("%H:%M:%S", time.localtime())
-            try:
-                llen = redisModel.llen(rkey)
-                if llen < 200:
-                    proxies = {
-                        "http": "http://lum-customer-c_44eb09ec-zone-data_center:4jme8s5hf9x6@zproxy.lum-superproxy.io:22225",
-                        "https": "http://lum-customer-c_44eb09ec-zone-data_center:4jme8s5hf9x6@zproxy.lum-superproxy.io:22225"
-                    }
-
-                    print(rltime + ' register proxy: ' + str(proxies))
-                    PrintLog.print(rltime + ' register proxy: ' + str(proxies))
-                    douApi = DouYinApi('', proxies)
-
-                    result = douApi.register_device()
-                    print(rltime + ' result: ' + str(result))
-                    PrintLog.print(rltime + ' result: ' + str(result))
-
-                    json_data = json.dumps(result)
-                    redisModel.lpush(rkey, json_data)
-                    print(rltime + ' registered successfully')
-                    PrintLog.print(rltime + ' registered successfully')
-                else:
-                    time.sleep(3)
-            except Exception as e:
-                print(rltime + ' error: ' + str(e))
-                PrintLog.print(rltime + ' error: ' + str(e))
-                continue
-
-    if param == 'scraper':
-        print('=========== Scraper ===========')
-
-        while True:
-            now_time = int(time.time())
-            if (now_time - cur_time) > 270:
-                break
-
-            sltime = time.strftime("%H:%M:%S", time.localtime())
-            try:
-                proxy = get_scraper_proxy(pKey)
-                if proxy:
-                    proxies = {
-                        "http": "http://" + proxy,
-                        "https": "http://" + proxy
-                    }
-                else:
-                    time.sleep(2)
-                    continue
-
-                print(sltime + ' scrape proxy: ' + str(proxies))
-                PrintLog.print(sltime + ' scrape proxy: ' + str(proxies))
-                douApi = DouYinApi('', proxies)
-
-                rllen = redisModel.llen(rkey)
-                if rllen == 0:
-                    time.sleep(2)
-                    continue
-
-                json_data = redisModel.rpop(rkey)
-                dict_data = json.loads(json_data)
-                device_id, iid, udid, openudid, cookie = dict_data['device_id'], dict_data['iid'], dict_data['uuid'], \
-                                                         dict_data['openudid'], dict_data['cookie']
-                douApi.init_device_ids(device_id, iid, udid, openudid)
-
-                ullen = redisModel.llen(uKey)
-                if ullen == 0:
-                    time.sleep(2)
-                    continue
-
-                user_id = redisModel.rpop(uKey)
-                response = douApi.get_user_info(user_id)
-                if len(response.text) > 0 and response.json()['status_code'] == 0 and response.json()['user']:
-                    print(sltime + ' user_id:' + user_id + ' scraped successfully')
-                    PrintLog.print(sltime + ' user_id:' + user_id + ' scraped successfully')
-                    redisModel.lpush(sKey, response.text)
-                    redisModel.lpush(rkey, json_data)
-                else:
-                    print(sltime + ' user_id:' + user_id + ' scrape failed')
-                    PrintLog.print(sltime + ' user_id:' + user_id + ' scrape failed')
-                    scraper_time = dict_data['times']
-                    if scraper_time < 10:
-                        dict_data['times'] += 1
-                        redisModel.lpush(rkey, json.dumps(dict_data))
-            except Exception as e:
-                print(sltime + ' error: ' + str(e))
-                PrintLog.print(sltime + ' error: ' + str(e))
-                continue
+        douyin_url = 'https://aweme-eagle.snssdk.com/aweme/v1/user/?'
+        return self.__http_get(douyin_url, params)
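The deleted get_scraper_proxy helper implemented a pick-and-prune pattern: choose a random proxy from a Redis hash whose values carry a min_expired_at timestamp, delete it if expired, and retry until a live one is found. A minimal standalone sketch of that pattern, assuming a redis-py client with decode_responses=True (the function name here is illustrative, not from the repo):

# Pick a random live proxy from a Redis hash whose values are JSON blobs
# like {"min_expired_at": <unix ts>}; prune expired entries as we go.
import json
import random
import time

def pick_live_proxy(redis_client, proxy_key):
    proxy_dict = redis_client.hgetall(proxy_key)
    if not proxy_dict:
        return None
    candidates = list(proxy_dict)
    now = int(time.time())
    while candidates:
        proxy = random.choice(candidates)
        info = json.loads(proxy_dict[proxy])
        if int(info.get('min_expired_at', 0)) <= now:
            # Expired: remove from the pool and try another candidate.
            redis_client.hdel(proxy_key, proxy)
            candidates.remove(proxy)
            continue
        return proxy
    return None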

+ 282 - 0
douyin_hourly_ranklist_commerce_scraper.py

@@ -0,0 +1,282 @@
+from rds_model.rds_hourly_rank_list import RdsDouyinHourlyRankList
+import time
+import json
+import sys
+import threading
+import random
+import urllib
+import requests
+
+from libs.Xg04 import X_Gorgon
+from libs.proxy import Proxy
+from libs.mysql_user_living import *
+from libs.db_redis import DbRedis
+
+start_time = time.time()
+
+def get_random(i, random_type=1):
+    if random_type == 1:
+        return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
+    elif random_type == 8:
+        seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        sa = []
+        for i in range(i):
+            sa.append(random.choice(seed))
+        salt = ''.join(sa)
+        return salt
+    else:
+        seed = "1234567890abcde"
+        sa = []
+        for i in range(i):
+            sa.append(random.choice(seed))
+        salt = ''.join(sa)
+        return salt
+
+def get_trace():
+    trace_list = [
+        '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
+        '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
+        '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
+        '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
+        '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
+        '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
+        '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
+        '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
+    ]
+
+    return random.choice(trace_list)
+
+def get_random_brand_type():
+    brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
+    return brand_type
+
+def get_mc():
+    def a():
+        seed = "1234567890ABCDEF"
+        sa = []
+        for i in range(2):
+            sa.append(random.choice(seed))
+        salt = ''.join(sa)
+        return salt
+
+    k = ''
+    for i in range(6):
+        k += a() + ':'
+    return k[:-1]
+
+def get_commerce_rank_list_data(room_id, sec_anchor_id, anchor_id, result):
+
+    domain = 'webcast5-normal-c-lq.amemv.com'
+    url = 'https://' + domain + '/webcast/ranklist/hour/?'
+
+    rticket = str(int(time.time() * 1000))
+    mc = get_mc()
+    udid = '8604' + get_random(11)
+    # openudid = '3b22' + str(udid.uuid4())[-12:]
+    ts = int(time.time())
+
+    trace_id = get_trace()
+    device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
+
+    query = {
+        "style" : "3",
+        "hour_info" : "0",
+        "room_id" : room_id,
+        "rank_type" : "31",
+        "sec_anchor_id" : sec_anchor_id,
+        "webcast_sdk_version" : "1710",
+        "webcast_language" : "zh",
+        "webcast_locale" : "zh_CN",
+        "webcast_gps_access" : "2",
+        "os_api" : "23",
+        "device_type" : device_type,
+        "ssmix" : "a",
+        "manifest_version_code" : "130001",
+        "dpi" : "480",
+        "app_name" : "aweme",
+        "version_name" : "13.0.0",
+        "ts" : ts,
+        "cpu_support64" : "true",
+        "storage_type" : "0",
+        "app_type" : "normal",
+        "appTheme" : "dark",
+        "ac" : "wifi",
+        "host_abi" : "armeabi-v7a",
+        "update_version_code" : "13009900",
+        "channel" : "tengxun_new",
+        "_rticket" : rticket,
+        "device_platform" : "android",
+        "iid" : iid,
+        "version_code" : "130000",
+        "mac_address" : mc,
+        "cdid" : "81542dc6-2aca-4ff6-ac58-d94179e9d3e6",
+        "openudid" : openudid,
+        "device_id" : device_id,
+        "resolution" : "1080*1800",
+        "os_version" : "6.0",
+        "language" : "zh",
+        "device_brand" : device_brand,
+        "aid" : "1128"
+    }
+
+    query_params = urllib.parse.urlencode(query)
+    url = url + query_params
+
+    body = ''
+
+    xGorgon = X_Gorgon(query_params, body)
+
+    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
+        random.randint(1, 10))
+
+    headers = {
+        'Host': domain,
+        'Connection': 'keep-alive',
+        'Cache-Control': 'max-age=0',
+        'User-Agent': userAgent,
+        # 'accept-encoding': 'gzip, deflate',
+        "x-gorgon": xGorgon.get('X-Gorgon'),
+        "x-khronos": xGorgon.get('X-Khronos'),
+        'passport-sdk-version' : '18',
+        'sdk-version' : '2',
+        'x-ss-dp' : '1128',
+        'x-tt-trace-id' : trace_id,
+    }
+
+    retry = 0
+    response_json = None
+
+    while True:
+        if retry > 50:
+            break
+
+        retry += 1
+
+        proxy = Proxy.get()
+
+        proxies = {
+            "http": "http://" + proxy,
+            "https": "http://" + proxy
+        }
+
+        try:
+            response = requests.get(
+                url,
+                headers=headers,
+                proxies=proxies,
+                timeout=8
+            )
+            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
+                response_json = response.json()
+
+                if (response_json.get('data') is not None):
+                    print(
+                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                        + ' data fetched successfully!'
+                    )
+                    break
+                else:
+                    print(
+                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                        + ' data fetch failed!'
+                        + '\n'
+                        + str(response.text)
+                        + str(Proxy.proxy_info)
+                    )
+
+            else:
+                print(
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                    + ' scrape HTTP request failed! '
+                    + str(response.status_code)
+                    + '\n'
+                    + str(Proxy.proxy_info)
+                    + '\n'
+                    + 'scrape result: ' + str(response)
+                    + '\n'
+                )
+                time.sleep(1)
+        except requests.exceptions.ProxyError as e:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' proxy expired! '
+                + str(e)
+                + '\n'
+                + str(Proxy.proxy_info)
+            )
+            Proxy.del_proxy(proxy)
+            pass
+        except requests.exceptions.ConnectTimeout as e:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' ConnectTimeout! '
+                + str(e)
+                + '\n'
+                + str(Proxy.proxy_info)
+            )
+            Proxy.del_proxy(proxy)
+            pass
+        except Exception as e:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' request raised an exception! '
+                + str(e)
+                + '\n'
+                + Proxy.proxy_info
+            )
+            pass
+
+    return response_json
+
+def scrape(room_id,sec_anchor_id,anchor_id):
+    rds_list = RdsDouyinHourlyRankList()
+
+    key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
+    rdid = DbRedis.connect().rpop(key)
+    if rdid:
+        result = rdid.split('@@@')
+    else:
+        result = []
+        return None
+
+    DbRedis.connect().lpush(key, rdid)
+
+    try:
+        # commerce hourly rank list
+        commerce_response_json = get_commerce_rank_list_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id, result=result)
+
+        if commerce_response_json is None:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' commerce hourly rank list fetch failed! Empty response!'
+            )
+            sys.exit(0)
+        else:
+            data = json.dumps({
+                "data": commerce_response_json.get('data'),
+                "extra": {}
+            })
+            rds_list.push_commerce_data_list(data)
+
+    except Exception as e:
+        print(
+            time.strftime("%H:%M:%S", time.localtime())
+            + ' '
+            + 'data error: '
+            + str(e)
+        )
+
+    sys.exit(0)
+
+if __name__ == "__main__":
+    print("Main method started")
+
+    room_info = MysqlUserLiving().get_living_info()
+
+    if room_info is None:
+        sys.exit(0)
+
+    room_id,anchor_id,sec_anchor_id = room_info
+
+    task = threading.Thread(target=scrape, args=(room_id, sec_anchor_id, anchor_id,))
+    task.start()  # ready; waiting for the CPU to schedule it
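The scrape() above rotates device credentials through a Redis list: RPOP one '@@@'-joined record, LPUSH it straight back, so concurrent workers cycle through the same pool. A sketch of that rotation in isolation, assuming decoded string responses and the field order used in the unpack above (the helper name and dict keys are illustrative):

# Pop one device record, recycle it for other workers, and map the
# '@@@'-separated fields to names per the unpack in the diff above.
def rotate_device(redis_client, key='DOUYIN_SCRAPE_DID_IID_TTREQ_1221'):
    rdid = redis_client.rpop(key)
    if not rdid:
        return None
    redis_client.lpush(key, rdid)  # push back so the pool keeps cycling
    f = rdid.split('@@@')
    return {
        'device_id': f[0], 'iid': f[1], 'openudid': f[2], 'udid': f[3],
        'cookie': f[4], 'device_type': f[10], 'device_brand': f[11],
    }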

+ 292 - 0
douyin_hourly_ranklist_popularity_scraper.py

@@ -0,0 +1,292 @@
+from rds_model.rds_hourly_rank_list import RdsDouyinHourlyRankList
+import time
+import json
+import sys
+import threading
+import random
+import urllib
+import requests
+
+from libs.Xg04 import X_Gorgon
+from libs.proxy import Proxy
+from libs.mysql_user_living import *
+from libs.db_redis import DbRedis
+
+start_time = time.time()
+
+def get_random(i, random_type=1):
+    if random_type == 1:
+        return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
+    elif random_type == 8:
+        seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        sa = []
+        for i in range(i):
+            sa.append(random.choice(seed))
+        salt = ''.join(sa)
+        return salt
+    else:
+        seed = "1234567890abcde"
+        sa = []
+        for i in range(i):
+            sa.append(random.choice(seed))
+        salt = ''.join(sa)
+        return salt
+
+def get_trace():
+    trace_list = [
+        '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
+        '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
+        '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
+        '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
+        '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
+        '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
+        '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
+        '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
+    ]
+
+    return random.choice(trace_list)
+
+def get_random_brand_type():
+    brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
+    return brand_type
+
+def get_mc():
+    def a():
+        seed = "1234567890ABCDEF"
+        sa = []
+        for i in range(2):
+            sa.append(random.choice(seed))
+        salt = ''.join(sa)
+        return salt
+
+    k = ''
+    for i in range(6):
+        k += a() + ':'
+    return k[:-1]
+
+def get_popularity_rank_list_data(room_id, sec_anchor_id, anchor_id, result):
+
+    domain = 'webcast5-normal-c-lq.amemv.com'
+    url = 'https://' + domain + '/webcast/ranklist/hour/?'
+
+    rticket = str(int(time.time() * 1000))
+    mc = get_mc()
+    udid = '8604' + get_random(11)
+    ts = int(time.time())
+    # openudid = '3b22' + str(udid.uuid4())[-12:]
+
+    trace_id = get_trace()
+
+    device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
+
+    query = {
+        "anchor_id" : anchor_id,
+        "room_id" : room_id,
+        "sec_anchor_id" : sec_anchor_id,
+        "sec_user_id" : "null",
+        "webcast_sdk_version" : "2010",
+        "webcast_language" : "zh",
+        "webcast_locale" : "zh_CN",
+        "webcast_gps_access" : "2",
+        "current_network_quality_info" : "%7B%22http_rtt%22%3A110%2C%22tcp_rtt%22%3A90%2C%22quic_rtt%22%3A90%2C%22downstream_throughput_kbps%22%3A8185%2C%22video_download_speed%22%3A411%2C%22quic_receive_loss_rate%22%3A-1%2C%22quic_send_loss_rate%22%3A-1%2C%22net_effective_connection_type%22%3A5%7D",
+        "os_api" : "23",
+        "device_type" : device_type,
+        "ssmix" : "a",
+        "manifest_version_code" : "160001",
+        "dpi" : "480",
+        "app_name" : "aweme",
+        "version_name" : "16.0.0",
+        "ts" : ts,
+        "cpu_support64" : "true",
+        "app_type" : "normal",
+        "appTheme" : "dark",
+        "ac" : "wifi",
+        "host_abi" : "armeabi-v7a",
+        "update_version_code" : "16009900",
+        "channel" : "wandoujia_lesi_1128_0507",
+        "_rticket" : rticket,
+        "device_platform" : "android",
+        "iid" : iid,
+        "version_code" : "160000",
+        "cdid" : "09e904ed-66a9-4e89-9661-afae7f61e6c5",
+        "openudid" : openudid,
+        "device_id" : device_id,
+        "resolution" : "1080*1800",
+        "os_version" : "6.0",
+        "language" : "zh",
+        "device_brand" : device_brand,
+        "aid" : "1128",
+        "minor_status" : "0"
+    }
+
+    query_params = urllib.parse.urlencode(query)
+    url = url + query_params
+
+    body = ''
+
+    xGorgon = X_Gorgon(query_params, body)
+
+    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
+        random.randint(1, 10))
+
+    headers = {
+        'Host': domain,
+        'Connection': 'keep-alive',
+        'Cache-Control': 'max-age=0',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': userAgent,
+        # 'accept-encoding': 'gzip, deflate',
+        "x-gorgon": xGorgon.get('X-Gorgon'),
+        "x-khronos": xGorgon.get('X-Khronos'),
+        'passport-sdk-version' : '18',
+        'sdk-version' : '2',
+        'x-ss-dp' : '1128',
+        'x-tt-trace-id' : trace_id,
+    }
+
+    retry = 0
+    response_json = None
+
+    while True:
+        if retry > 50:
+            break
+
+        retry += 1
+
+        proxy = Proxy.get()
+
+        proxies = {
+            "http": "http://" + proxy,
+            "https": "http://" + proxy
+        }
+
+        try:
+            response = requests.get(
+                url,
+                headers=headers,
+                proxies=proxies,
+                timeout=8
+            )
+            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
+                response_json = response.json()
+
+                if (response_json.get('data') is not None):
+                    print(
+                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                        + ' data fetched successfully!'
+                        + '\n'
+                        + room_id
+                    )
+                    break
+                else:
+                    print(
+                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                        + ' data fetch failed!'
+                        + '\n'
+                        + room_id
+                        + '\n'
+                        + response.text
+                        + Proxy.proxy_info
+                    )
+
+            else:
+                print(
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                    + ' scrape HTTP request failed! '
+                    + str(response.status_code)
+                    + '\n'
+                    + Proxy.proxy_info
+                    + '\n'
+                    + room_id
+                    + '\n'
+                )
+                time.sleep(1)
+        except requests.exceptions.ProxyError as e:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' proxy expired! '
+                + str(e)
+                + '\n'
+                + room_id
+                + '\n'
+                + Proxy.proxy_info
+            )
+            Proxy.del_proxy(proxy)
+            pass
+        except requests.exceptions.ConnectTimeout as e:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' ConnectTimeout! '
+                + str(e)
+                + '\n'
+                + room_id
+                + '\n'
+                + Proxy.proxy_info
+            )
+            Proxy.del_proxy(proxy)
+            pass
+        except Exception as e:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' request raised an exception! '
+                + str(e)
+                + '\n'
+                + room_id
+                + '\n'
+                + Proxy.proxy_info
+            )
+            pass
+
+    return response_json
+
+def scrape(room_id,sec_anchor_id,anchor_id):
+    rds_list = RdsDouyinHourlyRankList()
+
+    key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
+    rdid = DbRedis.connect().rpop(key)
+    if rdid:
+        result = rdid.split('@@@')
+    else:
+        return None
+
+    DbRedis.connect().lpush(key, rdid)
+
+    try:
+        # popularity hourly rank list
+        popularity_response_json = get_popularity_rank_list_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id, result=result)
+
+        if popularity_response_json is None:
+            print(
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                + ' popularity hourly rank list fetch failed! Empty response!'
+            )
+            sys.exit(0)
+        else:
+            data = json.dumps({
+                "data": popularity_response_json.get('data'),
+                "extra": {}
+            })
+            rds_list.push_popularity_data_list(data)
+
+    except Exception as e:
+        print(
+            time.strftime("%H:%M:%S", time.localtime())
+            + ' '
+            + 'data error: '
+            + str(e)
+        )
+
+    sys.exit(0)
+
+if __name__ == "__main__":
+    print("Main method started")
+
+    room_info = MysqlUserLiving().get_living_info()
+
+    if room_info is None:
+        sys.exit(0)
+
+    room_id,anchor_id,sec_anchor_id = room_info
+
+    task = threading.Thread(target=scrape, args=(room_id, sec_anchor_id, anchor_id,))
+    task.start()  # ready; waiting for the CPU to schedule it
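Both new ranklist scrapers build requests the same way: urlencode the device-flavored query, sign the query string with the project-local X_Gorgon helper, and send the request through a rented proxy. A condensed sketch of that flow, assuming libs.Xg04.X_Gorgon(query_string, body) returns a dict holding 'X-Gorgon' and 'X-Khronos' (its internals are not part of this diff; the function name signed_get is illustrative):

# Build, sign, and send one GET against the webcast ranklist endpoint.
import urllib.parse
import requests
from libs.Xg04 import X_Gorgon

def signed_get(domain, path, query, proxies, timeout=8):
    query_string = urllib.parse.urlencode(query)
    sign = X_Gorgon(query_string, '')  # empty body for a GET
    headers = {
        'Host': domain,
        'x-gorgon': sign.get('X-Gorgon'),
        'x-khronos': sign.get('X-Khronos'),
        'sdk-version': '2',
        'x-ss-dp': '1128',
    }
    url = 'https://' + domain + path + '?' + query_string
    return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)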

+ 0 - 653
douyin_hourly_ranklist_scraper.py

@@ -1,653 +0,0 @@
-from rds_model.rds_hourly_rank_list import RdsDouyinHourlyRankList
-import time
-import json
-import sys
-import threading
-import random
-import urllib
-import requests
-
-from rds_model.db_redis import DbRedis
-from log.print_log import PrintLog
-from libs.Xg04 import X_Gorgon
-from libs.proxy import Proxy
-from libs.mysql_user_living import *
-
-start_time = time.time()
-
-def get_random(i, random_type=1):
-    if random_type == 1:
-        return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
-    elif random_type == 8:
-        seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-        sa = []
-        for i in range(i):
-            sa.append(random.choice(seed))
-        salt = ''.join(sa)
-        return salt
-    else:
-        seed = "1234567890abcde"
-        sa = []
-        for i in range(i):
-            sa.append(random.choice(seed))
-        salt = ''.join(sa)
-        return salt
-
-
-def get_random_brand_type():
-    brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
-    return brand_type
-
-def get_mc():
-    def a():
-        seed = "1234567890ABCDEF"
-        sa = []
-        for i in range(2):
-            sa.append(random.choice(seed))
-        salt = ''.join(sa)
-        return salt
-
-    k = ''
-    for i in range(6):
-        k += a() + ':'
-    return k[:-1]
-
-def get_whole_station_rank_data(room_id, sec_anchor_id, anchor_id):
-
-    domain = 'webcast5-normal-c-lq.amemv.com'
-    url = 'https://' + domain + '/webcast/ranklist/hour/?'
-
-    rticket = str(int(time.time() * 1000))
-    ts = int(time.time())
-    mc = get_mc
-    udid = '8604' + get_random(11)
-    query = {
-        "manifest_version_code" : "110001",
-        "dpi" : "480",
-        "app_name" : "aweme",
-        "version_name" : "11.0.0",
-        "ts" : ts,
-        "cpu_support64" : "true",
-        "app_type" : "normal",
-        "ac" : "wifi",
-        "host_abi" : "armeabi-v7a",
-        "channel" : "wandoujia_aweme_feisuo",
-        "device_platform" : "android",
-        "iid" : "3932281687270606",
-        "version_code" : "110000",
-        "cdid" : "1d06013c-ff0b-427b-b1ab-6700259c15c6",
-        "openudid" : "291f3ce2efe59345",
-        "hour_info" : "0",
-        "room_id" : room_id,
-        "rank_type" : "12",
-        "sec_anchor_id" : sec_anchor_id,
-        "anchor_id" : anchor_id,
-        "webcast_sdk_version" : "1510",
-        "webcast_language" : "zh",
-        "webcast_locale" : "zh_CN",
-        "os_api" : "23",
-        "device_type" : "HUAWEI MLA-AL10",
-        "ssmix" : "a",
-        "update_version_code" : "11009900",
-        "cdid" : "1d06013c-ff0b-427b-b1ab-6700259c15c6",
-        "openudid" : "291f3ce2efe59345",
-        "device_id" : '49388718822',
-        "resolution" : "1080*1800",
-        "os_version" : "6.0",
-        "language" : "zh",
-        "device_brand" : "HUAWEI",
-        "_rticket" : rticket,
-        "aid" : "1128"
-    }
-
-    query_params = urllib.parse.urlencode(query)
-    url = url + query_params
-
-    body = ''
-
-    xGorgon = X_Gorgon(query_params, body)
-
-    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
-        random.randint(1, 10))
-
-    headers = {
-        'Host': domain,
-        'Connection': 'keep-alive',
-        'User-Agent': userAgent,
-        'accept-encoding': 'gzip, deflate',
-        "x-gorgon": xGorgon.get('X-Gorgon'),
-        "x-khronos": xGorgon.get('X-Khronos'),
-        'sdk-version' : '2',
-        'x-ss-dp' : '1128',
-        'x-tt-trace-id' : '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
-    }
-
-    retry = 0
-    response_json = None
-
-    while True:
-        if retry > 10:
-            break
-
-        retry += 1
-
-        proxy = Proxy.get()
-
-        proxies = {
-            "http": "http://" + proxy,
-            "https": "http://" + proxy
-        }
-
-        try:
-            response = requests.get(
-                url,
-                headers=headers,
-                proxies=proxies,
-                timeout=8
-            )
-            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
-                response_json = response.json()
-
-                if (response_json.get('data') is not None):
-                    break
-                else:
-                    print(
-                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                        + ' data fetch failed!'
-                        + '\n'
-                        + room_id
-                        + '\n'
-                        + response.text
-                        + Proxy.proxy_info
-                    )
-
-            else:
-                print(
-                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                    + ' scrape HTTP request failed! '
-                    + str(response.status_code)
-                    + '\n'
-                    + Proxy.proxy_info
-                    + '\n'
-                    + room_id
-                    + '\n'
-                )
-                time.sleep(1)
-        except requests.exceptions.ProxyError as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' proxy expired! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            Proxy.del_proxy(proxy)
-            pass
-        except requests.exceptions.ConnectTimeout as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' ConnectTimeout! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            Proxy.del_proxy(proxy)
-            pass
-        except Exception as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' request raised an exception! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            pass
-
-    return response_json
-
-def get_commerce_rank_list_data(room_id, sec_anchor_id, anchor_id):
-
-    domain = 'webcast5-normal-c-lq.amemv.com'
-    url = 'https://' + domain + '/webcast/ranklist/hour/?'
-
-    rticket = str(int(time.time() * 1000))
-    mc = get_mc
-    udid = '8604' + get_random(11)
-    # openudid = '3b22' + str(udid.uuid4())[-12:]
-    ts = int(time.time())
-
-    query = {
-        "style" : "3",
-        "hour_info" : "0",
-        "room_id" : room_id,
-        "rank_type" : "31",
-        "sec_anchor_id" : sec_anchor_id,
-        "webcast_sdk_version" : "1710",
-        "webcast_language" : "zh",
-        "webcast_locale" : "zh_CN",
-        "webcast_gps_access" : "2",
-        "os_api" : "23",
-        "device_type" : "HUAWEI+MLA-AL10",
-        "ssmix" : "a",
-        "manifest_version_code" : "130001",
-        "dpi" : "480",
-        "app_name" : "aweme",
-        "version_name" : "13.0.0",
-        "ts" : ts,
-        "cpu_support64" : "true",
-        "storage_type" : "0",
-        "app_type" : "normal",
-        "appTheme" : "dark",
-        "ac" : "wifi",
-        "host_abi" : "armeabi-v7a",
-        "update_version_code" : "13009900",
-        "channel" : "tengxun_new",
-        "_rticket" : rticket,
-        "device_platform" : "android",
-        "iid" : "2876750595379005",
-        "version_code" : "130000",
-        "mac_address" : mc,
-        "cdid" : "81542dc6-2aca-4ff6-ac58-d94179e9d3e6",
-        "openudid" : "291f3ce2efe59345",
-        "device_id" : "49388718822",
-        "resolution" : "1080*1800",
-        "os_version" : "6.0",
-        "language" : "zh",
-        "device_brand" : "HUAWEI",
-        "aid" : "1128"
-    }
-
-    query_params = urllib.parse.urlencode(query)
-    url = url + query_params
-
-    body = ''
-
-    xGorgon = X_Gorgon(query_params, body)
-
-    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
-        random.randint(1, 10))
-
-    headers = {
-        'Host': domain,
-        'Connection': 'keep-alive',
-        'Cache-Control': 'max-age=0',
-        'User-Agent': userAgent,
-        'accept-encoding': 'gzip, deflate',
-        "x-gorgon": xGorgon.get('X-Gorgon'),
-        "x-khronos": xGorgon.get('X-Khronos'),
-        'passport-sdk-version' : '18',
-        'sdk-version' : '2',
-        'x-ss-dp' : '1128',
-        'x-tt-trace-id' : '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
-    }
-
-    retry = 0
-    response_json = None
-
-    while True:
-        if retry > 10:
-            break
-
-        retry += 1
-
-        proxy = Proxy.get()
-
-        proxies = {
-            "http": "http://" + proxy,
-            "https": "http://" + proxy
-        }
-
-        try:
-            response = requests.get(
-                url,
-                headers=headers,
-                proxies=proxies,
-                timeout=8
-            )
-            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
-                response_json = response.json()
-
-                if (response_json.get('data') is not None):
-                    break
-                else:
-                    print(
-                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                        + ' data fetch failed!'
-                        + '\n'
-                        + room_id
-                        + '\n'
-                        + response.text
-                        + Proxy.proxy_info
-                    )
-
-            else:
-                print(
-                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                    + ' scrape HTTP request failed! '
-                    + str(response.status_code)
-                    + '\n'
-                    + Proxy.proxy_info
-                    + '\n'
-                    + room_id
-                    + '\n'
-                )
-                time.sleep(1)
-        except requests.exceptions.ProxyError as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' proxy expired! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            Proxy.del_proxy(proxy)
-            pass
-        except requests.exceptions.ConnectTimeout as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' ConnectTimeout! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            Proxy.del_proxy(proxy)
-            pass
-        except Exception as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' request raised an exception! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            pass
-
-    return response_json
-
-def get_popularity_rank_list_data(room_id, sec_anchor_id, anchor_id):
-
-    domain = 'webcast5-normal-c-lq.amemv.com'
-    url = 'https://' + domain + '/webcast/ranklist/hour/?'
-
-    rticket = str(int(time.time() * 1000))
-    mc = get_mc
-    udid = '8604' + get_random(11)
-    ts = int(time.time())
-    # openudid = '3b22' + str(udid.uuid4())[-12:]
-
-    query = {
-        "anchor_id" : anchor_id,
-        "room_id" : room_id,
-        "sec_anchor_id" : sec_anchor_id,
-        "sec_user_id" : "null",
-        "webcast_sdk_version" : "2010",
-        "webcast_language" : "zh",
-        "webcast_locale" : "zh_CN",
-        "webcast_gps_access" : "2",
-        "current_network_quality_info" : "%7B%22http_rtt%22%3A110%2C%22tcp_rtt%22%3A90%2C%22quic_rtt%22%3A90%2C%22downstream_throughput_kbps%22%3A8185%2C%22video_download_speed%22%3A411%2C%22quic_receive_loss_rate%22%3A-1%2C%22quic_send_loss_rate%22%3A-1%2C%22net_effective_connection_type%22%3A5%7D",
-        "os_api" : "23",
-        "device_type" : "HUAWEI+MLA-AL10",
-        "ssmix" : "a",
-        "manifest_version_code" : "160001",
-        "dpi" : "480",
-        "app_name" : "aweme",
-        "version_name" : "16.0.0",
-        "ts" : ts,
-        "cpu_support64" : "true",
-        "app_type" : "normal",
-        "appTheme" : "dark",
-        "ac" : "wifi",
-        "host_abi" : "armeabi-v7a",
-        "update_version_code" : "16009900",
-        "channel" : "wandoujia_lesi_1128_0507",
-        "_rticket" : rticket,
-        "device_platform" : "android",
-        "iid" : "273107070769192",
-        "version_code" : "160000",
-        "cdid" : "09e904ed-66a9-4e89-9661-afae7f61e6c5",
-        "openudid" : "291f3ce2efe59345",
-        "device_id" : "49388718822",
-        "resolution" : "1080*1800",
-        "os_version" : "6.0",
-        "language" : "zh",
-        "device_brand" : "HUAWEI",
-        "aid" : "1128",
-        "minor_status" : "0"
-    }
-
-    query_params = urllib.parse.urlencode(query)
-    url = url + query_params
-
-    body = ''
-
-    xGorgon = X_Gorgon(query_params, body)
-
-    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
-        random.randint(1, 10))
-
-    headers = {
-        'Host': domain,
-        'Connection': 'keep-alive',
-        'Cache-Control': 'max-age=0',
-        'Upgrade-Insecure-Requests': '1',
-        'User-Agent': userAgent,
-        'accept-encoding': 'gzip, deflate',
-        "x-gorgon": xGorgon.get('X-Gorgon'),
-        "x-khronos": xGorgon.get('X-Khronos'),
-        'passport-sdk-version' : '18',
-        'sdk-version' : '2',
-        'x-ss-dp' : '1128',
-        'x-tt-trace-id' : '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
-    }
-
-    retry = 0
-    response_json = None
-
-    while True:
-        if retry > 10:
-            break
-
-        retry += 1
-
-        proxy = Proxy.get()
-
-        proxies = {
-            "http": "http://" + proxy,
-            "https": "http://" + proxy
-        }
-
-        try:
-            response = requests.get(
-                url,
-                headers=headers,
-                proxies=proxies,
-                timeout=8
-            )
-            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
-                response_json = response.json()
-
-                if (response_json.get('data') is not None):
-                    break
-                else:
-                    print(
-                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                        + ' data fetch failed!'
-                        + '\n'
-                        + room_id
-                        + '\n'
-                        + response.text
-                        + Proxy.proxy_info
-                    )
-
-            else:
-                print(
-                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                    + ' scrape HTTP request failed! '
-                    + str(response.status_code)
-                    + '\n'
-                    + Proxy.proxy_info
-                    + '\n'
-                    + room_id
-                    + '\n'
-                )
-                time.sleep(1)
-        except requests.exceptions.ProxyError as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' proxy expired! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            Proxy.del_proxy(proxy)
-            pass
-        except requests.exceptions.ConnectTimeout as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' ConnectTimeout! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            Proxy.del_proxy(proxy)
-            pass
-        except Exception as e:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' request raised an exception! '
-                + str(e)
-                + '\n'
-                + room_id
-                + '\n'
-                + Proxy.proxy_info
-            )
-            pass
-
-    return response_json
-
-def scrape(room_id,sec_anchor_id,anchor_id):
-    rds_list = RdsDouyinHourlyRankList()
-    time.sleep(0.1)
-
-    try:
-        # whole-station hourly rank list
-        whole_station_response_json = get_whole_station_rank_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id)
-        # commerce hourly rank list
-        commerce_response_json = get_commerce_rank_list_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id)
-        # popularity hourly rank list
-        popularity_response_json = get_popularity_rank_list_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id)
-
-
-        if whole_station_response_json is None:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' whole-station hourly rank list fetch failed! Empty response!'
-            )
-            sys.exit(0)
-        else:
-            data = json.dumps({
-                "data": whole_station_response_json.get('data'),
-                "extra": {}
-            })
-            rds_list.push_whole_station_data_list(data)
-
-        if commerce_response_json is None:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' commerce hourly rank list fetch failed! Empty response!'
-            )
-            sys.exit(0)
-        else:
-            data = json.dumps({
-                "data": commerce_response_json.get('data'),
-                "extra": {}
-            })
-            rds_list.push_commerce_data_list(data)
-
-        if popularity_response_json is None:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' popularity hourly rank list fetch failed! Empty response!'
-            )
-            sys.exit(0)
-        else:
-            data = json.dumps({
-                "data": popularity_response_json.get('data'),
-                "extra": {}
-            })
-            rds_list.push_popularity_data_list(data)
-
-    except Exception as e:
-        print(
-            time.strftime("%H:%M:%S", time.localtime())
-            + ' '
-            + 'data error: '
-            + str(e)
-        )
-
-    sys.exit(0)
-
-if __name__ == "__main__":
-    print("Main method started")
-    # number of parallel threads
-    threading_count = int(sys.argv[1])
-
-    rds = RdsDouyinHourlyRankList()
-
-    print(
-        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        + ' '
-        + ' started executing; live-update queue length: '
-        + str(rds.get_len())
-    )
-
-    while True:
-        sys.stdout.flush()
-
-        # exclude the main thread
-        active_count = threading.active_count() - 1
-
-        increment = threading_count - active_count
-
-        while increment > 0:
-            sys.stdout.flush()
-
-            room_info = MysqlUserLiving().get_living_info()
-            room_id = False
-
-            if room_info is None:
-                time.sleep(60)
-                continue
-
-            room_id,anchor_id,sec_anchor_id = room_info
-
-            task = threading.Thread(target=scrape, args=(room_id, sec_anchor_id, anchor_id,))
-            task.start()  # ready; waiting for the CPU to schedule it
-            increment = increment - 1
-
-        current_time = time.time()
-
-        if current_time - start_time > 300:
-            print(
-                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                + ' main method terminated'
-            )
-            sys.exit(0)
-
-        time.sleep(0.01)
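The deleted file's __main__ kept a fixed number of scrape threads alive: each tick it counted live threads, spawned the shortfall, and exited after five minutes so a supervisor could restart it. A sketch of that top-up loop under the same one-thread-per-room model (run_workers and next_args are illustrative names):

# Keep `threading_count` workers running; replace finished ones each tick.
import threading
import time

def run_workers(target, next_args, threading_count, lifetime=300):
    start = time.time()
    while time.time() - start < lifetime:
        active = threading.active_count() - 1  # exclude the main thread
        for _ in range(max(0, threading_count - active)):
            args = next_args()
            if args is None:  # nothing live to scrape right now
                time.sleep(60)
                break
            threading.Thread(target=target, args=args).start()
        time.sleep(0.01)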

+ 275 - 0
douyin_hourly_ranklist_whole_scraper.py

@@ -0,0 +1,275 @@
1
+from rds_model.rds_hourly_rank_list import RdsDouyinHourlyRankList
2
+import time
3
+import json
4
+import sys
5
+import threading
6
+import random
7
+import urllib
8
+import requests
9
+
10
+from libs.Xg04 import X_Gorgon
11
+from libs.proxy import Proxy
12
+from libs.mysql_user_living import *
13
+from libs.db_redis import DbRedis
14
+
15
+start_time = time.time()
16
+
17
+def get_random(i, random_type=1):
18
+    if random_type == 1:
19
+        return str(random.randint(1 * 10 ** (i - 1), 1 * 10 ** i - 1))
20
+    elif random_type == 8:
21
+        seed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
22
+        sa = []
23
+        for i in range(i):
24
+            sa.append(random.choice(seed))
25
+        salt = ''.join(sa)
26
+        return salt
27
+    else:
28
+        seed = "1234567890abcde"
29
+        sa = []
30
+        for i in range(i):
31
+            sa.append(random.choice(seed))
32
+        salt = ''.join(sa)
33
+        return salt
34
+
35
+
36
+def get_random_brand_type():
37
+    brand_type = get_random(3, random_type=8) + '-' + get_random(2, random_type=8) + '00'
38
+    return brand_type
39
+
40
+def get_mc():
41
+    def a():
42
+        seed = "1234567890ABCDEF"
43
+        sa = []
44
+        for i in range(2):
45
+            sa.append(random.choice(seed))
46
+        salt = ''.join(sa)
47
+        return salt
48
+
49
+    k = ''
50
+    for i in range(6):
51
+        k += a() + ':'
52
+    return k[:-1]
53
+
54
+def get_trace():
55
+    trace_list = [
56
+        '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
57
+        '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
58
+        '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
59
+        '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
60
+        '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
61
+        '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
62
+        '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
63
+        '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
64
+    ]
65
+
66
+    return random.choice(trace_list)
67
+
68
+def get_whole_station_rank_data(room_id, sec_anchor_id, anchor_id, result):
69
+
70
+    domain = 'webcast5-normal-c-lq.amemv.com'
71
+    url = 'https://' + domain + '/webcast/ranklist/hour/?'
72
+
73
+    rticket = str(int(time.time() * 1000))
74
+    ts = int(time.time())
75
+    mc = get_mc
76
+    udid = '8604' + get_random(11)
77
+    trace_id = get_trace()
78
+
79
+    device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
80
+
81
+    query = {
82
+        "manifest_version_code" : "110001",
83
+        "dpi" : "480",
84
+        "app_name" : "aweme",
85
+        "version_name" : "11.0.0",
86
+        "ts" : ts,
87
+        "cpu_support64" : "true",
88
+        "app_type" : "normal",
89
+        "ac" : "wifi", 
90
+        "host_abi" : "armeabi-v7a",
91
+        "channel" : "wandoujia_aweme_feisuo",
92
+        "device_platform" : "android",
93
+        "iid" : iid,
94
+        "version_code" : "110000",
95
+        "cdid" : "1d06013c-ff0b-427b-b1ab-6700259c15c6",
96
+        "hour_info" : "0",
97
+        "room_id" : room_id,
98
+        "rank_type" : "12",
99
+        "sec_anchor_id" : sec_anchor_id,
100
+        "anchor_id" : anchor_id,
101
+        "webcast_sdk_version" : "1510",
102
+        "webcast_language" : "zh",
103
+        "webcast_locale" : "zh_CN",
104
+        "os_api" : "23",
105
+        "device_type" : device_type,
106
+        "ssmix" : "a",
107
+        "update_version_code" : "11009900",
108
+        "openudid" : openudid,
109
+        "device_id" : device_id,
110
+        "resolution" : "1080*1800",
111
+        "os_version" : "6.0",
112
+        "language" : "zh",
113
+        "device_brand" : device_brand,
114
+        "_rticket" : rticket,
115
+        "aid" : "1128"
116
+    }
117
+
118
+    query_params = urllib.parse.urlencode(query)
119
+    url = url + query_params
120
+
121
+    body = ''
122
+
123
+    xGorgon = X_Gorgon(query_params, body)
124
+
125
+    userAgent = 'okhttp/3.' + str(random.randint(0, 10)) + '.' + str(random.randint(0, 10)) + '.' + str(
126
+        random.randint(1, 10))
127
+
128
+    headers = {
129
+        'Host': domain,
130
+        'Connection': 'keep-alive',
131
+        'User-Agent': userAgent,
132
+        # 'accept-encoding': 'gzip, deflate',
133
+        "x-gorgon": xGorgon.get('X-Gorgon'),
134
+        "x-khronos": xGorgon.get('X-Khronos'),
135
+        'sdk-version' : '2',
136
+        'x-ss-dp' : '1128',
137
+        'x-tt-trace-id' : trace_id
138
+    }
139
+
140
+    retry = 0
141
+    response_json = None
142
+
143
+    while True:
144
+        if retry > 50:
145
+            break
146
+
147
+        retry += 1
148
+
149
+        proxy = Proxy.get()
150
+
151
+        proxies = {
152
+            "http": "http://" + proxy,
153
+            "https": "http://" + proxy
154
+        }
155
+
156
+        try:
157
+            response = requests.get(
158
+                url,
159
+                headers=headers,
160
+                proxies=proxies,
161
+                timeout=8
162
+            )
163
+            if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
164
+                response_json = response.json()
165
+
166
+                if (response_json.get('data') is not None):
167
+                    print(
168
+                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
169
+                        + ' 数据获取成功!' 
170
+                    )
171
+
172
+                    break
173
+                else:
174
+                    print(
175
+                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
176
+                        + ' 数据获取失败!' 
177
+                        + '\n'
178
+                        + response.text
179
+                        + Proxy.proxy_info
180
+                    )
181
+                
182
+            else:
183
+                print(
184
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
185
+                    + ' 爬取http连接失败!' 
186
+                    + str(response.status_code) 
187
+                    + '\n'
188
+                    + Proxy.proxy_info 
189
+                    + '\n'
190
+                )
191
+                time.sleep(1)
192
+        except requests.exceptions.ProxyError as e:
193
+            print(
194
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
195
+                + ' Proxy expired! ' 
196
+                + str(e) 
197
+                + '\n'
198
+                + Proxy.proxy_info
199
+            )
200
+            Proxy.del_proxy(proxy)
201
+            pass
202
+        except requests.exceptions.ConnectTimeout as e:
203
+            print(
204
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
205
+                + ' ConnectTimeout!' 
206
+                + str(e) 
207
+                + '\n'
208
+                + Proxy.proxy_info
209
+            )
210
+            Proxy.del_proxy(proxy)
211
+            pass
212
+        except Exception as e:
213
+            print(
214
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
215
+                + ' Request raised an exception! ' 
216
+                + str(e) 
217
+                + '\n'
218
+                + Proxy.proxy_info
219
+            )
220
+            pass
221
+
222
+    return response_json
223
+
224
+def scrape(room_id, sec_anchor_id, anchor_id):
225
+    rds_list = RdsDouyinHourlyRankList()
226
+
227
+    key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
228
+    rdid = DbRedis.connect().rpop(key)
229
+    if rdid:
230
+        result = rdid.split('@@@')
231
+    else:
232
+        # queue is empty: no registered device identity to scrape with
233
+        return None
234
+
235
+    DbRedis.connect().lpush(key, rdid)
236
+
237
+    try:
238
+        # site-wide hourly rank
239
+        whole_station_response_json = get_whole_station_rank_data(room_id=room_id, sec_anchor_id=sec_anchor_id, anchor_id=anchor_id, result=result)
240
+       
241
+        if whole_station_response_json is None:
242
+            print(
243
+                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
244
+                + ' Failed to fetch the site-wide hourly rank: response was empty!' 
245
+            )
246
+            sys.exit(0)
247
+        else:
248
+            data = json.dumps({
249
+                "data": whole_station_response_json.get('data'),
250
+                "extra": {}
251
+            })
252
+            rds_list.push_whole_station_data_list(data)
253
+
254
+    except Exception as e:
255
+        print(
256
+            time.strftime("%H:%M:%S", time.localtime()) 
257
+            + ' ' 
258
+            + 'Data error: ' 
259
+            + str(e)
260
+        )
261
+
262
+    sys.exit(0)
263
+
264
+if __name__ == "__main__":
265
+    print("主方法开始执行")
266
+
267
+    room_info = MysqlUserLiving().get_living_info()
268
+
269
+    if room_info is None:
270
+        sys.exit(0)
271
+    
272
+    room_id, anchor_id, sec_anchor_id = room_info
273
+    
274
+    task = threading.Thread(target=scrape, args=(room_id, sec_anchor_id, anchor_id,))
275
+    task.start()  # ready and waiting for the CPU to schedule
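
The scrape() entry point above leans on a simple rotation scheme: it RPOPs one '@@@'-joined device record from DOUYIN_SCRAPE_DID_IID_TTREQ_1221 and immediately LPUSHes it back, so concurrent workers cycle through the registered-device pool. A minimal sketch of that pattern, assuming a redis-py client created with decode_responses=True:

import redis

def borrow_device_record(r: redis.StrictRedis,
                         key: str = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'):
    raw = r.rpop(key)          # take a record from the tail of the list
    if raw is None:
        return None            # pool is empty
    r.lpush(key, raw)          # recycle it immediately for the next worker
    return raw.split('@@@')    # '@@@'-joined device fields, indexed by the callers above

Because the record is pushed back before the request is made, a device identity is never lost if the worker dies mid-scrape.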

+ 47 - 10
douyin_live_lottery_scraper.py

@@ -50,14 +50,32 @@ def get_mc():
50 50
         k += a() + ':'
51 51
     return k[:-1]
52 52
 
53
-def get_live_lottery_data(room_id):
53
+def get_trace():
54
+    trace_list = [
55
+        '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
56
+        '00-ce7faf4409b7fcc0ae6135fdd4250468-ce7faf4409b7fcc0-01',
57
+        '00-ce7faf3b09b7fcc0ae6042f1d8100468-ce7faf3b09b7fcc0-01',
58
+        '00-cdd79d2309b7fcc0ae6625a4cb190468-cdd79d2309b7fcc0-01',
59
+        '00-cefde9f009b7fcc0ae6750e1349e0468-cefde9f009b7fcc0-01',
60
+        '00-ced2e6ef09b7fcc0ae67dd7bfe000468-ced2e6ef09b7fcc0-01',
61
+        '00-cefbfeb509b7fcc0ae659396a6ea0468-cefbfeb509b7fcc0-01',
62
+        '00-cefaa25409b7fcc0ae657726a3c30468-cefaa25409b7fcc0-01',
63
+    ]
64
+
65
+    return random.choice(trace_list)
66
+
67
+def get_live_lottery_data(room_id, result):
54 68
     domain = 'webcast5-normal-ipv6-lf.amemv.com'
55 69
     url = 'https://' + domain + '/webcast/lottery/melon/check_user_right/?'
56 70
 
57 71
     rticket = str(int(time.time() * 1000))
58 72
     mc = get_mc()
59 73
     udid = '8604' + get_random(11)
74
+    trace_id = get_trace()
60 75
     # openudid = '3b22' + str(udid.uuid4())[-12:]
76
+        
77
+
78
+    device_id, iid, udid, openudid, cookie, V1, V2, device_type, device_brand = result[0], result[1], result[3], result[2], result[4], result[8], result[9], result[10], result[11]
61 79
     
62 80
     query = {
63 81
         "room_id" : room_id,
@@ -72,7 +90,7 @@ def get_live_lottery_data(room_id):
72 90
         "dpi" : "640",
73 91
         "app_name" : "aweme",
74 92
         "version_name" : "12.8.0",
75
-        "ts" : "1629688012",
93
+        "ts" : int(time.time()),
76 94
         "cpu_support64" : "true",
77 95
         "storage_type" : "0",
78 96
         "app_type" : "normal",
@@ -84,13 +102,13 @@ def get_live_lottery_data(room_id):
84 102
         "_rticket" : rticket,
85 103
         # "_rticket" : "1629688012123",
86 104
         "device_platform" : "android",
87
-        "iid" : "3158219122552359",
105
+        "iid" : iid,
88 106
         "version_code" : "120800",
89 107
         "mac_address" : mc,
90 108
         #"mac_address" : "FC%3ADB%3AB3%3A56%3ABD%3AFD",
91 109
         "cdid" : "6c96979e-c729-419c-9516-3a85a7338d0c",
92
-        "openudid" : "89bb178775bf2aa9",
93
-        "device_id" : "60314377681",
110
+        "openudid" : openudid,
111
+        "device_id" : device_id,
94 112
         "resolution" : "1440*2560",
95 113
         "os_version" : "6.0.1",
96 114
         "language" : "zh",
@@ -114,21 +132,21 @@ def get_live_lottery_data(room_id):
114 132
         'Cache-Control': 'max-age=0',
115 133
         'Upgrade-Insecure-Requests': '1',
116 134
         'User-Agent': userAgent,
117
-        'accept-encoding': 'gzip, deflate',
135
+        # 'accept-encoding': 'gzip, deflate',
118 136
         # "x-SS-REQ-TICKET": rticket,
119 137
         "x-gorgon": xGorgon.get('X-Gorgon'),
120 138
         "x-khronos": xGorgon.get('X-Khronos'),
121 139
         'passport-sdk-version' : '17',
122 140
         'sdk-version' : '2',
123 141
         'x-ss-dp' : '1128',
124
-        'x-tt-trace-id' : '00-70f99f2209e0b045dd14266ee1da0468-70f99f2209e0b045-01',
142
+        'x-tt-trace-id' : trace_id
125 143
     }
126 144
 
127 145
     retry = 0
128 146
     response_json = None
129 147
 
130 148
     while True:
131
-        if retry > 10:
149
+        if retry > 5:
132 150
             break
133 151
 
134 152
         retry += 1
@@ -151,6 +169,14 @@ def get_live_lottery_data(room_id):
151 169
                 response_json = response.json()
152 170
 
153 171
                 if (response_json.get('data') is not None):
172
+                    print(
173
+                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
174
+                        + ' Data fetched successfully!' 
175
+                        + '\n'
176
+                        + room_id 
177
+                        + '\n'
178
+                    )
179
+
154 180
                     break
155 181
                 else:
156 182
                     print(
@@ -173,6 +199,8 @@ def get_live_lottery_data(room_id):
173 199
                     + '\n'
174 200
                     + room_id 
175 201
                     + '\n'
202
+                    + 'Scrape result: ' + str(response)
203
+                    + '\n'
176 204
                 )
177 205
                 time.sleep(1)
178 206
         except requests.exceptions.ProxyError as e:
@@ -230,8 +258,17 @@ def scrape():
230 258
         + str(room_id)
231 259
     )
232 260
 
261
+    key = 'DOUYIN_SCRAPE_DID_IID_TTREQ_1221'
262
+    rdid = DbRedis.connect().rpop(key)
263
+    if rdid:
264
+        result = rdid.split('@@@')
265
+    else:
266
+        return None
267
+
268
+    DbRedis.connect().lpush(key, rdid)
269
+
233 270
     try:
234
-        response_json = get_live_lottery_data(room_id)
271
+        response_json = get_live_lottery_data(room_id, result)
235 272
        
236 273
         if response_json is None:
237 274
             # rds_list.record_score(0)
@@ -256,7 +293,7 @@ def scrape():
256 293
         rds_list.push_data_list(data)
257 294
     except Exception as e:
258 295
         # rds_list.record_score(0)
259
-        # rds_list.push_request_id(room_id)
296
+        rds_list.push_request_id(room_id)
260 297
         print(
261 298
             time.strftime("%H:%M:%S", time.localtime()) 
262 299
             + ' ' 
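
Every scraper in this commit repeats the same retry shape: fetch a proxy, issue the request, evict the proxy only when it provably failed (ProxyError or ConnectTimeout), and give up after a fixed number of attempts. A distilled sketch of that loop using the same libs.proxy helpers; not a drop-in replacement, since url and headers come from the caller:

import requests
from libs.proxy import Proxy

def get_json_with_proxy_rotation(url, headers, max_retries=5, timeout=8):
    for _ in range(max_retries):
        proxy = Proxy.get()
        proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if response.status_code == 200 and response.text:
                return response.json()
        except (requests.exceptions.ProxyError,
                requests.exceptions.ConnectTimeout):
            Proxy.del_proxy(proxy)   # the proxy provably failed: evict it
        except requests.exceptions.RequestException:
            pass                     # transient error: keep the proxy and retry
    return None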

+ 52 - 0
douyin_scraper_register.py

@@ -0,0 +1,52 @@
1
+import json
2
+from xlog03 import *
3
+from rds_model.db_redis import DbRedis
4
+from log.print_log import PrintLog
5
+import douyin_class
6
+
7
+redisModel = DbRedis.outer_net_connect()
8
+
9
+if __name__ == '__main__':
10
+    import warnings
11
+    warnings.filterwarnings("ignore")
12
+
13
+    print('=========== Device Registration ===========')
14
+    PrintLog.print('=========== Device Registration ===========')
15
+
16
+    rkey = 'DOUYIN_REGISTER_QUEUE'
17
+    cur_time = int(time.time())
18
+    while True:
19
+        now_time = int(time.time())
20
+        if (now_time - cur_time) > 270:
21
+            break
22
+
23
+        str_time = time.strftime("%H:%M:%S", time.localtime())
24
+        try:
25
+            llen = redisModel.llen(rkey)
26
+            if llen < 50:
27
+                proxies = {
28
+                    "http": "http://lum-customer-c_44eb09ec-zone-data_center:4jme8s5hf9x6@zproxy.lum-superproxy.io:22225",
29
+                    "https": "http://lum-customer-c_44eb09ec-zone-data_center:4jme8s5hf9x6@zproxy.lum-superproxy.io:22225"
30
+                }
31
+
32
+                print(str_time + ' Registering via proxy: ' + str(proxies))
33
+                PrintLog.print(str_time + ' Registering via proxy: ' + str(proxies))
34
+                douApi = douyin_class.DouYinApi('', proxies)
35
+
36
+                result = douApi.register_device()
37
+                if result['device_id'] == '0':
38
+                    continue
39
+
40
+                print(str_time + ' Result: ' + str(result))
41
+                PrintLog.print(str_time + ' Result: ' + str(result))
42
+
43
+                json_data = json.dumps(result)
44
+                redisModel.lpush(rkey, json_data)
45
+                print(str_time + ' Registration succeeded')
46
+                PrintLog.print(str_time + ' Registration succeeded')
47
+            else:
48
+                time.sleep(3)
49
+        except Exception as e:
50
+            print(str_time + ' Error: ' + str(e))
51
+            PrintLog.print(str_time + ' Error: ' + str(e))
52
+            continue
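
The register loop is a bounded producer: it only registers a new device while DOUYIN_REGISTER_QUEUE holds fewer than 50 entries, and it exits after roughly 270 seconds so a scheduler can restart it with a fresh process. The shape, as a sketch with a hypothetical produce() callable standing in for DouYinApi.register_device():

import json
import time

def run_bounded_producer(r, key, produce, high_water=50, lifetime=270, idle_sleep=3):
    deadline = time.time() + lifetime        # exit periodically; cron restarts the process
    while time.time() < deadline:
        if r.llen(key) >= high_water:
            time.sleep(idle_sleep)           # queue is topped up; back off
            continue
        record = produce()                   # may return None on a failed registration
        if record:
            r.lpush(key, json.dumps(record))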

+ 120 - 0
douyin_user_scraper_ab.py

@@ -0,0 +1,120 @@
1
+import json
2
+from xlog03 import *
3
+from rds_model.db_redis import DbRedis
4
+from log.print_log import PrintLog
5
+import douyin_class
6
+import random
7
+import threading
8
+
9
+redisModel = DbRedis.connect()
10
+redisDyModel = DbRedis.douyin_connect()
11
+
12
+def get_scraper_proxy(proxy_key):
13
+    proxy_dict = redisModel.hgetall(proxy_key)
14
+    if (proxy_dict is None) or (len(proxy_dict) == 0):
15
+        return
16
+    proxy_list = list(proxy_dict)
17
+
18
+    now = int(time.time())
19
+    while True:
20
+        proxy = random.choice(proxy_list)
21
+        if proxy is None:
22
+            return
23
+        proxy_info = proxy_dict.get(proxy)
24
+        if proxy_info is None:
25
+            continue
26
+        proxy_info = json.loads(proxy_info)
27
+        expire_at = int(proxy_info.get('expired_at'))
28
+        # drop expired proxies
29
+        if expire_at <= now:
30
+            redisModel.hdel(proxy_key, proxy)
31
+            proxy_list.remove(proxy)
32
+            continue
33
+
34
+        return proxy
35
+
36
+
37
+def scraper():
38
+    rkey = 'DOUYIN_REGISTER_QUEUE'
39
+    pKey = 'IpProxyHash'
40
+    uKey = 'SL:List:Douyin:BarrageUserScrapeQueue'
41
+    sKey = 'SL:List:Douyin:BarrageUserDataQueue'
42
+
43
+    cur_time = int(time.time())
44
+    while True:
45
+        now_time = int(time.time())
46
+        if (now_time - cur_time) > 270:
47
+            print('thread_' + threading.current_thread().name + ' finish')
48
+            break
49
+
50
+        rllen = redisDyModel.llen(rkey)
51
+        if rllen == 0:
52
+            time.sleep(2)
53
+            continue
54
+
55
+        ullen = redisDyModel.llen(uKey)
56
+        if ullen == 0:
57
+            time.sleep(2)
58
+            continue
59
+
60
+        json_data = redisDyModel.rpop(rkey)
61
+        str_time = time.strftime("%H:%M:%S", time.localtime())
62
+        try:
63
+            proxy = get_scraper_proxy(pKey)
64
+            if proxy:
65
+                proxies = {
66
+                    "http": "http://" + proxy,
67
+                    "https": "http://" + proxy
68
+                }
69
+            else:
70
+                time.sleep(2)
71
+                continue
72
+
73
+            print(str_time + ' Scraping via proxy: ' + str(proxies))
74
+            PrintLog.print(str_time + ' Scraping via proxy: ' + str(proxies))
75
+            douApi = douyin_class.DouYinApi('', proxies)
76
+
77
+            dict_data = json.loads(json_data)
78
+            device_id, iid, udid, openudid, cookie = dict_data['device_id'], dict_data['iid'], dict_data['uuid'], \
79
+                                                     dict_data['openudid'], dict_data['cookie']
80
+            douApi.init_device_ids(device_id, iid, udid, openudid)
81
+
82
+            user_id = redisDyModel.rpop(uKey)
83
+            response = douApi.get_user_info(user_id)
84
+            if len(response.text) > 0 and response.json()['status_code'] == 0 and response.json()['user']:
85
+                print(str_time + ' user_id:' + str(user_id) + ' scraped successfully')
86
+                PrintLog.print(str_time + ' user_id:' + str(user_id) + ' scraped successfully')
87
+                redisDyModel.lpush(sKey, response.text)
88
+                redisDyModel.lpush(rkey, json_data)
89
+            else:
90
+                print(str_time + ' user_id:' + str(user_id) + ' scrape failed')
91
+                PrintLog.print(str_time + ' user_id:' + str(user_id) + ' scrape failed')
92
+                scraper_time = dict_data.get('times', 0)  # fresh records carry no retry counter yet
93
+                if scraper_time < 10:
94
+                    dict_data['times'] = scraper_time + 1
95
+                    redisDyModel.lpush(rkey, json.dumps(dict_data))
96
+
97
+            time.sleep(1)
98
+        except Exception as e:
99
+            print(str_time + ' Error: ' + str(e))
100
+            PrintLog.print(str_time + ' Error: ' + str(e))
101
+            redisDyModel.lpush(rkey, json_data)
102
+            continue
103
+
104
+if __name__ == '__main__':
105
+    import warnings
106
+    warnings.filterwarnings("ignore")
107
+
108
+    print('=========== Scraper ===========')
109
+    PrintLog.print('=========== Scraper ===========')
110
+
111
+    threading_count = 10
112
+    for i in range(0, threading_count):
113
+        task = threading.Thread(target=scraper, name=i)
114
+        task.start()  # ready and waiting for the CPU to schedule
115
+
116
+
117
+
118
+
119
+
120
+
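
When a lookup fails, the worker above re-queues the device record with an attempt counter embedded in the JSON payload, dropping the record after ten failures. That bookkeeping, isolated as a sketch (dict_data is assumed to come from json.loads of a queue entry):

import json

MAX_ATTEMPTS = 10

def requeue_with_counter(r, rkey, dict_data):
    attempts = dict_data.get('times', 0)      # fresh records carry no counter yet
    if attempts < MAX_ATTEMPTS:
        dict_data['times'] = attempts + 1
        r.lpush(rkey, json.dumps(dict_data))  # retry later with the bumped count
    # after MAX_ATTEMPTS the record is silently dropped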

+ 157 - 0
douyin_user_scraper_in.py

@@ -0,0 +1,157 @@
1
+import requests
2
+from xlog03 import *
3
+from rds_model.db_redis import DbRedis
4
+import random
5
+import json
6
+import threading
7
+from log.print_log import PrintLog
8
+
9
+redisModel = DbRedis.connect()
10
+redisDyModel = DbRedis.douyin_connect()
11
+
12
+def get_user_info(user_id, proxies):
13
+    params = {
14
+        'user_id': user_id,
15
+        'version_code': '9.8.1',
16
+        'js_sdk_version': '1.47.2.2',
17
+        'app_name': 'aweme',
18
+        'app_version': '9.8.1',
19
+        'channel': 'App%20Store',
20
+        'mcc_mnc': 46002,
21
+        'aid': 1128,
22
+        'screen_width': 640,
23
+        'os_api': 18,
24
+        'ac': 'WIFI',
25
+        'os_version': '13.3.1',
26
+        'device_platform': 'iphone',
27
+        'build_number': 98107,
28
+        'device_type': 'iPhone8,4',
29
+        'address_book_access': 1
30
+    }
31
+    url = 'https://api3-normal-c-lf.amemv.com/aweme/v1/user/profile/self/?'
32
+
33
+    douyin_url = parse_params(url, params)
34
+    response = http_get(douyin_url, proxies)
35
+
36
+    return response
37
+
38
+def parse_params(url, params):
39
+    if params is None:
40
+        params = {}
41
+
42
+    if not url.endswith('?'):
43
+        url = url + '?'
44
+
45
+    common_params = parse.urlencode(params)
46
+    douyin_url = url + common_params
47
+
48
+    return douyin_url
49
+
50
+def http_get(douyin_url, proxies):
51
+    if proxies:
52
+        resp = requests.get(douyin_url, proxies=proxies, verify=False, timeout=10)
53
+    else:
54
+        resp = requests.get(douyin_url, verify=False, timeout=10)
55
+
56
+    return resp
57
+
58
+def get_scraper_proxy(proxy_key):
59
+    proxy_dict = redisModel.hgetall(proxy_key)
60
+    if (proxy_dict is None) or (len(proxy_dict) == 0):
61
+        return
62
+    proxy_list = list(proxy_dict)
63
+
64
+    now = int(time.time())
65
+    while True:
66
+        proxy = random.choice(proxy_list)
67
+        if proxy is None:
68
+            return
69
+        proxy_info = proxy_dict.get(proxy)
70
+        if proxy_info is None:
71
+            continue
72
+        proxy_info = json.loads(proxy_info)
73
+        expire_at = int(proxy_info.get('expired_at'))
74
+        # drop expired proxies
75
+        if expire_at <= now:
76
+            redisModel.hdel(proxy_key, proxy)
77
+            proxy_list.remove(proxy)
78
+            continue
79
+
80
+        return proxy
81
+
82
+def scraper():
83
+    pKey = 'IpProxyHash'
84
+    uKey = 'SL:List:Douyin:BarrageUserScrapeQueue'
85
+    sKey = 'SL:List:Douyin:BarrageUserDataQueue'
86
+    hKey = 'SL:Hash:Douyin:BarrageUserScrapeRecord'
87
+
88
+    cur_time = int(time.time())
89
+    while True:
90
+        now_time = int(time.time())
91
+        if (now_time - cur_time) > 270:
92
+            print('thread_' + threading.current_thread().name + ' finish')
93
+            break
94
+
95
+        ullen = redisDyModel.llen(uKey)
96
+        if ullen == 0:
97
+            time.sleep(2)
98
+            continue
99
+
100
+        str_time = time.strftime("%H:%M:%S", time.localtime())
101
+        user_id = redisDyModel.rpop(uKey)
102
+
103
+        try:
104
+            proxy = get_scraper_proxy(pKey)
105
+            if proxy:
106
+                proxies = {
107
+                    "http": "http://" + proxy,
108
+                    "https": "http://" + proxy
109
+                }
110
+            else:
111
+                time.sleep(2)
112
+                continue
113
+
114
+            print(str_time + ' Scraping via proxy: ' + str(proxies))
115
+            PrintLog.print(str_time + ' Scraping via proxy: ' + str(proxies))
116
+
117
+            response = get_user_info(user_id, proxies)
118
+            if len(response.text) > 0 and response.json()['status_code'] == 0 and response.json()['user']:
119
+                print(str_time + ' user_id:' + str(user_id) + ' scraped successfully')
120
+                PrintLog.print(str_time + ' user_id:' + str(user_id) + ' scraped successfully')
121
+                redisDyModel.lpush(sKey, response.text)
122
+                redisDyModel.hdel(hKey, user_id)
123
+            else:
124
+                print(str_time + ' user_id:' + str(user_id) + ' scrape failed')
125
+                PrintLog.print(str_time + ' user_id:' + str(user_id) + ' scrape failed')
126
+
127
+                record_json = redisDyModel.hget(hKey, user_id)
128
+                if record_json:
129
+                    record_dict = json.loads(record_json)
130
+                    if record_dict['times'] < 10:
131
+                        record_dict['times'] += 1
132
+                        redisDyModel.hset(hKey, user_id, json.dumps(record_dict))
133
+                    else:
134
+                        redisDyModel.hdel(hKey, user_id)
135
+                else:
136
+                    record_dict = {'times': 1}
137
+                    redisDyModel.hset(hKey, user_id, json.dumps(record_dict))
138
+
139
+            time.sleep(1)
140
+        except Exception as e:
141
+            print(str_time + ' Error: ' + str(e))
142
+            PrintLog.print(str_time + ' Error: ' + str(e))
143
+            redisDyModel.lpush(uKey, user_id)
144
+            continue
145
+
146
+if __name__ == '__main__':
147
+    import warnings
148
+    warnings.filterwarnings("ignore")
149
+
150
+    print('=========== Scraper ===========')
151
+    PrintLog.print('=========== Scraper ===========')
152
+
153
+    threading_count = 50
154
+    for i in range(0, threading_count):
155
+        task = threading.Thread(target=scraper, name=i)
156
+        task.start()  # ready and waiting for the CPU to schedule
157
+
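
Unlike the previous script, this one keeps retry state in a Redis hash keyed by user_id rather than inside the queue payload, so the counter survives independently of where the user_id is re-queued. A sketch of that pattern using the same hash key:

import json

def bump_retry(r, user_id, hkey='SL:Hash:Douyin:BarrageUserScrapeRecord', cap=10):
    raw = r.hget(hkey, user_id)
    record = json.loads(raw) if raw else {'times': 0}
    record['times'] += 1
    if record['times'] >= cap:
        r.hdel(hkey, user_id)                  # too many failures: give up on this user
        return False
    r.hset(hkey, user_id, json.dumps(record))  # persist the new attempt count
    return True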

+ 42 - 11
douyin_video_scraper_web.py

@@ -1,11 +1,15 @@
1
-import json,time,sys,threading,warnings
1
+import json
2
+import sys
3
+import threading
4
+import time
5
+import warnings
2 6
 
3
-from rds_model.db_redis import DbRedis
4
-from log.print_log import PrintLog
5 7
 from libs.proxy import Proxy
6
-from web_dy import *
8
+from log.print_log import PrintLog
9
+from rds_model.db_redis import DbRedis
7 10
 from rds_model.rds_user_video_list import RdsUserVideoList
8 11
 from web_cookie import Cookie
12
+from web_dy import *
9 13
 
10 14
 start_time = time.time()
11 15
 # -- coding: utf-8 --**
@@ -56,10 +60,25 @@ def get_signature(url=None,method='_signature'):
56 60
     
57 61
     return d
58 62
 
63
+def get_ua_ck():
64
+    ua_list=[
65
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
66
+        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
67
+        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
68
+    ]
69
+    ck_list=[
70
+        'ttwid=1%7CTVzdM0P0u-8dtsmh6c-EaQEtBoTSOs_MG85FAg07AbA%7C1631502013%7C66442d8594de8e93ad18b73f3dfe0c94ed864c3d932824bcde9918b5be172321; passport_csrf_token=866923f1a32045fd82e47053158402a2',
71
+        'ttwid=1%7CGPDDu9-w3RGs2Pcd0wRlvLYoktpDt-v8LP5ZMyb1NBM%7C1630319594%7Cffb8de47e6da87dcfd76349b5ad34aa1f9b9d4332261a3a8436b932a893366c1; passport_csrf_token=79284b8777a7a54f3066cefef9af539e',
72
+        'ttwid=1%7CGsfqc7NpdOg4N-U-VX7Q77KsWjVTZ7gxLNifsisj8YE%7C1631618570%7Cafbb13a27fd1c2d7a8245454b1e0d7cd654d80848a320933a25d9ef77638c18c; passport_csrf_token=84911c9af94040a99cc10416bd27533d',
73
+        'ttwid=1%7C82FGr05YUOReYUB301ao_erqOQ3ilbXZdEy0tkMsdXY%7C1631863641%7C1dcebe643a96f00841a3b490db60de886bfe07ff3d276e509717abc4e1681ba6; passport_csrf_token=494ae3fffe00328101fd40e050ce49db',
74
+        'ttwid=1%7CwfnX3T9LY4_60iGoQNzyqYe5ahILFeRxfMuZ1pdgXf8%7C1632724192%7Cb613fddc0b533d5578dad4d5f9290705fdc6432aa854d492f4761d164dd3fdd5; passport_csrf_token=4a8afba333103b033e537003b72ee91b'
75
+    ]
76
+    return random.choice(ua_list),random.choice(ck_list)
77
+
59 78
 def get_user_videos(sec_user_id, max_cursor=0, count=20):
60 79
 
61
-    # ua,ck=get_ua_ck('get_user_videos')
62
-    ua="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
80
+    ua,ck=get_ua_ck()
81
+    # ua="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
63 82
     
64 83
     url='https://www.douyin.com/aweme/v1/web/aweme/post/?'
65 84
     param={
@@ -70,8 +89,8 @@ def get_user_videos(sec_user_id, max_cursor=0, count=20):
70 89
         'max_cursor': str(max_cursor),
71 90
         'count': str(count),
72 91
         'publish_video_strategy_type': '2',
73
-        'version_code': '160100',
74
-        'version_name': '16.1.0',
92
+        'version_code': '170400',
93
+        'version_name': '17.4.0',
75 94
         'cookie_enabled': 'true',
76 95
         'screen_width': '1920',
77 96
         'screen_height': '1080',
@@ -80,6 +99,7 @@ def get_user_videos(sec_user_id, max_cursor=0, count=20):
80 99
         'browser_name': 'Mozilla',
81 100
         'browser_version':ua.replace('Mozilla/',''),
82 101
         'browser_online': 'true',
102
+        "source" : "channel_pc_web"
83 103
     }
84 104
 
85 105
     url = url + parse.urlencode(param)
@@ -104,6 +124,12 @@ def get_user_videos(sec_user_id, max_cursor=0, count=20):
104 124
         "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
105 125
         "user-agent":ua,
106 126
         "withcredentials": "true",
127
+        "sec-ch-ua" : '"Google Chrome";v="93", " Not;A Brand";v="99", "Chromium";v="93"',
128
+        "sec-ch-ua-mobile" : "?0",
129
+        "sec-ch-ua-platform" : "Windows",
130
+        "sec-fetch-dest" : "empty",
131
+        "sec-fetch-mode" : "cors",
132
+        "sec-fetch-site" : "same-origin"
107 133
     }
108 134
 
109 135
     if ck:
@@ -134,6 +160,7 @@ def get_user_videos(sec_user_id, max_cursor=0, count=20):
134 160
                 timeout=8
135 161
             )
136 162
             if (response.status_code == 200) and (response.text is not None) and (response.text != ''):
163
+                # print(response)
137 164
                 response_json = response.json()
138 165
 
139 166
                 if (response_json.get('aweme_list') is not None):
@@ -164,6 +191,8 @@ def get_user_videos(sec_user_id, max_cursor=0, count=20):
164 191
                     + '\n'
165 192
                     + str(sec_user_id) 
166 193
                     + '\n'
194
+                    + 'Scrape result: ' + str(response)
195
+                    + '\n'
167 196
                 )
168 197
                 time.sleep(1)
169 198
         except requests.exceptions.ProxyError as e:
@@ -196,12 +225,14 @@ def get_user_videos(sec_user_id, max_cursor=0, count=20):
196 225
                 + ' Request raised an exception! ' 
197 226
                 + str(e) 
198 227
                 + '\n'
228
+                + str(e.__traceback__.tb_lineno)
229
+                + '\n'
199 230
                 + str(sec_user_id) 
200 231
                 + '\n'
201 232
                 + Proxy.proxy_info
202 233
             )
203 234
             pass
204
-
235
+        
205 236
     return response_json
206 237
 
207 238
 def scrape():
@@ -221,7 +252,7 @@ def scrape():
221 252
     )
222 253
 
223 254
     try:
224
-        videos = get_user_videos(sec_user_id=sec_user_id,max_cursor=0,count=20)
255
+        videos = get_user_videos(sec_user_id=sec_user_id,max_cursor=0,count=50)
225 256
        
226 257
         if videos is None:
227 258
             # rds.push_request_id(sec_user_id)
@@ -299,4 +330,4 @@ if __name__ == '__main__':
299 330
             )
300 331
             sys.exit(0)
301 332
 
302
-        time.sleep(1)
333
+        time.sleep(1)
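
The web endpoint here is gated by the _signature parameter, which get_signature() computes by running the bundled signature.js through execjs against the fully urlencoded request URL. The assembly steps, collected into one sketch (signature.js must sit in the working directory, as in this script):

import execjs
from urllib import parse
from urllib.parse import quote

def build_signed_url(base, params):
    url = base + parse.urlencode(params)
    with open('signature.js', 'r', encoding='utf-8') as f:
        ctx = execjs.compile(f.read())
    signature = ctx.call('_signature', url)         # run the bundled JS signer
    return url + '&_signature=' + quote(signature)  # quote: the signature may contain reserved chars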

+ 1 - 1
dy_barrage_scraper.py

@@ -48,7 +48,7 @@ def scrape():
48 48
                 if int(status) == 4:     # live has ended: do not push the record back to the scrape queue
49 49
                     continue
50 50
 
51
-            # live not ended: push back to the queue
51
+            # live not ended: push back to the queue once this scrape completes
52 52
             time_diff = int(time.time()) - int(scrape_time)
53 53
             if time_diff > 5:
54 54
                 # update the scrape time and push back to the queue before scraping

+ 100 - 0
dy_live_reward.py

@@ -0,0 +1,100 @@
1
+import time
2
+import threading
3
+import json
4
+import sys
5
+
6
+from rds_model.rds_dy_live_reward_request_list import RdsDyLiveRewardRequestList
7
+from log.print_log import PrintLog
8
+from libs.mysql_dy_live import MysqlDyLive
9
+from libs.dy_live_reward_info import DyLiveRewardInfo
10
+
11
+
12
+def scrape():
13
+    while True:
14
+        try:
15
+            # pop one live-room record from the pending-scrape list
16
+            room_info = rds.get_request_param()
17
+
18
+            # nothing pending: sleep a second and loop
19
+            if room_info is None:
20
+                time.sleep(1)
21
+                continue
22
+
23
+            # check whether the scrape interval has elapsed, then push the record straight back to the queue tail
24
+            room_dict = json.loads(room_info)
25
+            room_id = str(room_dict['room_id'])  # live room ID
26
+            scrape_time = room_dict['scrape_time']  # timestamp of the last scrape
27
+            uid = room_dict['uid']      # streamer UID
28
+
29
+            if (uid is None) or (room_id is None):
30
+                PrintLog.print(
31
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '请求数据数据异常!' + '\n'
32
+                    + room_info
33
+                )
34
+                time.sleep(1)
35
+                continue
36
+
37
+            room_dict.setdefault('times', 0)
38
+            room_dict['times'] = (room_dict['times'] + 1) % 10
39
+
40
+            # periodically check whether the stream has ended
41
+            if room_dict['times'] == 0:  # every tenth scrape, check whether the stream is over
42
+                # check whether the live stream has finished
43
+                live_info = MysqlDyLive().get_live_info(room_id)
44
+                if live_info is None:  # room no longer exists: do not push back to the queue
45
+                    continue
46
+
47
+                pk_id, room_id, status = live_info
48
+
49
+                if int(status) == 4:  # live has ended: do not push back to the queue
50
+                    continue
51
+
52
+            time_diff = int(time.time()) - int(scrape_time)
53
+
54
+            if time_diff > 120:
55
+                # update the scrape time and push back to the queue before scraping
56
+                room_dict['scrape_time'] = int(time.time())
57
+                rds.push_request_id(json.dumps(room_dict))
58
+
59
+                response_json = DyLiveRewardInfo.get_data(room_id)
60
+                if (response_json is None) or (response_json.get('data') is None) or (response_json.get('data').get('ranks') is None):
61
+                    PrintLog.print(time.strftime("%H:%M:%S", time.localtime()) + ' 获取打赏数据异常:' + room_id + ' ' + uid)
62
+                    rds.record_score(0)
63
+                    continue
64
+
65
+                # no reward ranks returned
66
+                if len(response_json.get('data').get('ranks')) == 0:
67
+                    PrintLog.print(time.strftime("%H:%M:%S", time.localtime()) + ' No reward data: ' + room_id + ' ' + uid)
68
+                    continue
69
+
70
+                data = json.dumps({
71
+                    "data": response_json.get('data'),
72
+                    "extra": {
73
+                        'room_id': room_id,
74
+                        'uid': uid,
75
+                    }
76
+                })
77
+
78
+                rds.record_score(1)
79
+                rds.push_data_list(data)
80
+            else:
81
+                print('Room %s was scraped %d seconds ago; no need to scrape again yet' % (room_id, time_diff))
82
+                time.sleep(1)
83
+                rds.push_request_id(json.dumps(room_dict))
84
+
85
+        except Exception as e:
86
+            PrintLog.print(time.strftime("%H:%M:%S", time.localtime()) + ' ' + room_id + '数据异常:' + str(e))
87
+
88
+        time.sleep(0.1)
89
+
90
+
91
+if __name__ == "__main__":
92
+    print("主方法开始执行")
93
+
94
+    rds = RdsDyLiveRewardRequestList()
95
+
96
+    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ' + ' 开始执行,待爬取直播队列长度:' + str(rds.get_len()))
97
+
98
+    for i in range(1, 50):
99
+        task = threading.Thread(target=scrape, name=i)
100
+        task.start()  # ready and waiting for the CPU to schedule
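
Note the modulo-10 counter: the scraper only pays for a MySQL liveness lookup on every tenth pass over a room, and the counter travels inside the queued record itself. That gate, extracted as a sketch (is_finished is a hypothetical wrapper around the MysqlDyLive lookup):

def should_keep_scraping(room_dict, is_finished):
    room_dict.setdefault('times', 0)
    room_dict['times'] = (room_dict['times'] + 1) % 10
    if room_dict['times'] == 0 and is_finished(room_dict['room_id']):
        return False   # stream is over: let the record drop out of the queue
    return True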

+ 79 - 0
libs/dy_live_reward_info.py

@@ -0,0 +1,79 @@
1
+import requests
2
+import time
3
+
4
+from libs.proxy import Proxy
5
+from log.print_log import PrintLog
6
+from libs.en0414 import DouYinApi
7
+
8
+
9
+class DyLiveRewardInfo:
10
+    @staticmethod
11
+    def get_data(room_id, retry=0):
12
+        while True:
13
+            if retry > 10:
14
+                break
15
+
16
+            retry += 1
17
+
18
+            proxy = Proxy.get()
19
+
20
+            proxies = {
21
+                "http": "http://" + proxy,
22
+                "https": "http://" + proxy
23
+            }
24
+
25
+            try:
26
+                dou_api = DouYinApi('', proxies)
27
+                result = dou_api.register_device()
28
+                device_id, iid, udid, openudid, cookie = result['device_id'], result['iid'], result['uuid'], result[
29
+                    'openudid'], result['cookie']
30
+                dou_api.init_device_ids(device_id, iid, udid, openudid)
31
+
32
+                params = {
33
+                    "has_market": "0",
34
+                    "is_activated": '0'
35
+                }
36
+                dou_api.comm_get('https://aweme.snssdk.com/service/2/app_alert/?', params)
37
+
38
+                response_json = dou_api.get_webcast_ranklist(room_id)
39
+
40
+                if (response_json is not None) and (response_json.get('status_code') == 0):
41
+                    print('Success: ' + room_id)
42
+                    PrintLog.print('Success: ' + room_id)
43
+                    return response_json
44
+
45
+                if retry >= 10:
46
+                    print('Failed: ' + room_id + '***')
47
+                    PrintLog.print('Failed: ' + room_id + '***')
48
+                    return None
49
+
50
+                # not at the cap yet: loop again with a fresh proxy and device
51
+
52
+                continue
53
+            except requests.exceptions.ProxyError as e:
54
+                PrintLog.print(
55
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '代理过期!' + str(e) + '\n'
56
+                    + room_id + '\n'
57
+                    + Proxy.proxy_info
58
+                )
59
+                Proxy.del_proxy(proxy)
60
+                pass
61
+            except requests.exceptions.ConnectTimeout as e:
62
+                PrintLog.print(
63
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ConnectTimeout!' + str(e) + '\n'
64
+                    + room_id + '\n'
65
+                    + Proxy.proxy_info
66
+                )
67
+                Proxy.del_proxy(proxy)
68
+                pass
69
+            except Exception as e:
70
+                PrintLog.print(
71
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '请求抛出异常!' + str(e) + '\n'
72
+                    + room_id
73
+                )
74
+                pass
75
+
76
+
77
+if __name__ == "__main__":
78
+    room_id = '6955048598123891496'
79
+    reward_detail = DyLiveRewardInfo.get_data(room_id)
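
A note on the flow above: get_data() registers a throwaway device, warms it up with the app_alert call, and only then requests the webcast rank list, retrying the whole sequence with a fresh proxy until the cap. The warm-up request appears to exist purely to make the new device look active before the real call; this reading is inferred from the call order, not documented in the source.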

+ 1 - 1
libs/en0414.py

@@ -934,7 +934,7 @@ class DouYinApi:
934 934
             a = parse.urlencode(form_params)
935 935
             stub = hashlib.md5(a.encode('utf-8')).hexdigest()
936 936
         ts = int(time.time())
937
-        from lib_scraper.Xg04 import X_Gorgon
937
+        from libs.Xg04 import X_Gorgon
938 938
         params = url[url.index('?') + 1:]
939 939
         result = X_Gorgon(params, "", cookie)
940 940
         gorgon = result['X-Gorgon']

+ 1 - 1
libs/mysql_user_living.py

@@ -6,7 +6,7 @@ class MysqlUserLiving(DbMysql):
6 6
         super().__test__()
7 7
 
8 8
     def get_living_info(self):
9
-        sql = 'SELECT `room_id`,l.`uid`,u.`sec_uid` FROM douyin_live as l left join douyin_users as u on l.`uid`=u.`uid` WHERE `finish_time`=0 order by u.`follower_count` desc LIMIT 1'
9
+        sql = 'SELECT `room_id`,l.`uid`,u.`sec_uid` FROM douyin_live as l left join douyin_users as u on l.`uid`=u.`uid` WHERE `finish_time`=0 and `has_commerce_goods`=1 order by l.`sell_count_sum` desc LIMIT 1'
10 10
         try:
11 11
             # run the query
12 12
             self.test.execute(sql)

+ 17 - 1
rds_model/db_redis.py

@@ -36,4 +36,20 @@ class DbRedis:
36 36
                 decode_responses=True
37 37
             )
38 38
 
39
-        return DbRedis.__douyin_connection
39
+        return DbRedis.__douyin_connection
40
+    __outer_connection = None  # cache for the outer-net client; connect() keeps its own __connection
41
+    @staticmethod
42
+    def outer_net_connect():
43
+        if DbRedis.__outer_connection is None:
44
+            host = 'r-2ze28bdb7389a8a4pd.redis.rds.aliyuncs.com'
45
+            port = 6379
46
+            password = 'Zhuaduoduo2017'
47
+
48
+            DbRedis.__outer_connection = redis.StrictRedis(
49
+                host=host,
50
+                port=port,
51
+                password=password,
52
+                decode_responses=True
53
+            )
54
+
55
+        return DbRedis.__outer_connection
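
With the cache split into its own class attribute as above, each factory memoizes one client per target, so repeated calls share a connection instead of re-handshaking. A quick usage sketch:

from rds_model.db_redis import DbRedis

inner = DbRedis.connect()             # intranet instance (scrape queues)
douyin = DbRedis.douyin_connect()     # douyin-dedicated instance
outer = DbRedis.outer_net_connect()   # outer-net aliyun instance

# Each factory returns the same cached client on every call.
assert outer is DbRedis.outer_net_connect()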

+ 30 - 0
rds_model/rds_dy_live_reward_request_list.py

@@ -0,0 +1,30 @@
1
+from rds_model.db_redis import DbRedis
2
+
3
+
4
+class RdsDyLiveRewardRequestList:
5
+    def __init__(self):
6
+        self.redis = DbRedis.connect()
7
+
8
+    # length of the pending live-reward scrape queue
9
+    def get_len(self):
10
+        key = 'BrandLiveData.DyUserRewardScraperWaiting'
11
+        return self.redis.llen(key)
12
+
13
+    # pop one live-room record awaiting reward scraping
14
+    def get_request_param(self):
15
+        key = 'BrandLiveData.DyUserRewardScraperWaiting'
16
+        return self.redis.rpop(key)
17
+
18
+    # push a record back onto the queue
19
+    def push_request_id(self, data):
20
+        key = 'BrandLiveData.DyUserRewardScraperWaiting'
21
+        return self.redis.lpush(key, data)
22
+
23
+    # store scraped reward data for downstream processing
24
+    def push_data_list(self, data):
25
+        key = 'BrandLiveData.DyLiveRewardsDataList'
26
+        return self.redis.lpush(key, data)
27
+
28
+    def record_score(self, data):
29
+        key = 'BrandLiveData.DyLiveRewardsScore'
30
+        self.redis.lpush(key, data)
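
A short usage sketch of the wrapper above, matching how dy_live_reward.py drives it: pop a room record, then either store scraped data or push the record back for a later pass:

import json
from rds_model.rds_dy_live_reward_request_list import RdsDyLiveRewardRequestList

rds = RdsDyLiveRewardRequestList()
room_info = rds.get_request_param()         # RPOP; returns None when the queue is empty
if room_info is not None:
    room = json.loads(room_info)
    room['scrape_time'] = 0                 # zero the timestamp to force an immediate re-scrape
    rds.push_request_id(json.dumps(room))   # LPUSH the record back onto the queue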

+ 44 - 14
web_cookie.py

@@ -47,7 +47,7 @@ class Cookie:
47 47
             create_at = int(time.mktime(time.strptime(cookie_info, "%Y-%m-%d %H:%M:%S")))
48 48
 
49 49
             # drop expired cookies
50
-            if now - create_at >= 3600:
50
+            if now - create_at >= 7200:
51 51
                 Cookie.redis.hdel(key, cookie)
52 52
                 cookie_list.remove(cookie)
53 53
                 continue
@@ -64,7 +64,7 @@ class Cookie:
64 64
 
65 65
         now = int(time.time())
66 66
 
67
-        if create_at - now >= 3600:
67
+        if now - create_at >= 7200:
68 68
             return
69 69
 
70 70
         # drop the invalid cookie
@@ -73,7 +73,23 @@ class Cookie:
73 73
     def set(cookie):
74 74
         Cookie.redis.hset('CookieHash', cookie, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
75 75
 
76
+    def get_url(self):
77
+        url_list = [
78
+            "https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ",
79
+            "https://www.douyin.com/user/MS4wLjABAAAA3Gq3QsbpkmIchOTXXF51Fy3Dyb0xF7rLvI3QEQjoYwo",
80
+            "https://www.douyin.com/user/MS4wLjABAAAAO6GOEmNyHKo8Kd1IdS95J884BAFTpR8eRDOBIAb2d7VNGWFBzhR2odpulkKJFV95",
81
+            "https://www.douyin.com/user/MS4wLjABAAAAc6xH1Jxur09z-Oy5M9IVpckmAyPlQg5uj_B8fFrTIiNv5B-XyH6G9RCQt3qLsZZU",
82
+            "https://www.douyin.com/user/MS4wLjABAAAA5v-bl3BibVonLiHoZWW173nKKy_yn2DACHErPbOoo3g",
83
+            "https://www.douyin.com/user/MS4wLjABAAAATjfBnSaWkl6ZJiznrCzkt7-l_7pkD4pLkKyPkLtD2VU",
84
+            "https://www.douyin.com/user/MS4wLjABAAAAZ-hqjDujmsKhlhuNV1R4OPrIWJ0XmhFnuJmy9h3u4VkovGqJycoOVGaCj8uqwQiJ",
85
+            "https://www.douyin.com/user/MS4wLjABAAAAoMNLc-_Vx_TDrJQvpGEtgZfpR99JDKf6n23mXpsnMGg",
86
+            "https://www.douyin.com/user/MS4wLjABAAAAMhl868Pj7GIBYNVX46kjLGS_eiprGHaDHe5ffqi91_s",
87
+            "https://www.douyin.com/user/MS4wLjABAAAAR8ow3aH-TjB2c4TJFqtDvFPhmd3TBFyHF1zMCLd39rdO45zpMZXAIvdwsQ4_7gw6"
88
+        ]
89
+        return random.choice(url_list)
90
+
76 91
     def get_ck(self):
92
+
77 93
         requests.packages.urllib3.disable_warnings()
78 94
         headers = {
79 95
             'authority': 'www.douyin.com',
@@ -92,6 +108,8 @@ class Cookie:
92 108
         # 屏蔽验证https证书提示
93 109
         requests.packages.urllib3.disable_warnings()
94 110
 
111
+        url = self.get_url()
112
+
95 113
         while True:
96 114
             if retry > 10:
97 115
                 break
@@ -107,7 +125,7 @@ class Cookie:
107 125
 
108 126
             try:
109 127
                 response = requests.get(
110
-                    'https://www.douyin.com/',
128
+                    url,
111 129
                     headers=headers,
112 130
                     proxies=proxies,
113 131
                     timeout=8,
@@ -123,17 +141,29 @@ class Cookie:
123 141
                     ck=ck[:-2]
124 142
                     ck+='; passport_csrf_token='+str(uuid.uuid4()).replace('-','')
125 143
 
126
-                    Cookie.set(ck)
127
-                    
128
-                    print(
129
-                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
130
-                        + ' 数据获取成功!' 
131
-                        + '\n'
132
-                        + ck
133
-                        + '\n'
134
-                        + Proxy.proxy_info
135
-                    )
136
-                    break
144
+                    num = ck.count('ttwid')
145
+                    if num > 0:
146
+                        Cookie.set(ck)
147
+                        print(
148
+                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
149
+                            + ' Cookie fetched successfully!' 
150
+                            + '\n'
151
+                            + ck
152
+                            + '\n'
153
+                            + Proxy.proxy_info
154
+                        )
155
+                        break
156
+                    else:
157
+                        print(
158
+                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
159
+                            + ' Cookie fetch failed!' 
160
+                            + '\n'
161
+                            + ck
162
+                            + '\n'
163
+                            + Proxy.proxy_info
164
+                            + str(cookie_dict)
165
+                        )
166
+                        break
137 167
                 else:
138 168
                     print(
139 169
                         time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
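
Two rules now decide whether a cookie is kept: it must contain a ttwid token (checked above before Cookie.set) and it must be younger than the 7200-second window this commit raised from 3600. Both checks, condensed into a sketch:

import time

MAX_COOKIE_AGE = 7200  # seconds; raised from 3600 in this commit

def is_fresh(created_at_ts: int) -> bool:
    return int(time.time()) - created_at_ts < MAX_COOKIE_AGE

def is_usable(cookie: str) -> bool:
    return 'ttwid' in cookie   # cookies without a ttwid token are rejected above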

+ 26 - 21
web_dy.py

@@ -1,7 +1,7 @@
1
-import requests,json,random,execjs,uuid,os
1
+import requests,json,random,execjs,uuid
2 2
 from urllib import parse
3 3
 from urllib.parse import quote
4
-from libs.proxy import Proxy
4
+# from libs.proxy import Proxy
5 5
 # -- coding: utf-8 --**
6 6
 
7 7
 class WebDouYin:
@@ -9,14 +9,14 @@ class WebDouYin:
9 9
         self.proxies = {
10 10
         }
11 11
     def get_signature(self,url=None,method='_signature'):
12
-
13
-        with open(os.getcwd()+"/signature.js", 'r', encoding='utf-8') as f:
12
+        with open('signature.js', 'r', encoding='utf-8') as f:
14 13
             b = f.read()
15 14
         
16 15
         c = execjs.compile(b)
17 16
         
18 17
         # url=url.replace('%28','(').replace('%29',')').replace('%2C',',')
19 18
         d = c.call(method, url.replace('\n',''))
19
+        
20 20
         # print('_signature',d)
21 21
         return d
22 22
     def get_ck(self, proxy=None):
@@ -32,7 +32,9 @@ class WebDouYin:
32 32
             'upgrade-insecure-requests': '1',
33 33
             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
34 34
         }
35
-        res = requests.get('https://www.douyin.com/', headers=headers,verify=False, proxies=proxy,timeout=8)
35
+        # res = requests.get('https://www.douyin.com/', headers=headers,verify=False, proxies=proxy,timeout=8)
36
+        res = requests.get("https://www.douyin.com/user/MS4wLjABAAAAKpCGhwidAtgmUXmYIT0zjp2QpGquUaOCEeVPE6_gHjQ", headers=headers,verify=False, proxies=proxy,timeout=8)
37
+
36 38
         cookie_dict=res.cookies.get_dict()
37 39
         ck=''
38 40
 
@@ -48,11 +50,15 @@ class WebDouYin:
48 50
     def get_ua_ck(self,type_name=None):
49 51
         ua_list=[
50 52
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
53
+            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
54
+            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
51 55
         ]
52 56
         ck_list=[
53
-            # 'ttwid=1%7CTVzdM0P0u-8dtsmh6c-EaQEtBoTSOs_MG85FAg07AbA%7C1631502013%7C66442d8594de8e93ad18b73f3dfe0c94ed864c3d932824bcde9918b5be172321; passport_csrf_token=866923f1a32045fd82e47053158402a2',
54
-            # 'ttwid=1%7CGPDDu9-w3RGs2Pcd0wRlvLYoktpDt-v8LP5ZMyb1NBM%7C1630319594%7Cffb8de47e6da87dcfd76349b5ad34aa1f9b9d4332261a3a8436b932a893366c1; passport_csrf_token=79284b8777a7a54f3066cefef9af539e'
55
-            'ttwid=1%7CGsfqc7NpdOg4N-U-VX7Q77KsWjVTZ7gxLNifsisj8YE%7C1631618570%7Cafbb13a27fd1c2d7a8245454b1e0d7cd654d80848a320933a25d9ef77638c18c; passport_csrf_token=84911c9af94040a99cc10416bd27533d'
57
+            'ttwid=1%7CTVzdM0P0u-8dtsmh6c-EaQEtBoTSOs_MG85FAg07AbA%7C1631502013%7C66442d8594de8e93ad18b73f3dfe0c94ed864c3d932824bcde9918b5be172321; passport_csrf_token=866923f1a32045fd82e47053158402a2',
58
+            'ttwid=1%7CGPDDu9-w3RGs2Pcd0wRlvLYoktpDt-v8LP5ZMyb1NBM%7C1630319594%7Cffb8de47e6da87dcfd76349b5ad34aa1f9b9d4332261a3a8436b932a893366c1; passport_csrf_token=79284b8777a7a54f3066cefef9af539e',
59
+            'ttwid=1%7CGsfqc7NpdOg4N-U-VX7Q77KsWjVTZ7gxLNifsisj8YE%7C1631618570%7Cafbb13a27fd1c2d7a8245454b1e0d7cd654d80848a320933a25d9ef77638c18c; passport_csrf_token=84911c9af94040a99cc10416bd27533d',
60
+            'ttwid=1%7C82FGr05YUOReYUB301ao_erqOQ3ilbXZdEy0tkMsdXY%7C1631863641%7C1dcebe643a96f00841a3b490db60de886bfe07ff3d276e509717abc4e1681ba6; passport_csrf_token=494ae3fffe00328101fd40e050ce49db',
61
+            'ttwid=1%7CwfnX3T9LY4_60iGoQNzyqYe5ahILFeRxfMuZ1pdgXf8%7C1632724192%7Cb613fddc0b533d5578dad4d5f9290705fdc6432aa854d492f4761d164dd3fdd5; passport_csrf_token=4a8afba333103b033e537003b72ee91b'
56 62
         ]
57 63
         return random.choice(ua_list),random.choice(ck_list)
58 64
     def response(self,url,headers,proxy,data=None):
@@ -144,7 +150,6 @@ class WebDouYin:
144 150
             'browser_version':ua.replace('Mozilla/',''),
145 151
             'browser_online': 'true',
146 152
         }
147
-
148 153
         url = url + parse.urlencode(param)
149 154
         _signature = self.get_signature(url)
150 155
         url+='&_signature='+quote(_signature)
@@ -161,21 +166,21 @@ class WebDouYin:
161 166
             "user-agent":ua,
162 167
             "withcredentials": "true",
163 168
         }
164
-
165 169
         if ck:headers['cookie']=ck
166 170
         json_data=self.response(url=url,headers=headers,proxy=proxy)
167 171
         return json_data
168 172
 if __name__ == '__main__':
169 173
     webdy=WebDouYin()
170
-    # res = webdy.get_user_videos('MS4wLjABAAAAqLPgx-hHf27EqGEtRQ6YyuQQTmikB5CBO1jXy61yhWKujGd8KO5G8V2vdcLQJAym')
171
-    # print(res)
172
-    info = webdy.get_user_info('MS4wLjABAAAAHYNHFpUR36AQSxdDpSFrI2uM4aDvSF-8vjtjNiLepD0')
173
-    proxy = Proxy.get()
174
-    print(proxy)
175
-    proxies = {
176
-        "http": "http://" + proxy,
177
-        "https": "http://" + proxy
178
-    }
174
+    res = webdy.get_user_videos('MS4wLjABAAAAqLPgx-hHf27EqGEtRQ6YyuQQTmikB5CBO1jXy61yhWKujGd8KO5G8V2vdcLQJAym')
175
+    print(res)
176
+    # webdy.get_user_info('MS4wLjABAAAAC2euvL-0qMZyd80aNwZa-wX5KXuz_r7YVNHSBOogfVg')
177
+
178
+    # proxy = Proxy.get()
179
+    # print(proxy)
180
+    # proxies = {
181
+    #     "http": "http://" + proxy,
182
+    #     "https": "http://" + proxy
183
+    # }
179 184
 
180
-#    ck = webdy.get_user_info("MS4wLjABAAAAC2euvL-0qMZyd80aNwZa-wX5KXuz_r7YVNHSBOogfVg")
181
-#    print(ck)
185
+    # ck = webdy.get_ck()
186
+    # print(ck)
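
The __main__ block fetches a single page; walking a whole profile means feeding each response's max_cursor back into the next call until has_more goes false. A hypothetical driver sketch: the max_cursor/count keywords and the has_more/max_cursor response fields mirror the module-level get_user_videos in douyin_video_scraper_web.py and are an assumption for this class method:

from web_dy import WebDouYin

def iter_all_videos(sec_uid, page_size=20):
    webdy = WebDouYin()
    cursor = 0
    while True:
        page = webdy.get_user_videos(sec_uid, max_cursor=cursor, count=page_size)
        if not page or not page.get('aweme_list'):
            break
        yield from page['aweme_list']
        if not page.get('has_more'):
            break
        cursor = page.get('max_cursor') or 0
        if not cursor:
            break   # defensive: avoid looping forever on a missing cursor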