
Add Douyin influencer user-info updater

commit 259d7f7ad3 by chenzhiyuan, 3 years ago
3 changed files with 103 additions and 6 deletions
  1. dy_userinfo_update.py (+68 -0)
  2. rds_model/rds_user_info_list.py (+27 -0)
  3. web_dy.py (+8 -6)

dy_userinfo_update.py (+68 -0)

@@ -0,0 +1,68 @@
+import time
+import threading
+import json
+
+
+from rds_model.rds_user_info_list import RdsUserInfoList
+from log.print_log import PrintLog
+from web_dy import WebDouYin
+
+
+def scrape():
+    rds_list = RdsUserInfoList()
+    web_dy = WebDouYin()
+    start_time = int(time.time())
+
+    while True:
+        sec_uid = None
+        try:
+            # Stop this worker after five minutes
+            if int(time.time()) - start_time > 5 * 60:
+                break
+            sec_uid = rds_list.get_wait_update_user()
+
+            if sec_uid is None:
+                time.sleep(0.1)
+                continue
+
+            sec_uid = str(sec_uid)
+
+            PrintLog.print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ' + sec_uid + ' start fetching user info')
+            response_json = web_dy.get_user_info(sec_uid)
+            if response_json is None:
+                # Re-queue the uid and skip this round; falling through
+                # would crash on response_json.get() below
+                rds_list.put_wait_update_user(sec_uid)
+                PrintLog.print(
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' fetch failed: response data is empty!' + '\n'
+                    + sec_uid + '\n'
+                )
+                continue
+
+            data = json.dumps({
+                "data": response_json.get('user'),
+                "extra": {
+                    'room_id': sec_uid
+                }
+            })
+
+            print('scrape succeeded')
+            rds_list.put_user_info(data)
+
+        except Exception as e:
+            print('scrape failed')
+            if sec_uid is not None:
+                rds_list.put_wait_update_user(sec_uid)
+            PrintLog.print(time.strftime("%H:%M:%S", time.localtime()) + ' ' + str(sec_uid) + ' data error: ' + str(e))
+        time.sleep(0.1)
+
+
+if __name__ == "__main__":
+    print("Main starting")
+
+    rds = RdsUserInfoList()
+    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' starting; pending user-info queue length: ' + str(rds.get_len()))
+    for i in range(1, 50):
+        # Spawn 49 worker threads
+        task = threading.Thread(target=scrape, name=str(i))
+        task.start()  # ready to run; waits to be scheduled
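A note on the queue handoff: scrape() drains BrandLiveData.DyWaitingUpdateUserInfoList and pushes its JSON results to BrandLiveData.DyUpdateUserInfoList. Below is a minimal sketch of seeding the waiting queue, assuming redis-py and a local Redis; the sec_uids are placeholders, and a plain redis.Redis connection stands in for the project's DbRedis.connect(), which this commit does not show.

import redis

# Assumed connection details; DbRedis.connect() is not shown in this commit.
r = redis.Redis(host='localhost', port=6379, db=0)

# Placeholder sec_uids, not real accounts.
for sec_uid in ['MS4wLjABAAAA_example_1', 'MS4wLjABAAAA_example_2']:
    r.rpush('BrandLiveData.DyWaitingUpdateUserInfoList', sec_uid)

# Queue length the updater's workers will drain
print(r.llen('BrandLiveData.DyWaitingUpdateUserInfoList'))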

rds_model/rds_user_info_list.py (+27 -0)

@@ -0,0 +1,27 @@
+from rds_model.db_redis import DbRedis
+
+
+class RdsUserInfoList:
+    def __init__(self):
+        self.redis = DbRedis.connect()
+
+    # Length of the queue of users awaiting an info update
+    def get_len(self):
+        key = 'BrandLiveData.DyWaitingUpdateUserInfoList'
+        return self.redis.llen(key)
+
+    # Pop one user from the head of the waiting queue
+    def get_wait_update_user(self):
+        key = 'BrandLiveData.DyWaitingUpdateUserInfoList'
+        return self.redis.lpop(key)
+
+    # Push a user (back) onto the head of the waiting queue
+    def put_wait_update_user(self, sec_uid):
+        key = 'BrandLiveData.DyWaitingUpdateUserInfoList'
+        return self.redis.lpush(key, sec_uid)
+
+    # Append fetched user info to the results queue
+    def put_user_info(self, data):
+        key = 'BrandLiveData.DyUpdateUserInfoList'
+        self.redis.rpush(key, data)
+
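For context, the results queue ends up holding JSON envelopes of the form {"data": <user>, "extra": {"room_id": <sec_uid>}} as built in dy_userinfo_update.py. A hedged sketch of a downstream consumer, again assuming redis-py in place of the DbRedis wrapper:

import json
import redis

# Assumed connection; decode_responses gives str instead of bytes.
r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

raw = r.lpop('BrandLiveData.DyUpdateUserInfoList')
if raw is not None:
    payload = json.loads(raw)
    user = payload.get('data')             # the 'user' object from the Douyin response
    sec_uid = payload['extra']['room_id']  # the worker stores the sec_uid under 'room_id'
    print(sec_uid, user)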

web_dy.py (+8 -6)

@@ -1,4 +1,4 @@
-import requests,json,random,execjs,uuid
+import requests,json,random,execjs,uuid,os
 from urllib import parse
 from urllib.parse import quote
 from libs.proxy import Proxy
@@ -9,14 +9,14 @@ class WebDouYin:
         self.proxies = {
         }
     def get_signature(self,url=None,method='_signature'):
-        with open('/mnt/shop_live_scraper/signature.js', 'r', encoding='utf-8') as f:
+
+        with open(os.getcwd()+"/signature.js", 'r', encoding='utf-8') as f:
             b = f.read()
 
         c = execjs.compile(b)
 
         # url=url.replace('%28','(').replace('%29',')').replace('%2C',',')
         d = c.call(method, url.replace('\n',''))
-
         # print('_signature',d)
         return d
     def get_ck(self, proxy=None):
@@ -144,6 +144,7 @@ class WebDouYin:
             'browser_version':ua.replace('Mozilla/',''),
             'browser_online': 'true',
         }
+
         url = url + parse.urlencode(param)
         _signature = self.get_signature(url)
         url+='&_signature='+quote(_signature)
@@ -160,6 +161,7 @@ class WebDouYin:
             "user-agent":ua,
             "withcredentials": "true",
         }
+
         if ck:headers['cookie']=ck
         json_data=self.response(url=url,headers=headers,proxy=proxy)
         return json_data
@@ -167,8 +169,7 @@ if __name__ == '__main__':
     webdy=WebDouYin()
     # res = webdy.get_user_videos('MS4wLjABAAAAqLPgx-hHf27EqGEtRQ6YyuQQTmikB5CBO1jXy61yhWKujGd8KO5G8V2vdcLQJAym')
     # print(res)
-    # webdy.get_user_info('MS4wLjABAAAAC2euvL-0qMZyd80aNwZa-wX5KXuz_r7YVNHSBOogfVg')
-
+    info = webdy.get_user_info('MS4wLjABAAAAHYNHFpUR36AQSxdDpSFrI2uM4aDvSF-8vjtjNiLepD0')
     proxy = Proxy.get()
     print(proxy)
     proxies = {
@@ -176,4 +177,5 @@ if __name__ == '__main__':
         "https": "http://" + proxy
     }
 
-    ck = webdy.get_ck()
+#    ck = webdy.get_user_info("MS4wLjABAAAAC2euvL-0qMZyd80aNwZa-wX5KXuz_r7YVNHSBOogfVg")
+#    print(ck)
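One caveat on the signature.js change: os.getcwd() resolves against the directory the process is launched from, not the script's location, so the lookup still breaks when the scraper is started from elsewhere. A sketch of a cwd-independent alternative, assuming signature.js sits next to web_dy.py:

import os

# Resolve signature.js relative to this module, not the working directory.
SIGNATURE_JS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'signature.js')

with open(SIGNATURE_JS, 'r', encoding='utf-8') as f:
    signature_source = f.read()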