|
@@ -1,159 +1,305 @@
|
1
|
1
|
# -*- coding: utf-8 -*-
|
2
|
|
-
|
3
|
|
-import json
|
4
|
|
-import time
|
5
|
|
-import sys
|
|
2
|
+import json,time,sys,threading,warnings
|
6
|
3
|
|
7
|
4
|
from rds_model.db_redis import DbRedis
|
8
|
5
|
from log.print_log import PrintLog
|
9
|
6
|
from libs.proxy import Proxy
|
10
|
7
|
from web_dy import *
|
11
|
8
|
from rds_model.rds_user_video_list import RdsUserVideoList
|
|
9
|
+from libs.web_cookie import Cookie
|
12
|
10
|
|
13
|
|
-if __name__ == '__main__':
|
14
|
|
- import warnings
|
15
|
|
- start_time = time.time()
|
|
11
|
+start_time = time.time()  # Epoch seconds captured at import; the main loop exits once 300 s have passed since this value.
|
|
12
|
+
|
|
13
|
def set_score(flag):
    """Increment one counter of the stored "total@@@success@@@fail" score.

    Args:
        flag: 'all' bumps the total counter, 'success' the success counter and
            'fail' the failure counter. Any other value records nothing.

    When no score is stored yet, the score starts at total=1 with the flagged
    counter set to 1 and the remaining counters at 0.

    Raises:
        ValueError, IndexError: if the stored score does not contain three
            integer fields separated by '@@@'.

    NOTE(review): the score is read with get_score() and written back with
    record_score() in two separate calls, so concurrent callers can overwrite
    each other's increments.
    """
    # Field order inside the stored string: total, success, fail.
    score_fields = ('all', 'success', 'fail')

    rds = RdsUserVideoList()

    if flag not in score_fields:
        return
    position = score_fields.index(flag)

    stored = rds.get_score()
    if stored is None:
        # First record ever: total starts at 1, plus the flagged counter.
        counters = [1, 0, 0]
        counters[position] = 1
    else:
        parts = stored.split('@@@')
        counters = [int(parts[0]), int(parts[1]), int(parts[2])]
        counters[position] += 1

    rds.record_score('@@@'.join(str(value) for value in counters))
|
|
47
|
+
|
|
48
|
def get_signature(url=None, method='_signature'):
    """Compute a request signature by calling a function in signature.js.

    Args:
        url: The URL to sign. Newline characters are removed before it is
            passed to the JavaScript function.
        method: Name of the JavaScript function to call.

    Returns:
        Whatever the JavaScript function returns for the given URL.

    Raises:
        ValueError: if ``url`` is None.
        OSError: if the signature script cannot be read.
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if url is None:
        raise ValueError('get_signature() requires a url to sign')

    with open('/mnt/shop_live_scraper/signature.js', 'r', encoding='utf-8') as f:
        script_source = f.read()

    js_context = execjs.compile(script_source)
    return js_context.call(method, url.replace('\n', ''))
|
|
58
|
+
|
|
59
|
def get_user_videos(sec_user_id, max_cursor=0, count=20):
    """Request one page of a user's posts from the douyin.com web endpoint.

    The request URL is signed with get_signature(), sent with a cookie from
    Cookie.get(), and routed through a proxy obtained from Proxy.get() on
    every attempt. Up to 11 attempts are made; if none of them yields a
    response containing 'aweme_list', the cookie is removed through
    Cookie.del_cookie().

    Args:
        sec_user_id: Value sent as the 'sec_user_id' query parameter.
        max_cursor: Value sent as the 'max_cursor' query parameter.
        count: Value sent as the 'count' query parameter.

    Returns:
        The parsed JSON of the most recent HTTP 200 response with a non-empty
        body. This may lack 'aweme_list' when every attempt failed. Returns
        None if no cookie is available or no response could be parsed.
    """
    max_attempts = 11

    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

    param = {
        'device_platform': 'webapp',
        'aid': '6383',
        'channel': 'channel_pc_web',
        'sec_user_id': sec_user_id,
        'max_cursor': str(max_cursor),
        'count': str(count),
        'publish_video_strategy_type': '2',
        'version_code': '160100',
        'version_name': '16.1.0',
        'cookie_enabled': 'true',
        'screen_width': '1920',
        'screen_height': '1080',
        'browser_language': 'zh-CN',
        'browser_platform': 'Win32',
        'browser_name': 'Mozilla',
        'browser_version': ua.replace('Mozilla/', ''),
        'browser_online': 'true',
    }

    url = 'https://www.douyin.com/aweme/v1/web/aweme/post/?' + parse.urlencode(param)

    # The signature is computed over the unsigned URL and then appended to it.
    _signature = get_signature(url)
    url += '&_signature=' + quote(_signature)

    ck = Cookie.get()
    if ck is None:
        print('获取cookie失败')
        return None

    headers = {
        # NOTE(review): these four look like HTTP/2 pseudo-header names; they
        # are sent here as ordinary request headers.
        "authority": "www.douyin.com",
        "method": "GET",
        "path": str(url).replace('https://www.douyin.com', ''),
        "scheme": "https",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": ck,
        "referer": "https://www.douyin.com/user/{sec_user_id}?enter_method=search_result&enter_from=search_result".format(sec_user_id=sec_user_id),
        "user-agent": ua,
        "withcredentials": "true",
    }

    def _report_error(label, error):
        # Shared log layout for the exception handlers below.
        print(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            + label
            + str(error)
            + '\n'
            + str(sec_user_id)
            + '\n'
            + Proxy.proxy_info
        )

    response_json = None

    for _ in range(max_attempts):
        proxy = Proxy.get()
        proxies = {
            "http": "http://" + proxy,
            "https": "http://" + proxy
        }

        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=8
            )
            if response.status_code == 200 and response.text:
                response_json = response.json()

                if response_json.get('aweme_list') is not None:
                    break

                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' 数据获取失败!'
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                    + response.text
                    + Proxy.proxy_info
                )
            else:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    + ' 爬取http连接失败!'
                    + str(response.status_code)
                    + '\n'
                    + Proxy.proxy_info
                    + '\n'
                    + str(sec_user_id)
                    + '\n'
                )
            # Back off briefly before the next attempt.
            time.sleep(1)
        except requests.exceptions.ProxyError as e:
            _report_error(' 代理过期!', e)
            # The proxy itself failed, so discard it.
            Proxy.del_proxy(proxy)
        except requests.exceptions.ConnectTimeout as e:
            _report_error(' ConnectTimeout!', e)
            Proxy.del_proxy(proxy)
        except Exception as e:
            # Best effort: log and retry with the next proxy.
            _report_error(' 请求抛出异常!', e)
    else:
        # Every attempt failed: drop the cookie that was used.
        Cookie.del_cookie(ck)

    return response_json
|
|
203
|
+
|
|
204
|
def scrape(sec_user_id):
    """Fetch one user's video list and store the outcome in Redis.

    On success the response is pushed with push_data_list() as
    "<sec_user_id>@@@<json>" and the success counter is bumped. When the
    response has no usable 'aweme_list', the fail counter is bumped. When no
    response is available, or an exception occurs, the user is put back on
    the request queue.

    Args:
        sec_user_id: Identifier of the user to fetch; converted to str.

    The function always finishes by raising SystemExit via sys.exit(0). In a
    worker thread this simply ends that thread.
    """
    rds = RdsUserVideoList()
    sec_user_id = str(sec_user_id)

    # Queue entries are read back with json.loads(...).get('sec_uid'), so a
    # re-queued entry must be a JSON object rather than the bare id.
    retry_payload = json.dumps({'sec_uid': sec_user_id})

    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + str(sec_user_id)
    )

    time.sleep(0.1)

    try:
        videos = get_user_videos(sec_user_id=sec_user_id, max_cursor=0, count=20)

        if videos is None:
            rds.push_request_id(retry_payload)
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 数据获取失败!响应数据为空!'
                + '\n'
                + str(sec_user_id)
                + '\n'
            )
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept it.
            sys.exit(0)

        if isinstance(videos, dict):
            awemes = videos.get('aweme_list')
        else:
            print(videos)
            awemes = None

        if awemes:
            set_score('success')
            data = str(sec_user_id) + '@@@' + json.dumps(videos)
            rds.push_data_list(data)
        else:
            set_score('fail')

    except Exception as e:
        set_score('fail')
        rds.push_request_id(retry_payload)
        print(
            time.strftime("%H:%M:%S", time.localtime())
            + ' '
            + str(sec_user_id)
            + '数据异常:'
            + str(e)
        )

    sys.exit(0)
|
|
256
|
+
|
|
257
|
if __name__ == '__main__':
    # Dispatcher: keeps up to `threading_count` scrape() worker threads busy
    # with users taken from the request queue, and stops once 300 seconds have
    # elapsed since `start_time`.
    print("主方法开始执行")
    # Number of worker threads to run in parallel (first CLI argument).
    threading_count = int(sys.argv[1])

    rds = RdsUserVideoList()
    warnings.filterwarnings("ignore")

    print(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        + ' '
        + ' 开始执行,用户队列长度:'
        + str(rds.get_len())
    )

    while True:
        sys.stdout.flush()

        # Exclude the main thread from the number of live threads.
        active_count = threading.active_count() - 1

        increment = threading_count - active_count

        while increment > 0:
            sys.stdout.flush()

            user_info = rds.get_request_param()
            if user_info is None:
                # Queue is empty: leave the fill loop so the deadline check
                # below still runs instead of spinning here forever.
                time.sleep(0.1)
                break

            user_info = json.loads(user_info)
            sec_user_id = user_info.get('sec_uid')

            # args must be a tuple; a bare string would be unpacked character
            # by character into scrape()'s positional arguments.
            task = threading.Thread(target=scrape, args=(sec_user_id,))
            task.start()  # Ready to run; waits for the scheduler.
            increment = increment - 1

        current_time = time.time()

        if current_time - start_time > 300:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                + ' 主方法执行终止'
            )
            sys.exit(0)

        time.sleep(0.01)
|
159
|
305
|
|