153 lines
7.9 KiB
Python
153 lines
7.9 KiB
Python
import json
import random
import time
from math import ceil
from urllib.parse import parse_qs, urlparse

import redis
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
    WECHAT_USER_TYPE
from MediaSpiders.utils.http_utils import http_post, UA
from MediaSpiders.utils.login_utils import login

# Locations of the Chrome build and the matching chromedriver used by this script.
CHROME_BINARY_PATH = r"C:\Users\DELL\Downloads\chrome-win64\chrome.exe"
CHROMEDRIVER_PATH = r"C:\Users\DELL\Downloads\chromedriver-win64\chromedriver.exe"

chrome_options = Options()
# Full path to chrome.exe.
chrome_options.binary_location = CHROME_BINARY_PATH
try:
    # Selenium 4 style: the driver path is supplied through a Service object.
    # The bare ``executable_path`` keyword was removed in Selenium 4.10.
    driver = webdriver.Chrome(
        service=Service(executable_path=CHROMEDRIVER_PATH),
        options=chrome_options
    )
except TypeError:
    # Older Selenium releases do not accept ``service=``; fall back to the
    # legacy keyword so the script keeps working there.
    driver = webdriver.Chrome(
        executable_path=CHROMEDRIVER_PATH,
        options=chrome_options
    )
# Injected into every new document before page scripts run: makes
# ``navigator.webdriver`` read as undefined.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD)


# Maximum number of articles inspected per account in one run.
ARTICLES_PER_ACCOUNT = 200
# Number of entries requested per call to the search / article-list endpoints.
PAGE_SIZE = 5
# The run stops at the first account whose last scan is more recent than this.
RESCAN_INTERVAL_MS = 12 * 3600 * 1000
# Look-back window used when an account carries no 'updateTime'.
DEFAULT_LOOKBACK_SECONDS = 500 * 24 * 3600
# Upper bound (seconds) for each HTTP request to mp.weixin.qq.com.
REQUEST_TIMEOUT_SECONDS = 30
WECHAT_CGI_BASE = 'https://mp.weixin.qq.com/cgi-bin/'
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def extract_token(url):
    """Return the value of the ``token`` parameter found in *url*.

    The query string is checked first, then the fragment. When the parameter
    occurs several times the last occurrence wins.

    Raises:
        ValueError: if *url* carries no non-empty ``token`` parameter.
    """
    parts = urlparse(url)
    for component in (parts.query, parts.fragment):
        values = parse_qs(component).get('token')
        if values:
            return values[-1]
    raise ValueError(f'no token parameter found in URL: {url}')


def cookies_to_dict(raw_cookies):
    """Flatten a list of cookie dicts (with 'name'/'value' keys) into {name: value}."""
    return {cookie['name']: cookie['value'] for cookie in raw_cookies}


def is_freq_limited(rsp_body):
    """Return True when *rsp_body* has base_resp.err_msg equal to "freq control"."""
    base_resp = rsp_body.get('base_resp')
    return isinstance(base_resp, dict) and base_resp.get('err_msg') == "freq control"


def find_account(candidates, nickname):
    """Return the first candidate whose 'nickname' equals *nickname*, or None."""
    for candidate in candidates:
        if candidate.get('nickname') == nickname:
            return candidate
    return None


def format_ms(timestamp_ms):
    """Format a millisecond epoch timestamp as local time."""
    return time.strftime(TIME_FORMAT, time.localtime(timestamp_ms / 1000))


def wechat_get_json(endpoint, params, cookies, headers):
    """GET ``cgi-bin/<endpoint>`` with *params* and return the decoded JSON body.

    Passing the values through ``params=`` lets requests URL-encode them, so
    account names containing characters such as ``&`` or ``#`` stay intact.
    """
    response = requests.get(
        WECHAT_CGI_BASE + endpoint,
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS
    )
    return json.loads(response.text)


def main():
    """Log in to mp.weixin.qq.com and store recent article links per account.

    For every account returned by the account-query API: search the account,
    page through its article list, add links of articles newer than the
    account's 'updateTime' to a Redis set, then post the new 'updateTime'
    back. The run stops early on a "freq control" response or at the first
    account scanned less than 12 hours ago. The Redis client and the browser
    are always released, even when an error aborts the run.
    """
    total_count = 0
    browser = driver
    try:
        browser = login().login_with_selenium(
            'https://mp.weixin.qq.com/',
            'wechat_links_fetcher',
            drivers=driver
        )
        token = extract_token(browser.current_url)
        print(f'获取 token 成功!当前 token 为 {token}')
        cookies = cookies_to_dict(browser.get_cookies())
        print('获取 cookie 成功!')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
            'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
                       f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
        }
        query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
        post_body = {
            'userType': WECHAT_USER_TYPE,
            'userFlag': 0
        }
        account_rsp = json.loads(
            http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
        official_accounts = account_rsp['content'] if account_rsp['code'] == 200 else []
        page_count = ceil(ARTICLES_PER_ACCOUNT / PAGE_SIZE)

        for account_line in official_accounts:
            try:
                start_timestamp = account_line.get('updateTime')
                if start_timestamp is None:
                    start_timestamp = int((time.time() - DEFAULT_LOOKBACK_SECONDS) * 1000)
                start_time = format_ms(start_timestamp)
                account = account_line['userName']

                print(f"开始搜索公众号“{account}”...")
                time.sleep(3 + random.random())
                rsp_body = wechat_get_json('searchbiz', {
                    'action': 'search_biz',
                    'begin': 0,
                    'count': PAGE_SIZE,
                    'query': account,
                    'token': token,
                    'lang': 'zh_CN',
                    'f': 'json',
                    'ajax': 1
                }, cookies, headers)
                if 'list' not in rsp_body:
                    print(json.dumps(rsp_body, ensure_ascii=False))
                    if is_freq_limited(rsp_body):
                        print("接口频率限制,稍后再试,本次获取结束")
                        break
                    continue

                matched_account = find_account(rsp_body['list'], account)
                if matched_account is None:
                    print(f"未找到公众号“{account}”")
                    continue
                fake_id = matched_account['fakeid']

                next_start_timestamp = int(time.time() * 1000)
                # This condition does not change while paging, so it is checked
                # once here. Scan status is deliberately not updated.
                if next_start_timestamp - start_timestamp < RESCAN_INTERVAL_MS:
                    print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接,本次获取结束")
                    break

                rate_limited = False
                for page_index in range(page_count):
                    print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
                    time.sleep(3 + random.random())
                    article_rsp_body = wechat_get_json('appmsg', {
                        'action': 'list_ex',
                        'begin': page_index * PAGE_SIZE,
                        'count': PAGE_SIZE,
                        'fakeid': fake_id,
                        'type': 9,
                        'query': '',
                        'token': token,
                        'lang': 'zh_CN',
                        'f': 'json',
                        'ajax': 1
                    }, cookies, headers)
                    if 'app_msg_list' not in article_rsp_body:
                        print(json.dumps(article_rsp_body, ensure_ascii=False))
                        if is_freq_limited(article_rsp_body):
                            print("接口频率限制,稍后再试,本次获取结束")
                            rate_limited = True
                            break
                        continue

                    # True once an article at or before the cutoff is seen.
                    reached_cutoff = False
                    for article in article_rsp_body['app_msg_list']:
                        title = article['title']
                        link = article['link']
                        update_time = article['update_time'] * 1000
                        if update_time <= start_timestamp:
                            reached_cutoff = True
                            break
                        total_count += 1
                        time_str = format_ms(update_time)
                        print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
                              f"发表的文章《{title}》,链接地址:{link}")
                        redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}", link)
                    if reached_cutoff:
                        break

                if rate_limited:
                    # Rate limiting ends the run without updating scan status.
                    break

                next_start_time = format_ms(next_start_timestamp)
                account_line['updateTime'] = next_start_timestamp
                http_post(SOCIAL_USER_UPDATE_API,
                          data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
                          headers={'User-Agent': UA, "Content-Type": "application/json"}
                          )
                print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
            except Exception as e:
                # Best effort: a failure on one account must not abort the others.
                print(repr(e))
    finally:
        try:
            redis_client.close()
        finally:
            browser.quit()


if __name__ == "__main__":
    main()