# osc/spiders/MediaSpiders/MediaSpiders/utils/wechat_links_fetcher.py
#
# NOTE: this file contains Unicode characters (Chinese text and punctuation in
# runtime strings) that might be confused with similar-looking ASCII
# characters. This is intentional.

import json
import random
import time
from math import ceil
import redis
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
WECHAT_USER_TYPE
from MediaSpiders.utils.http_utils import http_post, UA
from MediaSpiders.utils.login_utils import login
# Module-level browser and Redis setup. NOTE: this runs at import time and
# starts a Chrome instance immediately.
chrome_options = Options()
# Full path to the chrome.exe binary that Selenium should launch.
chrome_options.binary_location = r"C:\Users\DELL\Downloads\chrome-win64\chrome.exe"

# Full path to the chromedriver executable matching the browser above.
_CHROMEDRIVER_PATH = r"C:\Users\DELL\Downloads\chromedriver-win64\chromedriver.exe"

try:
    # Selenium 4: the driver path is supplied through a Service object. The
    # ``executable_path`` keyword of webdriver.Chrome was deprecated in 4.0
    # and removed in 4.10, where passing it raises TypeError.
    driver = webdriver.Chrome(
        service=Service(executable_path=_CHROMEDRIVER_PATH),
        options=chrome_options
    )
except TypeError:
    # Selenium 3 does not accept ``service=``; fall back to the legacy keyword.
    driver = webdriver.Chrome(
        executable_path=_CHROMEDRIVER_PATH,
        options=chrome_options
    )

# Hide ``navigator.webdriver`` from every newly loaded document so pages do
# not see the automation flag.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})

redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD)
# Endpoints of the WeChat official-account platform used by this script.
_SEARCH_ACCOUNT_API = 'https://mp.weixin.qq.com/cgi-bin/searchbiz'
_ARTICLE_LIST_API = 'https://mp.weixin.qq.com/cgi-bin/appmsg'

# Number of entries requested per article-list page (the ``count`` parameter).
_PAGE_SIZE = 5
# An account fetched less than this long ago (milliseconds) ends the run.
_MIN_REFETCH_INTERVAL_MS = 12 * 3600 * 1000
# Look-back window (seconds) for accounts that carry no ``updateTime``.
_DEFAULT_LOOKBACK_SECONDS = 500 * 24 * 3600
# Upper bound (seconds) for a single HTTP request so a stalled call cannot hang the run.
_REQUEST_TIMEOUT = 30
_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


class _FreqLimited(Exception):
    """Raised when an API response carries ``err_msg == "freq control"``."""


def _extract_token(url):
    """Return the value of the last ``token=`` parameter found in *url*.

    Only the token itself is returned: anything after the next ``&`` or ``#``
    is excluded. Raises RuntimeError when *url* contains no token, instead of
    silently producing a meaningless slice of the URL.
    """
    import re

    found = re.findall(r'token=([^&#]+)', url)
    if not found:
        raise RuntimeError(f'no token found in URL: {url}')
    return found[-1]


def _format_ms(timestamp_ms):
    """Format a millisecond timestamp as a local ``YYYY-mm-dd HH:MM:SS`` string."""
    return time.strftime(_TIME_FORMAT, time.localtime(timestamp_ms / 1000))


def _get_json(url, params, cookies, headers):
    """Wait 3-4 seconds, GET *url* with *params* and return the decoded JSON body.

    The random delay spaces out consecutive requests. Passing ``params``
    lets requests percent-encode every value (e.g. account names containing
    ``&`` or ``#``).
    """
    time.sleep(3 + random.random())
    response = requests.get(url, params=params, cookies=cookies, headers=headers,
                            timeout=_REQUEST_TIMEOUT)
    return json.loads(response.text)


def _report_unexpected(body):
    """Print an API response that lacks the expected payload.

    Raises _FreqLimited when the response reports ``"freq control"``.
    """
    print(json.dumps(body, ensure_ascii=False))
    base_resp = body.get('base_resp')
    if isinstance(base_resp, dict) and base_resp.get('err_msg') == "freq control":
        raise _FreqLimited


def _iter_new_articles(account, fake_id, start_timestamp, max_pages, token, cookies, headers):
    """Yield article dicts of one account that are newer than *start_timestamp*.

    Requests up to *max_pages* pages of ``_PAGE_SIZE`` entries each and stops
    at the first article whose ``update_time`` (multiplied by 1000 to compare
    with the millisecond *start_timestamp*) is not newer than the threshold.
    A page without ``app_msg_list`` is printed and skipped; _FreqLimited is
    raised when such a page reports rate limiting.
    """
    start_time = _format_ms(start_timestamp)
    for page in range(max_pages):
        print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
        body = _get_json(_ARTICLE_LIST_API, {
            'action': 'list_ex',
            'begin': page * _PAGE_SIZE,
            'count': _PAGE_SIZE,
            'fakeid': fake_id,
            'type': 9,
            'query': '',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': 1,
        }, cookies, headers)
        if 'app_msg_list' not in body:
            _report_unexpected(body)
            continue
        for article in body['app_msg_list']:
            if article['update_time'] * 1000 <= start_timestamp:
                # Reached articles at or before the threshold: nothing newer follows.
                return
            yield article


def main():
    """Log in to mp.weixin.qq.com and store new article links per account in Redis.

    For every account returned by the account query API the script searches
    the account by name, pages through its article list and adds each link
    newer than the account's ``updateTime`` to the Redis set
    ``MediaSpiders:Wechat_links:<account id>``. After a successful pass the
    account's ``updateTime`` is posted back to the update API.

    The run ends early when the API reports rate limiting, or when an account
    was already fetched within the last 12 hours. The browser and the Redis
    connection are always released, even when login or a request fails.
    """
    count_per_account = 200
    max_pages = ceil(count_per_account / _PAGE_SIZE)
    total_count = 0
    browser = None
    try:
        browser = login().login_with_selenium(
            'https://mp.weixin.qq.com/',
            'wechat_links_fetcher',
            drivers=driver
        )
        token = _extract_token(browser.current_url)
        print(f'获取 token 成功!当前 token 为 {token}')
        cookies = {c['name']: c['value'] for c in browser.get_cookies()}
        print('获取 cookie 成功!')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
            'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
                       f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
        }

        query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
        post_body = {
            'userType': WECHAT_USER_TYPE,
            'userFlag': 0
        }
        account_rsp = json.loads(
            http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
        official_accounts = account_rsp['content'] if account_rsp['code'] == 200 else []

        for account_line in official_accounts:
            try:
                # A missing or null updateTime falls back to the default look-back window.
                start_timestamp = account_line.get('updateTime')
                if start_timestamp is None:
                    start_timestamp = int((time.time() - _DEFAULT_LOOKBACK_SECONDS) * 1000)
                account = account_line['userName']

                print(f"开始搜索公众号“{account}”...")
                rsp_body = _get_json(_SEARCH_ACCOUNT_API, {
                    'action': 'search_biz',
                    'begin': 0,
                    'count': 5,
                    'query': account,
                    'token': token,
                    'lang': 'zh_CN',
                    'f': 'json',
                    'ajax': 1,
                }, cookies, headers)
                if 'list' not in rsp_body:
                    _report_unexpected(rsp_body)
                    continue

                # Only an exact nickname match is accepted.
                matched_account = next(
                    (item for item in rsp_body['list'] if item['nickname'] == account), None)
                if matched_account is None:
                    print(f"未找到公众号“{account}”")
                    continue
                fake_id = matched_account['fakeid']

                next_start_timestamp = int(time.time() * 1000)
                if next_start_timestamp - start_timestamp < _MIN_REFETCH_INTERVAL_MS:
                    # Checked once per account: this account and the ones after it
                    # were fetched recently, so stop without touching updateTime.
                    print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接本次获取结束")
                    break

                for article in _iter_new_articles(account, fake_id, start_timestamp, max_pages,
                                                  token, cookies, headers):
                    title = article['title']
                    link = article['link']
                    total_count += 1
                    time_str = _format_ms(article['update_time'] * 1000)
                    print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
                          f"发表的文章《{title}》,链接地址:{link}")
                    redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}", link)

                # Reached only when the account was processed without rate limiting,
                # so it is safe to advance the account's fetch timestamp.
                next_start_time = _format_ms(next_start_timestamp)
                account_line['updateTime'] = next_start_timestamp
                http_post(SOCIAL_USER_UPDATE_API,
                          data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
                          headers={'User-Agent': UA, "Content-Type": "application/json"}
                          )
                print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
            except _FreqLimited:
                print("接口频率限制,稍后再试,本次获取结束")
                break
            except Exception as e:
                # Best effort: a failure on one account must not stop the others.
                print(repr(e))
    finally:
        redis_client.close()
        # Quit the logged-in browser; if login never returned one, quit the
        # module-level browser that was handed to it.
        (browser if browser is not None else driver).quit()


if __name__ == "__main__":
    main()