[微信公众号] 自动化采集优化已完成
This commit is contained in:
parent
2c974902ab
commit
d4ac0c27cd
@ -129,7 +129,7 @@ if __name__ == "__main__":
|
|||||||
if 'token=' in current_url:
|
if 'token=' in current_url:
|
||||||
logger.info("使用 Redis 中的 cookie 登录成功")
|
logger.info("使用 Redis 中的 cookie 登录成功")
|
||||||
need_manual_login = False
|
need_manual_login = False
|
||||||
break
|
|
||||||
else:
|
else:
|
||||||
# 二次验证:检查页面上是否有登录状态相关的元素
|
# 二次验证:检查页面上是否有登录状态相关的元素
|
||||||
try:
|
try:
|
||||||
@ -138,188 +138,188 @@ if __name__ == "__main__":
|
|||||||
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname")
|
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname")
|
||||||
logger.info("通过页面元素验证,登录成功")
|
logger.info("通过页面元素验证,登录成功")
|
||||||
need_manual_login = False
|
need_manual_login = False
|
||||||
break
|
|
||||||
except:
|
except:
|
||||||
logger.warning("Cookie 登录失败,尝试下一个 cookie 或手动登录")
|
logger.warning("Cookie 登录失败,尝试下一个 cookie 或手动登录")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"使用 cookie 登录时出错: {str(e)}")
|
logger.error(f"使用 cookie 登录时出错: {str(e)}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 如果自动登录失败,进行手动登录
|
# 如果自动登录失败,进行手动登录
|
||||||
if need_manual_login:
|
if need_manual_login:
|
||||||
logger.info("所有 cookie 均无效,启动手动登录流程")
|
logger.info("所有 cookie 均无效,启动手动登录流程")
|
||||||
try:
|
|
||||||
driver.delete_all_cookies()
|
|
||||||
driver.get('https://mp.weixin.qq.com/')
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
# 等待用户手动登录
|
|
||||||
logger.info("请在浏览器中手动完成登录(扫描二维码)")
|
|
||||||
logger.info("登录成功后,程序将自动继续执行")
|
|
||||||
|
|
||||||
# 设置最长等待时间(例如 120 秒)
|
|
||||||
max_wait_time = 120
|
|
||||||
start_time = time.time()
|
|
||||||
logged_in = False
|
|
||||||
|
|
||||||
while time.time() - start_time < max_wait_time:
|
|
||||||
current_url = driver.current_url
|
|
||||||
if 'token=' in current_url:
|
|
||||||
logged_in = True
|
|
||||||
logger.info("手动登录成功!")
|
|
||||||
break
|
|
||||||
|
|
||||||
# 检查页面元素
|
|
||||||
try:
|
try:
|
||||||
driver.find_element(By.CSS_SELECTOR,
|
driver.delete_all_cookies()
|
||||||
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname")
|
driver.get('https://mp.weixin.qq.com/')
|
||||||
logged_in = True
|
|
||||||
logger.info("通过页面元素确认手动登录成功!")
|
|
||||||
break
|
|
||||||
except:
|
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
if not logged_in:
|
# 等待用户手动登录
|
||||||
logger.error(f"等待 {max_wait_time} 秒后仍未登录成功,程序终止")
|
logger.info("请在浏览器中手动完成登录(扫描二维码)")
|
||||||
raise Exception("手动登录超时")
|
logger.info("登录成功后,程序将自动继续执行")
|
||||||
|
|
||||||
# 获取新的 cookie
|
# 设置最长等待时间(例如 120 秒)
|
||||||
|
max_wait_time = 120
|
||||||
|
start_time = time.time()
|
||||||
|
logged_in = False
|
||||||
|
|
||||||
|
while time.time() - start_time < max_wait_time:
|
||||||
|
current_url = driver.current_url
|
||||||
|
if 'token=' in current_url:
|
||||||
|
logged_in = True
|
||||||
|
logger.info("手动登录成功!")
|
||||||
|
break
|
||||||
|
|
||||||
|
# 检查页面元素
|
||||||
|
try:
|
||||||
|
driver.find_element(By.CSS_SELECTOR,
|
||||||
|
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname")
|
||||||
|
logged_in = True
|
||||||
|
logger.info("通过页面元素确认手动登录成功!")
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
if not logged_in:
|
||||||
|
logger.error(f"等待 {max_wait_time} 秒后仍未登录成功,程序终止")
|
||||||
|
raise Exception("手动登录超时")
|
||||||
|
|
||||||
|
# 获取新的 cookie
|
||||||
|
raw_cookies = driver.get_cookies()
|
||||||
|
new_cookie_dict = {}
|
||||||
|
for c in raw_cookies:
|
||||||
|
new_cookie_dict[c['name']] = c['value']
|
||||||
|
|
||||||
|
# 将字典转换为字符串格式
|
||||||
|
new_cookie_string = "; ".join([f"{k}={v}" for k, v in new_cookie_dict.items()])
|
||||||
|
|
||||||
|
# 更新 Redis 中的 cookie
|
||||||
|
logger.info("更新 Redis 中的 cookie")
|
||||||
|
|
||||||
|
# 删除旧的 cookie
|
||||||
|
redis_client.delete("MediaSpiders:WeChatLinksFetcher_Cookies")
|
||||||
|
|
||||||
|
# 添加新的 cookie
|
||||||
|
redis_client.lpush("MediaSpiders:WeChatLinksFetcher_Cookies", new_cookie_string)
|
||||||
|
|
||||||
|
current_cookie = new_cookie_string
|
||||||
|
logger.info("Redis cookie 更新成功")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"手动登录过程出错: {str(e)}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
count_per_account = 200
|
||||||
|
total_count = 0
|
||||||
|
break_flag = False
|
||||||
|
|
||||||
|
token_index = driver.current_url.rfind('token=')
|
||||||
|
token = driver.current_url[token_index + 6:]
|
||||||
|
print(f'获取 token 成功!当前 token 为 {token}')
|
||||||
raw_cookies = driver.get_cookies()
|
raw_cookies = driver.get_cookies()
|
||||||
new_cookie_dict = {}
|
cookies = {}
|
||||||
for c in raw_cookies:
|
for c in raw_cookies:
|
||||||
new_cookie_dict[c['name']] = c['value']
|
cookies[c['name']] = c['value']
|
||||||
|
print(f'获取 cookie 成功!')
|
||||||
# 将字典转换为字符串格式
|
headers = {
|
||||||
new_cookie_string = "; ".join([f"{k}={v}" for k, v in new_cookie_dict.items()])
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||||
|
'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
|
||||||
# 更新 Redis 中的 cookie
|
'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
|
||||||
logger.info("更新 Redis 中的 cookie")
|
f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
|
||||||
|
}
|
||||||
# 删除旧的 cookie
|
query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
|
||||||
redis_client.delete("MediaSpiders:WeChatLinksFetcher_Cookies")
|
post_body = {
|
||||||
|
'userType': WECHAT_USER_TYPE,
|
||||||
# 添加新的 cookie
|
'userFlag': 0
|
||||||
redis_client.lpush("MediaSpiders:WeChatLinksFetcher_Cookies", new_cookie_string)
|
}
|
||||||
|
account_rsp = json.loads(
|
||||||
current_cookie = new_cookie_string
|
http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
|
||||||
logger.info("Redis cookie 更新成功")
|
official_accounts = []
|
||||||
|
if account_rsp['code'] == 200:
|
||||||
except Exception as e:
|
official_accounts = account_rsp['content']
|
||||||
logger.error(f"手动登录过程出错: {str(e)}")
|
for account_line in official_accounts:
|
||||||
raise
|
try:
|
||||||
|
if break_flag:
|
||||||
count_per_account = 200
|
break
|
||||||
total_count = 0
|
start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000)
|
||||||
break_flag = False
|
if 'updateTime' in account_line:
|
||||||
|
start_timestamp = account_line['updateTime']
|
||||||
token_index = driver.current_url.rfind('token=')
|
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000))
|
||||||
token = driver.current_url[token_index + 6:]
|
account = account_line['userName']
|
||||||
print(f'获取 token 成功!当前 token 为 {token}')
|
search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \
|
||||||
raw_cookies = driver.get_cookies()
|
f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1'
|
||||||
cookies = {}
|
print(f"开始搜索公众号“{account}”...")
|
||||||
for c in raw_cookies:
|
time.sleep(3 + random.random())
|
||||||
cookies[c['name']] = c['value']
|
response = requests.get(search_account_api, cookies=cookies, headers=headers)
|
||||||
print(f'获取 cookie 成功!')
|
rsp_body = json.loads(response.text)
|
||||||
headers = {
|
index_end = ceil(count_per_account / 5)
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
if 'list' in rsp_body:
|
||||||
'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
|
matched_account = {}
|
||||||
'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
|
matched_account_flag = False
|
||||||
f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
|
for item in rsp_body['list']:
|
||||||
}
|
if item['nickname'] == account:
|
||||||
query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
|
matched_account_flag = True
|
||||||
post_body = {
|
matched_account = item
|
||||||
'userType': WECHAT_USER_TYPE,
|
break
|
||||||
'userFlag': 0
|
if not matched_account_flag:
|
||||||
}
|
print(f"未找到公众号“{account}”")
|
||||||
account_rsp = json.loads(
|
continue
|
||||||
http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
|
fake_id = matched_account['fakeid']
|
||||||
official_accounts = []
|
update_time_flag = True # 用于记录获取到的历史列表是否已经超出最早的时间限制
|
||||||
if account_rsp['code'] == 200:
|
next_start_timestamp = int(time.time() * 1000)
|
||||||
official_accounts = account_rsp['content']
|
for index in range(index_end):
|
||||||
for account_line in official_accounts:
|
if update_time_flag:
|
||||||
try:
|
if next_start_timestamp - start_timestamp < 12 * 3600 * 1000:
|
||||||
if break_flag:
|
print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接,本次获取结束")
|
||||||
break
|
break_flag = True
|
||||||
start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000)
|
|
||||||
if 'updateTime' in account_line:
|
|
||||||
start_timestamp = account_line['updateTime']
|
|
||||||
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000))
|
|
||||||
account = account_line['userName']
|
|
||||||
search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \
|
|
||||||
f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1'
|
|
||||||
print(f"开始搜索公众号“{account}”...")
|
|
||||||
time.sleep(3 + random.random())
|
|
||||||
response = requests.get(search_account_api, cookies=cookies, headers=headers)
|
|
||||||
rsp_body = json.loads(response.text)
|
|
||||||
index_end = ceil(count_per_account / 5)
|
|
||||||
if 'list' in rsp_body:
|
|
||||||
matched_account = {}
|
|
||||||
matched_account_flag = False
|
|
||||||
for item in rsp_body['list']:
|
|
||||||
if item['nickname'] == account:
|
|
||||||
matched_account_flag = True
|
|
||||||
matched_account = item
|
|
||||||
break
|
|
||||||
if not matched_account_flag:
|
|
||||||
print(f"未找到公众号“{account}”")
|
|
||||||
continue
|
|
||||||
fake_id = matched_account['fakeid']
|
|
||||||
update_time_flag = True # 用于记录获取到的历史列表是否已经超出最早的时间限制
|
|
||||||
next_start_timestamp = int(time.time() * 1000)
|
|
||||||
for index in range(index_end):
|
|
||||||
if update_time_flag:
|
|
||||||
if next_start_timestamp - start_timestamp < 12 * 3600 * 1000:
|
|
||||||
print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接,本次获取结束")
|
|
||||||
break_flag = True
|
|
||||||
else:
|
|
||||||
fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
|
|
||||||
f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
|
|
||||||
f'&lang=zh_CN&f=json&ajax=1'
|
|
||||||
print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
|
|
||||||
time.sleep(3 + random.random())
|
|
||||||
article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
|
|
||||||
article_rsp_body = json.loads(article_response.text)
|
|
||||||
if 'app_msg_list' in article_rsp_body:
|
|
||||||
for article in article_rsp_body['app_msg_list']:
|
|
||||||
title = article['title']
|
|
||||||
link = article['link']
|
|
||||||
update_time = article['update_time'] * 1000
|
|
||||||
if update_time > start_timestamp:
|
|
||||||
total_count += 1
|
|
||||||
time_str = time.strftime("%Y-%m-%d %H:%M:%S",
|
|
||||||
time.localtime(update_time / 1000))
|
|
||||||
print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
|
|
||||||
f"发表的文章《{title}》,链接地址:{link}")
|
|
||||||
redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}", link)
|
|
||||||
else:
|
|
||||||
update_time_flag = False
|
|
||||||
break
|
|
||||||
else:
|
else:
|
||||||
print(json.dumps(article_rsp_body, ensure_ascii=False))
|
fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
|
||||||
if 'base_resp' in article_rsp_body:
|
f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
|
||||||
err_msg = article_rsp_body['base_resp']['err_msg']
|
f'&lang=zh_CN&f=json&ajax=1'
|
||||||
if err_msg == "freq control" or err_msg == "invalid session":
|
print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
|
||||||
print("接口频率限制,稍后再试,本次获取结束")
|
time.sleep(3 + random.random())
|
||||||
break_flag = True
|
article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
|
||||||
break
|
article_rsp_body = json.loads(article_response.text)
|
||||||
|
if 'app_msg_list' in article_rsp_body:
|
||||||
|
for article in article_rsp_body['app_msg_list']:
|
||||||
|
title = article['title']
|
||||||
|
link = article['link']
|
||||||
|
update_time = article['update_time'] * 1000
|
||||||
|
if update_time > start_timestamp:
|
||||||
|
total_count += 1
|
||||||
|
time_str = time.strftime("%Y-%m-%d %H:%M:%S",
|
||||||
|
time.localtime(update_time / 1000))
|
||||||
|
print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
|
||||||
|
f"发表的文章《{title}》,链接地址:{link}")
|
||||||
|
redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}", link)
|
||||||
|
else:
|
||||||
|
update_time_flag = False
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print(json.dumps(article_rsp_body, ensure_ascii=False))
|
||||||
|
if 'base_resp' in article_rsp_body:
|
||||||
|
err_msg = article_rsp_body['base_resp']['err_msg']
|
||||||
|
if err_msg == "freq control" or err_msg == "invalid session":
|
||||||
|
print("接口频率限制,稍后再试,本次获取结束")
|
||||||
|
break_flag = True
|
||||||
|
break
|
||||||
|
|
||||||
if not break_flag:
|
if not break_flag:
|
||||||
# 本循环内,只有12小时内扫过码以及接口频率限制退出,会导致 break_flag 为 True,这两种情况都不需要更新扫码状态
|
# 本循环内,只有12小时内扫过码以及接口频率限制退出,会导致 break_flag 为 True,这两种情况都不需要更新扫码状态
|
||||||
next_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(next_start_timestamp / 1000))
|
next_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(next_start_timestamp / 1000))
|
||||||
account_line['updateTime'] = next_start_timestamp
|
account_line['updateTime'] = next_start_timestamp
|
||||||
http_post(SOCIAL_USER_UPDATE_API,
|
http_post(SOCIAL_USER_UPDATE_API,
|
||||||
data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
|
data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
|
||||||
headers={'User-Agent': UA, "Content-Type": "application/json"}
|
headers={'User-Agent': UA, "Content-Type": "application/json"}
|
||||||
)
|
)
|
||||||
print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
|
print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
|
||||||
else:
|
else:
|
||||||
print(json.dumps(rsp_body, ensure_ascii=False))
|
print(json.dumps(rsp_body, ensure_ascii=False))
|
||||||
if 'base_resp' in rsp_body:
|
if 'base_resp' in rsp_body:
|
||||||
if rsp_body['base_resp']['err_msg'] == "freq control":
|
if rsp_body['base_resp']['err_msg'] == "freq control":
|
||||||
print("接口频率限制,稍后再试,本次获取结束")
|
print("接口频率限制,稍后再试,本次获取结束")
|
||||||
break_flag = True
|
break_flag = True
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(repr(e))
|
print(repr(e))
|
||||||
redis_client.close()
|
redis_client.close()
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user