同步提交

This commit is contained in:
DELL 2026-04-16 15:56:52 +08:00
parent 36fa73bd12
commit 0e89a4f642
9 changed files with 506 additions and 12 deletions

View File

@ -77,6 +77,7 @@
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.springframework.cloud</groupId> <groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-stream-test-support</artifactId> <artifactId>spring-cloud-stream-test-support</artifactId>

View File

@ -477,4 +477,114 @@ public class DatabaseConnector {
e.printStackTrace(); e.printStackTrace();
} }
} }
/**
 * News export — write hot-search records to an .xlsx file.
 *
 * Reads every EsDataHotSearchView loaded after {@code startTime} and writes
 * them to {hotSearchExcelOutputPath}/data_hot_search-yyyyMMdd-001.xlsx.
 * Column headers are derived from the view class's declared fields via
 * {@code formField}, so cell order below must mirror the field order.
 *
 * @param startTime lower bound on esLoadtime for the exported rows
 */
public void exportToXlsxTest(String startTime) {
    try {
        Path dirPath = Paths.get(hotSearchExcelOutputPath);
        if (!Files.exists(dirPath)) {
            Files.createDirectories(dirPath);
        }
        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        String fileName = "data_hot_search-" + timestamp + "-001.xlsx";
        Path filePath = dirPath.resolve(fileName);
        List<EsDataHotSearchView> esDataHotSearchViewList = esDataHotSearchRepository.findAllByEsLoadtimeAfter(startTime);
        if (esDataHotSearchViewList.isEmpty()) {
            logger.info("获取数据为空excel未导出");
            return;
        }
        // Reflect once over the view class; the declared fields drive the header row and column count.
        Field[] fields = esDataHotSearchViewList.get(0).getClass().getDeclaredFields();
        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet("data");
            // Header row with a light-blue fill.
            Row headerRow = sheet.createRow(0);
            CellStyle headerStyle = workbook.createCellStyle();
            headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
            headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
            for (int i = 0; i < fields.length; i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(formField(fields[i]));
                cell.setCellStyle(headerStyle);
            }
            // One data row per view object; cell index order matches the header fields.
            int rowNum = 1;
            for (EsDataHotSearchView item : esDataHotSearchViewList) {
                Row row = sheet.createRow(rowNum++);
                logger.debug("导出excel第" + rowNum);
                row.createCell(0).setCellValue(nullToEmpty(item.getEsSid()));
                row.createCell(1).setCellValue(nullToEmpty(item.getEsUrltime()));
                row.createCell(2).setCellValue(nullToEmpty(item.getEsCarriertype()));
                row.createCell(3).setCellValue(nullToEmpty(item.getEsSitename()));
                // esSimrank is stored as a float string; export its integer part.
                row.createCell(4).setCellValue(item.getEsSimrank() != null
                        ? String.valueOf(Float.valueOf(item.getEsSimrank()).intValue()) : "");
                // Long text columns are truncated to stay well under Excel's 32767-char cell limit.
                row.createCell(5).setCellValue(truncate(nullToEmpty(item.getEsUrltitle()), 10000));
                row.createCell(6).setCellValue(truncate(nullToEmpty(item.getEsUrlcontent()), 10000));
                row.createCell(7).setCellValue(nullToEmpty(item.getEsUrlname()));
                row.createCell(8).setCellValue(nullToEmpty(item.getEsHkey()));
                // esLasttime / esLoadtime apparently hold JSON arrays — extractFilenamesFromJsonArray
                // flattens them to plain text (TODO confirm against that helper's contract).
                row.createCell(9).setCellValue(extractFilenamesFromJsonArray(item.getEsLasttime()));
                row.createCell(10).setCellValue(nullToEmpty(item.getEsHeat()));
                row.createCell(11).setCellValue(extractFilenamesFromJsonArray(item.getEsLoadtime()));
            }
            logger.info("完成excel数据写入" + rowNum);
            // Auto-size every column to its content.
            for (int i = 0; i < fields.length; i++) {
                sheet.autoSizeColumn(i);
            }
            // Fix: the original also serialized the workbook into a ByteArrayOutputStream
            // that was never read — write the workbook once, straight to the target file.
            try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
                workbook.write(fos);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        logger.info("excel导出完成");
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/** Null-safe string: returns "" when {@code value} is null. */
private static String nullToEmpty(String value) {
    return value != null ? value : "";
}

/** Truncate {@code value} to at most {@code maxLen} characters. */
private static String truncate(String value, int maxLen) {
    return value.length() > maxLen ? value.substring(0, maxLen) : value;
}
} }

View File

@ -3,6 +3,7 @@
BOT_NAME = 'MediaSpiders' BOT_NAME = 'MediaSpiders'
LOG_LEVEL = 'INFO' LOG_LEVEL = 'INFO'
# LOG_LEVEL = 'DEBUG'
SPIDER_MODULES = ['MediaSpiders.spiders'] SPIDER_MODULES = ['MediaSpiders.spiders']
NEWSPIDER_MODULE = 'MediaSpiders.spiders' NEWSPIDER_MODULE = 'MediaSpiders.spiders'
@ -110,8 +111,8 @@ CUSTOM_USER_AGENT = [
# 部署在外网采集fb时使用selenium_chrome # 部署在外网采集fb时使用selenium_chrome
SELENIUM_DRIVER_NAME = 'chrome' SELENIUM_DRIVER_NAME = 'chrome'
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' # SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
SELENIUM_DRIVER_ARGUMENTS = [ SELENIUM_DRIVER_ARGUMENTS = [
'--headless', '--headless',
'--no-sandbox', '--no-sandbox',

View File

@ -76,14 +76,15 @@ class WechatLinksFetcherSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
driver = response.request.meta['driver'] driver = response.request.meta['driver']
cookie_list = self.redis_client.lrange("MediaSpiders:WeChatLinksFetcher_Cookies", 0, -1) cookies_key = "MediaSpiders:WeChatLinksFetcher_Cookies"
cookie_list = self.redis_client.lrange(cookies_key, 0, -1)
cookie_parts = [ cookie_parts = [
item.decode('utf-8') if isinstance(item, bytes) else str(item) item.decode('utf-8') if isinstance(item, bytes) else str(item)
for item in cookie_list for item in cookie_list
] ]
# 尝试使用 Redis 中的 cookie 登录 # 遍历cookies记录当前索引
for item in cookie_parts: for cookie_index, item in enumerate(cookie_parts):
try: try:
driver.delete_all_cookies() driver.delete_all_cookies()
driver.get('https://mp.weixin.qq.com/') driver.get('https://mp.weixin.qq.com/')
@ -99,7 +100,7 @@ class WechatLinksFetcherSpider(scrapy.Spider):
else: else:
logger.warning(f"跳过 cookie: {name}") logger.warning(f"跳过 cookie: {name}")
logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie") logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie (索引: {cookie_index})")
# 验证 cookie 是否有效 # 验证 cookie 是否有效
driver.refresh() driver.refresh()
@ -204,6 +205,9 @@ class WechatLinksFetcherSpider(scrapy.Spider):
if err_msg == "freq control" or err_msg == "invalid session": if err_msg == "freq control" or err_msg == "invalid session":
logger.info("接口频率限制,稍后再试,本次获取结束") logger.info("接口频率限制,稍后再试,本次获取结束")
break_flag = True break_flag = True
# 删除当前使用的cookie
self._remove_invalid_cookie(cookies_key, cookie_index)
break break
if not break_flag: if not break_flag:
@ -222,12 +226,32 @@ class WechatLinksFetcherSpider(scrapy.Spider):
if rsp_body['base_resp']['err_msg'] == "freq control": if rsp_body['base_resp']['err_msg'] == "freq control":
logger.info("接口频率限制,稍后再试,本次获取结束") logger.info("接口频率限制,稍后再试,本次获取结束")
break_flag = True break_flag = True
# 删除当前使用的cookie
self._remove_invalid_cookie(cookies_key, cookie_index)
break break
except Exception as e: except Exception as e:
logger.info(repr(e)) logger.info(repr(e))
self.redis_client.close() self.redis_client.close()
driver.quit() driver.quit()
def _remove_invalid_cookie(self, cookies_key, cookie_index):
    """Remove the cookie stored at cookie_index from the Redis list cookies_key.

    Redis lists cannot delete by index directly, so the slot is first
    overwritten with a sentinel value and the sentinel is then removed
    with LREM (count=1 removes only that one occurrence).
    """
    try:
        # Approach 1: mark the slot with a sentinel, then delete the sentinel.
        self.redis_client.lset(cookies_key, cookie_index, "__invalid__")
        self.redis_client.lrem(cookies_key, 1, "__invalid__")
        logger.info(f"已删除无效的cookie索引: {cookie_index}")
        # Approach 2 (disabled): drop the whole list once the last cookie goes bad.
        # cookie_count = self.redis_client.llen(cookies_key)
        # if cookie_count <= 1:
        #     self.redis_client.delete(cookies_key)
        #     logger.info(f"已删除所有cookies: {cookies_key}")
    except Exception as e:
        logger.error(f"删除cookie失败: {e}")
def parse_cookie_string(cookie_str): def parse_cookie_string(cookie_str):
"""解析 cookie 字符串为 dict""" """解析 cookie 字符串为 dict"""
@ -280,3 +304,6 @@ def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
# logger.warning(f"✗ {name} failed: {e}") # logger.warning(f"✗ {name} failed: {e}")
return False return False
return False # 所有 domain 都失败 return False # 所有 domain 都失败

View File

@ -2,3 +2,5 @@
# #
# Please refer to the documentation for information on how to create and manage # Please refer to the documentation for information on how to create and manage
# your spiders. # your spiders.

View File

@ -0,0 +1,353 @@
import time
import logging as logger
from selenium.webdriver.common.by import By
import redis
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD
# Standalone local Chrome session used for the manual QR-code login below.
chrome_options = Options()
# Full path to the local chrome.exe binary (Windows layout — adjust per machine).
chrome_options.binary_location = r"D:\chrome-win64\chrome.exe"
# chrome_options.use_chromium = True
driver = webdriver.Chrome(
    executable_path=r"D:\chromedriver-win64\chromedriver.exe",
    options=chrome_options
)
# Hide navigator.webdriver before any page script runs, so the site is less
# likely to flag the browser as automated.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})
# Shared Redis connection; cookies are stored as strings in one Redis list.
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD, decode_responses=True)
# Redis list key holding saved WeChat login cookie strings.
COOKIE_KEY = "MediaSpiders:WeChatLinksFetcher_Cookies"
def parse_cookie_string(cookie_str):
    """Parse a raw 'k=v; k2=v2' cookie string into a dict (keys/values stripped)."""
    pairs = (chunk.split('=', 1) for chunk in cookie_str.split(';') if '=' in chunk)
    return {key.strip(): val.strip() for key, val in pairs}
def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
    """
    Add one cookie to the browser, trying candidate domains in order.

    WeChat's core session cookies only stick on the exact mp.weixin.qq.com
    host; generic Tencent cookies may live on a parent domain; anything else
    is tried host-only first, then on the target domain.

    Returns:
        bool: True as soon as one attempt succeeds; False when every
        candidate domain is rejected, or on any non-domain error.
    """
    # Core WeChat session cookies: must be set on the exact target host.
    critical_names = ('wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token')
    # Generic Tencent cookies: parent domains are acceptable fallbacks.
    shared_names = ('ptui_loginuin', 'RK', 'ptcz', 'ua_id')

    if name in critical_names:
        candidates = [target_domain]
    elif name in shared_names:
        candidates = [target_domain, '.weixin.qq.com', '.qq.com']
    else:
        # None means host-only (no explicit domain attribute).
        candidates = [None, target_domain]

    for candidate in candidates:
        payload = {'name': name, 'value': value, 'path': '/', 'secure': True}
        if candidate:
            payload['domain'] = candidate
        try:
            driver.add_cookie(payload)
            return True
        except Exception as exc:
            if 'invalid cookie domain' not in str(exc):
                return False  # non-domain failure: give up on this cookie
            # domain rejected — fall through to the next candidate
    return False
def is_cookie_exists(cookie_str):
    """
    Check whether an equivalent cookie is already stored in Redis.

    Two cookie strings are considered the same session when at least two of
    the WeChat identity fields (wxuin, slave_sid, slave_user, bizuin) match,
    or when slave_sid — the most unique marker — matches on its own.

    Returns:
        tuple: (True, index_in_list) when a duplicate exists, else (False, -1).
    """
    identity_fields = ('wxuin', 'slave_sid', 'slave_user', 'bizuin')
    try:
        candidate = parse_cookie_string(cookie_str)
        for idx, stored in enumerate(redis_client.lrange(COOKIE_KEY, 0, -1)):
            try:
                stored_dict = parse_cookie_string(stored)
                shared = sum(
                    1 for field in identity_fields
                    if field in candidate
                    and field in stored_dict
                    and candidate[field] == stored_dict[field]
                )
                # Two or more matching identity fields => same cookie.
                if shared >= 2:
                    return True, idx
                # slave_sid alone is unique enough to identify a session.
                if ('slave_sid' in candidate and 'slave_sid' in stored_dict
                        and candidate['slave_sid'] == stored_dict['slave_sid']):
                    return True, idx
            except Exception as e:
                logger.warning(f"解析现有cookie时出错: {e}")
        return False, -1
    except Exception as e:
        logger.error(f"判断cookie是否存在时出错: {e}")
        return False, -1
def save_cookie_to_redis(cookie_str, force_save=False):
    """
    Persist a cookie string into the Redis list, de-duplicating first.

    Args:
        cookie_str: raw cookie string to store.
        force_save: when True, overwrite the stored duplicate in place
            instead of skipping it.

    Returns:
        bool: True when the cookie was written (appended or overwritten),
        False when skipped as a duplicate or on a Redis error.
    """
    try:
        exists, idx = is_cookie_exists(cookie_str)
        if exists:
            if not force_save:
                logger.info(f"Cookie已存在 (索引: {idx}),跳过保存")
                return False
            # Replace the old entry at the same list position.
            redis_client.lset(COOKIE_KEY, idx, cookie_str)
            logger.info(f"已更新现有cookie (索引: {idx})")
            return True
        # Brand-new cookie: append to the end of the list.
        redis_client.rpush(COOKIE_KEY, cookie_str)
        logger.info(f"已添加新cookie当前总数: {redis_client.llen(COOKIE_KEY)}")
        return True
    except Exception as e:
        logger.error(f"保存cookie到Redis失败: {e}")
        return False
def cookie_dict_to_string(cookie_dict):
    """Serialize a cookie dict back into a header-style 'k=v; k2=v2' string."""
    parts = []
    for key, val in cookie_dict.items():
        parts.append(f"{key}={val}")
    return '; '.join(parts)
def manual_login_and_get_cookie():
    """
    Drive a manual QR-code login on mp.weixin.qq.com and harvest the session.

    Opens the platform page in the module-level driver, waits up to 120s for
    the operator to scan the QR code (login is detected by 'token=' appearing
    in the URL), then collects the browser cookies.

    Returns:
        dict with keys 'cookie_dict', 'cookie_string', 'token', 'raw_cookies',
        or None on timeout / failure.
    """
    logger.info("开始手动扫码登录流程...")
    try:
        # Open the WeChat Official Accounts platform login page.
        driver.get("https://mp.weixin.qq.com/")
        time.sleep(3)
        # Wait until the page body exists.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Already logged in when the URL carries a token parameter.
        if "token=" in driver.current_url:
            logger.info("检测到已登录状态直接获取cookie")
        else:
            logger.info("请手动扫描二维码登录...")
            logger.info("等待登录完成...")
            try:
                # Block until the URL gains a token (login success), max 120s.
                WebDriverWait(driver, 120).until(
                    lambda d: "token=" in d.current_url
                )
                logger.info("检测到登录成功!")
                time.sleep(3)
            except Exception as e:
                logger.error("等待登录超时")
                return None
        # Collect every cookie the browser now holds.
        cookies = driver.get_cookies()
        if not cookies:
            logger.error("未获取到cookies")
            return None
        # Flatten the selenium cookie records into {name: value}.
        cookie_dict = {}
        for cookie in cookies:
            cookie_dict[cookie['name']] = cookie['value']
        cookie_string = cookie_dict_to_string(cookie_dict)
        # Pull the token out of the URL: everything after the last 'token='.
        token = None
        if "token=" in driver.current_url:
            token_index = driver.current_url.rfind('token=')
            token = driver.current_url[token_index + 6:]
            logger.info(f"获取到token: {token}")
        logger.info(f"获取到 {len(cookie_dict)} 个cookie")
        return {
            'cookie_dict': cookie_dict,
            'cookie_string': cookie_string,
            'token': token,
            'raw_cookies': cookies
        }
    except Exception as e:
        logger.error(f"手动登录失败: {e}")
        return None
def verify_cookie_valid(cookie_dict, token=None):
    """
    Probe whether a cookie dict still represents a live WeChat MP session.

    Calls the bizlogin validate endpoint with the cookies attached and
    inspects base_resp.err_msg in the JSON response.

    Args:
        cookie_dict: {name: value} cookies to send with the probe request.
        token: session token (currently unused by the probe).

    Returns:
        bool: True when the endpoint accepts the session, or when it returns
        a non-JSON 200 body (treated leniently as "probably valid");
        False otherwise.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://mp.weixin.qq.com/',
        }
        # Lightweight endpoint that requires a valid login session.
        test_api = 'https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN'
        response = requests.get(test_api, cookies=cookie_dict, headers=headers, timeout=10)
        if response.status_code != 200:
            logger.warning(f"cookie验证请求失败: {response.status_code}")
            return False
        try:
            data = response.json()
        except ValueError:
            # Fix: was a bare `except:` that swallowed everything (even
            # KeyboardInterrupt); only a JSON decode failure belongs here.
            logger.info("cookie可能有效")
            return True
        if 'base_resp' in data:
            err_msg = data['base_resp'].get('err_msg', '')
            if err_msg in ['ok', '']:
                logger.info("cookie验证有效")
                return True
            logger.warning(f"cookie验证返回错误: {err_msg}")
            return False
        # JSON without base_resp: cannot confirm validity. The original fell
        # through and implicitly returned None; return an explicit False so
        # the documented bool contract holds (still falsy for callers).
        return False
    except Exception as e:
        logger.error(f"验证cookie时出错: {e}")
        return False
def main(type):
    """
    End-to-end flow: manual login -> validate cookie -> de-dupe -> save to Redis.

    Args:
        type: overwrite choice for duplicate cookies — 'y' overwrites the
            stored entry, anything else skips saving. (NOTE(review): shadows
            the builtin `type`; kept unchanged for caller compatibility.)
    """
    logger.info("微信公众号Cookie获取工具")
    try:
        # 1. Manual QR-code login to obtain the session cookies.
        result = manual_login_and_get_cookie()
        if not result:
            logger.error("获取cookie失败")
            return
        cookie_string = result['cookie_string']
        cookie_dict = result['cookie_dict']
        token = result['token']
        # 2. Validate the cookie (it is saved even when validation fails).
        logger.info("正在验证cookie有效性...")
        is_valid = verify_cookie_valid(cookie_dict, token)
        if not is_valid:
            logger.warning("cookie可能无效但仍会保存")
        # 3. De-duplicate against Redis, then save.
        logger.info("正在检查cookie是否已存在...")
        exists, idx = is_cookie_exists(cookie_string)
        if exists:
            logger.info(f"Cookie已存在 (索引: {idx})")
            # Overwrite the stored duplicate only when the caller asked for it.
            choice = type
            if choice == 'y':
                saved = save_cookie_to_redis(cookie_string, force_save=True)
                if saved:
                    logger.info("已覆盖更新cookie")
            else:
                logger.info("取消保存")
        else:
            # Brand-new cookie: append it to the list.
            saved = save_cookie_to_redis(cookie_string)
            if saved:
                logger.info("新cookie保存成功")
        # 4. Report how many cookies are stored after this run.
        total_cookies = redis_client.llen(COOKIE_KEY)
        logger.info(f"当前Redis中cookie总数: {total_cookies}")
    except KeyboardInterrupt:
        logger.info("用户中断程序")
    except Exception as e:
        logger.error(f"程序执行出错: {e}")
    finally:
        # Always release the browser, even on error or Ctrl-C.
        driver.quit()
        logger.info("程序结束")
if __name__ == "__main__":
    # Overwrite an existing duplicate cookie? 'y' = overwrite, 'n' = keep old.
    type = 'y'
    # Run the interactive login-and-save flow.
    main(type)

View File

@ -20,11 +20,11 @@ def http_get(url):
return rsp return rsp
def http_post(url, data, headers=None): def http_post(url, data, headers=None, timeout=60):
if headers: if headers:
rsp = requests.post(url, data=data, headers=headers) rsp = requests.post(url, data=data, headers=headers, timeout=timeout)
else: else:
rsp = requests.post(url, data=data, headers={'User-Agent': ua}) rsp = requests.post(url, data=data, headers={'User-Agent': ua}, timeout=timeout)
return rsp return rsp

View File

@ -26,8 +26,8 @@ def check_session(drive_path):
api = drive_path + '/graphql' api = drive_path + '/graphql'
post_body = '{"query": "{ grid { maxSession, sessionCount } }"}' post_body = '{"query": "{ grid { maxSession, sessionCount } }"}'
try: try:
# 添加超时控制,5分钟 = 300秒 # 添加超时控制,1分钟 = 60秒
response = http_post(api, post_body, timeout=300) response = http_post(api, post_body, timeout=60)
data_body = json.loads(response.content.decode()) data_body = json.loads(response.content.decode())
session_info = data_body['data']['grid'] session_info = data_body['data']['grid']
return session_info return session_info

View File

@ -9,4 +9,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(dirpath) sys.path.append(dirpath)
if __name__ == "__main__": if __name__ == "__main__":
execute(['scrapy', 'crawl', 'website_info_common', '-a', 'params={"job_id":"801","clusterName":"star_4"}']) execute(['scrapy', 'crawl', 'Website_report_list', '-a', 'params={"job_id":"801","clusterName":"star_4"}'])