From 0e89a4f642235f54a701243558d476b6c278bafa Mon Sep 17 00:00:00 2001 From: DELL Date: Thu, 16 Apr 2026 15:56:52 +0800 Subject: [PATCH] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dsp/pom.xml | 1 + .../com/jsc/dsp/utils/DatabaseConnector.java | 110 ++++++ spiders/MediaSpiders/MediaSpiders/settings.py | 5 +- .../spiders/WechatLinksFetcherSpider.py | 35 +- .../MediaSpiders/spiders/__init__.py | 2 + .../utils/wechat_links_fetcher_add_cookie.py | 353 ++++++++++++++++++ .../WebsiteSpider/utils/http_utils.py | 6 +- .../WebsiteSpider/utils/selenium_utils.py | 4 +- spiders/WebsiteSpider/run.py | 2 +- 9 files changed, 506 insertions(+), 12 deletions(-) create mode 100644 spiders/MediaSpiders/MediaSpiders/utils/wechat_links_fetcher_add_cookie.py diff --git a/dsp/pom.xml b/dsp/pom.xml index 52dc383..ad321d7 100644 --- a/dsp/pom.xml +++ b/dsp/pom.xml @@ -77,6 +77,7 @@ + org.springframework.cloud spring-cloud-stream-test-support diff --git a/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java b/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java index 8d86e81..844c5e7 100644 --- a/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java +++ b/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java @@ -477,4 +477,114 @@ public class DatabaseConnector { e.printStackTrace(); } } + + /** + * 新闻导出 + */ + public void exportToXlsxTest(String startTime) { + try { + Path dirPath = Paths.get(hotSearchExcelOutputPath); + if (!Files.exists(dirPath)) { + Files.createDirectories(dirPath); + } + String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd")); + String fileName = "data_hot_search-" + timestamp + "-001.xlsx"; + Path filePath = dirPath.resolve(fileName); + + List esDataHotSearchViewList = esDataHotSearchRepository.findAllByEsLoadtimeAfter(startTime); + if (!esDataHotSearchViewList.isEmpty()) { + Field[] fields = esDataHotSearchViewList.get(0).getClass().getDeclaredFields(); // 通过反射获取类的成员信息,并使用这些类成员为后续生成的excel表头做准备 + try (Workbook workbook = new XSSFWorkbook(); + ByteArrayOutputStream out = new ByteArrayOutputStream()) { + + Sheet sheet = workbook.createSheet("data"); + + // 创建表头 + Row headerRow = sheet.createRow(0); + CellStyle headerStyle = workbook.createCellStyle(); // 创建单元格 + headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex()); + headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND); + + for (int i = 0; i < fields.length; i++) { + Cell cell = headerRow.createCell(i); + String formField = formField(fields[i]); + cell.setCellValue(formField); + cell.setCellStyle(headerStyle); + } + // 填充数据 + int rowNum = 1; + for (EsDataHotSearchView item : esDataHotSearchViewList) { + Row row = sheet.createRow(rowNum++); + logger.debug("导出excel第" + rowNum + "行"); + // 0: esSid + row.createCell(0).setCellValue(item.getEsSid() != null ? item.getEsSid() : ""); + // 1: esUrltime + row.createCell(1).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : ""); + + // 2: esCarriertype + row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : ""); + + // 3: esSitename + row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : ""); + + // 4: esSimrank + row.createCell(4).setCellValue(item.getEsSimrank() != null ? String.valueOf(Float.valueOf(item.getEsSimrank()).intValue()) : ""); + + // 5: esUrltitle + String esUrltitle = item.getEsUrltitle(); + if (esUrltitle != null && esUrltitle.length() > 10000) { + row.createCell(5).setCellValue(esUrltitle.substring(0, 10000)); + } else { + row.createCell(5).setCellValue(esUrltitle != null ? esUrltitle : ""); + } + + // 6: esUrlcontent + String esUrlcontent = item.getEsUrlcontent(); + if (esUrlcontent != null && esUrlcontent.length() > 10000) { + row.createCell(6).setCellValue(esUrlcontent.substring(0, 10000)); + } else { + row.createCell(6).setCellValue(esUrlcontent != null ? esUrlcontent : ""); + } + + // 7: esUrlname + row.createCell(7).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : ""); + + // 8: esHkey + row.createCell(8).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : ""); + + // 9: esLasttime + String esLasttime = extractFilenamesFromJsonArray(item.getEsLasttime()); + row.createCell(9).setCellValue(esLasttime); + + + // 10: esHeat + row.createCell(10).setCellValue(item.getEsHeat() != null ? item.getEsHeat() : ""); + + // 11: esLasttime + String esLoadtime = extractFilenamesFromJsonArray(item.getEsLoadtime()); + row.createCell(11).setCellValue(esLoadtime); + } + logger.info("完成excel数据写入,共" + rowNum + "行"); + + // 自动调整列宽 + for (int i = 0; i < fields.length; i++) { + sheet.autoSizeColumn(i); + } + + workbook.write(out); + + try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) { + workbook.write(fos); + } + } catch (IOException e) { + e.printStackTrace(); + } + + logger.info("excel导出完成!"); + } else logger.info("获取数据为空,excel未导出"); + + } catch (Exception e) { + e.printStackTrace(); + } + } } \ No newline at end of file diff --git a/spiders/MediaSpiders/MediaSpiders/settings.py b/spiders/MediaSpiders/MediaSpiders/settings.py index 7215e25..666aa33 100644 --- a/spiders/MediaSpiders/MediaSpiders/settings.py +++ b/spiders/MediaSpiders/MediaSpiders/settings.py @@ -3,6 +3,7 @@ BOT_NAME = 'MediaSpiders' LOG_LEVEL = 'INFO' +# LOG_LEVEL = 'DEBUG' SPIDER_MODULES = ['MediaSpiders.spiders'] NEWSPIDER_MODULE = 'MediaSpiders.spiders' @@ -110,8 +111,8 @@ CUSTOM_USER_AGENT = [ # 部署在外网采集fb时使用selenium_chrome SELENIUM_DRIVER_NAME = 'chrome' -# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' -SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' +SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' +# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' SELENIUM_DRIVER_ARGUMENTS = [ '--headless', '--no-sandbox', diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/WechatLinksFetcherSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/WechatLinksFetcherSpider.py index b6847d8..d89ca17 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/WechatLinksFetcherSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/WechatLinksFetcherSpider.py @@ -76,14 +76,15 @@ class WechatLinksFetcherSpider(scrapy.Spider): def parse(self, response): driver = response.request.meta['driver'] - cookie_list = self.redis_client.lrange("MediaSpiders:WeChatLinksFetcher_Cookies", 0, -1) + cookies_key = "MediaSpiders:WeChatLinksFetcher_Cookies" + cookie_list = self.redis_client.lrange(cookies_key, 0, -1) cookie_parts = [ item.decode('utf-8') if isinstance(item, bytes) else str(item) for item in cookie_list ] - # 尝试使用 Redis 中的 cookie 登录 - for item in cookie_parts: + # 遍历cookies,记录当前索引 + for cookie_index, item in enumerate(cookie_parts): try: driver.delete_all_cookies() driver.get('https://mp.weixin.qq.com/') @@ -99,7 +100,7 @@ class WechatLinksFetcherSpider(scrapy.Spider): else: logger.warning(f"跳过 cookie: {name}") - logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie") + logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie (索引: {cookie_index})") # 验证 cookie 是否有效 driver.refresh() @@ -204,6 +205,9 @@ class WechatLinksFetcherSpider(scrapy.Spider): if err_msg == "freq control" or err_msg == "invalid session": logger.info("接口频率限制,稍后再试,本次获取结束") break_flag = True + + # 删除当前使用的cookie + self._remove_invalid_cookie(cookies_key, cookie_index) break if not break_flag: @@ -222,12 +226,32 @@ class WechatLinksFetcherSpider(scrapy.Spider): if rsp_body['base_resp']['err_msg'] == "freq control": logger.info("接口频率限制,稍后再试,本次获取结束") break_flag = True + + # 删除当前使用的cookie + self._remove_invalid_cookie(cookies_key, cookie_index) break except Exception as e: logger.info(repr(e)) self.redis_client.close() driver.quit() + def _remove_invalid_cookie(self, cookies_key, cookie_index): + """删除无效的cookie""" + try: + # 方法1:标记并删除 + self.redis_client.lset(cookies_key, cookie_index, "__invalid__") + self.redis_client.lrem(cookies_key, 1, "__invalid__") + logger.info(f"已删除无效的cookie,索引: {cookie_index}") + + # 方法2:或者直接删除整个列表(如果cookie全部无效) + # cookie_count = self.redis_client.llen(cookies_key) + # if cookie_count <= 1: + # self.redis_client.delete(cookies_key) + # logger.info(f"已删除所有cookies: {cookies_key}") + + except Exception as e: + logger.error(f"删除cookie失败: {e}") + def parse_cookie_string(cookie_str): """解析 cookie 字符串为 dict""" @@ -280,3 +304,6 @@ def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'): # logger.warning(f"✗ {name} failed: {e}") return False return False # 所有 domain 都失败 + + + diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/__init__.py b/spiders/MediaSpiders/MediaSpiders/spiders/__init__.py index ebd689a..3980b75 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/__init__.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/__init__.py @@ -2,3 +2,5 @@ # # Please refer to the documentation for information on how to create and manage # your spiders. + + diff --git a/spiders/MediaSpiders/MediaSpiders/utils/wechat_links_fetcher_add_cookie.py b/spiders/MediaSpiders/MediaSpiders/utils/wechat_links_fetcher_add_cookie.py new file mode 100644 index 0000000..69106b9 --- /dev/null +++ b/spiders/MediaSpiders/MediaSpiders/utils/wechat_links_fetcher_add_cookie.py @@ -0,0 +1,353 @@ +import time +import logging as logger + +from selenium.webdriver.common.by import By +import redis +import requests +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD + +chrome_options = Options() +# 指定 chrome.exe 的完整路径 +chrome_options.binary_location = r"D:\chrome-win64\chrome.exe" +# chrome_options.use_chromium = True +driver = webdriver.Chrome( + executable_path=r"D:\chromedriver-win64\chromedriver.exe", + options=chrome_options +) +driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { + "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})" +}) + +# Redis连接 +redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD, decode_responses=True) +COOKIE_KEY = "MediaSpiders:WeChatLinksFetcher_Cookies" + + +def parse_cookie_string(cookie_str): + """解析 cookie 字符串为 dict""" + cookie_dict = {} + for item in cookie_str.split(';'): + if '=' in item: + name, value = item.split('=', 1) + cookie_dict[name.strip()] = value.strip() + return cookie_dict + + +def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'): + """ + 智能添加 cookie:先试目标域名,失败则试父域,再失败则跳过 + """ + # 微信核心 cookie 必须用 mp.weixin.qq.com + wechat_critical = ['wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token'] + + # 腾讯通用 cookie 可尝试父域 + tencent_common = ['ptui_loginuin', 'RK', 'ptcz', 'ua_id'] + + # 策略 1: 核心 cookie → 精确域名 + if name in wechat_critical: + domains_to_try = [target_domain] + # 策略 2: 腾讯通用 cookie → 先试目标域,再试父域 + elif name in tencent_common: + domains_to_try = [target_domain, '.weixin.qq.com', '.qq.com'] + # 策略 3: 其他 cookie → 默认 host-only(不传 domain) + else: + domains_to_try = [None, target_domain] + + for domain in domains_to_try: + cookie = { + 'name': name, + 'value': value, + 'path': '/', + 'secure': True + } + if domain: + cookie['domain'] = domain + + try: + driver.add_cookie(cookie) + # logger.debug(f"✓ {name} added with domain={domain or 'host-only'}") + return True + except Exception as e: + if 'invalid cookie domain' in str(e): + continue # 尝试下一个 domain + else: + # logger.warning(f"✗ {name} failed: {e}") + return False + return False # 所有 domain 都失败 + + +def is_cookie_exists(cookie_str): + """ + 判断cookie是否已存在Redis中 + 返回: (exists, duplicate_index) + """ + try: + # 解析新cookie + new_cookie_dict = parse_cookie_string(cookie_str) + + # 获取Redis中所有现有cookie + existing_cookies = redis_client.lrange(COOKIE_KEY, 0, -1) + + for idx, existing_cookie in enumerate(existing_cookies): + try: + existing_dict = parse_cookie_string(existing_cookie) + + # 检查关键字段是否匹配 + # 微信cookie的关键识别字段 + key_fields = ['wxuin', 'slave_sid', 'slave_user', 'bizuin'] + + matches = 0 + for field in key_fields: + if field in new_cookie_dict and field in existing_dict: + if new_cookie_dict[field] == existing_dict[field]: + matches += 1 + + # 如果匹配到2个以上关键字段,认为是同一个cookie + if matches >= 2: + return True, idx + + # 或者检查slave_sid(最独特的标识) + if 'slave_sid' in new_cookie_dict and 'slave_sid' in existing_dict: + if new_cookie_dict['slave_sid'] == existing_dict['slave_sid']: + return True, idx + + except Exception as e: + logger.warning(f"解析现有cookie时出错: {e}") + continue + + return False, -1 + + except Exception as e: + logger.error(f"判断cookie是否存在时出错: {e}") + return False, -1 + + +def save_cookie_to_redis(cookie_str, force_save=False): + """ + 保存cookie到Redis,自动去重 + + Args: + cookie_str: cookie字符串 + force_save: 是否强制保存(即使存在也保存) + + Returns: + bool: 是否保存成功 + """ + try: + # 检查是否已存在 + exists, idx = is_cookie_exists(cookie_str) + + if exists and not force_save: + logger.info(f"Cookie已存在 (索引: {idx}),跳过保存") + return False + + if exists and force_save: + # 删除旧的,保存新的 + redis_client.lset(COOKIE_KEY, idx, cookie_str) + logger.info(f"已更新现有cookie (索引: {idx})") + return True + else: + # 添加新cookie + redis_client.rpush(COOKIE_KEY, cookie_str) + logger.info(f"已添加新cookie,当前总数: {redis_client.llen(COOKIE_KEY)}") + return True + + except Exception as e: + logger.error(f"保存cookie到Redis失败: {e}") + return False + +def cookie_dict_to_string(cookie_dict): + """将cookie字典转换为字符串""" + return '; '.join([f"{k}={v}" for k, v in cookie_dict.items()]) + +def manual_login_and_get_cookie(): + """ + 手动扫码登录并获取cookie + """ + logger.info("开始手动扫码登录流程...") + + try: + # 访问微信公众平台 + driver.get("https://mp.weixin.qq.com/") + time.sleep(3) + + # 等待页面加载 + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.TAG_NAME, "body")) + ) + + # 检查是否已登录(通过URL是否包含token) + if "token=" in driver.current_url: + logger.info("检测到已登录状态,直接获取cookie") + else: + logger.info("请手动扫描二维码登录...") + logger.info("等待登录完成...") + + # 等待登录成功(等待URL变化或特定元素出现) + try: + # 等待URL中包含token + WebDriverWait(driver, 120).until( + lambda d: "token=" in d.current_url + ) + logger.info("检测到登录成功!") + time.sleep(3) + except Exception as e: + logger.error("等待登录超时") + return None + + # 获取当前页面所有cookies + cookies = driver.get_cookies() + + if not cookies: + logger.error("未获取到cookies") + return None + + # 转换为字符串格式 + cookie_dict = {} + for cookie in cookies: + cookie_dict[cookie['name']] = cookie['value'] + + cookie_string = cookie_dict_to_string(cookie_dict) + + # 获取token信息 + token = None + if "token=" in driver.current_url: + token_index = driver.current_url.rfind('token=') + token = driver.current_url[token_index + 6:] + logger.info(f"获取到token: {token}") + + logger.info(f"获取到 {len(cookie_dict)} 个cookie") + + return { + 'cookie_dict': cookie_dict, + 'cookie_string': cookie_string, + 'token': token, + 'raw_cookies': cookies + } + + except Exception as e: + logger.error(f"手动登录失败: {e}") + return None + + +def verify_cookie_valid(cookie_dict, token=None): + """ + 验证cookie是否有效 + + Args: + cookie_dict: cookie字典 + token: token字符串 + + Returns: + bool: cookie是否有效 + """ + try: + if not token: + # 如果没有token,尝试从cookie中构建 + pass + + # 构建请求头 + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Referer': f'https://mp.weixin.qq.com/', + } + + # 尝试访问一个需要登录的接口 + test_api = f'https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN' + + response = requests.get(test_api, cookies=cookie_dict, headers=headers, timeout=10) + + # 检查响应 + if response.status_code == 200: + try: + data = response.json() + if 'base_resp' in data: + err_msg = data['base_resp'].get('err_msg', '') + if err_msg in ['ok', '']: + logger.info("cookie验证有效") + return True + else: + logger.warning(f"cookie验证返回错误: {err_msg}") + return False + except: + # 如果不是JSON响应,可能仍然有效 + logger.info("cookie可能有效") + return True + else: + logger.warning(f"cookie验证请求失败: {response.status_code}") + return False + + except Exception as e: + logger.error(f"验证cookie时出错: {e}") + return False + + +def main(type): + """主函数""" + logger.info("微信公众号Cookie获取工具") + + try: + # 1. 手动登录获取cookie + result = manual_login_and_get_cookie() + + if not result: + logger.error("获取cookie失败") + return + + cookie_string = result['cookie_string'] + cookie_dict = result['cookie_dict'] + token = result['token'] + + # 2. 验证cookie有效性 + logger.info("正在验证cookie有效性...") + is_valid = verify_cookie_valid(cookie_dict, token) + + if not is_valid: + logger.warning("cookie可能无效,但仍会保存") + + # 3. 检查cookie是否已存在并保存 + logger.info("正在检查cookie是否已存在...") + exists, idx = is_cookie_exists(cookie_string) + + if exists: + logger.info(f"Cookie已存在 (索引: {idx})") + + # 询问是否覆盖 + choice = type + if choice == 'y': + saved = save_cookie_to_redis(cookie_string, force_save=True) + if saved: + logger.info("已覆盖更新cookie") + else: + logger.info("取消保存") + else: + # 保存新cookie + saved = save_cookie_to_redis(cookie_string) + if saved: + logger.info("新cookie保存成功") + + # 4. 显示当前cookie列表状态 + total_cookies = redis_client.llen(COOKIE_KEY) + logger.info(f"当前Redis中cookie总数: {total_cookies}") + + + except KeyboardInterrupt: + logger.info("用户中断程序") + except Exception as e: + logger.error(f"程序执行出错: {e}") + finally: + driver.quit() + logger.info("程序结束") + +if __name__ == "__main__": + # 是否覆盖原有cookie? y:覆盖、n:不覆盖 + type = 'y' + # 运行主程序 + main(type) + + diff --git a/spiders/WebsiteSpider/WebsiteSpider/utils/http_utils.py b/spiders/WebsiteSpider/WebsiteSpider/utils/http_utils.py index 3fdf289..13a1da2 100644 --- a/spiders/WebsiteSpider/WebsiteSpider/utils/http_utils.py +++ b/spiders/WebsiteSpider/WebsiteSpider/utils/http_utils.py @@ -20,11 +20,11 @@ def http_get(url): return rsp -def http_post(url, data, headers=None): +def http_post(url, data, headers=None, timeout=60): if headers: - rsp = requests.post(url, data=data, headers=headers) + rsp = requests.post(url, data=data, headers=headers, timeout=timeout) else: - rsp = requests.post(url, data=data, headers={'User-Agent': ua}) + rsp = requests.post(url, data=data, headers={'User-Agent': ua}, timeout=timeout) return rsp diff --git a/spiders/WebsiteSpider/WebsiteSpider/utils/selenium_utils.py b/spiders/WebsiteSpider/WebsiteSpider/utils/selenium_utils.py index 258ca43..8e34698 100644 --- a/spiders/WebsiteSpider/WebsiteSpider/utils/selenium_utils.py +++ b/spiders/WebsiteSpider/WebsiteSpider/utils/selenium_utils.py @@ -26,8 +26,8 @@ def check_session(drive_path): api = drive_path + '/graphql' post_body = '{"query": "{ grid { maxSession, sessionCount } }"}' try: - # 添加超时控制,5分钟 = 300秒 - response = http_post(api, post_body, timeout=300) + # 添加超时控制,1分钟 = 600秒 + response = http_post(api, post_body, timeout=60) data_body = json.loads(response.content.decode()) session_info = data_body['data']['grid'] return session_info diff --git a/spiders/WebsiteSpider/run.py b/spiders/WebsiteSpider/run.py index 9d09b3c..80ce930 100644 --- a/spiders/WebsiteSpider/run.py +++ b/spiders/WebsiteSpider/run.py @@ -9,4 +9,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__)) sys.path.append(dirpath) if __name__ == "__main__": - execute(['scrapy', 'crawl', 'website_info_common', '-a', 'params={"job_id":"801","clusterName":"star_4"}']) + execute(['scrapy', 'crawl', 'Website_report_list', '-a', 'params={"job_id":"801","clusterName":"star_4"}'])