From d01035577bbc925945ca180ac7e47b110e6c0e93 Mon Sep 17 00:00:00 2001
From: DELL
Date: Sat, 28 Feb 2026 16:15:32 +0800
Subject: [PATCH] [Toutiao] Collect news related to hot-search items
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../scrapy_selenium/middlewares.py            |   2 +-
 .../spiders/BaiduHotSearchSprder.py           |   1 +
 .../spiders/HotSearchRedisSpider.py           | 295 ++++++++++++++++++
 .../MediaSpiders/spiders/hot_search_spider.py |  90 ++++--
 .../utils/hot_search_json_parser.py           |  13 +-
 .../MediaSpiders/utils/time_utils.py          |  61 +++-
 spiders/MediaSpiders/run.py                   |   2 +-
 7 files changed, 431 insertions(+), 33 deletions(-)
 create mode 100644 spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py

diff --git a/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py b/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py
index bc3b37d..acc696a 100644
--- a/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py
+++ b/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py
@@ -76,7 +76,7 @@ class SeleniumMiddleware:
         }
         edge_options.add_experimental_option("prefs", prefs)
 
-        self.driver = Edge(executable_path=r"C:\Program Files\Python38\msedgedriver.exe", options=edge_options)
+        self.driver = Edge(executable_path=r"D:\msedgedriver.exe", options=edge_options)
 
     @classmethod
     def from_crawler(cls, crawler):
diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/BaiduHotSearchSprder.py b/spiders/MediaSpiders/MediaSpiders/spiders/BaiduHotSearchSprder.py
index d9c6d70..f6ac5eb 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/BaiduHotSearchSprder.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/BaiduHotSearchSprder.py
@@ -45,6 +45,7 @@ class BaiduHotSearchSprder(scrapy.Spider):
             'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
             'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
         },
+        'DOWNLOADER_MIDDLEWARES': {},
         'BATCH_SAVE_SIZE': 50
     }
 
diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
new file mode 100644
index 0000000..94438ea
--- /dev/null
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
@@ -0,0 +1,295 @@
+# -*- coding: utf-8 -*-
+import json
+import time
+from typing import List, Dict, Any, Optional
+
+import scrapy
+from scrapy_selenium import SeleniumRequest
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webelement import WebElement
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from MediaSpiders.items import MediaspidersItem
+from MediaSpiders.utils.hot_search_json_parser import url_response
+from MediaSpiders.utils.string_utils import get_str_md5
+from MediaSpiders.utils.time_utils import get_current_timestamp, get_time_stamp
+
+
+class HotSearchRedisSpider(scrapy.Spider):
+    name = 'HotSearchRedisSpider'
+
+    custom_settings = {
+        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
+        'PROTO_CLASS_NAME': 'EsSets',
+        'PROTO_FIELD_NAME': 'Es',
+        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
+        'IMAGES_STORE': r'/usr/local/temp_image/twitter',
+        'IMAGES_RESULT_FIELD': 'es_urlimage',
+        'FILES_STORE': r'/usr/local/videos',
+        'FILES_RESULT_FIELD': 'es_video',
+        'ZIP_FILE_NAME': 'image_data_ship_',
+        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
+        'ITEM_PIPELINES': {
+            'scrapy.pipelines.images.ImagesPipeline': 2,
+            'scrapy.pipelines.files.FilesPipeline': 1,
+            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
+        },
+        'SPIDER_MIDDLEWARES': {
+            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
+            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
+        }
+    }
+
+    # Constants
+    TOUTIAO_HOT_URL = 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
+    TOUTIAO_HOME_URL = 'https://www.toutiao.com/'
+    PAGE_LOAD_TIMEOUT = 10
+    ELEMENT_WAIT_TIMEOUT = 5
+    MAX_NEWS_PER_HOT = 6
+    MAX_HOT_ITEMS = 10
+
+    # Selectors
+    URL_SELECTORS = [
+        '.l-content a',
+        '.feed-card-wtt-l p a',
+        '.feed-card-article-l a'
+    ]
+
+    AUTHOR_SELECTORS = [
+        "//div[@class='author-info']/div[@class='desc']/a[@class='name']",
+        "//div[@class='user-info']/a[@class='user-name']"
+    ]
+
+    CONTENT_SELECTORS = [
+        "//div[@class='article-content']//p",
+        "//article/div[@class='weitoutiao-html']"
+    ]
+
+    TIME_SELECTORS = [
+        "//p[@class='abstract']/span[@class='time']",
+        "//div[@class='article-meta']/span[1]"
+    ]
+
+    # Text patterns that mark boilerplate lines to skip
+    SKIP_PATTERNS = ['版权', '声明', '邮箱', '记者', '编辑', '来源', '投稿', '责任编辑']
+
+    def __init__(self, params=None, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.url_time = get_current_timestamp()
+        self.total_num = 0
+        self.authorization = None
+        self.job_id = None
+
+        if params:
+            try:
+                json_params = json.loads(params)
+                self.total_num = int(json_params.get('totalNum', 0))
+                self.authorization = json_params.get('authorization')
+                self.job_id = json_params.get('job_id')
+            except (json.JSONDecodeError, ValueError) as e:
+                self.logger.error(f"Failed to parse params: {e}")
+
+    def start_requests(self):
+        """Issue the initial request."""
+        yield SeleniumRequest(
+            url=self.TOUTIAO_HOME_URL,
+            callback=self.parse_parent,
+            wait_time=self.PAGE_LOAD_TIMEOUT
+        )
+
+    def parse_parent(self, response):
+        """Parse the hot-board listing page."""
+        driver = response.request.meta['driver']
+
+        # Fetch the hot-board data
+        hot_items = self._fetch_hot_items()
+
+        for hot_item in hot_items[:self.MAX_HOT_ITEMS]:
+            if not hot_item.get('fake_url'):
+                self.logger.warning(f"Hot item {hot_item['hot_word']} has no valid URL, skipping")
+                continue
+
+            yield from self._process_hot_item(driver, hot_item)
+
+    def _fetch_hot_items(self) -> List[Dict[str, Any]]:
+        """Fetch the hot-board data."""
+        try:
+            rsp_body = url_response(self.TOUTIAO_HOT_URL)
+            if rsp_body.get('status') != "success":
+                self.logger.error("Failed to fetch hot-board data")
+                return []
+
+            result_array = []
+            for line in rsp_body.get('data', []):
+                try:
+                    result_array.append({
+                        "hot_id": line.get('ClusterIdStr', ''),
+                        "hot_word": line.get('Title', ''),
+                        "hot_value": int(line.get('HotValue', 0)),
+                        "fake_url": line.get('Url', '')
+                    })
+                except Exception as e:
+                    self.logger.error(f"Failed to parse hot item: {e}")
+                    self.logger.debug(f"Offending record: {line}")
+
+            return result_array
+
+        except Exception as e:
+            self.logger.error(f"Exception while fetching hot-board data: {e}")
+            return []
+
+    def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
+        """Process a single hot item."""
+        try:
+            # Load the hot item's landing page
+            driver.get(hot_item['fake_url'])
+            self._wait_for_page_load(driver)
+
+            # Collect the news cards
+            news_cards = self._get_news_cards(driver)
+            news_urls_array = []
+            for card in news_cards:
+                news_url = self._extract_url_from_card(card)
+                if not news_url:
+                    continue
+
+                if "video" in news_url.lower():
+                    self.logger.info(f"Skipping video link: {news_url}")
+                    continue
+                news_urls_array.append(news_url)
+
+            for url in news_urls_array:
+                yield from self._process_news_page(driver, url, hot_item)
+
+        except Exception as e:
+            self.logger.error(f"Failed to process hot item '{hot_item['hot_word']}': {e}")
+
+    def _wait_for_page_load(self, driver, timeout: int = None):
+        """Wait for the page to load."""
+        timeout = timeout or self.PAGE_LOAD_TIMEOUT
+        time.sleep(2)  # baseline settle time
+        try:
+            WebDriverWait(driver, timeout).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+        except TimeoutException:
+            self.logger.warning("Page load timed out")
+
+    def _get_news_cards(self, driver) -> List[WebElement]:
+        """Collect the list of news cards."""
+        try:
+            cards = driver.find_elements(By.CSS_SELECTOR, ".feed-card-wrapper")
+            return cards[:self.MAX_NEWS_PER_HOT]
+        except Exception as e:
+            self.logger.error(f"Failed to collect news cards: {e}")
+            return []
+
+    def _extract_url_from_card(self, card: WebElement) -> Optional[str]:
+        """Extract the article URL from a card."""
+        for selector in self.URL_SELECTORS:
+            try:
+                element = card.find_element(By.CSS_SELECTOR, selector)
+                url = element.get_attribute('href')
+                if url and url.startswith(('http://', 'https://')):
+                    return url
+            except NoSuchElementException:
+                continue
+        return None
+
+    def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
+        """Process a single news page."""
+        try:
+            driver.get(news_url)
+            self._wait_for_page_load(driver)
+
+            # Extract the page details
+            author = self._extract_text(driver, self.AUTHOR_SELECTORS)
+            content = self._extract_content(driver)
+            url_time = self._extract_time(driver)
+
+            if not content:
+                self.logger.warning(f"Page has no usable content: {news_url}")
+                return
+
+            # Build the item
+            even_details_item = MediaspidersItem()
+            even_details_item['es_carriertype'] = 'news'
+            even_details_item['es_srcname'] = 'https://www.toutiao.com/'
+            even_details_item['es_sitename'] = '今日头条'
+            even_details_item['es_sid'] = get_str_md5(news_url)
+            even_details_item['es_urltitle'] = driver.title  # page <title> as the article title
+            even_details_item['es_authors'] = author
+            even_details_item['es_urlcontent'] = content
+            even_details_item['es_urltime'] = url_time
+            even_details_item['es_lasttime'] = url_time
+            even_details_item['es_urlname'] = news_url
+            even_details_item['es_hkey'] = hot_item['hot_id']
+            even_details_item['es_urltopic'] = hot_item['hot_word']
+            even_details_item['es_video'] = ''
+
+            yield even_details_item
+
+        except Exception as e:
+            self.logger.error(f"Failed to process news page {news_url}: {e}")
+
+    def _extract_text(self, context, selectors: List[str]) -> Optional[str]:
+        """Extract the text of the first visible matching element."""
+        for selector in selectors:
+            try:
+                elements = context.find_elements(By.XPATH, selector)
+
+                for elem in elements:
+                    if elem.is_displayed():
+                        text = elem.text.strip()
+                        if text:
+                            return text
+            except Exception as e:
+                self.logger.debug(f"Selector '{selector}' did not match: {e}")
+        return None
+
+    def _extract_content(self, driver) -> str:
+        """Extract the article body text."""
+        try:
+            time.sleep(2)  # wait for the content to render
+
+            content_lines = []
+            for selector in self.CONTENT_SELECTORS:
+                try:
+                    # find_elements yields WebElements for both the article
+                    # paragraphs and the weitoutiao container
+                    paragraphs = driver.find_elements(By.XPATH, selector)
+
+                    for p in paragraphs:
+                        text = p.text.strip()
+                        # drop empty and boilerplate lines (see SKIP_PATTERNS)
+                        if self._is_valid_content(text):
+                            content_lines.append(text)
+
+                except Exception as e:
+                    self.logger.debug(f"Selector '{selector}' failed to extract: {e}")
+
+            return '\n'.join(content_lines) if content_lines else ""
+
+        except Exception as e:
+            self.logger.error(f"Failed to extract content: {e}")
+            return ""
+
+    def _is_valid_content(self, text: str) -> bool:
+        """Check whether a text line is real content."""
+        if not text or len(text) <= 10:
+            return False
+        return not any(pattern in text for pattern in self.SKIP_PATTERNS)
+
+    def _extract_time(self, driver) -> Optional[int]:
+        """Extract the publish time."""
+        time_text = self._extract_text(driver, self.TIME_SELECTORS)
+        if time_text:
+            try:
+                return get_time_stamp(time_text)
+            except Exception as e:
+                self.logger.debug(f"Time conversion failed: {time_text}, {e}")
+        return self.url_time
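+
+# Minimal invocation sketch (mirrors run.py below; the params JSON keys are
+# the ones read in __init__, and the values here are placeholders):
+#
+#   scrapy crawl HotSearchRedisSpider -a params='{"totalNum": 60, "job_id": "demo-1"}'
+#
+# All params keys are optional; with no params the spider walks the top
+# MAX_HOT_ITEMS hot items and up to MAX_NEWS_PER_HOT news pages for each.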
diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py b/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py
index 217e73f..a7e08a3 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py
@@ -1,7 +1,11 @@
-import scrapy
 import json
-from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
+
+import redis
+import scrapy
+
 from MediaSpiders.items import MediaspidersItem
+from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
+from MediaSpiders.utils.time_utils import get_current_timestamp
 
 
 class HotSearchSpider(scrapy.Spider):
@@ -22,42 +26,76 @@ class HotSearchSpider(scrapy.Spider):
             'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
             # 'MediaSpiders.pipelines.HotSearchSaveToMySQL': 300
         },
-        'SPIDER_MIDDLEWARES': {},
+        'SPIDER_MIDDLEWARES': {
+            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
+            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
+        },
         'DOWNLOADER_MIDDLEWARES': {},
         'BATCH_SAVE_SIZE': 50
     }
 
     start_urls = [
-        'https://weibo.com/ajax/side/hotSearch',
+        # 'https://weibo.com/ajax/side/hotSearch',
         'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
     ]
 
     def __init__(self, params=None, *args, **kwargs):
         super(HotSearchSpider, self).__init__(*args, **kwargs)
+        self.job_id = None
+        self.max_items = 0
+        self.collected_items = []
+        # self.settings is not attached until after __init__, so the Redis
+        # connection is opened lazily in start_requests
+        self.redis_client = None
         if params:
-            json_params = json.loads(params)
-            if 'job_id' in json_params:
-                self.job_id = json_params['job_id']
+            try:
+                json_params = json.loads(params)
+                if 'job_id' in json_params:
+                    self.job_id = json_params['job_id']
+                if 'max_items' in json_params:
+                    self.max_items = int(json_params['max_items'])
+            except Exception as e:
+                self.logger.error(f"Failed to parse params: {str(e)}")
+
+    def start_requests(self):
+        """Issue the initial requests."""
+        self.logger.info(f"Start crawling hot-search data, job ID: {self.job_id if self.job_id else 'N/A'}")
+        self.redis_client = redis.Redis(
+            host=self.settings['REDIS_HOST'],
+            port=self.settings['REDIS_PORT'],
+            password=self.settings['REDIS_PWD']
+        )
+        self.url_time = get_current_timestamp()
+        for url in self.start_urls:
+            yield scrapy.Request(
+                url=url,
+                callback=self.parse
+            )
 
     def parse(self, response):
         result_array = []
-        if 'weibo.com' in response.url:
-            result_array = parse_weibo_response(response.text)
-        elif 'toutiao.com' in response.url:
-            result_array = parse_toutiao_response(response.text)
+        try:
+            if 'weibo.com' in response.url:
+                # parse_weibo_response now expects a parsed JSON body
+                result_array = parse_weibo_response(json.loads(response.text))
+            elif 'toutiao.com' in response.url:
+                result_array = parse_toutiao_response(response.text)
+
+            for line in result_array:
+                hot_search_item = MediaspidersItem()
+                hot_search_item['es_carriertype'] = 'hot_search'
+                hot_search_item['es_sid'] = line['id']
+                hot_search_item['es_hkey'] = line['hot_id']
+                hot_search_item['es_urltitle'] = line['hot_word']
+                hot_search_item['es_urlcontent'] = line['hot_word']
+                hot_search_item['es_heat'] = line['hot_value']
+                hot_search_item['es_catalog'] = line['category']
+                hot_search_item['es_simrank'] = line['realtime_rank']
+                hot_search_item['es_sitename'] = line['platform']
+                hot_search_item['es_urltime'] = line['onboard_time']
+                hot_search_item['es_lasttime'] = line['crawl_time']
+                hot_search_item['es_urlname'] = line['fake_url']
+
+                yield hot_search_item
+
+        except Exception as e:
+            self.logger.exception(f"Parse error: {str(e)}")
 
-        for line in result_array:
-            hot_search_item = MediaspidersItem()
-            hot_search_item['es_carriertype'] = 'hot_search'
-            hot_search_item['es_sid'] = line['id']
-            hot_search_item['es_hkey'] = line['hot_id']
-            hot_search_item['es_urltitle'] = line['hot_word']
-            hot_search_item['es_urlcontent'] = line['hot_word']
-            hot_search_item['es_heat'] = line['hot_value']
-            hot_search_item['es_catalog'] = line['category']
-            hot_search_item['es_simrank'] = line['realtime_rank']
-            hot_search_item['es_sitename'] = line['platform']
-            hot_search_item['es_urltime'] = line['onboard_time']
-            hot_search_item['es_lasttime'] = line['crawl_time']
-            hot_search_item['es_urlname'] = line['fake_url']
-            yield hot_search_item
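+
+    # The Redis client is opened but nothing publishes to it yet. A minimal
+    # sketch of the intended hand-off (assumption: items are queued as JSON on
+    # a list; the key name 'hot_search:items' is hypothetical):
+    #
+    #     self.redis_client.lpush(
+    #         'hot_search:items',
+    #         json.dumps(dict(hot_search_item), ensure_ascii=False)
+    #     )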
diff --git a/spiders/MediaSpiders/MediaSpiders/utils/hot_search_json_parser.py b/spiders/MediaSpiders/MediaSpiders/utils/hot_search_json_parser.py
index 2f4fbda..dc068a2 100644
--- a/spiders/MediaSpiders/MediaSpiders/utils/hot_search_json_parser.py
+++ b/spiders/MediaSpiders/MediaSpiders/utils/hot_search_json_parser.py
@@ -2,11 +2,13 @@ import json
 import uuid
 import logging
 import time
+
+import requests
+
 from MediaSpiders.utils.string_utils import get_str_md5
 
 
-def parse_weibo_response(rsp_str):
-    rsp_body = json.loads(rsp_str)
+def parse_weibo_response(rsp_body):
     result_array = []
     if rsp_body['ok'] == 1:
         realtime_data = rsp_body['data']['realtime']
@@ -56,7 +58,7 @@ def parse_toutiao_response(rsp_str):
                 "platform": "今日头条",
                 "onboard_time": current_timestamp,
                 "crawl_time": current_timestamp,
-                "fake_url": f"https://www.toutiao.com/hot-event/hot-board/{custom_sid}"
+                "fake_url": line['Url']
             }
             if 'InterestCategory' in line:
                 result_line['category'] = ",".join(line['InterestCategory'])
@@ -66,6 +68,11 @@ def parse_toutiao_response(rsp_str):
             logging.info(json.dumps(line, ensure_ascii=False))
     return result_array
 
+def url_response(url):
+    # a short timeout keeps a dead endpoint from hanging the caller
+    rsp = requests.get(url, timeout=10)
+    rsp.raise_for_status()
+    return rsp.json()
+
 
 if __name__ == "__main__":
     # rsp_file = open("./toutiao_hot_search.json", 'r', encoding='utf-8')
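+
+# Usage sketch for url_response, mirroring HotSearchRedisSpider._fetch_hot_items
+# (the 'status'/'data'/'Title' keys are what the hot-board endpoint returns):
+#
+#     body = url_response('https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc')
+#     if body.get('status') == 'success':
+#         titles = [line.get('Title', '') for line in body.get('data', [])]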
diff --git a/spiders/MediaSpiders/MediaSpiders/utils/time_utils.py b/spiders/MediaSpiders/MediaSpiders/utils/time_utils.py
index b34b829..54eaf3b 100644
--- a/spiders/MediaSpiders/MediaSpiders/utils/time_utils.py
+++ b/spiders/MediaSpiders/MediaSpiders/utils/time_utils.py
@@ -1,10 +1,67 @@
-import datetime
 import time
-
+from datetime import datetime, timezone, timedelta
+import re
 
 def get_current_timestamp():
     return int(time.time() * 1000)
 
+def str_to_timestamp(dt_str: str, tz_offset: int = 8) -> int:
+    """
+    Convert a time string to a Unix timestamp in seconds.
+
+    Supported formats:
+    - 'YYYY-MM-DD HH:MM'
+    - 'YYYY-MM-DD HH:MM:SS'
+    - mixed strings carrying extra text (e.g. "2026-02-27 20:11·头条新锐创作者")
+
+    Args:
+        dt_str: time string (the timestamp part is extracted automatically)
+        tz_offset: timezone offset in hours; use 8 for China
+
+    Returns:
+        integer timestamp
+
+    Raises:
+        ValueError: raised when no valid time format can be extracted
+    """
+    # strip surrounding whitespace
+    dt_str = dt_str.strip()
+
+    # pull out the timestamp part (YYYY-MM-DD HH:MM with optional :SS)
+    time_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}(?::\d{2})?)'
+    match = re.search(time_pattern, dt_str)
+
+    if not match:
+        raise ValueError(f"Could not extract a valid time format from: {dt_str}")
+
+    time_str = match.group(1)
+
+    # the regex guarantees one of exactly two shapes, so pick the format
+    # by whether the seconds field is present
+    fmt = '%Y-%m-%d %H:%M:%S' if len(time_str) == 19 else '%Y-%m-%d %H:%M'
+    try:
+        dt = datetime.strptime(time_str, fmt)
+    except ValueError as e:
+        raise ValueError(f"Failed to parse time string: {time_str}") from e
+
+    # attach the timezone and convert
+    tz = timezone(timedelta(hours=tz_offset))
+    return int(dt.replace(tzinfo=tz).timestamp())
+
 def get_time_stamp(date_str):
     try:
diff --git a/spiders/MediaSpiders/run.py b/spiders/MediaSpiders/run.py
index 366e6ef..9bcdf93 100644
--- a/spiders/MediaSpiders/run.py
+++ b/spiders/MediaSpiders/run.py
@@ -20,4 +20,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(dirpath)
 # 等效于:scrapy crawl FacebookUserSpider -a params="{}"
 # execute(['scrapy', 'crawl', 'hot_search_spider', '-a', 'params={}'])
-execute(['scrapy', 'crawl', 'BaiduHotSearchSprder', '-a', 'params={}'])
\ No newline at end of file
+execute(['scrapy', 'crawl', 'HotSearchRedisSpider', '-a', 'params={}'])
\ No newline at end of file
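
--
A minimal usage sketch for the new str_to_timestamp helper (the input mirrors
the docstring example; the expected value assumes the default tz_offset=8):

    from MediaSpiders.utils.time_utils import str_to_timestamp

    str_to_timestamp("2026-02-27 20:11·头条新锐创作者")
    # -> 1772194260, i.e. 2026-02-27 20:11 UTC+8 (12:11 UTC)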