From 734cab960b443d54a7a807fd26e2cdda94bbec89 Mon Sep 17 00:00:00 2001
From: DELL
Date: Fri, 6 Mar 2026 14:37:12 +0800
Subject: [PATCH] =?UTF-8?q?[=E4=BB=8A=E6=97=A5=E5=A4=B4=E6=9D=A1]=20?=
 =?UTF-8?q?=E8=A7=86=E9=A2=91=E9=A1=B5=E9=9D=A2=E9=87=87=E9=9B=86=E4=BC=98?=
 =?UTF-8?q?=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../spiders/HotSearchRedisSpider.py           | 47 ++++++++++++------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
index 5576181..cc3c120 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
@@ -48,7 +48,7 @@ class HotSearchRedisSpider(scrapy.Spider):
     PAGE_LOAD_TIMEOUT = 10
     ELEMENT_WAIT_TIMEOUT = 5
     MAX_NEWS_PER_HOT = 1
-    MAX_HOT_ITEMS = 10
+    MAX_HOT_ITEMS = 15
 
     # 选择器定义
     # URL_SELECTORS = [
@@ -271,9 +271,9 @@ class HotSearchRedisSpider(scrapy.Spider):
                     except NoSuchElementException:
                         break
 
-                if "video" in news_url.lower() or not news_url:
-                    self.logger.info(f"跳过该链接采集: {news_url}")
-                    break
+                # if "video" in news_url.lower() or not news_url:
+                #     self.logger.info(f"跳过该链接采集: {news_url}")
+                #     break
 
                 news_urls_array.append(news_url)
             else:
@@ -301,19 +301,34 @@ class HotSearchRedisSpider(scrapy.Spider):
     def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
         """处理单个新闻页面"""
         try:
-            driver.get(news_url)
-            self._wait_for_page_load(driver)
+            if "video" in news_url.lower() or not news_url:
+                driver.get(news_url)
+                self._wait_for_page_load(driver)
+                title = driver.find_elements(By.XPATH, "//div[@class='ttp-video-extras-title']/h1")[0].get_attribute('title')
+                time_text = driver.find_elements(By.XPATH, "//div[@class='meta-info']/span[@class='publish-time']")[0].text.replace("发布于 ", "").strip()
+                author = driver.find_elements(By.XPATH, "//div[@class='author-info']/a[@class='author-name']")[0].text.strip()
+                content = news_url
 
-            # 标题采集
-            try:
-                title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
-            except Exception as e:
-                logging.error(f'标题采集失败,已使用热搜名称...')
-                title = hot_item['hot_word']
-            # 提取页面信息
-            author = self._extract_text(driver, self.AUTHOR_SELECTORS)
-            content = self._extract_content(driver)
-            url_time = self._extract_time(driver)
+                if time_text:
+                    try:
+                        url_time = get_time_stamp(time_text)
+                    except Exception as e:
+                        self.logger.debug(f"时间转换失败: {time_text}, {e}")
+
+            else:
+                driver.get(news_url)
+                self._wait_for_page_load(driver)
+
+                # 标题采集
+                try:
+                    title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
+                except Exception as e:
+                    logging.error(f'标题采集失败,已使用热搜名称...')
+                    title = hot_item['hot_word']
+                # 提取页面信息
+                author = self._extract_text(driver, self.AUTHOR_SELECTORS)
+                content = self._extract_content(driver)
+                url_time = self._extract_time(driver)
 
             if not content:
                 self.logger.warning(f"页面无有效内容: {news_url}")