From 592e8b6cfbcfa2669678fcb35b0de54396691477 Mon Sep 17 00:00:00 2001 From: DELL Date: Thu, 5 Mar 2026 10:13:00 +0800 Subject: [PATCH] =?UTF-8?q?[=E4=BB=8A=E6=97=A5=E5=A4=B4=E6=9D=A1]=20?= =?UTF-8?q?=E6=A0=87=E9=A2=98=E9=87=87=E9=9B=86=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MediaSpiders/spiders/HotSearchRedisSpider.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py index c707258..5576181 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import json +import logging import time from typing import List, Dict, Any, Optional @@ -303,6 +304,12 @@ class HotSearchRedisSpider(scrapy.Spider): driver.get(news_url) self._wait_for_page_load(driver) + # 标题采集 + try: + title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip() + except Exception as e: + logging.error(f'标题采集失败,已使用热搜名称...') + title = hot_item['hot_word'] # 提取页面信息 author = self._extract_text(driver, self.AUTHOR_SELECTORS) content = self._extract_content(driver) @@ -318,7 +325,7 @@ class HotSearchRedisSpider(scrapy.Spider): even_details_item['es_srcname'] = 'https://www.toutiao.com/' even_details_item['es_sitename'] = '今日头条' even_details_item['es_sid'] = get_str_md5(news_url) - even_details_item['es_urltitle'] = author + even_details_item['es_urltitle'] = title even_details_item['es_authors'] = author even_details_item['es_urlcontent'] = content even_details_item['es_urltime'] = url_time