diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py index c707258..5576181 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import json +import logging import time from typing import List, Dict, Any, Optional @@ -303,6 +304,12 @@ class HotSearchRedisSpider(scrapy.Spider): driver.get(news_url) self._wait_for_page_load(driver) + # 标题采集 + try: + title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip() + except Exception as e: + logging.error(f'标题采集失败,已使用热搜名称...') + title = hot_item['hot_word'] # 提取页面信息 author = self._extract_text(driver, self.AUTHOR_SELECTORS) content = self._extract_content(driver) @@ -318,7 +325,7 @@ class HotSearchRedisSpider(scrapy.Spider): even_details_item['es_srcname'] = 'https://www.toutiao.com/' even_details_item['es_sitename'] = '今日头条' even_details_item['es_sid'] = get_str_md5(news_url) - even_details_item['es_urltitle'] = author + even_details_item['es_urltitle'] = title even_details_item['es_authors'] = author even_details_item['es_urlcontent'] = content even_details_item['es_urltime'] = url_time