[今日头条] 标题采集优化

This commit is contained in:
DELL 2026-03-05 10:13:00 +08:00
parent 5b3bf034f8
commit 592e8b6cfb

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import json
import logging
import time
from typing import List, Dict, Any, Optional
@@ -303,6 +304,12 @@ class HotSearchRedisSpider(scrapy.Spider):
driver.get(news_url)
self._wait_for_page_load(driver)
+# 标题采集
+try:
+title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
+except Exception as e:
+logging.error(f'标题采集失败,已使用热搜名称...')
+title = hot_item['hot_word']
# 提取页面信息
author = self._extract_text(driver, self.AUTHOR_SELECTORS)
content = self._extract_content(driver)
@@ -318,7 +325,7 @@ class HotSearchRedisSpider(scrapy.Spider):
even_details_item['es_srcname'] = 'https://www.toutiao.com/'
even_details_item['es_sitename'] = '今日头条'
even_details_item['es_sid'] = get_str_md5(news_url)
-even_details_item['es_urltitle'] = author
+even_details_item['es_urltitle'] = title
even_details_item['es_authors'] = author
even_details_item['es_urlcontent'] = content
even_details_item['es_urltime'] = url_time