[今日头条] 视频页面采集优化

This commit is contained in:
DELL 2026-03-06 14:37:12 +08:00
parent 592e8b6cfb
commit 734cab960b

View File

@ -48,7 +48,7 @@ class HotSearchRedisSpider(scrapy.Spider):
PAGE_LOAD_TIMEOUT = 10
ELEMENT_WAIT_TIMEOUT = 5
MAX_NEWS_PER_HOT = 1
MAX_HOT_ITEMS = 10
MAX_HOT_ITEMS = 15
# 选择器定义
# URL_SELECTORS = [
@ -271,9 +271,9 @@ class HotSearchRedisSpider(scrapy.Spider):
except NoSuchElementException:
break
if "video" in news_url.lower() or not news_url:
self.logger.info(f"跳过该链接采集: {news_url}")
break
# if "video" in news_url.lower() or not news_url:
# self.logger.info(f"跳过该链接采集: {news_url}")
# break
news_urls_array.append(news_url)
else:
@ -301,6 +301,21 @@ class HotSearchRedisSpider(scrapy.Spider):
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
"""处理单个新闻页面"""
try:
if "video" in news_url.lower() or not news_url:
driver.get(news_url)
self._wait_for_page_load(driver)
title = driver.find_elements(By.XPATH, "//div[@class='ttp-video-extras-title']/h1")[0].get_attribute('title')
time_text = driver.find_elements(By.XPATH, "//div[@class='meta-info']/span[@class='publish-time']")[0].text.replace("发布于 ", "").strip()
author = driver.find_elements(By.XPATH, "//div[@class='author-info']/a[@class='author-name']")[0].text.strip()
content = news_url
if time_text:
try:
url_time = get_time_stamp(time_text)
except Exception as e:
self.logger.debug(f"时间转换失败: {time_text}, {e}")
else:
driver.get(news_url)
self._wait_for_page_load(driver)