[今日头条] 视频页面采集优化

This commit is contained in:
DELL 2026-03-06 14:37:12 +08:00
parent 592e8b6cfb
commit 734cab960b

View File

@@ -48,7 +48,7 @@ class HotSearchRedisSpider(scrapy.Spider):
PAGE_LOAD_TIMEOUT = 10 PAGE_LOAD_TIMEOUT = 10
ELEMENT_WAIT_TIMEOUT = 5 ELEMENT_WAIT_TIMEOUT = 5
MAX_NEWS_PER_HOT = 1 MAX_NEWS_PER_HOT = 1
MAX_HOT_ITEMS = 10 MAX_HOT_ITEMS = 15
# 选择器定义 # 选择器定义
# URL_SELECTORS = [ # URL_SELECTORS = [
@@ -271,9 +271,9 @@ class HotSearchRedisSpider(scrapy.Spider):
except NoSuchElementException: except NoSuchElementException:
break break
if "video" in news_url.lower() or not news_url: # if "video" in news_url.lower() or not news_url:
self.logger.info(f"跳过该链接采集: {news_url}") # self.logger.info(f"跳过该链接采集: {news_url}")
break # break
news_urls_array.append(news_url) news_urls_array.append(news_url)
else: else:
@@ -301,19 +301,34 @@ class HotSearchRedisSpider(scrapy.Spider):
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]): def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
"""处理单个新闻页面""" """处理单个新闻页面"""
try: try:
driver.get(news_url) if "video" in news_url.lower() or not news_url:
self._wait_for_page_load(driver) driver.get(news_url)
self._wait_for_page_load(driver)
title = driver.find_elements(By.XPATH, "//div[@class='ttp-video-extras-title']/h1")[0].get_attribute('title')
time_text = driver.find_elements(By.XPATH, "//div[@class='meta-info']/span[@class='publish-time']")[0].text.replace("发布于 ", "").strip()
author = driver.find_elements(By.XPATH, "//div[@class='author-info']/a[@class='author-name']")[0].text.strip()
content = news_url
# 标题采集 if time_text:
try: try:
title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip() url_time = get_time_stamp(time_text)
except Exception as e: except Exception as e:
logging.error(f'标题采集失败,已使用热搜名称...') self.logger.debug(f"时间转换失败: {time_text}, {e}")
title = hot_item['hot_word']
# 提取页面信息 else:
author = self._extract_text(driver, self.AUTHOR_SELECTORS) driver.get(news_url)
content = self._extract_content(driver) self._wait_for_page_load(driver)
url_time = self._extract_time(driver)
# 标题采集
try:
title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
except Exception as e:
logging.error(f'标题采集失败,已使用热搜名称...')
title = hot_item['hot_word']
# 提取页面信息
author = self._extract_text(driver, self.AUTHOR_SELECTORS)
content = self._extract_content(driver)
url_time = self._extract_time(driver)
if not content: if not content:
self.logger.warning(f"页面无有效内容: {news_url}") self.logger.warning(f"页面无有效内容: {news_url}")