[今日头条] 视频页面采集优化
This commit is contained in:
parent
592e8b6cfb
commit
734cab960b
@@ -48,7 +48,7 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
PAGE_LOAD_TIMEOUT = 10
|
||||
ELEMENT_WAIT_TIMEOUT = 5
|
||||
MAX_NEWS_PER_HOT = 1
|
||||
- MAX_HOT_ITEMS = 10
|
||||
+ MAX_HOT_ITEMS = 15
|
||||
|
||||
# 选择器定义
|
||||
# URL_SELECTORS = [
|
||||
@@ -271,9 +271,9 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
except NoSuchElementException:
|
||||
break
|
||||
|
||||
- if "video" in news_url.lower() or not news_url:
|
||||
- self.logger.info(f"跳过该链接采集: {news_url}")
|
||||
- break
|
||||
+ # if "video" in news_url.lower() or not news_url:
|
||||
+ # self.logger.info(f"跳过该链接采集: {news_url}")
|
||||
+ # break
|
||||
news_urls_array.append(news_url)
|
||||
|
||||
else:
|
||||
@@ -301,6 +301,21 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
|
||||
"""处理单个新闻页面"""
|
||||
try:
|
||||
if "video" in news_url.lower() or not news_url:
|
||||
driver.get(news_url)
|
||||
self._wait_for_page_load(driver)
|
||||
title = driver.find_elements(By.XPATH, "//div[@class='ttp-video-extras-title']/h1")[0].get_attribute('title')
|
||||
time_text = driver.find_elements(By.XPATH, "//div[@class='meta-info']/span[@class='publish-time']")[0].text.replace("发布于 ", "").strip()
|
||||
author = driver.find_elements(By.XPATH, "//div[@class='author-info']/a[@class='author-name']")[0].text.strip()
|
||||
content = news_url
|
||||
|
||||
if time_text:
|
||||
try:
|
||||
url_time = get_time_stamp(time_text)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"时间转换失败: {time_text}, {e}")
|
||||
|
||||
else:
|
||||
driver.get(news_url)
|
||||
self._wait_for_page_load(driver)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user