[今日头条] 视频页面采集优化
This commit is contained in:
parent
592e8b6cfb
commit
734cab960b
@ -48,7 +48,7 @@ class HotSearchRedisSpider(scrapy.Spider):
|
|||||||
PAGE_LOAD_TIMEOUT = 10
|
PAGE_LOAD_TIMEOUT = 10
|
||||||
ELEMENT_WAIT_TIMEOUT = 5
|
ELEMENT_WAIT_TIMEOUT = 5
|
||||||
MAX_NEWS_PER_HOT = 1
|
MAX_NEWS_PER_HOT = 1
|
||||||
MAX_HOT_ITEMS = 10
|
MAX_HOT_ITEMS = 15
|
||||||
|
|
||||||
# 选择器定义
|
# 选择器定义
|
||||||
# URL_SELECTORS = [
|
# URL_SELECTORS = [
|
||||||
@ -271,9 +271,9 @@ class HotSearchRedisSpider(scrapy.Spider):
|
|||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
break
|
break
|
||||||
|
|
||||||
if "video" in news_url.lower() or not news_url:
|
# if "video" in news_url.lower() or not news_url:
|
||||||
self.logger.info(f"跳过该链接采集: {news_url}")
|
# self.logger.info(f"跳过该链接采集: {news_url}")
|
||||||
break
|
# break
|
||||||
news_urls_array.append(news_url)
|
news_urls_array.append(news_url)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -301,19 +301,34 @@ class HotSearchRedisSpider(scrapy.Spider):
|
|||||||
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
|
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
|
||||||
"""处理单个新闻页面"""
|
"""处理单个新闻页面"""
|
||||||
try:
|
try:
|
||||||
driver.get(news_url)
|
if "video" in news_url.lower() or not news_url:
|
||||||
self._wait_for_page_load(driver)
|
driver.get(news_url)
|
||||||
|
self._wait_for_page_load(driver)
|
||||||
|
title = driver.find_elements(By.XPATH, "//div[@class='ttp-video-extras-title']/h1")[0].get_attribute('title')
|
||||||
|
time_text = driver.find_elements(By.XPATH, "//div[@class='meta-info']/span[@class='publish-time']")[0].text.replace("发布于 ", "").strip()
|
||||||
|
author = driver.find_elements(By.XPATH, "//div[@class='author-info']/a[@class='author-name']")[0].text.strip()
|
||||||
|
content = news_url
|
||||||
|
|
||||||
# 标题采集
|
if time_text:
|
||||||
try:
|
try:
|
||||||
title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
|
url_time = get_time_stamp(time_text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'标题采集失败,已使用热搜名称...')
|
self.logger.debug(f"时间转换失败: {time_text}, {e}")
|
||||||
title = hot_item['hot_word']
|
|
||||||
# 提取页面信息
|
else:
|
||||||
author = self._extract_text(driver, self.AUTHOR_SELECTORS)
|
driver.get(news_url)
|
||||||
content = self._extract_content(driver)
|
self._wait_for_page_load(driver)
|
||||||
url_time = self._extract_time(driver)
|
|
||||||
|
# 标题采集
|
||||||
|
try:
|
||||||
|
title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'标题采集失败,已使用热搜名称...')
|
||||||
|
title = hot_item['hot_word']
|
||||||
|
# 提取页面信息
|
||||||
|
author = self._extract_text(driver, self.AUTHOR_SELECTORS)
|
||||||
|
content = self._extract_content(driver)
|
||||||
|
url_time = self._extract_time(driver)
|
||||||
|
|
||||||
if not content:
|
if not content:
|
||||||
self.logger.warning(f"页面无有效内容: {news_url}")
|
self.logger.warning(f"页面无有效内容: {news_url}")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user