[今日头条] Event timeline collection

This commit is contained in:
DELL 2026-03-04 15:44:01 +08:00
parent 00789ba275
commit 5b3bf034f8
2 changed files with 142 additions and 40 deletions

View File

@@ -5,7 +5,7 @@ from typing import List, Dict, Any, Optional
 import scrapy
 from scrapy_selenium import SeleniumRequest
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.remote.webelement import WebElement
 from selenium.webdriver.support import expected_conditions as EC
@@ -46,15 +46,16 @@ class HotSearchRedisSpider(scrapy.Spider):
     BAIDU_URL = 'https://www.toutiao.com/'
     PAGE_LOAD_TIMEOUT = 10
     ELEMENT_WAIT_TIMEOUT = 5
-    MAX_NEWS_PER_HOT = 6
+    MAX_NEWS_PER_HOT = 1
     MAX_HOT_ITEMS = 10
     # Selector definitions
-    URL_SELECTORS = [
-        '.l-content a',
-        '.feed-card-wtt-l p a',
-        '.feed-card-article-l a'
-    ]
+    # URL_SELECTORS = [
+    #     '.card-render-wrapper a'
+    #     # '.l-content a',
+    #     # '.feed-card-wtt-l p a',
+    #     # '.feed-card-article-l a'
+    # ]
     AUTHOR_SELECTORS = [
         "//div[@class='author-info']/div[@class='desc']/a[@class='name']",
@@ -103,9 +104,16 @@ class HotSearchRedisSpider(scrapy.Spider):
         driver = response.request.meta['driver']
         # Fetch the hot-search items
-        hot_items = self._fetch_hot_items()
-        for hot_item in hot_items[:self.MAX_HOT_ITEMS]:
+        hot_items = self._fetch_hot_items()[:self.MAX_HOT_ITEMS]
+        # Debug stub: swap in a single hand-picked hot item for testing
+        # hot_items = []
+        # hot_items.append({
+        #     "fake_url": "https://www.toutiao.com/trending/7612920230477565459/?rank=14&log_from=4dda3d0c958f48_1772529869512",
+        #     'hot_id': '76132246866893fda27',
+        #     'hot_value': 5432429101,
+        #     'hot_word': '伊朗反击最初两天650民美士兵伤亡'
+        # })
+        for hot_item in hot_items:
             if not hot_item.get('fake_url'):
                 self.logger.warning(f"Hot item {hot_item['hot_word']} has no valid URL, skipping")
                 continue
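The commented-out stub above doubles as documentation of the shape _fetch_hot_items is expected to return. A minimal, hypothetical stand-in for testing the parse loop offline (field names match the diff; the function name and all values are invented):

    from typing import Any, Dict, List

    def fake_fetch_hot_items() -> List[Dict[str, Any]]:
        """Stand-in for the spider's _fetch_hot_items (not shown in this diff)."""
        return [{
            "fake_url": "https://www.toutiao.com/trending/0000000000/",  # board or article URL
            "hot_id": "0000000000",       # stable id, stored downstream as es_hkey
            "hot_value": 123456789,       # heat score
            "hot_word": "example topic",  # headline, stored as es_urltitle / es_urltopic
        }]

    hot_items = fake_fetch_hot_items()[:10]  # mirrors the MAX_HOT_ITEMS slice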
@@ -142,28 +150,141 @@ class HotSearchRedisSpider(scrapy.Spider):
     def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
         """Process a single hot item."""
         try:
+            yield from self._get_event_details(driver, hot_item)
             # Load the hot-item page
             driver.get(hot_item['fake_url'])
             self._wait_for_page_load(driver)
-            # Get the news cards
-            news_cards = self._get_news_cards(driver)
+            # Grab every block title and check whether an "事件脉络" (event timeline) block exists
+            context_list = driver.find_elements(By.XPATH, "//div[@class='block-title']")
+            self.logger.info(f"block-title elements found: {len(context_list)}")
+            for context in context_list:
+                block_title = context.text.strip()
+                if block_title == "事件脉络":
+                    yield from self._get_event_timeline(context, driver, hot_item)
+                    continue
+        except Exception as e:
+            self.logger.error(f"Failed to process hot item '{hot_item['hot_word']}': {e}")
+    def _get_event_timeline(self, context, driver, hot_item: Dict[str, Any]):
+        """Collect the "事件脉络" (event timeline) block of a hot item."""
+        self.logger.info("Collecting event timeline...")
+        # Locate the "load more" button; the absolute '//' XPath searches the whole
+        # page, since the timeline wrapper is a sibling of the title node, not a child
+        buttons = context.find_elements(By.XPATH,
+            "//div[@class='timeline-card-wrapper']/div[@class='load-more']/button")
+        if buttons:
+            # Try a native click first
+            try:
+                buttons[0].click()
+            except ElementClickInterceptedException:
+                # The button is covered by an overlay; fall back to a JavaScript click
+                self.logger.info("Clicking the button via JavaScript")
+                driver.execute_script("arguments[0].click();", buttons[0])
+            # Wait for the expanded content to load
+            time.sleep(2)
+        # Collect every timeline node currently on the page
+        event_list = context.find_elements(By.XPATH,
+            "//div[@class='timeline-card-wrapper']/div[@class='time-nodes']/div[@class='time-node']")
+        self.logger.info(f"Found {len(event_list)} timeline events")
+        url_content = ''
+        for idx, event in enumerate(event_list, 1):
+            try:
+                # Get the node's title (its date/time)
+                title_elem = event.find_element(By.XPATH, ".//div[@class='title']")
+                title_text = title_elem.text
+                # Get the node's body text
+                content_element = event.find_element(By.XPATH, ".//a[@class='content']/p")
+                content = content_element.text.strip()
+                # Strip the "最新" ("latest") tag from the text if present
+                try:
+                    tag_element = content_element.find_element(By.XPATH, ".//span[@class='tag']")
+                    tag = tag_element.text.strip()
+                    content = content.replace(tag, "").strip()
+                except NoSuchElementException:
+                    tag = ""
+                # Join title and content, separating entries with newlines
+                if url_content:
+                    url_content += "\n"
+                url_content += f"{title_text}\n{content}"
+                self.logger.info(f"Added entry {idx}: {title_text[:20]}...")
+            except Exception as e:
+                self.logger.error(f"Error while processing event {idx}: {e}")
+                continue
+        time_now = get_current_timestamp()
+        # Build the item
+        event_timeline_item = MediaspidersItem()
+        event_timeline_item['es_carriertype'] = 'news'
+        event_timeline_item['es_srcname'] = 'https://www.toutiao.com/'
+        event_timeline_item['es_sitename'] = '今日头条'
+        event_timeline_item['es_sid'] = get_str_md5(hot_item["fake_url"])
+        event_timeline_item['es_urltitle'] = hot_item['hot_word']
+        event_timeline_item['es_authors'] = ''
+        event_timeline_item['es_urlcontent'] = url_content
+        event_timeline_item['es_urltime'] = time_now
+        event_timeline_item['es_lasttime'] = time_now
+        event_timeline_item['es_urlname'] = hot_item["fake_url"]
+        event_timeline_item['es_hkey'] = hot_item['hot_id']
+        event_timeline_item['es_urltopic'] = hot_item['hot_word']
+        event_timeline_item['es_video'] = ''
+        yield event_timeline_item
+        self.logger.info(f"Event timeline collected for '{hot_item['hot_word']}': {hot_item['fake_url']}")
+    def _get_event_details(self, driver, hot_item: Dict[str, Any]):
+        """Fetch the event-detail card list and scrape each linked page."""
+        self.logger.info(f"Collecting event details for {hot_item['hot_word']}: {hot_item['fake_url']}")
+        hot_url = hot_item['fake_url']
+        driver.get(hot_url)
+        self._wait_for_page_load(driver)
+        # If the URL from the API is a board page, collect its cards;
+        # otherwise scrape the detail page directly
+        if "article" not in hot_url:
+            cards = driver.find_elements(By.XPATH,
+                "//div[@class='block-content']/div[@class='card-render-wrapper']")
+            news_cards = cards[:self.MAX_NEWS_PER_HOT]
             news_urls_array = []
             for card in news_cards:
-                news_url = self._extract_url_from_card(card)
-                if not news_url:
-                    continue
+                # Extract the URL from the card (inlined from the removed _extract_url_from_card)
+                # for selector in self.URL_SELECTORS:
+                news_url = None
+                try:
+                    element = card.find_element(By.CSS_SELECTOR, '.card-render-wrapper a')
+                    url = element.get_attribute('href')
+                    if url and url.startswith(('http://', 'https://')):
+                        news_url = url
+                except NoSuchElementException:
+                    break
-                if "video" in news_url.lower():
-                    self.logger.info(f"Skipping video link: {news_url}")
-                    continue
+                if not news_url or "video" in news_url.lower():
+                    self.logger.info(f"Skipping this link: {news_url}")
+                    break
                 news_urls_array.append(news_url)
+        else:
+            # Use the detail page itself as the only URL
+            news_urls_array = [hot_url]
+        try:
+            # Start scraping
             for url in news_urls_array:
                 yield from self._process_news_page(driver, url, hot_item)
         except Exception as e:
-            self.logger.error(f"Failed to process hot item '{hot_item['hot_word']}': {e}")
+            self.logger.error(f"Failed to fetch event-detail cards: {e}")
     def _wait_for_page_load(self, driver, timeout: int = None):
         """Wait for the page to load."""
@@ -176,27 +297,6 @@ class HotSearchRedisSpider(scrapy.Spider):
         except TimeoutException:
             self.logger.warning("Page load timed out")
-    def _get_news_cards(self, driver) -> List[WebElement]:
-        """Get the list of news cards."""
-        try:
-            cards = driver.find_elements(By.CSS_SELECTOR, ".feed-card-wrapper")
-            return cards[:self.MAX_NEWS_PER_HOT]
-        except Exception as e:
-            self.logger.error(f"Failed to fetch news cards: {e}")
-            return []
-    def _extract_url_from_card(self, card: WebElement) -> Optional[str]:
-        """Extract a URL from the card."""
-        for selector in self.URL_SELECTORS:
-            try:
-                element = card.find_element(By.CSS_SELECTOR, selector)
-                url = element.get_attribute('href')
-                if url and url.startswith(('http://', 'https://')):
-                    return url
-            except NoSuchElementException:
-                continue
-        return None
     def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
         """Process a single news page."""
         try:
@@ -230,6 +330,8 @@ class HotSearchRedisSpider(scrapy.Spider):
             yield even_details_item
+            self.logger.info(f"Event details collected for '{hot_item['hot_word']}': {news_url}")
         except Exception as e:
             self.logger.error(f"Failed to process news page {news_url}: {e}")

View File

@@ -84,7 +84,7 @@ class HotSearchSpider(scrapy.Spider):
         hot_search_item['es_sitename'] = line['platform']
         hot_search_item['es_urltime'] = line['onboard_time']
         hot_search_item['es_lasttime'] = line['crawl_time']
-        hot_search_item['es_urlname'] = line['fake_url']
+        hot_search_item['es_urlname'] = line['fake_url'] + "&news"
         yield hot_search_item
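The "&news" suffix presumably keeps the hot-search record's es_urlname distinct from the event records that reuse the same fake_url. A hypothetical illustration, with hashlib standing in for the project's get_str_md5 helper, of how the suffix yields a distinct key if a downstream consumer hashes es_urlname:

    import hashlib

    def get_str_md5(s: str) -> str:
        return hashlib.md5(s.encode("utf-8")).hexdigest()

    fake_url = "https://www.toutiao.com/trending/0000000000/?rank=1"
    print(get_str_md5(fake_url))            # key derived from the raw fake_url
    print(get_str_md5(fake_url + "&news"))  # distinct key for the hot-search item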