[今日头条] Event timeline collection
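Add event-timeline ("事件脉络") collection to the Toutiao hot-search spider: after gathering event details for each hot item, the spider now scans the trending page for a timeline card, expands it via its "load more" button, and yields the concatenated entries as a single item. Card-URL extraction is inlined into _get_event_details (replacing the _get_news_cards / _extract_url_from_card helpers), MAX_NEWS_PER_HOT drops from 6 to 1, and hot-search URLs written by HotSearchSpider gain a "&news" suffix.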

DELL 2026-03-04 15:44:01 +08:00
parent 00789ba275
commit 5b3bf034f8
2 changed files with 142 additions and 40 deletions

View File

@ -5,7 +5,7 @@ from typing import List, Dict, Any, Optional
import scrapy
from scrapy_selenium import SeleniumRequest
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
@ -46,15 +46,16 @@ class HotSearchRedisSpider(scrapy.Spider):
    BAIDU_URL = 'https://www.toutiao.com/'  # Toutiao base URL (despite the constant's name)
    PAGE_LOAD_TIMEOUT = 10
    ELEMENT_WAIT_TIMEOUT = 5
-    MAX_NEWS_PER_HOT = 6
+    MAX_NEWS_PER_HOT = 1
    MAX_HOT_ITEMS = 10
    # Selector definitions
-    URL_SELECTORS = [
-        '.l-content a',
-        '.feed-card-wtt-l p a',
-        '.feed-card-article-l a'
-    ]
+    # URL_SELECTORS = [
+    #     '.card-render-wrapper a'
+    #     # '.l-content a',
+    #     # '.feed-card-wtt-l p a',
+    #     # '.feed-card-article-l a'
+    # ]
    AUTHOR_SELECTORS = [
        "//div[@class='author-info']/div[@class='desc']/a[@class='name']",
@ -103,9 +104,16 @@ class HotSearchRedisSpider(scrapy.Spider):
        driver = response.request.meta['driver']
        # Fetch the hot-search data
-        hot_items = self._fetch_hot_items()
+        hot_items = self._fetch_hot_items()[:self.MAX_HOT_ITEMS]
+        # hot_items = []
+        # hot_items.append({
+        #     "fake_url": "https://www.toutiao.com/trending/7612920230477565459/?rank=14&log_from=4dda3d0c958f48_1772529869512",
+        #     'hot_id': '76132246866893fda27',
+        #     'hot_value': 5432429101,
+        #     'hot_word': '伊朗反击最初两天650民美士兵伤亡'
+        # })
-        for hot_item in hot_items[:self.MAX_HOT_ITEMS]:
+        for hot_item in hot_items:
            if not hot_item.get('fake_url'):
                self.logger.warning(f"Hot item {hot_item['hot_word']} has no valid URL, skipping")
                continue
@ -142,28 +150,141 @@ class HotSearchRedisSpider(scrapy.Spider):
    def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
        """Process a single hot-search item"""
        try:
            yield from self._get_event_details(driver, hot_item)
            # Reload the hot-item page (collecting event details may have
            # navigated the driver away to individual news pages)
            driver.get(hot_item['fake_url'])
            self._wait_for_page_load(driver)
            # Grab all block titles and check for an "事件脉络" (event timeline) block
            context_list = driver.find_elements(By.XPATH, "//div[@class='block-title']")
            self.logger.info(f"Found {len(context_list)} title blocks")
            for context in context_list:
                block_title = context.text.strip()
                if block_title == "事件脉络":
                    yield from self._get_event_timeline(context, driver, hot_item)
        except Exception as e:
            self.logger.error(f"Failed to process hot item '{hot_item['hot_word']}': {e}")
    def _get_event_timeline(self, context, driver, hot_item: Dict[str, Any]):
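        """Expand and scrape the "事件脉络" (event timeline) card for a hot item,
        yielding one MediaspidersItem with all entries concatenated."""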
self.logger.info("开始采集事件脉络...")
# 定位按钮元素
button_element = context.find_elements(By.XPATH,
"//div[@class='timeline-card-wrapper']/div[@class='load-more']/button")[
0]
# 尝试普通点击
try:
button_element.click()
except ElementClickInterceptedException:
# 如果被遮挡使用JavaScript点击
self.logger.info("使用JavaScript点击按钮")
driver.execute_script("arguments[0].click();", button_element)
# 等待内容加载
time.sleep(2)
# 获取当前所有的脉络信息
event_list = context.find_elements(By.XPATH,
"//div[@class='timeline-card-wrapper']/div[@class='time-nodes']/div[@class='time-node']")
self.logger.info(f"找到 {len(event_list)} 个事件")
url_content = ''
for idx, even in enumerate(event_list, 1):
try:
# 获取标题和时间
title_elem = even.find_element(By.XPATH, ".//div[@class='title']")
title_text = title_elem.text
# 获取脉络内容
content_element = even.find_element(By.XPATH, ".//a[@class='content']/p")
content = content_element.text.strip()
# 检查是否有"最新"标签
try:
tag_element = content_element.find_element(By.XPATH, ".//span[@class='tag']")
tag = tag_element.text.strip() # "最新"
# 从内容中移除标签文本(如果需要)
content = content.replace(tag, "").strip()
except:
tag = ""
# 拼接标题和内容,用换行符隔开
if url_content:
url_content += "\n" # 在已有内容后添加换行符
# 添加当前条目的标题和内容
url_content += f"{title_text}\n{content}"
self.logger.info(f"已添加第{idx}条: {title_text[:20]}...")
except Exception as e:
self.logger.error(f"处理第{idx}个事件时出错: {e}")
continue
timeNow = get_current_timestamp()
# 创建item
event_timeline_item = MediaspidersItem()
event_timeline_item['es_carriertype'] = 'news'
event_timeline_item['es_srcname'] = 'https://www.toutiao.com/'
event_timeline_item['es_sitename'] = '今日头条'
event_timeline_item['es_sid'] = get_str_md5(hot_item["fake_url"])
event_timeline_item['es_urltitle'] = hot_item['hot_word']
event_timeline_item['es_authors'] = ''
event_timeline_item['es_urlcontent'] = url_content
event_timeline_item['es_urltime'] = timeNow
event_timeline_item['es_lasttime'] = timeNow
event_timeline_item['es_urlname'] = hot_item["fake_url"]
event_timeline_item['es_hkey'] = hot_item['hot_id']
event_timeline_item['es_urltopic'] = hot_item['hot_word']
event_timeline_item['es_video'] = ''
yield event_timeline_item
self.logger.info(f"事件脉络-采集成功 '{hot_item['hot_word']}'{hot_item['fake_url']}")
    def _get_event_details(self, driver, hot_item: Dict[str, Any]):
        """Fetch the event-detail cards for a hot item and scrape each news page"""
        self.logger.info(f"Collecting event details - {hot_item['hot_word']}: {hot_item['fake_url']}")
        hot_url = hot_item['fake_url']
        driver.get(hot_url)
        self._wait_for_page_load(driver)
        # If the URL from the API is a trending-list page, scrape its cards;
        # otherwise treat it as a detail page and scrape it directly
        if "article" not in hot_url:
            cards = driver.find_elements(
                By.XPATH, "//div[@class='block-content']/div[@class='card-render-wrapper']")
            news_cards = cards[:self.MAX_NEWS_PER_HOT]
            news_urls_array = []
            for card in news_cards:
                # Extract the URL from the card
                news_url = None
                try:
                    element = card.find_element(By.CSS_SELECTOR, '.card-render-wrapper a')
                    url = element.get_attribute('href')
                    if url and url.startswith(('http://', 'https://')):
                        news_url = url
                except NoSuchElementException:
                    continue
                if not news_url or "video" in news_url.lower():
                    self.logger.info(f"Skipping link: {news_url}")
                    continue
                news_urls_array.append(news_url)
        else:
            # The URL already points at a detail page
            news_urls_array = [hot_url]
        try:
            # Start scraping
            for url in news_urls_array:
                yield from self._process_news_page(driver, url, hot_item)
        except Exception as e:
            self.logger.error(f"Failed to fetch event-detail cards: {e}")
    def _wait_for_page_load(self, driver, timeout: int = None):
        """Wait for the page to load"""
@ -176,27 +297,6 @@ class HotSearchRedisSpider(scrapy.Spider):
        except TimeoutException:
            self.logger.warning("Page load timed out")
-    def _get_news_cards(self, driver) -> List[WebElement]:
-        """Fetch the list of news cards"""
-        try:
-            cards = driver.find_elements(By.CSS_SELECTOR, ".feed-card-wrapper")
-            return cards[:self.MAX_NEWS_PER_HOT]
-        except Exception as e:
-            self.logger.error(f"Failed to fetch news cards: {e}")
-            return []
-    def _extract_url_from_card(self, card: WebElement) -> Optional[str]:
-        """Extract the URL from a card"""
-        for selector in self.URL_SELECTORS:
-            try:
-                element = card.find_element(By.CSS_SELECTOR, selector)
-                url = element.get_attribute('href')
-                if url and url.startswith(('http://', 'https://')):
-                    return url
-            except NoSuchElementException:
-                continue
-        return None
    def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
        """Process a single news page"""
        try:
@ -230,6 +330,8 @@ class HotSearchRedisSpider(scrapy.Spider):
            yield even_details_item
            self.logger.info(f"Event details collected for '{hot_item['hot_word']}' {news_url}")
        except Exception as e:
            self.logger.error(f"Failed to process news page {news_url}: {e}")

View File

@ -84,7 +84,7 @@ class HotSearchSpider(scrapy.Spider):
        hot_search_item['es_sitename'] = line['platform']
        hot_search_item['es_urltime'] = line['onboard_time']
        hot_search_item['es_lasttime'] = line['crawl_time']
-        hot_search_item['es_urlname'] = line['fake_url']
+        hot_search_item['es_urlname'] = line['fake_url'] + "&news"
        yield hot_search_item