[今日头条] 事件脉络采集
This commit is contained in:
parent
00789ba275
commit
5b3bf034f8
@ -5,7 +5,7 @@ from typing import List, Dict, Any, Optional
|
||||
|
||||
import scrapy
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
@ -46,15 +46,16 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
# Spider tuning constants.
BAIDU_URL = 'https://www.toutiao.com/'
# Seconds to wait for a full page load before giving up.
PAGE_LOAD_TIMEOUT = 10
# Seconds to wait for an individual element lookup.
ELEMENT_WAIT_TIMEOUT = 5
# Cap on news articles collected per hot item.
# NOTE(review): the source assigned this twice (6 then 1); the second
# assignment silently overrode the first, so 1 is the effective value.
MAX_NEWS_PER_HOT = 1
# Cap on hot-search entries processed per crawl.
MAX_HOT_ITEMS = 10
|
||||
|
||||
# 选择器定义
|
||||
URL_SELECTORS = [
|
||||
'.l-content a',
|
||||
'.feed-card-wtt-l p a',
|
||||
'.feed-card-article-l a'
|
||||
]
|
||||
# URL_SELECTORS = [
|
||||
# '.card-render-wrapper a'
|
||||
# # '.l-content a',
|
||||
# # '.feed-card-wtt-l p a',
|
||||
# # '.feed-card-article-l a'
|
||||
# ]
|
||||
|
||||
AUTHOR_SELECTORS = [
|
||||
"//div[@class='author-info']/div[@class='desc']/a[@class='name']",
|
||||
@ -103,9 +104,16 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
driver = response.request.meta['driver']
|
||||
|
||||
# 获取热点数据
|
||||
hot_items = self._fetch_hot_items()
|
||||
hot_items = self._fetch_hot_items()[:self.MAX_HOT_ITEMS]
|
||||
# hot_items = []
|
||||
# hot_items.append({
|
||||
# "fake_url": "https://www.toutiao.com/trending/7612920230477565459/?rank=14&log_from=4dda3d0c958f48_1772529869512",
|
||||
# 'hot_id': '76132246866893fda27',
|
||||
# 'hot_value': 5432429101,
|
||||
# 'hot_word': '伊朗:反击最初两天650民美士兵伤亡'
|
||||
# })
|
||||
|
||||
for hot_item in hot_items[:self.MAX_HOT_ITEMS]:
|
||||
for hot_item in hot_items:
|
||||
if not hot_item.get('fake_url'):
|
||||
self.logger.warning(f"热点 {hot_item['hot_word']} 无有效URL,跳过")
|
||||
continue
|
||||
@ -142,28 +150,141 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
    """Process one hot-search item: yield its detail-page items, then, if the
    landing page shows a block titled "事件脉络" (event timeline), yield the
    timeline item as well.

    Args:
        driver: the shared Selenium WebDriver from the request meta.
        hot_item: dict with at least 'fake_url', 'hot_word', 'hot_id'.
    """
    try:
        # Collect the detail articles first (navigates the driver itself).
        yield from self._get_event_details(driver, hot_item)

        # Re-load the hot item's landing page to scan for a timeline block.
        driver.get(hot_item['fake_url'])
        self._wait_for_page_load(driver)

        # Removed dead code: a `_get_news_cards(driver)` call whose result
        # was never used (one wasted Selenium round-trip per hot item).

        # Every section header on the page; the one whose text is
        # "事件脉络" marks the event-timeline section.
        context_list = driver.find_elements(By.XPATH, "//div[@class='block-title']")
        self.logger.info(f"context_list:{context_list}")

        for context in context_list:
            block_title = context.text.strip()
            if block_title == "事件脉络":
                yield from self._get_event_timeline(context, driver, hot_item)
                continue
    except Exception as e:
        self.logger.error(f"处理热点 '{hot_item['hot_word']}' 失败: {e}")
def _get_event_timeline(self, context, driver, hot_item: Dict[str, Any]):
    """Scrape the "事件脉络" (event timeline) section and yield one item.

    Args:
        context: the block-title WebElement whose text matched "事件脉络".
        driver: the Selenium WebDriver (used for the JS-click fallback).
        hot_item: dict providing 'fake_url', 'hot_word', 'hot_id' metadata.

    Yields:
        One MediaspidersItem whose es_urlcontent is the newline-joined
        "title\\ncontent" pairs of every timeline node.
    """
    self.logger.info("开始采集事件脉络...")

    # Expand the timeline via the "load more" button when present.
    # FIX: the original indexed [0] unconditionally, raising IndexError
    # (and aborting the scrape) whenever the button does not exist.
    # NOTE(review): the XPath starts with '//' so it searches the whole
    # document, not just `context` — confirm that is intended.
    buttons = context.find_elements(
        By.XPATH,
        "//div[@class='timeline-card-wrapper']/div[@class='load-more']/button")
    if buttons:
        button_element = buttons[0]
        try:
            button_element.click()
        except ElementClickInterceptedException:
            # Button is covered by an overlay — click it via JavaScript.
            self.logger.info("使用JavaScript点击按钮")
            driver.execute_script("arguments[0].click();", button_element)

        # Give the newly expanded nodes time to render.
        time.sleep(2)

    # All timeline nodes currently in the DOM.
    event_list = context.find_elements(
        By.XPATH,
        "//div[@class='timeline-card-wrapper']/div[@class='time-nodes']/div[@class='time-node']")
    self.logger.info(f"找到 {len(event_list)} 个事件")

    # Build the content with a parts list + join instead of repeated
    # string concatenation (the original was quadratic).
    parts = []
    for idx, node in enumerate(event_list, 1):
        try:
            # Node title (typically the date/stage label).
            title_text = node.find_element(By.XPATH, ".//div[@class='title']").text

            # Node body text.
            content_element = node.find_element(By.XPATH, ".//a[@class='content']/p")
            content = content_element.text.strip()

            # Strip the optional "最新" (latest) tag from the content text.
            # FIX: narrowed the original bare `except:` to the exception
            # find_element actually raises when the tag is absent.
            try:
                tag = content_element.find_element(
                    By.XPATH, ".//span[@class='tag']").text.strip()
                content = content.replace(tag, "").strip()
            except NoSuchElementException:
                pass

            parts.append(f"{title_text}\n{content}")
            self.logger.info(f"已添加第{idx}条: {title_text[:20]}...")
        except Exception as e:
            self.logger.error(f"处理第{idx}个事件时出错: {e}")
            continue

    url_content = "\n".join(parts)

    time_now = get_current_timestamp()
    # Assemble the output item.
    event_timeline_item = MediaspidersItem()
    event_timeline_item['es_carriertype'] = 'news'
    event_timeline_item['es_srcname'] = 'https://www.toutiao.com/'
    event_timeline_item['es_sitename'] = '今日头条'
    event_timeline_item['es_sid'] = get_str_md5(hot_item["fake_url"])
    event_timeline_item['es_urltitle'] = hot_item['hot_word']
    event_timeline_item['es_authors'] = ''
    event_timeline_item['es_urlcontent'] = url_content
    event_timeline_item['es_urltime'] = time_now
    event_timeline_item['es_lasttime'] = time_now
    event_timeline_item['es_urlname'] = hot_item["fake_url"]
    event_timeline_item['es_hkey'] = hot_item['hot_id']
    event_timeline_item['es_urltopic'] = hot_item['hot_word']
    event_timeline_item['es_video'] = ''

    yield event_timeline_item
    self.logger.info(f"事件脉络-采集成功 '{hot_item['hot_word']}':{hot_item['fake_url']}")
def _get_event_details(self, driver, hot_item: Dict[str, Any]):
    """Collect detail articles for a hot item and yield their items.

    If the API-provided URL is a ranking page (no "article" in the URL),
    extract up to MAX_NEWS_PER_HOT article links from its cards; otherwise
    treat the URL itself as the single detail page.

    Args:
        driver: the Selenium WebDriver.
        hot_item: dict with at least 'fake_url' and 'hot_word'.
    """
    self.logger.info(f"开始采集事件详情-{hot_item['hot_word']}: {hot_item['fake_url']}")

    hot_url = hot_item['fake_url']
    driver.get(hot_url)
    self._wait_for_page_load(driver)

    # Ranking page vs. direct detail page.
    if "article" not in hot_url:
        cards = driver.find_elements(
            By.XPATH,
            "//div[@class='block-content']/div[@class='card-render-wrapper']")
        news_urls_array = []
        for card in cards[:self.MAX_NEWS_PER_HOT]:
            # FIX: the original extracted the URL twice (helper call plus a
            # duplicated inline copy), used `break` in the missing-element
            # handler — aborting all remaining cards — and could reuse a
            # stale `news_url` from the previous iteration when extraction
            # failed. One extraction, `continue` on failure.
            try:
                element = card.find_element(By.CSS_SELECTOR, '.card-render-wrapper a')
            except NoSuchElementException:
                continue
            news_url = element.get_attribute('href')
            if not news_url or not news_url.startswith(('http://', 'https://')):
                continue
            # FIX: collapsed the two contradictory video checks (one
            # `continue`, one `break`) into a single skip of this card.
            if "video" in news_url.lower():
                self.logger.info(f"跳过视频链接: {news_url}")
                continue
            news_urls_array.append(news_url)
    else:
        # The URL is already a detail page.
        news_urls_array = [hot_url]

    try:
        # Scrape each collected detail page.
        for url in news_urls_array:
            yield from self._process_news_page(driver, url, hot_item)
    except Exception as e:
        self.logger.error(f"获取事件详情卡片失败: {e}")
def _wait_for_page_load(self, driver, timeout: int = None):
|
||||
"""等待页面加载"""
|
||||
@ -176,27 +297,6 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
except TimeoutException:
|
||||
self.logger.warning("页面加载超时")
|
||||
|
||||
def _get_news_cards(self, driver) -> List[WebElement]:
    """Return up to MAX_NEWS_PER_HOT feed-card elements from the current page.

    Returns an empty list when the lookup fails for any reason.
    """
    try:
        found = driver.find_elements(By.CSS_SELECTOR, ".feed-card-wrapper")
    except Exception as e:
        self.logger.error(f"获取新闻卡片失败: {e}")
        return []
    return found[:self.MAX_NEWS_PER_HOT]
def _extract_url_from_card(self, card: WebElement) -> Optional[str]:
    """Return the first absolute http(s) link found in the card.

    Tries each CSS selector in URL_SELECTORS in order; a selector that
    matches nothing, or whose href is not an absolute http(s) URL, is
    skipped. Returns None when no selector yields a usable link.
    """
    for css in self.URL_SELECTORS:
        try:
            href = card.find_element(By.CSS_SELECTOR, css).get_attribute('href')
        except NoSuchElementException:
            continue
        if href and href.startswith(('http://', 'https://')):
            return href
    return None
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
|
||||
"""处理单个新闻页面"""
|
||||
try:
|
||||
@ -230,6 +330,8 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
|
||||
yield even_details_item
|
||||
|
||||
self.logger.info(f"事件详情-采集成功 '{hot_item['hot_word']}':{news_url}")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"处理新闻页面失败 {news_url}: {e}")
|
||||
|
||||
|
||||
@ -84,7 +84,7 @@ class HotSearchSpider(scrapy.Spider):
|
||||
hot_search_item['es_sitename'] = line['platform']
|
||||
hot_search_item['es_urltime'] = line['onboard_time']
|
||||
hot_search_item['es_lasttime'] = line['crawl_time']
|
||||
hot_search_item['es_urlname'] = line['fake_url']
|
||||
hot_search_item['es_urlname'] = line['fake_url'] + "&news"
|
||||
|
||||
yield hot_search_item
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user