[今日头条] 采集关联新闻

This commit is contained in:
DELL 2026-02-28 16:15:32 +08:00
parent feb48f3579
commit d01035577b
7 changed files with 431 additions and 33 deletions

View File

@ -76,7 +76,7 @@ class SeleniumMiddleware:
}
edge_options.add_experimental_option("prefs", prefs)
self.driver = Edge(executable_path=r"C:\Program Files\Python38\msedgedriver.exe", options=edge_options)
self.driver = Edge(executable_path=r"D:\msedgedriver.exe", options=edge_options)
@classmethod
def from_crawler(cls, crawler):

View File

@ -45,6 +45,7 @@ class BaiduHotSearchSprder(scrapy.Spider):
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
},
'DOWNLOADER_MIDDLEWARES': {},
'BATCH_SAVE_SIZE': 50
}

View File

@ -0,0 +1,295 @@
# -*- coding: utf-8 -*-
import json
import time
from typing import List, Dict, Any, Optional
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.date_utils import get_time_stamp
from MediaSpiders.utils.hot_search_json_parser import url_response
from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_current_timestamp
class HotSearchRedisSpider(scrapy.Spider):
    """Collect Toutiao hot-search topics and the news articles linked to them.

    Workflow:
      1. Request the Toutiao home page so scrapy-selenium attaches a driver.
      2. Fetch the hot-board JSON feed (``TOUTIAO_HOT_URL``) for the topic list.
      3. For each of the first ``MAX_HOT_ITEMS`` topics, open the topic page and
         collect up to ``MAX_NEWS_PER_HOT`` article links (video links skipped).
      4. Visit each article and yield a ``MediaspidersItem`` carrying the
         author, body text and publish time.
    """

    name = 'HotSearchRedisSpider'

    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/twitter',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_ship_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
        }
    }

    # Endpoints.
    TOUTIAO_HOT_URL = 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
    # NOTE(review): named BAIDU_URL but it is actually the Toutiao home page —
    # consider renaming once all references are known.
    BAIDU_URL = 'https://www.toutiao.com/'

    # Timing / limits.
    PAGE_LOAD_TIMEOUT = 10
    ELEMENT_WAIT_TIMEOUT = 5
    MAX_NEWS_PER_HOT = 6
    MAX_HOT_ITEMS = 10

    # CSS selector candidates for the article link inside a feed card.
    URL_SELECTORS = [
        '.l-content a',
        '.feed-card-wtt-l p a',
        '.feed-card-article-l a'
    ]
    # XPath candidates for the author name on an article page.
    AUTHOR_SELECTORS = [
        "//div[@class='author-info']/div[@class='desc']/a[@class='name']",
        "//div[@class='user-info']/a[@class='user-name']"
    ]
    # XPath candidates for the article body paragraphs.
    CONTENT_SELECTORS = [
        "//div[@class='article-content']//p",
        "//article/div[@class='weitoutiao-html']"
    ]
    # XPath candidates for the publish-time element.
    TIME_SELECTORS = [
        "//p[@class='abstract']/span[@class='time']",
        "//div[@class='article-meta']/span[1]"
    ]

    # Boilerplate markers used by _is_valid_content to reject non-article text.
    SKIP_PATTERNS = ['版权', '声明', '邮箱', '记者', '编辑', '来源', '投稿', '责任编辑']

    def __init__(self, params=None, *args, **kwargs):
        """Initialize the spider.

        Args:
            params: Optional JSON string with keys ``totalNum``,
                ``authorization`` and ``job_id``; parse failures are logged
                and the defaults kept.
        """
        super().__init__(*args, **kwargs)
        self.url_time = get_current_timestamp()  # fallback publish time
        self.total_num = 0
        self.authorization = None
        self.job_id = None
        if params:
            try:
                json_params = json.loads(params)
                self.total_num = int(json_params.get('totalNum', 0))
                self.authorization = json_params.get('authorization')
                self.job_id = json_params.get('job_id')
            except (json.JSONDecodeError, ValueError) as e:
                self.logger.error(f"解析参数失败: {e}")

    def start_requests(self):
        """Issue the seed Selenium request that provides the shared driver."""
        yield SeleniumRequest(
            url=self.BAIDU_URL,
            callback=self.parse_parent,
            wait_time=self.PAGE_LOAD_TIMEOUT
        )

    def parse_parent(self, response):
        """Parse the hot-topic list and drive per-topic collection."""
        driver = response.request.meta['driver']
        # Topic list comes from the JSON feed, not from the rendered page.
        hot_items = self._fetch_hot_items()
        for hot_item in hot_items[:self.MAX_HOT_ITEMS]:
            if not hot_item.get('fake_url'):
                self.logger.warning(f"热点 {hot_item['hot_word']} 无有效URL跳过")
                continue
            yield from self._process_hot_item(driver, hot_item)

    def _fetch_hot_items(self) -> List[Dict[str, Any]]:
        """Fetch the hot-board feed and normalize each entry.

        Returns:
            A list of dicts with hot_id / hot_word / hot_value / fake_url;
            empty list on any failure (errors are logged, never raised).
        """
        try:
            rsp_body = url_response(self.TOUTIAO_HOT_URL)
            if rsp_body.get('status') != "success":
                self.logger.error("获取热点数据失败")
                return []
            result_array = []
            for line in rsp_body.get('data', []):
                try:
                    result_array.append({
                        "hot_id": line.get('ClusterIdStr', ''),
                        "hot_word": line.get('Title', ''),
                        "hot_value": int(line.get('HotValue', 0)),
                        "fake_url": line.get('Url', '')
                    })
                except Exception as e:
                    self.logger.error(f"解析热点数据失败: {e}")
                    self.logger.debug(f"问题数据: {line}")
            return result_array
        except Exception as e:
            self.logger.error(f"获取热点数据异常: {e}")
            return []

    def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
        """Open one topic page, collect article URLs, then scrape each article.

        URLs are collected up front because visiting an article navigates the
        shared driver away from the topic page, invalidating its card elements.
        """
        try:
            driver.get(hot_item['fake_url'])
            self._wait_for_page_load(driver)
            news_cards = self._get_news_cards(driver)
            news_urls_array = []
            for card in news_cards:
                news_url = self._extract_url_from_card(card)
                if not news_url:
                    continue
                if "video" in news_url.lower():
                    self.logger.info(f"跳过视频链接: {news_url}")
                    continue
                news_urls_array.append(news_url)
            for url in news_urls_array:
                yield from self._process_news_page(driver, url, hot_item)
        except Exception as e:
            self.logger.error(f"处理热点 '{hot_item['hot_word']}' 失败: {e}")

    def _wait_for_page_load(self, driver, timeout: int = None):
        """Wait for the page body to be present (plus a short fixed settle delay)."""
        timeout = timeout or self.PAGE_LOAD_TIMEOUT
        time.sleep(2)  # base settle time for JS-rendered content
        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except TimeoutException:
            self.logger.warning("页面加载超时")

    def _get_news_cards(self, driver) -> List[WebElement]:
        """Return up to MAX_NEWS_PER_HOT feed-card elements on the topic page."""
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, ".feed-card-wrapper")
            return cards[:self.MAX_NEWS_PER_HOT]
        except Exception as e:
            self.logger.error(f"获取新闻卡片失败: {e}")
            return []

    def _extract_url_from_card(self, card: WebElement) -> Optional[str]:
        """Return the first absolute http(s) link found in a card, else None."""
        for selector in self.URL_SELECTORS:
            try:
                element = card.find_element(By.CSS_SELECTOR, selector)
                url = element.get_attribute('href')
                if url and url.startswith(('http://', 'https://')):
                    return url
            except NoSuchElementException:
                continue
        return None

    def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
        """Scrape one article page and yield a populated MediaspidersItem."""
        try:
            driver.get(news_url)
            self._wait_for_page_load(driver)
            author = self._extract_text(driver, self.AUTHOR_SELECTORS)
            content = self._extract_content(driver)
            url_time = self._extract_time(driver)
            if not content:
                self.logger.warning(f"页面无有效内容: {news_url}")
                return
            even_details_item = MediaspidersItem()
            even_details_item['es_carriertype'] = 'news'
            even_details_item['es_srcname'] = 'https://www.toutiao.com/'
            even_details_item['es_sitename'] = '今日头条'
            even_details_item['es_sid'] = get_str_md5(news_url)
            # NOTE(review): es_urltitle is set to the author, same as
            # es_authors — looks like it should be the page title; confirm.
            even_details_item['es_urltitle'] = author
            even_details_item['es_authors'] = author
            even_details_item['es_urlcontent'] = content
            even_details_item['es_urltime'] = url_time
            even_details_item['es_lasttime'] = url_time
            even_details_item['es_urlname'] = news_url
            even_details_item['es_hkey'] = hot_item['hot_id']
            even_details_item['es_urltopic'] = hot_item['hot_word']
            even_details_item['es_video'] = ''
            yield even_details_item
        except Exception as e:
            self.logger.error(f"处理新闻页面失败 {news_url}: {e}")

    def _extract_text(self, context, selectors: List[str]) -> Optional[str]:
        """Return the first non-empty visible text matched by any selector."""
        for selector in selectors:
            try:
                elements = context.find_elements(By.XPATH, selector)
                for elem in elements:
                    if elem.is_displayed():
                        text = elem.text.strip()
                        if text:
                            return text
            except Exception as e:
                self.logger.debug(f"选择器 '{selector}' 未匹配: {e}")
        return None

    def _extract_content(self, driver) -> str:
        """Extract the article body text.

        Returns:
            Visible paragraph texts joined with newlines, or "" when nothing
            could be extracted.
        """
        try:
            time.sleep(2)  # allow lazily rendered content to appear
            content_lines = []
            for selector in self.CONTENT_SELECTORS:
                try:
                    paragraphs = driver.find_elements(By.XPATH, selector)
                    # BUG FIX: the original compared the full XPath selector
                    # against '.weitoutiao-html' (never true) and would have
                    # indexed a WebElement as p[0] (TypeError). Every matched
                    # element exposes .text directly, so one path suffices.
                    for p in paragraphs:
                        text = p.text.strip()
                        if text:
                            content_lines.append(text)
                except Exception as e:
                    self.logger.debug(f"选择器 '{selector}' 提取失败: {e}")
            return '\n'.join(content_lines) if content_lines else ""
        except Exception as e:
            self.logger.error(f"提取内容失败: {e}")
            return ""

    def _is_valid_content(self, text: str) -> bool:
        """Return True when text is long enough and free of boilerplate markers.

        NOTE(review): currently not called anywhere in this spider — it was
        presumably meant to filter paragraphs in _extract_content; confirm
        intent before wiring it in (it would also drop lines of <= 10 chars).
        """
        if not text or len(text) <= 10:
            return False
        return not any(pattern in text for pattern in self.SKIP_PATTERNS)

    def _extract_time(self, driver) -> Optional[int]:
        """Return the article publish time, falling back to spider start time."""
        time_text = self._extract_text(driver, self.TIME_SELECTORS)
        if time_text:
            try:
                return get_time_stamp(time_text)
            except Exception as e:
                self.logger.debug(f"时间转换失败: {time_text}, {e}")
        return self.url_time

View File

@ -1,7 +1,11 @@
import scrapy
import json
from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
import redis
import scrapy
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
from MediaSpiders.utils.time_utils import get_current_timestamp
class HotSearchSpider(scrapy.Spider):
@ -22,42 +26,76 @@ class HotSearchSpider(scrapy.Spider):
'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
# 'MediaSpiders.pipelines.HotSearchSaveToMySQL': 300
},
'SPIDER_MIDDLEWARES': {},
'SPIDER_MIDDLEWARES': {
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
},
'DOWNLOADER_MIDDLEWARES': {},
'BATCH_SAVE_SIZE': 50
}
start_urls = [
'https://weibo.com/ajax/side/hotSearch',
# 'https://weibo.com/ajax/side/hotSearch',
'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
]
def __init__(self, params=None, *args, **kwargs):
super(HotSearchSpider, self).__init__(*args, **kwargs)
self.job_id = None
self.collected_items = []
self.redis_client = redis.Redis(
host=self.settings['REDIS_HOST'],
port=self.settings['REDIS_PORT'],
password=self.settings['REDIS_PWD']
)
if params:
json_params = json.loads(params)
if 'job_id' in json_params:
self.job_id = json_params['job_id']
try:
json_params = json.loads(params)
if 'job_id' in json_params:
self.job_id = json_params['job_id']
if 'max_items' in json_params:
self.max_items = int(json_params['max_items'])
except Exception as e:
self.logger.error(f"解析参数失败: {str(e)}")
def start_requests(self):
"""发起初始请求"""
self.logger.info(f"开始爬取热搜数据任务ID: {self.job_id if self.job_id else 'N/A'}")
self.url_time = get_current_timestamp()
for url in self.start_urls:
yield scrapy.Request(
url=url,
callback=self.parse
)
def parse(self, response):
result_array = []
if 'weibo.com' in response.url:
result_array = parse_weibo_response(response.text)
elif 'toutiao.com' in response.url:
result_array = parse_toutiao_response(response.text)
try:
if 'weibo.com' in response.url:
result_array = parse_weibo_response(response.text)
elif 'toutiao.com' in response.url:
result_array = parse_toutiao_response(response.text)
for line in result_array:
hot_search_item = MediaspidersItem()
hot_search_item['es_carriertype'] = 'hot_search'
hot_search_item['es_sid'] = line['id']
hot_search_item['es_hkey'] = line['hot_id']
hot_search_item['es_urltitle'] = line['hot_word']
hot_search_item['es_urlcontent'] = line['hot_word']
hot_search_item['es_heat'] = line['hot_value']
hot_search_item['es_catalog'] = line['category']
hot_search_item['es_simrank'] = line['realtime_rank']
hot_search_item['es_sitename'] = line['platform']
hot_search_item['es_urltime'] = line['onboard_time']
hot_search_item['es_lasttime'] = line['crawl_time']
hot_search_item['es_urlname'] = line['fake_url']
yield hot_search_item
except Exception as e:
self.logger.exception(f"解析异常: {str(e)}")
for line in result_array:
hot_search_item = MediaspidersItem()
hot_search_item['es_carriertype'] = 'hot_search'
hot_search_item['es_sid'] = line['id']
hot_search_item['es_hkey'] = line['hot_id']
hot_search_item['es_urltitle'] = line['hot_word']
hot_search_item['es_urlcontent'] = line['hot_word']
hot_search_item['es_heat'] = line['hot_value']
hot_search_item['es_catalog'] = line['category']
hot_search_item['es_simrank'] = line['realtime_rank']
hot_search_item['es_sitename'] = line['platform']
hot_search_item['es_urltime'] = line['onboard_time']
hot_search_item['es_lasttime'] = line['crawl_time']
hot_search_item['es_urlname'] = line['fake_url']
yield hot_search_item

View File

@ -2,11 +2,13 @@ import json
import uuid
import logging
import time
import requests
from MediaSpiders.utils.string_utils import get_str_md5
def parse_weibo_response(rsp_str):
rsp_body = json.loads(rsp_str)
def parse_weibo_response(rsp_body):
result_array = []
if rsp_body['ok'] == 1:
realtime_data = rsp_body['data']['realtime']
@ -56,7 +58,7 @@ def parse_toutiao_response(rsp_str):
"platform": "今日头条",
"onboard_time": current_timestamp,
"crawl_time": current_timestamp,
"fake_url": f"https://www.toutiao.com/hot-event/hot-board/{custom_sid}"
"fake_url": line['Url']
}
if 'InterestCategory' in line:
result_line['category'] = ",".join(line['InterestCategory'])
@ -66,6 +68,11 @@ def parse_toutiao_response(rsp_str):
logging.info(json.dumps(line, ensure_ascii=False))
return result_array
def url_response(url):
    """GET *url* and return its body decoded as JSON.

    A timeout is supplied because ``requests.get`` waits indefinitely by
    default, which would hang the whole crawl on an unresponsive server.

    Args:
        url: Absolute URL returning a JSON document.

    Returns:
        The parsed JSON object.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    rsp_str = requests.get(url, timeout=10).text
    return json.loads(rsp_str)
if __name__ == "__main__":
# rsp_file = open("./toutiao_hot_search.json", 'r', encoding='utf-8')

View File

@ -1,10 +1,67 @@
import datetime
import time
from datetime import datetime, timezone, timedelta
import re
def get_current_timestamp():
    """Return the current Unix time in milliseconds as an int."""
    seconds_now = time.time()
    return int(seconds_now * 1000)
def str_to_timestamp(dt_str: str, tz_offset: int = 8) -> int:
    """Convert a time string to a Unix timestamp in seconds.

    Supported formats:
      - 'YYYY-MM-DD HH:MM'
      - 'YYYY-MM-DD HH:MM:SS'
      - mixed strings containing one of the above, e.g.
        '2026-02-27 20:11·头条新锐创作者'

    Args:
        dt_str: String containing a datetime; the datetime part is extracted
            automatically via a regular expression.
        tz_offset: Timezone offset in hours the string is expressed in
            (8 for China Standard Time).

    Returns:
        Unix timestamp in seconds as an int.

    Raises:
        ValueError: If no supported datetime pattern is found or parsing fails.
    """
    dt_str = dt_str.strip()
    # Extract the datetime part; the seconds group is optional.
    time_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}(?::\d{2})?)'
    match = re.search(time_pattern, dt_str)
    if not match:
        raise ValueError(f"无法从字符串中提取有效时间格式: {dt_str}")
    time_str = match.group(1)
    # The regex only admits matches of length 16 (no seconds) or 19 (with
    # seconds), so the original fallback loop over candidate formats was
    # unreachable dead code and has been removed.
    fmt = '%Y-%m-%d %H:%M:%S' if len(time_str) == 19 else '%Y-%m-%d %H:%M'
    try:
        dt = datetime.strptime(time_str, fmt)
    except ValueError as e:
        raise ValueError(f"时间格式解析失败: {time_str}") from e
    # Interpret the naive datetime in the requested fixed-offset timezone.
    tz = timezone(timedelta(hours=tz_offset))
    return int(dt.replace(tzinfo=tz).timestamp())
def get_time_stamp(date_str):
try:

View File

@ -20,4 +20,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(dirpath)
# 等效于scrapy crawl FacebookUserSpider -a params="{}"
# execute(['scrapy', 'crawl', 'hot_search_spider', '-a', 'params={}'])
execute(['scrapy', 'crawl', 'BaiduHotSearchSprder', '-a', 'params={}'])
execute(['scrapy', 'crawl', 'HotSearchRedisSpider', '-a', 'params={}'])