diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
index 6362dde..c707258 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/HotSearchRedisSpider.py
@@ -5,7 +5,7 @@
 from typing import List, Dict, Any, Optional
 import scrapy
 from scrapy_selenium import SeleniumRequest
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.remote.webelement import WebElement
 from selenium.webdriver.support import expected_conditions as EC
@@ -46,15 +46,16 @@ class HotSearchRedisSpider(scrapy.Spider):
     BAIDU_URL = 'https://www.toutiao.com/'
     PAGE_LOAD_TIMEOUT = 10
     ELEMENT_WAIT_TIMEOUT = 5
-    MAX_NEWS_PER_HOT = 6
+    MAX_NEWS_PER_HOT = 1
     MAX_HOT_ITEMS = 10
 
     # Selector definitions
-    URL_SELECTORS = [
-        '.l-content a',
-        '.feed-card-wtt-l p a',
-        '.feed-card-article-l a'
-    ]
+    # URL_SELECTORS = [
+    #     '.card-render-wrapper a'
+    #     # '.l-content a',
+    #     # '.feed-card-wtt-l p a',
+    #     # '.feed-card-article-l a'
+    # ]
 
     AUTHOR_SELECTORS = [
         "//div[@class='author-info']/div[@class='desc']/a[@class='name']",
@@ -103,9 +104,16 @@ class HotSearchRedisSpider(scrapy.Spider):
         driver = response.request.meta['driver']
 
         # Fetch the trending items
-        hot_items = self._fetch_hot_items()
+        hot_items = self._fetch_hot_items()[:self.MAX_HOT_ITEMS]
+        # Debug fixture: a single hard-coded hot item
+        # hot_items = []
+        # hot_items.append({
+        #     "fake_url": "https://www.toutiao.com/trending/7612920230477565459/?rank=14&log_from=4dda3d0c958f48_1772529869512",
+        #     'hot_id': '76132246866893fda27',
+        #     'hot_value': 5432429101,
+        #     'hot_word': '伊朗:反击最初两天650民美士兵伤亡'
+        # })
 
-        for hot_item in hot_items[:self.MAX_HOT_ITEMS]:
+        for hot_item in hot_items:
             if not hot_item.get('fake_url'):
                 self.logger.warning(f"Hot item {hot_item['hot_word']} has no valid URL, skipping")
                 continue
@@ -142,28 +150,141 @@ class HotSearchRedisSpider(scrapy.Spider):
     def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
         """Process a single hot item"""
         try:
+            yield from self._get_event_details(driver, hot_item)
+
             # Load the hot-item page
             driver.get(hot_item['fake_url'])
             self._wait_for_page_load(driver)
 
-            # Fetch the news cards
-            news_cards = self._get_news_cards(driver)
+            # Grab all block titles and look for an "事件脉络" (event timeline) block
+            context_list = driver.find_elements(By.XPATH, "//div[@class='block-title']")
+            self.logger.info(f"Found {len(context_list)} content blocks")
+
+            for context in context_list:
+                block_title = context.text.strip()
+
+                if block_title == "事件脉络":
+                    yield from self._get_event_timeline(context, driver, hot_item)
+
+        except Exception as e:
+            self.logger.error(f"Failed to process hot item '{hot_item['hot_word']}': {e}")
+
+    def _get_event_timeline(self, context, driver, hot_item: Dict[str, Any]):
+        """Collect the event-timeline block of a hot item"""
+        self.logger.info("Starting event-timeline collection...")
+
+        # Locate the "load more" button; it is absent when the timeline is short.
+        # NOTE: an XPath starting with // searches the whole document, not just `context`.
+        buttons = context.find_elements(
+            By.XPATH, "//div[@class='timeline-card-wrapper']/div[@class='load-more']/button")
+        if buttons:
+            button_element = buttons[0]
+            try:
+                # Try a normal click first
+                button_element.click()
+            except ElementClickInterceptedException:
+                # The button is obscured, fall back to a JavaScript click
+                self.logger.info("Clicking the button via JavaScript")
+                driver.execute_script("arguments[0].click();", button_element)
+
+            # Wait for the extra nodes to load
+            time.sleep(2)
+
+        # Collect all timeline nodes currently rendered
+        event_list = context.find_elements(
+            By.XPATH,
+            "//div[@class='timeline-card-wrapper']/div[@class='time-nodes']/div[@class='time-node']")
+        self.logger.info(f"Found {len(event_list)} events")
+        url_content = ''
+        for idx, event in enumerate(event_list, 1):
+            try:
+                # Grab the node title and time
+                title_elem = event.find_element(By.XPATH, ".//div[@class='title']")
+                title_text = title_elem.text
+
+                # Grab the node content
+                content_element = event.find_element(By.XPATH, ".//a[@class='content']/p")
+                content = content_element.text.strip()
+
+                # Check for a "最新" ("latest") tag and strip it from the content
+                try:
+                    tag_element = content_element.find_element(By.XPATH, ".//span[@class='tag']")
+                    tag = tag_element.text.strip()  # "最新"
+                    content = content.replace(tag, "").strip()
+                except NoSuchElementException:
+                    tag = ""
+
+                # Join title and content, separating entries with a newline
+                if url_content:
+                    url_content += "\n"
+
+                url_content += f"{title_text}\n{content}"
+
+                self.logger.info(f"Added entry {idx}: {title_text[:20]}...")
+
+            except Exception as e:
+                self.logger.error(f"Error while processing event {idx}: {e}")
+                continue
+
+        time_now = get_current_timestamp()
+        # Build the item
+        event_timeline_item = MediaspidersItem()
+        event_timeline_item['es_carriertype'] = 'news'
+        event_timeline_item['es_srcname'] = 'https://www.toutiao.com/'
+        event_timeline_item['es_sitename'] = '今日头条'
+        event_timeline_item['es_sid'] = get_str_md5(hot_item["fake_url"])
+        event_timeline_item['es_urltitle'] = hot_item['hot_word']
+        event_timeline_item['es_authors'] = ''
+        event_timeline_item['es_urlcontent'] = url_content
+        event_timeline_item['es_urltime'] = time_now
+        event_timeline_item['es_lasttime'] = time_now
+        event_timeline_item['es_urlname'] = hot_item["fake_url"]
+        event_timeline_item['es_hkey'] = hot_item['hot_id']
+        event_timeline_item['es_urltopic'] = hot_item['hot_word']
+        event_timeline_item['es_video'] = ''
+
+        yield event_timeline_item
+        self.logger.info(f"Event timeline collected for '{hot_item['hot_word']}': {hot_item['fake_url']}")
+
+    def _get_event_details(self, driver, hot_item: Dict[str, Any]):
+        """Collect the event-detail cards of a hot item"""
+        self.logger.info(f"Starting event-detail collection for {hot_item['hot_word']}: {hot_item['fake_url']}")
+
+        hot_url = hot_item['fake_url']
+        driver.get(hot_url)
+        self._wait_for_page_load(driver)
+        # If the URL from the API is a trending-board page, collect its cards;
+        # otherwise the URL already points at a detail page
+        if "article" not in hot_url:
+            cards = driver.find_elements(
+                By.XPATH, "//div[@class='block-content']/div[@class='card-render-wrapper']")
+            news_cards = cards[:self.MAX_NEWS_PER_HOT]
             news_urls_array = []
             for card in news_cards:
-                news_url = self._extract_url_from_card(card)
-                if not news_url:
-                    continue
+                # Extract the URL from the card
+                news_url = None
+                try:
+                    element = card.find_element(By.CSS_SELECTOR, '.card-render-wrapper a')
+                    url = element.get_attribute('href')
+                    if url and url.startswith(('http://', 'https://')):
+                        news_url = url
+                except NoSuchElementException:
+                    continue
 
-                if "video" in news_url.lower():
-                    self.logger.info(f"Skipping video link: {news_url}")
-                    continue
+                # Check for None before calling .lower() to avoid an AttributeError
+                if not news_url or "video" in news_url.lower():
+                    self.logger.info(f"Skipping link: {news_url}")
+                    continue
 
                 news_urls_array.append(news_url)
+        else:
+            # The URL itself is the detail page
+            news_urls_array = [hot_url]
 
         try:
+            # Start collecting
             for url in news_urls_array:
                 yield from self._process_news_page(driver, url, hot_item)
         except Exception as e:
-            self.logger.error(f"Failed to process hot item '{hot_item['hot_word']}': {e}")
+            self.logger.error(f"Failed to collect event-detail cards: {e}")
 
     def _wait_for_page_load(self, driver, timeout: int = None):
         """Wait for the page to load"""
@@ -176,27 +297,6 @@ class HotSearchRedisSpider(scrapy.Spider):
         except TimeoutException:
             self.logger.warning("Page load timed out")
 
-    def _get_news_cards(self, driver) -> List[WebElement]:
-        """Fetch the news-card list"""
-        try:
-            cards = driver.find_elements(By.CSS_SELECTOR, ".feed-card-wrapper")
-            return cards[:self.MAX_NEWS_PER_HOT]
-        except Exception as e:
-            self.logger.error(f"Failed to fetch news cards: {e}")
-            return []
-
-    def _extract_url_from_card(self, card: WebElement) -> Optional[str]:
-        """Extract the URL from a card"""
-        for selector in self.URL_SELECTORS:
-            try:
-                element = card.find_element(By.CSS_SELECTOR, selector)
-                url = element.get_attribute('href')
-                if url and url.startswith(('http://', 'https://')):
-                    return url
-            except NoSuchElementException:
-                continue
-        return None
-
     def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
         """Process a single news page"""
         try:
@@ -230,6 +330,8 @@ class HotSearchRedisSpider(scrapy.Spider):
 
             yield even_details_item
 
+            self.logger.info(f"Event details collected for '{hot_item['hot_word']}': {news_url}")
+
         except Exception as e:
             self.logger.error(f"Failed to process news page {news_url}: {e}")
 
diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py b/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py
index cac058d..722a457 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/hot_search_spider.py
@@ -84,7 +84,7 @@ class HotSearchSpider(scrapy.Spider):
         hot_search_item['es_sitename'] = line['platform']
         hot_search_item['es_urltime'] = line['onboard_time']
         hot_search_item['es_lasttime'] = line['crawl_time']
-        hot_search_item['es_urlname'] = line['fake_url']
+        hot_search_item['es_urlname'] = line['fake_url'] + "&news"
         yield hot_search_item
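Review note on the click fallback: `_get_event_timeline` tries a native click on the "load more" button and falls back to `driver.execute_script("arguments[0].click();", ...)` when Selenium raises `ElementClickInterceptedException`. If other spiders need the same behavior, the pattern could live in a small shared helper; a minimal sketch assuming only Selenium itself (the `safe_click` name is hypothetical, not part of this codebase):

    from selenium.common.exceptions import ElementClickInterceptedException

    def safe_click(driver, element):
        """Click an element; fall back to a JavaScript click if an overlay intercepts it."""
        try:
            element.click()
        except ElementClickInterceptedException:
            # A JS click skips the hit-testing that makes native clicks fail
            # when the target is covered by a banner or sticky header.
            driver.execute_script("arguments[0].click();", element)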
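The fixed `time.sleep(2)` after the click works, but it wastes time when the nodes render quickly and may be too short when they render slowly. An explicit wait on the node count is a possible alternative; a sketch under the assumption that the same `time-node` XPath stays valid (the helper name is hypothetical):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait

    TIME_NODE_XPATH = ("//div[@class='timeline-card-wrapper']"
                       "/div[@class='time-nodes']/div[@class='time-node']")

    def wait_for_timeline_growth(driver, previous_count, timeout=5):
        """Block until more time-node elements exist than before the click."""
        WebDriverWait(driver, timeout).until(
            lambda d: len(d.find_elements(By.XPATH, TIME_NODE_XPATH)) > previous_count
        )

Using it would mean capturing the node count before clicking and passing it as `previous_count`; `until` raises `TimeoutException` when nothing new appears, so the call would need a try/except where short timelines are expected.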
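The reordered guard in `_get_event_details` (`not news_url` before `news_url.lower()`) is what prevents an `AttributeError` when a card yields no usable href. The filtering rule itself is small enough to pull out and unit-test in isolation; a sketch (function name hypothetical):

    def is_collectible_url(url):
        """Accept only http(s) links that are not video pages."""
        if not url or not url.startswith(("http://", "https://")):
            return False
        return "video" not in url.lower()

    assert is_collectible_url("https://www.toutiao.com/article/123/")
    assert not is_collectible_url(None)
    assert not is_collectible_url("https://www.toutiao.com/video/456/")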