import logging
import os
import time
from urllib.parse import urlparse

from selenium import webdriver
from selenium.common.exceptions import (
    WebDriverException,
    TimeoutException,
    SessionNotCreatedException,
    InvalidSessionIdException,
)
from selenium.webdriver.chrome.options import Options

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class RemoteMHTMLSaver:
    """Save web pages as MHTML files through a remote Selenium Chrome session.

    The WebDriver session is created eagerly in ``__init__`` and is rebuilt
    automatically when a WebDriver error occurs while saving a page.
    """

    # Retry policy for creating the remote session (see _init_driver).
    _INIT_ATTEMPTS = 3
    _INIT_RETRY_DELAY = 2

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2,
    ):
        """Create the saver and open the remote WebDriver session.

        :param remote_url: URL of the remote Selenium endpoint.
        :param headless: Whether to run Chrome in headless mode.
        :param max_retries: Maximum number of retries for a single save
            (the save is attempted ``max_retries + 1`` times in total).
        :param retry_delay: Seconds to wait before each retry.
        :raises RuntimeError: If the remote session cannot be created.
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        # Set before _init_driver() so quit()/__del__ are safe even if
        # session creation fails.
        self.driver = None
        self._init_driver()

    def _build_chrome_options(self):
        """Build a fresh Chrome ``Options`` object (called for every new session)."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    @staticmethod
    def _quit_quietly(driver):
        """Best-effort ``driver.quit()``; a failure to close is only logged."""
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as e:
            # The session may already be dead; nothing useful can be done.
            logger.debug("Ignoring error while closing WebDriver: %s", e)

    def _init_driver(self):
        """Create (or re-create) the remote WebDriver session.

        Any existing session is closed first. On success ``self.driver``
        holds the new session; on failure ``self.driver`` is ``None``.

        :raises RuntimeError: If every creation attempt fails.
        """
        # Drop the old session and clear the reference so a failed rebuild
        # never leaves a dead driver behind.
        old_driver, self.driver = self.driver, None
        self._quit_quietly(old_driver)

        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        last_error = None
        for attempt in range(self._INIT_ATTEMPTS):
            driver = None
            try:
                driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options(),
                )
                # Inject the anti-detection script into every new document.
                driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh', 'en'] });
                    '''
                })
            except Exception as e:
                last_error = e
                # The session may have been created before the CDP call
                # failed; close it so it does not leak on the remote side.
                self._quit_quietly(driver)
                logger.warning(
                    f"创建 WebDriver 失败 (尝试 {attempt + 1}/{self._INIT_ATTEMPTS}): {e}"
                )
                if attempt < self._INIT_ATTEMPTS - 1:
                    time.sleep(self._INIT_RETRY_DELAY)
                continue

            self.driver = driver
            logger.info("✅ 远程 WebDriver 会话创建成功")
            return

        raise RuntimeError(f"无法连接到远程 Selenium 服务: {last_error}") from last_error

    @staticmethod
    def _write_snapshot(mhtml_content, output_path):
        """Write the snapshot to ``output_path`` and return its size in bytes.

        The content is written to a ``.part`` sibling first and moved into
        place only on success, so an existing ``output_path`` is never
        truncated or removed by a failed save.

        :raises RuntimeError: If the snapshot is empty.
        """
        if not mhtml_content:
            raise RuntimeError("生成了空文件")

        tmp_path = f"{output_path}.part"
        try:
            with open(tmp_path, 'w', encoding='utf-8', newline='') as f:
                f.write(mhtml_content)
            os.replace(tmp_path, output_path)
        finally:
            # Only present if the write or the move failed.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
        return os.path.getsize(output_path)

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """Load ``url`` and save it as an MHTML file.

        Page-load timeouts are retried on the same session; other WebDriver
        errors trigger a session rebuild before the retry. Any other error
        aborts immediately without retrying.

        :param url: Page to load.
        :param output_path: Destination file. Defaults to ``<domain>.mhtml``
            derived from the URL; ``.mhtml`` is appended if missing.
        :param timeout: Page-load timeout in seconds.
        :param wait_time: Seconds to wait after the page loads before
            taking the snapshot.
        :return: Absolute path of the saved file.
        :raises RuntimeError: If the page could not be saved, or if the
            session could not be rebuilt during a retry.
        """
        if output_path is None:
            domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.mhtml"

        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'

        total_attempts = self.max_retries + 1
        last_exception = None

        for retry in range(total_attempts):
            can_retry = retry < self.max_retries
            try:
                # Make sure there is a session to work with.
                if not self.driver:
                    self._init_driver()

                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{total_attempts}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)

                logger.info("生成 MHTML 快照...")
                result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
                file_size = self._write_snapshot(result['data'], output_path)

                saved_path = os.path.abspath(output_path)
                logger.info(f"✅ 保存成功: {saved_path} ({file_size} 字节)")
                return saved_path

            except TimeoutException as e:
                # Must precede the WebDriverException handler:
                # TimeoutException is a subclass and would otherwise never
                # reach this branch.
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if not can_retry:
                    break
                time.sleep(self.retry_delay)

            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if not can_retry:
                    logger.error("达到最大重试次数,放弃")
                    break
                logger.info("正在重建 WebDriver 会话...")
                self._init_driver()
                time.sleep(self.retry_delay)

            except Exception as e:
                # Not a WebDriver problem: retrying will not help.
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break

        raise RuntimeError(
            f"保存失败({type(last_exception).__name__}): {last_exception}"
        ) from last_exception

    def quit(self):
        """Explicitly close the browser session (safe to call repeatedly)."""
        # getattr keeps this safe on a partially constructed instance.
        driver = getattr(self, "driver", None)
        self.driver = None
        if driver:
            try:
                driver.quit()
                logger.info("WebDriver 会话已关闭")
            except Exception:
                pass

    def __del__(self):
        # Finalizers must never raise, e.g. during interpreter shutdown.
        try:
            self.quit()
        except Exception:
            pass


# ===== Manual test =====
if __name__ == "__main__":
    saver = RemoteMHTMLSaver(
        remote_url="http://144.34.185.108:28098/wd/hub",  # <- replace with your server's public IP
        headless=True,
    )

    try:
        saver.save_as_mhtml(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.mhtml",
        )
    except Exception as e:
        print(f"❌ 失败: {e}")
    finally:
        saver.quit()