"""Save a web page as a single-file MHTML snapshot using headless Chrome.

The page is loaded through Selenium and serialized with the Chrome DevTools
Protocol command ``Page.captureSnapshot``.
"""

import logging
import os
import time
from typing import Optional
from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService

# Logging configuration: mirror every record to the console and to a log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('mhtml_saver.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)

# Location of the chromedriver binary used when the caller does not pass one.
_DEFAULT_CHROMEDRIVER_PATH = "C:/Program Files/Python38/chromedriver.exe"

# User-Agent header sent instead of the default (headless) one.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
)

# Script evaluated in every new document before the page's own scripts run.
# It removes ``navigator.webdriver`` and patches ``permissions.query`` so the
# 'notifications' permission reports ``Notification.permission``.  The original
# ``query`` is captured first (bound to its owner) so that every other
# permission lookup is delegated to it; the IIFE keeps that helper out of the
# page's global scope.
_STEALTH_SCRIPT = '''
(() => {
    delete navigator.__proto__.webdriver;
    const permissions = window.navigator.permissions;
    if (!permissions || !permissions.query) {
        return;
    }
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) => (
        parameters && parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery(parameters)
    );
})();
'''


class MHTMLSaver:
    """Drive a Chrome instance and save pages as MHTML files.

    Instances own a WebDriver session; call :meth:`quit` (or use the object
    as a context manager) to close the browser when done.
    """

    def __init__(self, headless: bool = True,
                 driver_path: Optional[str] = _DEFAULT_CHROMEDRIVER_PATH):
        """Start Chrome with the options needed for snapshotting.

        :param headless: run Chrome without a visible window when true
        :param driver_path: path to the chromedriver executable; when it is
            ``None`` or does not point to an existing file, driver lookup is
            left to Selenium
        """
        # Set first so quit() is safe even if start-up fails part-way through.
        self.driver = None

        logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
        if driver_path and os.path.isfile(driver_path):
            service = ChromeService(executable_path=driver_path)
        else:
            # No usable explicit binary: let Selenium locate chromedriver.
            service = ChromeService()

        # Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={_USER_AGENT}')
        # NOTE(review): the snapshot itself is produced through CDP below;
        # this switch is kept from the original configuration - confirm it is
        # still needed.
        chrome_options.add_argument('--save-page-as-mhtml')
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_argument('--window-size=1920,1080')
        # Hide WebDriver traits (including "navigator.webdriver").
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })

        self.driver = webdriver.Chrome(service=service, options=chrome_options)

        # Register the stealth script once per session rather than once per
        # save, so repeated saves do not pile up duplicate copies.
        try:
            self.driver.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': _STEALTH_SCRIPT},
            )
        except Exception:
            # Do not leave a browser process behind when set-up fails.
            self.quit()
            raise

    def __enter__(self):
        """Return the saver itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser when leaving the ``with`` block."""
        self.quit()

    @staticmethod
    def _default_output_path(url: str) -> str:
        """Derive ``<first host label>.mhtml`` from *url* (``page`` if none)."""
        # ``hostname`` excludes any port or credentials, which would otherwise
        # end up in the file name.
        host = urlparse(url).hostname or ''
        if host.startswith('www.'):
            host = host[len('www.'):]
        stem = host.split('.')[0] or 'page'
        return f"{stem}.mhtml"

    def save_as_mhtml(self, url: str, output_path: Optional[str] = None,
                      timeout: float = 30, wait_time: float = 5) -> str:
        """Save a web page as an MHTML file.

        :param url: URL of the page to capture
        :param output_path: destination file; ``.mhtml`` is appended when the
            name does not already end with it, and a name derived from the
            URL's host is used when omitted
        :param timeout: page-load timeout in seconds
        :param wait_time: seconds to sleep after loading so that dynamic
            content can render
        :return: absolute path of the saved file
        :raises RuntimeError: if Chrome returns an empty snapshot
        :raises Exception: any WebDriver or file-system error is logged and
            re-raised unchanged
        """
        if output_path is None:
            output_path = self._default_output_path(url)
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        absolute_path = os.path.abspath(output_path)

        try:
            self.driver.set_page_load_timeout(timeout)

            logger.info("正在加载页面: %s", url)
            self.driver.get(url)
            # The window size is (re)applied after navigation.
            self.driver.set_window_size(1920, 1080)

            # Fixed delay giving dynamic content time to render (tunable).
            logger.info("等待 %s 秒以确保页面完全渲染...", wait_time)
            time.sleep(wait_time)

            # Key step: capture the MHTML through the CDP command.
            logger.info("正在生成 MHTML 快照...")
            result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})

            # ``data`` is the serialized MHTML document as a plain string (it
            # is not Base64-wrapped), so it is written as text below.
            mhtml_content = result['data']
            if not mhtml_content:
                # Checked before writing so no zero-byte file is left behind.
                raise RuntimeError("生成了空文件")

            os.makedirs(os.path.dirname(absolute_path), exist_ok=True)
            # newline='' keeps the line endings exactly as Chrome emitted them.
            with open(absolute_path, 'w', encoding='utf-8', newline='') as f:
                f.write(mhtml_content)

            file_size = os.path.getsize(absolute_path)
            logger.info("✅ MHTML 保存成功: %s (大小: %s 字节)", absolute_path, file_size)
            return absolute_path

        except Exception as e:
            logger.error("❌ 保存失败: %s", e)
            raise

    def quit(self):
        """Close the browser; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Drop the reference first so a second call is a no-op.
        self.driver = None
        driver.quit()
        logger.info("浏览器已关闭")


def main():
    """Capture a sample page as a manual smoke test."""
    # Example URL (replace with your own).
    test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"

    saver = MHTMLSaver(headless=True)
    try:
        output_file = saver.save_as_mhtml(
            url=test_url,
            output_path="example.mhtml",
            timeout=30,
            wait_time=5
        )
        print(f"\n🎉 成功保存 MHTML 文件: {output_file}")
    except Exception as e:
        print(f"\n💥 保存失败: {e}")
    finally:
        saver.quit()


# ===== Test entry point =====
if __name__ == "__main__":
    main()