osc/research/pdf_downloader/save_page_as_mhtml.py
2025-12-26 08:54:58 +08:00

142 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# Logging setup: every record at INFO or above goes both to the console
# (stderr stream handler) and to a UTF-8 encoded log file in the current
# working directory.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('mhtml_saver.log', encoding='utf-8'),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by everything below.
logger = logging.getLogger(__name__)
class MHTMLSaver:
    """Save web pages as single-file MHTML snapshots through Chrome.

    A Chrome WebDriver session is started in the constructor and reused for
    every :meth:`save_as_mhtml` call. Call :meth:`quit` when finished to
    close the browser.
    """

    # Location the driver was historically loaded from. It is only used when
    # the file actually exists; otherwise Selenium locates a driver itself.
    DEFAULT_DRIVER_PATH = "C:/Program Files/Python38/chromedriver.exe"

    # User-Agent header presented to the visited sites.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 "
        "Edg/143.0.3650.75"
    )

    # Script evaluated in every new document before page scripts run.
    # It is wrapped in an IIFE so its locals never leak into the page, and it
    # captures the real ``permissions.query`` (bound to its owner) before
    # replacing it, so non-notification queries are forwarded correctly.
    _STEALTH_SCRIPT = '''
(() => {
    delete navigator.__proto__.webdriver;
    const permissions = window.navigator.permissions;
    if (!permissions || !permissions.query) {
        return;
    }
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) => (
        parameters && parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery(parameters)
    );
})();
'''

    def __init__(self, headless=True, driver_path=None):
        """Start a Chrome WebDriver session.

        :param headless: run Chrome without a visible window when true.
        :param driver_path: explicit path to the chromedriver executable.
            When omitted, ``DEFAULT_DRIVER_PATH`` is used if that file
            exists; otherwise the Service is created without a path and
            Selenium resolves the driver on its own.
        """
        logger.info("正在初始化 Chrome WebDriver自动匹配版本...")
        if driver_path is None and os.path.isfile(self.DEFAULT_DRIVER_PATH):
            driver_path = self.DEFAULT_DRIVER_PATH
        if driver_path:
            service = ChromeService(executable_path=os.fspath(driver_path))
        else:
            service = ChromeService()

        # Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={self.USER_AGENT}')
        # Kept from the original configuration; the snapshot itself is taken
        # through the DevTools protocol in save_as_mhtml().
        chrome_options.add_argument('--save-page-as-mhtml')
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en',
        })
        # Initial window size, specified at startup.
        chrome_options.add_argument('--window-size=1920,1080')
        # Hide the usual WebDriver automation fingerprints.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            # Register the stealth script once per session; registering it on
            # every save would stack a new copy for each call.
            self.driver.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': self._STEALTH_SCRIPT},
            )
        except Exception:
            # Do not leak the browser process if setup fails half-way.
            self.driver.quit()
            self.driver = None
            raise

    @staticmethod
    def _default_output_path(url):
        """Derive a file name such as ``example.mhtml`` from the URL's host.

        Uses the parsed host name (without port or credentials), drops a
        leading ``www.``, keeps the first dot-separated label and replaces
        every character other than alphanumerics, ``-`` and ``_`` with ``_``.
        Falls back to ``page.mhtml`` when the URL has no host.
        """
        host = urlparse(url).hostname or ''
        if host.startswith('www.'):
            host = host[len('www.'):]
        label = host.split('.')[0]
        safe = ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in label)
        return f"{safe or 'page'}.mhtml"

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """Load ``url`` and save the rendered page as an MHTML file.

        :param url: address of the page to capture.
        :param output_path: destination file; ``.mhtml`` is appended when
            missing. Defaults to a name derived from the URL's host.
        :param timeout: page-load timeout in seconds.
        :param wait_time: seconds to wait after loading so that dynamic
            content can render.
        :return: absolute path of the saved file.
        :raises RuntimeError: if the browser was already closed or the
            snapshot is empty. Errors raised by the driver or by file I/O
            are logged and re-raised unchanged.
        """
        if self.driver is None:
            raise RuntimeError("浏览器已关闭 无法保存页面")

        if output_path is None:
            output_path = self._default_output_path(url)
        output_path = os.fspath(output_path)
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        output_path = os.path.abspath(output_path)

        try:
            self.driver.set_page_load_timeout(timeout)

            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            # Window size is (re)applied after navigation.
            self.driver.set_window_size(1920, 1080)

            # Give dynamic content time to render (tunable via wait_time).
            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(max(0, wait_time))

            # Capture the page through the DevTools protocol.
            logger.info("正在生成 MHTML 快照...")
            result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
            # result['data'] is the MHTML document as a plain string; it is
            # written out as text below without any decoding step.
            mhtml_content = result.get('data')
            if not mhtml_content:
                # Fail before touching the disk so no empty file is left behind.
                raise RuntimeError("生成了空文件")

            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            # newline='' keeps the snapshot's own line endings untouched.
            with open(output_path, 'w', encoding='utf-8', newline='') as f:
                f.write(mhtml_content)

            # Verify what actually landed on disk.
            file_size = os.path.getsize(output_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")
            logger.info(f"✅ MHTML 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
            return output_path
        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        """Close the browser; calling it more than once is harmless."""
        driver, self.driver = getattr(self, 'driver', None), None
        if driver:
            driver.quit()
            logger.info("浏览器已关闭")
# ===== Manual test entry point =====
if __name__ == "__main__":
    # Example URL; replace it with the page you want to archive.
    test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
    saver = MHTMLSaver(headless=True)
    try:
        output_file = saver.save_as_mhtml(
            url=test_url,
            output_path="example.mhtml",
            timeout=30,
            wait_time=5,
        )
        print(f"\n🎉 成功保存 MHTML 文件: {output_file}")
    except Exception as e:
        print(f"\n💥 保存失败: {e}")
        # Signal the failure to the calling shell instead of exiting with
        # status 0; the finally clause below still closes the browser first.
        raise SystemExit(1) from e
    finally:
        saver.quit()