142 lines
5.4 KiB
Python
142 lines
5.4 KiB
Python
import logging
|
||
import os
|
||
import time
|
||
from urllib.parse import urlparse
|
||
|
||
from selenium import webdriver
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||
|
||
# Logging setup: every record goes both to the console (stderr) and to a
# UTF-8 encoded log file in the current working directory.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(filename='mhtml_saver.log', encoding='utf-8'),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by everything below.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class MHTMLSaver:
    """Save web pages as single-file MHTML archives through Chrome.

    A Chrome WebDriver session is started on construction. Pages are loaded
    with ``driver.get`` and serialized with the Chrome DevTools Protocol
    command ``Page.captureSnapshot``. Call :meth:`quit` (or use the instance
    as a context manager) to close the browser.
    """

    # Historical hard-coded chromedriver location, kept as the default so
    # existing callers behave exactly as before.
    _DEFAULT_DRIVER_PATH = "C:/Program Files/Python38/chromedriver.exe"

    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
    )

    # Script evaluated in every new document before page scripts run.
    # It is wrapped in an IIFE so no helper names leak into the page, and it
    # captures the original permissions.query *before* overriding it (the
    # previous version referenced an undefined `originalQuery`).
    _STEALTH_SCRIPT = '''
        (() => {
            delete navigator.__proto__.webdriver;
            const permissions = window.navigator.permissions;
            if (!permissions || !permissions.query) {
                return;
            }
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => {
                return parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters);
            };
        })();
    '''

    def __init__(self, headless=True, driver_path=_DEFAULT_DRIVER_PATH):
        """Start a Chrome session configured for MHTML capture.

        :param headless: run Chrome with ``--headless=new`` when true;
            show a regular browser window when false.
        :param driver_path: path to the chromedriver executable. Pass
            ``None`` to let Selenium locate a driver on its own.
        """
        # Set first so quit() is safe even if the browser fails to start.
        self.driver = None
        self._stealth_installed = False

        logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
        if driver_path:
            service = ChromeService(executable_path=driver_path)
        else:
            service = ChromeService()

        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        for argument in (
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            f'--user-agent={self._USER_AGENT}',
            '--save-page-as-mhtml',
            '--lang=zh-CN',
            '--window-size=1920,1080',
            # Hide the Blink-level automation marker.
            '--disable-blink-features=AutomationControlled',
        ):
            chrome_options.add_argument(argument)

        # Drop the "controlled by automated software" switch and extension.
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })

        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def __enter__(self):
        """Return the saver itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser when leaving a ``with`` block."""
        self.quit()

    @staticmethod
    def _resolve_output_path(url, output_path=None):
        """Return the target file name, always ending in ``.mhtml``.

        When *output_path* is ``None`` the name is derived from the first
        label of the URL's host name (a leading ``www.`` is dropped), falling
        back to ``page``. The host name is used rather than ``netloc`` so
        ports and credentials never end up in the file name.
        """
        if output_path is None:
            host = urlparse(url).hostname or ''
            if host.startswith('www.'):
                host = host[len('www.'):]
            output_path = f"{host.split('.')[0] or 'page'}.mhtml"
        else:
            # Accept os.PathLike objects as well as plain strings.
            output_path = os.fspath(output_path)

        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        return output_path

    def _install_stealth_script(self):
        """Register the anti-detection script once per browser session."""
        if getattr(self, '_stealth_installed', False):
            return
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': self._STEALTH_SCRIPT
        })
        self._stealth_installed = True

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """Load *url* and save it as an MHTML file.

        :param url: address of the page to capture
        :param output_path: destination file; ``.mhtml`` is appended when
            missing, and a name derived from the URL host is used when None
        :param timeout: page-load timeout in seconds
        :param wait_time: seconds to sleep after loading so that dynamic
            content can finish rendering
        :return: absolute path of the saved file
        :raises RuntimeError: if Chrome returns an empty snapshot
        :raises Exception: any WebDriver or I/O error is logged and re-raised
        """
        output_path = self._resolve_output_path(url, output_path)

        try:
            self.driver.set_page_load_timeout(timeout)

            # Must be registered before navigation so it applies to the page.
            self._install_stealth_script()

            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)

            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)

            logger.info("正在生成 MHTML 快照...")
            result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})

            # 'data' holds the serialized MHTML document as plain text
            # (it is not Base64), so it is written out as-is.
            mhtml_content = result['data']
            if not mhtml_content:
                # Fail before writing so no zero-byte file is left behind.
                raise RuntimeError("生成了空文件")

            absolute_path = os.path.abspath(output_path)
            os.makedirs(os.path.dirname(absolute_path), exist_ok=True)

            # newline='' keeps the CRLF line endings of the MHTML untouched.
            with open(absolute_path, 'w', encoding='utf-8', newline='') as f:
                f.write(mhtml_content)

            file_size = os.path.getsize(absolute_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")

            logger.info(f"✅ MHTML 保存成功: {absolute_path} (大小: {file_size} 字节)")
            return absolute_path

        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        """Close the browser; safe to call repeatedly or after a failed init."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            driver.quit()
        finally:
            # Forget the session even if quitting failed, so a second call
            # does not talk to a dead driver.
            self.driver = None
        logger.info("浏览器已关闭")
|
||
|
||
|
||
# ===== Demo entry point =====
if __name__ == "__main__":
    # Sample page to archive; replace it with any URL you like.
    demo_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
    capture_kwargs = {
        "url": demo_url,
        "output_path": "example.mhtml",
        "timeout": 30,
        "wait_time": 5,
    }

    archiver = MHTMLSaver(headless=True)
    try:
        saved_path = archiver.save_as_mhtml(**capture_kwargs)
        print(f"\n🎉 成功保存 MHTML 文件: {saved_path}")
    except Exception as err:
        # Report the failure instead of crashing the demo.
        print(f"\n💥 保存失败: {err}")
    finally:
        # Always release the browser, whether or not the capture succeeded.
        archiver.quit()
|