osc/research/pdf_downloader/save_remote_as_mhtml.py
2025-12-26 08:54:58 +08:00

191 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
TimeoutException,
SessionNotCreatedException,
InvalidSessionIdException
)
from selenium.webdriver.chrome.options import Options
# Configure root logging at INFO level as soon as this module is imported.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by RemoteMHTMLSaver for progress and error messages.
logger = logging.getLogger(__name__)
class RemoteMHTMLSaver:
    """Save web pages as MHTML snapshots through a remote Selenium Chrome session.

    The saver owns a single remote WebDriver session, created in ``__init__``
    and rebuilt when a WebDriver error occurs during a save. Call ``quit()``
    (or use the instance as a context manager) to release the remote session.
    """

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2
    ):
        """Store the configuration and open the first remote session.

        :param remote_url: URL of the remote Selenium endpoint.
        :param headless: whether Chrome is started in headless mode.
        :param max_retries: maximum number of retries per save operation
            (the operation is attempted ``max_retries + 1`` times in total).
        :param retry_delay: seconds to wait before a retry.
        :raises RuntimeError: if no remote session could be created.
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.driver = None
        self._init_driver()

    def __enter__(self):
        """Return the saver itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the remote session on leaving the ``with`` block."""
        self.quit()
        return False

    def _build_chrome_options(self):
        """Build a fresh Chrome ``Options`` object (one per session)."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    @staticmethod
    def _quit_quietly(driver):
        """Best-effort ``driver.quit()``; a dead session must not abort cleanup."""
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            pass  # the session is being discarded anyway

    def _init_driver(self):
        """Create (or re-create) the remote WebDriver session.

        Any existing session is closed first. Creation is attempted up to
        three times with a two-second pause between attempts. A session whose
        setup fails half-way is closed before the next attempt so it does not
        leak on the remote server.

        :raises RuntimeError: if all attempts fail; ``self.driver`` is then
            ``None``.
        """
        self._quit_quietly(self.driver)
        self.driver = None

        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        last_error = None
        for attempt in range(3):
            candidate = None
            try:
                candidate = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject the anti-detection script into every new document.
                # NOTE(review): execute_cdp_cmd is a Chromium-driver API;
                # confirm the installed Selenium exposes it on webdriver.Remote.
                candidate.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
            except Exception as e:
                # Do not leave a half-initialised session open remotely.
                self._quit_quietly(candidate)
                last_error = e
                logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
                if attempt < 2:
                    time.sleep(2)
                continue
            self.driver = candidate
            logger.info("✅ 远程 WebDriver 会话创建成功")
            return
        raise RuntimeError(f"无法连接到远程 Selenium 服务: {last_error}") from last_error

    @staticmethod
    def _default_output_path(url):
        """Derive ``<first-host-label>.mhtml`` from *url* (``page.mhtml`` if no host).

        Only a leading ``www.`` is stripped, and the port / credentials part
        of the network location is ignored so it cannot end up in a filename.
        """
        host = urlparse(url).hostname or ''
        if host.startswith('www.'):
            host = host[len('www.'):]
        stem = host.split('.')[0] or 'page'
        return f"{stem}.mhtml"

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """Load *url* and save it as an MHTML file, retrying on driver errors.

        :param url: page to capture.
        :param output_path: target file; derived from the URL host when
            ``None``. A ``.mhtml`` suffix is appended if missing.
        :param timeout: page-load timeout in seconds.
        :param wait_time: seconds to wait after loading before the snapshot.
        :return: absolute path of the written file.
        :raises RuntimeError: if the page could not be saved, or if the
            session could not be rebuilt between retries.
        """
        if output_path is None:
            output_path = self._default_output_path(url)
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'

        last_exception = None
        for retry in range(self.max_retries + 1):
            try:
                # Make sure there is a driver to work with.
                if not self.driver:
                    self._init_driver()
                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)
                logger.info("生成 MHTML 快照...")
                result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
                mhtml_content = result['data']
                # Write the snapshot locally; newline='' keeps its line endings as-is.
                with open(output_path, 'w', encoding='utf-8', newline='') as f:
                    f.write(mhtml_content)
                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    raise RuntimeError("生成了空文件")
                saved_path = os.path.abspath(output_path)
                logger.info(f"✅ 保存成功: {saved_path} ({file_size} 字节)")
                return saved_path
            except TimeoutException as e:
                # Must come before WebDriverException: TimeoutException is a
                # subclass, so the other order makes this branch unreachable.
                # A timeout retries on the same session without rebuilding it.
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    break
            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    logger.info("正在重建 WebDriver 会话...")
                    self._init_driver()
                    time.sleep(self.retry_delay)
                else:
                    logger.error("达到最大重试次数,放弃")
                    break
            except Exception as e:
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break  # not a WebDriver error: do not retry

        # Every attempt failed: drop any leftover output file before reporting.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(
            f"保存失败({type(last_exception).__name__}: {last_exception}"
        ) from last_exception

    def quit(self):
        """Explicitly close the browser session; safe to call more than once."""
        # getattr: the attribute may be missing on a partially built instance.
        driver = getattr(self, 'driver', None)
        self.driver = None
        if not driver:
            return
        try:
            driver.quit()
            logger.info("WebDriver 会话已关闭")
        except Exception:
            pass  # best effort: the session may already be gone

    def __del__(self):
        """Last-resort cleanup when the saver is garbage-collected."""
        try:
            self.quit()
        except Exception:
            pass  # never raise from a finalizer (e.g. during interpreter shutdown)
# ===== Manual smoke test =====
if __name__ == "__main__":
    saver = None
    try:
        # Creating the saver connects to the remote server, so it lives inside
        # the try block: a connection failure is reported like any other error.
        saver = RemoteMHTMLSaver(
            remote_url="http://144.34.185.108:28098/wd/hub",  # <- replace with your cloud server's public IP
            headless=True
        )
        saver.save_as_mhtml(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.mhtml"
        )
    except Exception as e:
        print(f"❌ 失败: {e}")
    finally:
        # Release the remote session even on KeyboardInterrupt / SystemExit.
        if saver is not None:
            saver.quit()