# Python source (file-viewer listing header: 191 lines, 6.8 KiB).
import contextlib
import logging
import os
import time
from urllib.parse import urlparse

from selenium import webdriver
from selenium.common.exceptions import (
    InvalidSessionIdException,
    SessionNotCreatedException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options

# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)

# Configure the root logger at INFO level on import. ``basicConfig`` does
# nothing when the root logger already has handlers installed.
logging.basicConfig(level=logging.INFO)


class RemoteMHTMLSaver:
    """Save web pages as MHTML snapshots through a remote Selenium Chrome session.

    A WebDriver session is created eagerly in ``__init__``.
    :meth:`save_as_mhtml` retries failed captures and rebuilds the session
    when a WebDriver error (other than a page-load timeout) occurs.
    """

    # Attempts and pause (seconds) used when creating the remote session.
    _SESSION_ATTEMPTS = 3
    _SESSION_RETRY_DELAY = 2

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2
    ):
        """
        Initialise the saver and open the remote WebDriver session.

        :param remote_url: URL of the remote Selenium endpoint
        :param headless: whether to start Chrome in headless mode
        :param max_retries: maximum number of retries per save operation
        :param retry_delay: seconds to wait before each retry
        :raises RuntimeError: if the remote session cannot be created
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.driver = None
        self._init_driver()

    def _build_chrome_options(self):
        """Build a fresh Chrome ``Options`` object (a new one per session)."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    def _close_driver(self):
        """Quit the current session (best effort) and forget it.

        ``self.driver`` is always reset to ``None`` so a dead session is never
        reused.

        :return: ``True`` if a session existed and ``quit()`` succeeded
        """
        driver = getattr(self, "driver", None)
        self.driver = None
        if not driver:
            return False
        try:
            driver.quit()
        except Exception as exc:
            # Closing is best effort: the session may already be gone.
            logger.debug("Ignoring error while closing WebDriver session: %s", exc)
            return False
        return True

    def _init_driver(self):
        """Create (or re-create) the remote WebDriver session.

        Any existing session is closed first. Creation is attempted up to
        ``_SESSION_ATTEMPTS`` times; a session that was opened but failed
        during setup is closed before the next attempt so it does not leak
        on the remote grid.

        :raises RuntimeError: if every attempt fails
        """
        self._close_driver()

        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        for attempt in range(1, self._SESSION_ATTEMPTS + 1):
            try:
                self.driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject an anti-detection script into every new document.
                # NOTE(review): confirm the installed Selenium version exposes
                # execute_cdp_cmd on webdriver.Remote.
                self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
                logger.info("✅ 远程 WebDriver 会话创建成功")
                return
            except Exception as e:
                logger.warning(
                    f"创建 WebDriver 失败 (尝试 {attempt}/{self._SESSION_ATTEMPTS}): {e}"
                )
                # Release a half-initialised session before retrying or failing.
                self._close_driver()
                if attempt >= self._SESSION_ATTEMPTS:
                    raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}") from e
                time.sleep(self._SESSION_RETRY_DELAY)

    @staticmethod
    def _resolve_output_path(url, output_path=None):
        """Return the target file name, always ending in ``.mhtml``.

        When ``output_path`` is ``None`` the name is derived from the first
        label of the URL's network location (with ``www.`` removed), falling
        back to ``page`` when the URL has no network location.
        """
        if output_path is None:
            domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.mhtml"
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        return output_path

    def _capture_once(self, url, output_path, tmp_path, timeout, wait_time):
        """Load ``url`` once and write its MHTML snapshot to ``output_path``.

        The snapshot is first written to ``tmp_path`` and only moved onto
        ``output_path`` once it is known to be non-empty, so a failed attempt
        never clobbers an existing file.

        :return: absolute path of the written file
        :raises RuntimeError: if the snapshot is empty
        """
        # Make sure there is a session to work with.
        if not self.driver:
            self._init_driver()

        self.driver.set_page_load_timeout(timeout)
        self.driver.get(url)
        # Fixed pause after load, before taking the snapshot.
        time.sleep(wait_time)

        logger.info("生成 MHTML 快照...")
        result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
        mhtml_content = result['data']

        # newline='' keeps the snapshot's own line endings untouched.
        with open(tmp_path, 'w', encoding='utf-8', newline='') as f:
            f.write(mhtml_content)

        file_size = os.path.getsize(tmp_path)
        if file_size == 0:
            raise RuntimeError("生成了空文件")

        os.replace(tmp_path, output_path)
        saved_path = os.path.abspath(output_path)
        logger.info(f"✅ 保存成功: {saved_path} ({file_size} 字节)")
        return saved_path

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """
        Save a web page as MHTML, with automatic retry and session rebuild.

        :param url: page to load
        :param output_path: target file; derived from the URL when ``None``,
            and ``.mhtml`` is appended if missing
        :param timeout: page-load timeout in seconds
        :param wait_time: seconds to wait after loading before the snapshot
        :return: absolute path of the saved file
        :raises RuntimeError: if the page could not be saved; the underlying
            error is attached as ``__cause__``
        """
        output_path = self._resolve_output_path(url, output_path)
        tmp_path = f"{output_path}.part"
        total_attempts = self.max_retries + 1
        last_exception = None

        try:
            for retry in range(total_attempts):
                can_retry = retry < self.max_retries
                try:
                    logger.info(f"[{retry + 1}/{total_attempts}] 加载页面: {url}")
                    return self._capture_once(url, output_path, tmp_path, timeout, wait_time)

                # TimeoutException subclasses WebDriverException, so it must
                # be handled first or this branch can never run.
                except TimeoutException as e:
                    last_exception = e
                    logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                    if not can_retry:
                        break
                    time.sleep(self.retry_delay)

                except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                    last_exception = e
                    logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                    if not can_retry:
                        logger.error("达到最大重试次数,放弃")
                        break
                    logger.info("正在重建 WebDriver 会话...")
                    self._init_driver()
                    time.sleep(self.retry_delay)

                except Exception as e:
                    last_exception = e
                    logger.error(f"未知错误 (retry {retry + 1}): {e}")
                    break  # not a WebDriver error: do not retry
        finally:
            # Remove any partial snapshot; on success it was already moved away.
            with contextlib.suppress(OSError):
                os.remove(tmp_path)

        raise RuntimeError(
            f"保存失败({type(last_exception).__name__}): {last_exception}"
        ) from last_exception

    def quit(self):
        """Explicitly close the browser session (safe to call repeatedly)."""
        if self._close_driver():
            logger.info("WebDriver 会话已关闭")

    def __del__(self):
        # Finalisers can run on partially built instances or during
        # interpreter shutdown; never let cleanup raise from here.
        try:
            self.quit()
        except Exception:
            pass


# ===== Manual smoke test =====
if __name__ == "__main__":
    saver = RemoteMHTMLSaver(
        remote_url="http://144.34.185.108:28098/wd/hub",  # <- replace with your cloud server's public IP
        headless=True
    )
    try:
        saver.save_as_mhtml(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.mhtml"
        )
    except Exception as e:
        print(f"❌ 失败: {e}")
    finally:
        # Always release the remote session, including on Ctrl-C / SystemExit,
        # which the ``except Exception`` above does not catch.
        saver.quit()