202 lines
7.2 KiB
Python
202 lines
7.2 KiB
Python
|
|
import base64
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
import time
|
|||
|
|
from urllib.parse import urlparse
|
|||
|
|
|
|||
|
|
from selenium import webdriver
|
|||
|
|
from selenium.common.exceptions import (
|
|||
|
|
WebDriverException,
|
|||
|
|
TimeoutException,
|
|||
|
|
SessionNotCreatedException,
|
|||
|
|
InvalidSessionIdException
|
|||
|
|
)
|
|||
|
|
from selenium.webdriver.chrome.options import Options
|
|||
|
|
|
|||
|
|
# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)

# Install a default root handler at INFO level so progress messages are
# visible when the module is run as a script.
logging.basicConfig(level=logging.INFO)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class RemotePDFSaver:
    """Save web pages as PDF files through a remote Selenium Chrome session.

    The instance owns one remote WebDriver session. ``save_as_pdf`` loads a
    URL, asks Chrome to print it through the DevTools ``Page.printToPDF``
    command and writes the decoded bytes to a local file. WebDriver failures
    trigger a session rebuild and a retry.

    The object can be used as a context manager; the session is closed on
    exit::

        with RemotePDFSaver(remote_url=...) as saver:
            saver.save_as_pdf("https://example.com")
    """

    # Number of attempts and pause (seconds) used when creating a session.
    _SESSION_ATTEMPTS = 3
    _SESSION_RETRY_DELAY = 2

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2,
        print_options=None
    ):
        """Create the saver and immediately open a remote session.

        :param remote_url: URL of the remote Selenium server.
        :param headless: Whether Chrome is started in headless mode.
        :param max_retries: Maximum number of retries for one save operation
            (the operation is attempted ``max_retries + 1`` times in total).
        :param retry_delay: Seconds to wait before a retry.
        :param print_options: Parameters passed to ``Page.printToPDF``
            (see the Chrome DevTools Protocol). A falsy value selects the
            A4 portrait defaults below.
        :raises RuntimeError: If no remote session can be created.
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.print_options = print_options or {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,  # A4 width (inches)
            'paperHeight': 11.69,  # A4 height (inches)
        }
        self.driver = None
        self._init_driver()

    def __enter__(self):
        """Return the saver itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the session on leaving the ``with`` block; never suppress errors."""
        self.quit()
        return False

    def _build_chrome_options(self):
        """Build a fresh Chrome ``Options`` object (called for every new session)."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    def _discard_driver(self):
        """Quit the current driver, if any, and always reset ``self.driver``.

        :return: ``True`` when a driver existed and ``quit()`` succeeded,
            ``False`` otherwise. Failures while quitting are ignored because
            the session is being thrown away anyway.
        """
        # getattr: the attribute may be missing on a partially built instance.
        driver = getattr(self, 'driver', None)
        self.driver = None
        if not driver:
            return False
        try:
            driver.quit()
        except Exception:
            return False
        return True

    def _init_driver(self):
        """Create (or re-create) the remote WebDriver session.

        Any existing session is discarded first. A session that was created
        but could not be fully set up is quit before the next attempt, so
        failed attempts do not leave sessions open on the server. On failure
        ``self.driver`` is left as ``None``.

        :raises RuntimeError: If every attempt fails; the last error is
            attached as ``__cause__``.
        """
        self._discard_driver()

        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        attempts = self._SESSION_ATTEMPTS
        for attempt in range(1, attempts + 1):
            try:
                self.driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject an anti-detection script into every new document.
                # NOTE(review): execute_cdp_cmd is provided by Selenium's
                # Chromium driver classes; confirm the installed Selenium
                # version also exposes it on webdriver.Remote.
                self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
                logger.info("✅ 远程 WebDriver 会话创建成功")
                return
            except Exception as e:
                logger.warning(f"创建 WebDriver 失败 (尝试 {attempt}/{attempts}): {e}")
                # Release a half-initialised session before retrying.
                self._discard_driver()
                if attempt >= attempts:
                    raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}") from e
                time.sleep(self._SESSION_RETRY_DELAY)

    @staticmethod
    def _resolve_output_path(url, output_path=None):
        """Return the PDF path to write for ``url``.

        When ``output_path`` is ``None`` the name is derived from the first
        label of the URL's host name (a leading ``www.`` is dropped, port and
        credentials are ignored), falling back to ``page``. A ``.pdf`` suffix
        is appended when missing. ``os.PathLike`` values are accepted.
        """
        if output_path is None:
            host = urlparse(url).hostname or ''
            if host.startswith('www.'):
                host = host[len('www.'):]
            stem = host.split('.')[0] or 'page'
            return f"{stem}.pdf"
        output_path = os.fspath(output_path)
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        return output_path

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5):
        """Save a web page as a PDF, retrying and rebuilding the session as needed.

        :param url: Page to load.
        :param output_path: Destination file; derived from the URL's host
            name when ``None``. ``.pdf`` is appended when missing.
        :param timeout: Page-load timeout in seconds.
        :param wait_time: Seconds to sleep after the page load before
            printing.
        :return: Absolute path of the written PDF.
        :raises RuntimeError: If the page could not be saved; the last error
            is attached as ``__cause__``. A failure to rebuild the session
            between retries also surfaces as ``RuntimeError``.
        """
        output_path = self._resolve_output_path(url, output_path)
        # Write to a sibling temp file and rename on success, so a failed run
        # never truncates or deletes a file that already existed at
        # output_path.
        tmp_path = output_path + '.part'
        total_attempts = self.max_retries + 1
        last_exception = None

        for retry in range(total_attempts):
            try:
                # Make sure there is a usable driver.
                if not self.driver:
                    self._init_driver()

                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{total_attempts}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)

                logger.info("生成 PDF...")
                result = self.driver.execute_cdp_cmd('Page.printToPDF', self.print_options)
                pdf_data = base64.b64decode(result['data'])
                if not pdf_data:
                    raise RuntimeError("生成了空文件")

                # Write the PDF locally (binary), then move it into place.
                with open(tmp_path, 'wb') as f:
                    f.write(pdf_data)
                os.replace(tmp_path, output_path)

                saved_path = os.path.abspath(output_path)
                file_size = os.path.getsize(output_path)
                logger.info(f"✅ 保存成功: {saved_path} ({file_size} 字节)")
                return saved_path

            # TimeoutException subclasses WebDriverException, so it must be
            # handled first; otherwise this branch is unreachable.
            except TimeoutException as e:
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if retry >= self.max_retries:
                    break
                time.sleep(self.retry_delay)

            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if retry >= self.max_retries:
                    logger.error("达到最大重试次数,放弃")
                    break
                logger.info("正在重建 WebDriver 会话...")
                self._init_driver()
                time.sleep(self.retry_delay)

            except Exception as e:
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break  # Not a WebDriver error: do not retry.

        # Remove a partially written temp file left by a failed attempt.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

        raise RuntimeError(
            f"保存失败({type(last_exception).__name__}): {last_exception}"
        ) from last_exception

    def quit(self):
        """Explicitly close the browser session; safe to call repeatedly."""
        if self._discard_driver():
            logger.info("WebDriver 会话已关闭")

    def __del__(self):
        """Best-effort cleanup when the object is garbage collected."""
        # A finalizer may run on a partially built object or during
        # interpreter shutdown; it must never raise.
        try:
            self.quit()
        except Exception:
            pass
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===== 测试 =====
|
|||
|
|
# ===== Manual smoke test =====
if __name__ == "__main__":
    saver = RemotePDFSaver(
        remote_url="http://144.34.185.108:28098/wd/hub",  # <- replace with your cloud server's public IP
        headless=True
    )
    try:
        saver.save_as_pdf(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.pdf"
        )
    except Exception as e:
        print(f"❌ 失败: {e}")
    finally:
        # Always release the remote session, including on KeyboardInterrupt
        # or SystemExit, which `except Exception` does not catch.
        saver.quit()
|