146 lines
5.4 KiB
Python
146 lines
5.4 KiB
Python
import base64
|
||
import logging
|
||
import os
|
||
import time
|
||
from urllib.parse import urlparse
|
||
|
||
from selenium import webdriver
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||
|
||
# Logging setup: every record goes both to the console and to a UTF-8
# encoded log file (pdf_saver.log) in the current working directory.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('pdf_saver.log', encoding='utf-8'),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class PDFSaver:
    """Save web pages as PDF files through a Selenium-driven Chrome browser.

    The browser is started in ``__init__`` and stays open until ``quit()`` is
    called.  The class can also be used as a context manager, in which case
    the browser is closed automatically on exit.
    """

    # Driver binary used when the caller does not pass ``driver_path``.
    DEFAULT_DRIVER_PATH = "D:/chromedriver.exe"

    # User-Agent header sent by the browser unless overridden.
    DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"

    # Baseline arguments for the CDP ``Page.printToPDF`` command; copied and
    # merged with the caller's ``print_options`` on every call.
    _DEFAULT_PRINT_OPTIONS = {
        'landscape': False,
        'displayHeaderFooter': False,
        'printBackground': True,
        'preferCSSPageSize': True,
        'paperWidth': 8.27,    # A4 width in inches
        'paperHeight': 11.69,  # A4 height in inches
    }

    # Script evaluated in every new document to hide automation hints.
    # The native ``permissions.query`` must be captured (and bound) BEFORE it
    # is replaced; capturing it afterwards makes the override call itself.
    # The IIFE keeps ``originalQuery`` out of the page's global scope.
    _STEALTH_SCRIPT = '''
        (() => {
            delete navigator.__proto__.webdriver;
            const permissions = window.navigator.permissions;
            if (!permissions || !permissions.query) {
                return;
            }
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => {
                return parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters);
            };
        })();
    '''

    def __init__(self, headless=True, driver_path=DEFAULT_DRIVER_PATH,
                 user_agent=DEFAULT_USER_AGENT):
        """
        Start Chrome and register the anti-detection script.

        :param headless: run Chrome with ``--headless=new`` when true
        :param driver_path: path to the chromedriver executable; pass ``None``
            to construct the service without a path so Selenium locates a
            driver itself
        :param user_agent: value for Chrome's ``--user-agent`` switch
        """
        # Set first so quit() is safe even if browser start-up fails.
        self.driver = None

        if driver_path:
            logger.info("正在初始化 Chrome WebDriver(驱动路径: %s)...", driver_path)
            service = ChromeService(executable_path=driver_path)
        else:
            logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
            service = ChromeService()

        # Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        for argument in (
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            f'--user-agent={user_agent}',
            '--lang=zh-CN',
            '--window-size=1920,1080',
            '--disable-blink-features=AutomationControlled',
        ):
            chrome_options.add_argument(argument)
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.page_load_strategy = 'eager'

        # Note: printing to PDF does not need --save-page-as-mhtml.
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

        # Register the stealth script once per browser session instead of on
        # every save_as_pdf() call, which stacked a duplicate copy each time.
        try:
            self.driver.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': self._STEALTH_SCRIPT},
            )
        except Exception:
            # Do not leave an orphaned browser behind if set-up fails.
            self.quit()
            raise

    def __enter__(self):
        """Return the saver itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser when the ``with`` block ends."""
        self.quit()

    @staticmethod
    def _default_output_path(url):
        """Derive ``<first host label>.pdf`` from *url*, or ``page.pdf``.

        Uses the parsed hostname (no port, no userinfo), strips one leading
        ``www.`` and replaces characters other than letters, digits, ``-``
        and ``_`` so the result is a valid file name.
        """
        host = urlparse(url).hostname or ''
        if host.startswith('www.'):
            host = host[len('www.'):]
        label = host.split('.')[0]
        stem = ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in label)
        return f"{stem or 'page'}.pdf"

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
        """
        Save a web page as a PDF file.

        :param url: URL of the page to load
        :param output_path: destination path (``.pdf`` is appended if missing);
            defaults to a name derived from the URL's host
        :param timeout: page-load timeout in seconds
        :param wait_time: seconds to sleep after loading so dynamic content can render
        :param print_options: optional overrides for the print settings, see
            https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
        :return: absolute path of the saved file
        :raises ValueError: if *url* is empty
        :raises RuntimeError: if Chrome returns an empty PDF
        """
        if not url:
            raise ValueError("url 不能为空")

        if output_path is None:
            output_path = self._default_output_path(url)
        else:
            # Accept pathlib.Path and other path-like objects as well as str.
            output_path = os.fspath(output_path)
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        target_path = os.path.abspath(output_path)

        # Start from the defaults, then apply caller overrides.
        pdf_options = dict(self._DEFAULT_PRINT_OPTIONS)
        if print_options:
            pdf_options.update(print_options)

        try:
            self.driver.set_page_load_timeout(timeout)

            logger.info("正在加载页面: %s", url)
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)

            logger.info("等待 %s 秒以确保页面完全渲染...", wait_time)
            time.sleep(wait_time)

            logger.info("正在生成 PDF...")
            result = self.driver.execute_cdp_cmd('Page.printToPDF', pdf_options)

            # result['data'] is the Base64-encoded PDF.
            pdf_data = base64.b64decode(result['data'])
            # Validate before writing so a failed run leaves no empty file.
            if not pdf_data:
                raise RuntimeError("生成了空文件")

            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            with open(target_path, 'wb') as f:
                f.write(pdf_data)

            logger.info("✅ PDF 保存成功: %s (大小: %s 字节)", target_path, len(pdf_data))
            return target_path

        except Exception as e:
            logger.error("❌ 保存失败: %s", e)
            raise

    def quit(self):
        """Close the browser if it is open; calling it again is a no-op."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Clear the reference first so a repeated call does nothing.
        self.driver = None
        driver.quit()
        logger.info("浏览器已关闭")
|
||
|
||
|
||
# ===== Test entry point =====
if __name__ == "__main__":
    # Manual smoke test: render one known page to example.pdf.
    job = {
        "url": "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html",
        "output_path": "example.pdf",
        "timeout": 30,
        "wait_time": 5,
    }

    pdf_saver = PDFSaver(headless=True)
    try:
        saved_to = pdf_saver.save_as_pdf(**job)
        print(f"\n🎉 成功保存 PDF 文件: {saved_to}")
    except Exception as exc:
        # Report the failure; the browser is still closed in ``finally``.
        print(f"\n💥 保存失败: {exc}")
    finally:
        pdf_saver.quit()
|