osc/research/pdf_downloader/save_page_as_pdf.py
2026-01-19 09:17:10 +08:00

146 lines
5.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import base64
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# Logging setup, executed once at import time: records at INFO level and above
# are sent both to the console (StreamHandler's default stream) and to a
# UTF-8 encoded "pdf_saver.log" file in the current working directory.
_console_handler = logging.StreamHandler()
_file_handler = logging.FileHandler('pdf_saver.log', encoding='utf-8')
logging.basicConfig(
    handlers=[_console_handler, _file_handler],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
class PDFSaver:
    """Render web pages to PDF files through a Selenium-driven Chrome browser.

    One Chrome instance is started in the constructor and reused for every
    ``save_as_pdf`` call until ``quit`` is called. The class also works as a
    context manager, which closes the browser on exit::

        with PDFSaver() as saver:
            saver.save_as_pdf("https://example.com")
    """

    # chromedriver location used when the caller does not supply one
    # (kept as the default for backward compatibility).
    DEFAULT_DRIVER_PATH = "D:/chromedriver.exe"

    # Baseline arguments for the CDP ``Page.printToPDF`` command. Paper size
    # is A4 in inches; because ``preferCSSPageSize`` is on, a page size
    # declared by the page's own CSS takes precedence over these dimensions.
    _DEFAULT_PRINT_OPTIONS = {
        'landscape': False,
        'displayHeaderFooter': False,
        'printBackground': True,
        'preferCSSPageSize': True,
        'paperWidth': 8.27,
        'paperHeight': 11.69,
    }

    # JavaScript evaluated in every new document before page scripts run, to
    # hide common automation fingerprints. It is wrapped in an IIFE so no
    # helper name leaks into the page's global scope. The original
    # ``permissions.query`` is captured (and bound) BEFORE it is replaced;
    # capturing it afterwards would make the override call itself forever.
    _STEALTH_SCRIPT = '''
        (() => {
            delete navigator.__proto__.webdriver;
            const permissions = window.navigator.permissions;
            if (!permissions || !permissions.query) {
                return;
            }
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => (
                parameters && parameters.name === 'notifications'
                    ? Promise.resolve({ state: Notification.permission })
                    : originalQuery(parameters)
            );
        })();
    '''

    def __init__(self, headless=True, driver_path=DEFAULT_DRIVER_PATH):
        """Start Chrome and register the anti-detection script.

        :param headless: run Chrome without a visible window when true.
        :param driver_path: path to the chromedriver executable. Pass ``None``
            (or an empty string) to let Selenium locate a driver by itself.
        :raises Exception: whatever Selenium raises when the browser cannot be
            started; if the script registration fails the browser is closed
            before the error propagates.
        """
        # Set first so quit() is always safe, even on a half-built instance.
        self.driver = None

        logger.info("正在初始化 Chrome WebDriver自动匹配版本...")
        if driver_path:
            service = ChromeService(executable_path=driver_path)
        else:
            service = ChromeService()
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"

        # Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.page_load_strategy = 'eager'
        # Note: printing to PDF does not need --save-page-as-mhtml.

        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            # Registered once per browser: the script then applies to every
            # document loaded later, so save_as_pdf() need not re-register it
            # (re-registering on each call would stack duplicate scripts).
            self.driver.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': self._STEALTH_SCRIPT},
            )
        except Exception:
            # Do not leave an orphaned browser process behind.
            self.quit()
            raise

    def __enter__(self):
        """Return the saver itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser when the ``with`` block ends; never suppresses errors."""
        self.quit()
        return False

    @staticmethod
    def _resolve_output_path(url, output_path=None):
        """Return the PDF file name/path to write for *url*.

        Without an explicit *output_path* the name is the first label of the
        URL's host name with a leading ``www.`` removed (``page`` when the URL
        has no host). The host name is used rather than the raw network
        location so that ports and credentials never end up in the file name.
        An explicit path gets ``.pdf`` appended unless it already ends with it
        (case-insensitively).
        """
        if output_path is None:
            host = urlparse(url).hostname or ''
            if host.startswith('www.'):
                host = host[len('www.'):]
            return f"{host.split('.')[0] or 'page'}.pdf"

        output_path = os.fspath(output_path)
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        return output_path

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
        """Load a web page and save it as a PDF file.

        :param url: URL of the page to render.
        :param output_path: destination path (``.pdf`` is appended when
            missing). Defaults to a name derived from the URL's host name.
            Missing parent directories are created.
        :param timeout: page-load timeout in seconds.
        :param wait_time: seconds to sleep after loading so that dynamic
            content can finish rendering.
        :param print_options: optional overrides for the PDF print options,
            see https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
        :return: absolute path of the saved file.
        :raises RuntimeError: if the browser was already closed or Chrome
            produced an empty PDF.
        :raises Exception: any Selenium or I/O error is logged and re-raised.
        """
        if self.driver is None:
            raise RuntimeError("浏览器已关闭，无法保存 PDF")

        target_path = os.path.abspath(self._resolve_output_path(url, output_path))
        # Merge into a fresh dict so the class-level defaults are never mutated.
        pdf_options = {**self._DEFAULT_PRINT_OPTIONS, **(print_options or {})}

        try:
            self.driver.set_page_load_timeout(timeout)

            logger.info("正在加载页面: %s", url)
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)

            logger.info("等待 %s 秒以确保页面完全渲染...", wait_time)
            time.sleep(wait_time)

            logger.info("正在生成 PDF...")
            result = self.driver.execute_cdp_cmd('Page.printToPDF', pdf_options)
            # result['data'] is the Base64-encoded PDF document.
            pdf_data = base64.b64decode(result['data'])
            # Validate before touching the disk so no zero-byte file is left behind.
            if not pdf_data:
                raise RuntimeError("生成了空文件")

            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            with open(target_path, 'wb') as f:
                f.write(pdf_data)

            logger.info("✅ PDF 保存成功: %s (大小: %s 字节)", target_path, len(pdf_data))
            return target_path
        except Exception as e:
            logger.error("❌ 保存失败: %s", e)
            raise

    def quit(self):
        """Close the browser if it is still open; safe to call more than once."""
        # Detach the reference first so a second call (or a failure inside
        # driver.quit()) never tries to close the same browser again.
        driver, self.driver = getattr(self, 'driver', None), None
        if driver is not None:
            driver.quit()
            logger.info("浏览器已关闭")
# ===== Manual test entry point =====
def main(argv=None):
    """Save one page as a PDF and report the outcome on stdout.

    :param argv: optional argument list ``[url, output_path]``; defaults to
        the process command-line arguments. Missing items fall back to the
        built-in test URL and ``example.pdf``.
    :return: process exit status, 0 on success and 1 when saving failed, so
        that shells and CI jobs can detect a failed run.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    test_url = args[0] if args else "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
    output_path = args[1] if len(args) > 1 else "example.pdf"

    saver = PDFSaver(headless=True)
    try:
        output_file = saver.save_as_pdf(
            url=test_url,
            output_path=output_path,
            timeout=30,
            wait_time=5,
        )
    except Exception as e:
        print(f"\n💥 保存失败: {e}")
        return 1
    else:
        print(f"\n🎉 成功保存 PDF 文件: {output_file}")
        return 0
    finally:
        # Always release the browser, whether or not the save succeeded.
        saver.quit()


if __name__ == "__main__":
    raise SystemExit(main())