import base64
import logging
import os
import time
from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService

# Logging configuration: every record goes both to the console and to
# pdf_saver.log (opened as UTF-8 so the Chinese messages are stored intact).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('pdf_saver.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)

# chromedriver location used when the caller does not pass ``driver_path``.
DEFAULT_CHROMEDRIVER_PATH = "D:/chromedriver.exe"

# JavaScript injected into every new document to hide automation traits.
# - Wrapped in an IIFE so its ``const`` bindings never collide with page
#   globals or with a second evaluation in the same document.
# - The native ``permissions.query`` is captured (and bound to its owner)
#   BEFORE it is overridden; capturing it afterwards would make the override
#   call itself and recurse without end.
_STEALTH_SCRIPT = '''
(() => {
    delete navigator.__proto__.webdriver;
    const permissions = window.navigator.permissions;
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) => (
        parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery(parameters)
    );
})();
'''


class PDFSaver:
    """Save web pages as PDF files through a Selenium-driven Chrome browser."""

    def __init__(self, headless=True, driver_path=DEFAULT_CHROMEDRIVER_PATH):
        """
        Start Chrome and register the automation-hiding script.

        :param headless: run Chrome with ``--headless=new`` when true
        :param driver_path: path to the chromedriver executable; when the
            file does not exist, Selenium's default driver resolution is
            used instead
        """
        logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
        # Defined up front so quit() is safe even if start-up fails below.
        self.driver = None

        if driver_path and os.path.isfile(driver_path):
            service = ChromeService(executable_path=driver_path)
        else:
            logger.warning(f"未找到 chromedriver: {driver_path},改由 Selenium 自动查找驱动")
            service = ChromeService()

        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"

        # Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.page_load_strategy = 'eager'
        # Note: printing to PDF does not need --save-page-as-mhtml

        self.driver = webdriver.Chrome(service=service, options=chrome_options)

        # Register the stealth script exactly once per browser session;
        # registering it on every save would stack duplicate copies.
        try:
            self.driver.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': _STEALTH_SCRIPT},
            )
        except Exception:
            # Do not leave a Chrome process behind if set-up cannot finish.
            self.quit()
            raise

    @staticmethod
    def _default_output_path(url):
        """Derive a file-system-safe ``<name>.pdf`` from the host of *url*."""
        # ``hostname`` excludes port and userinfo, which ``netloc`` includes.
        host = urlparse(url).hostname or ''
        if host.startswith('www.'):
            host = host[len('www.'):]
        label = host.split('.')[0]
        # Replace anything that is unsafe in a file name (e.g. ':' in IPv6).
        safe = ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in label)
        return f"{safe or 'page'}.pdf"

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
        """
        Save a web page as a PDF file.

        :param url: URL of the page to print
        :param output_path: destination path (.pdf); derived from the URL's
            host name when omitted, and ``.pdf`` is appended when missing
        :param timeout: page-load timeout in seconds
        :param wait_time: seconds to wait after loading so that dynamic
            content can render
        :param print_options: optional overrides for the PDF print options, see
            https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
        :return: absolute path of the saved file
        :raises RuntimeError: if the browser is closed or the PDF is empty
        """
        if output_path is None:
            output_path = self._default_output_path(url)
        output_path = os.fspath(output_path)
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        output_path = os.path.abspath(output_path)

        # Default print options (caller-supplied values take precedence)
        pdf_options = {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,    # A4 width in inches
            'paperHeight': 11.69,  # A4 height in inches
        }
        if print_options:
            pdf_options.update(print_options)

        try:
            if self.driver is None:
                raise RuntimeError("浏览器已关闭,无法保存 PDF")

            self.driver.set_page_load_timeout(timeout)

            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)

            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)

            logger.info("正在生成 PDF...")
            result = self.driver.execute_cdp_cmd('Page.printToPDF', pdf_options)

            # result['data'] is the Base64-encoded PDF
            pdf_data = base64.b64decode(result['data'])
            # Check before writing so a failed run leaves no empty file behind.
            if not pdf_data:
                raise RuntimeError("生成了空文件")

            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with open(output_path, 'wb') as f:
                f.write(pdf_data)

            file_size = len(pdf_data)
            logger.info(f"✅ PDF 保存成功: {output_path} (大小: {file_size} 字节)")
            return output_path

        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        """Close the browser; safe to call repeatedly or after a failed start."""
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
            finally:
                # Drop the reference so a second call is a no-op.
                self.driver = None
            logger.info("浏览器已关闭")


# ===== Test entry point =====
if __name__ == "__main__":
    test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"

    saver = PDFSaver(headless=True)
    try:
        output_file = saver.save_as_pdf(
            url=test_url,
            output_path="example.pdf",
            timeout=30,
            wait_time=5
        )
        print(f"\n🎉 成功保存 PDF 文件: {output_file}")
    except Exception as e:
        print(f"\n💥 保存失败: {e}")
    finally:
        saver.quit()