From 970d86ed7dfc9f21069e2a4e8a0db4d87095feb1 Mon Sep 17 00:00:00 2001 From: yuxin-pc Date: Fri, 26 Dec 2025 08:54:58 +0800 Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E7=94=A8selenium=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E7=BD=91=E9=A1=B5=E7=9A=84=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pdf_downloader/save-page-with-selenium.py | 322 ++++++++++++++++++ research/pdf_downloader/save_page_as_mhtml.py | 141 ++++++++ research/pdf_downloader/save_page_as_pdf.py | 144 ++++++++ .../pdf_downloader/save_remote_as_mhtml.py | 190 +++++++++++ research/pdf_downloader/save_remote_as_pdf.py | 201 +++++++++++ 5 files changed, 998 insertions(+) create mode 100644 research/pdf_downloader/save-page-with-selenium.py create mode 100644 research/pdf_downloader/save_page_as_mhtml.py create mode 100644 research/pdf_downloader/save_page_as_pdf.py create mode 100644 research/pdf_downloader/save_remote_as_mhtml.py create mode 100644 research/pdf_downloader/save_remote_as_pdf.py diff --git a/research/pdf_downloader/save-page-with-selenium.py b/research/pdf_downloader/save-page-with-selenium.py new file mode 100644 index 0000000..48bc560 --- /dev/null +++ b/research/pdf_downloader/save-page-with-selenium.py @@ -0,0 +1,322 @@ +import logging +import os +import queue +import threading +import time +from datetime import datetime + +import pymysql +from tqdm import tqdm + +from save_page_as_pdf import PDFSaver +from save_remote_as_mhtml import RemoteMHTMLSaver +from save_page_as_mhtml import MHTMLSaver +import tldextract + +# 配置日志 +from save_remote_as_pdf import RemotePDFSaver + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(), + logging.FileHandler('pdf_downloader.log') + ] +) +logger = logging.getLogger(__name__) + +# =============== MySQL 配置 =============== +MYSQL_CONFIG = { + 'host': '47.113.231.200', + 'port': 28089, + 'user': 'root', + 'password': 'passok123A', + 'database': 'dsp', + 'charset': 'utf8mb4', + 'autocommit': False # 手动控制事务 +} +# ========================================= + +# 配置参数 +BATCH_SIZE = 500 +MAX_WORKERS = 1 +TIMEOUT = 10 +PDF_OUTPUT_DIR = 'D:/data/output/pdf' +MIN_PDF_SIZE = 80 * 1024 # 80KB + +MHTML_OUTPUT_DIR = 'D:/data/output/mhtml' +os.makedirs(PDF_OUTPUT_DIR, exist_ok=True) + +running = True +running_interval_seconds = 15 + +remote_host_name = [ + 'epochtimes.com', + # 'secretchina.com' +] + + +class PDFDownloader: + def __init__(self): + self.db_lock = threading.Lock() + self.task_queue = queue.Queue(maxsize=MAX_WORKERS * 3) + self.processed_count = 0 + self.success_count = 0 + self.fail_count = 0 + self.small_file_count = 0 # 新增:统计小文件数量 + self.last_loadtime = self.get_last_loadtime() + self.total_rows = self.get_total_rows() + self.start_time = time.time() + self.skip_hosts = [] + self.local_handler = None + self.remote_handler = None + + # 替换 MYSQL_CONFIG 中的连接方式 + def get_db_connection(self): + return pymysql.connect( + host=MYSQL_CONFIG['host'], + port=MYSQL_CONFIG['port'], + user=MYSQL_CONFIG['user'], + password=MYSQL_CONFIG['password'], + database=MYSQL_CONFIG['database'], + charset='utf8mb4', + autocommit=False + ) + + def get_total_rows(self): + """获取总记录数""" + with self.get_db_connection() as conn: + cursor = conn.cursor() + cursor.execute( + "SELECT COUNT(*) FROM indeximos " + "WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) " + "AND es_loadtime > %s", self.last_loadtime + ) + return cursor.fetchone()[0] + + def get_last_loadtime(self): + """获取上次导出数据的时间""" + with self.get_db_connection() as conn: + cursor = conn.cursor() + cursor.execute( + "SELECT config_value FROM config " + "WHERE config_name = 'last_loadtime' " + ) + return cursor.fetchone()[0] + + def use_remote_selenium(self, url): + for host in remote_host_name: + if host in url: + return True + return False + + def format_pdf_filename(self, row): + """格式化PDF文件名""" + es_urltitle = row[2] or 'untitled' + es_urltime = str(row[3]) or '19700101_000000' + es_sitename = row[4] or 'anonymous' + + def clean_filename(text): + if not text: + return '' + invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*'] + for char in invalid_chars: + text = text.replace(char, '_') + return text.strip()[:100] + + try: + dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S') + es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S') + except: + es_urltime_fix = '19700101_000000' + + filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.pdf" + return os.path.join(PDF_OUTPUT_DIR, filename) + + def format_mhtml_filename(self, row): + """格式化PDF文件名""" + es_urltitle = row[2] or 'untitled' + es_urltime = str(row[3]) or '19700101_000000' + es_sitename = row[4] or 'anonymous' + + def clean_filename(text): + if not text: + return '' + invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*'] + for char in invalid_chars: + text = text.replace(char, '_') + return text.strip()[:100] + + try: + dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S') + es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S') + except: + es_urltime_fix = '19700101_000000' + + filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.mhtml" + return os.path.join(PDF_OUTPUT_DIR, filename) + + def fetch_data_batch(self, offset): + """分页获取数据""" + with self.get_db_connection() as conn: + cursor = conn.cursor() + cursor.execute( + "SELECT es_sid, es_urlname, es_urltitle, es_urltime, es_sitename, es_authors FROM indeximos " + "WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) " + "AND es_loadtime > %s " + "ORDER BY es_urltime LIMIT %s OFFSET %s", + (self.last_loadtime, BATCH_SIZE, offset) + ) + return cursor.fetchall() + + def update_file_status(self, es_sid, status, retry=3): + """更新数据库状态""" + for attempt in range(retry): + try: + with self.db_lock, self.get_db_connection() as conn: + cursor = conn.cursor() + cursor.execute( + "UPDATE indeximos SET es_video = %s WHERE es_sid = %s", + (status, es_sid)) + conn.commit() + return True + except Exception as e: + if attempt == retry - 1: + logger.error(f"更新数据库失败(es_sid={es_sid}): {e}") + return False + time.sleep(1) + + def extract_main_domain(self, url): + extracted = tldextract.extract(url) + # 组合注册域名(主域名) + main_domain = f"{extracted.domain}.{extracted.suffix}" + return main_domain + + def download_worker(self): + """工作线程函数""" + while True: + try: + task = self.task_queue.get(timeout=1) + if task is None: + break + + row = task + url = row[1] + if self.extract_main_domain(url) in self.skip_hosts: + self.small_file_count += 1 + self.processed_count += 1 + self.task_queue.task_done() + print(f"小文件规避,暂时跳过URL:{url}") + continue + output_file = self.format_pdf_filename(row) # 获取格式化后的文件名 + + try: + os.makedirs(os.path.dirname(output_file), exist_ok=True) + + # 调用下载函数 + if self.use_remote_selenium(url): + if self.remote_handler is None: + self.remote_handler = RemotePDFSaver() + success = self.remote_handler.save_as_pdf( + url=url, + output_path=output_file, + timeout=TIMEOUT + ) + else: + if self.local_handler is None: + self.local_handler = PDFSaver() + success = self.local_handler.save_as_pdf( + url=url, + output_path=output_file, + timeout=TIMEOUT + ) + + # 验证下载结果 + if success and os.path.exists(output_file): + file_size = os.path.getsize(output_file) + + if file_size >= MIN_PDF_SIZE: # 文件大小合格 + self.update_file_status(row[0], output_file) + self.success_count += 1 + else: # 文件太小 + self.update_file_status(row[0], '-2') + self.small_file_count += 1 + logger.warning(f"文件过小({file_size}字节): {output_file}") + try: + os.remove(output_file) + self.skip_hosts.append(self.extract_main_domain(url)) + except: + pass + else: # 下载失败 + self.update_file_status(row[0], '0') + self.fail_count += 1 + if os.path.exists(output_file): + try: + os.remove(output_file) + except: + pass + + except Exception as e: + logger.error(f"下载出现异常(es_sid={row[0]}, url={url}): {str(e)}") + self.update_file_status(row[0], '-1') + self.fail_count += 1 + + self.processed_count += 1 + self.task_queue.task_done() + + except queue.Empty: + continue + + def run(self): + """启动下载任务""" + threads = [] + + # 创建工作线程 + for _ in range(MAX_WORKERS): + t = threading.Thread(target=self.download_worker) + t.start() + threads.append(t) + + # 使用进度条显示进度 + with tqdm(total=self.total_rows, desc="处理进度", unit="条") as pbar: + offset = 0 + while True: + batch = self.fetch_data_batch(offset) + if not batch: + break + + for row in batch: + self.task_queue.put(row) + + pbar.update(len(batch)) + pbar.set_postfix({ + '成功': self.success_count, + '失败': self.fail_count, + '小文件': self.small_file_count, + '速度': f"{self.processed_count / (time.time() - self.start_time):.1f}条/秒" + }) + + offset += BATCH_SIZE + + self.task_queue.join() + + for _ in range(MAX_WORKERS): + self.task_queue.put(None) + + for t in threads: + t.join() + + total_time = time.time() - self.start_time + print(f"\n处理完成! 总计: {self.total_rows}条") + print(f"成功: {self.success_count}条, 失败: {self.fail_count}条, 小文件: {self.small_file_count}条") + print(f"总耗时: {total_time:.2f}秒, 平均速度: {self.total_rows / total_time:.2f}条/秒") + + +if __name__ == "__main__": + while running: + print(f"开始处理,总记录数: {PDFDownloader().get_total_rows()}") + downloader = PDFDownloader() + downloader.run() + print(f"运行完成,暂停{running_interval_seconds}秒后开始下一次运行...") + time.sleep(running_interval_seconds) diff --git a/research/pdf_downloader/save_page_as_mhtml.py b/research/pdf_downloader/save_page_as_mhtml.py new file mode 100644 index 0000000..21607c3 --- /dev/null +++ b/research/pdf_downloader/save_page_as_mhtml.py @@ -0,0 +1,141 @@ +import logging +import os +import time +from urllib.parse import urlparse + +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService + +# 配置日志 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(), + logging.FileHandler('mhtml_saver.log', encoding='utf-8') + ] +) +logger = logging.getLogger(__name__) + + +class MHTMLSaver: + def __init__(self, headless=True): + logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...") + service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe") + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75" + + # Chrome 选项 + chrome_options = Options() + chrome_options.add_argument('--headless=new') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument(f'--user-agent={user_agent}') + chrome_options.add_argument('--save-page-as-mhtml') # 启用 MHTML 支持 + chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) + chrome_options.add_experimental_option('useAutomationExtension', False) + chrome_options.add_argument('--lang=zh-CN') + chrome_options.add_experimental_option('prefs', { + 'intl.accept_languages': 'zh-CN,zh,en' + }) + # 或启动时指定(部分版本支持) + chrome_options.add_argument('--window-size=1920,1080') + + # 隐藏 webdriver 特征 + chrome_options.add_argument("--disable-blink-features=AutomationControlled") + # 隐藏 "navigator.webdriver" + chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) + chrome_options.add_experimental_option('useAutomationExtension', False) + + self.driver = webdriver.Chrome(service=service, options=chrome_options) + + def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5): + """ + 将网页保存为 MHTML 文件 + :param url: 目标网页 URL + :param output_path: 输出路径(.mhtml) + :param timeout: 页面加载超时(秒) + :param wait_time: 页面加载后等待时间(秒),用于动态内容渲染 + :return: 保存的文件绝对路径 + """ + if output_path is None: + parsed = urlparse(url) + domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page' + output_path = f"{domain}.mhtml" + + if not output_path.lower().endswith('.mhtml'): + output_path += '.mhtml' + + try: + # 设置超时 + self.driver.set_page_load_timeout(timeout) + + # 启动后注入脚本(双重保险) + self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { + 'source': ''' + delete navigator.__proto__.webdriver; + window.navigator.permissions.query = (parameters) => { + return parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters); + }; + ''' + }) + # 在 driver.get() 之后设置窗口大小 + + logger.info(f"正在加载页面: {url}") + self.driver.get(url) + self.driver.set_window_size(1920, 1080) + + # 等待页面动态内容加载(可调整) + logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...") + time.sleep(wait_time) + + # ✅ 关键:调用 CDP 命令捕获 MHTML + logger.info("正在生成 MHTML 快照...") + result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'}) + + # ✅ result['data'] 是 Base64 编码的 MHTML 文本(实际是纯字符串) + mhtml_content = result['data'] + + # ✅ 以文本模式写入(UTF-8) + with open(output_path, 'w', encoding='utf-8', newline='') as f: + f.write(mhtml_content) + + # 验证文件 + file_size = os.path.getsize(output_path) + if file_size == 0: + raise RuntimeError("生成了空文件") + + logger.info(f"✅ MHTML 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)") + return os.path.abspath(output_path) + + except Exception as e: + logger.error(f"❌ 保存失败: {e}") + raise + + def quit(self): + if self.driver: + self.driver.quit() + logger.info("浏览器已关闭") + + +# ===== 测试入口 ===== +if __name__ == "__main__": + # 示例 URL(可替换为你自己的) + test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html" + + saver = MHTMLSaver(headless=True) + try: + output_file = saver.save_as_mhtml( + url=test_url, + output_path="example.mhtml", + timeout=30, + wait_time=5 + ) + print(f"\n🎉 成功保存 MHTML 文件: {output_file}") + except Exception as e: + print(f"\n💥 保存失败: {e}") + finally: + saver.quit() diff --git a/research/pdf_downloader/save_page_as_pdf.py b/research/pdf_downloader/save_page_as_pdf.py new file mode 100644 index 0000000..e54fb55 --- /dev/null +++ b/research/pdf_downloader/save_page_as_pdf.py @@ -0,0 +1,144 @@ +import base64 +import logging +import os +import time +from urllib.parse import urlparse + +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService + +# 配置日志 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(), + logging.FileHandler('pdf_saver.log', encoding='utf-8') + ] +) +logger = logging.getLogger(__name__) + + +class PDFSaver: + def __init__(self, headless=True): + logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...") + service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe") + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75" + + # Chrome 选项 + chrome_options = Options() + if headless: + chrome_options.add_argument('--headless=new') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument(f'--user-agent={user_agent}') + chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) + chrome_options.add_experimental_option('useAutomationExtension', False) + chrome_options.add_argument('--lang=zh-CN') + chrome_options.add_experimental_option('prefs', { + 'intl.accept_languages': 'zh-CN,zh,en' + }) + chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument("--disable-blink-features=AutomationControlled") + + # 注意:PDF 打印不需要 --save-page-as-mhtml + self.driver = webdriver.Chrome(service=service, options=chrome_options) + + def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None): + """ + 将网页保存为 PDF 文件 + :param url: 目标网页 URL + :param output_path: 输出路径(.pdf) + :param timeout: 页面加载超时(秒) + :param wait_time: 页面加载后等待时间(秒),用于动态内容渲染 + :param print_options: PDF 打印选项(可选),参考 https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF + :return: 保存的文件绝对路径 + """ + if output_path is None: + parsed = urlparse(url) + domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page' + output_path = f"{domain}.pdf" + + if not output_path.lower().endswith('.pdf'): + output_path += '.pdf' + + # 默认打印选项(可按需调整) + default_print_options = { + 'landscape': False, + 'displayHeaderFooter': False, + 'printBackground': True, + 'preferCSSPageSize': True, + 'paperWidth': 8.27, # A4 宽(英寸) + 'paperHeight': 11.69, # A4 高(英寸) + } + if print_options: + default_print_options.update(print_options) + + try: + self.driver.set_page_load_timeout(timeout) + + # 隐藏自动化特征 + self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { + 'source': ''' + delete navigator.__proto__.webdriver; + window.navigator.permissions.query = (parameters) => { + return parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters); + }; + const originalQuery = window.navigator.permissions.query; + ''' + }) + + logger.info(f"正在加载页面: {url}") + self.driver.get(url) + self.driver.set_window_size(1920, 1080) + + logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...") + time.sleep(wait_time) + + logger.info("正在生成 PDF...") + result = self.driver.execute_cdp_cmd('Page.printToPDF', default_print_options) + + # result['data'] 是 Base64 编码的 PDF + pdf_data = base64.b64decode(result['data']) + + with open(output_path, 'wb') as f: + f.write(pdf_data) + + file_size = os.path.getsize(output_path) + if file_size == 0: + raise RuntimeError("生成了空文件") + + logger.info(f"✅ PDF 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)") + return os.path.abspath(output_path) + + except Exception as e: + logger.error(f"❌ 保存失败: {e}") + raise + + def quit(self): + if self.driver: + self.driver.quit() + logger.info("浏览器已关闭") + + +# ===== 测试入口 ===== +if __name__ == "__main__": + test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html" + + saver = PDFSaver(headless=True) + try: + output_file = saver.save_as_pdf( + url=test_url, + output_path="example.pdf", + timeout=30, + wait_time=5 + ) + print(f"\n🎉 成功保存 PDF 文件: {output_file}") + except Exception as e: + print(f"\n💥 保存失败: {e}") + finally: + saver.quit() diff --git a/research/pdf_downloader/save_remote_as_mhtml.py b/research/pdf_downloader/save_remote_as_mhtml.py new file mode 100644 index 0000000..f59fb44 --- /dev/null +++ b/research/pdf_downloader/save_remote_as_mhtml.py @@ -0,0 +1,190 @@ +import logging +import os +import time +from urllib.parse import urlparse + +from selenium import webdriver +from selenium.common.exceptions import ( + WebDriverException, + TimeoutException, + SessionNotCreatedException, + InvalidSessionIdException +) +from selenium.webdriver.chrome.options import Options + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class RemoteMHTMLSaver: + def __init__( + self, + remote_url="http://144.34.185.108:28098/wd/hub", + headless=True, + max_retries=3, + retry_delay=2 + ): + """ + 初始化远程 MHTML 保存器(支持自动重建 session) + :param remote_url: 远程 Selenium 地址 + :param headless: 是否无头 + :param max_retries: 单次操作最大重试次数 + :param retry_delay: 重试前等待时间(秒) + """ + self.remote_url = remote_url + self.headless = headless + self.max_retries = max_retries + self.retry_delay = retry_delay + self.driver = None + self._init_driver() + + def _build_chrome_options(self): + """构建 Chrome 选项(可复用)""" + chrome_options = Options() + if self.headless: + chrome_options.add_argument('--headless=new') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument( + "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36" + ) + chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) + chrome_options.add_experimental_option('useAutomationExtension', False) + return chrome_options + + def _init_driver(self): + """初始化或重新初始化 WebDriver""" + if self.driver: + try: + self.driver.quit() + except Exception: + pass # 忽略关闭失败 + + logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}") + for attempt in range(3): + try: + self.driver = webdriver.Remote( + command_executor=self.remote_url, + options=self._build_chrome_options() + ) + # 注入反检测脚本 + self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { + 'source': ''' + delete navigator.__proto__.webdriver; + window.chrome = { runtime: {} }; + Object.defineProperty(navigator, 'languages', { + get: () => ['zh-CN', 'zh', 'en'] + }); + ''' + }) + logger.info("✅ 远程 WebDriver 会话创建成功") + return + except Exception as e: + logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}") + if attempt < 2: + time.sleep(2) + else: + raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}") + + def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5): + """ + 保存网页为 MHTML,支持自动重试和 session 重建 + """ + if output_path is None: + domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page' + output_path = f"{domain}.mhtml" + if not output_path.lower().endswith('.mhtml'): + output_path += '.mhtml' + + last_exception = None + + for retry in range(self.max_retries + 1): + try: + # 检查 driver 是否有效 + if not self.driver: + self._init_driver() + + self.driver.set_page_load_timeout(timeout) + logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}") + self.driver.get(url) + time.sleep(wait_time) + + logger.info("生成 MHTML 快照...") + result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'}) + mhtml_content = result['data'] + + # 写入本地文件 + with open(output_path, 'w', encoding='utf-8', newline='') as f: + f.write(mhtml_content) + + file_size = os.path.getsize(output_path) + if file_size == 0: + raise RuntimeError("生成了空文件") + + logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)") + return os.path.abspath(output_path) + + except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e: + last_exception = e + logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}") + if retry < self.max_retries: + logger.info("正在重建 WebDriver 会话...") + self._init_driver() + time.sleep(self.retry_delay) + else: + logger.error("达到最大重试次数,放弃") + break + + except TimeoutException as e: + last_exception = e + logger.warning(f"页面加载超时 (retry {retry + 1}): {e}") + if retry < self.max_retries: + time.sleep(self.retry_delay) + else: + break + + except Exception as e: + last_exception = e + logger.error(f"未知错误 (retry {retry + 1}): {e}") + break # 非 WebDriver 错误,不重试 + + # 如果所有重试失败 + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + + raise RuntimeError(f"保存失败({type(last_exception).__name__}): {last_exception}") + + def quit(self): + """显式关闭浏览器""" + if self.driver: + try: + self.driver.quit() + logger.info("WebDriver 会话已关闭") + except Exception: + pass + self.driver = None + + def __del__(self): + self.quit() + + +# ===== 测试 ===== +if __name__ == "__main__": + saver = RemoteMHTMLSaver( + remote_url="http://144.34.185.108:28098/wd/hub", # ← 替换为你的云服务器公网 IP + headless=True + ) + try: + saver.save_as_mhtml( + url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm", + output_path="remote_example2.mhtml" + ) + except Exception as e: + print(f"❌ 失败: {e}") + + saver.quit() diff --git a/research/pdf_downloader/save_remote_as_pdf.py b/research/pdf_downloader/save_remote_as_pdf.py new file mode 100644 index 0000000..dac38e7 --- /dev/null +++ b/research/pdf_downloader/save_remote_as_pdf.py @@ -0,0 +1,201 @@ +import base64 +import logging +import os +import time +from urllib.parse import urlparse + +from selenium import webdriver +from selenium.common.exceptions import ( + WebDriverException, + TimeoutException, + SessionNotCreatedException, + InvalidSessionIdException +) +from selenium.webdriver.chrome.options import Options + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class RemotePDFSaver: + def __init__( + self, + remote_url="http://144.34.185.108:28098/wd/hub", + headless=True, + max_retries=3, + retry_delay=2, + print_options=None + ): + """ + 初始化远程 PDF 保存器(支持自动重建 session) + :param remote_url: 远程 Selenium 地址 + :param headless: 是否无头模式 + :param max_retries: 单次操作最大重试次数 + :param retry_delay: 重试前等待时间(秒) + :param print_options: PDF 打印选项(参考 DevTools Protocol) + """ + self.remote_url = remote_url + self.headless = headless + self.max_retries = max_retries + self.retry_delay = retry_delay + self.print_options = print_options or { + 'landscape': False, + 'displayHeaderFooter': False, + 'printBackground': True, + 'preferCSSPageSize': True, + 'paperWidth': 8.27, # A4 宽(英寸) + 'paperHeight': 11.69, # A4 高(英寸) + } + self.driver = None + self._init_driver() + + def _build_chrome_options(self): + """构建 Chrome 选项(可复用)""" + chrome_options = Options() + if self.headless: + chrome_options.add_argument('--headless=new') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument( + "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36" + ) + chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) + chrome_options.add_experimental_option('useAutomationExtension', False) + return chrome_options + + def _init_driver(self): + """初始化或重新初始化 WebDriver""" + if self.driver: + try: + self.driver.quit() + except Exception: + pass # 忽略关闭失败 + + logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}") + for attempt in range(3): + try: + self.driver = webdriver.Remote( + command_executor=self.remote_url, + options=self._build_chrome_options() + ) + # 注入反检测脚本 + self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { + 'source': ''' + delete navigator.__proto__.webdriver; + window.chrome = { runtime: {} }; + Object.defineProperty(navigator, 'languages', { + get: () => ['zh-CN', 'zh', 'en'] + }); + ''' + }) + logger.info("✅ 远程 WebDriver 会话创建成功") + return + except Exception as e: + logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}") + if attempt < 2: + time.sleep(2) + else: + raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}") + + def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5): + """ + 保存网页为 PDF,支持自动重试和 session 重建 + """ + if output_path is None: + domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page' + output_path = f"{domain}.pdf" + if not output_path.lower().endswith('.pdf'): + output_path += '.pdf' + + last_exception = None + + for retry in range(self.max_retries + 1): + try: + # 检查 driver 是否有效 + if not self.driver: + self._init_driver() + + self.driver.set_page_load_timeout(timeout) + logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}") + self.driver.get(url) + time.sleep(wait_time) + + logger.info("生成 PDF...") + result = self.driver.execute_cdp_cmd('Page.printToPDF', self.print_options) + pdf_data = base64.b64decode(result['data']) + + # 写入本地 PDF 文件(二进制) + with open(output_path, 'wb') as f: + f.write(pdf_data) + + file_size = os.path.getsize(output_path) + if file_size == 0: + raise RuntimeError("生成了空文件") + + logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)") + return os.path.abspath(output_path) + + except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e: + last_exception = e + logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}") + if retry < self.max_retries: + logger.info("正在重建 WebDriver 会话...") + self._init_driver() + time.sleep(self.retry_delay) + else: + logger.error("达到最大重试次数,放弃") + break + + except TimeoutException as e: + last_exception = e + logger.warning(f"页面加载超时 (retry {retry + 1}): {e}") + if retry < self.max_retries: + time.sleep(self.retry_delay) + else: + break + + except Exception as e: + last_exception = e + logger.error(f"未知错误 (retry {retry + 1}): {e}") + break # 非 WebDriver 错误,不重试 + + # 清理失败生成的空文件 + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + + raise RuntimeError(f"保存失败({type(last_exception).__name__}): {last_exception}") + + def quit(self): + """显式关闭浏览器""" + if self.driver: + try: + self.driver.quit() + logger.info("WebDriver 会话已关闭") + except Exception: + pass + self.driver = None + + def __del__(self): + self.quit() + + +# ===== 测试 ===== +if __name__ == "__main__": + saver = RemotePDFSaver( + remote_url="http://144.34.185.108:28098/wd/hub", # ← 替换为你的云服务器公网 IP + headless=True + ) + try: + saver.save_as_pdf( + url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm", + output_path="remote_example2.pdf" + ) + except Exception as e: + print(f"❌ 失败: {e}") + + saver.quit()