jsc-dsp #1

Merged
yuxin merged 9 commits from jsc-dsp into main 2026-01-19 01:30:58 +00:00
5 changed files with 998 additions and 0 deletions
Showing only changes of commit 970d86ed7d - Show all commits

View File

@ -0,0 +1,322 @@
import logging
import os
import queue
import threading
import time
from datetime import datetime
import pymysql
from tqdm import tqdm
from save_page_as_pdf import PDFSaver
from save_remote_as_mhtml import RemoteMHTMLSaver
from save_page_as_mhtml import MHTMLSaver
import tldextract
# 配置日志
from save_remote_as_pdf import RemotePDFSaver
# Configure logging for the downloader script: console plus a log file.
# force=True (Python 3.8+) is required here: the saver modules imported above
# each call logging.basicConfig() at import time, and a later basicConfig()
# call is ignored once the root logger has handlers, so without it
# 'pdf_downloader.log' would never be created.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        # Explicit UTF-8 so the non-ASCII log messages do not depend on the
        # host's default encoding (same as the saver modules' log files).
        logging.FileHandler('pdf_downloader.log', encoding='utf-8'),
    ],
    force=True,
)
logger = logging.getLogger(__name__)
# =============== MySQL configuration ===============
# Every connection setting can be overridden through a DSP_MYSQL_* environment
# variable; the literals below are only fallbacks that keep existing
# deployments working unchanged.
# NOTE(review): the fallback password is still committed to source control --
# rotate it and supply DSP_MYSQL_PASSWORD from the environment instead.
MYSQL_CONFIG = {
    'host': os.environ.get('DSP_MYSQL_HOST', '47.113.231.200'),
    'port': int(os.environ.get('DSP_MYSQL_PORT', '28089')),
    'user': os.environ.get('DSP_MYSQL_USER', 'root'),
    'password': os.environ.get('DSP_MYSQL_PASSWORD', 'passok123A'),
    'database': os.environ.get('DSP_MYSQL_DATABASE', 'dsp'),
    'charset': 'utf8mb4',
    'autocommit': False,  # transactions are committed explicitly
}
# ===================================================
# Tunable parameters
# Rows fetched per paginated query (used as LIMIT and as the OFFSET step).
BATCH_SIZE = 500
# Number of download worker threads; the task queue holds MAX_WORKERS * 3 rows.
MAX_WORKERS = 1
# Passed as the `timeout` argument to the savers' save_as_pdf().
TIMEOUT = 10
# Directory the generated PDF files are written to.
PDF_OUTPUT_DIR = 'D:/data/output/pdf'
# PDFs smaller than this are deleted and the row is marked '-2'.
MIN_PDF_SIZE = 80 * 1024 # 80KB
# Directory for MHTML output; unlike PDF_OUTPUT_DIR it is not created below.
MHTML_OUTPUT_DIR = 'D:/data/output/mhtml'
os.makedirs(PDF_OUTPUT_DIR, exist_ok=True)
# Loop flag for the __main__ polling loop.
running = True
# Seconds to sleep between two passes of the __main__ polling loop.
running_interval_seconds = 15
# URLs containing any of these substrings are rendered through the remote
# Selenium saver instead of the local one.
remote_host_name = [
'epochtimes.com',
# 'secretchina.com'
]
class PDFDownloader:
    """Render pending ``indeximos`` rows to PDF files and record the outcome.

    Rows are read page by page from MySQL, pushed onto a bounded queue and
    consumed by ``MAX_WORKERS`` worker threads. Each worker renders the row's
    URL with a local or remote Selenium saver and writes the result into the
    row's ``es_video`` column:

    * the output file path -- PDF saved and at least ``MIN_PDF_SIZE`` bytes
    * ``'-2'`` -- PDF saved but too small (the file is deleted)
    * ``'-1'`` -- an exception was raised while saving
    * ``'0'``  -- the saver reported failure or produced no file

    Row tuples follow the column order of ``fetch_data_batch``:
    ``(es_sid, es_urlname, es_urltitle, es_urltime, es_sitename, es_authors)``.
    """

    # Characters that are replaced with '_' when building file names.
    _INVALID_FILENAME_TABLE = str.maketrans({c: '_' for c in '<>:"/\\|?*'})
    # Placeholder used when a row's es_urltime cannot be interpreted.
    _FALLBACK_TIMESTAMP = '19700101_000000'

    def __init__(self):
        self.db_lock = threading.Lock()
        self.task_queue = queue.Queue(maxsize=MAX_WORKERS * 3)
        self.processed_count = 0
        self.success_count = 0
        self.fail_count = 0
        self.small_file_count = 0  # rows skipped or rejected for small output
        # last_loadtime must be loaded first: get_total_rows() filters on it.
        self.last_loadtime = self.get_last_loadtime()
        self.total_rows = self.get_total_rows()
        self.start_time = time.time()
        # Main domains that produced an undersized PDF; skipped for the rest
        # of this run.
        self.skip_hosts = []
        # Savers are created lazily on first use and closed at the end of run().
        self.local_handler = None
        self.remote_handler = None

    def get_db_connection(self):
        """Open a new MySQL connection using ``MYSQL_CONFIG``."""
        return pymysql.connect(
            host=MYSQL_CONFIG['host'],
            port=MYSQL_CONFIG['port'],
            user=MYSQL_CONFIG['user'],
            password=MYSQL_CONFIG['password'],
            database=MYSQL_CONFIG['database'],
            charset='utf8mb4',
            autocommit=False
        )

    def get_total_rows(self):
        """Return the number of rows that are pending in this pass."""
        with self.get_db_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT COUNT(*) FROM indeximos "
                "WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) "
                "AND es_loadtime > %s", (self.last_loadtime,)
            )
            return cursor.fetchone()[0]

    def get_last_loadtime(self):
        """Return the ``last_loadtime`` value stored in the ``config`` table.

        :raises RuntimeError: if the ``config`` table has no such entry.
        """
        with self.get_db_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT config_value FROM config "
                "WHERE config_name = 'last_loadtime' "
            )
            row = cursor.fetchone()
        if row is None:
            raise RuntimeError("config table has no 'last_loadtime' entry")
        return row[0]

    def use_remote_selenium(self, url):
        """Return True if ``url`` contains one of ``remote_host_name``."""
        return any(host in url for host in remote_host_name)

    @classmethod
    def _clean_filename(cls, text):
        """Replace characters that are invalid in file names, strip, cap at 100 chars."""
        if not text:
            return ''
        return str(text).translate(cls._INVALID_FILENAME_TABLE).strip()[:100]

    @classmethod
    def _format_timestamp(cls, value):
        """Format ``value`` as ``YYYYMMDD_HHMMSS``.

        ``value`` may be a ``datetime`` or a ``'%Y-%m-%d %H:%M:%S'`` string;
        anything else (including ``None``) yields the epoch placeholder.
        """
        if isinstance(value, datetime):
            # Format directly so values with microseconds are not lost to a
            # failed str() -> strptime() round trip.
            return value.strftime('%Y%m%d_%H%M%S')
        try:
            parsed = datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S')
        except ValueError:
            return cls._FALLBACK_TIMESTAMP
        return parsed.strftime('%Y%m%d_%H%M%S')

    @classmethod
    def _build_output_path(cls, row, directory, extension):
        """Build ``<directory>/<title>_<timestamp>_<sitename><extension>`` for ``row``.

        Both the title and the site name are sanitised so that characters
        such as ':' or '/' coming from the database cannot produce an invalid
        path or an unintended sub-directory.
        """
        title = cls._clean_filename(row[2] or 'untitled')
        timestamp = cls._format_timestamp(row[3])
        sitename = cls._clean_filename(row[4] or 'anonymous')
        return os.path.join(directory, f"{title}_{timestamp}_{sitename}{extension}")

    def format_pdf_filename(self, row):
        """Return the PDF output path for ``row`` (inside ``PDF_OUTPUT_DIR``)."""
        return self._build_output_path(row, PDF_OUTPUT_DIR, '.pdf')

    def format_mhtml_filename(self, row):
        """Return the MHTML output path for ``row`` (inside ``MHTML_OUTPUT_DIR``)."""
        return self._build_output_path(row, MHTML_OUTPUT_DIR, '.mhtml')

    def fetch_data_batch(self, offset):
        """Fetch up to ``BATCH_SIZE`` pending rows starting at ``offset``."""
        with self.get_db_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT es_sid, es_urlname, es_urltitle, es_urltime, es_sitename, es_authors FROM indeximos "
                "WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) "
                "AND es_loadtime > %s "
                "ORDER BY es_urltime LIMIT %s OFFSET %s",
                (self.last_loadtime, BATCH_SIZE, offset)
            )
            return cursor.fetchall()

    def update_file_status(self, es_sid, status, retry=3):
        """Write ``status`` into ``es_video`` for the row ``es_sid``.

        :param retry: number of attempts; one second is slept between them.
        :return: True once the update is committed, False if every attempt failed.
        """
        for attempt in range(retry):
            try:
                # db_lock serialises writers so worker threads do not update
                # concurrently.
                with self.db_lock, self.get_db_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute(
                        "UPDATE indeximos SET es_video = %s WHERE es_sid = %s",
                        (status, es_sid))
                    conn.commit()
                return True
            except Exception as e:
                if attempt == retry - 1:
                    logger.error(f"更新数据库失败(es_sid={es_sid}): {e}")
                    return False
                time.sleep(1)
        return False

    def extract_main_domain(self, url):
        """Return the registered (main) domain of ``url``, e.g. ``example.com``."""
        extracted = tldextract.extract(url)
        # Join only the non-empty parts so hosts without a public suffix
        # (IP addresses, 'localhost') do not get a trailing dot.
        return '.'.join(part for part in (extracted.domain, extracted.suffix) if part)

    def _get_handler(self, url):
        """Return the saver for ``url``, creating it on first use."""
        if self.use_remote_selenium(url):
            if self.remote_handler is None:
                self.remote_handler = RemotePDFSaver()
            return self.remote_handler
        if self.local_handler is None:
            self.local_handler = PDFSaver()
        return self.local_handler

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` if it exists; log (but do not raise) on failure."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.warning(f"删除文件失败: {path}: {e}")

    def _process_row(self, row):
        """Render one row to PDF and record the outcome in the database."""
        url = row[1]
        domain = self.extract_main_domain(url)
        if domain in self.skip_hosts:
            self.small_file_count += 1
            print(f"小文件规避暂时跳过URL{url}")
            return
        output_file = self.format_pdf_filename(row)
        try:
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            success = self._get_handler(url).save_as_pdf(
                url=url,
                output_path=output_file,
                timeout=TIMEOUT
            )
            if success and os.path.exists(output_file):
                file_size = os.path.getsize(output_file)
                if file_size >= MIN_PDF_SIZE:
                    self.update_file_status(row[0], output_file)
                    self.success_count += 1
                else:
                    self.update_file_status(row[0], '-2')
                    self.small_file_count += 1
                    logger.warning(f"文件过小({file_size}字节): {output_file}")
                    self._remove_quietly(output_file)
                    # Skip this host for the rest of the run, whether or not
                    # the undersized file could be deleted.
                    if domain not in self.skip_hosts:
                        self.skip_hosts.append(domain)
            else:
                self.update_file_status(row[0], '0')
                self.fail_count += 1
                self._remove_quietly(output_file)
        except Exception as e:
            logger.error(f"下载出现异常(es_sid={row[0]}, url={url}): {str(e)}")
            self.update_file_status(row[0], '-1')
            self.fail_count += 1

    def download_worker(self):
        """Worker thread body: process queued rows until a ``None`` sentinel arrives."""
        while True:
            try:
                task = self.task_queue.get(timeout=1)
            except queue.Empty:
                continue
            if task is None:
                self.task_queue.task_done()
                break
            try:
                self._process_row(task)
            except Exception as e:
                # Anything _process_row did not handle itself (e.g. a malformed
                # row). Swallow it here so the worker thread stays alive.
                logger.exception(f"处理任务时出现未预期异常: {e}")
                self.fail_count += 1
            finally:
                # task_done() must run for every dequeued row, otherwise
                # run() blocks forever in task_queue.join().
                self.processed_count += 1
                self.task_queue.task_done()

    def _close_handlers(self):
        """Quit any saver created during the run so browser sessions are released."""
        for attr in ('local_handler', 'remote_handler'):
            handler = getattr(self, attr)
            if handler is None:
                continue
            try:
                handler.quit()
            except Exception as e:
                logger.warning(f"关闭浏览器失败: {e}")
            setattr(self, attr, None)

    def run(self):
        """Process every pending row once, then print a summary."""
        threads = []
        for _ in range(MAX_WORKERS):
            t = threading.Thread(target=self.download_worker)
            t.start()
            threads.append(t)
        try:
            with tqdm(total=self.total_rows, desc="处理进度", unit="") as pbar:
                offset = 0
                # NOTE(review): rows whose es_video becomes a file path or '0'
                # drop out of the WHERE filter while paging, so OFFSET can skip
                # rows that were not processed yet in this pass; they appear to
                # be picked up by a later pass of the __main__ loop -- confirm
                # this is acceptable.
                while True:
                    batch = self.fetch_data_batch(offset)
                    if not batch:
                        break
                    for row in batch:
                        self.task_queue.put(row)
                    pbar.update(len(batch))
                    pbar.set_postfix({
                        '成功': self.success_count,
                        '失败': self.fail_count,
                        '小文件': self.small_file_count,
                        '速度': f"{self.processed_count / (time.time() - self.start_time):.1f}条/秒"
                    })
                    offset += BATCH_SIZE
                self.task_queue.join()
        finally:
            # Always stop the workers and close the browsers, even when paging
            # failed; otherwise the non-daemon threads keep the process alive.
            for _ in threads:
                self.task_queue.put(None)
            for t in threads:
                t.join()
            self._close_handlers()
        total_time = time.time() - self.start_time
        print(f"\n处理完成! 总计: {self.total_rows}")
        print(f"成功: {self.success_count}条, 失败: {self.fail_count}条, 小文件: {self.small_file_count}")
        print(f"总耗时: {total_time:.2f}秒, 平均速度: {self.total_rows / total_time:.2f}条/秒")
if __name__ == "__main__":
    # Poll forever: each pass builds a fresh downloader (which re-reads
    # last_loadtime and the pending-row count), processes every pending row,
    # then sleeps before the next pass.
    while running:
        downloader = PDFDownloader()
        # Reuse the count the instance already loaded instead of constructing
        # a second, throwaway PDFDownloader just to print it.
        print(f"开始处理,总记录数: {downloader.total_rows}")
        downloader.run()
        print(f"运行完成,暂停{running_interval_seconds}秒后开始下一次运行...")
        time.sleep(running_interval_seconds)

View File

@ -0,0 +1,141 @@
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# Logging setup: mirror every record to the console and to a UTF-8 log file.
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler('mhtml_saver.log', encoding='utf-8'),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class MHTMLSaver:
    """Save web pages as MHTML snapshots using a local Chrome WebDriver."""

    # Anti-detection script, registered once per browser session.
    # The original permissions.query is captured BEFORE it is overridden (the
    # previous script referenced an undefined `originalQuery`), and everything
    # lives in an IIFE so the `const` cannot clash with page globals.
    _STEALTH_SCRIPT = '''
        (() => {
            delete navigator.__proto__.webdriver;
            const permissions = window.navigator.permissions;
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => (
                parameters.name === 'notifications'
                    ? Promise.resolve({ state: Notification.permission })
                    : originalQuery(parameters)
            );
        })();
    '''

    def __init__(self, headless=True):
        """Start Chrome and register the anti-detection script.

        :param headless: run Chrome without a visible window when True.
        """
        # Defined up front so quit() is safe even if start-up fails below.
        self.driver = None
        logger.info("正在初始化 Chrome WebDriver自动匹配版本...")
        service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        # Kept from the original configuration; the snapshot itself is taken
        # through the CDP Page.captureSnapshot command in save_as_mhtml().
        chrome_options.add_argument('--save-page-as-mhtml')
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.add_argument('--window-size=1920,1080')
        # Hide the usual automation fingerprints.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            # Register once here instead of on every save, so copies of the
            # script do not pile up in a long-lived session.
            self.driver.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': self._STEALTH_SCRIPT},
            )
        except Exception:
            self.quit()
            raise

    @staticmethod
    def _resolve_output_path(url, output_path):
        """Return the output path, defaulting to ``<first host label>.mhtml``."""
        if output_path is None:
            parsed = urlparse(url)
            domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.mhtml"
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        return output_path

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """Save a web page as an MHTML file.

        :param url: URL of the page to capture.
        :param output_path: output file; ``.mhtml`` is appended if missing.
            Defaults to a name derived from the URL's host.
        :param timeout: page-load timeout in seconds.
        :param wait_time: seconds to wait after loading so dynamic content
            can render.
        :return: absolute path of the saved file.
        :raises Exception: any WebDriver or I/O error is logged and re-raised;
            ``RuntimeError`` if the written file is empty.
        """
        output_path = self._resolve_output_path(url, output_path)
        try:
            self.driver.set_page_load_timeout(timeout)
            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)
            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)
            logger.info("正在生成 MHTML 快照...")
            result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
            # result['data'] is written verbatim as text (no Base64 decoding).
            mhtml_content = result['data']
            # newline='' keeps the snapshot's own line endings untouched.
            with open(output_path, 'w', encoding='utf-8', newline='') as f:
                f.write(mhtml_content)
            file_size = os.path.getsize(output_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")
            logger.info(f"✅ MHTML 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
            return os.path.abspath(output_path)
        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        """Close the browser; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
            finally:
                self.driver = None
            logger.info("浏览器已关闭")
# ===== Manual test entry point =====
if __name__ == "__main__":
    # Sample URL; replace it with your own.
    demo_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
    mhtml_saver = MHTMLSaver(headless=True)
    try:
        # Capture the sample page, then report where the snapshot was written.
        saved_path = mhtml_saver.save_as_mhtml(
            url=demo_url, output_path="example.mhtml", timeout=30, wait_time=5
        )
        print(f"\n🎉 成功保存 MHTML 文件: {saved_path}")
    except Exception as err:
        print(f"\n💥 保存失败: {err}")
    finally:
        # Always release the browser, whether or not the capture succeeded.
        mhtml_saver.quit()

View File

@ -0,0 +1,144 @@
import base64
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# Logging setup: mirror every record to the console and to a UTF-8 log file.
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler('pdf_saver.log', encoding='utf-8'),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class PDFSaver:
    """Save web pages as PDF files using a local Chrome WebDriver."""

    # Anti-detection script, registered once per browser session.
    # The original permissions.query is captured BEFORE it is overridden: the
    # previous script captured it afterwards, so `originalQuery` pointed at
    # the override itself and recursed forever. The IIFE keeps the `const`
    # out of the page's global scope.
    _STEALTH_SCRIPT = '''
        (() => {
            delete navigator.__proto__.webdriver;
            const permissions = window.navigator.permissions;
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => (
                parameters.name === 'notifications'
                    ? Promise.resolve({ state: Notification.permission })
                    : originalQuery(parameters)
            );
        })();
    '''

    def __init__(self, headless=True):
        """Start Chrome and register the anti-detection script.

        :param headless: run Chrome without a visible window when True.
        """
        # Defined up front so quit() is safe even if start-up fails below.
        self.driver = None
        logger.info("正在初始化 Chrome WebDriver自动匹配版本...")
        service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            # Register once here instead of on every save, so copies of the
            # script do not pile up in a long-lived session.
            self.driver.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': self._STEALTH_SCRIPT},
            )
        except Exception:
            self.quit()
            raise

    @staticmethod
    def _resolve_output_path(url, output_path):
        """Return the output path, defaulting to ``<first host label>.pdf``."""
        if output_path is None:
            parsed = urlparse(url)
            domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.pdf"
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        return output_path

    @staticmethod
    def _build_print_options(print_options):
        """Return the default Page.printToPDF options overlaid with ``print_options``."""
        options = {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,  # A4 width in inches
            'paperHeight': 11.69,  # A4 height in inches
        }
        if print_options:
            options.update(print_options)
        return options

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
        """Save a web page as a PDF file.

        :param url: URL of the page to print.
        :param output_path: output file; ``.pdf`` is appended if missing.
            Defaults to a name derived from the URL's host.
        :param timeout: page-load timeout in seconds.
        :param wait_time: seconds to wait after loading so dynamic content
            can render.
        :param print_options: optional overrides for the CDP Page.printToPDF
            parameters; merged over the A4 defaults.
        :return: absolute path of the saved file.
        :raises Exception: any WebDriver or I/O error is logged and re-raised;
            ``RuntimeError`` if the written file is empty.
        """
        output_path = self._resolve_output_path(url, output_path)
        effective_print_options = self._build_print_options(print_options)
        try:
            self.driver.set_page_load_timeout(timeout)
            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)
            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)
            logger.info("正在生成 PDF...")
            result = self.driver.execute_cdp_cmd('Page.printToPDF', effective_print_options)
            # result['data'] is the Base64-encoded PDF.
            pdf_data = base64.b64decode(result['data'])
            with open(output_path, 'wb') as f:
                f.write(pdf_data)
            file_size = os.path.getsize(output_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")
            logger.info(f"✅ PDF 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
            return os.path.abspath(output_path)
        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        """Close the browser; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
            finally:
                self.driver = None
            logger.info("浏览器已关闭")
# ===== Manual test entry point =====
if __name__ == "__main__":
    demo_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
    pdf_saver = PDFSaver(headless=True)
    try:
        # Print the sample page, then report where the PDF was written.
        saved_path = pdf_saver.save_as_pdf(
            url=demo_url, output_path="example.pdf", timeout=30, wait_time=5
        )
        print(f"\n🎉 成功保存 PDF 文件: {saved_path}")
    except Exception as err:
        print(f"\n💥 保存失败: {err}")
    finally:
        # Always release the browser, whether or not the save succeeded.
        pdf_saver.quit()

View File

@ -0,0 +1,190 @@
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
TimeoutException,
SessionNotCreatedException,
InvalidSessionIdException
)
from selenium.webdriver.chrome.options import Options
# Configure root logging at INFO level with the default handler and format.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so this is a no-op if another module configured logging first.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by RemoteMHTMLSaver.
logger = logging.getLogger(__name__)
class RemoteMHTMLSaver:
    """Save web pages as MHTML through a remote Selenium server.

    The WebDriver session is rebuilt automatically when it breaks.
    """

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2
    ):
        """Connect to the remote Selenium server.

        :param remote_url: address of the remote Selenium endpoint.
        :param headless: run Chrome without a visible window when True.
        :param max_retries: retries per save after the first attempt.
        :param retry_delay: seconds to wait before a retry.
        :raises RuntimeError: if no session can be created.
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.driver = None
        self._init_driver()

    def _build_chrome_options(self):
        """Build a fresh Chrome ``Options`` object (one per session)."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    @staticmethod
    def _quit_quietly(driver):
        """Quit ``driver`` and ignore any error; the session is being discarded."""
        try:
            driver.quit()
        except Exception:
            pass

    def _init_driver(self):
        """Create the WebDriver session, replacing any existing one.

        Makes up to three attempts, two seconds apart.

        :raises RuntimeError: if every attempt fails.
        """
        if self.driver:
            self._quit_quietly(self.driver)
            # Drop the reference so a failed rebuild never leaves a dead
            # session behind in self.driver.
            self.driver = None
        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        for attempt in range(3):
            driver = None
            try:
                driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject the anti-detection script for every new document.
                driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
                self.driver = driver
                logger.info("✅ 远程 WebDriver 会话创建成功")
                return
            except Exception as e:
                logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
                if driver is not None:
                    # The session opened but could not be initialised; close
                    # it so it does not leak on the remote server.
                    self._quit_quietly(driver)
                if attempt < 2:
                    time.sleep(2)
                else:
                    raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}") from e

    @staticmethod
    def _resolve_output_path(url, output_path):
        """Return the output path, defaulting to ``<first host label>.mhtml``."""
        if output_path is None:
            domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.mhtml"
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        return output_path

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """Save a web page as MHTML, retrying and rebuilding the session as needed.

        :param url: URL of the page to capture.
        :param output_path: output file; ``.mhtml`` is appended if missing.
            Defaults to a name derived from the URL's host.
        :param timeout: page-load timeout in seconds.
        :param wait_time: seconds to wait after loading before the snapshot.
        :return: absolute path of the saved file.
        :raises RuntimeError: when all attempts fail; the last error is chained
            as ``__cause__`` and any file at ``output_path`` is removed.
        """
        output_path = self._resolve_output_path(url, output_path)
        last_exception = None
        for retry in range(self.max_retries + 1):
            try:
                if not self.driver:
                    self._init_driver()
                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)
                logger.info("生成 MHTML 快照...")
                result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
                mhtml_content = result['data']
                # newline='' keeps the snapshot's own line endings untouched.
                with open(output_path, 'w', encoding='utf-8', newline='') as f:
                    f.write(mhtml_content)
                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    raise RuntimeError("生成了空文件")
                logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
                return os.path.abspath(output_path)
            # TimeoutException subclasses WebDriverException, so it must come
            # first; otherwise this branch is unreachable and every timeout
            # needlessly rebuilds the session.
            except TimeoutException as e:
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    break
            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    logger.info("正在重建 WebDriver 会话...")
                    self._init_driver()
                    time.sleep(self.retry_delay)
                else:
                    logger.error("达到最大重试次数,放弃")
                    break
            except Exception as e:
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break  # not a WebDriver error: retrying will not help
        # All attempts failed: remove whatever was left at the output path.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(
            f"保存失败({type(last_exception).__name__}: {last_exception})"
        ) from last_exception

    def quit(self):
        """Close the browser session explicitly; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
                logger.info("WebDriver 会话已关闭")
            except Exception:
                pass
            self.driver = None

    def __del__(self):
        self.quit()
# ===== Manual test =====
if __name__ == "__main__":
    # Replace with your own cloud server's public IP.
    selenium_endpoint = "http://144.34.185.108:28098/wd/hub"
    sample_page = "https://www.epochtimes.com/gb/25/12/22/n14660274.htm"
    saver = RemoteMHTMLSaver(remote_url=selenium_endpoint, headless=True)
    try:
        saver.save_as_mhtml(url=sample_page, output_path="remote_example2.mhtml")
    except Exception as err:
        print(f"❌ 失败: {err}")
    # Release the remote session once the attempt is over.
    saver.quit()

View File

@ -0,0 +1,201 @@
import base64
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
TimeoutException,
SessionNotCreatedException,
InvalidSessionIdException
)
from selenium.webdriver.chrome.options import Options
# Configure root logging at INFO level with the default handler and format.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so this is a no-op if another module configured logging first.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by RemotePDFSaver.
logger = logging.getLogger(__name__)
class RemotePDFSaver:
    """Save web pages as PDF through a remote Selenium server.

    The WebDriver session is rebuilt automatically when it breaks.
    """

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2,
        print_options=None
    ):
        """Connect to the remote Selenium server.

        :param remote_url: address of the remote Selenium endpoint.
        :param headless: run Chrome without a visible window when True.
        :param max_retries: retries per save after the first attempt.
        :param retry_delay: seconds to wait before a retry.
        :param print_options: CDP Page.printToPDF parameters; when omitted,
            A4 portrait with backgrounds and no header/footer is used.
        :raises RuntimeError: if no session can be created.
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.print_options = print_options or {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,  # A4 width in inches
            'paperHeight': 11.69,  # A4 height in inches
        }
        self.driver = None
        self._init_driver()

    def _build_chrome_options(self):
        """Build a fresh Chrome ``Options`` object (one per session)."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    @staticmethod
    def _quit_quietly(driver):
        """Quit ``driver`` and ignore any error; the session is being discarded."""
        try:
            driver.quit()
        except Exception:
            pass

    def _init_driver(self):
        """Create the WebDriver session, replacing any existing one.

        Makes up to three attempts, two seconds apart.

        :raises RuntimeError: if every attempt fails.
        """
        if self.driver:
            self._quit_quietly(self.driver)
            # Drop the reference so a failed rebuild never leaves a dead
            # session behind in self.driver.
            self.driver = None
        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        for attempt in range(3):
            driver = None
            try:
                driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject the anti-detection script for every new document.
                driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
                self.driver = driver
                logger.info("✅ 远程 WebDriver 会话创建成功")
                return
            except Exception as e:
                logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
                if driver is not None:
                    # The session opened but could not be initialised; close
                    # it so it does not leak on the remote server.
                    self._quit_quietly(driver)
                if attempt < 2:
                    time.sleep(2)
                else:
                    raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}") from e

    @staticmethod
    def _resolve_output_path(url, output_path):
        """Return the output path, defaulting to ``<first host label>.pdf``."""
        if output_path is None:
            domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.pdf"
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        return output_path

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5):
        """Save a web page as PDF, retrying and rebuilding the session as needed.

        :param url: URL of the page to print.
        :param output_path: output file; ``.pdf`` is appended if missing.
            Defaults to a name derived from the URL's host.
        :param timeout: page-load timeout in seconds.
        :param wait_time: seconds to wait after loading before printing.
        :return: absolute path of the saved file.
        :raises RuntimeError: when all attempts fail; the last error is chained
            as ``__cause__`` and any file at ``output_path`` is removed.
        """
        output_path = self._resolve_output_path(url, output_path)
        last_exception = None
        for retry in range(self.max_retries + 1):
            try:
                if not self.driver:
                    self._init_driver()
                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)
                logger.info("生成 PDF...")
                result = self.driver.execute_cdp_cmd('Page.printToPDF', self.print_options)
                # result['data'] is the Base64-encoded PDF.
                pdf_data = base64.b64decode(result['data'])
                with open(output_path, 'wb') as f:
                    f.write(pdf_data)
                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    raise RuntimeError("生成了空文件")
                logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
                return os.path.abspath(output_path)
            # TimeoutException subclasses WebDriverException, so it must come
            # first; otherwise this branch is unreachable and every timeout
            # needlessly rebuilds the session.
            except TimeoutException as e:
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    break
            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    logger.info("正在重建 WebDriver 会话...")
                    self._init_driver()
                    time.sleep(self.retry_delay)
                else:
                    logger.error("达到最大重试次数,放弃")
                    break
            except Exception as e:
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break  # not a WebDriver error: retrying will not help
        # All attempts failed: remove whatever was left at the output path.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(
            f"保存失败({type(last_exception).__name__}: {last_exception})"
        ) from last_exception

    def quit(self):
        """Close the browser session explicitly; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
                logger.info("WebDriver 会话已关闭")
            except Exception:
                pass
            self.driver = None

    def __del__(self):
        self.quit()
# ===== Manual test =====
if __name__ == "__main__":
    # Replace with your own cloud server's public IP.
    selenium_endpoint = "http://144.34.185.108:28098/wd/hub"
    sample_page = "https://www.epochtimes.com/gb/25/12/22/n14660274.htm"
    saver = RemotePDFSaver(remote_url=selenium_endpoint, headless=True)
    try:
        saver.save_as_pdf(url=sample_page, output_path="remote_example2.pdf")
    except Exception as err:
        print(f"❌ 失败: {err}")
    # Release the remote session once the attempt is over.
    saver.quit()