From 0b2abd342a64d3d384875b44a9872349f30080d6 Mon Sep 17 00:00:00 2001 From: yuxin-pc Date: Tue, 20 Jan 2026 11:07:39 +0800 Subject: [PATCH] Delete decode-url-for-rodong-news.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 不再使用该脚本 --- .../decode-url-for-rodong-news.py | 171 ------------------ 1 file changed, 171 deletions(-) delete mode 100644 research/pdf_downloader/decode-url-for-rodong-news.py diff --git a/research/pdf_downloader/decode-url-for-rodong-news.py b/research/pdf_downloader/decode-url-for-rodong-news.py deleted file mode 100644 index cd2fee4..0000000 --- a/research/pdf_downloader/decode-url-for-rodong-news.py +++ /dev/null @@ -1,171 +0,0 @@ -import mysql.connector -import base64 -import urllib.parse -import re - -# === 数据库配置 === -DB_CONFIG = { - 'host': '47.113.231.200', - 'port': 28089, - 'user': 'root', - 'password': 'passok123A', - 'database': 'dsp', - 'charset': 'utf8mb4', -} - - -def decode_rodong_url(url): - """ - 从朝鲜劳动新闻URL中提取并Base64解码参数部分 - 示例输入: http://www.rodong.rep.kp/cn/index.php?MTJAMjAyNi0wMS0wNS0wMDJAMUAxQEAwQDNA== - 输出: '12@2026-01-05-002@1@1@@0@37@' 或 None(若无法解析) - """ - if not url or 'index.php?' not in url: - return None - - try: - # 方法1:使用 urllib.parse 解析 - parsed = urllib.parse.urlparse(url) - query = parsed.query - - # 如果 query 为空,尝试用正则兜底(应对非常规URL) - if not query: - match = re.search(r'index\.php\?([A-Za-z0-9+/=]+)', url) - if match: - query = match.group(1) - else: - return None - - # Base64 解码 - decoded_bytes = base64.b64decode(query) - decoded_str = decoded_bytes.decode('utf-8') - return decoded_str - - except Exception as e: - # 记录错误但不中断整体流程 - print(f" 解码失败 (URL: {url[:60]}...): {e}") - return None - - -def main(): - try: - # 连接数据库 - conn = mysql.connector.connect(**DB_CONFIG) - cursor = conn.cursor(buffered=True) - - # 查询所有需要处理的记录(只处理包含 index.php? 的 URL) - print("正在查询待处理的新闻记录...") - cursor.execute(""" - SELECT es_sid, es_urlname - FROM indeximos - WHERE es_sitename = '劳动新闻' - AND (es_tags IS NULL OR es_tags = '') - """) - records = cursor.fetchall() - - if not records: - print("没有找到需要处理的记录。") - return - - print(f"共找到 {len(records)} 条待处理记录。") - - updated_count = 0 - for i, (es_sid, es_urlname) in enumerate(records, 1): - print(f"[{i}/{len(records)}] 处理 ID={es_sid} ...", end=" ") - - decoded = decode_rodong_url(es_urlname) - if decoded is not None: - # 更新 es_tags 字段 - update_query = "UPDATE indeximos SET es_tags = %s WHERE es_sid = %s" - cursor.execute(update_query, (decoded, es_sid)) - conn.commit() - updated_count += 1 - print(f"成功 → {decoded[:50]}{'...' if len(decoded) > 50 else ''}") - else: - print("跳过(无法解码)") - - print(f"\n✅ 完成!共更新 {updated_count} 条记录。") - - except mysql.connector.Error as db_err: - print(f"❌ 数据库错误: {db_err}") - except Exception as e: - print(f"❌ 脚本执行出错: {e}") - finally: - if 'cursor' in locals(): - cursor.close() - if 'conn' in locals() and conn.is_connected(): - conn.close() - print("数据库连接已关闭。") - - -if __name__ == "__main__": - - # 动态替换 SQL 中的表名(注意:表名不能用参数化,需手动拼接,但确保安全) - # 为安全起见,可加校验 - if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', 'indeximos'): - raise ValueError("表名包含非法字符!") - - # 临时替换函数中的表名(更优雅的方式是传参,此处为简洁) - import sys - - module = sys.modules[__name__] - - - # 修改 main 函数中的 SQL(通过字符串替换) - # 实际建议:将表名作为全局变量或参数传递 - - # 更简单做法:在 main() 上方定义 TABLE_NAME,然后在 SQL 中直接引用 - # 我们重写 main 函数内部逻辑以支持变量表名 - - # 重新定义带表名参数的主逻辑 - def main_with_table(table_name): - try: - conn = mysql.connector.connect(**DB_CONFIG) - cursor = conn.cursor(buffered=True) - - # 查询 - query_sql = f""" - SELECT es_sid, es_urlname - FROM `{table_name}` - WHERE es_urlname LIKE '%index.php?%' - AND (es_tags IS NULL OR es_tags = '') - """ - cursor.execute(query_sql) - records = cursor.fetchall() - - if not records: - print("没有找到需要处理的记录。") - return - - print(f"共找到 {len(records)} 条待处理记录。") - - updated_count = 0 - for i, (es_sid, es_urlname) in enumerate(records, 1): - print(f"[{i}/{len(records)}] 处理 ID={es_sid} ...", end=" ") - - decoded = decode_rodong_url(es_urlname) - if decoded is not None: - update_sql = f"UPDATE `{table_name}` SET es_tags = %s WHERE es_sid = %s" - cursor.execute(update_sql, (decoded, es_sid)) - conn.commit() - updated_count += 1 - print(f"成功 → {decoded[:50]}{'...' if len(decoded) > 50 else ''}") - else: - print("跳过(无法解码)") - - print(f"\n✅ 完成!共更新 {updated_count} 条记录。") - - except mysql.connector.Error as db_err: - print(f"❌ 数据库错误: {db_err}") - except Exception as e: - print(f"❌ 脚本执行出错: {e}") - finally: - if 'cursor' in locals(): - cursor.close() - if 'conn' in locals() and conn.is_connected(): - conn.close() - print("数据库连接已关闭。") - - - # 执行 - main_with_table('indeximos')