# osc/research/pdf_downloader/set_raw_title_kcna.py
# 2026-01-20 11:07:54 +08:00
#
# 120 lines
# 3.5 KiB
# Python
# Raw Permalink Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pymysql
from typing import Dict, List, Tuple, Optional
# ================== Configuration ==================
# Keyword arguments unpacked into pymysql.connect(**DB_CONFIG) by main().
# NOTE(review): the host address and the root password are hard-coded in
# source — consider loading them from the environment or a secrets store.
DB_CONFIG = {
    "host": "47.113.231.200",
    "port": 28089,
    "user": "root",
    "password": "passok123A",
    "database": "dsp",
    "charset": "utf8mb4",
}

# Only selects which es_srcname values have their records processed
# (the value may be empty, because replacement is no longer performed).
TARGET_SRCNAMES: List[str] = [
    "http://www.kcna.kp/cn/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf",
    # Add the source names you need to process.
]
# ================== Utility functions ==================
def get_suffix_32(url: str) -> Optional[str]:
    """Return the last 32 characters of *url*.

    Args:
        url: URL string to take the suffix from.

    Returns:
        The trailing 32 characters of *url*, or ``None`` when *url* is
        falsy (empty or ``None``) or shorter than 32 characters.
    """
    if not url or len(url) < 32:
        return None
    return url[-32:]
def find_foreign_by_suffix(
    cursor, suffix: str, exclude_id: int
) -> Optional[Tuple[str, str]]:
    """Look up the foreign-language record that shares a URL suffix.

    Selects ``es_urltitle`` and ``es_urlcontent`` from one ``indeximos`` row
    whose ``es_urlname`` ends with *suffix* and whose ``es_sid`` differs from
    *exclude_id*.

    Args:
        cursor: Database cursor used to run the query; only ``execute`` and
            ``fetchone`` are called on it.
        suffix: URL tail compared against ``RIGHT(es_urlname, 32)``.
        exclude_id: ``es_sid`` of the record being processed, excluded so a
            record never matches itself.

    Returns:
        The ``(es_urltitle, es_urlcontent)`` row, or ``None`` when no row
        matches.
    """
    # Built line by line so the SQL text sent to the server stays exactly
    # the same regardless of how this function is indented.
    query = (
        "\n"
        "SELECT es_urltitle, es_urlcontent\n"
        "FROM indeximos\n"
        "WHERE\n"
        "es_sid != %s\n"
        "AND es_urlname IS NOT NULL\n"
        "AND CHAR_LENGTH(es_urlname) >= 32\n"
        "AND RIGHT(es_urlname, 32) = %s\n"
        "LIMIT 1\n"
    )
    cursor.execute(query, (exclude_id, suffix))
    # LIMIT 1 is used without ORDER BY, so when several rows share the
    # suffix the one returned is whichever the server yields first.
    return cursor.fetchone() or None
def update_chinese_record(cursor, record_id: int, title: str, content: str) -> None:
    """Overwrite ``es_title`` and ``es_content`` of a Chinese record.

    Issues a single ``UPDATE`` on ``indeximos`` for the row whose ``es_sid``
    equals *record_id*. Nothing is committed here; the caller owns the
    transaction.

    Args:
        cursor: Database cursor used to run the statement; only ``execute``
            is called on it.
        record_id: ``es_sid`` of the row to update.
        title: New value for ``es_title``.
        content: New value for ``es_content``.
    """
    # Built line by line so the SQL text sent to the server stays exactly
    # the same regardless of how this function is indented.
    update_query = (
        "\n"
        "UPDATE indeximos\n"
        "SET es_title = %s, es_content = %s\n"
        "WHERE es_sid = %s\n"
    )
    cursor.execute(update_query, (title, content, record_id))
# ================== Main logic ==================
def main():
    """Copy raw titles and contents onto the records of the target sources.

    Loads every ``indeximos`` row whose ``es_srcname`` is listed in
    ``TARGET_SRCNAMES`` (with a non-empty ``es_urlname`` and an
    ``es_loadtime`` after the hard-coded cut-off). For each one it looks for
    another row whose ``es_urlname`` ends in the same 32 characters and
    copies that row's ``es_urltitle`` / ``es_urlcontent`` into this row's
    ``es_title`` / ``es_content``.

    All updates are committed together after the loop. Any exception rolls
    the transaction back, is reported on stdout and is re-raised. Returns
    early without connecting when ``TARGET_SRCNAMES`` is empty.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定任何目标 es_srcname程序退出。")
        return

    conn = pymysql.connect(**DB_CONFIG)
    # Created inside the try block so the connection is still closed by the
    # finally clause if obtaining the cursor itself fails.
    cursor = None
    try:
        cursor = conn.cursor()

        # Fetch all Chinese records of the target sources. Only "%s"
        # placeholders are interpolated into the SQL text; the source names
        # themselves are passed as bound parameters.
        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
        query = (
            "\n"
            "SELECT es_sid, es_srcname, es_urlname\n"
            "FROM indeximos\n"
            f"WHERE es_srcname IN ({placeholders})\n"
            "AND es_urlname IS NOT NULL\n"
            "AND es_urlname != ''\n"
            "AND es_loadtime > '2026-01-16 10:40:00'\n"
        )
        cursor.execute(query, TARGET_SRCNAMES)
        # fetchall() materialises the rows, so the same cursor can be reused
        # for the per-record lookups and updates below.
        records = cursor.fetchall()
        total = len(records)
        print(f"共加载 {total} 条来自 {TARGET_SRCNAMES} 的记录用于匹配...")

        updated_count = 0
        skipped_short = 0
        for idx, (record_id, es_srcname, es_urlname) in enumerate(records, 1):
            suffix = get_suffix_32(es_urlname)
            if suffix is None:
                # URL shorter than 32 characters: nothing to match on.
                skipped_short += 1
                continue

            foreign_data = find_foreign_by_suffix(cursor, suffix, record_id)
            if foreign_data:
                title, content = foreign_data
                update_chinese_record(cursor, record_id, title, content)
                updated_count += 1
                print(f"[{idx}/{total}] ✅ 已更新 ID={record_id} | src={es_srcname}")

        # Single commit for the whole run, so the rollback in the except
        # branch undoes every update made by a failed run.
        conn.commit()

        print("\n" + "=" * 50)
        print("✅ 匹配完成!")
        print(f" - 成功更新: {updated_count}")
        print(f" - 因 URL 长度 <32 跳过: {skipped_short}")
        print(f" - 总处理: {total}")
    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误,已回滚: {e}")
        raise
    finally:
        if cursor is not None:
            cursor.close()
        conn.close()
# Run the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()