# osc/research/pdf_downloader/set_raw_title_rodong.py
import os
from collections import Counter
from typing import List, Tuple, Set

import jieba
import pymysql
# ================== Configuration ==================
def load_db_config(env=None) -> dict:
    """Build the keyword arguments passed to ``pymysql.connect``.

    Every setting can be overridden through an environment variable so
    that credentials do not have to live in the source tree.  When a
    variable is absent the historical value is used, which keeps existing
    deployments working unchanged.

    Args:
        env: Mapping to read overrides from.  Defaults to ``os.environ``.

    Returns:
        A dict with the keys ``host``, ``port``, ``user``, ``password``,
        ``database`` and ``charset``; ``port`` is always an ``int``.

    Raises:
        ValueError: If ``DSP_DB_PORT`` is set to a non-integer value.
    """
    if env is None:
        env = os.environ
    # SECURITY(review): the fallbacks below are real credentials committed to
    # source.  Set the DSP_DB_* variables and rotate/remove these defaults.
    return {
        'host': env.get('DSP_DB_HOST', '47.113.231.200'),
        'port': int(env.get('DSP_DB_PORT', '28089')),
        'user': env.get('DSP_DB_USER', 'root'),
        'password': env.get('DSP_DB_PASSWORD', 'passok123A'),
        'database': env.get('DSP_DB_NAME', 'dsp'),
        'charset': env.get('DSP_DB_CHARSET', 'utf8mb4'),
    }


DB_CONFIG = load_db_config()

# es_srcname values of the Chinese-language sites whose records are processed
TARGET_SRCNAMES: List[str] = [
    "http://www.rodong.rep.kp/cn/index.php?MUBAMUAxQA==",
    # add further sites here
]
FOREIGN_SRCNAME = 'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA=='
# Similarity threshold (keyword overlap ratio); 0.3 ~ 0.6 is recommended
SIMILARITY_THRESHOLD = 0.3
# ================== 文本相似度函数 ==================
def extract_keywords(text: str) -> Set[str]:
    """Return the distinct keywords of *text* as segmented by jieba.

    A token is kept only when it is at least two characters long and
    passes ``str.isalpha``, which drops single characters, numbers and
    punctuation.  Falsy input (``None`` or ``""``) yields an empty set
    without invoking the segmenter.
    """
    if text:
        return {
            token
            for token in jieba.lcut(text)
            if token.isalpha() and len(token) >= 2
        }
    return set()
def keyword_overlap_similarity(title1: str, title2: str) -> float:
    """Score how strongly the keyword sets of two titles overlap.

    Both titles are reduced to keyword sets with ``extract_keywords``.
    The score is the number of shared keywords divided by the size of the
    larger set, so it lies in ``[0.0, 1.0]``.

    Special cases:
        * exactly one title yields keywords -> ``0.0``
        * neither title yields keywords -> ``1.0`` if the raw titles are
          equal, otherwise ``0.0``
    """
    first = extract_keywords(title1)
    second = extract_keywords(title2)

    if first and second:
        shared = first.intersection(second)
        return len(shared) / max(len(first), len(second))

    if first or second:
        # Only one side produced keywords: nothing can overlap.
        return 0.0

    # No keywords on either side: fall back to exact title comparison.
    return 1.0 if title1 == title2 else 0.0
# ================== 数据库操作 ==================
def get_chinese_records(
    cursor, loaded_after: str = '2026-01-16 10:40:00'
) -> List[Tuple]:
    """Fetch the Chinese-site rows that are to be matched.

    Selects rows of ``indeximos`` whose ``es_srcname`` is listed in
    ``TARGET_SRCNAMES``, whose ``es_urltitle`` is non-blank, whose
    ``es_urltime`` is set and whose ``es_loadtime`` is later than
    *loaded_after*.

    Args:
        cursor: Open DB-API cursor; only ``execute`` and ``fetchall`` are
            used.
        loaded_after: Exclusive lower bound for ``es_loadtime``.  The
            default is the cut-off that was previously hard-coded in the
            query text.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the columns
        ``(es_sid, es_srcname, es_urlname, es_urltitle, es_urltime)``, or
        an empty list when ``TARGET_SRCNAMES`` is empty (no query is run).
    """
    if not TARGET_SRCNAMES:
        return []
    # Only '%s' markers are interpolated into the SQL text; every value is
    # bound by the driver.
    placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
    query = f"""
        SELECT es_sid, es_srcname, es_urlname, es_urltitle, es_urltime
        FROM indeximos
        WHERE es_srcname IN ({placeholders})
          AND es_urltitle IS NOT NULL AND TRIM(es_urltitle) != ''
          AND es_urltime IS NOT NULL
          AND es_loadtime > %s
    """
    cursor.execute(query, [*TARGET_SRCNAMES, loaded_after])
    return cursor.fetchall()
def get_foreign_candidates_by_time(
    cursor, pub_time, loaded_after: str = '2026-01-16 10:40:00'
) -> List[Tuple]:
    """Fetch every candidate row published at *pub_time*.

    A row qualifies when its ``es_urltime`` equals *pub_time*, its
    ``es_title`` is non-blank, its ``es_urlcontent`` is not NULL and its
    ``es_loadtime`` is later than *loaded_after*.

    NOTE(review): the query does not filter on ``es_srcname`` (the
    module-level ``FOREIGN_SRCNAME`` is not applied here), so rows from any
    site that share the publish time are returned.  Callers must skip the
    row they are matching against.

    Args:
        cursor: Open DB-API cursor; only ``execute`` and ``fetchall`` are
            used.
        pub_time: Value compared for equality with ``es_urltime``.
        loaded_after: Exclusive lower bound for ``es_loadtime``.  The
            default is the cut-off that was previously hard-coded in the
            query text.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the columns
        ``(es_sid, es_title, es_urltitle, es_urlcontent)``.
    """
    query = """
        SELECT es_sid, es_title, es_urltitle, es_urlcontent
        FROM indeximos
        WHERE es_urltime = %s
          AND es_title IS NOT NULL AND TRIM(es_title) != ''
          AND es_urlcontent IS NOT NULL
          AND es_loadtime > %s
    """
    cursor.execute(query, (pub_time, loaded_after))
    return cursor.fetchall()
def update_chinese_record(cursor, record_id: int, new_title: str, content: str):
    """Write a new title and body onto a single ``indeximos`` row.

    Sets ``es_title`` to *new_title* and ``es_content`` to *content* for
    the row whose ``es_sid`` equals *record_id*.  The statement is only
    executed on *cursor*; committing is left to the caller.
    """
    statement = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    bound_values = (new_title, content, record_id)
    cursor.execute(statement, bound_values)
# ================== 主逻辑 ==================
def main():
    """Match each Chinese record to a same-time candidate and copy its data.

    For every row returned by ``get_chinese_records`` the candidates that
    share its ``es_urltime`` are scored with ``keyword_overlap_similarity``
    (the Chinese ``es_urltitle`` against the candidate's ``es_title``).
    When the best score reaches ``SIMILARITY_THRESHOLD`` the candidate's
    ``es_urltitle`` and ``es_urlcontent`` are written onto the Chinese row
    through ``update_chinese_record``.  Progress is printed to stdout and
    all updates are committed together at the end.

    Raises:
        Exception: Any error raised during matching is re-raised after the
            transaction has been rolled back.  The connection is closed in
            every case.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定目标站点,退出。")
        return

    conn = pymysql.connect(**DB_CONFIG)
    try:
        # The cursor is opened inside the try so that a failure here still
        # reaches the finally clause and closes the connection.
        with conn.cursor() as cursor:
            chinese_records = get_chinese_records(cursor)
            total = len(chinese_records)
            print(f"共加载 {total} 条中文记录用于匹配...")
            matched_count = 0

            for idx, (cid, _srcname, _urlname, zh_title, pub_time) in enumerate(chinese_records, 1):
                print(f"\n[{idx}/{total}] ID={cid}, 时间={pub_time}, 标题='{zh_title[:30]}...'")
                candidates = get_foreign_candidates_by_time(cursor, pub_time)
                if not candidates:
                    print(" → 无同时间且有翻译标题的外文记录")
                    continue

                best_score = 0.0
                best_candidate = None
                for fid, trans_title, ori_title, content in candidates:
                    # The candidate query is not restricted by site, so the
                    # record itself may come back; never match it to itself.
                    if fid == cid:
                        continue
                    score = keyword_overlap_similarity(zh_title, trans_title)
                    print(f" 候选ID={fid} | 翻译标题='{trans_title[:30]}...' | 重合度={score:.3f}")
                    # Strict '>' keeps the first candidate on ties and never
                    # selects a candidate that scored 0.
                    if score > best_score:
                        best_score = score
                        best_candidate = (ori_title, content)

                if best_candidate and best_score >= SIMILARITY_THRESHOLD:
                    final_title, final_content = best_candidate
                    update_chinese_record(cursor, cid, final_title, final_content)
                    matched_count += 1
                    print(f" ✅ 匹配成功! 重合度={best_score:.3f}")
                else:
                    # Closing parenthesis added; the original message was
                    # left unbalanced.
                    print(f" ❌ 未达阈值(最高相似度={best_score:.3f})")

        conn.commit()
        print("\n" + "=" * 50)
        print(f"✅ 匹配完成!成功关联 {matched_count} / {total} 条记录。")
    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误,已回滚: {e}")
        raise
    finally:
        conn.close()
# Run the matching job only when this file is executed as a script.
if __name__ == "__main__":
    main()