161 lines
4.9 KiB
Python
161 lines
4.9 KiB
Python
import os
from collections import Counter
from typing import List, Tuple, Set

import jieba
import pymysql
|
||
|
||
# ================== 配置区 ==================
|
||
|
||
# Connection settings passed verbatim to pymysql.connect().
# NOTE(review): the host, user and root password used to be hard-coded here.
# Each value can now be overridden through an environment variable; the
# literals remain only as backward-compatible fallbacks and should be rotated
# and removed once the environment is provisioned.
DB_CONFIG = {
    'host': os.environ.get('DSP_DB_HOST', '47.113.231.200'),
    'port': int(os.environ.get('DSP_DB_PORT', '28089')),
    'user': os.environ.get('DSP_DB_USER', 'root'),
    'password': os.environ.get('DSP_DB_PASSWORD', 'passok123A'),
    'database': os.environ.get('DSP_DB_NAME', 'dsp'),
    'charset': 'utf8mb4',
}

# Chinese-language source sites (es_srcname values) whose records are processed.
TARGET_SRCNAMES: List[str] = [
    "http://www.rodong.rep.kp/cn/index.php?MUBAMUAxQA==",
    # Add further sites here.
]

# NOTE(review): this constant is not referenced by any code visible in this
# file; confirm whether the candidate query was meant to filter on it.
FOREIGN_SRCNAME = 'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA=='

# Similarity threshold (keyword overlap ratio); 0.3 ~ 0.6 is the suggested range.
SIMILARITY_THRESHOLD = 0.3
|
||
|
||
|
||
# ================== 文本相似度函数 ==================
|
||
|
||
def extract_keywords(text: str) -> Set[str]:
    """Return the set of keyword tokens found in *text*.

    The text is segmented with ``jieba.lcut``; a token is kept only when it is
    at least two characters long and ``str.isalpha()`` is true for it, which
    drops single characters, digits and punctuation. Falsy input (``None`` or
    an empty string) yields an empty set without invoking the segmenter.
    """
    keywords: Set[str] = set()
    if text:
        for token in jieba.lcut(text):
            if len(token) > 1 and token.isalpha():
                keywords.add(token)
    return keywords
|
||
|
||
|
||
def keyword_overlap_similarity(title1: str, title2: str) -> float:
    """Return the keyword overlap ratio of two titles, in the range 0.0-1.0.

    Both titles are reduced to keyword sets via ``extract_keywords``. The score
    is the number of shared keywords divided by the size of the larger set.

    Special cases:
        * neither title yields keywords: 1.0 if the raw titles are equal,
          otherwise 0.0;
        * exactly one title yields keywords: 0.0.
    """
    first = extract_keywords(title1)
    second = extract_keywords(title2)

    # No keywords on either side: fall back to exact string comparison.
    if not (first or second):
        if title1 == title2:
            return 1.0
        return 0.0

    # Keywords on one side only: nothing can overlap.
    if not (first and second):
        return 0.0

    shared = first.intersection(second)
    larger_size = max(len(first), len(second))
    return len(shared) / larger_size
|
||
|
||
|
||
# ================== 数据库操作 ==================
|
||
|
||
def get_chinese_records(cursor, loaded_after: str = '2026-01-16 10:40:00') -> List[Tuple]:
    """Fetch the Chinese-site records that are to be matched.

    Selects rows from ``indeximos`` whose ``es_srcname`` is one of
    ``TARGET_SRCNAMES``, that have a non-blank ``es_urltitle`` and a non-NULL
    ``es_urltime``, and whose ``es_loadtime`` is later than *loaded_after*.

    Args:
        cursor: DB-API cursor used to run the query.
        loaded_after: Exclusive lower bound for ``es_loadtime``. The default is
            the cut-off that was previously hard-coded in the SQL text; it is
            now sent as a bound parameter so callers can change the window.

    Returns:
        The result of ``cursor.fetchall()``: rows of
        ``(es_sid, es_srcname, es_urlname, es_urltitle, es_urltime)``.
        An empty list when ``TARGET_SRCNAMES`` is empty.
    """
    if not TARGET_SRCNAMES:
        return []

    # Only the *number* of placeholders is interpolated into the SQL text;
    # the site names themselves are passed as bound parameters.
    placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
    query = f"""
        SELECT es_sid, es_srcname, es_urlname, es_urltitle, es_urltime
        FROM indeximos
        WHERE es_srcname IN ({placeholders})
          AND es_urltitle IS NOT NULL AND TRIM(es_urltitle) != ''
          AND es_urltime IS NOT NULL
          AND es_loadtime > %s
    """
    cursor.execute(query, [*TARGET_SRCNAMES, loaded_after])
    return cursor.fetchall()
|
||
|
||
|
||
def get_foreign_candidates_by_time(
    cursor,
    pub_time,
    loaded_after: str = '2026-01-16 10:40:00',
) -> List[Tuple]:
    """Fetch every candidate row published at exactly *pub_time*.

    Rows are read from ``indeximos`` where ``es_urltime`` equals *pub_time*,
    ``es_title`` is non-blank, ``es_urlcontent`` is non-NULL and
    ``es_loadtime`` is later than *loaded_after*. (The previous docstring
    said ``es_abstract`` had to be non-empty; the query never referenced that
    column.)

    NOTE(review): the query does not restrict ``es_srcname``, so rows from any
    site with the same publication time are returned; confirm that this is
    intended.

    Args:
        cursor: DB-API cursor used to run the query.
        pub_time: Value compared for equality against ``es_urltime``.
        loaded_after: Exclusive lower bound for ``es_loadtime``. The default is
            the cut-off that was previously hard-coded in the SQL text; it is
            now sent as a bound parameter so callers can change the window.

    Returns:
        The result of ``cursor.fetchall()``: rows of
        ``(es_sid, es_title, es_urltitle, es_urlcontent)``.
    """
    query = """
        SELECT es_sid, es_title, es_urltitle, es_urlcontent
        FROM indeximos
        WHERE es_urltime = %s
          AND es_title IS NOT NULL AND TRIM(es_title) != ''
          AND es_urlcontent IS NOT NULL
          AND es_loadtime > %s
    """
    cursor.execute(query, (pub_time, loaded_after))
    return cursor.fetchall()
|
||
|
||
|
||
def update_chinese_record(cursor, record_id: int, new_title: str, content: str):
    """Overwrite the title and content of one ``indeximos`` row.

    Sets ``es_title`` to *new_title* and ``es_content`` to *content* on the row
    whose ``es_sid`` equals *record_id*. The statement is only executed on the
    given cursor; this function does not commit.
    """
    # Parameter order must follow the placeholders: title, content, then id.
    params = (new_title, content, record_id)
    cursor.execute(
        """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
        """,
        params,
    )
|
||
|
||
|
||
# ================== 主逻辑 ==================
|
||
|
||
def _pick_best_candidate(cid, zh_title, candidates):
    """Score each candidate against *zh_title* and return the best match.

    Args:
        cid: ``es_sid`` of the Chinese record being matched; a candidate with
            the same id is skipped.
        zh_title: Title of the Chinese record.
        candidates: Rows of ``(es_sid, es_title, es_urltitle, es_urlcontent)``.

    Returns:
        ``(best_score, best_candidate)`` where ``best_candidate`` is the
        ``(es_urltitle, es_urlcontent)`` pair of the highest-scoring row, or
        ``None`` when no candidate scored above 0.0.
    """
    best_score = 0.0
    best_candidate = None

    for fid, trans_title, ori_title, content in candidates:
        # Skip the record itself (should not happen in theory, but be safe).
        if fid == cid:
            continue

        score = keyword_overlap_similarity(zh_title, trans_title)
        print(f" 候选ID={fid} | 翻译标题='{trans_title[:30]}...' | 重合度={score:.3f}")

        # Strict comparison: on equal scores the earliest candidate is kept.
        if score > best_score:
            best_score = score
            best_candidate = (ori_title, content)

    return best_score, best_candidate


def main():
    """Match Chinese records to same-time candidates and copy over their text.

    For every row returned by ``get_chinese_records`` the candidates sharing
    its publication time are scored by title keyword overlap. When the best
    score reaches ``SIMILARITY_THRESHOLD`` the candidate's ``es_urltitle`` and
    ``es_urlcontent`` are written to the Chinese row's ``es_title`` and
    ``es_content``. All updates are committed together at the end; any
    exception rolls the transaction back and is re-raised.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定目标站点,退出。")
        return

    conn = pymysql.connect(**DB_CONFIG)
    try:
        # The cursor is created inside the outer try so the connection is
        # still closed if cursor creation itself fails.
        cursor = conn.cursor()
        try:
            chinese_records = get_chinese_records(cursor)
            total = len(chinese_records)
            print(f"共加载 {total} 条中文记录用于匹配...")

            matched_count = 0

            for idx, (cid, _srcname, _urlname, zh_title, pub_time) in enumerate(chinese_records, 1):
                print(f"\n[{idx}/{total}] ID={cid}, 时间={pub_time}, 标题='{zh_title[:30]}...'")

                candidates = get_foreign_candidates_by_time(cursor, pub_time)
                if not candidates:
                    print(" → 无同时间且有翻译标题的外文记录")
                    continue

                best_score, best_candidate = _pick_best_candidate(cid, zh_title, candidates)

                if best_candidate and best_score >= SIMILARITY_THRESHOLD:
                    final_title, final_content = best_candidate
                    update_chinese_record(cursor, cid, final_title, final_content)
                    matched_count += 1
                    print(f" ✅ 匹配成功! 重合度={best_score:.3f}")
                else:
                    print(f" ❌ 未达阈值(最高相似度={best_score:.3f})")

            # Single commit: either every update lands or none does.
            conn.commit()
            print("\n" + "=" * 50)
            print(f"✅ 匹配完成!成功关联 {matched_count} / {total} 条记录。")

        except Exception as e:
            conn.rollback()
            print(f"❌ 发生错误,已回滚: {e}")
            raise
        finally:
            cursor.close()
    finally:
        conn.close()
|
||
|
||
|
||
# Run the matching job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|