"""Link Chinese-edition records to their foreign-language counterparts.

For every record of the sites listed in ``TARGET_SRCNAMES`` the script looks
up rows of the ``indeximos`` table that share the same ``es_urltime``, scores
each candidate's ``es_title`` against the Chinese ``es_urltitle`` using a
jieba keyword-overlap ratio, and -- when the best score reaches
``SIMILARITY_THRESHOLD`` -- copies the candidate's ``es_urltitle`` and
``es_urlcontent`` into the Chinese record's ``es_title`` / ``es_content``.
"""
from collections import Counter
from typing import List, Optional, Sequence, Set, Tuple

import jieba
import pymysql

# ================== Configuration ==================
# NOTE(review): connection credentials are hard-coded in source; consider
# loading them from the environment or a secrets store instead.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
}

# Chinese sites (es_srcname values) whose records should be processed.
TARGET_SRCNAMES: List[str] = [
    "http://www.rodong.rep.kp/cn/index.php?MUBAMUAxQA==",
    # add your sites here
]

# es_srcname of the foreign-language site that candidates are taken from.
# Set to '' (or None) to consider rows from every source.
FOREIGN_SRCNAME = 'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA=='

# Similarity threshold (keyword overlap ratio); 0.3 ~ 0.6 is recommended.
SIMILARITY_THRESHOLD = 0.3

# Only rows loaded after this timestamp are read (used by both SELECTs).
MIN_LOADTIME = '2026-01-16 10:40:00'


# ================== Text similarity ==================
def extract_keywords(text: str) -> Set[str]:
    """Return the set of keywords found in *text*.

    The text is segmented with ``jieba.lcut``; tokens shorter than two
    characters or not purely alphabetic (digits, punctuation, mixed tokens)
    are dropped. An empty or ``None`` text yields an empty set.
    """
    if not text:
        return set()
    words = jieba.lcut(text)
    return {w for w in words if len(w) >= 2 and w.isalpha()}


def keyword_overlap_similarity(title1: str, title2: str) -> float:
    """Return the keyword overlap ratio of two titles, in ``[0.0, 1.0]``.

    The ratio is ``|kw1 & kw2| / max(|kw1|, |kw2|)``. When neither title
    yields any keyword the result is 1.0 for identical titles and 0.0
    otherwise; when only one of them yields keywords the result is 0.0.
    """
    kw1 = extract_keywords(title1)
    kw2 = extract_keywords(title2)
    if not kw1 and not kw2:
        return 1.0 if title1 == title2 else 0.0
    if not kw1 or not kw2:
        return 0.0
    overlap = kw1 & kw2
    return len(overlap) / max(len(kw1), len(kw2))


# ================== Database access ==================
def get_chinese_records(cursor) -> Sequence[Tuple]:
    """Fetch the Chinese records that still have to be matched.

    Returns rows of ``(es_sid, es_srcname, es_urlname, es_urltitle,
    es_urltime)`` for the sites in ``TARGET_SRCNAMES`` that have a non-blank
    title, a publication time and were loaded after ``MIN_LOADTIME``.
    Returns an empty list when no target site is configured.
    """
    if not TARGET_SRCNAMES:
        return []

    # Only "%s" placeholders are interpolated into the SQL text; the values
    # themselves are bound by the driver.
    placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
    query = f"""
        SELECT es_sid, es_srcname, es_urlname, es_urltitle, es_urltime
        FROM indeximos
        WHERE es_srcname IN ({placeholders})
          AND es_urltitle IS NOT NULL
          AND TRIM(es_urltitle) != ''
          AND es_urltime IS NOT NULL
          AND es_loadtime > %s
    """
    cursor.execute(query, [*TARGET_SRCNAMES, MIN_LOADTIME])
    return cursor.fetchall()


def get_foreign_candidates_by_time(
    cursor, pub_time, srcname: Optional[str] = None
) -> Sequence[Tuple]:
    """Fetch the candidate records published at *pub_time*.

    Returns rows of ``(es_sid, es_title, es_urltitle, es_urlcontent)`` whose
    ``es_title`` is non-blank, whose ``es_urlcontent`` is not NULL and which
    were loaded after ``MIN_LOADTIME``.

    Args:
        cursor: An open DB-API cursor.
        pub_time: The ``es_urltime`` value candidates must equal.
        srcname: When given (non-empty), only rows with this ``es_srcname``
            are returned; otherwise rows from every source are returned.
    """
    query = """
        SELECT es_sid, es_title, es_urltitle, es_urlcontent
        FROM indeximos
        WHERE es_urltime = %s
          AND es_title IS NOT NULL
          AND TRIM(es_title) != ''
          AND es_urlcontent IS NOT NULL
          AND es_loadtime > %s
    """
    params = [pub_time, MIN_LOADTIME]
    if srcname:
        query += " AND es_srcname = %s"
        params.append(srcname)
    cursor.execute(query, params)
    return cursor.fetchall()


def update_chinese_record(cursor, record_id: int, new_title: str, content: str):
    """Write *new_title* and *content* into the record with id *record_id*.

    Sets ``es_title`` and ``es_content`` of the row whose ``es_sid`` equals
    *record_id*. The caller is responsible for committing.
    """
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, content, record_id))


# ================== Main logic ==================
def _pick_best_candidate(zh_title: str, candidates, skip_id):
    """Score every candidate against *zh_title* and return the best one.

    Returns ``(best_score, best_candidate)`` where ``best_candidate`` is the
    ``(es_urltitle, es_urlcontent)`` pair of the highest-scoring row, or
    ``None`` when no candidate scored above 0. The row whose id equals
    *skip_id* is ignored. Each scored candidate is printed.
    """
    best_score = 0.0
    best_candidate = None
    for fid, trans_title, ori_title, content in candidates:
        # Never match a record against itself.
        if fid == skip_id:
            continue
        score = keyword_overlap_similarity(zh_title, trans_title)
        print(f" 候选ID={fid} | 翻译标题='{trans_title[:30]}...' | 重合度={score:.3f}")
        # Strict ">" keeps the first candidate on ties and never selects a
        # candidate with zero overlap.
        if score > best_score:
            best_score = score
            best_candidate = (ori_title, content)
    return best_score, best_candidate


def main():
    """Match all pending Chinese records and update them in one transaction.

    All updates are committed together at the end; any exception rolls the
    whole transaction back and is re-raised.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定目标站点,退出。")
        return

    conn = pymysql.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cursor:
            chinese_records = get_chinese_records(cursor)
            total = len(chinese_records)
            print(f"共加载 {total} 条中文记录用于匹配...")

            matched_count = 0
            for idx, (cid, _srcname, _urlname, zh_title, pub_time) in enumerate(chinese_records, 1):
                print(f"\n[{idx}/{total}] ID={cid}, 时间={pub_time}, 标题='{zh_title[:30]}...'")

                # Restrict candidates to the foreign site so that Chinese
                # rows (including ones already updated in this run) are not
                # offered as matches.
                candidates = get_foreign_candidates_by_time(
                    cursor, pub_time, FOREIGN_SRCNAME or None
                )
                if not candidates:
                    print(" → 无同时间且有翻译标题的外文记录")
                    continue

                best_score, best_candidate = _pick_best_candidate(zh_title, candidates, cid)

                if best_candidate and best_score >= SIMILARITY_THRESHOLD:
                    final_title, final_content = best_candidate
                    update_chinese_record(cursor, cid, final_title, final_content)
                    matched_count += 1
                    print(f" ✅ 匹配成功! 重合度={best_score:.3f}")
                else:
                    print(f" ❌ 未达阈值(最高相似度={best_score:.3f})")

            conn.commit()
            print("\n" + "=" * 50)
            print(f"✅ 匹配完成!成功关联 {matched_count} / {total} 条记录。")
    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误,已回滚: {e}")
        raise
    finally:
        conn.close()


if __name__ == "__main__":
    main()