[twitter]新增推特推文翻译功能
This commit is contained in:
parent
7b3a83a1ab
commit
b827e33dbd
@ -19,6 +19,8 @@ from MediaSpiders.utils.login_utils import login
|
|||||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
|
||||||
|
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
||||||
|
|
||||||
|
|
||||||
def form_cookie_dict(cookie_string):
|
def form_cookie_dict(cookie_string):
|
||||||
cookie_string_list = cookie_string.split(';')
|
cookie_string_list = cookie_string.split(';')
|
||||||
@ -188,14 +190,25 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
item['es_isrepost'] = 1
|
item['es_isrepost'] = 1
|
||||||
item['es_urltitle'] = author_full_text
|
item['es_urltitle'] = author_full_text
|
||||||
item['es_catalog1'] = author_full_text
|
item['es_catalog1'] = author_full_text
|
||||||
|
# 判断是否需要翻译
|
||||||
|
if needs_translation(author_full_text):
|
||||||
|
item['es_content'] = translate_single(author_full_text) # TODO 翻译
|
||||||
|
else:
|
||||||
|
item['es_content'] = url_content
|
||||||
legacy = result['quoted_status_result']['result']['legacy']
|
legacy = result['quoted_status_result']['result']['legacy']
|
||||||
self.logger.info('采集引用推文原文信息')
|
self.logger.info('采集引用推文原文信息')
|
||||||
elif 'retweeted_status_result' in legacy:
|
elif 'retweeted_status_result' in legacy:
|
||||||
item['es_isrepost'] = 1
|
item['es_isrepost'] = 1
|
||||||
legacy = legacy['retweeted_status_result']['result']['legacy']
|
legacy = legacy['retweeted_status_result']['result']['legacy']
|
||||||
self.logger.info('采集转发推文原文信息')
|
self.logger.info('采集转发推文原文信息')
|
||||||
item['es_content'] = legacy['full_text']
|
|
||||||
item['es_urlcontent'] = legacy['full_text']
|
item['es_urlcontent'] = legacy['full_text']
|
||||||
|
# 获取文本
|
||||||
|
url_content = legacy['full_text']
|
||||||
|
# 判断是否需要翻译
|
||||||
|
if needs_translation(url_content):
|
||||||
|
item['es_content'] = translate_content_with_paragraphs(url_content) # TODO 翻译
|
||||||
|
else:
|
||||||
|
item['es_content'] = url_content
|
||||||
# 下载图片
|
# 下载图片
|
||||||
image_url_list = []
|
image_url_list = []
|
||||||
if 'entities' in legacy and 'media' in legacy['entities']:
|
if 'entities' in legacy and 'media' in legacy['entities']:
|
||||||
|
|||||||
94
spiders/MediaSpiders/MediaSpiders/utils/traslate_utils.py
Normal file
94
spiders/MediaSpiders/MediaSpiders/utils/traslate_utils.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
from MediaSpiders.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
from typing import List, Tuple, Optional
|
||||||
|
from langdetect import detect, LangDetectException
|
||||||
|
|
||||||
|
def normalize_newlines(text: str) -> str:
    """Replace CRLF and bare CR line endings in *text* with LF.

    Falsy input (``None`` or the empty string) is handed back unchanged.
    """
    if text:
        # Collapse CRLF pairs first so they do not become two newlines.
        without_crlf = text.replace('\r\n', '\n')
        return without_crlf.replace('\r', '\n')
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one piece of text through the HTTP translation endpoint.

    Args:
        text: Text to translate. Only the first ``MAX_TEXT_LENGTH``
            characters are sent.
        source_lang: Sent as the ``source_lang`` field of the request.
        target_lang: Sent as the ``target_lang`` field of the request.

    Returns:
        ``""`` when *text* is empty or whitespace-only (no request is made);
        otherwise the ``translated_text`` field of the JSON reply, which is
        ``None`` if the reply has no such field. ``None`` is also returned
        when the request, the HTTP status check or the JSON decoding raises.
    """
    if not (text and text.strip()):
        return ""

    request_body = dict(
        text=text[:MAX_TEXT_LENGTH],
        source_lang=source_lang,
        target_lang=target_lang,
    )

    try:
        reply = requests.post(TRANSLATE_API_URL, json=request_body, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error body.
        reply.raise_for_status()
        return reply.json().get("translated_text")
    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"⚠️ 翻译失败: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* line by line and re-join the result.

    Line endings are normalized with ``normalize_newlines`` and the text is
    split on ``"\\n"``. Blank lines are kept as empty lines. Every other
    line is passed to ``translate_single``; when that returns ``None`` a
    warning is printed and an empty line is emitted in its place. After each
    translation attempt the function sleeps for ``REQUEST_DELAY``.

    Returns:
        The translated lines joined with ``"\\n"``, or ``""`` when *content*
        is falsy.
    """
    if not content:
        return ""

    output_lines = []
    for segment in normalize_newlines(content).split('\n'):
        if segment.strip():
            translated = translate_single(segment)
            if translated is not None:
                output_lines.append(translated)
            else:
                # Failed segment: keep the line count stable with an empty line.
                print(f" ⚠️ 段落翻译失败,跳过: {segment[:30]}...")
                output_lines.append("")
            # Pause after every attempt, successful or not.
            time.sleep(REQUEST_DELAY)
        else:
            # Preserve blank lines without calling the translation service.
            output_lines.append("")

    return '\n'.join(output_lines)
|
||||||
|
|
||||||
|
|
||||||
|
# ================== 数据库操作 ==================
|
||||||
|
|
||||||
|
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Overwrite the title and content of one ``indeximos`` row.

    Args:
        cursor: Object whose ``execute(sql, params)`` method is called once.
            NOTE(review): assumes a DB-API cursor whose driver uses the
            ``format`` paramstyle (``%s``) — confirm against the caller.
        es_sid: Value matched against the ``es_sid`` column.
        new_title: Value written to ``es_title``.
        new_content: Value written to ``es_content``.

    The statement is only executed here; nothing is committed.
    """
    # Use the canonical ``%s`` marker. The earlier ``% s`` spelling is only
    # understood by drivers that interpolate with Python's ``%`` operator;
    # drivers that look for the literal ``%s`` token would not bind it.
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
|
||||||
|
|
||||||
|
|
||||||
|
def needs_translation(text: str) -> bool:
    """Decide whether *text* should be sent to the translator.

    Returns:
        ``False`` for empty or whitespace-only text, and for text that
        ``langdetect.detect`` labels ``'zh-cn'``. ``True`` for every other
        detected language, and also when detection raises
        ``LangDetectException``.

    NOTE(review): only the exact code ``'zh-cn'`` is treated as Chinese;
    other Chinese codes langdetect may report (e.g. ``'zh-tw'``) count as
    needing translation — confirm this is intended.
    NOTE(review): langdetect documents its results as non-deterministic
    unless a seed is set; short texts may be classified inconsistently.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        # Nothing to translate.
        return False

    try:
        detected = detect(stripped)
    except LangDetectException:
        # Detection failed; err on the side of translating.
        return True
    return detected != 'zh-cn'
|
||||||
Loading…
x
Reference in New Issue
Block a user