diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py index 2449034..ccaf245 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py @@ -19,6 +19,8 @@ from MediaSpiders.utils.login_utils import login from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp from selenium.webdriver.common.action_chains import ActionChains +from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation + def form_cookie_dict(cookie_string): cookie_string_list = cookie_string.split(';') @@ -188,14 +190,25 @@ class TwitterSpider(scrapy.Spider): item['es_isrepost'] = 1 item['es_urltitle'] = author_full_text item['es_catalog1'] = author_full_text + # 判断是否需要翻译 + if needs_translation(author_full_text): + item['es_content'] = translate_single(author_full_text) # TODO 翻译 + else: + item['es_content'] = url_content legacy = result['quoted_status_result']['result']['legacy'] self.logger.info('采集引用推文原文信息') elif 'retweeted_status_result' in legacy: item['es_isrepost'] = 1 legacy = legacy['retweeted_status_result']['result']['legacy'] self.logger.info('采集转发推文原文信息') - item['es_content'] = legacy['full_text'] item['es_urlcontent'] = legacy['full_text'] + # 获取文本 + url_content = legacy['full_text'] + # 判断是否需要翻译 + if needs_translation(url_content): + item['es_content'] = translate_content_with_paragraphs(url_content) # TODO 翻译 + else: + item['es_content'] = url_content # 下载图片 image_url_list = [] if 'entities' in legacy and 'media' in legacy['entities']: diff --git a/spiders/MediaSpiders/MediaSpiders/utils/traslate_utils.py b/spiders/MediaSpiders/MediaSpiders/utils/traslate_utils.py new file mode 100644 index 0000000..39ea668 --- /dev/null +++ b/spiders/MediaSpiders/MediaSpiders/utils/traslate_utils.py @@ -0,0 +1,94 @@ +from MediaSpiders.settings import 
MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
# NOTE: the line above completes the `from MediaSpiders.settings import`
# statement whose head sits at the end of the preceding line of this diff.
import logging
import time
from typing import List, Tuple, Optional

import requests
from langdetect import detect, LangDetectException

logger = logging.getLogger(__name__)


def normalize_newlines(text: str) -> str:
    """Return *text* with CRLF and bare CR line endings converted to LF.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not text:
        return text
    # CRLF is replaced first; otherwise each CRLF would become two LFs.
    return text.replace('\r\n', '\n').replace('\r', '\n')


def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one piece of text through the HTTP translation service.

    The text is cut to ``MAX_TEXT_LENGTH`` characters before it is posted as
    JSON to ``TRANSLATE_API_URL``.

    Args:
        text: Text to translate.
        source_lang: Source language code sent to the service.
        target_lang: Target language code sent to the service.

    Returns:
        ``""`` when *text* is empty or whitespace-only. Otherwise the
        ``translated_text`` field of the JSON response, or ``None`` when the
        request fails, the response cannot be parsed, or the field is absent.
    """
    if not text or not text.strip():
        return ""

    payload = {
        "text": text[:MAX_TEXT_LENGTH],
        "source_lang": source_lang,
        "target_lang": target_lang,
    }

    try:
        response = requests.post(TRANSLATE_API_URL, json=payload, timeout=10)
        response.raise_for_status()
        result = response.json()
        return result.get("translated_text")
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported through
        # the logger and signalled to the caller as None, never raised.
        logger.warning("⚠️ 翻译失败: %s", e)
        return None


def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* line by line, tolerating per-line failures.

    Line endings are normalized to LF and the text is split on LF. Blank
    lines are kept as blank lines without calling the service. A line whose
    translation fails is replaced by an empty line (the original text of that
    line is NOT kept). After every attempted translation the function sleeps
    ``REQUEST_DELAY`` seconds.

    Args:
        content: Possibly multi-line text to translate.

    Returns:
        The translated lines joined with ``"\\n"``, or ``""`` when *content*
        is empty.
    """
    if not content:
        return ""

    translated_paragraphs = []

    for para in normalize_newlines(content).split('\n'):
        if not para.strip():
            # Preserve blank lines so the paragraph layout survives.
            translated_paragraphs.append("")
            continue

        trans = translate_single(para)
        if trans is None:
            # Translation failed: emit an empty line in place of this one.
            logger.warning(" ⚠️ 段落翻译失败,跳过: %s...", para[:30])
            translated_paragraphs.append("")
        else:
            translated_paragraphs.append(trans)

        # Pause after each request, including the last one, so that
        # back-to-back calls of this function stay spaced out as well.
        time.sleep(REQUEST_DELAY)

    return '\n'.join(translated_paragraphs)


# ================== Database operations ==================

def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Set ``es_title`` and ``es_content`` on the ``indeximos`` row with the given ``es_sid``.

    Only the UPDATE statement is executed here; nothing is committed.

    Args:
        cursor: DB-API cursor using the ``format`` paramstyle (``%s``).
        es_sid: Value matched against the ``es_sid`` column.
        new_title: New value for ``es_title``.
        new_content: New value for ``es_content``.
    """
    # Placeholders must be exactly ``%s``: the previous ``% s`` form only
    # worked with drivers that run plain Python %-interpolation on the query.
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))


def needs_translation(text: str) -> bool:
    """Decide whether *text* should be sent to the translation service.

    - Empty or whitespace-only text: ``False`` (nothing to translate).
    - Detected language is any Chinese variant (code starting with ``zh``):
      ``False``.
    - Any other detected language: ``True``.
    - Language cannot be detected (e.g. only digits or punctuation):
      ``True``, erring on the side of translating.

    NOTE(review): langdetect results may vary between runs on short or
    ambiguous input unless ``DetectorFactory.seed`` is fixed — confirm
    whether that matters for this pipeline.
    """
    if not text or not text.strip():
        return False

    try:
        lang = detect(text.strip())
    except LangDetectException:
        # Undetectable language: conservatively treat as needing translation.
        return True

    # langdetect reports Chinese as region-qualified codes ('zh-cn', 'zh-tw'),
    # never a bare 'zh', so match on the prefix rather than one exact code.
    return not lang.startswith('zh')