From 399165404e1962a072f936ed57aec79ac864d794 Mon Sep 17 00:00:00 2001
From: DELL
Date: Tue, 20 Jan 2026 16:13:05 +0800
Subject: [PATCH 1/3] [General translation] Feature commit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spiders/MediaSpiders/run.py                   |   2 +-
 .../WebsiteSpider/WebsiteSpider/__init__.py   | 175 ++++++++++++++++++
 spiders/WebsiteSpider/WebsiteSpider/items.py  |   1 +
 .../WebsiteSpider/WebsiteSpider/settings.py   |  13 +-
 .../spiders/website_info_common.py            |   7 +-
 .../WebsiteSpider/utils/parser_utils.py       |  26 ++-
 .../WebsiteSpider/utils/traslate_utils.py     |  75 ++++++++
 7 files changed, 293 insertions(+), 6 deletions(-)
 create mode 100644 spiders/WebsiteSpider/WebsiteSpider/utils/traslate_utils.py

diff --git a/spiders/MediaSpiders/run.py b/spiders/MediaSpiders/run.py
index 8f74cad..89519af 100644
--- a/spiders/MediaSpiders/run.py
+++ b/spiders/MediaSpiders/run.py
@@ -19,4 +19,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(dirpath)
 
 # 等效于:scrapy crawl FacebookUserSpider -a params="{}"
-execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])
+execute(['scrapy', 'crawl', 'WeiboUserSpider', '-a', 'params={}'])
diff --git a/spiders/WebsiteSpider/WebsiteSpider/__init__.py b/spiders/WebsiteSpider/WebsiteSpider/__init__.py
index e69de29..6651183 100644
--- a/spiders/WebsiteSpider/WebsiteSpider/__init__.py
+++ b/spiders/WebsiteSpider/WebsiteSpider/__init__.py
@@ -0,0 +1,175 @@
+import time
+from typing import List, Tuple, Optional
+
+import pymysql
+import requests
+
+# ================== Configuration ==================
+
+# Database configuration
+DB_CONFIG = {
+    'host': '47.113.231.200',
+    'port': 28089,
+    'user': 'root',
+    'password': 'passok123A',
+    'database': 'dsp',
+    'charset': 'utf8mb4',
+    'autocommit': False  # manage transactions manually
+}
+
+# Translation API address (replace with your server IP or domain)
+TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"
+
+# Cutoff time (format: YYYY-MM-DD HH:MM:SS)
+LOADTIME_AFTER = "2026-01-16 10:40:00"
+
+# Target site list
+TARGET_SRCNAMES = [
+    'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA==',
+    'http://www.kcna.kp/kp/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf',
+    'https://energynow.com/category/press_releases/',
+    'https://www.fao.org/newsroom/en'  # add your site here
+]
+
+# Delay between requests (seconds) to avoid API rate limiting
+REQUEST_DELAY = 1
+
+# Maximum text length (must match the API)
+MAX_TEXT_LENGTH = 5000
+
+
+def normalize_newlines(text: str) -> str:
+    """Normalize \r\n and \r to \n."""
+    if not text:
+        return text
+    return text.replace('\r\n', '\n').replace('\r', '\n')
+
+
+def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
+    """Translate a single text segment; return None on failure."""
+    if not text or not text.strip():
+        return ""
+
+    payload = {
+        "text": text[:MAX_TEXT_LENGTH],
+        "source_lang": source_lang,
+        "target_lang": target_lang
+    }
+
+    try:
+        response = requests.post(TRANSLATE_API_URL, json=payload, timeout=10)
+        response.raise_for_status()
+        result = response.json()
+        return result.get("translated_text")
+    except Exception as e:
+        print(f"⚠️ Translation failed: {e}")
+        return None
+
+
+def translate_content_with_paragraphs(content: str) -> str:
+    """
+    Translate content paragraph by paragraph, with fault tolerance:
+    - if a paragraph fails, skip it (keep an empty line or the original text)
+    - return the re-joined full content
+    """
+    if not content:
+        return ""
+
+    # Normalize line endings
+    content = normalize_newlines(content)
+    paragraphs = content.split('\n')
+    translated_paragraphs = []
+
+    for para in paragraphs:
+        if not para.strip():
+            # keep empty lines
+            translated_paragraphs.append("")
+            continue
+
+        trans = translate_single(para)
+        if trans is None:
+            # Paragraph translation failed: skip it (optionally keep the original text or leave it blank)
+            print(f"  ⚠️ Paragraph translation failed, skipping: {para[:30]}...")
+            translated_paragraphs.append("")  # or append(para) to keep the original text
+        else:
+            translated_paragraphs.append(trans)
+
+        time.sleep(REQUEST_DELAY)
+
+    return '\n'.join(translated_paragraphs)
+
+
+# ================== Database operations ==================
+
+def update_record(cursor, es_sid: int, new_title: str, new_content: str):
+    update_query = """
+        UPDATE indeximos
+        SET es_title = %s, es_content = %s
+        WHERE es_sid = %s
+    """
+    cursor.execute(update_query, (new_title, new_content, es_sid))
+
+
+# ================== Main logic ==================
+
+def main():
+    conn = pymysql.connect(**DB_CONFIG)
+    cursor = conn.cursor()
+
+    try:
+        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
+        query = f"""
+            SELECT es_sid, es_urltitle, es_urlcontent
+            FROM indeximos
+            WHERE es_loadtime > %s
+              AND (es_title IS NULL OR TRIM(es_title) = '')
+              AND es_srcname IN ({placeholders})
+              AND LENGTH(es_video) > 5
+        """
+        params = [LOADTIME_AFTER] + TARGET_SRCNAMES
+        cursor.execute(query, params)
+        records: List[Tuple] = cursor.fetchall()
+
+        total = len(records)
+        print(f"✅ Found {total} records to translate")
+
+        if total == 0:
+            return
+
+        success_count = 0
+
+        for idx, (es_sid, urltitle, urlcontent) in enumerate(records, 1):
+            print(f"\n[{idx}/{total}] Processing es_sid={es_sid}")
+            start_time = time.time()
+
+            # Translate the title
+            title_trans = translate_single(urltitle) if urltitle else ""
+            if title_trans is None:
+                print("  → Title translation failed, skipping this record")
+                continue
+
+            # Translate the content (by paragraph, fault-tolerant)
+            content_trans = translate_content_with_paragraphs(urlcontent)
+
+            # Update the database
+            update_record(cursor, es_sid, title_trans, content_trans)
+            success_count += 1
+
+            elapsed = time.time() - start_time
+            print(f"  ✅ Translated | elapsed: {elapsed:.2f}s | title: {title_trans[:30]}...")
+
+        conn.commit()
+
+        print(f"\n🎉 Done! Successfully translated {success_count} / {total} records")
+
+    except Exception as e:
+        conn.rollback()
+        print(f"❌ Error: {e}")
+        raise
+    finally:
+        cursor.close()
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/spiders/WebsiteSpider/WebsiteSpider/items.py b/spiders/WebsiteSpider/WebsiteSpider/items.py
index 0163cba..79d6d92 100644
--- a/spiders/WebsiteSpider/WebsiteSpider/items.py
+++ b/spiders/WebsiteSpider/WebsiteSpider/items.py
@@ -19,6 +19,7 @@ class WebsiteSpiderItem(scrapy.Item):
     es_extname = scrapy.Field()
     es_channel = scrapy.Field()
     es_groupname = scrapy.Field()
+    es_title = scrapy.Field()
     es_urltitle = scrapy.Field()
     es_urltopic = scrapy.Field()
     es_lasttime = scrapy.Field()
diff --git a/spiders/WebsiteSpider/WebsiteSpider/settings.py b/spiders/WebsiteSpider/WebsiteSpider/settings.py
index 44d14b5..975337e 100644
--- a/spiders/WebsiteSpider/WebsiteSpider/settings.py
+++ b/spiders/WebsiteSpider/WebsiteSpider/settings.py
@@ -26,8 +26,10 @@ PER_BATCH_IP_USE_TIMES = 5  # 代理中间件每次从ip池获取一批ip,定
 
 # REDIS_HOST = '38.54.94.107'
 # REDIS_PORT = '28097'
-REDIS_HOST = '10.55.13.3'
-REDIS_PORT = '7379'
+# REDIS_HOST = '10.55.13.3'
+# REDIS_PORT = '7379'
+REDIS_HOST = '107.182.191.3'
+REDIS_PORT = 7379
 REDIS_PWD = 'jlkj-841-2-redis'
 REDIS_PARAMS = {
     'password': 'jlkj-841-2-redis',
@@ -164,3 +166,10 @@ ITEM_PIPELINES = {
     'scrapy.pipelines.images.ImagesPipeline': 2,
     'WebsiteSpider.pipelines.ProtobufSavePipeline': 300,
 }
+
+############################## Translation
+MAX_TEXT_LENGTH = 100
+# Translation API address (replace with your server IP or domain)
+TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"
+# Delay between requests (seconds) to avoid API rate limiting
+REQUEST_DELAY = 1
diff --git a/spiders/WebsiteSpider/WebsiteSpider/spiders/website_info_common.py b/spiders/WebsiteSpider/WebsiteSpider/spiders/website_info_common.py
index 63a50a6..47a127b 100644
--- a/spiders/WebsiteSpider/WebsiteSpider/spiders/website_info_common.py
+++ b/spiders/WebsiteSpider/WebsiteSpider/spiders/website_info_common.py
@@ -5,7 +5,9 @@ import re
 import scrapy
 import validators
 from scrapy_redis.spiders import RedisSpider
+import redis
 
+from WebsiteSpider.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD
 from WebsiteSpider.scrapy_selenium import SeleniumRequest
 from WebsiteSpider.utils.http_utils import build_url
 from WebsiteSpider.utils.parser_utils import parse_item_from_response
@@ -18,7 +20,8 @@ class WebsiteInfoCommonSpider(RedisSpider):
         super(WebsiteInfoCommonSpider, self).__init__(*args, **kwargs)
         json_params = json.loads(params)
         self.name = 'WebSite_' + json_params['clusterName']
-        self.redis_client = None
+        self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
+                                        password=REDIS_PWD)
 
         if 'job_id' in json_params:
             self.job_id = json_params['job_id']
@@ -36,7 +39,7 @@ class WebsiteInfoCommonSpider(RedisSpider):
         # 根据url特征判断是否为内容页,若是则解析文本内容
         detail_page_reg = parse_rule['detailPageReg']
         if detail_page_reg == "" or re.search(detail_page_reg, response.url) is not None:
-            yield_flag, webpage_item = parse_item_from_response(response, parse_rule)
+            yield_flag, webpage_item = parse_item_from_response(response, parse_rule, self.redis_client)
 
             if yield_flag:
                 yield webpage_item
diff --git a/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py b/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py
index 3851dc6..041e510 100644
--- a/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py
+++ b/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py
@@ -9,9 +9,10 @@ from scrapy.loader import ItemLoader
 from WebsiteSpider.items import WebsiteSpiderItem
 from WebsiteSpider.utils.date_utils import transfer_time_zone, get_time_stamp
 from WebsiteSpider.utils.http_utils import filter_html_tags, build_url
+from WebsiteSpider.utils.traslate_utils import translate_single, translate_content_with_paragraphs, update_record
 
 
-def parse_item_from_response(response, parse_rule):
+def parse_item_from_response(response, parse_rule, redis_client):
     current_url = response.url
     allowed_domains = parse_rule['allowDomain'].split(';')
     mapping = parse_rule['fieldMappings']
@@ -116,6 +117,7 @@
             logger.info("urltime: %s" % webpage_item['es_urltime'])
         except KeyError:
             logger.info('时间解析失败,当前页面url: %s' % response.url)
+    time_parse_rule = None
 
     if 'dateReg' in mapping:
         time_parse_rule = {
@@ -155,4 +157,26 @@
             logger.info('时间无法解析,解析规则是:' + mapping['es_urltime'])
     if filter_VIP_content:
         logger.info('当前内容是VIP文章,并不完整,已经过滤。')
+    if yield_flag:
+        try:
+            # 1. Fetch the raw list of sites to translate from Redis
+            raw_urls = redis_client.lrange('WebsiteSpider:translate_sites', 0, -1)
+            translate_list = [
+                url_bytes.decode('utf-8').strip()
+                for url_bytes in raw_urls
+                if url_bytes and url_bytes.decode('utf-8').strip()
+            ]
+            if webpage_item['es_srcname'] in translate_list:
+                # Translate the title
+                webpage_item['es_title'] = translate_single(webpage_item['es_urltitle'])
+                if webpage_item['es_title'] is None:
+                    logger.warning(" → Title translation failed, skipping this item")
+                else:
+                    logger.info(f"Translation succeeded, translated title length: {len(webpage_item['es_title'])}")
+                    # Translate the content (by paragraph, fault-tolerant)
+                    webpage_item['es_content'] = translate_content_with_paragraphs(webpage_item['es_urlcontent'])
+                    logger.info(f"Translation succeeded, translated content length: {len(webpage_item['es_content'])}")
+        except Exception as e:
+            logger.error(repr(e))
+
     return yield_flag, webpage_item
diff --git a/spiders/WebsiteSpider/WebsiteSpider/utils/traslate_utils.py b/spiders/WebsiteSpider/WebsiteSpider/utils/traslate_utils.py
new file mode 100644
index 0000000..11cb332
--- /dev/null
+++ b/spiders/WebsiteSpider/WebsiteSpider/utils/traslate_utils.py
@@ -0,0 +1,75 @@
+from WebsiteSpider.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
+import requests
+import time
+from typing import List, Tuple, Optional
+
+def normalize_newlines(text: str) -> str:
+    """Normalize \r\n and \r to \n."""
+    if not text:
+        return text
+    return text.replace('\r\n', '\n').replace('\r', '\n')
+
+
+def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
+    """Translate a single text segment; return None on failure."""
+    if not text or not text.strip():
+        return ""
+
+    payload = {
+        "text": text[:MAX_TEXT_LENGTH],
+        "source_lang": source_lang,
+        "target_lang": target_lang
+    }
+
+    try:
+        response = requests.post(TRANSLATE_API_URL, json=payload, timeout=10)
+        response.raise_for_status()
+        result = response.json()
+        return result.get("translated_text")
+    except Exception as e:
+        print(f"⚠️ Translation failed: {e}")
+        return None
+
+
+def translate_content_with_paragraphs(content: str) -> str:
+    """
+    Translate content paragraph by paragraph, with fault tolerance:
+    - if a paragraph fails, skip it (keep an empty line or the original text)
+    - return the re-joined full content
+    """
+    if not content:
+        return ""
+
+    # Normalize line endings
+    content = normalize_newlines(content)
+    paragraphs = content.split('\n')
+    translated_paragraphs = []
+
+    for para in paragraphs:
+        if not para.strip():
+            # keep empty lines
+            translated_paragraphs.append("")
+            continue
+
+        trans = translate_single(para)
+        if trans is None:
+            # Paragraph translation failed: skip it (optionally keep the original text or leave it blank)
+            print(f"  ⚠️ Paragraph translation failed, skipping: {para[:30]}...")
+            translated_paragraphs.append("")  # or append(para) to keep the original text
+        else:
+            translated_paragraphs.append(trans)
+
+        time.sleep(REQUEST_DELAY)
+
+    return '\n'.join(translated_paragraphs)
+
+
+# ================== Database operations ==================
+
+def update_record(cursor, es_sid: int, new_title: str, new_content: str):
+    update_query = """
+        UPDATE indeximos
+        SET es_title = %s, es_content = %s
+        WHERE es_sid = %s
+    """
+    cursor.execute(update_query, (new_title, new_content, es_sid))

From 0008e619d1af444a9ddf085f9a8fac04e57d3af2 Mon Sep 17 00:00:00 2001
From: DELL
Date: Tue, 20 Jan 2026 16:35:35 +0800
Subject: [PATCH 2/3] [Twitter] Remove redundant comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../MediaSpiders/spiders/TwitterUserSpider.py | 81 -------------------
 1 file changed, 81 deletions(-)

diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py
index fa210d4..3a62562 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py
@@ -89,85 +89,6 @@ class TwitterSpider(scrapy.Spider):
             login_users=login_users,
             response=response
         )
-        # driver = response.request.meta['driver']
-        # driver.maximize_window()
-        # # 1. 打开第一个标签页
-        # driver.get('https://x.com/i/flow/login')
-        # user_list = []
-        # for u in login_users:
-        #     user_list.append(json.loads(u.decode()))
-        #
-        # login_user = random.choice(user_list)
-
-        # wait = WebDriverWait(driver, 15)
-        # # 2. 通过 JS 打开第二个标签页(新 Tab)
-        # time.sleep(5)
-        # driver.execute_script("window.open('');")
-        # driver.execute_script("window.open('https://x.com/i/flow/login', '_blank');")
-        #
-        # # 3. 获取所有标签页句柄
-        # handles = driver.window_handles  # [handle1, handle2]
-        #
-        # # 4. 切换到第二个标签页(可选)
-        # driver.switch_to.window(handles[1])
-        #
-        # logger.info(f"login as user {login_user['uid']}")
-        # # time.sleep(random.uniform(1.5, 3.0))
-        # # driver.find_element_by_xpath("//input").send_keys(login_user['uid'])
-        # # 等待并定位用户名输入框
-        # username_input = wait.until(
-        #     EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[autocomplete="username"]'))
-        # )
-        #
-        # # 模拟真人逐字输入(带随机延迟)
-        # username = login_user['uid']
-        # for char in username:
-        #     username_input.send_keys(char)
-        #     time.sleep(random.uniform(0.05, 0.2))  # 每个字符间隔 50~200ms
-        #
-        # time.sleep(random.uniform(0.3, 0.8))  # 输入完后稍作停顿
-        #
-        # # 尝试点击 "Next" 按钮(主逻辑:带文本的按钮)
-        # try:
-        #     next_button = wait.until(
-        #         EC.element_to_be_clickable(
-        #             (By.XPATH, "//button[.//span[contains(text(), 'Next') or contains(text(), '下一步')]]")
-        #         )
-        #     )
-        #     body = driver.find_element(By.TAG_NAME, "body")
-        #     ActionChains(driver).move_to_element_with_offset(body, 100, 100).perform()
-        #     time.sleep(0.5)
-        #     # 模拟鼠标移动到按钮并点击
-        #     actions = ActionChains(driver)
-        #     actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
-        #
-        # except Exception as e:
-        #     logger.info("主 Next 按钮未找到,尝试备用定位方式")
-        #     try:
-        #         # 备用:通过 role 定位第二个 button
-        #         next_button = driver.find_element(By.XPATH, "//button[@role='button'][2]")
-        #         actions = ActionChains(driver)
-        #         actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
-        #     except Exception as e2:
-        #         logger.error(f"两种方式均无法点击 Next 按钮: {e}, {e2}")
-        #         raise
-        # time.sleep(random.uniform(1.5, 5.0))  # 等待页面加载
-        # try:
-        #     logger.info("输入手机号验证...")
-        #     driver.find_element_by_xpath("//input[@name='text']").send_keys("+8619962025641")
-        #     # driver.find_element_by_xpath("//button[@data-testid='ocfEnterTextNextButton']").click()
-        #     driver.find_element_by_xpath(driver.find_element_by_xpath("//button[.//span[text()='下一步']]")).click()
-        #     time.sleep(random.uniform(1.5, 5.0))  # 等待页面加载
-        # except Exception:
-        #     logger.info("无需输入手机号验证")
-        # driver.find_element_by_xpath("//input[@name='password']").send_keys(login_user['pwd'])
-        # driver.find_element_by_xpath("//button[@data-testid='LoginForm_Login_Button']").click()
-        # time.sleep(random.uniform(1.5, 5.0))  # 等待页面加载
-        # try:
-        #     driver.find_element_by_xpath("//button[@data-testid='confirmationSheetConfirm']").click()
-        #     time.sleep(random.uniform(1.5, 10.0))  # 等待页面加载
-        # except:
-        #     time.sleep(5)
         cookies = driver.get_cookies()
         # 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
         self.cookie_dict = {}
@@ -190,10 +111,8 @@ class TwitterSpider(scrapy.Spider):
             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
             'content-type': 'application/json',
             'authorization': self.authorization,
-            # 'x-twitter-active-user': 'yes',
             'Origin': 'https://twitter.com',
             'Cookie': cookie_string,
-            # 'Connection': 'keep-alive',
             'X-Csrf-Token': ct0
         }
         self.filter_key = self.settings['TWITTER_FILTER_KEY']

From 92c8cdf9b20dc7b53b8f4c1b48c3313ae1b841c5 Mon Sep 17 00:00:00 2001
From: DELL
Date: Tue, 20 Jan 2026 16:36:44 +0800
Subject: [PATCH 3/3] [Weibo] Fetch required cookies from redis so requests succeed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../MediaSpiders/spiders/WeiboUserSpider.py   | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/WeiboUserSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/WeiboUserSpider.py
index d433a4a..f0fed88 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/WeiboUserSpider.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/WeiboUserSpider.py
@@ -2,10 +2,12 @@
 import json
 import time
 
+import redis
 import scrapy
 from redisbloom.client import Client
 
 from MediaSpiders.items import MediaspidersItem
+from MediaSpiders.spiders.TwitterUserSpider import form_cookie_dict
 from MediaSpiders.utils.http_utils import http_post
 from MediaSpiders.utils.string_utils import find_text
 from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
@@ -59,6 +61,11 @@ class WeiboSpider(scrapy.Spider):
         account_rsp = json.loads(
             http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
         self.simhash_filter_key = self.settings['WEIBO_SIMHASH_FILTER_KEY']
+        # Fetch the cookies Weibo needs from redis
+        cookie_string = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
+                                    password=self.settings['REDIS_PWD']).get("MediaSpiders:Weibo_Cookies").decode()
+        self.cookie_dict = form_cookie_dict(cookie_string)
+
         all_user_info = []
         if account_rsp['code'] == 200:
             all_user_info = account_rsp['content']
@@ -67,7 +74,10 @@ class WeiboSpider(scrapy.Spider):
             if uid[:6] != '107603':
                 uid = f'107603{uid}'
             yield scrapy.Request('https://m.weibo.cn/api/container/getIndex?containerid=%s' % uid,
-                                 callback=self.parse, meta={'currentCount': 0, 'uid': uid})
+                                 callback=self.parse,
+                                 meta={'currentCount': 0, 'uid': uid},
+                                 cookies=self.cookie_dict
+                                 )
 
     def parse(self, response):
         rsp = json.loads(response.text)
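
Note: the changes in these patches read two Redis keys that have to be populated out of band:
the 'WebsiteSpider:translate_sites' list that parse_item_from_response() checks es_srcname
against, and the 'MediaSpiders:Weibo_Cookies' string that WeiboUserSpider builds its request
cookies from. Below is a minimal seeding sketch, not part of the patches themselves; it assumes
the redis-py client and the REDIS_HOST / REDIS_PORT / REDIS_PWD values from settings.py, and
the Weibo cookie value is only a placeholder.

    import redis

    # Connection values mirror REDIS_HOST / REDIS_PORT / REDIS_PWD in settings.py.
    client = redis.Redis(host='107.182.191.3', port=7379, password='jlkj-841-2-redis')

    # Sites whose items should be translated; parse_item_from_response() reads this list
    # with lrange('WebsiteSpider:translate_sites', 0, -1) and compares it to es_srcname.
    client.rpush(
        'WebsiteSpider:translate_sites',
        'https://energynow.com/category/press_releases/',
        'https://www.fao.org/newsroom/en',
    )

    # Cookie string consumed by WeiboUserSpider via get("MediaSpiders:Weibo_Cookies").
    # Placeholder value; supply a real logged-in Weibo cookie string here.
    client.set('MediaSpiders:Weibo_Cookies', 'SUB=...; SUBP=...; _T_WM=...')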