From e4f28c6a89d0d040e28ec0c5f68b49ff9316c747 Mon Sep 17 00:00:00 2001 From: DELL Date: Thu, 12 Feb 2026 09:47:28 +0800 Subject: [PATCH] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E7=BD=91=E5=90=A7=E9=97=B4?= =?UTF-8?q?=E7=94=B5=E8=84=91=E6=9B=B4=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spiders/MediaSpiders/run.py | 3 ++- spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spiders/MediaSpiders/run.py b/spiders/MediaSpiders/run.py index 8f74cad..ca0d174 100644 --- a/spiders/MediaSpiders/run.py +++ b/spiders/MediaSpiders/run.py @@ -19,4 +19,5 @@ dirpath = os.path.dirname(os.path.abspath(__file__)) sys.path.append(dirpath) # 等效于:scrapy crawl FacebookUserSpider -a params="{}" -execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}']) +# execute(['scrapy', 'crawl', 'LinkedinUserSpider', '-a', 'params={}']) +execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}']) \ No newline at end of file diff --git a/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py b/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py index 4cf52ed..cac3e5a 100644 --- a/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py +++ b/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py @@ -174,7 +174,8 @@ def parse_item_from_response(response, parse_rule, redis_client): else: logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}") # 翻译内容(按段落,容错) - webpage_item['es_content'] = translate_content_with_paragraphs(webpage_item['es_urlcontent']) + no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False) + webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content) logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}") except Exception as e: logger.error(repr(e))