diff --git a/spiders/MediaSpiders/run.py b/spiders/MediaSpiders/run.py index 8f74cad..ca0d174 100644 --- a/spiders/MediaSpiders/run.py +++ b/spiders/MediaSpiders/run.py @@ -19,4 +19,5 @@ dirpath = os.path.dirname(os.path.abspath(__file__)) sys.path.append(dirpath) # 等效于:scrapy crawl FacebookUserSpider -a params="{}" -execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}']) +# execute(['scrapy', 'crawl', 'LinkedinUserSpider', '-a', 'params={}']) +execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}']) \ No newline at end of file diff --git a/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py b/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py index 4cf52ed..cac3e5a 100644 --- a/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py +++ b/spiders/WebsiteSpider/WebsiteSpider/utils/parser_utils.py @@ -174,7 +174,8 @@ def parse_item_from_response(response, parse_rule, redis_client): else: logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}") # 翻译内容(按段落,容错) - webpage_item['es_content'] = translate_content_with_paragraphs(webpage_item['es_urlcontent']) + no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False) + webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content) logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}") except Exception as e: logger.error(repr(e))