Compare commits
2 Commits
8c84df0fdc
...
4d3cb2381a
| Author | SHA1 | Date | |
|---|---|---|---|
| | 4d3cb2381a | | |
| | 073f4325d0 | | |
@@ -185,13 +185,13 @@ class TwitterSpider(scrapy.Spider):
|
||||
item['es_urltime'] = get_time_stamp(
|
||||
str(created_at)) + 8 * 3600 * 1000 # TW默认使用的是零时区,转换为北京时间
|
||||
if 'quoted_status_result' in result:
|
||||
item['es_isrepost'] = 'yes'
|
||||
item['es_isrepost'] = 1
|
||||
item['es_urltitle'] = author_full_text
|
||||
item['es_catalog1'] = author_full_text
|
||||
legacy = result['quoted_status_result']['result']['legacy']
|
||||
self.logger.info('采集引用推文原文信息')
|
||||
elif 'retweeted_status_result' in legacy:
|
||||
item['es_isrepost'] = 'yes'
|
||||
item['es_isrepost'] = 1
|
||||
legacy = legacy['retweeted_status_result']['result']['legacy']
|
||||
self.logger.info('采集转发推文原文信息')
|
||||
item['es_content'] = legacy['full_text']
|
||||
|
||||
@@ -167,15 +167,26 @@ def parse_item_from_response(response, parse_rule, redis_client):
|
||||
if url_bytes and url_bytes.decode('utf-8').strip()
|
||||
]
|
||||
if webpage_item['es_srcname'] in translate_list:
|
||||
# 标题内容 替换
|
||||
original_title = webpage_item['es_urltitle']
|
||||
original_content = webpage_item['es_urlcontent']
|
||||
|
||||
# 翻译标题
|
||||
webpage_item['es_abstract'] = translate_single(webpage_item['es_urltitle'])
|
||||
if webpage_item['es_abstract'] is None:
|
||||
ranslated_title = translate_single(original_title)
|
||||
if ranslated_title is None:
|
||||
logger.warning(" → 标题翻译失败,跳过整条")
|
||||
else:
|
||||
logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}")
|
||||
logger.info(f"翻译成功,标题译文长度:{len(ranslated_title)}")
|
||||
# 翻译内容(按段落,容错)
|
||||
webpage_item['es_content'] = translate_content_with_paragraphs(webpage_item['es_urlcontent'])
|
||||
logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}")
|
||||
translated_content = translate_content_with_paragraphs(original_content)
|
||||
logger.info(f"翻译成功,内容译文长度:{len(translated_content)}")
|
||||
|
||||
# 当所有内容执行完成,则执行内容替换
|
||||
webpage_item['es_abstract'] = original_title # 原标题
|
||||
webpage_item['es_content'] = original_content # 原文
|
||||
webpage_item['es_urltitle'] = ranslated_title # 翻译标题
|
||||
webpage_item['es_urlcontent'] = translated_content # 译文
|
||||
|
||||
except Exception as e:
|
||||
logger.error(repr(e))
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user