Compare commits

..

No commits in common. "e4f28c6a89d0d040e28ec0c5f68b49ff9316c747" and "cfe6c3af8541f0e653b36b509fcbf992de9f8afb" have entirely different histories.

3 changed files with 2 additions and 20 deletions

View File

@@ -171,25 +171,12 @@ class TwitterSpider(scrapy.Spider):
item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id']) item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id'])
item['es_authors'] = screen_name item['es_authors'] = screen_name
item['es_extname'] = user_name item['es_extname'] = user_name
device_html = result['source']
device_type = re.search(r'>([^<]+)</a>', device_html).group(1)
legacy = result['legacy'] legacy = result['legacy']
author_full_text = legacy['full_text'] author_full_text = legacy['full_text']
created_at = legacy['created_at'] created_at = legacy['created_at']
# 评论、转发、点赞数量
item['es_commentcount'] = legacy['reply_count'] item['es_commentcount'] = legacy['reply_count']
item['es_forwardcount'] = legacy['retweet_count'] item['es_forwardcount'] = legacy['retweet_count']
item['es_likecount'] = legacy['favorite_count'] item['es_likecount'] = legacy['favorite_count']
# 评论+ 转发+ 点赞数量 TODO
interaction_count = legacy['reply_count'] + legacy['retweet_count'] + legacy['favorite_count']
# 语种
lang = legacy['lang']
# 推文话题 、 提及
topic = legacy['entities']['hashtags']
mentions = legacy['entities']['user_mentions']
item['es_lasttime'] = get_current_timestamp() item['es_lasttime'] = get_current_timestamp()
item['es_loadtime'] = get_current_timestamp() item['es_loadtime'] = get_current_timestamp()
item['es_urltime'] = get_time_stamp( item['es_urltime'] = get_time_stamp(
@@ -204,16 +191,13 @@ class TwitterSpider(scrapy.Spider):
else: else:
item['es_catalog2'] = '' item['es_catalog2'] = ''
legacy = result['quoted_status_result']['result']['legacy'] legacy = result['quoted_status_result']['result']['legacy']
original_tweet = result['quoted_status_result']['result']['rest_id']
self.logger.info('采集引用推文原文信息') self.logger.info('采集引用推文原文信息')
elif 'retweeted_status_result' in legacy: elif 'retweeted_status_result' in legacy:
item['es_isrepost'] = '1' item['es_isrepost'] = '1'
legacy = legacy['retweeted_status_result']['result']['legacy'] legacy = legacy['retweeted_status_result']['result']['legacy']
original_tweet = result['retweeted_status_result']['result']['rest_id']
self.logger.info('采集转发推文原文信息') self.logger.info('采集转发推文原文信息')
else: else:
item['es_isrepost'] = '0' item['es_isrepost'] = '0'
original_tweet = ''
self.logger.info('采集原文信息') self.logger.info('采集原文信息')
item['es_urlcontent'] = legacy['full_text'] item['es_urlcontent'] = legacy['full_text']

View File

@@ -19,5 +19,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(dirpath) sys.path.append(dirpath)
# 等效于scrapy crawl FacebookUserSpider -a params="{}" # 等效于scrapy crawl FacebookUserSpider -a params="{}"
# execute(['scrapy', 'crawl', 'LinkedinUserSpider', '-a', 'params={}'])
execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}']) execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])

View File

@@ -174,8 +174,7 @@ def parse_item_from_response(response, parse_rule, redis_client):
else: else:
logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}") logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}")
# 翻译内容(按段落,容错) # 翻译内容(按段落,容错)
no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False) webpage_item['es_content'] = translate_content_with_paragraphs(webpage_item['es_urlcontent'])
webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content)
logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}") logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}")
except Exception as e: except Exception as e:
logger.error(repr(e)) logger.error(repr(e))