Compare commits
3 Commits
cfe6c3af85
...
e4f28c6a89
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e4f28c6a89 | ||
|
|
953fdc81dc | ||
|
|
3fdd2f5473 |
@ -171,12 +171,25 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id'])
|
item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id'])
|
||||||
item['es_authors'] = screen_name
|
item['es_authors'] = screen_name
|
||||||
item['es_extname'] = user_name
|
item['es_extname'] = user_name
|
||||||
|
|
||||||
|
device_html = result['source']
|
||||||
|
device_type = re.search(r'>([^<]+)</a>', device_html).group(1)
|
||||||
|
|
||||||
legacy = result['legacy']
|
legacy = result['legacy']
|
||||||
author_full_text = legacy['full_text']
|
author_full_text = legacy['full_text']
|
||||||
created_at = legacy['created_at']
|
created_at = legacy['created_at']
|
||||||
|
# 评论、转发、点赞数量
|
||||||
item['es_commentcount'] = legacy['reply_count']
|
item['es_commentcount'] = legacy['reply_count']
|
||||||
item['es_forwardcount'] = legacy['retweet_count']
|
item['es_forwardcount'] = legacy['retweet_count']
|
||||||
item['es_likecount'] = legacy['favorite_count']
|
item['es_likecount'] = legacy['favorite_count']
|
||||||
|
# 评论+ 转发+ 点赞数量 TODO
|
||||||
|
interaction_count = legacy['reply_count'] + legacy['retweet_count'] + legacy['favorite_count']
|
||||||
|
# 语种
|
||||||
|
lang = legacy['lang']
|
||||||
|
# 推文话题 、 提及
|
||||||
|
topic = legacy['entities']['hashtags']
|
||||||
|
mentions = legacy['entities']['user_mentions']
|
||||||
|
|
||||||
item['es_lasttime'] = get_current_timestamp()
|
item['es_lasttime'] = get_current_timestamp()
|
||||||
item['es_loadtime'] = get_current_timestamp()
|
item['es_loadtime'] = get_current_timestamp()
|
||||||
item['es_urltime'] = get_time_stamp(
|
item['es_urltime'] = get_time_stamp(
|
||||||
@ -191,13 +204,16 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
else:
|
else:
|
||||||
item['es_catalog2'] = ''
|
item['es_catalog2'] = ''
|
||||||
legacy = result['quoted_status_result']['result']['legacy']
|
legacy = result['quoted_status_result']['result']['legacy']
|
||||||
|
original_tweet = result['quoted_status_result']['result']['rest_id']
|
||||||
self.logger.info('采集引用推文原文信息')
|
self.logger.info('采集引用推文原文信息')
|
||||||
elif 'retweeted_status_result' in legacy:
|
elif 'retweeted_status_result' in legacy:
|
||||||
item['es_isrepost'] = '1'
|
item['es_isrepost'] = '1'
|
||||||
legacy = legacy['retweeted_status_result']['result']['legacy']
|
legacy = legacy['retweeted_status_result']['result']['legacy']
|
||||||
|
original_tweet = result['retweeted_status_result']['result']['rest_id']
|
||||||
self.logger.info('采集转发推文原文信息')
|
self.logger.info('采集转发推文原文信息')
|
||||||
else:
|
else:
|
||||||
item['es_isrepost'] = '0'
|
item['es_isrepost'] = '0'
|
||||||
|
original_tweet = ''
|
||||||
self.logger.info('采集原文信息')
|
self.logger.info('采集原文信息')
|
||||||
|
|
||||||
item['es_urlcontent'] = legacy['full_text']
|
item['es_urlcontent'] = legacy['full_text']
|
||||||
|
|||||||
@ -19,4 +19,5 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
|
|||||||
|
|
||||||
sys.path.append(dirpath)
|
sys.path.append(dirpath)
|
||||||
# 等效于:scrapy crawl FacebookUserSpider -a params="{}"
|
# 等效于:scrapy crawl FacebookUserSpider -a params="{}"
|
||||||
execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])
|
# execute(['scrapy', 'crawl', 'LinkedinUserSpider', '-a', 'params={}'])
|
||||||
|
execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])
|
||||||
@ -174,7 +174,8 @@ def parse_item_from_response(response, parse_rule, redis_client):
|
|||||||
else:
|
else:
|
||||||
logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}")
|
logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}")
|
||||||
# 翻译内容(按段落,容错)
|
# 翻译内容(按段落,容错)
|
||||||
webpage_item['es_content'] = translate_content_with_paragraphs(webpage_item['es_urlcontent'])
|
no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False)
|
||||||
|
webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content)
|
||||||
logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}")
|
logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(repr(e))
|
logger.error(repr(e))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user