diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py index 4b1102f..d11ffcc 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py @@ -176,12 +176,25 @@ class TwitterSpider(scrapy.Spider): item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id']) item['es_authors'] = screen_name item['es_extname'] = user_name + + device_html = result['source'] + device_type = re.search(r'>([^<]+)', device_html).group(1) + legacy = result['legacy'] author_full_text = legacy['full_text'] created_at = legacy['created_at'] + # 评论、转发、点赞数量 item['es_commentcount'] = legacy['reply_count'] item['es_forwardcount'] = legacy['retweet_count'] item['es_likecount'] = legacy['favorite_count'] + # 评论+ 转发+ 点赞数量 TODO + interaction_count = legacy['reply_count'] + legacy['retweet_count'] + legacy['favorite_count'] + # 语种 + lang = legacy['lang'] + # 推文话题 、 提及 + topic = legacy['entities']['hashtags'] + mentions = legacy['entities']['user_mentions'] + item['es_lasttime'] = get_current_timestamp() item['es_loadtime'] = get_current_timestamp() item['es_urltime'] = get_time_stamp( @@ -196,13 +209,16 @@ class TwitterSpider(scrapy.Spider): else: item['es_catalog2'] = '' legacy = result['quoted_status_result']['result']['legacy'] + original_tweet = result['quoted_status_result']['result']['rest_id'] self.logger.info('采集引用推文原文信息') elif 'retweeted_status_result' in legacy: item['es_isrepost'] = '1' legacy = legacy['retweeted_status_result']['result']['legacy'] + original_tweet = result['retweeted_status_result']['result']['rest_id'] self.logger.info('采集转发推文原文信息') else: item['es_isrepost'] = '0' + original_tweet = '' self.logger.info('采集原文信息') item['es_urlcontent'] = legacy['full_text']