[twitter]新增推特用户信息采集字段
This commit is contained in:
parent
b827e33dbd
commit
d5be45ec95
@ -183,13 +183,16 @@ class TwitterUserInfoItem(scrapy.Item):
|
||||
username = scrapy.Field() # VARCHAR(50) - 用户名(@后部分)
|
||||
nickname = scrapy.Field() # VARCHAR(100) - 显示名称
|
||||
user_url = scrapy.Field() # VARCHAR(255) - 主页URL
|
||||
user_link = scrapy.Field() # VARCHAR(255) - 用户链接
|
||||
avatar_url = scrapy.Field() # VARCHAR(500) - 头像原始URL
|
||||
avatar_path = scrapy.Field() # VARCHAR(255) - 本地头像路径
|
||||
|
||||
backgroud_image_url = scrapy.Field() # VARCHAR(255) - 背景图原始URL
|
||||
background_image_path = scrapy.Field() # VARCHAR(255) - 背景图路径
|
||||
intro = scrapy.Field() # TEXT - 简介
|
||||
city = scrapy.Field() # VARCHAR(100) - 城市
|
||||
join_date = scrapy.Field() # DATETIME - 加入时间
|
||||
|
||||
signature = scrapy.Field() # VARCHAR(255) - 用户签名
|
||||
tags = scrapy.Field() # VARCHAR(255) - 标签:官方代表/媒体实体/名人
|
||||
post_count = scrapy.Field() # INT UNSIGNED - 推文数
|
||||
is_verified = scrapy.Field() # VARCHAR(10) - 是否认证 ("True"/"False")
|
||||
follow_count = scrapy.Field() # INT UNSIGNED - 关注人数
|
||||
|
||||
@ -273,6 +273,7 @@ class TwitterUserDataSaveToMySQL(object):
|
||||
def __init__(self):
|
||||
self.db = None
|
||||
self.cursor = None
|
||||
self.update_fileds = []
|
||||
|
||||
def open_spider(self, spider):
|
||||
self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A',
|
||||
@ -291,17 +292,7 @@ class TwitterUserDataSaveToMySQL(object):
|
||||
return item
|
||||
self.table_name = "twitter_user_info"
|
||||
|
||||
value = item.get('avatar_path')
|
||||
|
||||
# 处理 avatar_path
|
||||
if isinstance(value, list) and len(value) > 0:
|
||||
value = value[0].get('path', '') if isinstance(value[0], dict) else str(value[0])
|
||||
elif isinstance(value, dict):
|
||||
value = value.get('path', '')
|
||||
else:
|
||||
value = str(value) if value else ''
|
||||
|
||||
item['avatar_path'] = value
|
||||
self.extract_avatar_and_background_paths(item)
|
||||
try:
|
||||
user_id = item.get('user_id')
|
||||
if not user_id:
|
||||
@ -347,7 +338,7 @@ class TwitterUserDataSaveToMySQL(object):
|
||||
def _needs_update(self, db_record, item):
|
||||
"""比较数据库记录与 item 是否有差异"""
|
||||
for field in item.fields:
|
||||
if field in ['id', 'created_at', 'updated_at', 'image_urls']:
|
||||
if field in ['id', 'created_at', 'updated_at', 'image_urls', 'crawl_time', 'join_date']:
|
||||
continue
|
||||
|
||||
item_val = item.get(field)
|
||||
@ -360,8 +351,9 @@ class TwitterUserDataSaveToMySQL(object):
|
||||
db_val = None
|
||||
|
||||
if item_val != db_val:
|
||||
return True
|
||||
return False
|
||||
self.update_fileds.append(field)
|
||||
|
||||
return len(self.update_fileds)>0
|
||||
|
||||
|
||||
def _update_item(self, record_uuid, item):
|
||||
@ -369,8 +361,8 @@ class TwitterUserDataSaveToMySQL(object):
|
||||
update_fields = []
|
||||
update_vals = []
|
||||
|
||||
for field in item.fields:
|
||||
if field in ['id', 'created_at', 'updated_at', 'image_urls']:
|
||||
for field in self.update_fileds:
|
||||
if field in ['id', 'created_at', 'image_urls']:
|
||||
continue
|
||||
value = item.get(field)
|
||||
|
||||
@ -439,3 +431,16 @@ class TwitterUserDataSaveToMySQL(object):
|
||||
logging.error(f"数据库操作发生未知错误:{e}")
|
||||
raise
|
||||
|
||||
def extract_avatar_and_background_paths(self, item):
    """Flatten ``item['avatar_path']`` into avatar and background path strings.

    The value is read from ``item['avatar_path']`` and rewritten in place:

    * list/tuple: the first entry becomes ``avatar_path`` and the second
      becomes ``background_image_path``; a missing entry yields ``None``.
    * dict or other truthy scalar: treated as a single avatar entry. Any
      ``background_image_path`` already on the item is kept, so calling
      this method twice on the same item does not wipe earlier results.
    * missing / empty: both fields are set to ``None``.

    Each entry is either a dict carrying a ``'path'`` key or a value that is
    converted with ``str()``.

    NOTE(review): presumably the list is the image-download result in the
    same order as ``image_urls`` (avatar first, banner second). If a failed
    download is omitted from that list the positions shift — confirm
    against the images pipeline configuration.

    Args:
        item: mapping-like item exposing ``get`` and item assignment.

    Returns:
        None. ``item`` is modified in place.
    """

    def get_path(entry):
        # Download results are dicts with a 'path' key; anything else is
        # taken to be the path itself.
        return entry.get('path', '') if isinstance(entry, dict) else str(entry)

    raw = item.get('avatar_path')

    if isinstance(raw, (list, tuple)):
        avatar = get_path(raw[0]) if len(raw) > 0 else None
        background = get_path(raw[1]) if len(raw) > 1 else None
    elif raw:
        # A single dict or an already-flattened string: keep it instead of
        # discarding it, and do not clobber a background path set earlier.
        avatar = get_path(raw)
        background = item.get('background_image_path')
    else:
        avatar = None
        background = None

    item['avatar_path'] = avatar
    item['background_image_path'] = background
|
||||
|
||||
@ -206,6 +206,14 @@ EXTENSIONS = {
|
||||
'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
|
||||
}
|
||||
|
||||
############################## 翻译
|
||||
MAX_TEXT_LENGTH = 100
|
||||
# 翻译 API 地址(替换为你的服务器 IP 或域名)
|
||||
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
|
||||
# 单次请求间隔(秒),避免 API 被限流
|
||||
REQUEST_DELAY = 1
|
||||
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
# EXTENSIONS = {
|
||||
|
||||
@ -48,6 +48,16 @@ class TwitterSpider(scrapy.Spider):
|
||||
super(TwitterSpider, self).__init__(*args, **kwargs)
|
||||
self.total_num = 100
|
||||
self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
||||
self.tags = {
|
||||
"620632841": "媒体实体", # 纽约时报中文网
|
||||
"1714100357582770176": "媒体实体", # 昨天
|
||||
"218434058": "官方代表", # 高市早苗
|
||||
"121669059": "媒体实体", # yonhapnews
|
||||
"8149482": "媒体实体", # 美国之音中文网
|
||||
"46574977": "媒体实体", # 华尔街日报中文网
|
||||
"1260553941714186241": "名人", # 李老师不是你老师
|
||||
"106379129": "官方代表", # 이재명
|
||||
}
|
||||
if params:
|
||||
json_params = json.loads(params)
|
||||
if 'totalNum' in json_params:
|
||||
@ -122,18 +132,12 @@ class TwitterSpider(scrapy.Spider):
|
||||
'uid': user_info['userUid'],
|
||||
'uname': user_info['userName'],
|
||||
'proxy': 'http://127.0.0.1:10809',
|
||||
'currentCount': 0
|
||||
},
|
||||
cookies=self.cookie_dict, headers=self.header)
|
||||
|
||||
def parse(self, response):
|
||||
uid = response.request.meta['uid']
|
||||
uname = response.request.meta['uname']
|
||||
current_count = response.request.meta['currentCount']
|
||||
if current_count > 0:
|
||||
self.logger.info("翻页采集:第%s页" % int(current_count / 20 + 1))
|
||||
else:
|
||||
self.logger.info("首页采集")
|
||||
try:
|
||||
rsp = json.loads(response.text)
|
||||
entries = []
|
||||
@ -142,14 +146,19 @@ class TwitterSpider(scrapy.Spider):
|
||||
item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
|
||||
item['is_newest'] = 1
|
||||
item['platform_type'] = "Twitter"
|
||||
item['user_id'] = instructions['rest_id']
|
||||
item['user_id'] = int(instructions['rest_id'])
|
||||
item['nickname'] = instructions['core']['name']
|
||||
item['username'] = instructions['core']['screen_name']
|
||||
item['user_url'] = f'https://x.com/{uname}'
|
||||
item['user_link'] = f'https://x.com/{uname}'
|
||||
item['avatar_url'] = instructions['avatar']['image_url']
|
||||
item['image_urls'] = [instructions['avatar']['image_url']]
|
||||
item['intro'] = instructions['legacy']['description']
|
||||
item['city'] = instructions.get('legacy', {}).get('location', {}).get('location', '').strip()
|
||||
item['city'] = instructions.get('location', {}).get('location', '').strip()
|
||||
item['backgroud_image_url'] = instructions.get('legacy', {}).get('profile_banner_url', '')
|
||||
item['image_urls'] = [
|
||||
instructions['avatar']['image_url'],
|
||||
instructions.get('legacy', {}).get('profile_banner_url', '').strip()
|
||||
]
|
||||
try:
|
||||
# 转换为 datetime 对象
|
||||
ts = get_time_stamp(
|
||||
@ -159,10 +168,14 @@ class TwitterSpider(scrapy.Spider):
|
||||
except (ValueError, KeyError) as e:
|
||||
item['join_date'] = None # 或记录日志
|
||||
logger.error('时间转换失败:' + e)
|
||||
item['signature'] = instructions.get('legacy', {}).get('description', '').strip() or instructions.get('profile_bio', {}).get(
|
||||
'description', '').strip()
|
||||
item['post_count'] = instructions['legacy']['statuses_count']
|
||||
item['follow_count'] = instructions['legacy']['friends_count']
|
||||
item['fans_count'] = instructions['legacy']['followers_count']
|
||||
item['is_verified'] = str(instructions['is_blue_verified'])
|
||||
item['tags'] = self.tags[uid]
|
||||
|
||||
verified_type = instructions.get('verification', {}).get('verified_type', None) # 认证类型
|
||||
yield item
|
||||
except:
|
||||
|
||||
@ -19,4 +19,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
sys.path.append(dirpath)
|
||||
# 等效于:scrapy crawl TwitterUserSpider -a params="{}"
|
||||
execute(['scrapy', 'crawl', 'TwitterUserInfoSpider', '-a', 'params={}'])
|
||||
execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user