diff --git a/spiders/MediaSpiders/MediaSpiders/items.py b/spiders/MediaSpiders/MediaSpiders/items.py
index 2c8c453..0a76f4a 100644
--- a/spiders/MediaSpiders/MediaSpiders/items.py
+++ b/spiders/MediaSpiders/MediaSpiders/items.py
@@ -183,13 +183,16 @@ class TwitterUserInfoItem(scrapy.Item):
     username = scrapy.Field()  # VARCHAR(50) - 用户名(@后部分)
     nickname = scrapy.Field()  # VARCHAR(100) - 显示名称
     user_url = scrapy.Field()  # VARCHAR(255) - 主页URL
+    user_link = scrapy.Field()  # VARCHAR(255) - 用户链接
     avatar_url = scrapy.Field()  # VARCHAR(500) - 头像原始URL
     avatar_path = scrapy.Field()  # VARCHAR(255) - 本地头像路径
-
+    backgroud_image_url = scrapy.Field()  # VARCHAR(255) - 背景图原始URL  NOTE(review): name typo ('backgroud') — must match DB column exactly, confirm before renaming
+    background_image_path = scrapy.Field()  # VARCHAR(255) - 背景图路径
     intro = scrapy.Field()  # TEXT - 简介
     city = scrapy.Field()  # VARCHAR(100) - 城市
     join_date = scrapy.Field()  # DATETIME - 加入时间
-
+    signature = scrapy.Field()  # VARCHAR(255) - 用户签名
+    tags = scrapy.Field()  # VARCHAR(255) - 标签:官方代表/媒体实体/名人
     post_count = scrapy.Field()  # INT UNSIGNED - 推文数
     is_verified = scrapy.Field()  # VARCHAR(10) - 是否认证 ("True"/"False")
     follow_count = scrapy.Field()  # INT UNSIGNED - 关注人数
diff --git a/spiders/MediaSpiders/MediaSpiders/pipelines.py b/spiders/MediaSpiders/MediaSpiders/pipelines.py
index 259e33e..8c1565e 100644
--- a/spiders/MediaSpiders/MediaSpiders/pipelines.py
+++ b/spiders/MediaSpiders/MediaSpiders/pipelines.py
@@ -273,6 +273,7 @@ class TwitterUserDataSaveToMySQL(object):
     def __init__(self):
         self.db = None
         self.cursor = None
+        self.update_fields = []  # fields that differ between DB row and current item; reset per item in _needs_update

     def open_spider(self, spider):
         self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A',
@@ -291,17 +292,7 @@ class TwitterUserDataSaveToMySQL(object):
             return item

         self.table_name = "twitter_user_info"
-        value = item.get('avatar_path')
-
-        # 处理 avatar_path
-        if isinstance(value, list) and len(value) > 0:
-            value = value[0].get('path', '') if isinstance(value[0], dict) else str(value[0])
-        elif isinstance(value, dict):
-            value = value.get('path', '')
-        else:
-            value = str(value) if value else ''
-
-        item['avatar_path'] = value
+        self.extract_avatar_and_background_paths(item)
         try:
             user_id = item.get('user_id')
             if not user_id:
@@ -347,7 +338,8 @@ class TwitterUserDataSaveToMySQL(object):
     def _needs_update(self, db_record, item):
         """比较数据库记录与 item 是否有差异"""
+        self.update_fields = []  # BUGFIX: reset per item; without this the list accumulates across items and _update_item writes stale fields
         for field in item.fields:
-            if field in ['id', 'created_at', 'updated_at', 'image_urls']:
+            if field in ['id', 'created_at', 'updated_at', 'image_urls', 'crawl_time', 'join_date']:
                 continue

             item_val = item.get(field)
@@ -360,8 +352,9 @@ class TwitterUserDataSaveToMySQL(object):
                 db_val = None

             if item_val != db_val:
-                return True
-        return False
+                self.update_fields.append(field)
+
+        return len(self.update_fields) > 0

     def _update_item(self, record_uuid, item):
@@ -369,8 +362,8 @@ class TwitterUserDataSaveToMySQL(object):
         update_fields = []
         update_vals = []

-        for field in item.fields:
-            if field in ['id', 'created_at', 'updated_at', 'image_urls']:
+        for field in self.update_fields:
+            if field in ['id', 'created_at', 'image_urls']:
                 continue

             value = item.get(field)
@@ -439,3 +432,16 @@ class TwitterUserDataSaveToMySQL(object):
             logging.error(f"数据库操作发生未知错误:{e}")
             raise

+    def extract_avatar_and_background_paths(self, item):
+        value = item.get('avatar_path', [])
+        if not isinstance(value, list):
+            value = []
+
+        def get_path(val):
+            return val.get('path', '') if isinstance(val, dict) else str(val)
+
+        avatar = get_path(value[0]) if len(value) > 0 else None
+        background = get_path(value[1]) if len(value) > 1 else None
+
+        item['avatar_path'] = avatar
+        item['background_image_path'] = background
diff --git a/spiders/MediaSpiders/MediaSpiders/settings.py b/spiders/MediaSpiders/MediaSpiders/settings.py
index 1692ee0..e00a062 100644
--- a/spiders/MediaSpiders/MediaSpiders/settings.py
+++ b/spiders/MediaSpiders/MediaSpiders/settings.py
@@ -206,6 +206,14 @@ EXTENSIONS = {
 EXTENSIONS = {
     'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
 }
+############################## 翻译
+MAX_TEXT_LENGTH = 100
+# 翻译 API 地址(替换为你的服务器 IP 或域名)
+TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
+# 单次请求间隔(秒),避免 API 被限流
+REQUEST_DELAY = 1
+
+
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
 # EXTENSIONS = {
diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserInfoSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserInfoSpider.py
index 7f16725..168bcfd 100644
--- a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserInfoSpider.py
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserInfoSpider.py
@@ -48,6 +48,16 @@ class TwitterSpider(scrapy.Spider):
         super(TwitterSpider, self).__init__(*args, **kwargs)
         self.total_num = 100
         self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
+        self.tags = {
+            "620632841": "媒体实体",  # 纽约时报中文网
+            "1714100357582770176": "媒体实体",  # 昨天
+            "218434058": "官方代表",  # 高市早苗
+            "121669059": "媒体实体",  # yonhapnews
+            "8149482": "媒体实体",  # 美国之音中文网
+            "46574977": "媒体实体",  # 华尔街日报中文网
+            "1260553941714186241": "名人",  # 李老师不是你老师
+            "106379129": "官方代表",  # 이재명
+        }
         if params:
             json_params = json.loads(params)
             if 'totalNum' in json_params:
@@ -122,18 +132,12 @@ class TwitterSpider(scrapy.Spider):
                                   'uid': user_info['userUid'],
                                   'uname': user_info['userName'],
                                   'proxy': 'http://127.0.0.1:10809',
-                                  'currentCount': 0
                               },
                               cookies=self.cookie_dict, headers=self.header)

    def parse(self, response):
        uid = response.request.meta['uid']
        uname = response.request.meta['uname']
-        current_count = response.request.meta['currentCount']
-        if current_count > 0:
-            self.logger.info("翻页采集:第%s页" % int(current_count / 20 + 1))
-        else:
-            self.logger.info("首页采集")
         try:
             rsp = json.loads(response.text)
             entries = []
@@ -142,14 +146,19 @@ class TwitterSpider(scrapy.Spider):
             item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             item['is_newest'] = 1
             item['platform_type'] = "Twitter"
-            item['user_id'] = instructions['rest_id']
+            item['user_id'] = int(instructions['rest_id'])
             item['nickname'] = instructions['core']['name']
             item['username'] = instructions['core']['screen_name']
             item['user_url'] = f'https://x.com/{uname}'
+            item['user_link'] = f'https://x.com/{uname}'
             item['avatar_url'] = instructions['avatar']['image_url']
-            item['image_urls'] = [instructions['avatar']['image_url']]
             item['intro'] = instructions['legacy']['description']
-            item['city'] = instructions.get('legacy', {}).get('location', {}).get('location', '').strip()
+            item['city'] = instructions.get('location', {}).get('location', '').strip()
+            item['backgroud_image_url'] = instructions.get('legacy', {}).get('profile_banner_url', '')
+            item['image_urls'] = [u for u in [
+                instructions['avatar']['image_url'],
+                instructions.get('legacy', {}).get('profile_banner_url', '').strip()
+            ] if u]  # BUGFIX: drop empty banner URL so ImagesPipeline doesn't request ''
             try:
                 # 转换为 datetime 对象
                 ts = get_time_stamp(
@@ -159,10 +168,14 @@ class TwitterSpider(scrapy.Spider):
                     instructions['legacy']['created_at'])
             except (ValueError, KeyError) as e:
                 item['join_date'] = None  # 或记录日志
                 logger.error('时间转换失败:' + e)
+            item['signature'] = instructions.get('legacy', {}).get('description', '').strip() or instructions.get('profile_bio', {}).get(
+                'description', '').strip()
             item['post_count'] = instructions['legacy']['statuses_count']
             item['follow_count'] = instructions['legacy']['friends_count']
             item['fans_count'] = instructions['legacy']['followers_count']
             item['is_verified'] = str(instructions['is_blue_verified'])
+            item['tags'] = self.tags.get(uid, '')  # BUGFIX: .get avoids KeyError (silently swallowed by bare except) for uids missing from the map
+            verified_type = instructions.get('verification', {}).get('verified_type', None)  # 认证类型
             yield item
         except:
diff --git a/spiders/MediaSpiders/run.py b/spiders/MediaSpiders/run.py
index 217d3b9..8f74cad 100644
--- a/spiders/MediaSpiders/run.py
+++ b/spiders/MediaSpiders/run.py
@@ -19,4 +19,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(dirpath)

 # 等效于:scrapy crawl FacebookUserSpider -a params="{}"
-execute(['scrapy', 'crawl', 'TwitterUserInfoSpider', '-a', 'params={}'])
+execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])