Compare commits

..

2 Commits

Author SHA1 Message Date
DELL
d5be45ec95 [twitter]新增推特用户信息采集字段 2026-01-22 13:51:16 +08:00
DELL
b827e33dbd [twitter]新增推特推文翻译功能 2026-01-22 13:50:52 +08:00
7 changed files with 165 additions and 29 deletions

View File

@ -183,13 +183,16 @@ class TwitterUserInfoItem(scrapy.Item):
username = scrapy.Field() # VARCHAR(50) - 用户名(@后部分) username = scrapy.Field() # VARCHAR(50) - 用户名(@后部分)
nickname = scrapy.Field() # VARCHAR(100) - 显示名称 nickname = scrapy.Field() # VARCHAR(100) - 显示名称
user_url = scrapy.Field() # VARCHAR(255) - 主页URL user_url = scrapy.Field() # VARCHAR(255) - 主页URL
user_link = scrapy.Field() # VARCHAR(255) - 用户链接
avatar_url = scrapy.Field() # VARCHAR(500) - 头像原始URL avatar_url = scrapy.Field() # VARCHAR(500) - 头像原始URL
avatar_path = scrapy.Field() # VARCHAR(255) - 本地头像路径 avatar_path = scrapy.Field() # VARCHAR(255) - 本地头像路径
backgroud_image_url = scrapy.Field() # VARCHAR(255) - 背景图原始URL
background_image_path = scrapy.Field() # VARCHAR(255) - 背景图路径
intro = scrapy.Field() # TEXT - 简介 intro = scrapy.Field() # TEXT - 简介
city = scrapy.Field() # VARCHAR(100) - 城市 city = scrapy.Field() # VARCHAR(100) - 城市
join_date = scrapy.Field() # DATETIME - 加入时间 join_date = scrapy.Field() # DATETIME - 加入时间
signature = scrapy.Field() # VARCHAR(255) - 用户签名
tags = scrapy.Field() # VARCHAR(255) - 标签:官方代表/媒体实体/名人
post_count = scrapy.Field() # INT UNSIGNED - 推文数 post_count = scrapy.Field() # INT UNSIGNED - 推文数
is_verified = scrapy.Field() # VARCHAR(10) - 是否认证 ("True"/"False") is_verified = scrapy.Field() # VARCHAR(10) - 是否认证 ("True"/"False")
follow_count = scrapy.Field() # INT UNSIGNED - 关注人数 follow_count = scrapy.Field() # INT UNSIGNED - 关注人数

View File

@ -273,6 +273,7 @@ class TwitterUserDataSaveToMySQL(object):
def __init__(self): def __init__(self):
self.db = None self.db = None
self.cursor = None self.cursor = None
self.update_fileds = []
def open_spider(self, spider): def open_spider(self, spider):
self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A', self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A',
@ -291,17 +292,7 @@ class TwitterUserDataSaveToMySQL(object):
return item return item
self.table_name = "twitter_user_info" self.table_name = "twitter_user_info"
value = item.get('avatar_path') self.extract_avatar_and_background_paths(item)
# 处理 avatar_path
if isinstance(value, list) and len(value) > 0:
value = value[0].get('path', '') if isinstance(value[0], dict) else str(value[0])
elif isinstance(value, dict):
value = value.get('path', '')
else:
value = str(value) if value else ''
item['avatar_path'] = value
try: try:
user_id = item.get('user_id') user_id = item.get('user_id')
if not user_id: if not user_id:
@ -347,7 +338,7 @@ class TwitterUserDataSaveToMySQL(object):
def _needs_update(self, db_record, item): def _needs_update(self, db_record, item):
"""比较数据库记录与 item 是否有差异""" """比较数据库记录与 item 是否有差异"""
for field in item.fields: for field in item.fields:
if field in ['id', 'created_at', 'updated_at', 'image_urls']: if field in ['id', 'created_at', 'updated_at', 'image_urls', 'crawl_time', 'join_date']:
continue continue
item_val = item.get(field) item_val = item.get(field)
@ -360,8 +351,9 @@ class TwitterUserDataSaveToMySQL(object):
db_val = None db_val = None
if item_val != db_val: if item_val != db_val:
return True self.update_fileds.append(field)
return False
return len(self.update_fileds)>0
def _update_item(self, record_uuid, item): def _update_item(self, record_uuid, item):
@ -369,8 +361,8 @@ class TwitterUserDataSaveToMySQL(object):
update_fields = [] update_fields = []
update_vals = [] update_vals = []
for field in item.fields: for field in self.update_fileds:
if field in ['id', 'created_at', 'updated_at', 'image_urls']: if field in ['id', 'created_at', 'image_urls']:
continue continue
value = item.get(field) value = item.get(field)
@ -439,3 +431,16 @@ class TwitterUserDataSaveToMySQL(object):
logging.error(f"数据库操作发生未知错误:{e}") logging.error(f"数据库操作发生未知错误:{e}")
raise raise
def extract_avatar_and_background_paths(self, item):
    """Split the image-pipeline results on ``item`` into separate fields.

    The images pipeline stores its results (entries that are dicts with a
    ``path`` key, or plain path strings) in ``item['avatar_path']``; the
    first entry is the avatar, the second (if present) the profile
    background.  Writes the normalized paths back to ``avatar_path`` and
    ``background_image_path`` (``None`` when an entry is missing).
    """
    raw = item.get('avatar_path', [])
    # Bug fix: a bare dict/str value used to be discarded outright
    # (``value = []``), losing the stored path; wrap it in a list so
    # single-result payloads are still handled.
    if isinstance(raw, (dict, str)):
        raw = [raw]
    elif not isinstance(raw, list):
        raw = []

    def _path(entry):
        # Pipeline entries carry the saved file path under 'path'.
        return entry.get('path', '') if isinstance(entry, dict) else str(entry)

    item['avatar_path'] = _path(raw[0]) if len(raw) > 0 else None
    item['background_image_path'] = _path(raw[1]) if len(raw) > 1 else None

View File

@ -206,6 +206,14 @@ EXTENSIONS = {
'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501 'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
} }
############################## 翻译
MAX_TEXT_LENGTH = 100
# 翻译 API 地址(替换为你的服务器 IP 或域名)
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
# 单次请求间隔(秒),避免 API 被限流
REQUEST_DELAY = 1
# Enable or disable extensions # Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html # See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = { # EXTENSIONS = {

View File

@ -48,6 +48,16 @@ class TwitterSpider(scrapy.Spider):
super(TwitterSpider, self).__init__(*args, **kwargs) super(TwitterSpider, self).__init__(*args, **kwargs)
self.total_num = 100 self.total_num = 100
self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
self.tags = {
"620632841": "媒体实体", # 纽约时报中文网
"1714100357582770176": "媒体实体", # 昨天
"218434058": "官方代表", # 高市早苗
"121669059": "媒体实体", # yonhapnews
"8149482": "媒体实体", # 美国之音中文网
"46574977": "媒体实体", # 华尔街日报中文网
"1260553941714186241": "名人", # 李老师不是你老师
"106379129": "官方代表", # 이재명
}
if params: if params:
json_params = json.loads(params) json_params = json.loads(params)
if 'totalNum' in json_params: if 'totalNum' in json_params:
@ -122,18 +132,12 @@ class TwitterSpider(scrapy.Spider):
'uid': user_info['userUid'], 'uid': user_info['userUid'],
'uname': user_info['userName'], 'uname': user_info['userName'],
'proxy': 'http://127.0.0.1:10809', 'proxy': 'http://127.0.0.1:10809',
'currentCount': 0
}, },
cookies=self.cookie_dict, headers=self.header) cookies=self.cookie_dict, headers=self.header)
def parse(self, response): def parse(self, response):
uid = response.request.meta['uid'] uid = response.request.meta['uid']
uname = response.request.meta['uname'] uname = response.request.meta['uname']
current_count = response.request.meta['currentCount']
if current_count > 0:
self.logger.info("翻页采集:第%s" % int(current_count / 20 + 1))
else:
self.logger.info("首页采集")
try: try:
rsp = json.loads(response.text) rsp = json.loads(response.text)
entries = [] entries = []
@ -142,14 +146,19 @@ class TwitterSpider(scrapy.Spider):
item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
item['is_newest'] = 1 item['is_newest'] = 1
item['platform_type'] = "Twitter" item['platform_type'] = "Twitter"
item['user_id'] = instructions['rest_id'] item['user_id'] = int(instructions['rest_id'])
item['nickname'] = instructions['core']['name'] item['nickname'] = instructions['core']['name']
item['username'] = instructions['core']['screen_name'] item['username'] = instructions['core']['screen_name']
item['user_url'] = f'https://x.com/{uname}' item['user_url'] = f'https://x.com/{uname}'
item['user_link'] = f'https://x.com/{uname}'
item['avatar_url'] = instructions['avatar']['image_url'] item['avatar_url'] = instructions['avatar']['image_url']
item['image_urls'] = [instructions['avatar']['image_url']]
item['intro'] = instructions['legacy']['description'] item['intro'] = instructions['legacy']['description']
item['city'] = instructions.get('legacy', {}).get('location', {}).get('location', '').strip() item['city'] = instructions.get('location', {}).get('location', '').strip()
item['backgroud_image_url'] = instructions.get('legacy', {}).get('profile_banner_url', '')
item['image_urls'] = [
instructions['avatar']['image_url'],
instructions.get('legacy', {}).get('profile_banner_url', '').strip()
]
try: try:
# 转换为 datetime 对象 # 转换为 datetime 对象
ts = get_time_stamp( ts = get_time_stamp(
@ -159,10 +168,14 @@ class TwitterSpider(scrapy.Spider):
except (ValueError, KeyError) as e: except (ValueError, KeyError) as e:
item['join_date'] = None # 或记录日志 item['join_date'] = None # 或记录日志
logger.error('时间转换失败:' + e) logger.error('时间转换失败:' + e)
item['signature'] = instructions.get('legacy', {}).get('description', '').strip() or instructions.get('profile_bio', {}).get(
'description', '').strip()
item['post_count'] = instructions['legacy']['statuses_count'] item['post_count'] = instructions['legacy']['statuses_count']
item['follow_count'] = instructions['legacy']['friends_count'] item['follow_count'] = instructions['legacy']['friends_count']
item['fans_count'] = instructions['legacy']['followers_count'] item['fans_count'] = instructions['legacy']['followers_count']
item['is_verified'] = str(instructions['is_blue_verified']) item['is_verified'] = str(instructions['is_blue_verified'])
item['tags'] = self.tags[uid]
verified_type = instructions.get('verification', {}).get('verified_type', None) # 认证类型 verified_type = instructions.get('verification', {}).get('verified_type', None) # 认证类型
yield item yield item
except: except:

View File

@ -19,6 +19,8 @@ from MediaSpiders.utils.login_utils import login
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
def form_cookie_dict(cookie_string): def form_cookie_dict(cookie_string):
cookie_string_list = cookie_string.split(';') cookie_string_list = cookie_string.split(';')
@ -188,14 +190,25 @@ class TwitterSpider(scrapy.Spider):
item['es_isrepost'] = 1 item['es_isrepost'] = 1
item['es_urltitle'] = author_full_text item['es_urltitle'] = author_full_text
item['es_catalog1'] = author_full_text item['es_catalog1'] = author_full_text
# 判断是否需要翻译
if needs_translation(author_full_text):
item['es_content'] = translate_single(author_full_text) # TODO 翻译
else:
item['es_content'] = url_content
legacy = result['quoted_status_result']['result']['legacy'] legacy = result['quoted_status_result']['result']['legacy']
self.logger.info('采集引用推文原文信息') self.logger.info('采集引用推文原文信息')
elif 'retweeted_status_result' in legacy: elif 'retweeted_status_result' in legacy:
item['es_isrepost'] = 1 item['es_isrepost'] = 1
legacy = legacy['retweeted_status_result']['result']['legacy'] legacy = legacy['retweeted_status_result']['result']['legacy']
self.logger.info('采集转发推文原文信息') self.logger.info('采集转发推文原文信息')
item['es_content'] = legacy['full_text']
item['es_urlcontent'] = legacy['full_text'] item['es_urlcontent'] = legacy['full_text']
# 获取文本
url_content = legacy['full_text']
# 判断是否需要翻译
if needs_translation(url_content):
item['es_content'] = translate_content_with_paragraphs(url_content) # TODO 翻译
else:
item['es_content'] = url_content
# 下载图片 # 下载图片
image_url_list = [] image_url_list = []
if 'entities' in legacy and 'media' in legacy['entities']: if 'entities' in legacy and 'media' in legacy['entities']:

View File

@ -0,0 +1,94 @@
from MediaSpiders.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
import requests
import time
from typing import List, Tuple, Optional
from langdetect import detect, LangDetectException
def normalize_newlines(text: str) -> str:
    """Convert CRLF (``\\r\\n``) and bare CR (``\\r``) line endings to LF."""
    if not text:
        return text
    for ending in ('\r\n', '\r'):
        text = text.replace(ending, '\n')
    return text
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate a single text fragment via the HTTP translation API.

    Empty or whitespace-only input short-circuits to ``""``.  The text is
    capped at MAX_TEXT_LENGTH characters before being sent (API limit —
    TODO confirm the service's actual maximum).  Returns ``None`` when the
    request or response parsing fails.
    """
    if not text or not text.strip():
        return ""
    request_body = {
        "text": text[:MAX_TEXT_LENGTH],
        "source_lang": source_lang,
        "target_lang": target_lang,
    }
    try:
        response = requests.post(TRANSLATE_API_URL, json=request_body, timeout=10)
        response.raise_for_status()
        return response.json().get("translated_text")
    except Exception as e:
        print(f"⚠️ 翻译失败: {e}")
        return None
def translate_content_with_paragraphs(content: str) -> str:
    """Translate ``content`` paragraph by paragraph, tolerating failures.

    - Line endings are normalized to LF before splitting.
    - Blank lines are preserved as-is.
    - A paragraph whose translation fails is kept in its ORIGINAL form.
      (Previously it was replaced with "", silently dropping content —
      the old inline comment itself suggested keeping the original.)
    - The inter-request delay is applied only BETWEEN API calls, so the
      last paragraph no longer incurs a pointless trailing sleep.

    Returns the re-joined full text.
    """
    if not content:
        return ""
    content = normalize_newlines(content)
    translated = []
    made_request = False
    for para in content.split('\n'):
        if not para.strip():
            # Keep blank lines so the paragraph layout survives.
            translated.append("")
            continue
        if made_request:
            # Throttle between API calls to avoid rate limiting.
            time.sleep(REQUEST_DELAY)
        result = translate_single(para)
        made_request = True
        if result is None:
            # Translation failed: fall back to the original paragraph
            # rather than losing its content.
            print(f" ⚠️ 段落翻译失败,跳过: {para[:30]}...")
            translated.append(para)
        else:
            translated.append(result)
    return '\n'.join(translated)
# ================== 数据库操作 ==================
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Update the title and content of one ``indeximos`` row by ``es_sid``.

    Uses parameterized placeholders, so values are escaped by the DB
    driver.  Bug fix: the placeholders were written as ``% s`` (with a
    space), which only works by accident of Python %-formatting flag
    handling; the standard DB-API 'format' paramstyle token is ``%s``.
    """
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
def needs_translation(text: str) -> bool:
    """Return True when ``text`` should be sent to the translator.

    - Empty / whitespace-only text: False (nothing to translate).
    - Detected Chinese of ANY variant: False.  Bug fix: the old check
      compared against ``'zh-cn'`` only, so langdetect's ``'zh-tw'``
      (Traditional Chinese) was wrongly flagged for translation, which
      contradicted the documented intent ("language is 'zh' → skip").
    - Undetectable text (digits, punctuation, ...): True, conservatively.
    """
    if not text or not text.strip():
        return False  # nothing translatable in blank input
    try:
        lang = detect(text.strip())
    except LangDetectException:
        # Cannot determine the language — translate to be safe.
        return True
    # langdetect reports Chinese as 'zh-cn' / 'zh-tw'; skip both.
    return not lang.startswith('zh')

View File

@ -19,4 +19,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(dirpath) sys.path.append(dirpath)
# 等效于scrapy crawl FacebookUserSpider -a params="{}" # 等效于scrapy crawl FacebookUserSpider -a params="{}"
execute(['scrapy', 'crawl', 'TwitterUserInfoSpider', '-a', 'params={}']) execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])