Compare commits
No commits in common. "d5be45ec95e76b6426395c93392f1ea16284b33f" and "7b3a83a1aba9bbc7f5162db5dbfb0ae8cf062d89" have entirely different histories.
d5be45ec95
...
7b3a83a1ab
@ -183,16 +183,13 @@ class TwitterUserInfoItem(scrapy.Item):
|
|||||||
username = scrapy.Field() # VARCHAR(50) - 用户名(@后部分)
|
username = scrapy.Field() # VARCHAR(50) - 用户名(@后部分)
|
||||||
nickname = scrapy.Field() # VARCHAR(100) - 显示名称
|
nickname = scrapy.Field() # VARCHAR(100) - 显示名称
|
||||||
user_url = scrapy.Field() # VARCHAR(255) - 主页URL
|
user_url = scrapy.Field() # VARCHAR(255) - 主页URL
|
||||||
user_link = scrapy.Field() # VARCHAR(255) - 用户链接
|
|
||||||
avatar_url = scrapy.Field() # VARCHAR(500) - 头像原始URL
|
avatar_url = scrapy.Field() # VARCHAR(500) - 头像原始URL
|
||||||
avatar_path = scrapy.Field() # VARCHAR(255) - 本地头像路径
|
avatar_path = scrapy.Field() # VARCHAR(255) - 本地头像路径
|
||||||
backgroud_image_url = scrapy.Field() # VARCHAR(255) - 背景图原始URL
|
|
||||||
background_image_path = scrapy.Field() # VARCHAR(255) - 背景图路径
|
|
||||||
intro = scrapy.Field() # TEXT - 简介
|
intro = scrapy.Field() # TEXT - 简介
|
||||||
city = scrapy.Field() # VARCHAR(100) - 城市
|
city = scrapy.Field() # VARCHAR(100) - 城市
|
||||||
join_date = scrapy.Field() # DATETIME - 加入时间
|
join_date = scrapy.Field() # DATETIME - 加入时间
|
||||||
signature = scrapy.Field() # VARCHAR(255) - 用户签名
|
|
||||||
tags = scrapy.Field() # VARCHAR(255) - 标签:官方代表/媒体实体/名人
|
|
||||||
post_count = scrapy.Field() # INT UNSIGNED - 推文数
|
post_count = scrapy.Field() # INT UNSIGNED - 推文数
|
||||||
is_verified = scrapy.Field() # VARCHAR(10) - 是否认证 ("True"/"False")
|
is_verified = scrapy.Field() # VARCHAR(10) - 是否认证 ("True"/"False")
|
||||||
follow_count = scrapy.Field() # INT UNSIGNED - 关注人数
|
follow_count = scrapy.Field() # INT UNSIGNED - 关注人数
|
||||||
|
|||||||
@ -273,7 +273,6 @@ class TwitterUserDataSaveToMySQL(object):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.db = None
|
self.db = None
|
||||||
self.cursor = None
|
self.cursor = None
|
||||||
self.update_fileds = []
|
|
||||||
|
|
||||||
def open_spider(self, spider):
|
def open_spider(self, spider):
|
||||||
self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A',
|
self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A',
|
||||||
@ -292,7 +291,17 @@ class TwitterUserDataSaveToMySQL(object):
|
|||||||
return item
|
return item
|
||||||
self.table_name = "twitter_user_info"
|
self.table_name = "twitter_user_info"
|
||||||
|
|
||||||
self.extract_avatar_and_background_paths(item)
|
value = item.get('avatar_path')
|
||||||
|
|
||||||
|
# 处理 avatar_path
|
||||||
|
if isinstance(value, list) and len(value) > 0:
|
||||||
|
value = value[0].get('path', '') if isinstance(value[0], dict) else str(value[0])
|
||||||
|
elif isinstance(value, dict):
|
||||||
|
value = value.get('path', '')
|
||||||
|
else:
|
||||||
|
value = str(value) if value else ''
|
||||||
|
|
||||||
|
item['avatar_path'] = value
|
||||||
try:
|
try:
|
||||||
user_id = item.get('user_id')
|
user_id = item.get('user_id')
|
||||||
if not user_id:
|
if not user_id:
|
||||||
@ -338,7 +347,7 @@ class TwitterUserDataSaveToMySQL(object):
|
|||||||
def _needs_update(self, db_record, item):
|
def _needs_update(self, db_record, item):
|
||||||
"""比较数据库记录与 item 是否有差异"""
|
"""比较数据库记录与 item 是否有差异"""
|
||||||
for field in item.fields:
|
for field in item.fields:
|
||||||
if field in ['id', 'created_at', 'updated_at', 'image_urls', 'crawl_time', 'join_date']:
|
if field in ['id', 'created_at', 'updated_at', 'image_urls']:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
item_val = item.get(field)
|
item_val = item.get(field)
|
||||||
@ -351,9 +360,8 @@ class TwitterUserDataSaveToMySQL(object):
|
|||||||
db_val = None
|
db_val = None
|
||||||
|
|
||||||
if item_val != db_val:
|
if item_val != db_val:
|
||||||
self.update_fileds.append(field)
|
return True
|
||||||
|
return False
|
||||||
return len(self.update_fileds)>0
|
|
||||||
|
|
||||||
|
|
||||||
def _update_item(self, record_uuid, item):
|
def _update_item(self, record_uuid, item):
|
||||||
@ -361,8 +369,8 @@ class TwitterUserDataSaveToMySQL(object):
|
|||||||
update_fields = []
|
update_fields = []
|
||||||
update_vals = []
|
update_vals = []
|
||||||
|
|
||||||
for field in self.update_fileds:
|
for field in item.fields:
|
||||||
if field in ['id', 'created_at', 'image_urls']:
|
if field in ['id', 'created_at', 'updated_at', 'image_urls']:
|
||||||
continue
|
continue
|
||||||
value = item.get(field)
|
value = item.get(field)
|
||||||
|
|
||||||
@ -431,16 +439,3 @@ class TwitterUserDataSaveToMySQL(object):
|
|||||||
logging.error(f"数据库操作发生未知错误:{e}")
|
logging.error(f"数据库操作发生未知错误:{e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def extract_avatar_and_background_paths(self, item):
    """Split ``item['avatar_path']`` into avatar and background image paths.

    The first entry becomes ``item['avatar_path']`` and the second entry
    becomes ``item['background_image_path']``; a missing entry is stored as
    ``None``. Each entry may be a dict (its ``'path'`` key is used, ``''``
    when absent) or any other value (converted with ``str``).

    NOTE(review): the entries are presumably the results of the image
    download step for ``image_urls`` (avatar first, banner second) --
    confirm against the pipeline that fills ``avatar_path``.

    Args:
        item: Mapping-like item; modified in place.
    """
    value = item.get('avatar_path')
    if isinstance(value, (dict, str)):
        # A single, already-resolved entry (one result dict or a plain path
        # string) is treated as a one-element list instead of being thrown
        # away, so running this on an already-processed item keeps the path.
        value = [value] if value else []
    elif not isinstance(value, list):
        value = []

    def get_path(val):
        return val.get('path', '') if isinstance(val, dict) else str(val)

    item['avatar_path'] = get_path(value[0]) if len(value) > 0 else None
    item['background_image_path'] = get_path(value[1]) if len(value) > 1 else None
|
|
||||||
|
|||||||
@ -206,14 +206,6 @@ EXTENSIONS = {
|
|||||||
'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
|
'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
|
||||||
}
|
}
|
||||||
|
|
||||||
############################## 翻译
|
|
||||||
MAX_TEXT_LENGTH = 100
|
|
||||||
# 翻译 API 地址(替换为你的服务器 IP 或域名)
|
|
||||||
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
|
|
||||||
# 单次请求间隔(秒),避免 API 被限流
|
|
||||||
REQUEST_DELAY = 1
|
|
||||||
|
|
||||||
|
|
||||||
# Enable or disable extensions
|
# Enable or disable extensions
|
||||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||||
# EXTENSIONS = {
|
# EXTENSIONS = {
|
||||||
|
|||||||
@ -48,16 +48,6 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
super(TwitterSpider, self).__init__(*args, **kwargs)
|
super(TwitterSpider, self).__init__(*args, **kwargs)
|
||||||
self.total_num = 100
|
self.total_num = 100
|
||||||
self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
||||||
self.tags = {
|
|
||||||
"620632841": "媒体实体", # 纽约时报中文网
|
|
||||||
"1714100357582770176": "媒体实体", # 昨天
|
|
||||||
"218434058": "官方代表", # 高市早苗
|
|
||||||
"121669059": "媒体实体", # yonhapnews
|
|
||||||
"8149482": "媒体实体", # 美国之音中文网
|
|
||||||
"46574977": "媒体实体", # 华尔街日报中文网
|
|
||||||
"1260553941714186241": "名人", # 李老师不是你老师
|
|
||||||
"106379129": "官方代表", # 이재명
|
|
||||||
}
|
|
||||||
if params:
|
if params:
|
||||||
json_params = json.loads(params)
|
json_params = json.loads(params)
|
||||||
if 'totalNum' in json_params:
|
if 'totalNum' in json_params:
|
||||||
@ -132,12 +122,18 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
'uid': user_info['userUid'],
|
'uid': user_info['userUid'],
|
||||||
'uname': user_info['userName'],
|
'uname': user_info['userName'],
|
||||||
'proxy': 'http://127.0.0.1:10809',
|
'proxy': 'http://127.0.0.1:10809',
|
||||||
|
'currentCount': 0
|
||||||
},
|
},
|
||||||
cookies=self.cookie_dict, headers=self.header)
|
cookies=self.cookie_dict, headers=self.header)
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
uid = response.request.meta['uid']
|
uid = response.request.meta['uid']
|
||||||
uname = response.request.meta['uname']
|
uname = response.request.meta['uname']
|
||||||
|
current_count = response.request.meta['currentCount']
|
||||||
|
if current_count > 0:
|
||||||
|
self.logger.info("翻页采集:第%s页" % int(current_count / 20 + 1))
|
||||||
|
else:
|
||||||
|
self.logger.info("首页采集")
|
||||||
try:
|
try:
|
||||||
rsp = json.loads(response.text)
|
rsp = json.loads(response.text)
|
||||||
entries = []
|
entries = []
|
||||||
@ -146,19 +142,14 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
|
item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
|
||||||
item['is_newest'] = 1
|
item['is_newest'] = 1
|
||||||
item['platform_type'] = "Twitter"
|
item['platform_type'] = "Twitter"
|
||||||
item['user_id'] = int(instructions['rest_id'])
|
item['user_id'] = instructions['rest_id']
|
||||||
item['nickname'] = instructions['core']['name']
|
item['nickname'] = instructions['core']['name']
|
||||||
item['username'] = instructions['core']['screen_name']
|
item['username'] = instructions['core']['screen_name']
|
||||||
item['user_url'] = f'https://x.com/{uname}'
|
item['user_url'] = f'https://x.com/{uname}'
|
||||||
item['user_link'] = f'https://x.com/{uname}'
|
|
||||||
item['avatar_url'] = instructions['avatar']['image_url']
|
item['avatar_url'] = instructions['avatar']['image_url']
|
||||||
|
item['image_urls'] = [instructions['avatar']['image_url']]
|
||||||
item['intro'] = instructions['legacy']['description']
|
item['intro'] = instructions['legacy']['description']
|
||||||
item['city'] = instructions.get('location', {}).get('location', '').strip()
|
item['city'] = instructions.get('legacy', {}).get('location', {}).get('location', '').strip()
|
||||||
item['backgroud_image_url'] = instructions.get('legacy', {}).get('profile_banner_url', '')
|
|
||||||
item['image_urls'] = [
|
|
||||||
instructions['avatar']['image_url'],
|
|
||||||
instructions.get('legacy', {}).get('profile_banner_url', '').strip()
|
|
||||||
]
|
|
||||||
try:
|
try:
|
||||||
# 转换为 datetime 对象
|
# 转换为 datetime 对象
|
||||||
ts = get_time_stamp(
|
ts = get_time_stamp(
|
||||||
@ -168,14 +159,10 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
except (ValueError, KeyError) as e:
|
except (ValueError, KeyError) as e:
|
||||||
item['join_date'] = None # 或记录日志
|
item['join_date'] = None # 或记录日志
|
||||||
logger.error('时间转换失败:' + e)
|
logger.error('时间转换失败:' + e)
|
||||||
item['signature'] = instructions.get('legacy', {}).get('description', '').strip() or instructions.get('profile_bio', {}).get(
|
|
||||||
'description', '').strip()
|
|
||||||
item['post_count'] = instructions['legacy']['statuses_count']
|
item['post_count'] = instructions['legacy']['statuses_count']
|
||||||
item['follow_count'] = instructions['legacy']['friends_count']
|
item['follow_count'] = instructions['legacy']['friends_count']
|
||||||
item['fans_count'] = instructions['legacy']['followers_count']
|
item['fans_count'] = instructions['legacy']['followers_count']
|
||||||
item['is_verified'] = str(instructions['is_blue_verified'])
|
item['is_verified'] = str(instructions['is_blue_verified'])
|
||||||
item['tags'] = self.tags[uid]
|
|
||||||
|
|
||||||
verified_type = instructions.get('verification', {}).get('verified_type', None) # 认证类型
|
verified_type = instructions.get('verification', {}).get('verified_type', None) # 认证类型
|
||||||
yield item
|
yield item
|
||||||
except:
|
except:
|
||||||
|
|||||||
@ -19,8 +19,6 @@ from MediaSpiders.utils.login_utils import login
|
|||||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
|
||||||
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
|
||||||
|
|
||||||
|
|
||||||
def form_cookie_dict(cookie_string):
|
def form_cookie_dict(cookie_string):
|
||||||
cookie_string_list = cookie_string.split(';')
|
cookie_string_list = cookie_string.split(';')
|
||||||
@ -190,25 +188,14 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
item['es_isrepost'] = 1
|
item['es_isrepost'] = 1
|
||||||
item['es_urltitle'] = author_full_text
|
item['es_urltitle'] = author_full_text
|
||||||
item['es_catalog1'] = author_full_text
|
item['es_catalog1'] = author_full_text
|
||||||
# 判断是否需要翻译
|
|
||||||
if needs_translation(author_full_text):
|
|
||||||
item['es_content'] = translate_single(author_full_text) # TODO 翻译
|
|
||||||
else:
|
|
||||||
item['es_content'] = url_content
|
|
||||||
legacy = result['quoted_status_result']['result']['legacy']
|
legacy = result['quoted_status_result']['result']['legacy']
|
||||||
self.logger.info('采集引用推文原文信息')
|
self.logger.info('采集引用推文原文信息')
|
||||||
elif 'retweeted_status_result' in legacy:
|
elif 'retweeted_status_result' in legacy:
|
||||||
item['es_isrepost'] = 1
|
item['es_isrepost'] = 1
|
||||||
legacy = legacy['retweeted_status_result']['result']['legacy']
|
legacy = legacy['retweeted_status_result']['result']['legacy']
|
||||||
self.logger.info('采集转发推文原文信息')
|
self.logger.info('采集转发推文原文信息')
|
||||||
|
item['es_content'] = legacy['full_text']
|
||||||
item['es_urlcontent'] = legacy['full_text']
|
item['es_urlcontent'] = legacy['full_text']
|
||||||
# 获取文本
|
|
||||||
url_content = legacy['full_text']
|
|
||||||
# 判断是否需要翻译
|
|
||||||
if needs_translation(url_content):
|
|
||||||
item['es_content'] = translate_content_with_paragraphs(url_content) # TODO 翻译
|
|
||||||
else:
|
|
||||||
item['es_content'] = url_content
|
|
||||||
# 下载图片
|
# 下载图片
|
||||||
image_url_list = []
|
image_url_list = []
|
||||||
if 'entities' in legacy and 'media' in legacy['entities']:
|
if 'entities' in legacy and 'media' in legacy['entities']:
|
||||||
|
|||||||
@ -1,94 +0,0 @@
|
|||||||
from MediaSpiders.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
from typing import List, Tuple, Optional
|
|
||||||
from langdetect import detect, LangDetectException
|
|
||||||
|
|
||||||
def normalize_newlines(text: str) -> str:
    r"""Rewrite every \r\n and every lone \r in *text* as \n.

    Falsy input (an empty string or None) is handed back untouched.
    """
    if text:
        # Collapse the two-character sequence first so "\r\n" never turns
        # into two newlines.
        unified = text.replace('\r\n', '\n')
        return unified.replace('\r', '\n')
    return text
|
|
||||||
|
|
||||||
|
|
||||||
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one piece of text through the HTTP translation API.

    A JSON payload with ``text``, ``source_lang`` and ``target_lang`` is
    POSTed to ``TRANSLATE_API_URL`` (10 second timeout) and the
    ``translated_text`` field of the JSON response is returned.

    Args:
        text: Text to translate.
        source_lang: Source language code sent to the API.
        target_lang: Target language code sent to the API.

    Returns:
        ``""`` when *text* is empty or whitespace-only (no request is made);
        the translated text on success; ``None`` when the request fails, the
        response is not valid JSON, the JSON body is not an object, or it has
        no ``translated_text`` key.
    """
    if not text or not text.strip():
        return ""

    # NOTE: anything beyond MAX_TEXT_LENGTH characters is cut off before the
    # request is sent, so the tail of a long text is never translated.
    payload = {
        "text": text[:MAX_TEXT_LENGTH],
        "source_lang": source_lang,
        "target_lang": target_lang,
    }

    try:
        response = requests.post(TRANSLATE_API_URL, json=payload, timeout=10)
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable bodies are expected runtime
        # failures; anything else is a programming error and should surface.
        print(f"⚠️ 翻译失败: {e}")
        return None

    if not isinstance(result, dict):
        print(f"⚠️ 翻译失败: unexpected response body {result!r}")
        return None
    return result.get("translated_text")
|
|
||||||
|
|
||||||
|
|
||||||
def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* line by line and stitch the result back together.

    Line endings are normalised first, then the text is split on newlines.
    Blank lines are kept as blank lines without calling the API. Every other
    line goes through ``translate_single``; a line whose translation fails
    (``None``) is reported and replaced by an empty line. The function pauses
    for ``REQUEST_DELAY`` seconds after each translated line.

    Returns:
        The translated lines joined with newlines, or ``""`` for falsy input.
    """
    if not content:
        return ""

    pieces = []
    for line in normalize_newlines(content).split('\n'):
        if not line.strip():
            # Preserve blank lines as-is.
            pieces.append("")
            continue

        translated = translate_single(line)
        if translated is not None:
            pieces.append(translated)
        else:
            # Translation failed: report it and leave an empty line in place.
            print(f" ⚠️ 段落翻译失败,跳过: {line[:30]}...")
            pieces.append("")

        time.sleep(REQUEST_DELAY)

    return '\n'.join(pieces)
|
|
||||||
|
|
||||||
|
|
||||||
# ================== 数据库操作 ==================
|
|
||||||
|
|
||||||
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Overwrite the title and content of one ``indeximos`` row.

    Executes a parameterized ``UPDATE`` on *cursor*; committing is left to
    the caller.

    Args:
        cursor: DB-API cursor whose driver uses the ``%s`` placeholder style.
        es_sid: Value matched against the ``es_sid`` column.
        new_title: New value for ``es_title``.
        new_content: New value for ``es_content``.
    """
    # Placeholders are the canonical ``%s`` form: the spaced ``% s`` variant
    # only works with drivers that build the query via Python %-formatting.
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
|
|
||||||
|
|
||||||
|
|
||||||
def needs_translation(text: str) -> bool:
    """Decide whether *text* should be sent to the translation service.

    Returns:
        ``False`` when *text* is empty or whitespace-only, or when the
        detected language code is a Chinese one (any code starting with
        ``zh``, e.g. ``zh-cn`` or ``zh-tw``). ``True`` for every other
        detected language, and also when detection raises
        ``LangDetectException`` (e.g. text made only of digits or
        punctuation), which is conservatively treated as needing translation.
    """
    if not text or not text.strip():
        return False  # Nothing to translate.

    try:
        lang = detect(text.strip())
    except LangDetectException:
        # Language could not be determined; err on the side of translating.
        return True

    # Match every Chinese variant rather than only 'zh-cn', so Chinese text
    # detected as 'zh-tw' is not sent off to be translated into Chinese.
    return not lang.startswith('zh')
|
|
||||||
@ -19,4 +19,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
|
|||||||
|
|
||||||
sys.path.append(dirpath)
|
sys.path.append(dirpath)
|
||||||
# 等效于:scrapy crawl FacebookUserSpider -a params="{}"
|
# 等效于:scrapy crawl FacebookUserSpider -a params="{}"
|
||||||
execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])
|
execute(['scrapy', 'crawl', 'TwitterUserInfoSpider', '-a', 'params={}'])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user