Compare commits
70 Commits
remote-dsp
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0e89a4f642 | ||
|
|
36fa73bd12 | ||
|
|
734cab960b | ||
|
|
592e8b6cfb | ||
|
|
5b3bf034f8 | ||
|
|
00789ba275 | ||
|
|
8c3f2ffad0 | ||
|
|
85158d00db | ||
|
|
e1893f0ae3 | ||
|
|
d01035577b | ||
|
|
feb48f3579 | ||
|
|
d4ac0c27cd | ||
|
|
2c974902ab | ||
|
|
b5384f8bcd | ||
|
|
2e9654966b | ||
|
|
e4f28c6a89 | ||
|
|
953fdc81dc | ||
|
|
cfe6c3af85 | ||
|
|
63b0d7090a | ||
|
|
dfb65f11bf | ||
|
|
d3a46db615 | ||
|
|
cde711f1d5 | ||
|
|
15819c7260 | ||
|
|
ca3d5fb9f3 | ||
|
|
5a3a1a5c52 | ||
|
|
df4c8cceac | ||
|
|
d44a40bd1f | ||
|
|
eea15d62e1 | ||
|
|
3fdd2f5473 | ||
|
|
d023703622 | ||
|
|
a375b8ead1 | ||
|
|
1f3d824cda | ||
|
|
49d20396b2 | ||
|
|
31580190e2 | ||
|
|
959ffe6b2e | ||
|
|
93a8ff5ef4 | ||
|
|
5cc725fae9 | ||
|
|
f62c55de99 | ||
|
|
4d2035b9fa | ||
|
|
bceee18244 | ||
|
|
d5be45ec95 | ||
|
|
b827e33dbd | ||
|
|
7b3a83a1ab | ||
|
|
8631b0febf | ||
|
|
bf91c06801 | ||
|
|
4d3cb2381a | ||
|
|
073f4325d0 | ||
|
|
8c84df0fdc | ||
|
|
f7a210473a | ||
|
|
ce478f495c | ||
|
|
92c8cdf9b2 | ||
|
|
0008e619d1 | ||
|
|
399165404e | ||
|
|
910794aff7 | ||
|
|
ee9473a6c3 | ||
|
|
0b2abd342a | ||
|
|
9a36e9c5b5 | ||
|
|
488bc2fdca | ||
|
|
91d3f484f0 | ||
| 5a087b3275 | |||
|
|
89df3771e7 | ||
|
|
becee60b6c | ||
|
|
ee958357b0 | ||
|
|
a69ff25ce4 | ||
|
|
79b2d0d20a | ||
|
|
81cac59adb | ||
|
|
970d86ed7d | ||
|
|
4de86e5f40 | ||
|
|
3cb5a02f03 | ||
|
|
9f3eb9cf94 |
2
.gitignore
vendored
2
.gitignore
vendored
@ -8,6 +8,7 @@ __pycache__/
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
.vscode/
|
||||
/.venv/
|
||||
env/
|
||||
build/
|
||||
develop-eggs/
|
||||
@ -64,3 +65,4 @@ target/
|
||||
node_modules/
|
||||
.arts/
|
||||
.jlsp/
|
||||
*.iml
|
||||
|
||||
21
deploy/baidu-translate/Dockerfile
Normal file
21
deploy/baidu-translate/Dockerfile
Normal file
@ -0,0 +1,21 @@
|
||||
# Official Python 3.8.2 slim image (minimal variant)
FROM python:3.8.2-slim

# Set the working directory
WORKDIR /app

# Copy requirements before the app code so the dependency layer is cached by Docker
COPY requirements.txt .

# Upgrade pip and install dependencies (Tsinghua mirror speeds up installs from China; optional)
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/

# Copy the application code
COPY . .

# Port the Flask/gunicorn app listens on
EXPOSE 5000

# Start command (gunicorn for production-grade serving)
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "60", "main:app"]
|
||||
57
deploy/baidu-translate/main.py
Normal file
57
deploy/baidu-translate/main.py
Normal file
@ -0,0 +1,57 @@
|
||||
# main.py
|
||||
|
||||
from flask import Flask, request, jsonify
|
||||
from translate import translate_text
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route('/translate', methods=['POST'])
def translate_api():
    """
    Multi-language translation endpoint.

    Example request body::

        {
            "text": "Hello world",
            "source_lang": "en",   # optional, defaults to "auto"
            "target_lang": "zh"    # optional, defaults to "zh"
        }
    """
    payload = request.get_json()
    # Reject requests with no JSON body or without the mandatory 'text' field.
    if not payload or 'text' not in payload:
        return jsonify({"error": "缺少参数 'text'"}), 400

    src = payload.get('source_lang', 'auto')
    dst = payload.get('target_lang', 'zh')

    outcome = translate_text(payload['text'], src, dst)

    # Translation failures surface as a 400 with the error string from the client module.
    if not outcome['success']:
        return jsonify({"error": outcome['error']}), 400

    return jsonify({
        "translated_text": outcome['translated_text'],
        "source_lang": src,
        "target_lang": dst
    })
|
||||
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: report that the service is up and identify it."""
    status = {"status": "ok", "service": "baidu-translate"}
    return jsonify(status)
|
||||
|
||||
|
||||
@app.route('/', methods=['GET'])
def index():
    """Describe the service and enumerate its available endpoints."""
    endpoints = {
        "translate": "POST /translate",
        "health": "GET /health"
    }
    return jsonify({"message": "Baidu Translate API Service", "endpoints": endpoints})
|
||||
|
||||
|
||||
# Development entrypoint only; the Docker image runs the app under gunicorn
# instead, so this branch is not taken in production.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=False)
|
||||
3
deploy/baidu-translate/requirements.txt
Normal file
3
deploy/baidu-translate/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
Flask==2.3.3
|
||||
requests==2.31.0
|
||||
gunicorn==21.2.0
|
||||
26
deploy/baidu-translate/run.sh
Normal file
26
deploy/baidu-translate/run.sh
Normal file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Build and (re)start the baidu-translate container.
# Abort on the first failing command so a failed build never restarts a stale image.
set -e

# Build the image
echo "正在构建 Docker 镜像..."
docker build -t baidu-translate-api:latest .

# Stop the old container if one is currently running
if [ "$(docker ps -q -f name=baidu-translate)" ]; then
    echo "停止旧容器..."
    docker stop baidu-translate
fi

# Remove the old container if it is left in the exited state
if [ "$(docker ps -aq -f status=exited -f name=baidu-translate)" ]; then
    echo "删除旧容器..."
    docker rm baidu-translate
fi

# Start the new container: host port 28081 maps to the container's port 5000
echo "启动容器..."
docker run -d \
    --name baidu-translate \
    -p 28081:5000 \
    --restart unless-stopped \
    baidu-translate-api:latest

# BUG FIX: the message previously pointed at port 5000, but the service is
# published on host port 28081 (see -p 28081:5000 above).
echo "服务已启动!访问 http://<服务器IP>:28081/health"
|
||||
26
deploy/baidu-translate/settings.py
Normal file
26
deploy/baidu-translate/settings.py
Normal file
@ -0,0 +1,26 @@
|
||||
# settings.py

import os

# Baidu Translate API credentials.
# SECURITY: the previous revision committed real credentials to source control;
# they should be considered leaked and rotated. Credentials can now be supplied
# via the BAIDU_APP_ID / BAIDU_SECRET_KEY environment variables; the original
# literals remain only as a backward-compatible fallback.
BAIDU_APP_ID = os.environ.get("BAIDU_APP_ID", "20200811000539778")
BAIDU_SECRET_KEY = os.environ.get("BAIDU_SECRET_KEY", "uK9IyUhuEWX3PIqN75iC")

# HTTP timeout (seconds) for requests to the Baidu API.
TIMEOUT = 10
# Maximum number of characters sent per request; longer input is truncated
# by translate_text before signing/sending.
MAX_TEXT_LENGTH = 100

# Language code mapping (ISO 639-1 -> Baidu code)
BAIDU_LANG_MAP = {
    'zh': 'zh',
    'en': 'en',
    'ko': 'kor',
    'ja': 'jp',
    'fr': 'fra',
    'es': 'spa',
    'ru': 'ru',
    'de': 'de',
    'pt': 'pt',
    'it': 'it',
    'ar': 'ara',
    'th': 'th',
    'vi': 'vie',
    # extend as needed
}
|
||||
71
deploy/baidu-translate/translate.py
Normal file
71
deploy/baidu-translate/translate.py
Normal file
@ -0,0 +1,71 @@
|
||||
# translate.py
|
||||
|
||||
import hashlib
|
||||
import random
|
||||
import requests
|
||||
import settings
|
||||
|
||||
|
||||
def iso_to_baidu_lang(iso_code: str) -> str:
    """Map an ISO 639-1 language code onto the code Baidu Translate expects.

    Raises ValueError when the code has no entry in settings.BAIDU_LANG_MAP.
    """
    normalized = iso_code.lower()
    if normalized not in settings.BAIDU_LANG_MAP:
        raise ValueError(f"不支持的语言代码: {iso_code}")
    return settings.BAIDU_LANG_MAP[normalized]
|
||||
|
||||
|
||||
def translate_text(text: str, source_lang: str = "auto", target_lang: str = "zh") -> dict:
    """
    Translate *text* through the Baidu Translate HTTP API.

    :param text: source text to translate
    :param source_lang: source language as an ISO 639-1 code (e.g. 'ko', 'en');
        'auto' (the default) lets Baidu detect the language
    :param target_lang: target language as an ISO 639-1 code, default 'zh'
    :return: {'success': bool, 'translated_text': str, 'error': str (optional)}
        All failures are reported through the returned dict; this function
        never raises.
    """
    # Reject empty / whitespace-only input up front.
    if not text or not text.strip():
        return {"success": False, "error": "输入文本为空"}

    try:
        # 'auto' is passed through verbatim; any other code must be known.
        from_lang = "auto" if source_lang == "auto" else iso_to_baidu_lang(source_lang)
        to_lang = iso_to_baidu_lang(target_lang)
    except ValueError as e:
        return {"success": False, "error": str(e)}

    # NOTE(review): input longer than MAX_TEXT_LENGTH is silently truncated
    # here — callers are not told. Consider returning an error instead.
    q = text[:settings.MAX_TEXT_LENGTH]

    try:
        # Baidu's request signature: MD5(appid + query + salt + secret).
        # MD5 is mandated by the Baidu API protocol, not a security choice here.
        salt = random.randint(32768, 65536)
        sign_str = settings.BAIDU_APP_ID + q + str(salt) + settings.BAIDU_SECRET_KEY
        sign = hashlib.md5(sign_str.encode()).hexdigest()

        payload = {
            'q': q,
            'from': from_lang,
            'to': to_lang,
            'appid': settings.BAIDU_APP_ID,
            'salt': salt,
            'sign': sign
        }

        response = requests.post(
            "https://fanyi-api.baidu.com/api/trans/vip/translate",
            data=payload,
            timeout=settings.TIMEOUT
        )
        response.raise_for_status()
        result = response.json()

        # Baidu reports failures inside a 200 response via 'error_code'.
        if 'error_code' in result:
            error_msg = f"百度API错误 {result.get('error_code')}: {result.get('error_msg', '')}"
            return {"success": False, "error": error_msg}

        if 'trans_result' not in result or not result['trans_result']:
            return {"success": False, "error": "翻译结果为空"}

        # Only the first segment's translation is returned.
        translated = result['trans_result'][0]['dst']
        return {"success": True, "translated_text": translated}

    except requests.exceptions.RequestException as e:
        return {"success": False, "error": f"网络请求失败: {str(e)}"}
    except Exception as e:
        # Broad catch preserves the never-raise contract; the error is
        # surfaced to the caller in the result dict instead.
        return {"success": False, "error": f"未知错误: {str(e)}"}
|
||||
198
dsp/dsp.iml
198
dsp/dsp.iml
@ -1,199 +1,23 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
|
||||
<module version="4">
|
||||
<component name="FacetManager">
|
||||
<facet type="web" name="Web">
|
||||
<configuration>
|
||||
<webroots />
|
||||
</configuration>
|
||||
</facet>
|
||||
<facet type="jpa" name="JPA">
|
||||
<configuration>
|
||||
<setting name="validation-enabled" value="true" />
|
||||
<setting name="provider-name" value="Hibernate" />
|
||||
<datasource-mapping>
|
||||
<factory-entry name="entityManagerFactory" />
|
||||
</datasource-mapping>
|
||||
<naming-strategy-map />
|
||||
</configuration>
|
||||
</facet>
|
||||
<facet type="Spring" name="Spring">
|
||||
<configuration />
|
||||
</facet>
|
||||
</component>
|
||||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
|
||||
<output url="file://$MODULE_DIR$/target/classes" />
|
||||
<output-test url="file://$MODULE_DIR$/target/test-classes" />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/target" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="library" name="Maven: org.jetbrains:annotations:26.0.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.18" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.alibaba:fastjson:1.2.75" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.projectlombok:lombok:1.18.16" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-web:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-logging:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: ch.qos.logback:logback-classic:1.2.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: ch.qos.logback:logback-core:1.2.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.13.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.13.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.30" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.annotation:jakarta.annotation-api:1.3.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.27" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-json:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.module:jackson-module-parameter-names:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-tomcat:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.tomcat.embed:tomcat-embed-core:9.0.41" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.glassfish:jakarta.el:3.0.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.tomcat.embed:tomcat-embed-websocket:9.0.41" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-web:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-beans:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-webmvc:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-aop:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-expression:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-data-elasticsearch:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.data:spring-data-elasticsearch:4.1.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-tx:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.data:spring-data-commons:2.4.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:transport-netty4-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.netty:netty-buffer:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.netty:netty-codec:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.netty:netty-codec-http:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.netty:netty-common:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.netty:netty-handler:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.netty:netty-resolver:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.netty:netty-transport:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.client:elasticsearch-rest-high-level-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-core:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-secure-sm:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-x-content:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-smile:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-geo:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-core:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-analyzers-common:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-backward-codecs:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-grouping:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-highlighter:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-join:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-memory:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-misc:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-queries:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-queryparser:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-sandbox:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-spatial-extras:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-spatial3d:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-suggest:8.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-cli:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.carrotsearch:hppc:0.8.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: joda-time:joda-time:2.10.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.tdunning:t-digest:3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.hdrhistogram:HdrHistogram:2.1.9" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch:jna:4.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.client:elasticsearch-rest-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpclient:4.5.13" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.14" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpasyncclient:4.1.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore-nio:4.4.14" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.15" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:mapper-extras-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:parent-join-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:aggs-matrix-stats-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:rank-eval-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:lang-mustache-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.github.spullara.mustache.java:compiler:0.9.6" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-core:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-stream:3.0.7.RELEASE" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-validation:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.hibernate.validator:hibernate-validator:6.1.6.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.validation:jakarta.validation-api:2.0.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.jboss.logging:jboss-logging:3.4.1.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml:classmate:1.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-messaging:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-core:5.4.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.projectreactor:reactor-core:3.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.reactivestreams:reactive-streams:1.0.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-jmx:5.4.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.retry:spring-retry:1.3.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: javax.annotation:javax.annotation-api:1.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-function-context:3.0.9.RELEASE" level="project" />
|
||||
<orderEntry type="library" name="Maven: net.jodah:typetools:0.6.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-function-core:3.0.9.RELEASE" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-stream-binder-kafka:3.0.7.RELEASE" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-stream-binder-kafka-core:3.0.7.RELEASE" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-kafka:5.4.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.kafka:kafka-clients:2.6.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.github.luben:zstd-jni:1.4.4-7" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.lz4:lz4-java:1.7.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.xerial.snappy:snappy-java:1.1.7.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.kafka:spring-kafka:2.6.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.google.protobuf:protobuf-java:3.11.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.squareup.okhttp3:okhttp:4.8.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.squareup.okio:okio:2.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.jetbrains.kotlin:kotlin-stdlib-common:1.4.21" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.jetbrains.kotlin:kotlin-stdlib:1.4.21" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.boot:spring-boot-starter-test:2.4.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.boot:spring-boot-test:2.4.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.boot:spring-boot-test-autoconfigure:2.4.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.jayway.jsonpath:json-path:2.4.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:json-smart:2.3" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:accessors-smart:1.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.ow2.asm:asm:5.0.4" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: jakarta.xml.bind:jakarta.xml.bind-api:2.3.3" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: jakarta.activation:jakarta.activation-api:1.2.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.assertj:assertj-core:3.18.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest:2.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter:5.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-params:5.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-engine:5.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.platform:junit-platform-engine:1.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-core:3.6.28" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy:1.10.18" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy-agent:1.10.18" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.objenesis:objenesis:3.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-junit-jupiter:3.6.28" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.skyscreamer:jsonassert:1.5.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.vaadin.external.google:android-json:0.0.20131108.vaadin1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-core:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-jcl:5.3.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.springframework:spring-test:5.3.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.xmlunit:xmlunit-core:2.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.cloud:spring-cloud-stream-test-support:3.0.7.RELEASE" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-autoconfigure:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-databind:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-annotations:2.11.3" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.kafka:spring-kafka-test:2.6.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-context:5.3.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka-clients:test:2.6.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka-streams:2.6.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:connect-json:2.6.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:connect-api:2.6.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.rocksdb:rocksdbjni:5.18.4" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka-streams-test-utils:2.6.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka_2.13:2.6.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.fasterxml.jackson.module:jackson-module-scala_2.13:2.11.3" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.fasterxml.jackson.module:jackson-module-paranamer:2.11.3" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.thoughtworks.paranamer:paranamer:2.8" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.11.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: net.sf.jopt-simple:jopt-simple:5.0.4" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.yammer.metrics:metrics-core:2.2.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang.modules:scala-collection-compat_2.13:2.1.6" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang.modules:scala-java8-compat_2.13:0.9.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang:scala-library:2.13.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang:scala-reflect:2.13.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.typesafe.scala-logging:scala-logging_2.13:3.9.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.zookeeper:zookeeper:3.5.8" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.zookeeper:zookeeper-jute:3.5.8" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.yetus:audience-annotations:0.5.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: io.netty:netty-transport-native-epoll:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: io.netty:netty-transport-native-unix-common:4.1.55.Final" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: commons-cli:commons-cli:1.4" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka_2.13:test:2.6.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-api:5.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.apiguardian:apiguardian-api:1.1.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.opentest4j:opentest4j:1.2.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.platform:junit-platform-commons:1.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-devtools:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot:2.4.1" level="project" />
|
||||
<orderEntry type="library" scope="RUNTIME" name="Maven: mysql:mysql-connector-java:8.0.22" level="project" />
|
||||
</component>
|
||||
</module>
|
||||
33
dsp/pom.xml
33
dsp/pom.xml
@ -77,6 +77,7 @@
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.springframework.cloud</groupId>
|
||||
<artifactId>spring-cloud-stream-test-support</artifactId>
|
||||
@ -94,15 +95,39 @@
|
||||
<optional>true</optional>
|
||||
<scope>true</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>mysql</groupId>
|
||||
<artifactId>mysql-connector-java</artifactId>
|
||||
<scope>runtime</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-net</groupId>
|
||||
<artifactId>commons-net</artifactId>
|
||||
<version>3.10.0</version> <!-- 或使用最新版本 -->
|
||||
</dependency>
|
||||
<!-- Spring Data JPA -->
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-data-jpa</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Apache POI for Excel (xlsx) -->
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi</artifactId>
|
||||
<version>5.2.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-ooxml</artifactId>
|
||||
<version>5.2.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jcraft</groupId>
|
||||
<artifactId>jsch</artifactId>
|
||||
<version>0.1.55</version> <!-- 推荐使用 0.1.55+ -->
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<dependencyManagement>
|
||||
|
||||
@ -3,9 +3,10 @@ package com.jsc.dsp;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
import org.springframework.scheduling.annotation.EnableScheduling;
|
||||
|
||||
@SpringBootApplication(exclude = DataSourceAutoConfiguration.class)
|
||||
@SpringBootApplication
|
||||
@EnableScheduling
|
||||
|
||||
public class DspApplication {
|
||||
|
||||
@ -16,18 +16,6 @@ import java.util.concurrent.Executors;
|
||||
@Component
|
||||
public class Configuration {
|
||||
|
||||
@Value("${es.ip}")
|
||||
String esIp;
|
||||
|
||||
@Value("${es.port}")
|
||||
Integer esPort;
|
||||
|
||||
@Value("${es.username}")
|
||||
String esUsername;
|
||||
|
||||
@Value("${es.password}")
|
||||
String esPassword;
|
||||
|
||||
@Bean
|
||||
public JacksonJsonParser getJacksonParser() {
|
||||
return new JacksonJsonParser();
|
||||
@ -48,8 +36,4 @@ public class Configuration {
|
||||
return Executors.newFixedThreadPool(4);
|
||||
}
|
||||
|
||||
@Bean
|
||||
public RestHighLevelClient esClient() {
|
||||
return EsUtils.getElasticsearchClient(esIp, esPort, esUsername, esPassword);
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,77 @@
|
||||
package com.jsc.dsp.controller;
|
||||
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.jsc.dsp.model.ReturnT;
|
||||
import com.jsc.dsp.utils.ExportAndUploadUtils;
|
||||
import com.jsc.dsp.utils.DatabaseConnector;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/export")
|
||||
public class ExportController {
|
||||
|
||||
@Resource
|
||||
DatabaseConnector databaseConnector;
|
||||
|
||||
@Resource
|
||||
ExportAndUploadUtils exportAndUploadUtils;
|
||||
|
||||
@PostMapping("/exportExcel")
|
||||
public ReturnT<String> exportExcel(@RequestBody JSONObject object) {
|
||||
try {
|
||||
String startTime = object.getString("startTime");
|
||||
databaseConnector.exportToXlsx(startTime);
|
||||
return new ReturnT<>(200, "", "");
|
||||
} catch (Exception e) {
|
||||
return new ReturnT<>(500, e.getMessage(), "");
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping("/exportTwitterExcel")
|
||||
public ReturnT<String> triggerTwitterTask(@RequestBody JSONObject object) {
|
||||
try {
|
||||
String startTime = object.getString("startTime");
|
||||
databaseConnector.twitterToXlsx(startTime);
|
||||
return new ReturnT<>(200, "", "");
|
||||
} catch (Exception e) {
|
||||
return new ReturnT<>(500, e.getMessage(), "");
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping("/exportHotSearchExcel")
|
||||
public ReturnT<String> exportHotSearchExcel(@RequestBody JSONObject object) {
|
||||
try {
|
||||
String startTime = object.getString("startTime");
|
||||
databaseConnector.hotSearchToXlsx(startTime);
|
||||
return new ReturnT<>(200, "", "");
|
||||
} catch (Exception e) {
|
||||
return new ReturnT<>(500, e.getMessage(), "");
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping("/triggerTask")
|
||||
public ReturnT<String> triggerTask() {
|
||||
try {
|
||||
new Thread(() -> exportAndUploadUtils.exportNewsDataAndUpload()).start();
|
||||
return new ReturnT<>(200, "", "");
|
||||
} catch (Exception e) {
|
||||
return new ReturnT<>(500, e.getMessage(), "");
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping("/triggerHotSearchTask")
|
||||
public ReturnT<String> triggerHotSearchTask() {
|
||||
try {
|
||||
new Thread(() -> exportAndUploadUtils.exportHotSearchAndUpload()).start();
|
||||
return new ReturnT<>(200, "", "");
|
||||
} catch (Exception e) {
|
||||
return new ReturnT<>(500, e.getMessage(), "");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
10
dsp/src/main/java/com/jsc/dsp/dao/ConfigRepository.java
Normal file
10
dsp/src/main/java/com/jsc/dsp/dao/ConfigRepository.java
Normal file
@ -0,0 +1,10 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.Config;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public interface ConfigRepository extends JpaRepository<Config, Integer> {
|
||||
Config findFirstByConfigName(String configName);
|
||||
}
|
||||
@ -0,0 +1,12 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.EsDataHotSearchView;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Repository
|
||||
public interface EsDataHotSearchRepository extends JpaRepository<EsDataHotSearchView, String> {
|
||||
List<EsDataHotSearchView> findAllByEsLoadtimeAfter(String loadtime);
|
||||
}
|
||||
12
dsp/src/main/java/com/jsc/dsp/dao/EsDataNewsRepository.java
Normal file
12
dsp/src/main/java/com/jsc/dsp/dao/EsDataNewsRepository.java
Normal file
@ -0,0 +1,12 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.EsDataNewsView;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Repository
|
||||
public interface EsDataNewsRepository extends JpaRepository<EsDataNewsView, String> {
|
||||
List<EsDataNewsView> findAllByEsLoadtimeAfter(String loadtime);
|
||||
}
|
||||
@ -0,0 +1,12 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.EsDataTwitterView;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Repository
|
||||
public interface EsDataTwitterRepository extends JpaRepository<EsDataTwitterView, String> {
|
||||
List<EsDataTwitterView> findAllByEsLoadtimeAfter(String loadtime);
|
||||
}
|
||||
@ -0,0 +1,9 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public interface IndeximosRepository extends JpaRepository<Indeximos, String> {
|
||||
}
|
||||
15
dsp/src/main/java/com/jsc/dsp/model/Config.java
Normal file
15
dsp/src/main/java/com/jsc/dsp/model/Config.java
Normal file
@ -0,0 +1,15 @@
|
||||
package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
public class Config {
|
||||
@Id
|
||||
Integer id;
|
||||
String configName;
|
||||
String configValue;
|
||||
}
|
||||
38
dsp/src/main/java/com/jsc/dsp/model/EsDataHotSearchView.java
Normal file
38
dsp/src/main/java/com/jsc/dsp/model/EsDataHotSearchView.java
Normal file
@ -0,0 +1,38 @@
|
||||
package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
@Table(name = "es_data_hot_search")
|
||||
public class EsDataHotSearchView {
|
||||
|
||||
@Id
|
||||
private String esSid;
|
||||
|
||||
private String esUrltime;
|
||||
|
||||
private String esCarriertype;
|
||||
|
||||
private String esSitename;
|
||||
|
||||
private String esSimrank;
|
||||
|
||||
private String esUrltitle;
|
||||
|
||||
private String esUrlcontent;
|
||||
|
||||
private String esUrlname;
|
||||
|
||||
private String esHkey;
|
||||
|
||||
private String esLasttime;
|
||||
|
||||
private String esHeat;
|
||||
|
||||
private String esLoadtime;
|
||||
}
|
||||
38
dsp/src/main/java/com/jsc/dsp/model/EsDataNewsView.java
Normal file
38
dsp/src/main/java/com/jsc/dsp/model/EsDataNewsView.java
Normal file
@ -0,0 +1,38 @@
|
||||
package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
@Table(name = "es_data_news")
|
||||
public class EsDataNewsView {
|
||||
@Id
|
||||
String esSid;
|
||||
String esAuthors;
|
||||
String esCarriertype;
|
||||
String esCatalog;
|
||||
String esCollection;
|
||||
Float esDoclength;
|
||||
String esLang;
|
||||
String esLasttime;
|
||||
String esLinks;
|
||||
String esLoadtime;
|
||||
String esSitename;
|
||||
String esSrcname;
|
||||
String esUrlcontent;
|
||||
String esUrlcontentTranslate;
|
||||
String esUrlimage;
|
||||
String esUrlname;
|
||||
String esUrltime;
|
||||
String esUrltitle;
|
||||
String esUrltitleTranslate;
|
||||
String esAbstract;
|
||||
String esKeywords;
|
||||
String file;
|
||||
String esHkey;
|
||||
String esUrltopic;
|
||||
}
|
||||
54
dsp/src/main/java/com/jsc/dsp/model/EsDataTwitterView.java
Normal file
54
dsp/src/main/java/com/jsc/dsp/model/EsDataTwitterView.java
Normal file
@ -0,0 +1,54 @@
|
||||
package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
@Table(name = "es_data_twitter")
|
||||
public class EsDataTwitterView {
|
||||
|
||||
@Id
|
||||
private String esUrltime;
|
||||
|
||||
private String esAuthors;
|
||||
|
||||
private String esCarriertype;
|
||||
|
||||
private String esSitename;
|
||||
|
||||
private String esUrlcontent;
|
||||
|
||||
private String esUrlcontentTranslate;
|
||||
|
||||
private String esUrlname;
|
||||
|
||||
private String esUrltitle;
|
||||
|
||||
private String esUrltitleTranslate;
|
||||
|
||||
private String esVideo;
|
||||
|
||||
private String esExtname;
|
||||
|
||||
private String esIsrepost;
|
||||
|
||||
private String esCatalog1;
|
||||
|
||||
private String esForwardcount;
|
||||
|
||||
private String esLikecount;
|
||||
|
||||
private String esCommentcount;
|
||||
|
||||
private String esHkey;
|
||||
|
||||
private String esUrlimage;
|
||||
|
||||
private String esUserid;
|
||||
|
||||
private String esLoadtime;
|
||||
}
|
||||
@ -2,10 +2,17 @@ package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
import java.io.Serializable;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
@Table(name = "indeximos")
|
||||
public class Indeximos implements Serializable {
|
||||
@Id
|
||||
String es_sid;
|
||||
String es_abstract;
|
||||
String es_annex;
|
||||
String es_attachment;
|
||||
@ -56,7 +63,6 @@ public class Indeximos implements Serializable {
|
||||
String es_repostuid;
|
||||
String es_repostuname;
|
||||
String es_rultopic;
|
||||
String es_sid;
|
||||
String es_simhash;
|
||||
String es_similarity;
|
||||
String es_similaritycount;
|
||||
|
||||
29
dsp/src/main/java/com/jsc/dsp/service/ConfigService.java
Normal file
29
dsp/src/main/java/com/jsc/dsp/service/ConfigService.java
Normal file
@ -0,0 +1,29 @@
|
||||
package com.jsc.dsp.service;
|
||||
|
||||
import com.jsc.dsp.dao.ConfigRepository;
|
||||
import com.jsc.dsp.model.Config;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
@Service
|
||||
public class ConfigService {
|
||||
|
||||
@Resource
|
||||
ConfigRepository configRepository;
|
||||
|
||||
public String getConfigValueByName(String configName) {
|
||||
return getConfigByName(configName).getConfigValue();
|
||||
}
|
||||
|
||||
public Config getConfigByName(String configName) {
|
||||
return configRepository.findFirstByConfigName(configName);
|
||||
}
|
||||
|
||||
public void setConfigValueByName(String configName, String configValue) {
|
||||
Config config = getConfigByName(configName);
|
||||
config.setConfigValue(configValue);
|
||||
configRepository.save(config);
|
||||
}
|
||||
|
||||
}
|
||||
@ -6,6 +6,7 @@ import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.boot.json.JacksonJsonParser;
|
||||
import org.springframework.cloud.stream.annotation.EnableBinding;
|
||||
import org.springframework.cloud.stream.annotation.StreamListener;
|
||||
@ -21,6 +22,7 @@ import java.util.concurrent.Executors;
|
||||
|
||||
@Component
|
||||
@EnableBinding(FileDlBinding.class)
|
||||
@ConditionalOnProperty(name = "switch.enable-file-dl-service", havingValue = "true", matchIfMissing = true)
|
||||
public class FileDlService extends StreamService {
|
||||
|
||||
@Autowired
|
||||
@ -78,31 +80,14 @@ public class FileDlService extends StreamService {
|
||||
}
|
||||
int dlResult = fileUtils.downloadFromUrl(fileURL, protoSavePath);
|
||||
if (dlResult == 1) {
|
||||
File transferPath = new File(transferBackupPath);
|
||||
File transferPath = new File(protoSavePath);
|
||||
File[] files = transferPath.listFiles();
|
||||
if (files != null && files.length > 0) {
|
||||
for (File transferFile : files) {
|
||||
if (transferFile.getName().endsWith(".tar.gz")) {
|
||||
if (transferFile.getName().startsWith("attach")) {
|
||||
try {
|
||||
fileUtils.UnzipTarGzip(transferFile.getAbsolutePath(), nginxPath);
|
||||
logger.info("Unzip attachments " + transferFile.getName());
|
||||
} catch (Exception e) {
|
||||
logger.error("Unzip error!");
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
fileUtils.UnzipTarGzip(transferFile.getAbsolutePath(), fileUnzipPath);
|
||||
logger.info("Unzip " + transferFile.getName());
|
||||
} catch (Exception e) {
|
||||
logger.error("Unzip error!");
|
||||
}
|
||||
}
|
||||
fileUtils.moveFileToBackupFolder(transferFile, keepBackupFile);
|
||||
}
|
||||
fileUtils.moveFileToBackupFolder(transferFile, keepBackupFile);
|
||||
}
|
||||
// Runnable upload2Ceph = () -> fileUtils.uploadToCeph(fileUnzipPath);
|
||||
// pool.execute(upload2Ceph);
|
||||
}
|
||||
} else if (dlResult == 0) {
|
||||
logger.error("File " + fileName + " download failure");
|
||||
|
||||
@ -12,12 +12,14 @@ import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.boot.json.JacksonJsonParser;
|
||||
import org.springframework.cloud.stream.annotation.EnableBinding;
|
||||
import org.springframework.cloud.stream.annotation.StreamListener;
|
||||
import org.springframework.messaging.support.MessageBuilder;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
@ -30,6 +32,7 @@ import java.util.Map;
|
||||
|
||||
@Component
|
||||
@EnableBinding(ProtobufBinding.class)
|
||||
@ConditionalOnProperty(name = "switch.enable-protobuf-service", havingValue = "true", matchIfMissing = true)
|
||||
public class ProtobufService extends StreamService {
|
||||
|
||||
@Autowired
|
||||
@ -41,9 +44,6 @@ public class ProtobufService extends StreamService {
|
||||
@Value("${custom.proto_save_path}")
|
||||
String protoSavePath;
|
||||
|
||||
@Value("${custom.transfer_backup_path}")
|
||||
String transferBackupPath;
|
||||
|
||||
@Value("${custom.keep_backup_file}")
|
||||
String keepBackupFile;
|
||||
|
||||
@ -55,7 +55,7 @@ public class ProtobufService extends StreamService {
|
||||
|
||||
private final Logger logger = LogManager.getLogger(ProtobufService.class.getName());
|
||||
|
||||
@Autowired
|
||||
@Resource
|
||||
private ProtobufBinding source;
|
||||
|
||||
@Override
|
||||
@ -169,7 +169,7 @@ public class ProtobufService extends StreamService {
|
||||
}
|
||||
logger.debug("protobuf done");
|
||||
// 转移备份目录的todist文件
|
||||
File transferPath = new File(transferBackupPath);
|
||||
File transferPath = new File(protoSavePath);
|
||||
File[] files = transferPath.listFiles();
|
||||
if (files != null && files.length > 0) {
|
||||
for (File transferFile : files) {
|
||||
|
||||
@ -1,31 +1,25 @@
|
||||
package com.jsc.dsp.service;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.protobuf.Descriptors.FieldDescriptor;
|
||||
import com.jsc.dsp.binding.StorageBinding;
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
import com.jsc.dsp.proto.EsOuterClass.Es;
|
||||
import com.jsc.dsp.proto.EsOuterClass.EsSets;
|
||||
import com.jsc.dsp.utils.DBUtils;
|
||||
import com.jsc.dsp.utils.EsUtils;
|
||||
import com.jsc.dsp.utils.FileUtils;
|
||||
import com.jsc.dsp.utils.DatabaseConnector;
|
||||
import com.jsc.dsp.utils.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.elasticsearch.action.bulk.BulkRequest;
|
||||
import org.elasticsearch.action.index.IndexRequest;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.boot.json.JacksonJsonParser;
|
||||
import org.springframework.cloud.stream.annotation.EnableBinding;
|
||||
import org.springframework.cloud.stream.annotation.StreamListener;
|
||||
import org.springframework.messaging.support.MessageBuilder;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.lang.reflect.Field;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
@ -33,6 +27,7 @@ import java.util.Base64.Decoder;
|
||||
|
||||
@Component
|
||||
@EnableBinding(StorageBinding.class)
|
||||
@ConditionalOnProperty(name = "switch.enable-storage-service", havingValue = "true", matchIfMissing = true)
|
||||
public class StorageService extends StreamService {
|
||||
|
||||
@Autowired
|
||||
@ -44,42 +39,24 @@ public class StorageService extends StreamService {
|
||||
@Autowired
|
||||
JacksonJsonParser jsonParser;
|
||||
|
||||
@Value("${es.ip}")
|
||||
String esIp;
|
||||
|
||||
@Value("${es.port}")
|
||||
Integer esPort;
|
||||
|
||||
@Value("${es.username}")
|
||||
String esUsername;
|
||||
|
||||
@Value("${es.password}")
|
||||
String esPassword;
|
||||
|
||||
@Value("${es.index}")
|
||||
String esIndex;
|
||||
|
||||
@Value("${custom.dev-mode}")
|
||||
boolean devMode;
|
||||
|
||||
@Value("${custom.local-file-storage-path}")
|
||||
String localFileStoragePath;
|
||||
|
||||
@Value("${db.driver}")
|
||||
String dbDriver;
|
||||
@Value("${custom.websiteWhiteList}")
|
||||
String websiteWhiteListString;
|
||||
|
||||
@Value("${db.url}")
|
||||
String dbUrl;
|
||||
|
||||
@Value("${db.user}")
|
||||
String dbUser;
|
||||
|
||||
@Value("${db.password}")
|
||||
String dbPassword;
|
||||
@Value("${custom.twitterWhiteList}")
|
||||
String twitterWhiteListString;
|
||||
|
||||
@Resource
|
||||
DatabaseConnector databaseConnector;
|
||||
|
||||
private final Logger logger = LogManager.getLogger(StorageService.class.getName());
|
||||
|
||||
|
||||
@Override
|
||||
public void sendMessage(byte[] msg) {
|
||||
source.StorageOutput().send(MessageBuilder.withPayload(msg).build());
|
||||
@ -91,8 +68,10 @@ public class StorageService extends StreamService {
|
||||
@Override
|
||||
@StreamListener(StorageBinding.STORAGE_PIPELINE_IN)
|
||||
public void receiveMessage(Object payload) {
|
||||
List<String> websiteWhiteList = Arrays.asList(websiteWhiteListString.split(";"));
|
||||
List<String> twitterWhiteList = Arrays.asList(twitterWhiteListString.split(";"));
|
||||
|
||||
String tempString;
|
||||
ObjectMapper objectMapper = new ObjectMapper();
|
||||
try {
|
||||
tempString = new String(base64.decode(payload.toString()), StandardCharsets.UTF_8);
|
||||
Map<String, Object> data = jsonParser.parseMap(tempString);
|
||||
@ -101,7 +80,6 @@ public class StorageService extends StreamService {
|
||||
if ("public_info_data_".equals(protoName)) {
|
||||
EsSets.Builder esSetsBuilder = EsSets.newBuilder();
|
||||
EsSets esSets = EsSets.parseFrom(data.get("content").toString().getBytes(StandardCharsets.ISO_8859_1));
|
||||
List<Object> localStorageItems = new ArrayList<>();
|
||||
List<Indeximos> dbStorageItems = new ArrayList<>();
|
||||
BulkRequest bulkRequest = new BulkRequest();
|
||||
bulkRequest.timeout("5s");
|
||||
@ -111,7 +89,7 @@ public class StorageService extends StreamService {
|
||||
Map<FieldDescriptor, Object> fieldsMap = es.getAllFields();
|
||||
Indeximos indeximos = new Indeximos();
|
||||
for (FieldDescriptor key : fieldsMap.keySet()) {
|
||||
boolean hasField = DBUtils.hasField(Indeximos.class, key.getName());
|
||||
boolean hasField = databaseConnector.hasField(Indeximos.class, key.getName());
|
||||
if (!hasField) {
|
||||
continue;
|
||||
}
|
||||
@ -130,7 +108,7 @@ public class StorageService extends StreamService {
|
||||
} else {
|
||||
Field field = indeximos.getClass().getDeclaredField(key.getName());
|
||||
field.setAccessible(true);
|
||||
String fieldType = DBUtils.getFieldType(Indeximos.class, key.getName());
|
||||
String fieldType = databaseConnector.getFieldType(Indeximos.class, key.getName());
|
||||
if (fieldType.contains("Float")) {
|
||||
field.set(indeximos, Float.valueOf(value));
|
||||
} else {
|
||||
@ -138,68 +116,97 @@ public class StorageService extends StreamService {
|
||||
}
|
||||
}
|
||||
}
|
||||
String uuid = UUID.randomUUID().toString().replaceAll("-", "");
|
||||
String es_urlname = indeximos.getEs_urlname();
|
||||
if (!es_urlname.isEmpty()) {
|
||||
// 根据urlname生成固定的UUID,避免重复入库相同的文章
|
||||
UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes());
|
||||
uuid = _uuid.toString().replaceAll("-", "");
|
||||
}
|
||||
indeximos.setEs_sid(uuid);
|
||||
indeximos.setEs_links(indeximos.getEs_links());
|
||||
indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis()));
|
||||
builder.setEsSid(uuid);
|
||||
for (Field f : indeximos.getClass().getDeclaredFields()) {
|
||||
f.setAccessible(true);
|
||||
//判断字段是否为空,并且对象属性中的基本都会转为对象类型来判断
|
||||
if (f.get(indeximos) == null) {
|
||||
String fieldType = DBUtils.getFieldType(Indeximos.class, f.getName());
|
||||
if (fieldType.contains("Float")) {
|
||||
f.set(indeximos, 0.0f);
|
||||
} else {
|
||||
if (!dateFields.contains(f.getName())) {
|
||||
f.set(indeximos, "");
|
||||
// 只导出白名单站点的数据
|
||||
if (websiteWhiteList.contains(indeximos.getEs_sitename())) {
|
||||
logger.info("开始处理站点【" + indeximos.getEs_sitename() + "】的数据入库流程");
|
||||
String uuid = UUID.randomUUID().toString().replaceAll("-", "");
|
||||
String es_urlname = indeximos.getEs_urlname();
|
||||
if (!es_urlname.isEmpty()) {
|
||||
// 根据urlname生成固定的UUID,避免重复入库相同的文章
|
||||
UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes());
|
||||
uuid = _uuid.toString().replaceAll("-", "");
|
||||
}
|
||||
indeximos.setEs_urltitle(indeximos.getEs_urltitle().trim());
|
||||
indeximos.setEs_sid(uuid);
|
||||
indeximos.setEs_links(indeximos.getEs_links());
|
||||
indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis()));
|
||||
builder.setEsSid(uuid);
|
||||
for (Field f : indeximos.getClass().getDeclaredFields()) {
|
||||
f.setAccessible(true);
|
||||
//判断字段是否为空,并且对象属性中的基本都会转为对象类型来判断
|
||||
if (f.get(indeximos) == null) {
|
||||
String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName());
|
||||
if (fieldType.contains("Float")) {
|
||||
f.set(indeximos, 0.0f);
|
||||
} else {
|
||||
if (!dateFields.contains(f.getName())) {
|
||||
f.set(indeximos, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
IndexRequest indexRequest = new IndexRequest(esIndex);
|
||||
indexRequest.id(indeximos.getEs_sid());
|
||||
indexRequest.source(objectMapper.writeValueAsString(indeximos), XContentType.JSON);
|
||||
bulkRequest.add(indexRequest);
|
||||
Es es_temp = builder.build();
|
||||
esSetsBuilder.addEs(es_temp);
|
||||
List<String> localizedOption = JSON.parseArray(indeximos.getEs_urltopic(), String.class);
|
||||
if (indeximos.getEs_carriertype().equals("wechat")) {
|
||||
dbStorageItems.add(indeximos);
|
||||
}
|
||||
if (localizedOption != null && localizedOption.size() > 0) {
|
||||
//本地存储用
|
||||
if (localizedOption.contains("json")) {
|
||||
localStorageItems.add(indeximos);
|
||||
if (indeximos.getEs_carriertype().equals("media") && twitterWhiteList.contains(indeximos.getEs_authors())) {
|
||||
logger.info("开始处理推特用户【" + indeximos.getEs_authors() + "】的数据入库流程");
|
||||
String uuid = UUID.randomUUID().toString().replaceAll("-", "");
|
||||
String es_urlname = indeximos.getEs_urlname();
|
||||
if (!es_urlname.isEmpty()) {
|
||||
// 根据urlname生成固定的UUID,避免重复入库相同的文章
|
||||
UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes());
|
||||
uuid = _uuid.toString().replaceAll("-", "");
|
||||
}
|
||||
//入库MySQL
|
||||
if (localizedOption.contains("mysql")) {
|
||||
dbStorageItems.add(indeximos);
|
||||
indeximos.setEs_sid(uuid);
|
||||
indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis()));
|
||||
builder.setEsSid(uuid);
|
||||
for (Field f : indeximos.getClass().getDeclaredFields()) {
|
||||
f.setAccessible(true);
|
||||
//判断字段是否为空,并且对象属性中的基本都会转为对象类型来判断
|
||||
if (f.get(indeximos) == null) {
|
||||
String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName());
|
||||
if (fieldType.contains("Float")) {
|
||||
f.set(indeximos, 0.0f);
|
||||
} else {
|
||||
if (!dateFields.contains(f.getName())) {
|
||||
f.set(indeximos, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dbStorageItems.add(indeximos);
|
||||
}
|
||||
if (indeximos.getEs_carriertype().equals("hot_search")) {
|
||||
logger.info("开始处理热搜【" + indeximos.getEs_sitename() + "】的数据入库流程");
|
||||
String uuid = UUID.randomUUID().toString().replaceAll("-", "");
|
||||
String es_urlname = indeximos.getEs_urlname();
|
||||
if (!es_urlname.isEmpty()) {
|
||||
// 根据urlname生成固定的UUID,避免重复入库相同的文章
|
||||
UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes());
|
||||
uuid = _uuid.toString().replaceAll("-", "");
|
||||
}
|
||||
indeximos.setEs_sid(uuid);
|
||||
indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis()));
|
||||
builder.setEsSid(uuid);
|
||||
for (Field f : indeximos.getClass().getDeclaredFields()) {
|
||||
f.setAccessible(true);
|
||||
//判断字段是否为空,并且对象属性中的基本都会转为对象类型来判断
|
||||
if (f.get(indeximos) == null) {
|
||||
String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName());
|
||||
if (fieldType.contains("Float")) {
|
||||
f.set(indeximos, 0.0f);
|
||||
} else {
|
||||
if (!dateFields.contains(f.getName())) {
|
||||
f.set(indeximos, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dbStorageItems.add(indeximos);
|
||||
}
|
||||
}
|
||||
EsUtils.EsSaveBulkRequest(esIp, esPort, esUsername, esPassword, bulkRequest);
|
||||
if (localStorageItems.size() > 0) {
|
||||
String entityItemsString = JSON.toJSONString(localStorageItems);
|
||||
String entityFileFullPath = localFileStoragePath + esIndex + "_" + System.currentTimeMillis() + ".json";
|
||||
if (FileUtils.saveStringToFile(entityItemsString, entityFileFullPath)) {
|
||||
logger.info("Local file store to " + entityFileFullPath);
|
||||
} else {
|
||||
logger.error("Local file store error!");
|
||||
}
|
||||
}
|
||||
|
||||
if (dbStorageItems.size() > 0) {
|
||||
if (DBUtils.insertIntoDB(dbDriver, dbUrl, dbUser, dbPassword, dbStorageItems)) {
|
||||
logger.info("Store to MySQL Database");
|
||||
} else {
|
||||
logger.error("MySQL Database Storage error!");
|
||||
}
|
||||
databaseConnector.insertIntoDB(dbStorageItems);
|
||||
}
|
||||
data.put("content", new String(esSetsBuilder.build().toByteArray(), StandardCharsets.ISO_8859_1));
|
||||
}
|
||||
|
||||
36
dsp/src/main/java/com/jsc/dsp/task/AutoUpload.java
Normal file
36
dsp/src/main/java/com/jsc/dsp/task/AutoUpload.java
Normal file
@ -0,0 +1,36 @@
|
||||
package com.jsc.dsp.task;
|
||||
|
||||
import com.jsc.dsp.utils.ExportAndUploadUtils;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
@Component
|
||||
@ConditionalOnProperty(name = "switch.auto-export-and-upload", havingValue = "true", matchIfMissing = true)
|
||||
public class AutoUpload {
|
||||
|
||||
@Resource
|
||||
ExportAndUploadUtils exportAndUploadUtils;
|
||||
|
||||
@Value("${custom.ftpUploadPath}")
|
||||
String ftpUploadPath;
|
||||
|
||||
@Scheduled(cron = "${custom.exportNewsTaskSchedule}")
|
||||
public void exportNewsDataAndUpload() {
|
||||
exportAndUploadUtils.exportNewsDataAndUpload();
|
||||
}
|
||||
|
||||
@Scheduled(cron = "${custom.exportTwitterTaskSchedule}")
|
||||
public void exportTwitterDataAndUpload() {
|
||||
exportAndUploadUtils.exportTwitterDataAndUpload();
|
||||
}
|
||||
|
||||
@Scheduled(cron = "${custom.exportHotSearchTaskSchedule}")
|
||||
public void exportHotSearchAndUpload() {
|
||||
exportAndUploadUtils.exportHotSearchAndUpload();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,135 +0,0 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.jsc.dsp.model.SearchAggregation;
|
||||
import com.jsc.dsp.model.TargetSocial;
|
||||
import com.jsc.dsp.model.TargetWebsite;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import static com.jsc.dsp.utils.EsUtils.performAggregationSearch;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.kafka.support.LogIfLevelEnabled;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class AutoPatroller {
|
||||
|
||||
private final Logger logger = Logger.getLogger(this.getClass().getName());
|
||||
|
||||
long updateInterval = 1500L;
|
||||
|
||||
@Value("${custom.websiteQueryAPI}")
|
||||
String websiteQueryAPI;
|
||||
|
||||
@Value("${custom.websiteUpdateAPI}")
|
||||
String websiteUpdateAPI;
|
||||
|
||||
@Value("${custom.socialQueryAPI}")
|
||||
String socialQueryAPI;
|
||||
|
||||
@Value("${custom.socialUpdateAPI}")
|
||||
String socialUpdateAPI;
|
||||
|
||||
@Value("${es.ip}")
|
||||
String esIp;
|
||||
|
||||
@Value("${es.port}")
|
||||
Integer esPort;
|
||||
|
||||
@Value("${es.username}")
|
||||
String esUsername;
|
||||
|
||||
@Value("${es.password}")
|
||||
String esPassword;
|
||||
|
||||
@Scheduled(cron = "0 45 0/3 * * *")
|
||||
public void checkNewsSite() {
|
||||
checkWebsite("es_sitename", "es_carriertype", "news");
|
||||
}
|
||||
|
||||
@Scheduled(cron = "0 15 1/3 * * *")
|
||||
public void checkWechat() {
|
||||
checkSocial("es_authors", "es_carriertype", "wechat", "5");
|
||||
}
|
||||
|
||||
@Scheduled(cron = "0 0 2/4 * * *")
|
||||
public void checkArticleSite() {
|
||||
checkWebsite("es_sitename", "es_carriertype", "article");
|
||||
}
|
||||
|
||||
public void checkWebsite(String aggFieldName, String queryFieldName, String queryFieldValue) {
|
||||
try {
|
||||
Map<String, SearchAggregation> searchAggregationMap = performAggregationSearch(
|
||||
esIp, esPort, esUsername, esPassword, aggFieldName, queryFieldName, queryFieldValue);
|
||||
JSONObject dataObject = new JSONObject();
|
||||
dataObject.put("carrierType", queryFieldValue);
|
||||
String rsp = HttpUtils.post(websiteQueryAPI, dataObject);
|
||||
JSONObject rspObj = JSON.parseObject(rsp);
|
||||
if (rspObj.getIntValue("code") == 200) {
|
||||
JSONArray rspArr = rspObj.getJSONArray("content");
|
||||
for (Object obj : rspArr) {
|
||||
TargetWebsite targetWebsite = JSONObject.parseObject(obj.toString(), TargetWebsite.class);
|
||||
String siteName = targetWebsite.getSiteName();
|
||||
if (searchAggregationMap.containsKey(siteName)) {
|
||||
SearchAggregation checkInfo = searchAggregationMap.get(siteName);
|
||||
targetWebsite.setCheckTotalNum(checkInfo.getCount());
|
||||
targetWebsite.setCheckLastTime(checkInfo.getLastTime());
|
||||
targetWebsite.setCheckUpdateTime(new Date());
|
||||
String updateRsp = HttpUtils.post(websiteUpdateAPI, targetWebsite);
|
||||
JSONObject updateRspObj = JSONObject.parseObject(updateRsp);
|
||||
if (updateRspObj.getIntValue("code") != 200) {
|
||||
logger.warning("更新站点【" + siteName + "】巡检信息失败");
|
||||
}
|
||||
Thread.sleep(updateInterval);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
logger.info("站点巡检完毕");
|
||||
}
|
||||
|
||||
public void checkSocial(String aggFieldName, String queryFieldName, String queryFieldValue, String socialTypeCode) {
|
||||
try {
|
||||
Map<String, SearchAggregation> searchAggregationMap = performAggregationSearch(
|
||||
esIp, esPort, esUsername, esPassword, aggFieldName, queryFieldName, queryFieldValue);
|
||||
TargetSocial postData = new TargetSocial();
|
||||
postData.setUserFlag("0");
|
||||
postData.setUserType(socialTypeCode);
|
||||
String rsp = HttpUtils.post(socialQueryAPI, postData);
|
||||
JSONObject rspObj = JSON.parseObject(rsp);
|
||||
if (rspObj.getIntValue("code") == 200) {
|
||||
JSONArray rspArr = rspObj.getJSONArray("content");
|
||||
for (Object obj : rspArr) {
|
||||
TargetSocial targetSocial = JSONObject.parseObject(obj.toString(), TargetSocial.class);
|
||||
String userName = targetSocial.getUserName();
|
||||
if (searchAggregationMap.containsKey(userName)) {
|
||||
SearchAggregation checkInfo = searchAggregationMap.get(userName);
|
||||
targetSocial.setCheckTotalNum(checkInfo.getCount());
|
||||
targetSocial.setCheckLastTime(checkInfo.getLastTime());
|
||||
targetSocial.setCheckUpdateTime(new Date());
|
||||
String updateRsp = HttpUtils.post(socialUpdateAPI, targetSocial);
|
||||
JSONObject updateRspObj = JSONObject.parseObject(updateRsp);
|
||||
if (updateRspObj.getIntValue("code") != 200) {
|
||||
logger.warning("更新账号【" + userName + "】巡检信息失败");
|
||||
}
|
||||
Thread.sleep(updateInterval);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
logger.info("社交帐号巡检完毕");
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,131 +0,0 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
|
||||
import java.io.File;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
|
||||
public class DBUtils {
|
||||
|
||||
public static Connection conn = null;
|
||||
|
||||
private static final List<String> floatFields = Arrays.asList("es_doclength", "es_negativeProbability", "es_simrank");
|
||||
|
||||
private static final Logger logger = Logger.getLogger("com.jsc.dsp.utils.DBUtils");
|
||||
|
||||
public static Connection getConnection(String driver, String url, String user, String password) {
|
||||
try {
|
||||
Class.forName(driver);
|
||||
return DriverManager.getConnection(url, user, password);
|
||||
} catch (ClassNotFoundException | SQLException e) {
|
||||
logger.warning("Cannot get DB connection!");
|
||||
logger.warning(e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, Object> getObjectMap(Indeximos object) {
|
||||
Map<String, Object> resultMap = new HashMap<>();
|
||||
Field[] fields = object.getClass().getDeclaredFields();
|
||||
for (Field field : fields) {
|
||||
String fieldName = field.getName();
|
||||
String firstLetter = fieldName.substring(0, 1).toUpperCase();
|
||||
String getter = "get" + firstLetter + fieldName.substring(1);
|
||||
try {
|
||||
Method method = object.getClass().getMethod(getter);
|
||||
Object fieldValue = method.invoke(object);
|
||||
resultMap.put(fieldName, fieldValue);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return resultMap;
|
||||
}
|
||||
|
||||
public static boolean insertIntoDB(String driver, String url, String user, String password, List<Indeximos> objectList) {
|
||||
if (conn == null) {
|
||||
conn = getConnection(driver, url, user, password);
|
||||
}
|
||||
if (conn != null) {
|
||||
try {
|
||||
PreparedStatement pres = null;
|
||||
for (Indeximos object : objectList) {
|
||||
Map<String, Object> objectMap = getObjectMap(object);
|
||||
Object[] keyObjects = objectMap.keySet().toArray();
|
||||
List<String> keys = new ArrayList<>();
|
||||
List<String> values = new ArrayList<>();
|
||||
for (Object ko : keyObjects) {
|
||||
String key = ko.toString();
|
||||
keys.add(key);
|
||||
Object value = objectMap.get(key);
|
||||
if (floatFields.contains(key)) {
|
||||
values.add(value.toString());
|
||||
} else {
|
||||
if (value != null && value.toString().length() > 0) {
|
||||
values.add("'" + value.toString().replace("'", "\\'") + "'");
|
||||
} else {
|
||||
values.add("null");
|
||||
}
|
||||
}
|
||||
}
|
||||
String sqlInsert = "REPLACE INTO indeximos(" + String.join(", ", keys) + ") VALUES("
|
||||
+ String.join(", ", values) + ")";
|
||||
pres = conn.prepareStatement(sqlInsert);
|
||||
pres.addBatch();
|
||||
}
|
||||
if (pres != null) {
|
||||
pres.executeBatch();
|
||||
pres.close();
|
||||
}
|
||||
return true;
|
||||
} catch (SQLException e) {
|
||||
logger.warning("Fail to insert data to Database");
|
||||
logger.warning(e.getMessage());
|
||||
conn = getConnection(driver, url, user, password);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean hasField(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
clazz.getDeclaredField(fieldName);
|
||||
return true;
|
||||
} catch (NoSuchFieldException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static String getFieldType(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
Field field = clazz.getDeclaredField(fieldName);
|
||||
return field.getType().getName();
|
||||
} catch (NoSuchFieldException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
List<Indeximos> objectList = JSONArray.parseArray(FileUtils.readContentFromFile(
|
||||
"D:/data/local-storage/indeximos_1700030748332.json"), Indeximos.class);
|
||||
insertIntoDB(
|
||||
"com.mysql.cj.jdbc.Driver",
|
||||
"jdbc:mysql://8.130.95.27:28089/dsp",
|
||||
"root",
|
||||
"passok123A",
|
||||
objectList);
|
||||
}
|
||||
|
||||
}
|
||||
590
dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java
Normal file
590
dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java
Normal file
@ -0,0 +1,590 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jsc.dsp.dao.EsDataHotSearchRepository;
|
||||
import com.jsc.dsp.dao.EsDataNewsRepository;
|
||||
import com.jsc.dsp.dao.EsDataTwitterRepository;
|
||||
import com.jsc.dsp.dao.IndeximosRepository;
|
||||
import com.jsc.dsp.model.EsDataHotSearchView;
|
||||
import com.jsc.dsp.model.EsDataNewsView;
|
||||
import com.jsc.dsp.model.EsDataTwitterView;
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
import org.apache.poi.ss.usermodel.*;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Field;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
@Service
|
||||
public class DatabaseConnector {
|
||||
|
||||
@Resource
|
||||
IndeximosRepository indeximosRepository;
|
||||
|
||||
@Resource
|
||||
EsDataNewsRepository esDataNewsRepository;
|
||||
|
||||
@Resource
|
||||
EsDataTwitterRepository esDataTwitterRepository;
|
||||
|
||||
@Resource
|
||||
EsDataHotSearchRepository esDataHotSearchRepository;
|
||||
|
||||
@Value("${custom.newsExcelOutputPath}")
|
||||
String newsExcelOutputPath;
|
||||
|
||||
@Value("${custom.twitterExcelOutputPath}")
|
||||
String twitterExcelOutputPath;
|
||||
|
||||
@Value("${custom.hotSearchExcelOutputPath}")
|
||||
String hotSearchExcelOutputPath;
|
||||
|
||||
private static final ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());
|
||||
|
||||
public void insertIntoDB(List<Indeximos> objectList) {
|
||||
try {
|
||||
indeximosRepository.saveAll(objectList);
|
||||
} catch (Exception e) {
|
||||
logger.warn("Fail to insert data to Database");
|
||||
logger.warn(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean hasField(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
clazz.getDeclaredField(fieldName);
|
||||
return true;
|
||||
} catch (NoSuchFieldException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public String getFieldType(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
Field field = clazz.getDeclaredField(fieldName);
|
||||
return field.getType().getName();
|
||||
} catch (NoSuchFieldException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public void exportToXlsx(String startTime) {
|
||||
try {
|
||||
Path dirPath = Paths.get(newsExcelOutputPath);
|
||||
if (!Files.exists(dirPath)) {
|
||||
Files.createDirectories(dirPath);
|
||||
}
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
|
||||
String fileName = "data_news-" + timestamp + "-001.xlsx";
|
||||
Path filePath = dirPath.resolve(fileName);
|
||||
|
||||
List<EsDataNewsView> esDataNewsViewList = esDataNewsRepository.findAllByEsLoadtimeAfter(startTime);
|
||||
if (!esDataNewsViewList.isEmpty()) {
|
||||
Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields(); // 通过反射获取类的成员信息,并使用这些类成员为后续生成的excel表头做准备
|
||||
try (Workbook workbook = new XSSFWorkbook();
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
|
||||
Sheet sheet = workbook.createSheet("data");
|
||||
|
||||
// 创建表头
|
||||
Row headerRow = sheet.createRow(0);
|
||||
CellStyle headerStyle = workbook.createCellStyle(); // 创建单元格
|
||||
headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
|
||||
headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
|
||||
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
Cell cell = headerRow.createCell(i);
|
||||
String formField = formField(fields[i]);
|
||||
cell.setCellValue(formField);
|
||||
cell.setCellStyle(headerStyle);
|
||||
}
|
||||
// 填充数据
|
||||
int rowNum = 1;
|
||||
for (EsDataNewsView item : esDataNewsViewList) {
|
||||
if (item.getFile() == null || item.getFile().length() < 5) {
|
||||
continue;
|
||||
} else {
|
||||
String fileFullPath = item.getFile();
|
||||
int i = fileFullPath.indexOf(File.separator);
|
||||
item.setFile(fileFullPath.substring(i + 1));
|
||||
}
|
||||
Row row = sheet.createRow(rowNum++);
|
||||
logger.debug("导出excel第" + rowNum + "行");
|
||||
row.createCell(0).setCellValue(item.getEsSid());
|
||||
row.createCell(1).setCellValue(item.getEsAuthors());
|
||||
row.createCell(2).setCellValue(item.getEsCarriertype());
|
||||
row.createCell(3).setCellValue(item.getEsCatalog());
|
||||
row.createCell(4).setCellValue(item.getEsCollection());
|
||||
row.createCell(5).setCellValue(item.getEsDoclength());
|
||||
row.createCell(6).setCellValue(item.getEsLang());
|
||||
row.createCell(7).setCellValue(item.getEsLasttime());
|
||||
if (item.getEsLinks().length() > 10000) {
|
||||
row.createCell(8).setCellValue(item.getEsLinks().substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(8).setCellValue(item.getEsLinks());
|
||||
}
|
||||
row.createCell(9).setCellValue(item.getEsLoadtime());
|
||||
row.createCell(10).setCellValue(item.getEsSitename());
|
||||
row.createCell(11).setCellValue(item.getEsSrcname());
|
||||
if (item.getEsUrlcontent().length() > 30000) {
|
||||
row.createCell(12).setCellValue(item.getEsUrlcontent().substring(0, 30000));
|
||||
} else {
|
||||
row.createCell(12).setCellValue(item.getEsUrlcontent());
|
||||
}
|
||||
if (item.getEsUrlcontentTranslate().length() > 30000) {
|
||||
row.createCell(13).setCellValue(item.getEsUrlcontentTranslate().substring(0, 30000));
|
||||
} else {
|
||||
row.createCell(13).setCellValue(item.getEsUrlcontentTranslate());
|
||||
}
|
||||
row.createCell(14).setCellValue(item.getEsUrlimage());
|
||||
row.createCell(15).setCellValue(item.getEsUrlname());
|
||||
row.createCell(16).setCellValue(item.getEsUrltime());
|
||||
row.createCell(17).setCellValue(item.getEsUrltitle());
|
||||
row.createCell(18).setCellValue(item.getEsUrltitleTranslate());
|
||||
row.createCell(19).setCellValue(item.getEsAbstract());
|
||||
row.createCell(20).setCellValue(item.getEsKeywords());
|
||||
row.createCell(21).setCellValue(item.getFile());
|
||||
row.createCell(22).setCellValue(item.getEsHkey());
|
||||
row.createCell(23).setCellValue(item.getEsUrltopic());
|
||||
}
|
||||
logger.info("完成excel数据写入,共" + rowNum + "行");
|
||||
|
||||
// 自动调整列宽
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
sheet.autoSizeColumn(i);
|
||||
}
|
||||
|
||||
workbook.write(out);
|
||||
|
||||
try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
|
||||
workbook.write(fos);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
logger.info("excel导出完成!");
|
||||
} else logger.info("获取数据为空,excel未导出");
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void twitterToXlsx(String startTime) {
|
||||
try {
|
||||
Path dirPath = Paths.get(twitterExcelOutputPath);
|
||||
if (!Files.exists(dirPath)) {
|
||||
Files.createDirectories(dirPath);
|
||||
}
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
|
||||
String fileName = "data_twitter-" + timestamp + "-001.xlsx";
|
||||
Path filePath = dirPath.resolve(fileName);
|
||||
|
||||
List<EsDataTwitterView> esDataNewsViewList = esDataTwitterRepository.findAllByEsLoadtimeAfter(startTime);
|
||||
if (!esDataNewsViewList.isEmpty()) {
|
||||
Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields(); // 通过反射获取类的成员信息,并使用这些类成员为后续生成的excel表头做准备
|
||||
try (Workbook workbook = new XSSFWorkbook();
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
|
||||
Sheet sheet = workbook.createSheet("data");
|
||||
|
||||
// 创建表头
|
||||
Row headerRow = sheet.createRow(0);
|
||||
CellStyle headerStyle = workbook.createCellStyle(); // 创建单元格
|
||||
headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
|
||||
headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
|
||||
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
Cell cell = headerRow.createCell(i);
|
||||
String formField = formField(fields[i]);
|
||||
cell.setCellValue(formField);
|
||||
cell.setCellStyle(headerStyle);
|
||||
}
|
||||
// 填充数据
|
||||
int rowNum = 1;
|
||||
for (EsDataTwitterView item : esDataNewsViewList) {
|
||||
Row row = sheet.createRow(rowNum++);
|
||||
logger.debug("导出excel第" + rowNum + "行");
|
||||
// 0: esUrltime
|
||||
row.createCell(0).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : "");
|
||||
|
||||
// 1: esAuthors
|
||||
row.createCell(1).setCellValue(item.getEsAuthors() != null ? item.getEsAuthors() : "");
|
||||
|
||||
// 2: esCarriertype
|
||||
row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : "");
|
||||
|
||||
// 3: esSitename
|
||||
row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : "");
|
||||
|
||||
// 4: esUrlcontent
|
||||
String esUrlcontent = item.getEsUrlcontent();
|
||||
if (esUrlcontent != null && esUrlcontent.length() > 10000) {
|
||||
row.createCell(4).setCellValue(esUrlcontent.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(4).setCellValue(esUrlcontent != null ? esUrlcontent : "");
|
||||
}
|
||||
|
||||
// 5: esUrlcontentTranslate
|
||||
String esUrlcontentTranslate = item.getEsUrlcontentTranslate();
|
||||
if (esUrlcontentTranslate != null && esUrlcontentTranslate.length() > 10000) {
|
||||
row.createCell(5).setCellValue(esUrlcontentTranslate.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(5).setCellValue(esUrlcontentTranslate != null ? esUrlcontentTranslate : "");
|
||||
}
|
||||
|
||||
// 6: esUrlname
|
||||
row.createCell(6).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : "");
|
||||
|
||||
// 7: esUrltitle
|
||||
String esUrltitle = item.getEsUrltitle();
|
||||
if (esUrltitle != null && esUrltitle.length() > 10000) {
|
||||
row.createCell(7).setCellValue(esUrltitle.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(7).setCellValue(esUrltitle != null ? esUrltitle : "");
|
||||
}
|
||||
|
||||
// 8: es_urltitle_translate
|
||||
String esUrltitleTranslate = item.getEsUrltitleTranslate();
|
||||
if (esUrltitleTranslate != null && esUrltitleTranslate.length() > 10000) {
|
||||
row.createCell(8).setCellValue(esUrltitleTranslate.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(8).setCellValue(esUrltitleTranslate != null ? esUrltitleTranslate : "");
|
||||
}
|
||||
|
||||
// 9: esVideo
|
||||
String videoFilenames = extractFilenamesFromJsonArray(item.getEsVideo());
|
||||
row.createCell(9).setCellValue(videoFilenames);
|
||||
|
||||
// 10: esExtname
|
||||
row.createCell(10).setCellValue(item.getEsExtname() != null ? item.getEsExtname() : "");
|
||||
|
||||
// 11: esIsrepost
|
||||
row.createCell(11).setCellValue(item.getEsIsrepost() != null ? item.getEsIsrepost() : "");
|
||||
|
||||
// 12: esCatalog1
|
||||
row.createCell(12).setCellValue(item.getEsCatalog1() != null ? item.getEsCatalog1() : "");
|
||||
|
||||
// 13: esForwardcount
|
||||
row.createCell(13).setCellValue(item.getEsForwardcount() != null ? item.getEsForwardcount() : "");
|
||||
|
||||
// 14: esLikecount
|
||||
row.createCell(14).setCellValue(item.getEsLikecount() != null ? item.getEsLikecount() : "");
|
||||
|
||||
// 15: esCommentcount
|
||||
row.createCell(15).setCellValue(item.getEsCommentcount() != null ? item.getEsCommentcount() : "");
|
||||
|
||||
// 16: esHkey
|
||||
row.createCell(16).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : "");
|
||||
|
||||
// 17: esUrlimage
|
||||
String imageFilenames = extractFilenamesFromJsonArray(item.getEsUrlimage());
|
||||
row.createCell(17).setCellValue(imageFilenames);
|
||||
|
||||
// 18: esUserid
|
||||
row.createCell(18).setCellValue(item.getEsUserid() != null ? item.getEsUserid() : "");
|
||||
|
||||
// 19: esLoadtime
|
||||
row.createCell(19).setCellValue(item.getEsLoadtime() != null ? item.getEsLoadtime() : "");
|
||||
|
||||
}
|
||||
logger.info("完成excel数据写入,共" + rowNum + "行");
|
||||
|
||||
// 自动调整列宽
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
sheet.autoSizeColumn(i);
|
||||
}
|
||||
|
||||
workbook.write(out);
|
||||
|
||||
try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
|
||||
workbook.write(fos);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
logger.info("excel导出完成!");
|
||||
} else logger.info("获取数据为空,excel未导出");
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private String formField(Field field) {
|
||||
String fieldString = field.getName();
|
||||
return StringUtils.camelToSnake(fieldString);
|
||||
}
|
||||
|
||||
|
||||
public String extractFilenamesFromJsonArray(String jsonStr) {
|
||||
if (jsonStr == null || jsonStr.trim().isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
try {
|
||||
JsonNode array = objectMapper.readTree(jsonStr.replace("'", "\"").trim());
|
||||
if (!array.isArray()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
List<String> filenames = new ArrayList<>();
|
||||
for (JsonNode node : array) {
|
||||
if (node.has("path")) {
|
||||
String url = node.get("path").asText().trim();
|
||||
if (!url.isEmpty()) {
|
||||
// 提取文件名(支持带参数的 URL)
|
||||
String filename = url.split("\\?")[0]; // 去掉 ? 后的参数
|
||||
filename = filename.substring(filename.lastIndexOf('/') + 1);
|
||||
if (!filename.isEmpty()) {
|
||||
filenames.add(filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return String.join(",", filenames);
|
||||
} catch (Exception e) {
|
||||
// 如果解析失败,返回空或原始内容(根据需求)
|
||||
return ""; // 或者 return jsonStr; 用于调试
|
||||
}
|
||||
}
|
||||
|
||||
public void hotSearchToXlsx(String startTime) {
|
||||
try {
|
||||
Path dirPath = Paths.get(hotSearchExcelOutputPath);
|
||||
if (!Files.exists(dirPath)) {
|
||||
Files.createDirectories(dirPath);
|
||||
}
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
|
||||
String fileName = "data_hot_search-" + timestamp + "-001.xlsx";
|
||||
Path filePath = dirPath.resolve(fileName);
|
||||
|
||||
List<EsDataHotSearchView> esDataHotSearchViewList = esDataHotSearchRepository.findAllByEsLoadtimeAfter(startTime);
|
||||
if (!esDataHotSearchViewList.isEmpty()) {
|
||||
Field[] fields = esDataHotSearchViewList.get(0).getClass().getDeclaredFields(); // 通过反射获取类的成员信息,并使用这些类成员为后续生成的excel表头做准备
|
||||
try (Workbook workbook = new XSSFWorkbook();
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
|
||||
Sheet sheet = workbook.createSheet("data");
|
||||
|
||||
// 创建表头
|
||||
Row headerRow = sheet.createRow(0);
|
||||
CellStyle headerStyle = workbook.createCellStyle(); // 创建单元格
|
||||
headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
|
||||
headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
|
||||
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
Cell cell = headerRow.createCell(i);
|
||||
String formField = formField(fields[i]);
|
||||
cell.setCellValue(formField);
|
||||
cell.setCellStyle(headerStyle);
|
||||
}
|
||||
// 填充数据
|
||||
int rowNum = 1;
|
||||
for (EsDataHotSearchView item : esDataHotSearchViewList) {
|
||||
Row row = sheet.createRow(rowNum++);
|
||||
logger.debug("导出excel第" + rowNum + "行");
|
||||
// 0: esSid
|
||||
row.createCell(0).setCellValue(item.getEsSid() != null ? item.getEsSid() : "");
|
||||
// 1: esUrltime
|
||||
row.createCell(1).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : "");
|
||||
|
||||
// 2: esCarriertype
|
||||
row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : "");
|
||||
|
||||
// 3: esSitename
|
||||
row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : "");
|
||||
|
||||
// 4: esSimrank
|
||||
row.createCell(4).setCellValue(item.getEsSimrank() != null ? String.valueOf(Float.valueOf(item.getEsSimrank()).intValue()) : "");
|
||||
|
||||
// 5: esUrltitle
|
||||
String esUrltitle = item.getEsUrltitle();
|
||||
if (esUrltitle != null && esUrltitle.length() > 10000) {
|
||||
row.createCell(5).setCellValue(esUrltitle.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(5).setCellValue(esUrltitle != null ? esUrltitle : "");
|
||||
}
|
||||
|
||||
// 6: esUrlcontent
|
||||
String esUrlcontent = item.getEsUrlcontent();
|
||||
if (esUrlcontent != null && esUrlcontent.length() > 10000) {
|
||||
row.createCell(6).setCellValue(esUrlcontent.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(6).setCellValue(esUrlcontent != null ? esUrlcontent : "");
|
||||
}
|
||||
|
||||
// 7: esUrlname
|
||||
row.createCell(7).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : "");
|
||||
|
||||
// 8: esHkey
|
||||
row.createCell(8).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : "");
|
||||
|
||||
// 9: esLasttime
|
||||
String esLasttime = extractFilenamesFromJsonArray(item.getEsLasttime());
|
||||
row.createCell(9).setCellValue(esLasttime);
|
||||
|
||||
|
||||
// 10: esHeat
|
||||
row.createCell(10).setCellValue(item.getEsHeat() != null ? item.getEsHeat() : "");
|
||||
|
||||
// 1: esLasttime
|
||||
String esLoadtime = extractFilenamesFromJsonArray(item.getEsLoadtime());
|
||||
row.createCell(11).setCellValue(esLoadtime);
|
||||
}
|
||||
logger.info("完成excel数据写入,共" + rowNum + "行");
|
||||
|
||||
// 自动调整列宽
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
sheet.autoSizeColumn(i);
|
||||
}
|
||||
|
||||
workbook.write(out);
|
||||
|
||||
try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
|
||||
workbook.write(fos);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
logger.info("excel导出完成!");
|
||||
} else logger.info("获取数据为空,excel未导出");
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 新闻导出
|
||||
*/
|
||||
public void exportToXlsxTest(String startTime) {
|
||||
try {
|
||||
Path dirPath = Paths.get(hotSearchExcelOutputPath);
|
||||
if (!Files.exists(dirPath)) {
|
||||
Files.createDirectories(dirPath);
|
||||
}
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
|
||||
String fileName = "data_hot_search-" + timestamp + "-001.xlsx";
|
||||
Path filePath = dirPath.resolve(fileName);
|
||||
|
||||
List<EsDataHotSearchView> esDataHotSearchViewList = esDataHotSearchRepository.findAllByEsLoadtimeAfter(startTime);
|
||||
if (!esDataHotSearchViewList.isEmpty()) {
|
||||
Field[] fields = esDataHotSearchViewList.get(0).getClass().getDeclaredFields(); // 通过反射获取类的成员信息,并使用这些类成员为后续生成的excel表头做准备
|
||||
try (Workbook workbook = new XSSFWorkbook();
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
|
||||
Sheet sheet = workbook.createSheet("data");
|
||||
|
||||
// 创建表头
|
||||
Row headerRow = sheet.createRow(0);
|
||||
CellStyle headerStyle = workbook.createCellStyle(); // 创建单元格
|
||||
headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
|
||||
headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
|
||||
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
Cell cell = headerRow.createCell(i);
|
||||
String formField = formField(fields[i]);
|
||||
cell.setCellValue(formField);
|
||||
cell.setCellStyle(headerStyle);
|
||||
}
|
||||
// 填充数据
|
||||
int rowNum = 1;
|
||||
for (EsDataHotSearchView item : esDataHotSearchViewList) {
|
||||
Row row = sheet.createRow(rowNum++);
|
||||
logger.debug("导出excel第" + rowNum + "行");
|
||||
// 0: esSid
|
||||
row.createCell(0).setCellValue(item.getEsSid() != null ? item.getEsSid() : "");
|
||||
// 1: esUrltime
|
||||
row.createCell(1).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : "");
|
||||
|
||||
// 2: esCarriertype
|
||||
row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : "");
|
||||
|
||||
// 3: esSitename
|
||||
row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : "");
|
||||
|
||||
// 4: esSimrank
|
||||
row.createCell(4).setCellValue(item.getEsSimrank() != null ? String.valueOf(Float.valueOf(item.getEsSimrank()).intValue()) : "");
|
||||
|
||||
// 5: esUrltitle
|
||||
String esUrltitle = item.getEsUrltitle();
|
||||
if (esUrltitle != null && esUrltitle.length() > 10000) {
|
||||
row.createCell(5).setCellValue(esUrltitle.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(5).setCellValue(esUrltitle != null ? esUrltitle : "");
|
||||
}
|
||||
|
||||
// 6: esUrlcontent
|
||||
String esUrlcontent = item.getEsUrlcontent();
|
||||
if (esUrlcontent != null && esUrlcontent.length() > 10000) {
|
||||
row.createCell(6).setCellValue(esUrlcontent.substring(0, 10000));
|
||||
} else {
|
||||
row.createCell(6).setCellValue(esUrlcontent != null ? esUrlcontent : "");
|
||||
}
|
||||
|
||||
// 7: esUrlname
|
||||
row.createCell(7).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : "");
|
||||
|
||||
// 8: esHkey
|
||||
row.createCell(8).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : "");
|
||||
|
||||
// 9: esLasttime
|
||||
String esLasttime = extractFilenamesFromJsonArray(item.getEsLasttime());
|
||||
row.createCell(9).setCellValue(esLasttime);
|
||||
|
||||
|
||||
// 10: esHeat
|
||||
row.createCell(10).setCellValue(item.getEsHeat() != null ? item.getEsHeat() : "");
|
||||
|
||||
// 11: esLasttime
|
||||
String esLoadtime = extractFilenamesFromJsonArray(item.getEsLoadtime());
|
||||
row.createCell(11).setCellValue(esLoadtime);
|
||||
}
|
||||
logger.info("完成excel数据写入,共" + rowNum + "行");
|
||||
|
||||
// 自动调整列宽
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
sheet.autoSizeColumn(i);
|
||||
}
|
||||
|
||||
workbook.write(out);
|
||||
|
||||
try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
|
||||
workbook.write(fos);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
logger.info("excel导出完成!");
|
||||
} else logger.info("获取数据为空,excel未导出");
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
425
dsp/src/main/java/com/jsc/dsp/utils/ExportAndUploadUtils.java
Normal file
425
dsp/src/main/java/com/jsc/dsp/utils/ExportAndUploadUtils.java
Normal file
@ -0,0 +1,425 @@
|
||||
package com.jsc.dsp.utils;

import com.jsc.dsp.service.ConfigService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

/**
 * Exports news / twitter / hot-search data to Excel, packs each export
 * directory into a ZIP archive and uploads the archive over SFTP.
 */
@Component
public class ExportAndUploadUtils {

    @Resource
    DatabaseConnector databaseConnector;

    @Resource
    FTPConnector ftpConnector;

    @Resource
    SFTPConnector sftpConnector;

    @Resource
    ConfigService configService;

    private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());

    private static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
    // NOTE(review): SimpleDateFormat is not thread-safe; this shared static
    // instance is only safe while copyPagesFiles() is never called concurrently.
    private static final SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT);

    @Value("${custom.newsExcelOutputPath}")
    String newsExcelOutputPath;

    @Value("${custom.twitterExcelOutputPath}")
    String twitterExcelOutputPath;

    @Value("${custom.hotSearchExcelOutputPath}")
    String hotSearchExcelOutputPath;

    @Value("${custom.backupFilePath}")
    String backupFilePath;

    @Value("${custom.pagesOutputPath}")
    String pagesOutputPath;

    @Value("${custom.ftpUploadPath}")
    String ftpUploadPath;

    /**
     * Scheduled entry point (Mon/Wed/Fri 08:00): exports news data to Excel,
     * copies the matching PDF pages, advances the "last_loadtime" watermark,
     * then zips and uploads the news output directory.
     */
    public void exportNewsDataAndUpload() {
        logger.info("开始导出excel和pdf数据...");
        String lastLoadTime = configService.getConfigValueByName("last_loadtime");
        String currentLoadTime = StringUtils.DateToString(new Date());
        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        databaseConnector.exportToXlsx(lastLoadTime);
        copyPagesFiles(lastLoadTime, currentLoadTime);
        // Advance the watermark only after the export succeeded.
        configService.setConfigValueByName("last_loadtime", currentLoadTime);
        String zipFileName = "data_news-" + timestamp + "-001.zip";
        String zipFileFullName = backupFilePath + File.separator + zipFileName;
        String remoteZipPath = ftpUploadPath + "/" + zipFileName;
        zipAndUploadDirectory(newsExcelOutputPath, zipFileFullName, remoteZipPath);
    }

    /**
     * Exports twitter data to Excel, merges archived videos/images into the
     * output directory, advances the twitter watermark, then zips and uploads.
     */
    public void exportTwitterDataAndUpload() {
        logger.info("开始导出twitter excel数据...");
        String twitterLastLoadTime = configService.getConfigValueByName("twitter_last_loadtime");
        String currentLoadTime = StringUtils.DateToString(new Date());
        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        databaseConnector.twitterToXlsx(twitterLastLoadTime);
        unzipAndMoveVideosImages(twitterLastLoadTime, currentLoadTime);
        configService.setConfigValueByName("twitter_last_loadtime", currentLoadTime);
        String zipFileName = "data_twitter-" + timestamp + "-001.zip";
        String zipFileFullName = backupFilePath + File.separator + zipFileName;
        String remoteZipPath = ftpUploadPath + "/" + zipFileName;
        zipAndUploadDirectory(twitterExcelOutputPath, zipFileFullName, remoteZipPath);
    }

    /**
     * Exports Baidu hot-search data to Excel, advances its watermark,
     * then zips and uploads the hot-search output directory.
     */
    public void exportHotSearchAndUpload() {
        logger.info("开始导出百度热搜 excel数据...");
        String hotSearchLastLoadTime = configService.getConfigValueByName("hot_search_last_loadtime");
        String currentLoadTime = StringUtils.DateToString(new Date());
        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        databaseConnector.hotSearchToXlsx(hotSearchLastLoadTime);
        configService.setConfigValueByName("hot_search_last_loadtime", currentLoadTime);
        String zipFileName = "data_hot_search-" + timestamp + "-001.zip";
        String zipFileFullName = backupFilePath + File.separator + zipFileName;
        String remoteZipPath = ftpUploadPath + "/" + zipFileName;
        zipAndUploadDirectory(hotSearchExcelOutputPath, zipFileFullName, remoteZipPath);
    }

    /**
     * Packs a directory into a ZIP file (stored at a local path) and uploads
     * it to the remote server over SFTP.
     *
     * @param sourceDirPath local directory to pack (e.g. /data/reports)
     * @param localZipPath  local path of the ZIP file (e.g. /backup/archives/reports_20251224.zip)
     * @param remoteZipPath remote target path (e.g. /ftp/backups/reports_20251224.zip)
     */
    public void zipAndUploadDirectory(String sourceDirPath, String localZipPath, String remoteZipPath) {
        Path sourceDir = Paths.get(sourceDirPath);
        if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
            logger.error("源目录不存在或不是一个目录: {}", sourceDirPath);
            return;
        }

        Path localZipFile = Paths.get(localZipPath);
        Path zipParent = localZipFile.getParent();
        if (zipParent != null && !Files.exists(zipParent)) {
            try {
                Files.createDirectories(zipParent);
                logger.debug("创建 ZIP 父目录: {}", zipParent);
            } catch (IOException e) {
                logger.error("无法创建 ZIP 父目录: {}", zipParent, e);
                return;
            }
        }

        // Pack the directory into the local ZIP file.
        try {
            zipDirectory(sourceDir, localZipFile.toFile());
        } catch (IOException e) {
            logger.error("打包目录失败: {}", sourceDirPath, e);
            return;
        }

        // Upload the ZIP file.
        try (InputStream zipInputStream = Files.newInputStream(localZipFile)) {
            boolean uploaded = sftpConnector.uploadFile(zipInputStream, remoteZipPath);
            if (uploaded) {
                logger.info("ZIP 文件上传成功 - 本地: {}, FTP: {}", localZipPath, remoteZipPath);
            } else {
                logger.error("ZIP 文件上传失败 - FTP: {}", remoteZipPath);
            }
        } catch (IOException e) {
            logger.error("读取本地 ZIP 文件失败: {}", localZipPath, e);
        }

        // The local ZIP is intentionally kept; cleanup is the caller's choice.
    }

    /**
     * Recursively packs a directory into a ZIP file, then deletes the packed
     * files from the source directory (directory structure is kept).
     *
     * @param sourceDir directory to pack
     * @param zipFile   output ZIP file
     * @throws IOException on ZIP creation failure
     */
    private void zipDirectory(Path sourceDir, File zipFile) throws IOException {
        try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(zipFile))) {
            Files.walk(sourceDir)
                    .filter(path -> !Files.isDirectory(path)) // files only
                    .forEach(path -> {
                        ZipEntry zipEntry = new ZipEntry(sourceDir.relativize(path).toString());
                        try {
                            zipOut.putNextEntry(zipEntry);
                            Files.copy(path, zipOut);
                            zipOut.closeEntry();
                        } catch (IOException e) {
                            throw new RuntimeException("打包文件失败: " + path, e);
                        }
                    });
        }
        logger.info("目录打包完成: {} -> {}", sourceDir, zipFile.getAbsolutePath());
        try {
            Files.walk(sourceDir)
                    .sorted(Comparator.reverseOrder()) // children before parents (only files get deleted)
                    .filter(path -> !Files.isDirectory(path))
                    .forEach(path -> {
                        try {
                            Files.delete(path);
                            logger.debug("已删除文件: {}", path);
                        } catch (IOException e) {
                            logger.warn("无法删除文件: {}", path, e);
                        }
                    });
            logger.info("源目录已清空(仅删除文件,保留目录结构): {}", sourceDir);
        } catch (IOException e) {
            logger.error("清空源目录时发生错误", e);
            // The ZIP has already been produced; upload proceeds even if cleanup
            // fails. Throw here instead if cleanup must be part of success.
        }
    }

    /**
     * Unpacks archive files for the day before {@code endTime} and merges
     * their video/image contents into the twitter output directory.
     *
     * @param startTime business start time (yyyy-MM-dd HH:mm:ss; unused, kept for interface compatibility)
     * @param endTime   business end time (yyyy-MM-dd HH:mm:ss)
     */
    public void unzipAndMoveVideosImages(String startTime, String endTime) {
        logger.info("开始处理存档文件: startTime={}, endTime={}", startTime, endTime);

        try {
            // 1. Archive date = the day before endTime.
            LocalDate archiveDate = parseEndDate(endTime).minusDays(1);
            String dateStr = archiveDate.format(DateTimeFormatter.ISO_DATE); // yyyy-MM-dd

            // 2. Archive directory: D:/data/dbzq_backup/{yyyy}/{yyyy-MM}/{yyyy-MM-dd}
            String year = String.valueOf(archiveDate.getYear());
            String yearMonth = archiveDate.format(DateTimeFormatter.ofPattern("yyyy-MM"));
            Path archiveBaseDir = Paths.get("D:/data/dbzq_backup", year, yearMonth, dateStr);

            if (!Files.exists(archiveBaseDir) || !Files.isDirectory(archiveBaseDir)) {
                logger.error("存档目录不存在: {}", archiveBaseDir);
                throw new FileNotFoundException("存档目录不存在: " + archiveBaseDir);
            }
            logger.info("使用存档目录: {}", archiveBaseDir);

            // 3. Make sure the output directory exists.
            Path outputDir = Paths.get(twitterExcelOutputPath);
            Files.createDirectories(outputDir);
            logger.info("输出目录: {}", outputDir);

            // 4. Video archives (image_data_plane_*.tar.gz) -> videos/
            processArchiveFiles(
                    archiveBaseDir,
                    "image_data_plane_",
                    "videos",
                    outputDir
            );

            // 5. Image archives (image_data_ship_*.tar.gz) -> images/
            processArchiveFiles(
                    archiveBaseDir,
                    "image_data_ship_",
                    "images",
                    outputDir
            );

            logger.info("存档文件处理完成: {}", dateStr);

        } catch (Exception e) {
            logger.error("存档处理失败 [endTime={}]", endTime, e);
            throw new RuntimeException("存档处理异常: " + e.getMessage(), e);
        }
    }

    /**
     * Parses the date part of an end-time string.
     *
     * Fix: the previous version built a DateTimeFormatter per candidate
     * pattern but never used it — every loop iteration ran the identical
     * {@code LocalDate.parse(endTime.substring(0, 10), ISO_DATE)} call.
     * All supported formats ("yyyy-MM-dd[...]") start with an ISO date,
     * so a single attempt is equivalent.
     */
    private LocalDate parseEndDate(String endTime) {
        try {
            return LocalDate.parse(endTime.substring(0, 10), DateTimeFormatter.ISO_DATE);
        } catch (Exception ignored) {
            // fall through to the lenient attempt below
        }

        try {
            return LocalDate.parse(endTime.trim().split("\\s+")[0]); // date part only
        } catch (DateTimeParseException e) {
            throw new IllegalArgumentException("无法解析 endTime 格式: " + endTime +
                    ",支持格式: yyyy-MM-dd[ HH:mm:ss]");
        }
    }

    /**
     * Processes all tar.gz archives with the given prefix: extracts them into
     * a temp directory, then flatten-moves every file into the target
     * sub-directory (no directory structure, same-name overwrite).
     *
     * @param archiveDir    archive directory
     * @param filePrefix    file prefix (e.g. "image_data_plane_")
     * @param targetDirName target sub-directory name (e.g. "videos")
     * @param outputDir     output root directory
     */
    private void processArchiveFiles(Path archiveDir, String filePrefix,
                                     String targetDirName, Path outputDir) throws IOException {
        // Find all matching tar.gz files.
        List<Path> tarFiles = Files.list(archiveDir)
                .filter(path -> Files.isRegularFile(path)
                        && path.getFileName().toString().startsWith(filePrefix)
                        && path.getFileName().toString().endsWith(".tar.gz"))
                .sorted() // deterministic processing order
                .collect(Collectors.toList());

        if (tarFiles.isEmpty()) {
            logger.warn("未找到 {} 开头的压缩包: {}", filePrefix, archiveDir);
            return;
        }

        logger.info("找到 {} 个 {} 压缩包: {}", tarFiles.size(), filePrefix,
                tarFiles.stream().map(Path::getFileName).collect(Collectors.toList()));

        // Shared temp directory where all archives are merged.
        Path tempMergeDir = Files.createTempDirectory("archive_merge_");
        logger.debug("创建临时合并目录: {}", tempMergeDir);

        try {
            // Step 1: extract every tar.gz into the temp directory.
            int totalFiles = 0;
            for (Path tarFile : tarFiles) {
                logger.info("解压压缩包: {}", tarFile.getFileName());
                totalFiles += FileUtils.extractTarGz(tarFile.toFile(), tempMergeDir.toFile());
            }

            if (totalFiles == 0) {
                logger.warn("解压后未发现任何文件,跳过移动: {}", filePrefix);
                return;
            }
            logger.info("共解压 {} 个文件到临时目录", totalFiles);

            // Step 2: flatten-move everything into the target directory.
            Path targetPath = outputDir.resolve(targetDirName);
            Files.createDirectories(targetPath);

            int movedCount = FileUtils.flattenAndMoveFiles(tempMergeDir, targetPath);

            logger.info("成功平铺移动 {} 个文件到: {}", movedCount, targetPath);

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always clean up the temp directory.
            try {
                FileUtils.deleteDirectory(tempMergeDir);
                logger.debug("已清理临时目录: {}", tempMergeDir);
            } catch (Exception e) {
                logger.warn("清理临时目录失败: {}", tempMergeDir, e);
            }
        }
    }

    /**
     * Copies every PDF whose creation time falls inside [startTime, endTime]
     * from the pages output directory into {newsExcelOutputPath}/pdf,
     * preserving the relative directory structure.
     */
    public void copyPagesFiles(String startTime, String endTime) {
        try {
            logger.info("开始复制PDF...");
            // Parse the time window.
            Date start = sdf.parse(startTime);
            Date end = sdf.parse(endTime);

            // Source directory.
            Path sourceDir = Paths.get(pagesOutputPath);
            if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
                logger.error("源目录不存在或不是目录: " + pagesOutputPath);
                return;
            }

            // Target: "pdf" sub-directory under the news Excel output path.
            Path targetBaseDir = Paths.get(newsExcelOutputPath);
            Path targetPdfDir = targetBaseDir.resolve("pdf");

            if (!Files.exists(targetPdfDir)) {
                Files.createDirectories(targetPdfDir);
            }

            // Walk every PDF under the source directory.
            Files.walk(sourceDir)
                    .filter(path -> !Files.isDirectory(path))
                    .filter(path -> path.toString().toLowerCase().endsWith(".pdf"))
                    .forEach(path -> {
                        try {
                            // NOTE(review): creationTime may be unreliable on
                            // Linux/macOS; lastModifiedTime is the alternative.
                            BasicFileAttributes attrs = Files.readAttributes(path, BasicFileAttributes.class);
                            FileTime creationTime = attrs.creationTime();
                            Date fileCreationDate = new Date(creationTime.toMillis());

                            // Inclusive range check: start <= creation <= end.
                            if (!fileCreationDate.before(start) && !fileCreationDate.after(end)) {
                                // Keep the relative path under the pdf target.
                                Path relativePath = sourceDir.relativize(path);
                                Path targetPath = targetPdfDir.resolve(relativePath);

                                Path targetParent = targetPath.getParent();
                                if (targetParent != null && !Files.exists(targetParent)) {
                                    Files.createDirectories(targetParent);
                                }

                                Files.copy(path, targetPath, StandardCopyOption.REPLACE_EXISTING);
                                logger.info("已复制文件: " + path + " -> " + targetPath);
                            }
                        } catch (IOException e) {
                            logger.error("处理文件时出错: " + path + " - " + e.getMessage());
                        }
                    });

            logger.info("PDF 文件复制完成,目标目录: " + targetPdfDir.toAbsolutePath());

        } catch (ParseException e) {
            logger.error("时间格式解析错误,请确保使用格式: " + DATE_FORMAT);
            e.printStackTrace();
        } catch (IOException e) {
            logger.error("IO 错误: " + e.getMessage());
            e.printStackTrace();
        }
    }

}
|
||||
108
dsp/src/main/java/com/jsc/dsp/utils/FTPConnector.java
Normal file
108
dsp/src/main/java/com/jsc/dsp/utils/FTPConnector.java
Normal file
@ -0,0 +1,108 @@
|
||||
package com.jsc.dsp.utils;

import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPReply;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.io.InputStream;

/**
 * Thin FTP upload helper: connects, logs in, ensures the remote directory
 * chain exists, stores the file, then tears the connection down.
 */
@Component
public class FTPConnector {

    Logger log = LoggerFactory.getLogger(this.getClass().getName());

    @Value("${ftp.host}")
    String host;

    @Value("${ftp.port}")
    Integer port;

    @Value("${ftp.username}")
    String username;

    @Value("${ftp.password}")
    String password;

    @Value("${ftp.timeout}")
    Integer timeout;

    /**
     * Uploads the given stream to {@code remotePath} on the FTP server.
     * The input stream is always closed by this method.
     *
     * @return true when the server acknowledged the store
     */
    public boolean uploadFile(InputStream inputStream, String remotePath) {
        FTPClient ftpClient = new FTPClient();
        try {
            // Fix: the connect timeout must be configured BEFORE connect(),
            // otherwise it has no effect on the connection attempt. The
            // socket read timeout (setSoTimeout) requires an open socket,
            // so it stays after connect().
            ftpClient.setConnectTimeout(timeout);
            ftpClient.connect(host, port);
            ftpClient.login(username, password);
            ftpClient.setSoTimeout(timeout);

            // Binary mode — text mode would corrupt non-text files.
            ftpClient.setFileType(FTP.BINARY_FILE_TYPE);

            // Passive mode for NAT/firewall environments.
            ftpClient.enterLocalPassiveMode();

            // Check that the last command (login) completed successfully.
            if (!FTPReply.isPositiveCompletion(ftpClient.getReplyCode())) {
                ftpClient.disconnect();
                log.error("FTP 登录失败");
                return false;
            }

            // Create intermediate directories if the path contains any.
            createDirectories(ftpClient, remotePath);

            // Store the file.
            boolean success = ftpClient.storeFile(remotePath, inputStream);
            if (success) {
                log.info("文件上传成功: {}", remotePath);
            } else {
                log.error("FTP 上传失败,错误码: {}", ftpClient.getReplyCode());
            }
            return success;

        } catch (IOException e) {
            log.error("FTP 上传异常: {}", e.getMessage(), e);
            return false;
        } finally {
            try {
                if (inputStream != null) {
                    inputStream.close();
                }
                if (ftpClient.isConnected()) {
                    ftpClient.logout();
                    ftpClient.disconnect();
                }
            } catch (IOException e) {
                log.warn("关闭 FTP 连接时出错", e);
            }
        }
    }

    /**
     * Recursively creates the remote directory chain for a file path:
     * for each intermediate segment, try to cd into it and mkdir on failure.
     */
    private void createDirectories(FTPClient ftpClient, String remoteFilePath) throws IOException {
        String[] pathParts = remoteFilePath.split("/");
        StringBuilder currentPath = new StringBuilder();

        // The last segment is the file name — only directories are created.
        for (int i = 0; i < pathParts.length - 1; i++) {
            if (!pathParts[i].isEmpty()) {
                currentPath.append("/").append(pathParts[i]);
                if (!ftpClient.changeWorkingDirectory(currentPath.toString())) {
                    boolean made = ftpClient.makeDirectory(currentPath.toString());
                    if (made) {
                        log.debug("创建 FTP 目录: {}", currentPath);
                    }
                    ftpClient.changeWorkingDirectory(currentPath.toString());
                }
            }
        }
    }

}
|
||||
@ -1,23 +1,25 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import org.apache.commons.compress.archivers.ArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.logging.Logger;
|
||||
import java.nio.file.*;
|
||||
import java.nio.file.attribute.BasicFileAttributes;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class FileUtils {
|
||||
private final Logger logger = Logger.getLogger(this.getClass().getName());
|
||||
private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());
|
||||
|
||||
public FileUtils() {
|
||||
}
|
||||
@ -79,7 +81,7 @@ public class FileUtils {
|
||||
public int downloadFromUrl(String urlStr, String savePath) {
|
||||
try {
|
||||
if (downloadedFileSet.contains(urlStr)) {
|
||||
logger.warning("File exist from " + urlStr);
|
||||
logger.warn("File exist from " + urlStr);
|
||||
return 2;
|
||||
}
|
||||
String[] urlCascade = urlStr.split("/");
|
||||
@ -183,6 +185,180 @@ public class FileUtils {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 解压tar.gz文件到指定目录
|
||||
*/
|
||||
/**
|
||||
* 解压tar.gz文件到指定目录,返回解压的文件数量(不包含目录)
|
||||
*
|
||||
* @return 解压的普通文件数量
|
||||
*/
|
||||
public static int extractTarGz(File tarFile, File destDir) throws IOException {
|
||||
if (!destDir.exists() && !destDir.mkdirs()) {
|
||||
throw new IOException("无法创建目标目录: " + destDir.getAbsolutePath());
|
||||
}
|
||||
|
||||
int fileCount = 0;
|
||||
|
||||
try (FileInputStream fis = new FileInputStream(tarFile);
|
||||
BufferedInputStream bis = new BufferedInputStream(fis);
|
||||
GzipCompressorInputStream gzIn = new GzipCompressorInputStream(bis);
|
||||
TarArchiveInputStream tarIn = new TarArchiveInputStream(gzIn)) {
|
||||
|
||||
TarArchiveEntry entry;
|
||||
|
||||
while ((entry = tarIn.getNextTarEntry()) != null) {
|
||||
// 跳过空条目、符号链接、特殊设备文件
|
||||
if (entry.getName().trim().isEmpty()
|
||||
|| entry.isSymbolicLink()
|
||||
|| entry.isCharacterDevice()
|
||||
|| entry.isBlockDevice()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 安全校验:防止路径遍历攻击
|
||||
Path entryPath = destDir.toPath().resolve(entry.getName()).normalize();
|
||||
if (!entryPath.startsWith(destDir.toPath().normalize())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 创建目录结构(为后续文件写入做准备)
|
||||
if (entry.isDirectory()) {
|
||||
Files.createDirectories(entryPath);
|
||||
} else {
|
||||
Files.createDirectories(entryPath.getParent());
|
||||
Files.copy(tarIn, entryPath, StandardCopyOption.REPLACE_EXISTING);
|
||||
fileCount++;
|
||||
}
|
||||
}
|
||||
return fileCount;
|
||||
|
||||
} catch (IOException e) {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 递归删除目录(含子目录和文件)
|
||||
*/
|
||||
public static void deleteDirectory(Path path) throws IOException {
|
||||
if (!Files.exists(path)) return;
|
||||
|
||||
Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
|
||||
@Override
|
||||
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
|
||||
Files.delete(file);
|
||||
return FileVisitResult.CONTINUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
|
||||
Files.delete(dir);
|
||||
return FileVisitResult.CONTINUE;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static void moveAllFilesRecursively(Path sourceDir, Path targetDir) throws IOException {
|
||||
if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 使用Files.walk递归遍历所有文件
|
||||
try (Stream<Path> walkStream = Files.walk(sourceDir)) {
|
||||
walkStream
|
||||
.filter(path -> !Files.isDirectory(path)) // 只处理文件
|
||||
.sorted() // 确保先创建父目录再移动文件
|
||||
.forEach(file -> {
|
||||
try {
|
||||
// 计算相对路径(相对于sourceDir)
|
||||
Path relativePath = sourceDir.relativize(file);
|
||||
|
||||
// 构建目标文件路径
|
||||
Path targetFile = targetDir.resolve(relativePath);
|
||||
|
||||
// 确保目标父目录存在
|
||||
Files.createDirectories(targetFile.getParent());
|
||||
|
||||
// 移动文件(覆盖同名文件)
|
||||
Files.move(file, targetFile,
|
||||
StandardCopyOption.REPLACE_EXISTING,
|
||||
StandardCopyOption.COPY_ATTRIBUTES);
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e); // 便于Stream中抛出
|
||||
}
|
||||
});
|
||||
} catch (UncheckedIOException e) {
|
||||
throw e.getCause() instanceof IOException ? (IOException) e.getCause() : new IOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 递归遍历源目录,将所有文件平铺移动到目标目录(不保留目录结构,同名覆盖)
|
||||
*
|
||||
* @param sourceDir 源目录(临时解压目录)
|
||||
* @param targetDir 目标目录(如 D:/output/twitter/videos)
|
||||
* @return 成功移动的文件数量
|
||||
*/
|
||||
public static int flattenAndMoveFiles(Path sourceDir, Path targetDir) throws Exception {
|
||||
if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
AtomicInteger movedCount = new AtomicInteger(0);
|
||||
Map<String, Path> duplicateFiles = new HashMap<>(); // 记录被覆盖的文件
|
||||
|
||||
try (Stream<Path> walkStream = Files.walk(sourceDir)) {
|
||||
walkStream
|
||||
.filter(path -> Files.isRegularFile(path)) // 只处理普通文件
|
||||
.forEach(file -> {
|
||||
try {
|
||||
String fileName = file.getFileName().toString();
|
||||
Path targetFile = targetDir.resolve(fileName);
|
||||
|
||||
// 检测同名文件覆盖(用于日志记录)
|
||||
boolean willOverwrite = Files.exists(targetFile);
|
||||
if (willOverwrite) {
|
||||
duplicateFiles.put(fileName, file);
|
||||
}
|
||||
|
||||
// 移动文件(覆盖同名文件)
|
||||
Files.move(file, targetFile);
|
||||
|
||||
movedCount.incrementAndGet();
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
} catch (UncheckedIOException e) {
|
||||
throw e.getCause() instanceof IOException ? (IOException) e.getCause() : new IOException(e);
|
||||
}
|
||||
|
||||
return movedCount.get();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 清空目录内容(保留目录本身)
|
||||
*/
|
||||
public static void cleanDirectory(Path dir) throws IOException {
|
||||
if (!Files.exists(dir)) return;
|
||||
|
||||
try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) {
|
||||
for (Path entry : stream) {
|
||||
if (Files.isDirectory(entry)) {
|
||||
deleteDirectory(entry);
|
||||
} else {
|
||||
Files.delete(entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
saveStringToFile("{\"aaa\":\"测试测试testtest\"}", "E:/yuxin/test.json");
|
||||
}
|
||||
|
||||
138
dsp/src/main/java/com/jsc/dsp/utils/SFTPConnector.java
Normal file
138
dsp/src/main/java/com/jsc/dsp/utils/SFTPConnector.java
Normal file
@ -0,0 +1,138 @@
|
||||
package com.jsc.dsp.utils;

import com.jcraft.jsch.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Password-authenticated SFTP upload helper built on JSch.
 * Opens a fresh session per upload and tears everything down afterwards.
 */
@Component
public class SFTPConnector {

    private static final Logger log = LoggerFactory.getLogger(SFTPConnector.class);

    @Value("${sftp.host}")
    private String host;

    @Value("${sftp.port:22}") // default SFTP port
    private Integer port;

    @Value("${sftp.username}")
    private String username;

    @Value("${sftp.password}") // password auth; prefer key auth in production
    private String password;

    @Value("${sftp.timeout:30000}")
    private Integer timeout; // milliseconds

    @Value("${sftp.strictHostKeyChecking:false}") // false is test-only!
    private boolean strictHostKeyChecking;

    /**
     * Uploads the given stream to an absolute remote path, creating missing
     * remote directories first. The stream is always closed by this method.
     *
     * @param inputStream source data (closed here)
     * @param remotePath  absolute remote path, e.g. /upload/2024/file.pdf
     * @return true on success
     */
    public boolean uploadFile(InputStream inputStream, String remotePath) {
        Session session = null;
        ChannelSftp channelSftp = null;
        try {
            // Session setup.
            JSch jsch = new JSch();
            session = jsch.getSession(username, host, port);
            session.setPassword(password);
            session.setTimeout(timeout);

            // SSH options. Production must enable StrictHostKeyChecking with
            // a proper known_hosts file.
            Properties sshConfig = new Properties();
            sshConfig.put("StrictHostKeyChecking", String.valueOf(strictHostKeyChecking));
            session.setConfig(sshConfig);

            // Connect and open the SFTP channel.
            session.connect();
            channelSftp = (ChannelSftp) session.openChannel("sftp");
            channelSftp.connect(timeout);

            // Make sure the remote directory chain exists.
            ensureDirectoryExists(channelSftp, remotePath);

            // Upload (JSch reads the stream fully but never closes it).
            channelSftp.put(inputStream, remotePath);
            log.info("SFTP 文件上传成功: {}", remotePath);
            return true;

        } catch (JSchException | SftpException e) {
            log.error("SFTP 上传失败 [host={}, path={}]: {}", host, remotePath, e.getMessage(), e);
            return false;
        } catch (Exception e) {
            log.error("SFTP 上传异常 [path={}]: {}", remotePath, e.getMessage(), e);
            return false;
        } finally {
            // Cleanup order: stream first, then channel, then session.
            closeQuietly(inputStream);
            if (channelSftp != null && channelSftp.isConnected()) {
                try {
                    channelSftp.disconnect();
                } catch (Exception e) {
                    log.warn("关闭 SFTP 通道异常", e);
                }
            }
            if (session != null && session.isConnected()) {
                session.disconnect();
            }
        }
    }

    /**
     * Walks the directory part of {@code remotePath} segment by segment,
     * creating each missing directory (cd, mkdir-on-failure, cd again).
     *
     * @throws SftpException when a directory cannot be created
     */
    private void ensureDirectoryExists(ChannelSftp sftp, String remotePath) throws SftpException {
        String dirPath = extractDirectory(remotePath);
        if ("/".equals(dirPath)) {
            return; // file lives at the root — nothing to create
        }

        StringBuilder walked = new StringBuilder();
        for (String segment : dirPath.split("/")) {
            if (segment.isEmpty()) {
                continue;
            }
            walked.append("/").append(segment);
            String current = walked.toString();
            try {
                sftp.cd(current); // exists already?
            } catch (SftpException missing) {
                sftp.mkdir(current); // no — create, then enter
                sftp.cd(current);
            }
        }
    }

    /**
     * Returns the directory part of a path (/a/b/file.txt -> /a/b),
     * or "/" when there is no parent.
     */
    private String extractDirectory(String path) {
        int lastSlash = path.lastIndexOf('/');
        return (lastSlash <= 0) ? "/" : path.substring(0, lastSlash);
    }

    /** Closes the stream, logging (not raising) any failure. */
    private void closeQuietly(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException e) {
            log.debug("关闭输入流时忽略异常", e);
        }
    }

}
|
||||
@ -116,6 +116,29 @@ public class StringUtils {
|
||||
return wordList;
|
||||
}
|
||||
|
||||
public static String camelToSnake(String camel) {
|
||||
if (camel == null || camel.isEmpty()) {
|
||||
return camel;
|
||||
}
|
||||
StringBuilder result = new StringBuilder();
|
||||
result.append(Character.toLowerCase(camel.charAt(0)));
|
||||
for (int i = 1; i < camel.length(); i++) {
|
||||
char ch = camel.charAt(i);
|
||||
if (Character.isUpperCase(ch)) {
|
||||
// 如果前一个字符不是大写,或者后一个不是小写,则加下划线
|
||||
char prev = camel.charAt(i - 1);
|
||||
if (!Character.isUpperCase(prev) ||
|
||||
(i + 1 < camel.length() && Character.isLowerCase(camel.charAt(i + 1)))) {
|
||||
result.append('_');
|
||||
}
|
||||
result.append(Character.toLowerCase(ch));
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
initFilterMap("http://39.98.151.140:28081/api/open/wordBank/queryAll");
|
||||
}
|
||||
|
||||
94
dsp/src/main/java/com/jsc/dsp/utils/TodistParseUtil.java
Normal file
94
dsp/src/main/java/com/jsc/dsp/utils/TodistParseUtil.java
Normal file
@ -0,0 +1,94 @@
|
||||
package com.jsc.dsp.utils;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.google.protobuf.Descriptors;
import com.google.protobuf.GeneratedMessageV3;
import com.google.protobuf.InvalidProtocolBufferException;
import com.jsc.dsp.proto.EsOuterClass;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Map;

/**
 * Converts protobuf EsSets messages to JSON using FastJSON, via reflection
 * over the message descriptors (no extra JsonFormat dependency needed).
 */
public class TodistParseUtil {

    /**
     * Serializes an EsSets message as pretty-printed JSON with a top-level
     * "es" array containing every Es entry.
     */
    public static String protobufToJson(EsOuterClass.EsSets esSets) {
        JSONObject root = new JSONObject();

        // The repeated Es field becomes a JSON array.
        JSONArray esArray = new JSONArray();
        for (EsOuterClass.Es es : esSets.getEsList()) {
            esArray.add(messageToJson(es));
        }
        root.put("es", esArray);

        return JSON.toJSONString(root, true); // pretty format
    }

    /**
     * Recursively converts a protobuf message to a JSONObject, walking its
     * set fields via the descriptor API (unset fields are omitted).
     */
    private static JSONObject messageToJson(GeneratedMessageV3 message) {
        JSONObject json = new JSONObject();
        Map<Descriptors.FieldDescriptor, Object> fields = message.getAllFields();

        for (Map.Entry<Descriptors.FieldDescriptor, Object> entry : fields.entrySet()) {
            Descriptors.FieldDescriptor field = entry.getKey();
            Object value = entry.getValue();

            if (field.isRepeated()) {
                JSONArray array = new JSONArray();
                if (value instanceof Iterable) {
                    for (Object item : (Iterable<?>) value) {
                        array.add(convertFieldValue(item));
                    }
                }
                json.put(field.getName(), array);
            } else {
                json.put(field.getName(), convertFieldValue(value));
            }
        }
        return json;
    }

    /**
     * Nested messages recurse; scalar protobuf values pass through
     * (FastJSON serializes them natively).
     */
    private static Object convertFieldValue(Object value) {
        if (value instanceof GeneratedMessageV3) {
            return messageToJson((GeneratedMessageV3) value);
        }
        return value;
    }

    /** Ad-hoc smoke test: parse a local .todist file and print it as JSON. */
    public static void main(String[] args) {

        String filePath = "C:/Users/yuxin/Documents/xwechat_files/wxid_dtvj9sibla0d21_9cb3/msg/file/2026-02/public_info_data_1770264282958.todist";
        try {
            // 1. Read the whole file (fine for test-sized inputs).
            byte[] data = Files.readAllBytes(Paths.get(filePath));

            // 2. Protobuf deserialization.
            EsOuterClass.EsSets esSets = EsOuterClass.EsSets.parseFrom(data);
            System.out.println("✅ 成功解析 EsSets,共 " + esSets.getEsCount() + " 条记录");

            // 3. Convert to JSON.
            String json = protobufToJson(esSets);

            // 4. Print formatted JSON.
            // Fix: the heading used a literal "/n" where the newline escape
            // "\n" was clearly intended.
            System.out.println("\n📄 JSON Output:");
            System.out.println(json);

        } catch (InvalidProtocolBufferException e) {
            System.err.println("❌ Protobuf 解析失败: " + e.getMessage());
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("❌ 文件读取失败: " + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.err.println("❌ 未知错误: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
|
||||
@ -1,18 +1,19 @@
|
||||
server:
|
||||
port: 8084
|
||||
port: 18084
|
||||
servlet:
|
||||
context-path: /dsp
|
||||
spring:
|
||||
cloud:
|
||||
stream:
|
||||
kafka:
|
||||
binder:
|
||||
brokers: 47.113.231.200:9092
|
||||
zkNodes: 47.113.231.200:2181
|
||||
auto-create-topics: true
|
||||
healthTimeout: 600
|
||||
bindings:
|
||||
file_dl_pipeline_in:
|
||||
destination: stream-file-dl
|
||||
group: file-dl
|
||||
group: file-dl-test
|
||||
consumer:
|
||||
pollTimeout: 60
|
||||
file_dl_pipeline_out:
|
||||
@ -20,7 +21,7 @@ spring:
|
||||
content-type: text/plain
|
||||
protobuf_pipeline_in:
|
||||
destination: stream-protobuf
|
||||
group: protobuf
|
||||
group: protobuf-test
|
||||
consumer:
|
||||
pollTimeout: 60
|
||||
protobuf_pipeline_out:
|
||||
@ -28,7 +29,7 @@ spring:
|
||||
content-type: text/plain
|
||||
storage_pipeline_in:
|
||||
destination: stream-db
|
||||
group: db
|
||||
group: db-test
|
||||
consumer:
|
||||
pollTimeout: 60
|
||||
storage_pipeline_out:
|
||||
@ -43,38 +44,64 @@ spring:
|
||||
records: 10
|
||||
interval:
|
||||
ms: 3600000
|
||||
datasource:
|
||||
url: jdbc:mysql://47.113.231.200:28089/dsp?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true
|
||||
username: root
|
||||
password: passok123A
|
||||
driver-class-name: com.mysql.cj.jdbc.Driver
|
||||
|
||||
jpa:
|
||||
database-platform: org.hibernate.dialect.MySQL8Dialect
|
||||
show-sql: true
|
||||
|
||||
topics:
|
||||
stream-protobuf: com.jsc.dsp.service.ProtobufService
|
||||
stream-db: com.jsc.dsp.service.StorageService
|
||||
stream-file-dl: com.jsc.dsp.service.FileDlService
|
||||
|
||||
es:
|
||||
ip: 8.130.95.27
|
||||
port: 28087
|
||||
username: elastic
|
||||
password: passok123A
|
||||
index: indeximos
|
||||
type: default
|
||||
ceph:
|
||||
aws-access-key: JH8OF0D9ZJYYXBFYB5OD
|
||||
aws-secret-key: FuptELjiPQOQNR6tPOVL777n3dGe3bZCDJphyiz0
|
||||
endpoint: 192.168.1.16:28090
|
||||
db:
|
||||
driver: com.mysql.cj.jdbc.Driver
|
||||
url: jdbc:mysql://8.130.95.27:28089/dsp
|
||||
user: root
|
||||
password: passok123A
|
||||
# 本地调试时这几个开关设置为 false
|
||||
switch:
|
||||
enable-storage-service: false
|
||||
enable-file-dl-service: false
|
||||
enable-protobuf-service: false
|
||||
auto-export-and-upload: false
|
||||
|
||||
ftp:
|
||||
host: 144.34.185.108
|
||||
port: 21
|
||||
username: jsc-2b
|
||||
password: 1234qwer%
|
||||
timeout: 5000
|
||||
passive-mode: true
|
||||
|
||||
sftp:
|
||||
host: 74.121.148.204
|
||||
port: 22
|
||||
username: root
|
||||
password: NSgRMhIXL6gp
|
||||
|
||||
custom:
|
||||
dev-mode: false
|
||||
filter-words-query-api: http://47.115.228.133:28081/api/open/wordBank/queryAll
|
||||
filter-words-update-interval-ms: 3600000
|
||||
local-file-storage-path: E:/data/local-storage/
|
||||
local-file-storage-path: D:/data/local-storage/
|
||||
proto_save_path: D:/data/spider_data/proto/
|
||||
transfer_backup_path: E:/data/transfer_backup/
|
||||
file_unzip_path: E:/html-full/
|
||||
keep_backup_file: E:/data/dbzq_backup/
|
||||
nginx_path: E:/OSC-3.0/app/osdp_board/html/
|
||||
transfer_backup_path: D:/data/transfer_backup/
|
||||
file_unzip_path: D:/html-full/
|
||||
keep_backup_file: D:/data/dbzq_backup/
|
||||
nginx_path: D:/OSC-3.0/app/osdp_board/html/
|
||||
websiteQueryAPI: http://47.115.228.133:28081/api/open/target/website/queryAllInfo
|
||||
websiteUpdateAPI: http://47.115.228.133:28081/api/open/target/website/update
|
||||
socialQueryAPI: http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy=id&shuffleResult=false
|
||||
socialUpdateAPI: http://47.115.228.133:28081/api/open/target/social/update
|
||||
websiteWhiteList: 能源界(国内信息);能源界(国际信息);中国能源新闻网;新华能源网;中国能源网(能源战略);中国农网(三农要闻);中国经济网(三农经济);中华粮网(粮食安全);美国之音(中国版面);美国之音(中美关系);美国之音(台海两岸版面);美国之音(港澳版面);看中国(看大陆版面);看中国(重点新闻);德国之声(中国报道);纽约时报中文网(中国版面);大纪元(一周大陆新闻);EnergyNow;联合国粮农组织;路透社(中国版面);朝中社;劳动新闻;美国农业部食品和营养服务局;布鲁金斯学会(亚太版面);俄罗斯新闻社;美国能源部;朝鲜新闻;联邦能源管理委员会;华盛顿邮报;ChinaAid;美国战略与国际研究中心;美国外交关系委员会;美国兰德;国际危机组织;美国国务院东亚与太平洋事务局;俄罗斯卫星通讯社;尤里·列瓦达分析中心;塔斯社;韩国外交部
|
||||
twitterWhiteList: nytchinese;YesterdayBigcat;takaichi_sanae;yonhapcn;VOAChinese;ChineseWSJ;whyyoutouzhele;Jaemyung_Lee
|
||||
newsExcelOutputPath: D:/data/output/upload
|
||||
twitterExcelOutputPath: D:/data/output/twitter
|
||||
hotSearchExcelOutputPath: D:/data/output/hotSearch
|
||||
backupFilePath: D:/data/output/backup
|
||||
pagesOutputPath: D:/data/output/pdf
|
||||
ftpUploadPath: /home/jsc-2b
|
||||
exportNewsTaskSchedule: "0 30 8 * * 1,2,3,4,5,6,7"
|
||||
exportTwitterTaskSchedule: "0 30 6 * * 1,2,3,4,5,6,7"
|
||||
exportHotSearchTaskSchedule: "0 0 20 * * 1,2,3,4,5,6,7"
|
||||
@ -8,7 +8,7 @@
|
||||
<contextName>logback</contextName>
|
||||
|
||||
<!-- name的值是变量的名称,value的值时变量定义的值。通过定义的值会被插入到logger上下文中。定义后,可以使“${}”来使用变量。 -->
|
||||
<property name="log.path" value="E:/dsp-logs" />
|
||||
<property name="log.path" value="D:/dsp-logs" />
|
||||
|
||||
<!--0. 日志格式和颜色渲染 -->
|
||||
<!-- 彩色日志依赖的渲染类 -->
|
||||
@ -27,7 +27,7 @@
|
||||
<encoder>
|
||||
<Pattern>${CONSOLE_LOG_PATTERN}</Pattern>
|
||||
<!-- 设置字符集 -->
|
||||
<charset>GBK</charset>
|
||||
<charset>UTF-8</charset>
|
||||
</encoder>
|
||||
</appender>
|
||||
|
||||
|
||||
348
research/pdf_downloader/save-page-with-selenium.py
Normal file
348
research/pdf_downloader/save-page-with-selenium.py
Normal file
@ -0,0 +1,348 @@
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime
|
||||
import random
|
||||
|
||||
import pymysql
|
||||
from tqdm import tqdm
|
||||
|
||||
from save_page_as_pdf import PDFSaver
|
||||
from save_remote_as_mhtml import RemoteMHTMLSaver
|
||||
from save_page_as_mhtml import MHTMLSaver
|
||||
import tldextract
|
||||
|
||||
# 配置日志
|
||||
from save_remote_as_pdf import RemotePDFSaver
|
||||
|
||||
# Log to both the console and a file in the working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('pdf_downloader.log')
    ]
)
logger = logging.getLogger(__name__)

# =============== MySQL configuration ===============
MYSQL_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
    'autocommit': False  # transactions are committed manually
}
# ===================================================

# Tuning parameters
BATCH_SIZE = 500   # rows fetched from MySQL per page
MAX_WORKERS = 1    # number of download worker threads
TIMEOUT = 10       # page-load timeout in seconds
PDF_OUTPUT_DIR = 'D:/data/output/pdf'
MIN_PDF_SIZE = 5 * 1024  # 5 KB — smaller PDFs are treated as failed captures (previous comment wrongly said 80KB)

MHTML_OUTPUT_DIR = 'D:/data/output/mhtml'
os.makedirs(PDF_OUTPUT_DIR, exist_ok=True)

running = True                 # main-loop switch
running_interval_seconds = 10  # pause between full passes, in seconds

# Hosts routed to the remote Selenium path (that path is currently disabled,
# so these URLs are skipped entirely — see PDFDownloader.download_worker).
skip_host_name = [
    'epochtimes.com',
    'secretchina.com',
    # 'rodong.rep.kp',
    # 'kcna.kp'
]
|
||||
|
||||
|
||||
class PDFDownloader:
    """Batch-export pages listed in the `indeximos` table to PDF files.

    Rows are fetched from MySQL in pages of BATCH_SIZE and pushed onto a
    bounded queue consumed by worker threads that drive Selenium to print
    each URL to PDF. The outcome is written back to the row's `es_video`
    column: the output path on success, or sentinel codes '0' (download
    failed), '-1' (exception), '-2' (file below MIN_PDF_SIZE).
    """

    def __init__(self):
        self.db_lock = threading.Lock()  # serializes DB writes across worker threads
        self.db_connection = None
        self.task_queue = queue.Queue(maxsize=MAX_WORKERS * 3)  # bounded for back-pressure
        self.processed_count = 0
        self.success_count = 0
        self.fail_count = 0
        self.small_file_count = 0  # captures rejected for being under MIN_PDF_SIZE
        self.last_loadtime = self.get_last_loadtime()
        self.total_rows = self.get_total_rows()
        self.start_time = time.time()
        self.skip_hosts = []  # domains that produced too-small files; skipped for the rest of this run
        self.local_handler = None   # lazily created PDFSaver (local Chrome)
        self.remote_handler = None  # lazily created RemotePDFSaver (currently unused)

    # Open a fresh pymysql connection using MYSQL_CONFIG and store it on self.
    def get_db_connection(self):
        self.db_connection = pymysql.connect(
            host=MYSQL_CONFIG['host'],
            port=MYSQL_CONFIG['port'],
            user=MYSQL_CONFIG['user'],
            password=MYSQL_CONFIG['password'],
            database=MYSQL_CONFIG['database'],
            charset='utf8mb4',
            autocommit=False
        )

    def get_total_rows(self):
        """Return the number of pending rows (no PDF yet) newer than last_loadtime."""
        if self.db_connection is None:
            self.get_db_connection()
        cursor = self.db_connection.cursor()
        cursor.execute(
            "SELECT COUNT(*) FROM indeximos "
            "WHERE (es_video IS NULL OR es_video IN ('-1')) "
            "AND es_loadtime > %s", self.last_loadtime
        )
        return cursor.fetchone()[0]

    def get_last_loadtime(self):
        """Read the last-export timestamp from the `config` table."""
        if self.db_connection is None:
            self.get_db_connection()
        cursor = self.db_connection.cursor()
        cursor.execute(
            "SELECT config_value FROM config "
            "WHERE config_name = 'last_loadtime' "
        )
        return cursor.fetchone()[0]

    def use_remote_selenium(self, url):
        # True when the URL's host appears in the module-level skip_host_name
        # list (sites meant for the remote Selenium grid).
        for host in skip_host_name:
            if host in url:
                return True
        return False

    def format_pdf_filename(self, row):
        """Build the output path <title>_<time>_<site>.pdf under PDF_OUTPUT_DIR."""
        es_urltitle = row[2] or 'untitled'
        # NOTE(review): str(None) is the truthy string 'None', so this `or`
        # fallback never triggers — confirm whether that is intended.
        es_urltime = str(row[3]) or '19700101_000000'
        es_sitename = row[4] or 'anonymous'

        def clean_filename(text):
            # Replace characters Windows forbids in filenames; cap length at 100.
            if not text:
                return ''
            invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
            for char in invalid_chars:
                text = text.replace(char, '_')
            return text.strip()[:100]

        try:
            dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
            es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
        except:
            es_urltime_fix = '19700101_000000'  # epoch fallback for unparsable timestamps

        filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.pdf"
        return os.path.join(PDF_OUTPUT_DIR, filename)

    def format_mhtml_filename(self, row):
        """Build the output path <title>_<time>_<site>.mhtml for a row.

        NOTE(review): joins against PDF_OUTPUT_DIR rather than
        MHTML_OUTPUT_DIR — confirm whether that is intended.
        """
        es_urltitle = row[2] or 'untitled'
        es_urltime = str(row[3]) or '19700101_000000'
        es_sitename = row[4] or 'anonymous'

        def clean_filename(text):
            # Replace characters Windows forbids in filenames; cap length at 100.
            if not text:
                return ''
            invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
            for char in invalid_chars:
                text = text.replace(char, '_')
            return text.strip()[:100]

        try:
            dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
            es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
        except:
            es_urltime_fix = '19700101_000000'

        filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.mhtml"
        return os.path.join(PDF_OUTPUT_DIR, filename)

    def fetch_data_batch(self, offset):
        """Fetch one page of pending rows, ordered by es_urltime."""
        if self.db_connection is None:
            self.get_db_connection()
        cursor = self.db_connection.cursor()
        cursor.execute(
            "SELECT es_sid, es_urlname, es_urltitle, es_urltime, es_sitename, es_authors FROM indeximos "
            "WHERE (es_video IS NULL OR es_video IN ('-1')) "
            "AND es_loadtime > %s "
            "ORDER BY es_urltime LIMIT %s OFFSET %s",
            (self.last_loadtime, BATCH_SIZE, offset)
        )
        return cursor.fetchall()

    def update_file_status(self, es_sid, status, retry=3):
        """Write `status` (output path or sentinel code) into es_video, with retries.

        Returns True on success, False after `retry` failed attempts.
        """
        for attempt in range(retry):
            try:
                # db_lock: workers share one connection, so writes must not interleave.
                with self.db_lock:
                    if self.db_connection is None:
                        self.get_db_connection()
                    cursor = self.db_connection.cursor()
                    cursor.execute(
                        "UPDATE indeximos SET es_video = %s WHERE es_sid = %s",
                        (status, es_sid))
                    self.db_connection.commit()
                    return True
            except Exception as e:
                if attempt == retry - 1:
                    logger.error(f"更新数据库失败(es_sid={es_sid}): {e}")
                    return False
                time.sleep(1)

    def extract_main_domain(self, url):
        # Registered (main) domain via tldextract, e.g. "news.bbc.co.uk" -> "bbc.co.uk".
        extracted = tldextract.extract(url)
        main_domain = f"{extracted.domain}.{extracted.suffix}"
        return main_domain

    def download_worker(self):
        """Worker-thread loop: pop rows, render to PDF, record the outcome.

        A `None` task is the shutdown signal. Counters are plain ints updated
        from multiple threads — safe-ish here only because MAX_WORKERS is 1.
        """
        while True:
            try:
                task = self.task_queue.get(timeout=1)
                if task is None:
                    break

                row = task
                url = row[1]
                # Domain previously produced a too-small file: skip without hitting the browser.
                if self.extract_main_domain(url) in self.skip_hosts:
                    self.small_file_count += 1
                    self.processed_count += 1
                    self.task_queue.task_done()
                    print(f"小文件规避,暂时跳过URL:{url}")
                    continue
                output_file = self.format_pdf_filename(row)  # formatted target filename

                try:
                    os.makedirs(os.path.dirname(output_file), exist_ok=True)

                    # Dispatch to remote or local renderer.
                    if self.use_remote_selenium(url):
                        # Remote path is disabled: count the row as processed and move on.
                        self.processed_count += 1
                        self.task_queue.task_done()
                        continue
                        # if self.remote_handler is None:
                        #     self.remote_handler = RemotePDFSaver()
                        # success = self.remote_handler.save_as_pdf(
                        #     url=url,
                        #     output_path=output_file,
                        #     timeout=TIMEOUT
                        # )
                    else:
                        if self.local_handler is None:
                            self.local_handler = PDFSaver(headless=False)
                        success = self.local_handler.save_as_pdf(
                            url=url,
                            output_path=output_file,
                            timeout=TIMEOUT,
                            wait_time=5
                        )

                    # Validate the result on disk.
                    if success and os.path.exists(output_file):
                        file_size = os.path.getsize(output_file)

                        if file_size >= MIN_PDF_SIZE:  # size acceptable
                            self.update_file_status(row[0], output_file)
                            self.success_count += 1
                        else:  # file too small — likely a blocked/empty page
                            self.update_file_status(row[0], '-2')
                            self.small_file_count += 1
                            logger.warning(f"文件过小({file_size}字节): {output_file}")
                            try:
                                os.remove(output_file)
                                # Blacklist the domain for the remainder of this run.
                                self.skip_hosts.append(self.extract_main_domain(url))
                            except:
                                pass
                    else:  # download failed
                        self.update_file_status(row[0], '0')
                        self.fail_count += 1
                        if os.path.exists(output_file):
                            try:
                                os.remove(output_file)
                            except:
                                pass

                except Exception as e:
                    logger.error(f"下载出现异常(es_sid={row[0]}, url={url}): {str(e)}")
                    self.update_file_status(row[0], '-1')
                    self.fail_count += 1

                self.processed_count += 1
                self.task_queue.task_done()

            except queue.Empty:
                continue

    def run(self):
        """Start the workers, stream all batches through the queue, then shut down."""
        threads = []

        # Spawn the worker threads.
        for _ in range(MAX_WORKERS):
            t = threading.Thread(target=self.download_worker)
            t.start()
            threads.append(t)

        # Progress bar over the whole pending row count.
        with tqdm(total=self.total_rows, desc="处理进度", unit="条") as pbar:
            offset = 0
            while True:
                batch = self.fetch_data_batch(offset)
                if not batch:
                    break
                # Shuffle within the batch to spread load across sites.
                batch_list = list(batch)
                random.shuffle(batch_list)
                batch = tuple(batch_list)
                for row in batch:
                    self.task_queue.put(row)

                # NOTE(review): the bar advances when rows are *enqueued*, not
                # when they finish — it runs ahead of actual completion.
                pbar.update(len(batch))
                pbar.set_postfix({
                    '成功': self.success_count,
                    '失败': self.fail_count,
                    '小文件': self.small_file_count,
                    '速度': f"{self.processed_count / (time.time() - self.start_time):.1f}条/秒"
                })

                offset += BATCH_SIZE

        self.task_queue.join()

        # Poison pills: one None per worker, then join them.
        for _ in range(MAX_WORKERS):
            self.task_queue.put(None)

        for t in threads:
            t.join()

        total_time = time.time() - self.start_time
        print(f"\n处理完成! 总计: {self.total_rows}条")
        print(f"成功: {self.success_count}条, 失败: {self.fail_count}条, 小文件: {self.small_file_count}条")
        print(f"总耗时: {total_time:.2f}秒, 平均速度: {self.total_rows / total_time:.2f}条/秒")

    def terminate(self):
        # Release browsers and the DB connection at the end of a pass.
        if self.local_handler:
            self.local_handler.quit()
        if self.remote_handler:
            self.remote_handler.quit()
        self.db_connection.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run export passes forever, pausing between passes.
    while running:
        downloader = None
        try:
            # Reuse a single instance for the count and the run: the original
            # built a throwaway PDFDownloader() just to print the total,
            # leaking its DB connection every iteration.
            downloader = PDFDownloader()
            print(f"开始处理,总记录数: {downloader.get_total_rows()}")
            downloader.run()
            print(f"运行完成,暂停{running_interval_seconds}秒后开始下一次运行...")
        except Exception as e:
            print(repr(e))
        finally:
            # Always release browsers/DB, even after a failure (the original
            # skipped terminate() on exception).
            if downloader is not None:
                try:
                    downloader.terminate()
                except Exception:
                    pass
            # Sleep after every pass — including failed ones — so an error
            # cannot spin this loop at full speed.
            time.sleep(running_interval_seconds)
|
||||
141
research/pdf_downloader/save_page_as_mhtml.py
Normal file
141
research/pdf_downloader/save_page_as_mhtml.py
Normal file
@ -0,0 +1,141 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
|
||||
# 配置日志
|
||||
# Log to both the console and a UTF-8 file in the working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('mhtml_saver.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MHTMLSaver:
    """Save fully-rendered web pages as MHTML snapshots via local Chrome + CDP."""

    def __init__(self, headless=True):
        """
        :param headless: run Chrome in headless mode
                         (BUG FIX: the original ignored this flag and was always headless)
        """
        logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
        service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"

        # Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        chrome_options.add_argument('--save-page-as-mhtml')  # enable MHTML support
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.add_argument('--window-size=1920,1080')

        # Hide automation fingerprints ("navigator.webdriver" etc.).
        # The original added excludeSwitches/useAutomationExtension twice;
        # once is sufficient.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """
        Save a web page as an MHTML file.
        :param url: target page URL
        :param output_path: output path (.mhtml); derived from the domain when None
        :param timeout: page-load timeout in seconds
        :param wait_time: extra wait after load, for dynamic content rendering
        :return: absolute path of the saved file
        """
        if output_path is None:
            parsed = urlparse(url)
            domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.mhtml"

        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'

        try:
            # Apply the load timeout.
            self.driver.set_page_load_timeout(timeout)

            # Inject anti-detection script before any document runs.
            # BUG FIX: the original script called `originalQuery` without ever
            # defining it (ReferenceError for non-notification permission
            # queries); capture the original function first.
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': '''
                    const originalQuery = window.navigator.permissions.query;
                    delete navigator.__proto__.webdriver;
                    window.navigator.permissions.query = (parameters) => {
                        return parameters.name === 'notifications' ?
                            Promise.resolve({ state: Notification.permission }) :
                            originalQuery(parameters);
                    };
                '''
            })

            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            # Window size is (re)applied after navigation.
            self.driver.set_window_size(1920, 1080)

            # Give dynamic content time to render (tunable).
            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)

            # Key step: capture the MHTML snapshot via CDP.
            logger.info("正在生成 MHTML 快照...")
            result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})

            # result['data'] is the MHTML document as a plain string.
            mhtml_content = result['data']

            # Write in text mode (UTF-8), preserving the original line endings.
            with open(output_path, 'w', encoding='utf-8', newline='') as f:
                f.write(mhtml_content)

            # Sanity-check the output file.
            file_size = os.path.getsize(output_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")

            logger.info(f"✅ MHTML 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
            return os.path.abspath(output_path)

        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        # Shut the browser down; safe to call once at the end of use.
        if self.driver:
            self.driver.quit()
            logger.info("浏览器已关闭")
|
||||
|
||||
|
||||
# ===== 测试入口 =====
|
||||
if __name__ == "__main__":
    # Demo entry point: snapshot one sample URL (replace with your own).
    test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"

    snapshotter = MHTMLSaver(headless=True)
    try:
        saved_path = snapshotter.save_as_mhtml(
            url=test_url,
            output_path="example.mhtml",
            timeout=30,
            wait_time=5,
        )
    except Exception as e:
        print(f"\n💥 保存失败: {e}")
    else:
        print(f"\n🎉 成功保存 MHTML 文件: {saved_path}")
    finally:
        snapshotter.quit()
|
||||
145
research/pdf_downloader/save_page_as_pdf.py
Normal file
145
research/pdf_downloader/save_page_as_pdf.py
Normal file
@ -0,0 +1,145 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
|
||||
# 配置日志
|
||||
# Log to both the console and a UTF-8 file in the working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('pdf_saver.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFSaver:
    """Print fully-rendered web pages to PDF via local Chrome + CDP Page.printToPDF."""

    def __init__(self, headless=True):
        """
        :param headless: run Chrome in headless mode
        """
        logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
        service = ChromeService(executable_path="D:/chromedriver.exe")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"

        # Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        # 'eager': return from get() at DOMContentLoaded, don't wait for subresources.
        chrome_options.page_load_strategy = 'eager'

        # Note: PDF printing does not need --save-page-as-mhtml.
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
        """
        Save a web page as a PDF file.
        :param url: target page URL
        :param output_path: output path (.pdf); derived from the domain when None
        :param timeout: page-load timeout in seconds
        :param wait_time: extra wait after load, for dynamic content rendering
        :param print_options: optional PDF print options, see
            https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
        :return: absolute path of the saved file
        """
        if output_path is None:
            parsed = urlparse(url)
            domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.pdf"

        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'

        # Default print options (A4 portrait, backgrounds on); overridable per call.
        default_print_options = {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,   # A4 width, inches
            'paperHeight': 11.69,  # A4 height, inches
        }
        if print_options:
            default_print_options.update(print_options)

        try:
            self.driver.set_page_load_timeout(timeout)

            # Inject anti-detection script before any document runs.
            # BUG FIX: the original captured `const originalQuery` AFTER
            # replacing permissions.query, so "originalQuery" pointed at the
            # replacement itself — infinite recursion for any permission
            # other than 'notifications'. Capture the original first.
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': '''
                    const originalQuery = window.navigator.permissions.query;
                    delete navigator.__proto__.webdriver;
                    window.navigator.permissions.query = (parameters) => {
                        return parameters.name === 'notifications' ?
                            Promise.resolve({ state: Notification.permission }) :
                            originalQuery(parameters);
                    };
                '''
            })

            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)

            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)

            logger.info("正在生成 PDF...")
            result = self.driver.execute_cdp_cmd('Page.printToPDF', default_print_options)

            # result['data'] is the Base64-encoded PDF document.
            pdf_data = base64.b64decode(result['data'])

            with open(output_path, 'wb') as f:
                f.write(pdf_data)

            # Sanity-check the output file.
            file_size = os.path.getsize(output_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")

            logger.info(f"✅ PDF 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
            return os.path.abspath(output_path)

        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        # Shut the browser down; safe to call once at the end of use.
        if self.driver:
            self.driver.quit()
            logger.info("浏览器已关闭")
|
||||
|
||||
|
||||
# ===== 测试入口 =====
|
||||
if __name__ == "__main__":
    # Demo entry point: print one sample URL to PDF.
    test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"

    pdf_saver = PDFSaver(headless=True)
    try:
        saved_path = pdf_saver.save_as_pdf(
            url=test_url,
            output_path="example.pdf",
            timeout=30,
            wait_time=5,
        )
    except Exception as e:
        print(f"\n💥 保存失败: {e}")
    else:
        print(f"\n🎉 成功保存 PDF 文件: {saved_path}")
    finally:
        pdf_saver.quit()
|
||||
190
research/pdf_downloader/save_remote_as_mhtml.py
Normal file
190
research/pdf_downloader/save_remote_as_mhtml.py
Normal file
@ -0,0 +1,190 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import (
|
||||
WebDriverException,
|
||||
TimeoutException,
|
||||
SessionNotCreatedException,
|
||||
InvalidSessionIdException
|
||||
)
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
# Console-only logging for the remote saver.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RemoteMHTMLSaver:
|
||||
def __init__(
|
||||
self,
|
||||
remote_url="http://144.34.185.108:28098/wd/hub",
|
||||
headless=True,
|
||||
max_retries=3,
|
||||
retry_delay=2
|
||||
):
|
||||
"""
|
||||
初始化远程 MHTML 保存器(支持自动重建 session)
|
||||
:param remote_url: 远程 Selenium 地址
|
||||
:param headless: 是否无头
|
||||
:param max_retries: 单次操作最大重试次数
|
||||
:param retry_delay: 重试前等待时间(秒)
|
||||
"""
|
||||
self.remote_url = remote_url
|
||||
self.headless = headless
|
||||
self.max_retries = max_retries
|
||||
self.retry_delay = retry_delay
|
||||
self.driver = None
|
||||
self._init_driver()
|
||||
|
||||
def _build_chrome_options(self):
|
||||
"""构建 Chrome 选项(可复用)"""
|
||||
chrome_options = Options()
|
||||
if self.headless:
|
||||
chrome_options.add_argument('--headless=new')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--window-size=1920,1080')
|
||||
chrome_options.add_argument(
|
||||
"--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
|
||||
)
|
||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
return chrome_options
|
||||
|
||||
def _init_driver(self):
|
||||
"""初始化或重新初始化 WebDriver"""
|
||||
if self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
except Exception:
|
||||
pass # 忽略关闭失败
|
||||
|
||||
logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
|
||||
for attempt in range(3):
|
||||
try:
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor=self.remote_url,
|
||||
options=self._build_chrome_options()
|
||||
)
|
||||
# 注入反检测脚本
|
||||
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
delete navigator.__proto__.webdriver;
|
||||
window.chrome = { runtime: {} };
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['zh-CN', 'zh', 'en']
|
||||
});
|
||||
'''
|
||||
})
|
||||
logger.info("✅ 远程 WebDriver 会话创建成功")
|
||||
return
|
||||
except Exception as e:
|
||||
logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
|
||||
if attempt < 2:
|
||||
time.sleep(2)
|
||||
else:
|
||||
raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")
|
||||
|
||||
def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
|
||||
"""
|
||||
保存网页为 MHTML,支持自动重试和 session 重建
|
||||
"""
|
||||
if output_path is None:
|
||||
domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
|
||||
output_path = f"{domain}.mhtml"
|
||||
if not output_path.lower().endswith('.mhtml'):
|
||||
output_path += '.mhtml'
|
||||
|
||||
last_exception = None
|
||||
|
||||
for retry in range(self.max_retries + 1):
|
||||
try:
|
||||
# 检查 driver 是否有效
|
||||
if not self.driver:
|
||||
self._init_driver()
|
||||
|
||||
self.driver.set_page_load_timeout(timeout)
|
||||
logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
|
||||
self.driver.get(url)
|
||||
time.sleep(wait_time)
|
||||
|
||||
logger.info("生成 MHTML 快照...")
|
||||
result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
|
||||
mhtml_content = result['data']
|
||||
|
||||
# 写入本地文件
|
||||
with open(output_path, 'w', encoding='utf-8', newline='') as f:
|
||||
f.write(mhtml_content)
|
||||
|
||||
file_size = os.path.getsize(output_path)
|
||||
if file_size == 0:
|
||||
raise RuntimeError("生成了空文件")
|
||||
|
||||
logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
|
||||
return os.path.abspath(output_path)
|
||||
|
||||
except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
|
||||
last_exception = e
|
||||
logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
|
||||
if retry < self.max_retries:
|
||||
logger.info("正在重建 WebDriver 会话...")
|
||||
self._init_driver()
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
logger.error("达到最大重试次数,放弃")
|
||||
break
|
||||
|
||||
except TimeoutException as e:
|
||||
last_exception = e
|
||||
logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
|
||||
if retry < self.max_retries:
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
logger.error(f"未知错误 (retry {retry + 1}): {e}")
|
||||
break # 非 WebDriver 错误,不重试
|
||||
|
||||
# 如果所有重试失败
|
||||
if os.path.exists(output_path):
|
||||
try:
|
||||
os.remove(output_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
raise RuntimeError(f"保存失败({type(last_exception).__name__}): {last_exception}")
|
||||
|
||||
def quit(self):
    """Shut down the remote browser session; safe to call repeatedly."""
    driver = self.driver
    if not driver:
        return
    try:
        driver.quit()
        logger.info("WebDriver 会话已关闭")
    except Exception:
        # Closing an already-dead session may raise; best-effort close is fine.
        pass
    self.driver = None
|
||||
|
||||
def __del__(self):
    # Best-effort cleanup on garbage collection; quit() is idempotent, so an
    # earlier explicit quit() is harmless.
    self.quit()
|
||||
|
||||
|
||||
# ===== Manual smoke test (hits a live remote Selenium node) =====
if __name__ == "__main__":
    saver = RemoteMHTMLSaver(
        remote_url="http://144.34.185.108:28098/wd/hub",  # replace with your cloud server's public IP
        headless=True
    )
    try:
        saver.save_as_mhtml(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.mhtml"
        )
    except Exception as e:
        print(f"❌ 失败: {e}")

    # Always release the remote session, even after a failed save.
    saver.quit()
|
||||
201
research/pdf_downloader/save_remote_as_pdf.py
Normal file
201
research/pdf_downloader/save_remote_as_pdf.py
Normal file
@ -0,0 +1,201 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import (
|
||||
WebDriverException,
|
||||
TimeoutException,
|
||||
SessionNotCreatedException,
|
||||
InvalidSessionIdException
|
||||
)
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RemotePDFSaver:
    """Render web pages to PDF through a remote Selenium (Chrome) node.

    The WebDriver session is rebuilt automatically when it dies, and each
    save operation retries on WebDriver/timeout errors.
    """

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2,
        print_options=None
    ):
        """
        Initialize the remote PDF saver (supports automatic session rebuild).

        :param remote_url: remote Selenium hub address
        :param headless: run Chrome in headless mode
        :param max_retries: maximum retries for one save operation
        :param retry_delay: seconds to wait before a retry
        :param print_options: PDF print options (DevTools Protocol Page.printToPDF)
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        # Default: A4 portrait with backgrounds; a caller-supplied dict wins.
        self.print_options = print_options or {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,  # A4 width (inches)
            'paperHeight': 11.69,  # A4 height (inches)
        }
        self.driver = None
        # Eagerly connect so construction fails fast when the hub is down.
        self._init_driver()

    def _build_chrome_options(self):
        """Build the Chrome options used for every (re)created session."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        # Fixed desktop UA to look less like an automated client.
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    def _init_driver(self):
        """Create (or re-create) the remote WebDriver session, up to 3 attempts.

        :raises RuntimeError: when the remote Selenium service is unreachable.
        """
        if self.driver:
            try:
                self.driver.quit()
            except Exception:
                pass  # ignore failures while discarding a dead session

        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        for attempt in range(3):
            try:
                self.driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject an anti-detection script before any page script runs.
                self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
                logger.info("✅ 远程 WebDriver 会话创建成功")
                return
            except Exception as e:
                logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
                if attempt < 2:
                    time.sleep(2)
                else:
                    raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5):
        """
        Save *url* as a PDF, with automatic retries and session rebuilds.

        :param url: page to render
        :param output_path: target file; derived from the domain when omitted
        :param timeout: page-load timeout in seconds
        :param wait_time: extra seconds to wait after load (lets JS settle)
        :return: absolute path of the written PDF
        :raises RuntimeError: when all attempts fail (any partial file is removed)
        """
        if output_path is None:
            # Derive a name from the second-level domain, e.g. "epochtimes.pdf".
            domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.pdf"
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'

        last_exception = None

        for retry in range(self.max_retries + 1):
            try:
                # Rebuild lazily if a previous attempt dropped the driver.
                if not self.driver:
                    self._init_driver()

                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)

                logger.info("生成 PDF...")
                # Page.printToPDF returns base64-encoded PDF bytes.
                result = self.driver.execute_cdp_cmd('Page.printToPDF', self.print_options)
                pdf_data = base64.b64decode(result['data'])

                # Write the PDF bytes to the local file (binary mode).
                with open(output_path, 'wb') as f:
                    f.write(pdf_data)

                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    raise RuntimeError("生成了空文件")

                logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
                return os.path.abspath(output_path)

            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                # Session-level failure: rebuild the driver before retrying.
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    logger.info("正在重建 WebDriver 会话...")
                    self._init_driver()
                    time.sleep(self.retry_delay)
                else:
                    logger.error("达到最大重试次数,放弃")
                    break

            except TimeoutException as e:
                # Slow page: retry with the same session.
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    break

            except Exception as e:
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break  # non-WebDriver errors are not retried

        # All retries failed: remove any partial output file before raising.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass

        raise RuntimeError(f"保存失败({type(last_exception).__name__}): {last_exception}")

    def quit(self):
        """Explicitly close the browser session; safe to call more than once."""
        if self.driver:
            try:
                self.driver.quit()
                logger.info("WebDriver 会话已关闭")
            except Exception:
                pass
            self.driver = None

    def __del__(self):
        # Best-effort cleanup on garbage collection; quit() is idempotent.
        self.quit()
|
||||
|
||||
|
||||
# ===== Manual smoke test (hits a live remote Selenium node) =====
if __name__ == "__main__":
    saver = RemotePDFSaver(
        remote_url="http://144.34.185.108:28098/wd/hub",  # replace with your cloud server's public IP
        headless=True
    )
    try:
        saver.save_as_pdf(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.pdf"
        )
    except Exception as e:
        print(f"❌ 失败: {e}")

    # Always release the remote session, even after a failed save.
    saver.quit()
|
||||
119
research/pdf_downloader/set_raw_title_kcna.py
Normal file
119
research/pdf_downloader/set_raw_title_kcna.py
Normal file
@ -0,0 +1,119 @@
|
||||
import pymysql
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
# ================== Configuration ==================

# MySQL connection parameters for the dsp database.
# NOTE(review): credentials are hard-coded in source; consider env vars or a
# secrets store.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
}

# Only records whose es_srcname appears here are processed (the values are
# used purely as a filter; no replacement is performed any more).
TARGET_SRCNAMES: List[str] = [
    "http://www.kcna.kp/cn/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf",
    # add further source names to process here
]
|
||||
|
||||
|
||||
# ================== 工具函数 ==================
|
||||
|
||||
def get_suffix_32(url: str) -> Optional[str]:
    """Return the trailing 32 characters of *url*, or None when *url* is
    falsy or shorter than 32 characters."""
    long_enough = bool(url) and len(url) >= 32
    return url[-32:] if long_enough else None
|
||||
|
||||
|
||||
def find_foreign_by_suffix(cursor, suffix: str, exclude_id: int) -> Optional[Tuple[str, str]]:
    """Find one foreign-language row whose es_urlname ends with *suffix*,
    skipping the row identified by *exclude_id* itself.

    Returns (es_urltitle, es_urlcontent) or None when nothing matches.
    """
    sql = """
        SELECT es_urltitle, es_urlcontent
        FROM indeximos
        WHERE
            es_sid != %s
            AND es_urlname IS NOT NULL
            AND CHAR_LENGTH(es_urlname) >= 32
            AND RIGHT(es_urlname, 32) = %s
        LIMIT 1
    """
    cursor.execute(sql, (exclude_id, suffix))
    row = cursor.fetchone()
    return row or None
|
||||
|
||||
|
||||
def update_chinese_record(cursor, record_id: int, title: str, content: str):
    """Overwrite es_title and es_content of the Chinese row *record_id*.

    The caller owns the transaction (commit/rollback happen elsewhere).
    """
    sql = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(sql, (title, content, record_id))
|
||||
|
||||
|
||||
# ================== 主逻辑 ==================
|
||||
|
||||
def main():
    """Match Chinese KCNA rows to their foreign-language originals by the
    trailing 32 characters of the article URL, then copy title/content over.

    All updates happen in one transaction: commit on success, rollback on
    any error.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定任何目标 es_srcname,程序退出。")
        return

    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()

    try:
        # Load every Chinese row for the configured source sites.
        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
        query = f"""
            SELECT es_sid, es_srcname, es_urlname
            FROM indeximos
            WHERE es_srcname IN ({placeholders})
            AND es_urlname IS NOT NULL
            AND es_urlname != ''
            AND es_loadtime > '2026-01-16 10:40:00'
        """
        cursor.execute(query, TARGET_SRCNAMES)
        records = cursor.fetchall()
        total = len(records)
        print(f"共加载 {total} 条来自 {TARGET_SRCNAMES} 的记录用于匹配...")

        updated_count = 0
        skipped_short = 0

        for idx, (record_id, es_srcname, es_urlname) in enumerate(records, 1):
            suffix = get_suffix_32(es_urlname)
            if suffix is None:
                # URL shorter than 32 chars: no usable matching key.
                skipped_short += 1
                continue

            foreign_data = find_foreign_by_suffix(cursor, suffix, record_id)
            if foreign_data:
                title, content = foreign_data
                update_chinese_record(cursor, record_id, title, content)
                updated_count += 1
                print(f"[{idx}/{total}] ✅ 已更新 ID={record_id} | src={es_srcname}")

        conn.commit()
        print("\n" + "=" * 50)
        print(f"✅ 匹配完成!")
        print(f" - 成功更新: {updated_count} 条")
        print(f" - 因 URL 长度 <32 跳过: {skipped_short} 条")
        print(f" - 总处理: {total} 条")

    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误,已回滚: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run one matching/update pass.
    main()
|
||||
160
research/pdf_downloader/set_raw_title_rodong.py
Normal file
160
research/pdf_downloader/set_raw_title_rodong.py
Normal file
@ -0,0 +1,160 @@
|
||||
import pymysql
|
||||
import jieba
|
||||
from collections import Counter
|
||||
from typing import List, Tuple, Set
|
||||
|
||||
# ================== Configuration ==================

# MySQL connection parameters for the dsp database.
# NOTE(review): credentials are hard-coded in source; consider env vars.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
}

# Chinese-language sources (es_srcname) to process.
TARGET_SRCNAMES: List[str] = [
    "http://www.rodong.rep.kp/cn/index.php?MUBAMUAxQA==",
    # add your sources here
]

# Korean-language counterpart source.
# NOTE(review): not referenced anywhere in this module's visible code —
# confirm it is still needed.
FOREIGN_SRCNAME = 'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA=='

# Similarity threshold (keyword overlap ratio); 0.3–0.6 is the suggested range.
SIMILARITY_THRESHOLD = 0.3
|
||||
|
||||
|
||||
# ================== 文本相似度函数 ==================
|
||||
|
||||
def extract_keywords(text: str) -> Set[str]:
    """Tokenize Chinese *text* with jieba and keep alphabetic tokens of
    length >= 2 (drops single characters, digits and punctuation)."""
    if not text:
        return set()
    keywords = set()
    for token in jieba.lcut(text):
        if len(token) >= 2 and token.isalpha():
            keywords.add(token)
    return keywords
|
||||
|
||||
|
||||
def keyword_overlap_similarity(title1: str, title2: str) -> float:
    """Keyword-overlap ratio between two titles: |kw1 ∩ kw2| / max(|kw1|, |kw2|)."""
    kw1 = extract_keywords(title1)
    kw2 = extract_keywords(title2)

    if not (kw1 or kw2):
        # Neither title yielded keywords: identical raw strings count as a match.
        return 1.0 if title1 == title2 else 0.0
    if not kw1 or not kw2:
        return 0.0

    shared = kw1 & kw2
    return len(shared) / max(len(kw1), len(kw2))
|
||||
|
||||
|
||||
# ================== 数据库操作 ==================
|
||||
|
||||
def get_chinese_records(cursor) -> List[Tuple]:
    """Fetch the Chinese-source rows that are candidates for matching.

    Returns an empty list when no target sources are configured.
    """
    if not TARGET_SRCNAMES:
        return []
    in_clause = ','.join(['%s'] * len(TARGET_SRCNAMES))
    sql = f"""
        SELECT es_sid, es_srcname, es_urlname, es_urltitle, es_urltime
        FROM indeximos
        WHERE es_srcname IN ({in_clause})
        AND es_urltitle IS NOT NULL AND TRIM(es_urltitle) != ''
        AND es_urltime IS NOT NULL
        AND es_loadtime > '2026-01-16 10:40:00'
    """
    cursor.execute(sql, TARGET_SRCNAMES)
    return cursor.fetchall()
|
||||
|
||||
|
||||
def get_foreign_candidates_by_time(cursor, pub_time) -> List[Tuple]:
    """Return all foreign-language rows published exactly at *pub_time*,
    restricted to rows with a translated title and non-null content."""
    sql = """
        SELECT es_sid, es_title, es_urltitle, es_urlcontent
        FROM indeximos
        WHERE es_urltime = %s
        AND es_title IS NOT NULL AND TRIM(es_title) != ''
        AND es_urlcontent IS NOT NULL
        AND es_loadtime > '2026-01-16 10:40:00'
    """
    cursor.execute(sql, (pub_time,))
    return cursor.fetchall()
|
||||
|
||||
|
||||
def update_chinese_record(cursor, record_id: int, new_title: str, content: str):
    """Overwrite the title and content of the Chinese row *record_id*.

    The caller owns the transaction (commit/rollback happen elsewhere).
    """
    sql = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(sql, (new_title, content, record_id))
|
||||
|
||||
|
||||
# ================== 主逻辑 ==================
|
||||
|
||||
def main():
    """Match Chinese Rodong rows to foreign originals published at the same
    time by comparing title keyword overlap, then copy title/content over.

    All updates happen in one transaction: commit on success, rollback on
    any error.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定目标站点,退出。")
        return

    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()

    try:
        chinese_records = get_chinese_records(cursor)
        total = len(chinese_records)
        print(f"共加载 {total} 条中文记录用于匹配...")

        matched_count = 0

        for idx, (cid, srcname, urlname, zh_title, pub_time) in enumerate(chinese_records, 1):
            print(f"\n[{idx}/{total}] ID={cid}, 时间={pub_time}, 标题='{zh_title[:30]}...'")

            # Candidate set: every foreign row with the same publish time.
            candidates = get_foreign_candidates_by_time(cursor, pub_time)
            if not candidates:
                print(" → 无同时间且有翻译标题的外文记录")
                continue

            best_score = 0.0
            best_candidate = None

            for fid, trans_title, ori_title, content in candidates:
                # Skip self-matches (shouldn't happen, but be safe).
                if fid == cid:
                    continue

                score = keyword_overlap_similarity(zh_title, trans_title)
                print(f" 候选ID={fid} | 翻译标题='{trans_title[:30]}...' | 重合度={score:.3f}")

                if score > best_score:
                    best_score = score
                    best_candidate = (ori_title, content)

            # Only apply the best match if it clears the configured threshold.
            if best_candidate and best_score >= SIMILARITY_THRESHOLD:
                final_title, final_content = best_candidate
                update_chinese_record(cursor, cid, final_title, final_content)
                matched_count += 1
                print(f" ✅ 匹配成功! 重合度={best_score:.3f}")
            else:
                print(f" ❌ 未达阈值(最高相似度={best_score:.3f})")

        conn.commit()
        print("\n" + "=" * 50)
        print(f"✅ 匹配完成!成功关联 {matched_count} / {total} 条记录。")

    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误,已回滚: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run one matching/update pass.
    main()
|
||||
172
research/pdf_downloader/translate-news.py
Normal file
172
research/pdf_downloader/translate-news.py
Normal file
@ -0,0 +1,172 @@
|
||||
import time
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
import pymysql
|
||||
import requests
|
||||
|
||||
# ================== Configuration ==================

# Database connection settings.
# NOTE(review): credentials are hard-coded in source; consider env vars.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
    'autocommit': False  # transactions are committed/rolled back manually
}

# Translation API endpoint (replace with your server IP or domain).
TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"

# Only rows loaded after this time are processed (format: YYYY-MM-DD HH:MM:SS).
LOADTIME_AFTER = "2026-02-10 11:59:00"

# Target source sites.
TARGET_SRCNAMES = [
    'https://www.38north.org/'  # add your sites here
]

# Delay between API requests (seconds) to avoid rate limiting.
REQUEST_DELAY = 1

# Maximum text length per request (must match the API's limit).
MAX_TEXT_LENGTH = 5000
|
||||
|
||||
|
||||
def normalize_newlines(text: str) -> str:
    """Collapse CRLF and bare CR line endings to LF.

    Falsy input (None, "") is passed through unchanged.
    """
    if not text:
        return text
    unified = text.replace('\r\n', '\n')
    return unified.replace('\r', '\n')
|
||||
|
||||
|
||||
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one text fragment through the HTTP translation API.

    Returns "" for blank input, the translated text on success, and None
    when the request fails for any reason.
    """
    if not text or not text.strip():
        return ""

    request_body = {
        "text": text[:MAX_TEXT_LENGTH],  # keep within the API's size limit
        "source_lang": source_lang,
        "target_lang": target_lang
    }

    try:
        resp = requests.post(TRANSLATE_API_URL, json=request_body, timeout=10)
        resp.raise_for_status()
        return resp.json().get("translated_text")
    except Exception as e:
        print(f"⚠️ 翻译失败: {e}")
        return None
|
||||
|
||||
|
||||
def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* paragraph by paragraph, tolerating failures.

    Blank lines are preserved; a paragraph whose translation fails is
    replaced with an empty line. Returns the reassembled text joined by LF.
    """
    if not content:
        return ""

    out_lines = []
    for para in normalize_newlines(content).split('\n'):
        if not para.strip():
            out_lines.append("")  # keep blank lines, no API call, no delay
            continue

        trans = translate_single(para)
        if trans is None:
            # Failed paragraph: drop it (append(para) would keep the original).
            print(f" ⚠️ 段落翻译失败,跳过: {para[:30]}...")
            out_lines.append("")
        else:
            out_lines.append(trans)

        time.sleep(REQUEST_DELAY)  # throttle to avoid API rate limiting

    return '\n'.join(out_lines)
|
||||
|
||||
|
||||
# ================== 数据库操作 ==================
|
||||
|
||||
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Persist one translated row.

    Writes *new_title* into es_abstract and *new_content* into es_content for
    the row identified by *es_sid*. The caller owns the transaction.
    """
    # Fix: the placeholders were written as "% s" (with a space). PyMySQL's
    # paramstyle requires the literal token "%s"; the spaced form only worked
    # by accident of Python %-formatting and is fragile.
    update_query = """
        UPDATE indeximos
        SET es_abstract = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
|
||||
|
||||
|
||||
# ================== 主逻辑 ==================
|
||||
|
||||
def main():
    """Translate untranslated rows (title + content) and store the results.

    One transaction covers the whole run: commit after the loop, rollback on
    any error.
    """
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()

    try:
        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
        # NOTE(review): the es_srcname IN (...) clause is commented out with
        # SQL "--", but PyMySQL appears to still substitute the %s tokens
        # inside SQL comments, so params must keep 1 + len(TARGET_SRCNAMES)
        # values — confirm before removing either the clause or the params.
        query = f"""
            SELECT es_sid, es_urltitle, es_urlcontent
            FROM indeximos
            WHERE es_loadtime > %s
            AND (es_content IS NULL OR TRIM(es_content) = '')
            -- AND es_srcname IN ({placeholders})
            AND LENGTH(es_video) > 5
        """
        params = [LOADTIME_AFTER] + TARGET_SRCNAMES
        cursor.execute(query, params)
        records: List[Tuple] = cursor.fetchall()

        total = len(records)
        print(f"✅ 共找到 {total} 条待翻译记录")

        if total == 0:
            return

        success_count = 0

        for idx, (es_sid, urltitle, urlcontent) in enumerate(records, 1):
            print(f"\n[{idx}/{total}] 处理 es_sid={es_sid}")
            start_time = time.time()

            # Translate the title; a failed title skips the whole row.
            title_trans = translate_single(urltitle) if urltitle else ""
            if title_trans is None:
                print(" → 标题翻译失败,跳过整条")
                continue

            # Translate the body paragraph by paragraph (failure-tolerant).
            content_trans = translate_content_with_paragraphs(urlcontent)

            # Stage the update; the commit happens once after the loop.
            update_record(cursor, es_sid, title_trans, content_trans)
            success_count += 1

            elapsed = time.time() - start_time
            print(f" ✅ 翻译成功 | 耗时: {elapsed:.2f}s | 标题: {title_trans[:30]}...")

        conn.commit()

        print(f"\n🎉 完成!成功翻译 {success_count} / {total} 条记录")

    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run one translation pass.
    main()
|
||||
@ -1,4 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
import os
import sys
import platform
|
||||
|
||||
@ -170,3 +170,31 @@ class TelegramMember(scrapy.Item):
|
||||
role = scrapy.Field() # 成员角色:c-创建者、a-管理员、u-普通成员
|
||||
mobile = scrapy.Field() # 成员手机号
|
||||
profile_photo = scrapy.Field() # 头像 blob
|
||||
|
||||
|
||||
class TwitterUserInfoItem(scrapy.Item):
    """Scrapy Item mapping one row of the twitter_user_info database table."""

    crawl_time = scrapy.Field()  # DATETIME - when this record was crawled
    is_newest = scrapy.Field()  # TINYINT(1) - whether this is the latest snapshot
    platform_type = scrapy.Field()  # VARCHAR(20) - platform identifier

    user_id = scrapy.Field()  # BIGINT UNSIGNED - unique Twitter user id
    username = scrapy.Field()  # VARCHAR(50) - handle (the part after @)
    nickname = scrapy.Field()  # VARCHAR(100) - display name
    user_url = scrapy.Field()  # VARCHAR(255) - profile page URL
    user_link = scrapy.Field()  # VARCHAR(255) - user link
    avatar_url = scrapy.Field()  # VARCHAR(500) - original avatar URL
    avatar_path = scrapy.Field()  # VARCHAR(255) - local avatar path
    # NOTE(review): "backgroud" below is misspelled but presumably matches the
    # DB column / producers - confirm before renaming.
    backgroud_image_url = scrapy.Field()  # VARCHAR(255) - original background image URL
    background_image_path = scrapy.Field()  # VARCHAR(255) - local background image path
    intro = scrapy.Field()  # TEXT - bio/introduction
    city = scrapy.Field()  # VARCHAR(100) - city
    join_date = scrapy.Field()  # DATETIME - account creation time
    signature = scrapy.Field()  # VARCHAR(255) - user signature
    tags = scrapy.Field()  # VARCHAR(255) - tags: official rep / media entity / celebrity
    post_count = scrapy.Field()  # INT UNSIGNED - tweet count
    is_verified = scrapy.Field()  # VARCHAR(10) - verified flag ("True"/"False")
    follow_count = scrapy.Field()  # INT UNSIGNED - following count
    fans_count = scrapy.Field()  # INT UNSIGNED - follower count
    image_urls = scrapy.Field()  # image URLs - presumably consumed by an images pipeline; confirm
|
||||
|
||||
@ -7,6 +7,9 @@ import logging
|
||||
import os
|
||||
import tarfile
|
||||
import time
|
||||
from scrapy.exceptions import DropItem
|
||||
import uuid
|
||||
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
@ -265,3 +268,179 @@ class TelegramDataSaveToMySQL(object):
|
||||
except pymysql.err.DataError as de:
|
||||
logging.error(repr(de))
|
||||
return item
|
||||
|
||||
class TwitterUserDataSaveToMySQL(object):
    """Pipeline that upserts TwitterUserInfoItem rows into dsp.twitter_user_info.

    A stable UUID derived from the Twitter user_id serves as the primary key,
    so re-crawls update the existing row instead of inserting duplicates.
    """

    def __init__(self):
        self.db = None
        self.cursor = None
        # Field names that differed in the most recent _needs_update() call.
        # NOTE(review): "fileds" is a historical typo; the name is kept so any
        # external code poking at this attribute keeps working.
        self.update_fileds = []

    def open_spider(self, spider):
        # NOTE(review): credentials are hard-coded; consider moving to settings.
        self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A',
                                  db='dsp', charset='utf8mb4')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        if self.cursor:
            self.cursor.close()
        if self.db:
            self.db.close()

    def process_item(self, item, spider):
        # Only handle Twitter user items; pass everything else through untouched.
        if item.__class__.__name__ != 'TwitterUserInfoItem':
            return item
        self.table_name = "twitter_user_info"

        self.extract_avatar_and_background_paths(item)
        try:
            user_id = item.get('user_id')
            if not user_id:
                logging.warning("缺少 user_id,跳过处理。")
                return item

            # 1. Derive a stable UUID from user_id (namespace + string).
            stable_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"twitter_user_{user_id}"))

            # 2. Check whether the row already exists.
            existing = self._select_by_uuid(stable_uuid)

            if existing:
                # 3. Compare fields to decide whether an update is needed.
                if self._needs_update(existing, item):
                    # 4. Apply the update.
                    self._update_item(stable_uuid, item)
                    logging.info(f"用户 {user_id} 数据已更新。")
                else:
                    logging.debug(f"用户 {user_id} 数据无变化,跳过更新。")
            else:
                # 5. Insert a brand-new row.
                self._insert_item_with_uuid(stable_uuid, item)
                logging.info(f"用户 {user_id} 新数据已插入。")

        except Exception as e:
            spider.logger.error(f"处理用户数据失败 (user_id={item.get('user_id')}): {e}")
            raise DropItem(f"Database error: {e}")

        return item

    def _select_by_uuid(self, record_uuid):
        """Fetch the full row keyed by *record_uuid* as a dict, or None."""
        sql = f"SELECT * FROM dsp.{self.table_name} WHERE id = %s"
        self.cursor.execute(sql, (record_uuid,))
        row = self.cursor.fetchone()
        if row:
            columns = [desc[0] for desc in self.cursor.description]
            return dict(zip(columns, row))
        return None

    def _needs_update(self, db_record, item):
        """Return True when any comparable field differs between the DB row and *item*.

        Side effect: records the differing field names in self.update_fileds
        for the subsequent _update_item() call.
        """
        # Bug fix: reset the diff list for every comparison. It previously
        # accumulated across items, so once one item differed, every later
        # item was also reported as changed and updated with stale field names.
        self.update_fileds = []
        for field in item.fields:
            # Bookkeeping / non-comparable columns are never diffed.
            if field in ['id', 'created_at', 'updated_at', 'image_urls', 'crawl_time', 'join_date']:
                continue

            item_val = item.get(field)
            db_val = db_record.get(field)

            # Normalize empties: None and '' compare as equal.
            if item_val is None or item_val == '':
                item_val = None
            if db_val is None or db_val == '':
                db_val = None

            if item_val != db_val:
                self.update_fileds.append(field)

        return len(self.update_fileds) > 0

    def _update_item(self, record_uuid, item):
        """Update only the fields flagged by _needs_update(), plus updated_at."""
        update_fields = []
        update_vals = []

        for field in self.update_fileds:
            if field in ['id', 'created_at', 'image_urls']:
                continue
            value = item.get(field)

            # Never overwrite an existing column with an empty value.
            if value is None or value == '':
                continue

            update_fields.append(f"{field} = %s")
            update_vals.append(value)

        if not update_fields:
            return

        update_vals.append(record_uuid)  # WHERE id = %s
        sql = f"UPDATE dsp.{self.table_name} SET {', '.join(update_fields)}, updated_at = CURRENT_TIMESTAMP WHERE id = %s"
        self.cursor.execute(sql, update_vals)
        self.db.commit()

    def _insert_item_with_uuid(self, record_uuid, item):
        """Insert a new row using *record_uuid* as the primary key."""
        cols = ['id']
        vals = [record_uuid]

        for field in item.fields:
            if field in ['image_urls', 'id']:
                continue

            value = item.get(field)

            # avatar_path may arrive as dict / list / str depending on the
            # image pipeline; normalize it to a plain path string.
            if field == 'avatar_path':
                if isinstance(value, list) and len(value) > 0:
                    value = value[0].get('path', '') if isinstance(value[0], dict) else str(value[0])
                elif isinstance(value, dict):
                    value = value.get('path', '')
                else:
                    value = str(value) if value else ''

            # Skip None and empty strings entirely.
            if value is None or value == '':
                continue

            cols.append(field)
            vals.append(value)

        if not cols:
            logging.warning("没有有效的字段可供插入。")
            return

        placeholders = ', '.join(['%s'] * len(cols))
        cols_str = ', '.join(cols)
        sql = f"INSERT INTO dsp.twitter_user_info ({cols_str}) VALUES ({placeholders})"

        try:
            self.cursor.execute(sql, vals)
            self.db.commit()
        except pymysql.err.IntegrityError as ie:
            # Duplicate key: another process/run already inserted this row.
            self.db.rollback()
            logging.debug(f"数据重复,已跳过插入:{ie}")
        except pymysql.err.DataError as de:
            self.db.rollback()
            logging.error(f"数据格式错误(如字段超长、类型不匹配等):{de}")
            raise
        except Exception as e:
            self.db.rollback()
            logging.error(f"数据库操作发生未知错误:{e}")
            raise

    def extract_avatar_and_background_paths(self, item):
        """Split the image pipeline's result list into avatar/background paths."""
        value = item.get('avatar_path', [])
        if not isinstance(value, list):
            value = []

        def get_path(val):
            # Image pipeline entries are dicts with a 'path' key; tolerate strings.
            return val.get('path', '') if isinstance(val, dict) else str(val)

        avatar = get_path(value[0]) if len(value) > 0 else None
        background = get_path(value[1]) if len(value) > 1 else None

        item['avatar_path'] = avatar
        item['background_image_path'] = background
|
||||
|
||||
@ -59,7 +59,24 @@ class SeleniumMiddleware:
|
||||
# Edge in headless mode
|
||||
edge_options = EdgeOptions()
|
||||
edge_options.use_chromium = True
|
||||
self.driver = Edge(executable_path='MicrosoftWebDriver.exe', options=edge_options)
|
||||
# edge_options.add_argument("--headless")
|
||||
# 隐藏“受自动化软件控制”提示栏
|
||||
edge_options.add_argument('--disable-blink-features=AutomationControlled')
|
||||
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||
# 禁用自动化扩展
|
||||
edge_options.add_experimental_option('useAutomationExtension', False)
|
||||
edge_options.add_argument(
|
||||
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0")
|
||||
edge_options.add_argument("--window-size=1920,1080")
|
||||
# 设置浏览器的 高级偏好设置
|
||||
prefs = {
|
||||
# "profile.managed_default_content_settings.images": 2, # 禁用图片加载:2 表示“禁止”,1 表示“允许”
|
||||
"credentials_enable_service": False, # 禁用保存密码提示
|
||||
"profile.password_manager_enabled": False # 禁用密码管理器
|
||||
}
|
||||
edge_options.add_experimental_option("prefs", prefs)
|
||||
|
||||
self.driver = Edge(executable_path=r"D:\msedgedriver.exe", options=edge_options)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
@ -93,7 +110,7 @@ class SeleniumMiddleware:
|
||||
self.proxy_count = 0
|
||||
ip = request.meta['proxy'].split(':')[1][2:]
|
||||
port = int(request.meta['proxy'].split(':')[2])
|
||||
user_agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
|
||||
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0'
|
||||
self.driver.get("about:config")
|
||||
script = '''
|
||||
var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
BOT_NAME = 'MediaSpiders'
|
||||
|
||||
LOG_LEVEL = 'INFO'
|
||||
# LOG_LEVEL = 'DEBUG'
|
||||
|
||||
SPIDER_MODULES = ['MediaSpiders.spiders']
|
||||
NEWSPIDER_MODULE = 'MediaSpiders.spiders'
|
||||
@ -34,6 +35,12 @@ MYSQL_DB_USER = 'root'
|
||||
MYSQL_DB_PASSWD = 'passok123A'
|
||||
MYSQL_DB_SCHEMA = 'oscm'
|
||||
|
||||
TWITTER_USER_MYSQL_DB_HOST = '47.113.231.200'
|
||||
TWITTER_USER_MYSQL_DB_PORT = 28089
|
||||
TWITTER_USER_MYSQL_DB_USER = 'root'
|
||||
TWITTER_USER_MYSQL_DB_PASSWD = 'passok123A'
|
||||
TWITTER_USER_MYSQL_DB_SCHEMA = 'dsp'
|
||||
|
||||
CRAWL_JOB_UPDATE_API = 'http://47.115.228.133:28081/api/open/crawljob'
|
||||
WORD_BANK_QUERY_API = 'http://47.115.228.133:28081/api/open/wordBank/queryAll'
|
||||
RULES_PARSER_QUERY_API = 'http://47.115.228.133:28081/api/rules/parser/queryPageable/0/1'
|
||||
@ -44,6 +51,7 @@ BATCH_SAVE_SIZE = 5
|
||||
|
||||
TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
|
||||
FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
|
||||
LINKEDIN_FILTER_KEY = 'URL_Filter:MediaSpiders:Linkedin_Filter'
|
||||
YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
|
||||
WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
|
||||
WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'
|
||||
@ -51,6 +59,7 @@ FLICKR_FILTER_KEY = 'URL_Filter:MediaSpiders:Flickr_Filter'
|
||||
|
||||
TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
|
||||
FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
|
||||
LINKEDIN_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Linkedin_Filter'
|
||||
YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
|
||||
WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
|
||||
WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'
|
||||
@ -58,10 +67,16 @@ FLICKR_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Flickr_Filter'
|
||||
|
||||
WECHAT_LINKS_KEY = "MediaSpiders:Wechat_links"
|
||||
|
||||
# TWITTER_BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" # old
|
||||
TWITTER_API_KEY = "JFY7dt"
|
||||
TWITTER_BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAAO8MTQEAAAAAQWidbP34N0nykDnUEDweEpyRgsc%3Dxt0hX1whV1hlmbMsStkB7ZU3pjXOINOCh2DMPoIAwljwrOWgvE"
|
||||
TWITTER_ACCESS_TOKEN = "1294829483816398849-gscLJCEF9ZObZJikjCmjXtxoW6YVWu"
|
||||
TWITTER_ACCESS_TOKEN_SECRET = "1XvTHZXzN0JBQulTBOvCTgXVPzVGYWe50zH1r4qXLper3"
|
||||
|
||||
SOCIAL_USER_QUERY_ALL_API = "http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy={sortBy}&shuffleResult={shuffleResult}"
|
||||
SOCIAL_USER_UPDATE_API = "http://47.115.228.133:28081/api/open/target/social/update"
|
||||
WEIBO_USER_TYPE = 0
|
||||
TWITTER_USER_TYPE = 1
|
||||
TWITTER_USER_TYPE = 21
|
||||
FACEBOOK_USER_TYPE = 2
|
||||
YOUTUBE_USER_TYPE = 3
|
||||
FLICKR_USER_TYPE = 4
|
||||
@ -73,21 +88,31 @@ TWITTER_URL_KEY = 'MediaSpiders:Twitter_URL_Key'
|
||||
TWITTER_PID_KEY = ''
|
||||
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
|
||||
|
||||
# CUSTOM_USER_AGENT = [
|
||||
# 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
|
||||
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
|
||||
# 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
|
||||
# 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
|
||||
# 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
|
||||
# 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0',
|
||||
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E',
|
||||
# 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
|
||||
# ]
|
||||
|
||||
CUSTOM_USER_AGENT = [
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
|
||||
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
|
||||
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0',
|
||||
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E',
|
||||
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.1958',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 OPR/117.0.0.',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.3'
|
||||
]
|
||||
|
||||
# 部署在外网采集fb时使用selenium_chrome
|
||||
SELENIUM_DRIVER_NAME = 'chrome'
|
||||
SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
||||
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
||||
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
||||
SELENIUM_DRIVER_ARGUMENTS = [
|
||||
'--headless',
|
||||
'--no-sandbox',
|
||||
@ -184,6 +209,14 @@ EXTENSIONS = {
|
||||
'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
|
||||
}
|
||||
|
||||
############################## 翻译
|
||||
MAX_TEXT_LENGTH = 100
|
||||
# 翻译 API 地址(替换为你的服务器 IP 或域名)
|
||||
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
|
||||
# 单次请求间隔(秒),避免 API 被限流
|
||||
REQUEST_DELAY = 1
|
||||
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
# EXTENSIONS = {
|
||||
|
||||
@ -0,0 +1,369 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import hashlib
|
||||
import re
|
||||
import time
|
||||
import random
|
||||
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.common.exceptions import (
|
||||
TimeoutException,
|
||||
StaleElementReferenceException,
|
||||
WebDriverException
|
||||
)
|
||||
import scrapy
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from scrapy.exceptions import CloseSpider
|
||||
|
||||
from MediaSpiders.utils.string_utils import get_str_md5
|
||||
from MediaSpiders.utils.time_utils import get_current_timestamp
|
||||
|
||||
|
||||
class BaiduHotSearchSprder(scrapy.Spider):
|
||||
name = 'BaiduHotSearchSprder'
|
||||
comment_urls = []
|
||||
custom_settings = {
|
||||
'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
|
||||
'PROTO_CLASS_NAME': 'EsSets',
|
||||
'PROTO_FIELD_NAME': 'Es',
|
||||
'PROTO_SAVE_FILE_NAME': 'public_info_data_',
|
||||
'IMAGES_STORE': r'/usr/local/temp_image/twitter',
|
||||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||||
'FILES_STORE': r'/usr/local/videos',
|
||||
'FILES_RESULT_FIELD': 'es_video',
|
||||
'ZIP_FILE_NAME': 'image_data_publicinfo_',
|
||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
||||
'ITEM_PIPELINES': {
|
||||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||||
'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
|
||||
# 'MediaSpiders.pipelines.TwitterUserDataSaveToMySQL': 300,
|
||||
},
|
||||
'SPIDER_MIDDLEWARES': {
|
||||
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
|
||||
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
|
||||
},
|
||||
'DOWNLOADER_MIDDLEWARES': {},
|
||||
'BATCH_SAVE_SIZE': 50
|
||||
}
|
||||
|
||||
start_urls = 'https://top.baidu.com/board?tab=realtime'
|
||||
|
||||
def __init__(self, params=None, *args, **kwargs):
|
||||
super(BaiduHotSearchSprder, self).__init__(*args, **kwargs)
|
||||
self.job_id = None
|
||||
self.collected_items = 0
|
||||
self.max_items = 50 # 设定最大爬取数量,防止无限循环
|
||||
self.retry_count = 0
|
||||
self.max_retries = 3
|
||||
|
||||
if params:
|
||||
try:
|
||||
json_params = json.loads(params)
|
||||
if 'job_id' in json_params:
|
||||
self.job_id = json_params['job_id']
|
||||
if 'max_items' in json_params:
|
||||
self.max_items = int(json_params['max_items'])
|
||||
except Exception as e:
|
||||
self.logger.error(f"解析参数失败: {str(e)}")
|
||||
|
||||
def start_requests(self):
|
||||
"""发起初始请求"""
|
||||
self.logger.info(f"开始爬取百度热搜,任务ID: {self.job_id if self.job_id else 'N/A'}")
|
||||
self.url_time = get_current_timestamp()
|
||||
yield SeleniumRequest(
|
||||
url=self.start_urls,
|
||||
callback=self.parse,
|
||||
meta={'retry_count': 0},
|
||||
wait_time=5
|
||||
)
|
||||
|
||||
def parse(self, response):
|
||||
"""解析热搜榜单数据"""
|
||||
self.logger.info("开始解析百度热搜数据...")
|
||||
|
||||
driver = response.request.meta['driver']
|
||||
|
||||
try:
|
||||
# 设置页面大小避免元素不可见
|
||||
driver.set_window_size(1400, 1000)
|
||||
|
||||
# 访问主域,确保Cookie正确设置
|
||||
driver.get(self.start_urls)
|
||||
|
||||
# 等待主要内容加载
|
||||
try:
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.CSS_SELECTOR, ".category-wrap_iQLoo, .board-item"))
|
||||
)
|
||||
self.logger.info("页面主要内容加载完成")
|
||||
except TimeoutException:
|
||||
self.logger.warning("等待主要内容超时,尝试直接处理可用元素")
|
||||
|
||||
# 滚动页面以确保所有元素加载
|
||||
self._scroll_page(driver)
|
||||
|
||||
# 尝试多种选择器策略,提高兼容性
|
||||
hot_search_items = self._get_hot_search_items(driver)
|
||||
|
||||
if not hot_search_items:
|
||||
self.logger.error("未找到任何热搜项,检查页面结构是否发生变化")
|
||||
self.retry_count += 1
|
||||
if self.retry_count <= self.max_retries:
|
||||
self.logger.info(f"重试第 {self.retry_count}/{self.max_retries} 次")
|
||||
driver.refresh()
|
||||
return SeleniumRequest(
|
||||
url=self.start_urls,
|
||||
callback=self.parse,
|
||||
meta={'retry_count': self.retry_count},
|
||||
dont_filter=True,
|
||||
wait_time=5
|
||||
)
|
||||
else:
|
||||
self.logger.error("达到最大重试次数,终止爬虫")
|
||||
raise CloseSpider("页面结构可能已更改,无法提取数据")
|
||||
|
||||
self.logger.info(f"找到 {len(hot_search_items)} 个热搜项")
|
||||
hot_search_items_list = []
|
||||
# 处理每个热搜项
|
||||
for index, item in enumerate(hot_search_items):
|
||||
try:
|
||||
hot_search_item = self._extract_hot_search_data(item, driver)
|
||||
if hot_search_item:
|
||||
self.collected_items += 1
|
||||
# hot_search_item['es_simrank'] = self.collected_items
|
||||
self.logger.info(f"成功提取第 {self.collected_items} 条数据: {hot_search_item['es_urltitle']}")
|
||||
# hot_search_items_list.append(hot_search_items)
|
||||
yield hot_search_item
|
||||
except StaleElementReferenceException:
|
||||
self.logger.warning(f"第 {index + 1} 项元素已失效,跳过处理")
|
||||
continue
|
||||
except Exception as e:
|
||||
self.logger.error(f"处理第 {index + 1} 项时出错: {str(e)}", exc_info=True)
|
||||
continue
|
||||
|
||||
self.logger.info(f"本次爬取共收集 {self.collected_items} 条有效数据")
|
||||
|
||||
except WebDriverException as e:
|
||||
self.logger.error(f"WebDriver异常: {str(e)}", exc_info=True)
|
||||
if 'retry_count' not in response.meta or response.meta['retry_count'] < self.max_retries:
|
||||
retry_count = response.meta.get('retry_count', 0) + 1
|
||||
self.logger.info(f"尝试重新请求,重试次数: {retry_count}")
|
||||
yield SeleniumRequest(
|
||||
url=self.start_urls,
|
||||
callback=self.parse,
|
||||
meta={'retry_count': retry_count},
|
||||
dont_filter=True,
|
||||
wait_time=5 + retry_count * 2 # 指数退避
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.error(f"处理页面时发生未预期错误: {str(e)}", exc_info=True)
|
||||
finally:
|
||||
# 可以在此处添加清理代码
|
||||
pass
|
||||
|
||||
def _scroll_page(self, driver):
|
||||
"""滚动页面确保所有元素加载"""
|
||||
try:
|
||||
# 缓慢滚动到底部
|
||||
scroll_pause_time = 1
|
||||
screen_height = driver.execute_script("return window.screen.height;")
|
||||
scrolls = 5
|
||||
|
||||
for i in range(scrolls):
|
||||
driver.execute_script(f"window.scrollTo(0, {screen_height * i});")
|
||||
time.sleep(scroll_pause_time * (1 + random.random()))
|
||||
|
||||
# 滚回到顶部
|
||||
driver.execute_script("window.scrollTo(0, 0);")
|
||||
time.sleep(scroll_pause_time)
|
||||
|
||||
self.logger.info("页面滚动完成,确保元素加载")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"页面滚动时发生异常: {str(e)}")
|
||||
|
||||
def _get_hot_search_items(self, driver):
|
||||
"""尝试多种策略获取热搜项"""
|
||||
selectors = [
|
||||
'.category-wrap_iQLoo.horizontal_1eKyQ', # 主要选择器
|
||||
'.board-item', # 备用选择器
|
||||
'div[data-index]' # 基于属性的选择器
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
try:
|
||||
items = driver.find_elements(By.CSS_SELECTOR, selector)
|
||||
if items and len(items) > 0:
|
||||
self.logger.info(f"使用选择器 '{selector}' 成功找到 {len(items)} 个元素")
|
||||
return items
|
||||
except Exception as e:
|
||||
self.logger.debug(f"选择器 '{selector}' 失败: {str(e)}")
|
||||
|
||||
# 如果CSS选择器都失败,尝试XPath
|
||||
try:
|
||||
xpath_patterns = [
|
||||
'//div[contains(@class, "category-wrap") and contains(@class, "horizontal")]',
|
||||
'//div[contains(@class, "board-item")]',
|
||||
'//div[@data-index]'
|
||||
]
|
||||
|
||||
for xpath in xpath_patterns:
|
||||
items = driver.find_elements(By.XPATH, xpath)
|
||||
if items and len(items) > 0:
|
||||
self.logger.info(f"使用XPath '{xpath}' 成功找到 {len(items)} 个元素")
|
||||
return items
|
||||
except Exception as e:
|
||||
self.logger.debug(f"XPath策略失败: {str(e)}")
|
||||
|
||||
return []
|
||||
|
||||
def _extract_hot_search_data(self, item, driver):
|
||||
"""提取单个热搜项数据(变量集中收集 + 末尾统一赋值)"""
|
||||
|
||||
# 常量定义
|
||||
TITLE_SELECTORS = ['.c-single-text-ellipsis', '.title_dIF3B']
|
||||
RANK_SELECTORS = ['.index_1Ew5p', '.hot-index_1Bl1a']
|
||||
LINK_SELECTORS = [
|
||||
'.look-more_3oNWC',
|
||||
'a[href*="www.baidu.com/s?"]',
|
||||
'.hot-desc_1m_jR a',
|
||||
'.content_3Kk0y a'
|
||||
]
|
||||
DESC_SELECTORS = [
|
||||
'.hot-desc_1m_jR.large_nSuFU',
|
||||
'.hot-desc_1m_jR.small_Uvkd3',
|
||||
'.desc_3CT34',
|
||||
'.content_3Kk0y'
|
||||
]
|
||||
|
||||
HOT_NUM_SELECTOR = '.hot-index_1Bl1a'
|
||||
|
||||
# 辅助函数
|
||||
def find_visible_element(selectors, context=item):
|
||||
for selector in selectors:
|
||||
try:
|
||||
elem = context.find_element(By.CSS_SELECTOR, selector)
|
||||
if elem.is_displayed():
|
||||
return elem
|
||||
except Exception as e:
|
||||
self.logger.debug(f"选择器 '{selector}' 未匹配: {str(e)}")
|
||||
return None
|
||||
|
||||
def clean_text(text, remove_phrases=("查看更多>", "查看更多", "查看全文", "展开全文")):
|
||||
"""深度清理文本:移除干扰短语 + 合并连续空格"""
|
||||
if not text:
|
||||
return ""
|
||||
# 移除指定短语
|
||||
for phrase in remove_phrases:
|
||||
text = text.replace(phrase, "")
|
||||
# 清理多余空白(包括\xa0等特殊空格)
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
return text.strip()
|
||||
|
||||
def normalize_url(url):
|
||||
if not url or not isinstance(url, str):
|
||||
return ""
|
||||
url = url.strip()
|
||||
if url.startswith('//'):
|
||||
url = 'https:' + url
|
||||
if url.startswith('data:') or not url.startswith(('http://', 'https://')):
|
||||
return ""
|
||||
return url
|
||||
|
||||
# 1. 提取标题
|
||||
title_elem = find_visible_element(TITLE_SELECTORS)
|
||||
if not title_elem:
|
||||
self.logger.warning("标题元素未找到,跳过该项")
|
||||
return None
|
||||
title = clean_text(title_elem.text)
|
||||
if not title:
|
||||
self.logger.warning("标题内容为空,跳过该项")
|
||||
return None
|
||||
|
||||
# 基础字段
|
||||
now_ms = get_current_timestamp()
|
||||
site_name = '百度热搜'
|
||||
carrier_type = 'hot_search'
|
||||
hkey = get_str_md5(title)
|
||||
|
||||
# 排名(默认使用当前收集序号)
|
||||
rank = str(self.collected_items)
|
||||
rank_elem = find_visible_element(RANK_SELECTORS)
|
||||
if rank_elem:
|
||||
extracted_rank = clean_text(rank_elem.text)
|
||||
if extracted_rank:
|
||||
rank = extracted_rank
|
||||
|
||||
# 链接与SID
|
||||
url_href = ""
|
||||
link_elem = find_visible_element(LINK_SELECTORS)
|
||||
if link_elem:
|
||||
raw_href = link_elem.get_attribute('href')
|
||||
url_href = normalize_url(raw_href) if raw_href else ""
|
||||
|
||||
if url_href:
|
||||
sid = hashlib.md5(url_href.lower().encode('utf-8')).hexdigest()
|
||||
else:
|
||||
fallback_seed = f"no_link_{title}_{now_ms}"
|
||||
sid = hashlib.md5(fallback_seed.encode('utf-8')).hexdigest()
|
||||
|
||||
# 热度
|
||||
heat = 0
|
||||
try:
|
||||
hot_elem = item.find_element(By.CSS_SELECTOR, HOT_NUM_SELECTOR)
|
||||
hot_val = clean_text(hot_elem.text).replace(',', '')
|
||||
if hot_val.isdigit():
|
||||
heat = int(hot_val)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"热度提取失败: {str(e)}")
|
||||
|
||||
# 描述
|
||||
desc = ""
|
||||
desc_elem = find_visible_element(DESC_SELECTORS)
|
||||
if desc_elem:
|
||||
desc = clean_text(desc_elem.text)
|
||||
|
||||
# 内容
|
||||
detail_url = ""
|
||||
try:
|
||||
# 1. 先定位到描述容器
|
||||
desc_container = item.find_element(By.CSS_SELECTOR, '.hot-desc_1m_jR')
|
||||
|
||||
# 2. 在容器内精准定位"查看更多"链接
|
||||
detail_link = desc_container.find_element(By.CSS_SELECTOR, 'a.look-more_3oNWC[href*="www.baidu.com/s?"]')
|
||||
|
||||
# 3. 获取并标准化URL
|
||||
raw_href = detail_link.get_attribute('href') or ""
|
||||
detail_url = normalize_url(raw_href) # 使用之前定义的URL标准化函数
|
||||
|
||||
self.logger.debug(f"成功提取详情页URL: {detail_url}")
|
||||
except Exception as e:
|
||||
self.logger.debug(f"提取详情页URL失败: {str(e)}")
|
||||
# 失败时保持空字符串,后续会使用主链接作为备选
|
||||
|
||||
# 图片
|
||||
# img_element = item.find_element(By.CSS_SELECTOR, 'img[src*="cdn.bcebos.com/hotboard_image"]')
|
||||
# img_url = img_element.get_attribute('src') or ""
|
||||
|
||||
# ==================== 3. 统一创建并赋值Item(唯一赋值点) ====================
|
||||
hot_search_item = MediaspidersItem()
|
||||
hot_search_item['es_sitename'] = site_name
|
||||
hot_search_item['es_urltitle'] = title
|
||||
hot_search_item['es_urlcontent'] = desc
|
||||
hot_search_item['es_carriertype'] = carrier_type
|
||||
hot_search_item['es_urltime'] = self.url_time
|
||||
hot_search_item['es_lasttime'] = now_ms
|
||||
hot_search_item['es_loadtime'] = now_ms
|
||||
hot_search_item['es_hkey'] = hkey
|
||||
hot_search_item['es_simrank'] = rank
|
||||
hot_search_item['es_heat'] = heat
|
||||
hot_search_item['es_sid'] = sid
|
||||
hot_search_item['es_urlname'] = detail_url
|
||||
|
||||
# 条件字段:仅当存在有效图片时赋值
|
||||
# if img_url:
|
||||
# hot_search_item['image_urls'] = [img_url] # ImagesPipeline要求列表格式
|
||||
|
||||
return hot_search_item
|
||||
@ -11,6 +11,7 @@ from scrapy_selenium import SeleniumRequest
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.spiders.TwitterUserInfoSpider import form_cookie_dict
|
||||
from MediaSpiders.utils.http_utils import http_post
|
||||
from MediaSpiders.utils.string_utils import get_str_md5
|
||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||
@ -66,21 +67,34 @@ class FacebookSpider(scrapy.Spider):
|
||||
logger.info("login facebook")
|
||||
driver = response.request.meta['driver']
|
||||
driver.maximize_window()
|
||||
driver.get('https://m.facebook.com/')
|
||||
time.sleep(3)
|
||||
# 获取采集登录账号并登录
|
||||
login_users = self.redis_client.smembers('MediaSpiders:Facebook_login_accounts')
|
||||
user_list = []
|
||||
for u in login_users:
|
||||
user_list.append(json.loads(u.decode()))
|
||||
login_user = random.choice(user_list)
|
||||
driver.find_element_by_xpath(
|
||||
'//input[@name="email"]').send_keys(login_user['uid'])
|
||||
driver.find_element_by_xpath(
|
||||
'//input[@name="pass"]').send_keys(login_user['pwd'])
|
||||
driver.find_element_by_xpath('//button[@name="login"]').click()
|
||||
time.sleep(10)
|
||||
logger.info("login as %s" % login_user['uid'])
|
||||
# 访问主域,再设 Cookie
|
||||
driver.get("https://www.facebook.com/")
|
||||
time.sleep(2)
|
||||
|
||||
# 添加 Cookie(确保 domain 是 .facebook.com)
|
||||
cookie_string = self.redis_client.get("MediaSpiders:Facebook_Cookies").decode()
|
||||
cookie_dict = form_cookie_dict(cookie_string) # 你已有此函数
|
||||
|
||||
# 转换为 Selenium 所需格式(必须含 domain 和 path)
|
||||
cookies_to_add = []
|
||||
for name, value in cookie_dict.items():
|
||||
cookies_to_add.append({
|
||||
'name': name,
|
||||
'value': value,
|
||||
'domain': '.facebook.com',
|
||||
'path': '/',
|
||||
'secure': True
|
||||
})
|
||||
|
||||
for cookie in cookies_to_add:
|
||||
try:
|
||||
driver.add_cookie(cookie)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
|
||||
|
||||
driver.refresh()
|
||||
time.sleep(5)
|
||||
|
||||
# 获取待采集目标账号,并逐个请求
|
||||
account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
|
||||
account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
|
||||
@ -88,9 +102,11 @@ class FacebookSpider(scrapy.Spider):
|
||||
'userType': self.settings['FACEBOOK_USER_TYPE'],
|
||||
'userFlag': 0
|
||||
}
|
||||
|
||||
account_rsp = json.loads(
|
||||
http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
|
||||
all_user_info = []
|
||||
|
||||
if account_rsp['code'] == 200:
|
||||
all_user_info = account_rsp['content']
|
||||
logger.info('GET %s users' % account_rsp['message'])
|
||||
@ -107,6 +123,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
time.sleep(5)
|
||||
last_page_articles_count = 0
|
||||
logger.info("Current URL: {}".format(current_url))
|
||||
#
|
||||
current_page_articles = driver.find_elements_by_xpath(
|
||||
"//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count)
|
||||
items = self.get_article(current_page_articles, uid, driver)
|
||||
@ -129,51 +146,95 @@ class FacebookSpider(scrapy.Spider):
|
||||
for article in articles:
|
||||
item = MediaspidersItem()
|
||||
try:
|
||||
uname = article.find_element_by_xpath('.//h2//strong/span').text
|
||||
article_url = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]").get_attribute('href')
|
||||
# === 用户名:从 h2 下的 b/span 或直接 span 提取 ===
|
||||
try:
|
||||
uname = article.find_element_by_xpath('.//h2//b//span').text
|
||||
except:
|
||||
try:
|
||||
uname = article.find_element_by_xpath('.//h2//span[@dir="auto"]').text
|
||||
except:
|
||||
uname = uid
|
||||
|
||||
# === 文章链接和时间:从包含 /posts/pfbid 的 a 标签提取 ===
|
||||
post_link_elem = article.find_element_by_xpath(".//a[contains(@href,'/posts/pfbid')]")
|
||||
article_url = post_link_elem.get_attribute('href')
|
||||
article_url = article_url.split("?")[0]
|
||||
article_time = post_link_elem.text # 时间文本直接在 a 标签内
|
||||
|
||||
# === 展开全文(如有)===
|
||||
try:
|
||||
clickable_fields = article.find_elements_by_xpath(".//div[@role='button']")
|
||||
if len(clickable_fields) > 0:
|
||||
for cf in clickable_fields:
|
||||
cf_text = cf.text
|
||||
if cf_text is not None and cf_text == "展开":
|
||||
if cf_text is not None and ("展开" in cf_text or "See more" in cf_text):
|
||||
cf.click()
|
||||
time.sleep(1)
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(repr(e))
|
||||
article_text_lines = article.find_elements_by_xpath(".//div[@data-ad-preview='message']")
|
||||
text_info = []
|
||||
for line in article_text_lines:
|
||||
text_info.append(line.text)
|
||||
article_text = "".join(text_info)
|
||||
article_time = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]/span").text
|
||||
|
||||
# === 正文内容:使用 data-ad-rendering-role="story_message" ===
|
||||
try:
|
||||
article_text_lines = article.find_elements_by_xpath(
|
||||
".//div[@data-ad-rendering-role='story_message']")
|
||||
text_info = []
|
||||
for line in article_text_lines:
|
||||
text_info.append(line.text)
|
||||
article_text = "".join(text_info)
|
||||
except:
|
||||
article_text = ""
|
||||
|
||||
# === 时间戳处理 ===
|
||||
logger.info(f"article_time: {article_time}")
|
||||
article_time = get_time_stamp(
|
||||
article_time) # 这里的 article_time 必须是中文模式下的时间,比如“1天”、“5小时”等,需要登陆Facebook后切换语言
|
||||
article_time = get_time_stamp(article_time)
|
||||
logger.info(f"urltime: {article_time}")
|
||||
|
||||
# === 图片提取 ===
|
||||
img_urls = []
|
||||
imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
|
||||
for img in imgs:
|
||||
img_urls.append(img.get_attribute("src"))
|
||||
try:
|
||||
imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
|
||||
for img in imgs:
|
||||
src = img.get_attribute("src")
|
||||
if src and "emoji" not in src: # 过滤 emoji 图片
|
||||
img_urls.append(src)
|
||||
except:
|
||||
pass
|
||||
|
||||
# === 视频(暂留空)===
|
||||
video_urls = []
|
||||
article_id = get_str_md5(article_text)
|
||||
|
||||
# === 互动数据:点赞、评论、转发 ===
|
||||
like_count = 0
|
||||
comment_count = 0
|
||||
forward_count = 0
|
||||
like_count_str = article.find_element_by_xpath(
|
||||
".//div[@data-visualcompletion='ignore-dynamic']//span[@aria-hidden='true']").text
|
||||
comment_and_forward_element = article.find_elements_by_xpath(".//div[@tabindex='0']//span[@dir='auto']")
|
||||
|
||||
try:
|
||||
if like_count_str:
|
||||
like_count = int(like_count_str.replace(",", ""))
|
||||
if len(comment_and_forward_element) > 1:
|
||||
comment_count_str = comment_and_forward_element[0].text
|
||||
forward_count_str = comment_and_forward_element[1].text
|
||||
comment_count = int(comment_count_str.replace(",", ""))
|
||||
forward_count = int(forward_count_str.replace(",", ""))
|
||||
# 点赞数:通过 aria-label 匹配
|
||||
like_label_elem = article.find_element_by_xpath(
|
||||
".//div[@aria-label and contains(@aria-label, '赞:')]")
|
||||
like_label = like_label_elem.get_attribute("aria-label")
|
||||
import re
|
||||
like_match = re.search(r'(\d+)', like_label)
|
||||
if like_match:
|
||||
like_count = int(like_match.group(1))
|
||||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
# 评论和转发:通常在 toolbar 内的两个 span 中
|
||||
stat_spans = article.find_elements_by_xpath(
|
||||
".//div[@role='toolbar']//span[@class='xt0b8zv x135b78x']")
|
||||
if len(stat_spans) >= 2:
|
||||
comment_count = int(stat_spans[0].text.replace(",", "")) if stat_spans[0].text.replace(",",
|
||||
"").isdigit() else 0
|
||||
forward_count = int(stat_spans[1].text.replace(",", "")) if stat_spans[1].text.replace(",",
|
||||
"").isdigit() else 0
|
||||
except:
|
||||
logger.warning("获取点赞/评论/转发数量异常")
|
||||
|
||||
# === 填充 Item ===
|
||||
article_id = get_str_md5(article_text)
|
||||
item['es_sid'] = str(article_id)
|
||||
item['es_hkey'] = str(article_id)
|
||||
item['es_content'] = str(article_text).replace('查看翻译', '')
|
||||
@ -193,17 +254,23 @@ class FacebookSpider(scrapy.Spider):
|
||||
item['es_sitename'] = 'facebook'
|
||||
item['es_srcname'] = 'facebook'
|
||||
item['es_carriertype'] = 'media'
|
||||
# 判重
|
||||
|
||||
# === 判重逻辑 ===
|
||||
if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
|
||||
if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0:
|
||||
logger.info("跳过已采集内容")
|
||||
continue
|
||||
|
||||
if item['es_urlcontent'].endswith('展开'):
|
||||
logger.info("跳过未展开的内容")
|
||||
continue
|
||||
|
||||
article_items.append(item)
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(repr(e))
|
||||
logger.debug("解析单条帖子失败: %s" % repr(e))
|
||||
continue
|
||||
|
||||
logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items)))
|
||||
return article_items
|
||||
|
||||
|
||||
@ -0,0 +1,417 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import scrapy
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.utils.date_utils import get_time_stamp
|
||||
from MediaSpiders.utils.hot_search_json_parser import url_response
|
||||
from MediaSpiders.utils.string_utils import get_str_md5
|
||||
from MediaSpiders.utils.time_utils import get_current_timestamp
|
||||
|
||||
|
||||
class HotSearchRedisSpider(scrapy.Spider):
|
||||
name = 'HotSearchRedisSpider'
|
||||
|
||||
custom_settings = {
|
||||
'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
|
||||
'PROTO_CLASS_NAME': 'EsSets',
|
||||
'PROTO_FIELD_NAME': 'Es',
|
||||
'PROTO_SAVE_FILE_NAME': 'public_info_data_',
|
||||
'IMAGES_STORE': r'/usr/local/temp_image/twitter',
|
||||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||||
'FILES_STORE': r'/usr/local/videos',
|
||||
'FILES_RESULT_FIELD': 'es_video',
|
||||
'ZIP_FILE_NAME': 'image_data_ship_',
|
||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
||||
'ITEM_PIPELINES': {
|
||||
'MediaSpiders.pipelines.ProtobufSavePipeline': 300
|
||||
},
|
||||
'SPIDER_MIDDLEWARES': {
|
||||
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
|
||||
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
|
||||
}
|
||||
}
|
||||
|
||||
# 常量定义
|
||||
TOUTIAO_HOT_URL = 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
|
||||
BAIDU_URL = 'https://www.toutiao.com/'
|
||||
PAGE_LOAD_TIMEOUT = 10
|
||||
ELEMENT_WAIT_TIMEOUT = 5
|
||||
MAX_NEWS_PER_HOT = 1
|
||||
MAX_HOT_ITEMS = 15
|
||||
|
||||
# 选择器定义
|
||||
# URL_SELECTORS = [
|
||||
# '.card-render-wrapper a'
|
||||
# # '.l-content a',
|
||||
# # '.feed-card-wtt-l p a',
|
||||
# # '.feed-card-article-l a'
|
||||
# ]
|
||||
|
||||
AUTHOR_SELECTORS = [
|
||||
"//div[@class='author-info']/div[@class='desc']/a[@class='name']",
|
||||
"//div[@class='user-info']/a[@class='user-name']"
|
||||
]
|
||||
|
||||
CONTENT_SELECTORS = [
|
||||
"//div[@class='article-content']//p",
|
||||
"//article/div[@class='weitoutiao-html']"
|
||||
]
|
||||
|
||||
TIME_SELECTORS = [
|
||||
"//p[@class='abstract']/span[@class='time']",
|
||||
"//div[@class='article-meta']/span[1]"
|
||||
]
|
||||
|
||||
# 需要过滤的文本模式
|
||||
SKIP_PATTERNS = ['版权', '声明', '邮箱', '记者', '编辑', '来源', '投稿', '责任编辑']
|
||||
|
||||
def __init__(self, params=None, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.url_time = get_current_timestamp()
|
||||
self.total_num = 0
|
||||
self.authorization = None
|
||||
self.job_id = None
|
||||
|
||||
if params:
|
||||
try:
|
||||
json_params = json.loads(params)
|
||||
self.total_num = int(json_params.get('totalNum', 0))
|
||||
self.authorization = json_params.get('authorization')
|
||||
self.job_id = json_params.get('job_id')
|
||||
except (json.JSONDecodeError, ValueError) as e:
|
||||
self.logger.error(f"解析参数失败: {e}")
|
||||
|
||||
def start_requests(self):
|
||||
"""开始请求"""
|
||||
yield SeleniumRequest(
|
||||
url=self.BAIDU_URL,
|
||||
callback=self.parse_parent,
|
||||
wait_time=self.PAGE_LOAD_TIMEOUT
|
||||
)
|
||||
|
||||
def parse_parent(self, response):
|
||||
"""解析热点列表页面"""
|
||||
driver = response.request.meta['driver']
|
||||
|
||||
# 获取热点数据
|
||||
hot_items = self._fetch_hot_items()[:self.MAX_HOT_ITEMS]
|
||||
# hot_items = []
|
||||
# hot_items.append({
|
||||
# "fake_url": "https://www.toutiao.com/trending/7612920230477565459/?rank=14&log_from=4dda3d0c958f48_1772529869512",
|
||||
# 'hot_id': '76132246866893fda27',
|
||||
# 'hot_value': 5432429101,
|
||||
# 'hot_word': '伊朗:反击最初两天650民美士兵伤亡'
|
||||
# })
|
||||
|
||||
for hot_item in hot_items:
|
||||
if not hot_item.get('fake_url'):
|
||||
self.logger.warning(f"热点 {hot_item['hot_word']} 无有效URL,跳过")
|
||||
continue
|
||||
|
||||
yield from self._process_hot_item(driver, hot_item)
|
||||
|
||||
def _fetch_hot_items(self) -> List[Dict[str, Any]]:
|
||||
"""获取热点数据"""
|
||||
try:
|
||||
rsp_body = url_response(self.TOUTIAO_HOT_URL)
|
||||
if rsp_body.get('status') != "success":
|
||||
self.logger.error("获取热点数据失败")
|
||||
return []
|
||||
|
||||
result_array = []
|
||||
for line in rsp_body.get('data', []):
|
||||
try:
|
||||
result_array.append({
|
||||
"hot_id": line.get('ClusterIdStr', ''),
|
||||
"hot_word": line.get('Title', ''),
|
||||
"hot_value": int(line.get('HotValue', 0)),
|
||||
"fake_url": line.get('Url', '')
|
||||
})
|
||||
except Exception as e:
|
||||
self.logger.error(f"解析热点数据失败: {e}")
|
||||
self.logger.debug(f"问题数据: {line}")
|
||||
|
||||
return result_array
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"获取热点数据异常: {e}")
|
||||
return []
|
||||
|
||||
def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
    """Crawl one hot topic: its detail articles first, then its event timeline."""
    try:
        # Collect the detail-card articles for this topic.
        yield from self._get_event_details(driver, hot_item)

        # Navigate (back) to the topic page itself.
        driver.get(hot_item['fake_url'])
        self._wait_for_page_load(driver)

        # Scan the section titles; collect a timeline only when the page
        # exposes an "事件脉络" (event timeline) block.
        headers = driver.find_elements(By.XPATH, "//div[@class='block-title']")
        self.logger.info(f"context_list:{headers}")

        for header in headers:
            if header.text.strip() == "事件脉络":
                yield from self._get_event_timeline(header, driver, hot_item)

    except Exception as e:
        self.logger.error(f"处理热点 '{hot_item['hot_word']}' 失败: {e}")
|
||||
|
||||
def _get_event_timeline(self, context, driver, hot_item: Dict[str, Any]):
    """Collect the "事件脉络" (event timeline) block of a hot topic.

    Expands the timeline via its "load more" button when present, concatenates
    every node's title and text, and yields a single item for the whole timeline.
    """
    self.logger.info("开始采集事件脉络...")

    # NOTE(review): these XPaths start with '//', so they search the whole
    # document rather than just `context` — confirm that is intended.
    # Bug fix: indexing [0] on an empty result raised IndexError when the
    # timeline had no "load more" button, losing the whole timeline.
    buttons = context.find_elements(
        By.XPATH,
        "//div[@class='timeline-card-wrapper']/div[@class='load-more']/button")
    if buttons:
        button_element = buttons[0]
        try:
            button_element.click()
        except ElementClickInterceptedException:
            # The button is covered by another element; fall back to a JS click.
            self.logger.info("使用JavaScript点击按钮")
            driver.execute_script("arguments[0].click();", button_element)
        # Give the newly expanded nodes time to render.
        time.sleep(2)

    # Collect every timeline node currently in the DOM.
    event_list = context.find_elements(
        By.XPATH,
        "//div[@class='timeline-card-wrapper']/div[@class='time-nodes']/div[@class='time-node']")
    self.logger.info(f"找到 {len(event_list)} 个事件")

    url_content = ''
    for idx, even in enumerate(event_list, 1):
        try:
            # Node title (usually a date) and its body text.
            title_text = even.find_element(By.XPATH, ".//div[@class='title']").text

            content_element = even.find_element(By.XPATH, ".//a[@class='content']/p")
            content = content_element.text.strip()

            # Strip the optional "最新" ("latest") badge from the text.
            try:
                tag = content_element.find_element(
                    By.XPATH, ".//span[@class='tag']").text.strip()
                content = content.replace(tag, "").strip()
            except NoSuchElementException:
                pass  # no badge on this node (was a bare `except:` before)

            # Join entries with a newline; each entry is "title\ncontent".
            if url_content:
                url_content += "\n"
            url_content += f"{title_text}\n{content}"

            self.logger.info(f"已添加第{idx}条: {title_text[:20]}...")

        except Exception as e:
            self.logger.error(f"处理第{idx}个事件时出错: {e}")
            continue

    timeNow = get_current_timestamp()
    # One item carries the entire concatenated timeline.
    event_timeline_item = MediaspidersItem()
    event_timeline_item['es_carriertype'] = 'news'
    event_timeline_item['es_srcname'] = 'https://www.toutiao.com/'
    event_timeline_item['es_sitename'] = '今日头条'
    event_timeline_item['es_sid'] = get_str_md5(hot_item["fake_url"])
    event_timeline_item['es_urltitle'] = hot_item['hot_word']
    event_timeline_item['es_authors'] = ''
    event_timeline_item['es_urlcontent'] = url_content
    event_timeline_item['es_urltime'] = timeNow
    event_timeline_item['es_lasttime'] = timeNow
    event_timeline_item['es_urlname'] = hot_item["fake_url"]
    event_timeline_item['es_hkey'] = hot_item['hot_id']
    event_timeline_item['es_urltopic'] = hot_item['hot_word']
    event_timeline_item['es_video'] = ''

    yield event_timeline_item
    self.logger.info(f"事件脉络-采集成功 '{hot_item['hot_word']}':{hot_item['fake_url']}")
|
||||
|
||||
|
||||
def _get_event_details(self, driver, hot_item: Dict[str, Any]):
    """Collect detail articles for a hot topic.

    Board pages have their cards expanded into at most MAX_NEWS_PER_HOT article
    links; a direct article URL (contains "article") is scraped as-is. Yields
    one item per scraped page.
    """
    self.logger.info(f"开始采集事件详情-{hot_item['hot_word']}: {hot_item['fake_url']}")

    hot_url = hot_item['fake_url']
    driver.get(hot_url)
    self._wait_for_page_load(driver)

    if "article" not in hot_url:
        cards = driver.find_elements(
            By.XPATH, "//div[@class='block-content']/div[@class='card-render-wrapper']")
        news_urls_array = []
        for card in cards[:self.MAX_NEWS_PER_HOT]:
            try:
                element = card.find_element(By.CSS_SELECTOR, '.card-render-wrapper a')
                url = element.get_attribute('href')
            except NoSuchElementException:
                break  # preserve original behavior: stop scanning cards
            # Bug fix: a card without an http(s) href previously re-appended
            # the previous card's URL (or raised NameError on the first card).
            if url and url.startswith(('http://', 'https://')):
                news_urls_array.append(url)
    else:
        # The topic URL is already an article page.
        news_urls_array = [hot_url]

    try:
        for url in news_urls_array:
            yield from self._process_news_page(driver, url, hot_item)
    except Exception as e:
        self.logger.error(f"获取事件详情卡片失败: {e}")
|
||||
|
||||
def _wait_for_page_load(self, driver, timeout: int = None):
    """Wait until the document body is present (plus a fixed settle delay)."""
    if not timeout:
        timeout = self.PAGE_LOAD_TIMEOUT
    time.sleep(2)  # fixed settle time before polling the DOM
    try:
        waiter = WebDriverWait(driver, timeout)
        waiter.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except TimeoutException:
        self.logger.warning("页面加载超时")
|
||||
|
||||
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
    """Scrape a single news/video page and yield one populated item.

    Video URLs use dedicated element lookups and store the URL itself as the
    content; ordinary article pages go through the generic extractors.
    """
    try:
        if "video" in news_url.lower() or not news_url:
            # --- video page ---
            driver.get(news_url)
            self._wait_for_page_load(driver)
            title = driver.find_elements(
                By.XPATH, "//div[@class='ttp-video-extras-title']/h1")[0].get_attribute('title')
            time_text = driver.find_elements(
                By.XPATH, "//div[@class='meta-info']/span[@class='publish-time']")[0].text.replace("发布于 ", "").strip()
            author = driver.find_elements(
                By.XPATH, "//div[@class='author-info']/a[@class='author-name']")[0].text.strip()
            content = news_url  # video pages carry no article body

            # Bug fix: url_time was left unbound when the publish time was
            # missing or unparsable, so the item build below raised NameError
            # and the page was silently dropped.
            url_time = self.url_time
            if time_text:
                try:
                    url_time = get_time_stamp(time_text)
                except Exception as e:
                    self.logger.debug(f"时间转换失败: {time_text}, {e}")
        else:
            # --- regular article page ---
            driver.get(news_url)
            self._wait_for_page_load(driver)

            try:
                title = driver.find_elements(
                    By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
            except Exception:
                # Fall back to the hot-search keyword as the title.
                # (Consistency fix: use the spider logger, not the root logger.)
                self.logger.error(f'标题采集失败,已使用热搜名称...')
                title = hot_item['hot_word']

            author = self._extract_text(driver, self.AUTHOR_SELECTORS)
            content = self._extract_content(driver)
            url_time = self._extract_time(driver)

        if not content:
            self.logger.warning(f"页面无有效内容: {news_url}")
            return

        even_details_item = MediaspidersItem()
        even_details_item['es_carriertype'] = 'news'
        even_details_item['es_srcname'] = 'https://www.toutiao.com/'
        even_details_item['es_sitename'] = '今日头条'
        even_details_item['es_sid'] = get_str_md5(news_url)
        even_details_item['es_urltitle'] = title
        even_details_item['es_authors'] = author
        even_details_item['es_urlcontent'] = content
        even_details_item['es_urltime'] = url_time
        even_details_item['es_lasttime'] = url_time
        even_details_item['es_urlname'] = news_url
        even_details_item['es_hkey'] = hot_item['hot_id']
        even_details_item['es_urltopic'] = hot_item['hot_word']
        even_details_item['es_video'] = ''

        yield even_details_item

        self.logger.info(f"事件详情-采集成功 '{hot_item['hot_word']}':{news_url}")

    except Exception as e:
        self.logger.error(f"处理新闻页面失败 {news_url}: {e}")
|
||||
|
||||
def _extract_text(self, context, selectors: List[str]) -> Optional[str]:
    """Return the first non-empty text of a *visible* element matching any selector."""
    for selector in selectors:
        try:
            for element in context.find_elements(By.XPATH, selector):
                if not element.is_displayed():
                    continue
                text = element.text.strip()
                if text:
                    return text
        except Exception as e:
            self.logger.debug(f"选择器 '{selector}' 未匹配: {e}")
    return None
|
||||
|
||||
def _extract_content(self, driver) -> str:
    """Extract the article body: all non-empty paragraph texts joined by newlines."""
    try:
        time.sleep(2)  # let lazily-rendered paragraphs appear

        content_lines = []
        for selector in self.CONTENT_SELECTORS:
            try:
                for paragraph in driver.find_elements(By.XPATH, selector):
                    # Bug fix: the '.weitoutiao-html' branch indexed the element
                    # (p[0].text) which raised TypeError and silently dropped
                    # every paragraph of that selector; WebElements are not
                    # subscriptable — .text on the element is correct for all.
                    text = paragraph.text.strip()
                    if text != '':
                        content_lines.append(text)
            except Exception as e:
                self.logger.debug(f"选择器 '{selector}' 提取失败: {e}")

        return '\n'.join(content_lines) if content_lines else ""

    except Exception as e:
        self.logger.error(f"提取内容失败: {e}")
        return ""
|
||||
|
||||
def _is_valid_content(self, text: str) -> bool:
|
||||
"""验证内容是否有效"""
|
||||
if not text or len(text) <= 10:
|
||||
return False
|
||||
return not any(pattern in text for pattern in self.SKIP_PATTERNS)
|
||||
|
||||
def _extract_time(self, driver) -> Optional[int]:
|
||||
"""提取发布时间"""
|
||||
time_text = self._extract_text(driver, self.TIME_SELECTORS)
|
||||
if time_text:
|
||||
try:
|
||||
return get_time_stamp(time_text)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"时间转换失败: {time_text}, {e}")
|
||||
return self.url_time
|
||||
408
spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
Normal file
408
spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
Normal file
@ -0,0 +1,408 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# 标准库
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
import logging as logger
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
|
||||
import redis
|
||||
import scrapy
|
||||
from redisbloom.client import Client
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.utils.string_utils import get_str_md5
|
||||
from MediaSpiders.utils.time_utils import get_current_timestamp
|
||||
|
||||
|
||||
class LinkedinSpider(scrapy.Spider):
    # Crawls LinkedIn user activity feeds via scrapy-selenium, authenticating
    # with cookies stored in redis.
    name = 'LinkedinUserSpider'
    # Pending article URLs whose comment sections still need crawling.
    # NOTE(review): mutable class attribute, shared by every spider instance —
    # confirm that is intended.
    comment_urls = []
    custom_settings = {
        # Protobuf serialization of scraped items.
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        # Image / video download targets for the media pipelines.
        'IMAGES_STORE': r'/usr/local/temp_image/linkedin',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
        },
        'DOWNLOAD_DELAY': 2,
        # Dedup / keyword / simhash filtering on yielded items.
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
        }
    }
|
||||
|
||||
def __init__(self, params=None, *args, **kwargs):
    """Initialize the spider from an optional JSON parameter string.

    Recognized key: job_id. Runtime handles (redis client, bloom filter) are
    created later, in start_requests.
    """
    super(LinkedinSpider, self).__init__(*args, **kwargs)
    self.crawl_comment = False
    self.redis_client = None
    self.bloom_filter = None
    self.simhash_filter_key = None
    # Bug fix: json.loads(None) raised TypeError whenever the spider was
    # started without parameters, even though params defaults to None.
    if params:
        json_params = json.loads(params)
        logger.info(json_params)
        if 'job_id' in json_params:
            self.job_id = json_params['job_id']
|
||||
|
||||
def start_requests(self):
    """Create the redis / bloom-filter clients, then start crawling."""
    host = self.settings['REDIS_HOST']
    port = self.settings['REDIS_PORT']
    password = self.settings['REDIS_PWD']
    self.redis_client = redis.Redis(host=host, port=port, password=password)
    self.bloom_filter = Client(host=host, port=port, password=password)
    self.simhash_filter_key = self.settings['LINKEDIN_SIMHASH_FILTER_KEY']
    # A neutral first page; the real navigation happens in parse() once a
    # Selenium driver is attached to the response.
    yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)
|
||||
|
||||
def parse(self, response):
    """Log into LinkedIn with stored cookies, then scrape each target user's
    recent-activity feed and yield one item per post.
    """
    logger.info("login linkedin")
    driver = response.request.meta['driver']
    driver.maximize_window()
    # Visit the main domain first — cookies can only be set for the current domain.
    driver.get("https://www.linkedin.com/")
    time.sleep(2)

    # Add cookies (domain must be .linkedin.com).
    cookie_string = self.redis_client.get("MediaSpiders:Linkedin_Cookies").decode()
    cookie_dict = self.form_cookie_dict(cookie_string)  # helper defined below

    # Convert to the Selenium cookie format (domain and path are required).
    cookies_to_add = []
    for name, value in cookie_dict.items():
        cookies_to_add.append({
            'name': name,
            'value': value,
            'domain': '.linkedin.com',
            'path': '/',
            'secure': True
        })

    for cookie in cookies_to_add:
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            logger.warning(f"Failed to add cookie {cookie['name']}: {e}")

    # Reload so the session cookies take effect.
    driver.refresh()
    time.sleep(5)

    # Fetch the target accounts and request each one.
    # account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
    # account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
    # post_data = {
    #     'userType': self.settings['FACEBOOK_USER_TYPE'],
    #     'userFlag': 0
    # }
    #
    # account_rsp = json.loads(
    #     http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
    # all_user_info = []

    # if account_rsp['code'] == 200:
    #     all_user_info = account_rsp['content']
    #     logger.info('GET %s users' % account_rsp['message'])
    # driver.set_window_size(1920, 1080)

    # NOTE(review): hard-coded single test account — the account-service query
    # above is commented out; confirm this is not meant for production.
    all_user_info = [
        {'id': 87, 'userFlag': '0', 'userName': 'andrewyng', 'userType': '2', 'userUid': 'USForcesJapan.J'}]
    for user_info in all_user_info:
        user_name = user_info['userName']
        # Fix 2: trailing whitespace removed from the URL.
        current_url = f'https://www.linkedin.com/in/{user_name}/recent-activity/all/'
        driver.get(current_url)
        time.sleep(5)

        # Fix 3: incremental scrolling (replaces fixed-coordinate clicks).
        self.smart_scroll(driver, max_scrolls=5)

        # Fixed XPath: modern LinkedIn feed-card locator.
        current_page_articles = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'feed-shared-update-v2')]"
        )

        logger.info(f"Found {len(current_page_articles)} articles for {user_name}")
        items = self.get_linkedin_articles(current_page_articles, user_name, user_info['userUid'])

        for item in items:
            # Posts with comments are queued for a later comment crawl.
            if item.get('es_commentcount', 0) > 0:
                self.comment_urls.append({
                    'url': item['es_urlname'],
                    'article_id': item['es_sid'],
                    'article_author': item['es_authors'],
                    'article_text': item['es_urlcontent']
                })
            logger.info(f"用户 {item['es_userid']} 发文: {item['es_urlcontent'][:50]}...")
            yield item

    # Comment crawling (LinkedIn comments require a click to expand).
    if self.crawl_comment and self.comment_urls:
        comment_url = self.comment_urls.pop()
        yield SeleniumRequest(
            url=comment_url['url'],
            callback=self.linkedin_comment_parse,
            meta={'article_id': comment_url['article_id'], 'driver': driver}
        )
|
||||
|
||||
def smart_scroll(self, driver, max_scrolls=5):
    """Scroll to the page bottom repeatedly, stopping once the height stops growing."""
    previous_height = driver.execute_script("return document.body.scrollHeight")
    for attempt in range(1, max_scrolls + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # give the feed time to lazy-load

        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            logger.info(f"滚动条 {attempt}:未加载新内容,停止")
            break
        previous_height = current_height
        logger.info(f"滚动条 {attempt}:加载了新内容到高度 {current_height}")
|
||||
|
||||
def get_linkedin_articles(self, articles, Uname, uid):
    """Convert LinkedIn feed-card WebElements into MediaspidersItem objects.

    Args:
        articles: feed-card elements ("feed-shared-update-v2" divs).
        Uname: fallback author name when a card carries none.
        uid: account uid stored into es_userid.

    Returns:
        A list of populated items. Cards that fail to parse, stale duplicates
        (older than 48h, bloom-filtered) and empty posts are skipped.
    """
    article_items = []
    for idx, article in enumerate(articles):
        try:

            # === 1. Author name ===
            try:
                author_elem = article.find_element(By.XPATH,
                                                   ".//span[contains(@class, 'update-components-actor__title')]//span[@aria-hidden='true']")
                uname = author_elem.text.strip()
            except Exception:
                uname = Uname

            # === 2. Publish time (relative label -> absolute ms timestamp) ===
            try:
                time_elem = article.find_element(By.XPATH,
                                                 ".//span[contains(@class, 'update-components-actor__sub-description')]")
                relative_time = time_elem.text.split('•')[0].strip()  # e.g. "1 个月前"
                article_time = self.parse_linkedin_relative_time(relative_time)
            except Exception as e:
                logger.warning(f"Time parse failed: {e}")
                article_time = get_current_timestamp() - 86400000  # default: 24h ago

            # === 3. Body text (merge multiple paragraphs) ===
            try:
                content_parts = article.find_elements(By.XPATH,
                                                      ".//div[contains(@class, 'update-components-text')]//span[@dir='ltr']")
                article_text = " ".join([p.text for p in content_parts if p.text.strip()])
            except Exception:
                article_text = ""

            # === 4. Post URL (from the activity URN) ===
            try:
                activity_urn = article.get_attribute("data-urn")
                url_name = f"https://www.linkedin.com/feed/update/{activity_urn}"
            except Exception:
                # Bug fix: this fallback previously assigned `article_url`,
                # leaving `url_name` unbound and dropping the whole card with a
                # NameError further down.
                url_name = f"https://www.linkedin.com/in/{uname}/"

            # === 5. Images ===
            img_urls = []
            try:
                img_urls = [
                    img.get_attribute('data-delayed-url').strip()
                    for img in
                    article.find_elements(By.XPATH, ".//img[contains(@class, 'update-components-image__image')]")
                    if img.get_attribute('data-delayed-url')
                ]
            except Exception:
                pass

            # === 6. Interaction counters (likes / comments / reposts) ===
            like_count = comment_count = forward_count = 0
            try:
                like_btn = article.find_element(By.XPATH,
                                                ".//span[contains(@class, 'social-details-social-counts')]").text
                like_count = self.extract_number(like_btn)

                comment_btn = article.find_element(By.XPATH, ".//button[contains(@aria-label, '评论')]").text
                comment_count = self.extract_number(comment_btn)

                repost_btn = article.find_element(By.XPATH,
                                                  ".//button[contains(@aria-label, '转发')]").text
                forward_count = self.extract_number(repost_btn)
            except Exception as e:
                logger.debug(f"Interaction count parse failed: {e}")

            try:
                # === 7a. Connection-degree badge (e.g. "• 3 度+") ===
                # NOTE(review): this XPath starts with '//', so it searches the
                # whole document rather than the current card — confirm.
                degree_span = article.find_element(
                    By.XPATH,
                    "//span[@aria-hidden='true' and contains(., '•') and contains(., '度+')]"
                )
                degree_text = degree_span.text.strip()
            except Exception:
                degree_text = ""

            es_content = article_text.replace('[Original text:]', '').strip()

            # === 7b. Build the item ===
            article_id = get_str_md5(f"{uname}{article_text}{article_time}")
            item = MediaspidersItem()
            item['es_sid'] = article_id
            item['es_hkey'] = article_id
            item['es_content'] = es_content
            item['es_urlcontent'] = es_content
            item['es_urltime'] = article_time  # ms timestamp from parse_linkedin_relative_time
            item['es_lasttime'] = get_current_timestamp()
            item['es_loadtime'] = get_current_timestamp()
            item['es_urltitle'] = uname
            item['es_authors'] = uname
            item['es_userid'] = uid
            item['image_urls'] = img_urls
            item['file_urls'] = []
            item['es_urlname'] = url_name
            item['es_commentcount'] = comment_count
            item['es_forwardcount'] = forward_count
            item['es_likecount'] = like_count
            item['es_sitename'] = 'linkedin'
            item['es_srcname'] = 'linkedin'
            item['es_carriertype'] = 'media'
            item['es_heat'] = degree_text

            # Deduplicate posts older than 48h via the shared bloom filter.
            if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
                if self.bloom_filter.bfAdd(self.settings['LINKEDIN_FILTER_KEY'], article_id) <= 0:
                    logger.info(f"跳过已采集内容: {article_id[:10]}...")
                    continue

            if not item['es_urlcontent']:
                logger.warning("跳过空内容动态")
                continue

            article_items.append(item)
            logger.debug(f"Article {idx}: {uname} - {article_text[:30]}...")

        except Exception as e:
            logger.error(f"解析动态失败 (index {idx}): {repr(e)}")
            continue

    logger.info(f"用户 {uid} 共采集 {len(article_items)} 条有效动态")
    return article_items
|
||||
|
||||
def parse_linkedin_relative_time(self, text):
    """Convert a relative-time label such as "1 个月前" / "3 weeks ago" into a
    millisecond timestamp.

    Months are approximated as 30 days and years as 365 days. Unrecognized
    (or digit-less) labels fall back to "24 hours ago".
    """
    now = datetime.now()
    text = text.lower().replace(' ', '')

    # (keywords, timedelta unit, multiplier) — checked in the original order.
    units = (
        (('秒前', 'secondsago'), 'seconds', 1),
        (('分钟前', 'minutesago'), 'minutes', 1),
        (('小时前', 'hoursago'), 'hours', 1),
        (('天前', 'daysago'), 'days', 1),
        (('周前', 'weeksago'), 'weeks', 1),
        (('月前', 'monthsago'), 'days', 30),   # 1 month ≈ 30 days
        (('年前', 'yearsago'), 'days', 365),   # 1 year ≈ 365 days
    )
    for keywords, unit, factor in units:
        if any(keyword in text for keyword in keywords):
            match = re.search(r'\d+', text)
            # Bug fix: a matching keyword without digits previously raised
            # AttributeError on .group(); now it falls back to the default.
            if not match:
                break
            amount = int(match.group()) * factor
            return int((now - timedelta(**{unit: amount})).timestamp() * 1000)

    return get_current_timestamp() - 86400000  # default: 24 hours ago
|
||||
|
||||
def extract_number(self, text):
    """Parse "1,234 个赞"-style text into the integer 1234; 0 when no number is found."""
    try:
        digits = re.search(r'[\d,]+', text).group()
        return int(digits.replace(',', ''))
    except Exception:
        return 0
|
||||
|
||||
def linkedin_comment_parse(self, response):
    """Expand and scrape the comment section of one LinkedIn post, yielding one
    item per comment, then schedule the next queued comment URL.
    """
    driver = response.meta['driver']
    article_id = response.meta['article_id']

    # Click the "comment" button to expand the comment section.
    # NOTE(review): bare `except:` below also swallows KeyboardInterrupt —
    # consider narrowing to Exception.
    try:
        comment_btn = driver.find_element(By.XPATH,
                                          "//button[contains(@class, 'comments-comment-button')]")
        comment_btn.click()
        time.sleep(3)
    except:
        logger.warning("未找到评论按钮,跳过评论爬取")
        return

    # Scroll to lazy-load more comments.
    self.smart_scroll(driver, max_scrolls=3)

    # Extract the comment elements.
    comment_elements = driver.find_elements(By.XPATH,
                                            "//div[contains(@class, 'comments-comment-item')]")

    for comment in comment_elements:
        try:
            author = comment.find_element(By.XPATH,
                                          ".//span[contains(@class, 'comments-post-meta__name-text')]").text.strip()
            content = comment.find_element(By.XPATH,
                                           ".//span[contains(@class, 'comments-comment-item-content')]").text.strip()
            # Deterministic id: hash of author + content.
            comment_id = get_str_md5(f"{author}{content}")

            item = MediaspidersItem()
            item['es_sid'] = comment_id
            item['es_hkey'] = article_id  # link the comment to its article
            item['es_content'] = content
            item['es_authors'] = author
            item['es_userid'] = author
            item['es_urltime'] = get_current_timestamp()
            item['es_sitename'] = 'linkedin'
            item['es_srcname'] = 'linkedin_comment'
            item['es_carriertype'] = 'comment'
            yield item
        except:
            # Skip comments whose layout doesn't match; bare except kept as-is.
            continue

    # Continue with the remaining queued comment pages.
    if self.comment_urls:
        next_comment = self.comment_urls.pop()
        yield SeleniumRequest(
            url=next_comment['url'],
            callback=self.linkedin_comment_parse,
            meta={'article_id': next_comment['article_id'], 'driver': driver}
        )
|
||||
|
||||
def form_cookie_dict(self, cookie_str: str) -> dict:
    """Parse a raw "Cookie:" header string into a {name: value} dict.

    Strips an optional leading "Cookie:" label (ASCII or fullwidth colon),
    splits on ';', keeps only "name=value" pairs (splitting on the first '='),
    and removes surrounding double quotes from values.
    """
    # Bug fix: the second prefix was a duplicate ASCII "Cookie:"; per the
    # original intent ("compatible with Chinese and English colons") it must
    # be the fullwidth-colon variant.
    for prefix in ("Cookie:", "Cookie:"):
        if cookie_str.startswith(prefix):
            cookie_str = cookie_str[len(prefix):].strip()
            break

    cookie_dict = {}
    for item in cookie_str.split(';'):
        item = item.strip()
        if not item or '=' not in item:
            continue
        name, value = item.split('=', 1)  # split on the first '=' only
        name, value = name.strip(), value.strip()
        # Selenium does not want quoted values.
        if value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        cookie_dict[name] = value
    return cookie_dict
|
||||
@ -0,0 +1,182 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from datetime import datetime
|
||||
from datetime import timezone, timedelta
|
||||
import json
|
||||
import logging as logger
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from urllib import parse
|
||||
|
||||
import redis
|
||||
import scrapy
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
from MediaSpiders.items import MediaspidersItem, TwitterUserInfoItem
|
||||
from MediaSpiders.utils.http_utils import http_post
|
||||
from MediaSpiders.utils.login_utils import login
|
||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||
|
||||
|
||||
def form_cookie_dict(cookie_string):
    """Parse a raw cookie header string ("a=1; b=2") into a {name: value} dict.

    Segments without an '=' are skipped, and values keep any embedded '='
    characters (only the first '=' separates name from value).
    """
    cookie_dict = {}
    for cookie in cookie_string.split(';'):
        cookie = cookie.strip()
        # Bug fix: empty or malformed segments (e.g. a trailing ';') used to
        # raise IndexError, and values containing '=' used to be truncated at
        # the second '='.
        if '=' not in cookie:
            continue
        key, value = cookie.split('=', 1)
        cookie_dict[key.strip()] = value
    return cookie_dict
|
||||
|
||||
|
||||
class TwitterSpider(scrapy.Spider):
    # Collects Twitter/X user profile information through the GraphQL API,
    # authenticating with cookies stored in redis.
    name = 'TwitterUserInfoSpider'
    custom_settings = {

        'PROTO_SAVE_FILE_NAME': 'public_twitter_user_info_data_',
        # Avatar / banner images are fetched by the images pipeline.
        'IMAGES_STORE': r'/usr/local/temp_image/twitteruserinfo',
        'IMAGES_RESULT_FIELD': 'avatar_path',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'MediaSpiders.pipelines.TwitterUserDataSaveToMySQL': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            # Disabled so the very long GraphQL URLs are not rejected.
            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
        }

    }
|
||||
|
||||
def __init__(self, params=None, *args, **kwargs):
    """Set crawl defaults, then apply overrides from the optional JSON params string."""
    super(TwitterSpider, self).__init__(*args, **kwargs)
    # Defaults used when the scheduler passes no parameters.
    self.total_num = 100
    self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
    # user_id -> account category tag.
    self.tags = {
        "620632841": "媒体实体",  # NYT Chinese
        "1714100357582770176": "媒体实体",
        "218434058": "官方代表",  # Takaichi Sanae
        "121669059": "媒体实体",  # yonhapnews
        "8149482": "媒体实体",  # VOA Chinese
        "46574977": "媒体实体",  # WSJ Chinese
        "1260553941714186241": "名人",  # "Teacher Li"
        "106379129": "官方代表",  # Lee Jae-myung
    }
    if not params:
        return
    json_params = json.loads(params)
    if 'totalNum' in json_params:
        self.total_num = int(json_params['totalNum'])
    if 'authorization' in json_params:
        self.authorization = json_params['authorization']
    if 'job_id' in json_params:
        self.job_id = json_params['job_id']
|
||||
|
||||
def start_requests(self):
    """Open a throwaway page so scrapy-selenium attaches a driver, then log in."""
    first_request = SeleniumRequest(url='https://www.google.com/', callback=self.login_twitter)
    yield first_request
|
||||
|
||||
def login_twitter(self, response):
    """Authenticate against X/Twitter using cookies stored in redis, then queue
    a GraphQL UserByScreenName request for each configured target account.
    """
    self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                    password=self.settings['REDIS_PWD'])
    self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
    cookie_string = None
    # Fetch the pool of crawler login accounts.
    login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
    # Try automated web login to obtain cookies; on failure, fall back to the
    # cookies already stored in redis.
    # try:
    #
    #     driver = login().login_with_selenium(
    #         'https://x.com/i/flow/login',
    #         self.name,
    #         login_users=login_users,
    #         response=response
    #     )
    #     cookies = driver.get_cookies()
    #     # 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
    #     self.cookie_dict = {}
    #     for cookie in cookies:
    #         self.cookie_dict[cookie['name']] = cookie['value']
    # except Exception as e:
    #     logger.info("自动化获取cookies失败")
    cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
    self.cookie_dict = form_cookie_dict(cookie_string)

    # The 'ct0' cookie doubles as the CSRF token required by the API.
    ct0 = self.cookie_dict.get('ct0')
    if not ct0:
        logger.error("redis中cookie缺失ct0 (CSRF token)!")
        return
    self.header = {
        'Host': 'api.twitter.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'application/json',
        'authorization': self.authorization,
        'Origin': 'https://twitter.com',
        'Cookie': cookie_string,
        'X-Csrf-Token': ct0
    }
    self.filter_key = self.settings['TWITTER_FILTER_KEY']
    self.pid_key = self.settings['TWITTER_PID_KEY']
    url_key = self.redis_client.get("MediaSpiders:Twitter_URL_Key").decode()
    # Query the account service for the list of target users.
    account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
    account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
    post_data = {
        'userType': self.settings['TWITTER_USER_TYPE'],
        'userFlag': 0
    }
    account_rsp = json.loads(
        http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
    all_user_info = []
    if account_rsp['code'] == 200:
        all_user_info = account_rsp['content']
    for user_info in all_user_info:
        # URL-encoded GraphQL UserByScreenName query for this user.
        graphql_url = f'https://x.com/i/api/graphql/-oaLodhGbbnzJBACb1kk2Q/UserByScreenName?variables=%7B%22screen_name%22%3A%22{user_info["userName"]}%22%2C%22withGrokTranslatedBio%22%3Afalse%7D&features=%7B%22hidden_profile_subscriptions_enabled%22%3Atrue%2C%22profile_label_improvements_pcf_label_in_post_enabled%22%3Atrue%2C%22responsive_web_profile_redirect_enabled%22%3Afalse%2C%22rweb_tipjar_consumption_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22subscriptions_verification_info_is_identity_verified_enabled%22%3Atrue%2C%22subscriptions_verification_info_verified_since_enabled%22%3Atrue%2C%22highlights_tweets_tab_ui_enabled%22%3Atrue%2C%22responsive_web_twitter_article_notes_tab_enabled%22%3Atrue%2C%22subscriptions_feature_can_gift_premium%22%3Atrue%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%7D&fieldToggles=%7B%22withPayments%22%3Afalse%2C%22withAuxiliaryUserLabels%22%3Atrue%7D'
        # NOTE(review): the proxy address is hard-coded to a local port —
        # confirm this should not come from settings.
        yield scrapy.Request(url=graphql_url, callback=self.parse,
                             meta={
                                 'uid': user_info['userUid'],
                                 'uname': user_info['userName'],
                                 'proxy': 'http://127.0.0.1:10809',
                             },
                             cookies=self.cookie_dict, headers=self.header)
|
||||
|
||||
def parse(self, response):
|
||||
uid = response.request.meta['uid']
|
||||
uname = response.request.meta['uname']
|
||||
try:
|
||||
rsp = json.loads(response.text)
|
||||
entries = []
|
||||
instructions = rsp['data']['user']['result']
|
||||
item = TwitterUserInfoItem()
|
||||
item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
|
||||
item['is_newest'] = 1
|
||||
item['platform_type'] = "Twitter"
|
||||
item['user_id'] = int(instructions['rest_id'])
|
||||
item['nickname'] = instructions['core']['name']
|
||||
item['username'] = instructions['core']['screen_name']
|
||||
item['user_url'] = f'https://x.com/{uname}'
|
||||
item['user_link'] = f'https://x.com/{uname}'
|
||||
item['avatar_url'] = instructions['avatar']['image_url']
|
||||
item['intro'] = instructions['legacy']['description']
|
||||
item['city'] = instructions.get('location', {}).get('location', '').strip()
|
||||
item['backgroud_image_url'] = instructions.get('legacy', {}).get('profile_banner_url', '')
|
||||
item['image_urls'] = [
|
||||
instructions['avatar']['image_url'],
|
||||
instructions.get('legacy', {}).get('profile_banner_url', '').strip()
|
||||
]
|
||||
try:
|
||||
# 转换为 datetime 对象
|
||||
ts = get_time_stamp(
|
||||
str(instructions['core']['created_at'])) + 8 * 3600 * 1000
|
||||
dt = datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
|
||||
item['join_date'] = dt.strftime('%Y-%m-%d %H:%M:%S') # '2012-06-28 12:25:01'
|
||||
except (ValueError, KeyError) as e:
|
||||
item['join_date'] = None # 或记录日志
|
||||
logger.error('时间转换失败:' + e)
|
||||
item['signature'] = instructions.get('legacy', {}).get('description', '').strip() or instructions.get('profile_bio', {}).get(
|
||||
'description', '').strip()
|
||||
item['post_count'] = instructions['legacy']['statuses_count']
|
||||
item['follow_count'] = instructions['legacy']['friends_count']
|
||||
item['fans_count'] = instructions['legacy']['followers_count']
|
||||
item['is_verified'] = str(instructions['is_blue_verified'])
|
||||
item['tags'] = self.tags[uid]
|
||||
|
||||
verified_type = instructions.get('verification', {}).get('verified_type', None) # 认证类型
|
||||
yield item
|
||||
except:
|
||||
self.logger.error("解析response错误")
|
||||
@ -1,9 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import logging as logger
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from urllib import parse
|
||||
|
||||
import redis
|
||||
@ -12,7 +10,18 @@ from scrapy_selenium import SeleniumRequest
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.utils.http_utils import http_post
|
||||
from MediaSpiders.utils.login_utils import login
|
||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
||||
|
||||
|
||||
def form_cookie_dict(cookie_string):
|
||||
cookie_string_list = cookie_string.split(';')
|
||||
cookie_dict = {}
|
||||
for cookie in cookie_string_list:
|
||||
key = cookie.split('=')[0].replace(' ', '')
|
||||
cookie_dict[key] = cookie.split('=')[1]
|
||||
return cookie_dict
|
||||
|
||||
|
||||
class TwitterSpider(scrapy.Spider):
|
||||
@ -26,8 +35,8 @@ class TwitterSpider(scrapy.Spider):
|
||||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||||
'FILES_STORE': r'/usr/local/videos',
|
||||
'FILES_RESULT_FIELD': 'es_video',
|
||||
'ZIP_FILE_NAME': 'image_data_publicinfo_',
|
||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
||||
'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称
|
||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称
|
||||
'ITEM_PIPELINES': {
|
||||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||||
'scrapy.pipelines.files.FilesPipeline': 1,
|
||||
@ -38,6 +47,7 @@ class TwitterSpider(scrapy.Spider):
|
||||
# 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
|
||||
# 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
|
||||
# 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
|
||||
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
|
||||
}
|
||||
|
||||
}
|
||||
@ -62,59 +72,45 @@ class TwitterSpider(scrapy.Spider):
|
||||
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
||||
password=self.settings['REDIS_PWD'])
|
||||
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
||||
logger.info("login twitter")
|
||||
driver = response.request.meta['driver']
|
||||
driver.maximize_window()
|
||||
driver.get('https://twitter.com/i/flow/login')
|
||||
time.sleep(5)
|
||||
# 获取采集登录账号并登录
|
||||
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
||||
user_list = []
|
||||
for u in login_users:
|
||||
user_list.append(json.loads(u.decode()))
|
||||
login_user = random.choice(user_list)
|
||||
logger.info(f"login as user {login_user['uid']}")
|
||||
driver.find_element_by_xpath("//input").send_keys(login_user['uid'])
|
||||
try:
|
||||
next_button = driver.find_element_by_xpath("//div[@role='button'][2]")
|
||||
next_button.click()
|
||||
except Exception:
|
||||
logger.info("点击“下一步”的button元素")
|
||||
next_button = driver.find_element_by_xpath("//button[@role='button'][2]")
|
||||
next_button.click()
|
||||
time.sleep(5)
|
||||
try:
|
||||
logger.info("输入手机号验证...")
|
||||
driver.find_element_by_xpath("//input[@name='text']").send_keys("+8619962025641")
|
||||
driver.find_element_by_xpath("//button[@data-testid='ocfEnterTextNextButton']").click()
|
||||
time.sleep(5)
|
||||
except Exception:
|
||||
logger.info("无需输入手机号验证")
|
||||
driver.find_element_by_xpath("//input[@name='password']").send_keys(login_user['pwd'])
|
||||
driver.find_element_by_xpath("//button[@data-testid='LoginForm_Login_Button']").click()
|
||||
time.sleep(5)
|
||||
try:
|
||||
driver.find_element_by_xpath("//button[@data-testid='confirmationSheetConfirm']").click()
|
||||
time.sleep(10)
|
||||
except:
|
||||
time.sleep(5)
|
||||
cookies = driver.get_cookies()
|
||||
# cookies = json.loads(response.text)['cookies']
|
||||
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
||||
self.cookie_dict = {}
|
||||
for cookie in cookies:
|
||||
self.cookie_dict[cookie['name']] = cookie['value']
|
||||
# 从redis中 使用已有cookies,否则自动化登录网页获取cookies
|
||||
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
||||
ct0 = None
|
||||
if cookie_string:
|
||||
self.cookie_dict = form_cookie_dict(cookie_string)
|
||||
# 5. 构建 headers
|
||||
ct0 = self.cookie_dict.get('ct0')
|
||||
if not ct0:
|
||||
logger.error("redis中cookie缺失ct0 (CSRF token)!")
|
||||
return
|
||||
else:
|
||||
try:
|
||||
|
||||
driver = login().login_with_selenium(
|
||||
'https://x.com/i/flow/login',
|
||||
self.name,
|
||||
login_users=login_users,
|
||||
response=response
|
||||
)
|
||||
cookies = driver.get_cookies()
|
||||
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
||||
self.cookie_dict = {}
|
||||
for cookie in cookies:
|
||||
self.cookie_dict[cookie['name']] = cookie['value']
|
||||
except Exception as e:
|
||||
logger.info("自动化获取cookies失败")
|
||||
|
||||
self.header = {
|
||||
'Host': 'api.twitter.com',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
|
||||
'Accept': '*/*',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
|
||||
'content-type': 'application/json',
|
||||
'authorization': self.authorization,
|
||||
# 'x-twitter-active-user': 'yes',
|
||||
'Origin': 'https://twitter.com',
|
||||
'Connection': 'keep-alive',
|
||||
'X-Csrf-Token': self.cookie_dict['ct0']
|
||||
'Cookie': cookie_string,
|
||||
'X-Csrf-Token': ct0
|
||||
}
|
||||
self.filter_key = self.settings['TWITTER_FILTER_KEY']
|
||||
self.pid_key = self.settings['TWITTER_PID_KEY']
|
||||
@ -131,9 +127,14 @@ class TwitterSpider(scrapy.Spider):
|
||||
if account_rsp['code'] == 200:
|
||||
all_user_info = account_rsp['content']
|
||||
for user_info in all_user_info:
|
||||
graphql_url = f'https://twitter.com/i/api/graphql/{url_key}/UserTweets?variables=%7B%22userId%22%3A%22{user_info["userUid"]}%22%2C%22count%22%3A20%2C%22includePromotedContent%22%3Atrue%2C%22withQuickPromoteEligibilityTweetFields%22%3Atrue%2C%22withVoice%22%3Atrue%2C%22withV2Timeline%22%3Atrue%7D&features=%7B%22rweb_lists_timeline_redesign_enabled%22%3Atrue%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22tweetypie_unmention_optimization_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3Atrue%2C%22view_counts_everywhere_api_enabled%22%3Atrue%2C%22longform_notetweets_consumption_enabled%22%3Atrue%2C%22responsive_web_twitter_article_tweet_consumption_enabled%22%3Afalse%2C%22tweet_awards_web_tipping_enabled%22%3Afalse%2C%22freedom_of_speech_not_reach_fetch_enabled%22%3Atrue%2C%22standardized_nudges_misinfo%22%3Atrue%2C%22tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled%22%3Atrue%2C%22longform_notetweets_rich_text_read_enabled%22%3Atrue%2C%22longform_notetweets_inline_media_enabled%22%3Atrue%2C%22responsive_web_media_download_video_enabled%22%3Afalse%2C%22responsive_web_enhance_cards_enabled%22%3Afalse%7D&fieldToggles=%7B%22withArticleRichContentState%22%3Afalse%7D'
|
||||
yield scrapy.Request(graphql_url, callback=self.parse,
|
||||
meta={'uid': user_info['userUid'], 'currentCount': 0},
|
||||
graphql_url = f'https://x.com/i/api/graphql/{url_key}/UserTweets?variables=%7B%22userId%22%3A%22{user_info["userUid"]}%22%2C%22count%22%3A20%2C%22includePromotedContent%22%3Atrue%2C%22withQuickPromoteEligibilityTweetFields%22%3Atrue%2C%22withVoice%22%3Atrue%7D&features=%7B%22rweb_video_screen_enabled%22%3Afalse%2C%22profile_label_improvements_pcf_label_in_post_enabled%22%3Atrue%2C%22responsive_web_profile_redirect_enabled%22%3Afalse%2C%22rweb_tipjar_consumption_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22premium_content_api_read_enabled%22%3Afalse%2C%22communities_web_enable_tweet_community_results_fetch%22%3Atrue%2C%22c9s_tweet_anatomy_moderator_badge_enabled%22%3Atrue%2C%22responsive_web_grok_analyze_button_fetch_trends_enabled%22%3Afalse%2C%22responsive_web_grok_analyze_post_followups_enabled%22%3Atrue%2C%22responsive_web_jetfuel_frame%22%3Atrue%2C%22responsive_web_grok_share_attachment_enabled%22%3Atrue%2C%22responsive_web_grok_annotations_enabled%22%3Afalse%2C%22articles_preview_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3Atrue%2C%22view_counts_everywhere_api_enabled%22%3Atrue%2C%22longform_notetweets_consumption_enabled%22%3Atrue%2C%22responsive_web_twitter_article_tweet_consumption_enabled%22%3Atrue%2C%22tweet_awards_web_tipping_enabled%22%3Afalse%2C%22responsive_web_grok_show_grok_translated_post%22%3Afalse%2C%22responsive_web_grok_analysis_button_from_backend%22%3Atrue%2C%22post_ctas_fetch_enabled%22%3Afalse%2C%22creator_subscriptions_quote_tweet_preview_enabled%22%3Afalse%2C%22freedom_of_speech_not_reach_fetch_enabled%22%3Atrue%2C%22standardized_nudges_misinfo%22%3Atrue%2C%22tweet_with_visibility_results_prefer_gql_limited_actions_p
olicy_enabled%22%3Atrue%2C%22longform_notetweets_rich_text_read_enabled%22%3Atrue%2C%22longform_notetweets_inline_media_enabled%22%3Atrue%2C%22responsive_web_grok_image_annotation_enabled%22%3Atrue%2C%22responsive_web_grok_imagine_annotation_enabled%22%3Atrue%2C%22responsive_web_grok_community_note_auto_translation_is_enabled%22%3Afalse%2C%22responsive_web_enhance_cards_enabled%22%3Afalse%7D&fieldToggles=%7B%22withArticlePlainText%22%3Afalse%7D'
|
||||
|
||||
yield scrapy.Request(url=graphql_url, callback=self.parse,
|
||||
meta={
|
||||
'uid': user_info['userUid'],
|
||||
# 'proxy': 'http://127.0.0.1:10808',
|
||||
'currentCount': 0
|
||||
},
|
||||
cookies=self.cookie_dict, headers=self.header)
|
||||
|
||||
def parse(self, response):
|
||||
@ -164,34 +165,65 @@ class TwitterSpider(scrapy.Spider):
|
||||
result = entry['content']['itemContent']['tweet_results']['result']
|
||||
item['es_userid'] = result['core']['user_results']['result']['rest_id']
|
||||
item['es_hkey'] = result['rest_id']
|
||||
item['es_district'] = result['core']['user_results']['result']['legacy']['location']
|
||||
screen_name = result['core']['user_results']['result']['legacy']['screen_name']
|
||||
user_name = result['core']['user_results']['result']['legacy']['name']
|
||||
item['es_urlname'] = 'https://twitter.com/%s/status/%s' % (screen_name, result['rest_id'])
|
||||
item['es_district'] = result['core']['user_results']['result']['location']
|
||||
screen_name = result['core']['user_results']['result']['core']['screen_name']
|
||||
user_name = result['core']['user_results']['result']['core']['name']
|
||||
item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id'])
|
||||
item['es_authors'] = screen_name
|
||||
item['es_extname'] = user_name
|
||||
|
||||
device_html = result['source']
|
||||
device_type = re.search(r'>([^<]+)</a>', device_html).group(1)
|
||||
|
||||
legacy = result['legacy']
|
||||
author_full_text = legacy['full_text']
|
||||
created_at = legacy['created_at']
|
||||
# 评论、转发、点赞数量
|
||||
item['es_commentcount'] = legacy['reply_count']
|
||||
item['es_forwardcount'] = legacy['retweet_count']
|
||||
item['es_likecount'] = legacy['favorite_count']
|
||||
# 评论+ 转发+ 点赞数量 TODO
|
||||
interaction_count = legacy['reply_count'] + legacy['retweet_count'] + legacy['favorite_count']
|
||||
# 语种
|
||||
lang = legacy['lang']
|
||||
# 推文话题 、 提及
|
||||
topic = legacy['entities']['hashtags']
|
||||
mentions = legacy['entities']['user_mentions']
|
||||
|
||||
item['es_lasttime'] = get_current_timestamp()
|
||||
item['es_loadtime'] = get_current_timestamp()
|
||||
item['es_urltime'] = get_time_stamp(
|
||||
str(created_at)) + 8 * 3600 * 1000 # TW默认使用的是零时区,转换为北京时间
|
||||
if 'quoted_status_result' in result:
|
||||
item['es_isrepost'] = 'yes'
|
||||
item['es_isrepost'] = '1'
|
||||
item['es_urltitle'] = author_full_text
|
||||
item['es_catalog1'] = author_full_text
|
||||
# 判断是否需要翻译
|
||||
if needs_translation(author_full_text):
|
||||
item['es_catalog2'] = translate_single(author_full_text)
|
||||
else:
|
||||
item['es_catalog2'] = ''
|
||||
legacy = result['quoted_status_result']['result']['legacy']
|
||||
original_tweet = result['quoted_status_result']['result']['rest_id']
|
||||
self.logger.info('采集引用推文原文信息')
|
||||
elif 'retweeted_status_result' in legacy:
|
||||
item['es_isrepost'] = 'yes'
|
||||
item['es_isrepost'] = '1'
|
||||
legacy = legacy['retweeted_status_result']['result']['legacy']
|
||||
original_tweet = result['retweeted_status_result']['result']['rest_id']
|
||||
self.logger.info('采集转发推文原文信息')
|
||||
item['es_content'] = legacy['full_text']
|
||||
else:
|
||||
item['es_isrepost'] = '0'
|
||||
original_tweet = ''
|
||||
self.logger.info('采集原文信息')
|
||||
|
||||
item['es_urlcontent'] = legacy['full_text']
|
||||
# 获取文本
|
||||
url_content = legacy['full_text']
|
||||
# 判断是否需要翻译
|
||||
if needs_translation(url_content):
|
||||
item['es_content'] = translate_content_with_paragraphs(url_content)
|
||||
else:
|
||||
item['es_content'] = ''
|
||||
# 下载图片
|
||||
image_url_list = []
|
||||
if 'entities' in legacy and 'media' in legacy['entities']:
|
||||
|
||||
@ -0,0 +1,309 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import logging as logger
|
||||
import random
|
||||
import time
|
||||
from math import ceil
|
||||
|
||||
import redis
|
||||
import requests
|
||||
import scrapy
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
|
||||
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
|
||||
WECHAT_USER_TYPE
|
||||
from MediaSpiders.utils.http_utils import http_post, UA
|
||||
from MediaSpiders.utils.time_utils import get_current_timestamp
|
||||
|
||||
|
||||
class WechatLinksFetcherSpider(scrapy.Spider):
|
||||
name = 'WechatLinksFetcherSpider'
|
||||
|
||||
custom_settings = {
|
||||
'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
|
||||
'PROTO_CLASS_NAME': 'EsSets',
|
||||
'PROTO_FIELD_NAME': 'Es',
|
||||
'PROTO_SAVE_FILE_NAME': 'public_info_data_',
|
||||
'IMAGES_STORE': r'/usr/local/temp_image/twitter',
|
||||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||||
'FILES_STORE': r'/usr/local/videos',
|
||||
'FILES_RESULT_FIELD': 'es_video',
|
||||
'ZIP_FILE_NAME': 'image_data_ship_',
|
||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
||||
'ITEM_PIPELINES': {
|
||||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||||
'scrapy.pipelines.files.FilesPipeline': 1,
|
||||
'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
|
||||
},
|
||||
'SPIDER_MIDDLEWARES': {
|
||||
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
|
||||
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
|
||||
}
|
||||
}
|
||||
|
||||
# 常量定义
|
||||
PAGE_LOAD_TIMEOUT = 10
|
||||
ELEMENT_WAIT_TIMEOUT = 5
|
||||
MAX_NEWS_PER_HOT = 6
|
||||
MAX_HOT_ITEMS = 10
|
||||
|
||||
# 需要过滤的文本模式
|
||||
SKIP_PATTERNS = ['版权', '声明', '邮箱', '记者', '编辑', '来源', '投稿', '责任编辑']
|
||||
|
||||
def __init__(self, params=None, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.url_time = get_current_timestamp()
|
||||
self.total_num = 0
|
||||
self.authorization = None
|
||||
self.job_id = None
|
||||
self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD)
|
||||
|
||||
if params:
|
||||
try:
|
||||
json_params = json.loads(params)
|
||||
self.total_num = int(json_params.get('totalNum', 0))
|
||||
self.authorization = json_params.get('authorization')
|
||||
self.job_id = json_params.get('job_id')
|
||||
except (json.JSONDecodeError, ValueError) as e:
|
||||
self.logger.error(f"解析参数失败: {e}")
|
||||
|
||||
def start_requests(self):
|
||||
"""开始请求"""
|
||||
yield SeleniumRequest(
|
||||
url='https://mp.weixin.qq.com/',
|
||||
callback=self.parse,
|
||||
)
|
||||
|
||||
def parse(self, response):
|
||||
driver = response.request.meta['driver']
|
||||
cookies_key = "MediaSpiders:WeChatLinksFetcher_Cookies"
|
||||
cookie_list = self.redis_client.lrange(cookies_key, 0, -1)
|
||||
cookie_parts = [
|
||||
item.decode('utf-8') if isinstance(item, bytes) else str(item)
|
||||
for item in cookie_list
|
||||
]
|
||||
|
||||
# 遍历cookies,记录当前索引
|
||||
for cookie_index, item in enumerate(cookie_parts):
|
||||
try:
|
||||
driver.delete_all_cookies()
|
||||
driver.get('https://mp.weixin.qq.com/')
|
||||
time.sleep(2)
|
||||
|
||||
cookie_string = item
|
||||
cookie_dict = parse_cookie_string(cookie_string)
|
||||
|
||||
success_count = 0
|
||||
for name, value in cookie_dict.items():
|
||||
if add_cookie_smart(driver, name, value):
|
||||
success_count += 1
|
||||
else:
|
||||
logger.warning(f"跳过 cookie: {name}")
|
||||
|
||||
logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie (索引: {cookie_index})")
|
||||
|
||||
# 验证 cookie 是否有效
|
||||
driver.refresh()
|
||||
time.sleep(5)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"使用 cookie 登录时出错: {str(e)}")
|
||||
continue
|
||||
|
||||
count_per_account = 200
|
||||
total_count = 0
|
||||
break_flag = False
|
||||
|
||||
token_index = driver.current_url.rfind('token=')
|
||||
token = driver.current_url[token_index + 6:]
|
||||
logger.info(f'获取 token 成功!当前 token 为 {token}')
|
||||
raw_cookies = driver.get_cookies()
|
||||
cookies = {}
|
||||
for c in raw_cookies:
|
||||
cookies[c['name']] = c['value']
|
||||
logger.info(f'获取 cookie 成功!')
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
|
||||
'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
|
||||
f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
|
||||
}
|
||||
query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
|
||||
post_body = {
|
||||
'userType': WECHAT_USER_TYPE,
|
||||
'userFlag': 0
|
||||
}
|
||||
account_rsp = json.loads(
|
||||
http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
|
||||
official_accounts = []
|
||||
if account_rsp['code'] == 200:
|
||||
official_accounts = account_rsp['content'][:10]
|
||||
for account_line in official_accounts:
|
||||
try:
|
||||
if break_flag:
|
||||
break
|
||||
start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000)
|
||||
if 'updateTime' in account_line:
|
||||
start_timestamp = account_line['updateTime']
|
||||
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000))
|
||||
account = account_line['userName']
|
||||
search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \
|
||||
f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1'
|
||||
logger.info(f"开始搜索公众号“{account}”...")
|
||||
time.sleep(3 + random.random())
|
||||
response = requests.get(search_account_api, cookies=cookies, headers=headers)
|
||||
rsp_body = json.loads(response.text)
|
||||
index_end = ceil(count_per_account / 5)
|
||||
if 'list' in rsp_body:
|
||||
matched_account = {}
|
||||
matched_account_flag = False
|
||||
for item in rsp_body['list']:
|
||||
if item['nickname'] == account:
|
||||
matched_account_flag = True
|
||||
matched_account = item
|
||||
break
|
||||
if not matched_account_flag:
|
||||
logger.info(f"未找到公众号“{account}”")
|
||||
continue
|
||||
fake_id = matched_account['fakeid']
|
||||
update_time_flag = True # 用于记录获取到的历史列表是否已经超出最早的时间限制
|
||||
next_start_timestamp = int(time.time() * 1000)
|
||||
for index in range(index_end):
|
||||
if update_time_flag:
|
||||
if next_start_timestamp - start_timestamp < 12 * 3600 * 1000:
|
||||
logger.info(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接,本次获取结束")
|
||||
break_flag = True
|
||||
else:
|
||||
fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
|
||||
f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
|
||||
f'&lang=zh_CN&f=json&ajax=1'
|
||||
logger.info(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
|
||||
time.sleep(3 + random.random())
|
||||
article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
|
||||
article_rsp_body = json.loads(article_response.text)
|
||||
if 'app_msg_list' in article_rsp_body:
|
||||
for article in article_rsp_body['app_msg_list']:
|
||||
title = article['title']
|
||||
link = article['link']
|
||||
update_time = article['update_time'] * 1000
|
||||
if update_time > start_timestamp:
|
||||
total_count += 1
|
||||
time_str = time.strftime("%Y-%m-%d %H:%M:%S",
|
||||
time.localtime(update_time / 1000))
|
||||
logger.info(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
|
||||
f"发表的文章《{title}》,链接地址:{link}")
|
||||
self.redis_client.sadd(
|
||||
f"MediaSpiders:Wechat_links:{account_line['id']}",
|
||||
link)
|
||||
else:
|
||||
update_time_flag = False
|
||||
break
|
||||
else:
|
||||
logger.info(json.dumps(article_rsp_body, ensure_ascii=False))
|
||||
if 'base_resp' in article_rsp_body:
|
||||
err_msg = article_rsp_body['base_resp']['err_msg']
|
||||
if err_msg == "freq control" or err_msg == "invalid session":
|
||||
logger.info("接口频率限制,稍后再试,本次获取结束")
|
||||
break_flag = True
|
||||
|
||||
# 删除当前使用的cookie
|
||||
self._remove_invalid_cookie(cookies_key, cookie_index)
|
||||
break
|
||||
|
||||
if not break_flag:
|
||||
# 本循环内,只有12小时内扫过码以及接口频率限制退出,会导致 break_flag 为 True,这两种情况都不需要更新扫码状态
|
||||
next_start_time = time.strftime("%Y-%m-%d %H:%M:%S",
|
||||
time.localtime(next_start_timestamp / 1000))
|
||||
account_line['updateTime'] = next_start_timestamp
|
||||
http_post(SOCIAL_USER_UPDATE_API,
|
||||
data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
|
||||
headers={'User-Agent': UA, "Content-Type": "application/json"}
|
||||
)
|
||||
logger.info(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
|
||||
else:
|
||||
logger.info(json.dumps(rsp_body, ensure_ascii=False))
|
||||
if 'base_resp' in rsp_body:
|
||||
if rsp_body['base_resp']['err_msg'] == "freq control":
|
||||
logger.info("接口频率限制,稍后再试,本次获取结束")
|
||||
break_flag = True
|
||||
|
||||
# 删除当前使用的cookie
|
||||
self._remove_invalid_cookie(cookies_key, cookie_index)
|
||||
break
|
||||
except Exception as e:
|
||||
logger.info(repr(e))
|
||||
self.redis_client.close()
|
||||
driver.quit()
|
||||
|
||||
def _remove_invalid_cookie(self, cookies_key, cookie_index):
|
||||
"""删除无效的cookie"""
|
||||
try:
|
||||
# 方法1:标记并删除
|
||||
self.redis_client.lset(cookies_key, cookie_index, "__invalid__")
|
||||
self.redis_client.lrem(cookies_key, 1, "__invalid__")
|
||||
logger.info(f"已删除无效的cookie,索引: {cookie_index}")
|
||||
|
||||
# 方法2:或者直接删除整个列表(如果cookie全部无效)
|
||||
# cookie_count = self.redis_client.llen(cookies_key)
|
||||
# if cookie_count <= 1:
|
||||
# self.redis_client.delete(cookies_key)
|
||||
# logger.info(f"已删除所有cookies: {cookies_key}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"删除cookie失败: {e}")
|
||||
|
||||
|
||||
def parse_cookie_string(cookie_str):
|
||||
"""解析 cookie 字符串为 dict"""
|
||||
cookie_dict = {}
|
||||
for item in cookie_str.split(';'):
|
||||
if '=' in item:
|
||||
name, value = item.split('=', 1)
|
||||
cookie_dict[name.strip()] = value.strip()
|
||||
return cookie_dict
|
||||
|
||||
|
||||
def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
|
||||
"""
|
||||
智能添加 cookie:先试目标域名,失败则试父域,再失败则跳过
|
||||
"""
|
||||
# 微信核心 cookie 必须用 mp.weixin.qq.com
|
||||
wechat_critical = ['wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token']
|
||||
|
||||
# 腾讯通用 cookie 可尝试父域
|
||||
tencent_common = ['ptui_loginuin', 'RK', 'ptcz', 'ua_id']
|
||||
|
||||
# 策略 1: 核心 cookie → 精确域名
|
||||
if name in wechat_critical:
|
||||
domains_to_try = [target_domain]
|
||||
# 策略 2: 腾讯通用 cookie → 先试目标域,再试父域
|
||||
elif name in tencent_common:
|
||||
domains_to_try = [target_domain, '.weixin.qq.com', '.qq.com']
|
||||
# 策略 3: 其他 cookie → 默认 host-only(不传 domain)
|
||||
else:
|
||||
domains_to_try = [None, target_domain]
|
||||
|
||||
for domain in domains_to_try:
|
||||
cookie = {
|
||||
'name': name,
|
||||
'value': value,
|
||||
'path': '/',
|
||||
'secure': True
|
||||
}
|
||||
if domain:
|
||||
cookie['domain'] = domain
|
||||
|
||||
try:
|
||||
driver.add_cookie(cookie)
|
||||
# logger.debug(f"✓ {name} added with domain={domain or 'host-only'}")
|
||||
return True
|
||||
except Exception as e:
|
||||
if 'invalid cookie domain' in str(e):
|
||||
continue # 尝试下一个 domain
|
||||
else:
|
||||
# logger.warning(f"✗ {name} failed: {e}")
|
||||
return False
|
||||
return False # 所有 domain 都失败
|
||||
|
||||
|
||||
|
||||
@ -2,10 +2,12 @@
|
||||
import json
|
||||
import time
|
||||
|
||||
import redis
|
||||
import scrapy
|
||||
from redisbloom.client import Client
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.spiders.TwitterUserSpider import form_cookie_dict
|
||||
from MediaSpiders.utils.http_utils import http_post
|
||||
from MediaSpiders.utils.string_utils import find_text
|
||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||
@ -59,6 +61,11 @@ class WeiboSpider(scrapy.Spider):
|
||||
account_rsp = json.loads(
|
||||
http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
|
||||
self.simhash_filter_key = self.settings['WEIBO_SIMHASH_FILTER_KEY']
|
||||
# 从 redis 中 获取 微博所需的 cookie
|
||||
cookie_string = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
||||
password=self.settings['REDIS_PWD']).get("MediaSpiders:Weibo_Cookies").decode()
|
||||
self.cookie_dict = form_cookie_dict(cookie_string)
|
||||
|
||||
all_user_info = []
|
||||
if account_rsp['code'] == 200:
|
||||
all_user_info = account_rsp['content']
|
||||
@ -67,7 +74,10 @@ class WeiboSpider(scrapy.Spider):
|
||||
if uid[:6] != '107603':
|
||||
uid = f'107603{uid}'
|
||||
yield scrapy.Request('https://m.weibo.cn/api/container/getIndex?containerid=%s' % uid,
|
||||
callback=self.parse, meta={'currentCount': 0, 'uid': uid})
|
||||
callback=self.parse,
|
||||
meta={'currentCount': 0, 'uid': uid},
|
||||
cookies=self.cookie_dict
|
||||
)
|
||||
|
||||
def parse(self, response):
|
||||
rsp = json.loads(response.text)
|
||||
|
||||
@ -2,3 +2,5 @@
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
|
||||
|
||||
|
||||
@ -1,7 +1,11 @@
|
||||
import scrapy
|
||||
import json
|
||||
from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
|
||||
|
||||
import redis
|
||||
import scrapy
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
|
||||
from MediaSpiders.utils.time_utils import get_current_timestamp
|
||||
|
||||
|
||||
class HotSearchSpider(scrapy.Spider):
|
||||
@ -22,42 +26,68 @@ class HotSearchSpider(scrapy.Spider):
|
||||
'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
|
||||
# 'MediaSpiders.pipelines.HotSearchSaveToMySQL': 300
|
||||
},
|
||||
'SPIDER_MIDDLEWARES': {},
|
||||
'SPIDER_MIDDLEWARES': {
|
||||
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
|
||||
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
|
||||
},
|
||||
'DOWNLOADER_MIDDLEWARES': {},
|
||||
'BATCH_SAVE_SIZE': 50
|
||||
}
|
||||
|
||||
start_urls = [
|
||||
'https://weibo.com/ajax/side/hotSearch',
|
||||
# 'https://weibo.com/ajax/side/hotSearch',
|
||||
'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
|
||||
]
|
||||
|
||||
def __init__(self, params=None, *args, **kwargs):
|
||||
super(HotSearchSpider, self).__init__(*args, **kwargs)
|
||||
self.job_id = None
|
||||
self.collected_items = []
|
||||
if params:
|
||||
json_params = json.loads(params)
|
||||
if 'job_id' in json_params:
|
||||
self.job_id = json_params['job_id']
|
||||
try:
|
||||
json_params = json.loads(params)
|
||||
if 'job_id' in json_params:
|
||||
self.job_id = json_params['job_id']
|
||||
if 'max_items' in json_params:
|
||||
self.max_items = int(json_params['max_items'])
|
||||
except Exception as e:
|
||||
self.logger.error(f"解析参数失败: {str(e)}")
|
||||
|
||||
def start_requests(self):
|
||||
"""发起初始请求"""
|
||||
self.logger.info(f"开始爬取热搜数据,任务ID: {self.job_id if self.job_id else 'N/A'}")
|
||||
self.url_time = get_current_timestamp()
|
||||
for url in self.start_urls:
|
||||
yield scrapy.Request(
|
||||
url=url,
|
||||
callback=self.parse
|
||||
)
|
||||
|
||||
def parse(self, response):
|
||||
result_array = []
|
||||
if 'weibo.com' in response.url:
|
||||
result_array = parse_weibo_response(response.text)
|
||||
elif 'toutiao.com' in response.url:
|
||||
result_array = parse_toutiao_response(response.text)
|
||||
try:
|
||||
if 'weibo.com' in response.url:
|
||||
result_array = parse_weibo_response(response.text)
|
||||
elif 'toutiao.com' in response.url:
|
||||
result_array = parse_toutiao_response(response.text)
|
||||
|
||||
for line in result_array:
|
||||
hot_search_item = MediaspidersItem()
|
||||
hot_search_item['es_carriertype'] = 'hot_search'
|
||||
hot_search_item['es_sid'] = line['id']
|
||||
hot_search_item['es_hkey'] = line['hot_id']
|
||||
hot_search_item['es_urltitle'] = line['hot_word']
|
||||
hot_search_item['es_urlcontent'] = line['hot_word']
|
||||
hot_search_item['es_heat'] = line['hot_value']
|
||||
hot_search_item['es_catalog'] = line['category']
|
||||
hot_search_item['es_simrank'] = line['realtime_rank']
|
||||
hot_search_item['es_sitename'] = line['platform']
|
||||
hot_search_item['es_urltime'] = line['onboard_time']
|
||||
hot_search_item['es_lasttime'] = line['crawl_time']
|
||||
hot_search_item['es_urlname'] = line['fake_url']
|
||||
yield hot_search_item
|
||||
for line in result_array:
|
||||
hot_search_item = MediaspidersItem()
|
||||
hot_search_item['es_carriertype'] = 'hot_search'
|
||||
hot_search_item['es_sid'] = line['id']
|
||||
hot_search_item['es_hkey'] = line['hot_id']
|
||||
hot_search_item['es_urltitle'] = line['hot_word']
|
||||
hot_search_item['es_urlcontent'] = line['hot_word']
|
||||
hot_search_item['es_heat'] = line['hot_value']
|
||||
hot_search_item['es_catalog'] = line['category']
|
||||
hot_search_item['es_simrank'] = line['realtime_rank']
|
||||
hot_search_item['es_sitename'] = line['platform']
|
||||
hot_search_item['es_urltime'] = line['onboard_time']
|
||||
hot_search_item['es_lasttime'] = line['crawl_time']
|
||||
hot_search_item['es_urlname'] = line['fake_url'] + "&news"
|
||||
|
||||
yield hot_search_item
|
||||
|
||||
|
||||
except Exception as e:
|
||||
self.logger.exception(f"解析异常: {str(e)}")
|
||||
|
||||
@ -87,7 +87,9 @@ def get_format_time(pattern, time_str):
|
||||
date = result.group(1)
|
||||
time_t = result.group(2)
|
||||
date = date.replace('/', '-').replace(".", "-").replace(
|
||||
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(' ', '-').replace('--', '-')
|
||||
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(
|
||||
"년", "-").replace("월", "-").replace("일", "").replace(
|
||||
' ', '-').replace('--', '-')
|
||||
date_array = date.split('-')
|
||||
for i in range(len(date_array)):
|
||||
if (date_array[i].endswith('st') or
|
||||
@ -128,7 +130,7 @@ def get_format_time(pattern, time_str):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
||||
a = ['06.10.2023 03:24']
|
||||
a = ['2026년 1월 6일 화요일 1면 [사진있음]']
|
||||
for _ in a:
|
||||
print(get_time_stamp(_))
|
||||
# print(get_time_stamp(_, {r"(\d{4}年\d{1,2}月\d{2}日)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))
|
||||
# print(get_time_stamp(_))
|
||||
print(get_time_stamp(_, {r"(\d{4}년 \d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))
|
||||
|
||||
@ -2,11 +2,13 @@ import json
|
||||
import uuid
|
||||
import logging
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from MediaSpiders.utils.string_utils import get_str_md5
|
||||
|
||||
|
||||
def parse_weibo_response(rsp_str):
|
||||
rsp_body = json.loads(rsp_str)
|
||||
def parse_weibo_response(rsp_body):
|
||||
result_array = []
|
||||
if rsp_body['ok'] == 1:
|
||||
realtime_data = rsp_body['data']['realtime']
|
||||
@ -56,7 +58,7 @@ def parse_toutiao_response(rsp_str):
|
||||
"platform": "今日头条",
|
||||
"onboard_time": current_timestamp,
|
||||
"crawl_time": current_timestamp,
|
||||
"fake_url": f"https://www.toutiao.com/hot-event/hot-board/{custom_sid}"
|
||||
"fake_url": line['Url']
|
||||
}
|
||||
if 'InterestCategory' in line:
|
||||
result_line['category'] = ",".join(line['InterestCategory'])
|
||||
@ -66,6 +68,11 @@ def parse_toutiao_response(rsp_str):
|
||||
logging.info(json.dumps(line, ensure_ascii=False))
|
||||
return result_array
|
||||
|
||||
def url_response(url):
|
||||
rsp_str = requests.get(url).text
|
||||
return json.loads(rsp_str)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# rsp_file = open("./toutiao_hot_search.json", 'r', encoding='utf-8')
|
||||
|
||||
170
spiders/MediaSpiders/MediaSpiders/utils/login_utils.py
Normal file
170
spiders/MediaSpiders/MediaSpiders/utils/login_utils.py
Normal file
@ -0,0 +1,170 @@
|
||||
import random
|
||||
import json
|
||||
import time
|
||||
import logging as logger
|
||||
|
||||
from selenium.webdriver import ActionChains
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.common.by import By
|
||||
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
|
||||
WECHAT_USER_TYPE
|
||||
from MediaSpiders.utils.http_utils import http_post, UA
|
||||
|
||||
class login:
|
||||
def __init__(self):
|
||||
self.name = None
|
||||
self.url = None
|
||||
|
||||
|
||||
|
||||
def login_with_selenium(self, login_url, site_name, login_users=None, response=None, drivers=None):
|
||||
"""
|
||||
使用 Selenium 自动登录指定站点(从 Redis 账号池中随机选一个账号)
|
||||
|
||||
:param driver: Selenium WebDriver 实例
|
||||
:param login_url: 登录页面 URL
|
||||
:param site_name: 站点名称(如 'Facebook')
|
||||
:param login_users: Redis 客户端获取的账号密码
|
||||
"""
|
||||
self.name = site_name
|
||||
self.url = login_url
|
||||
logger.info(f"Starting login to {site_name}...")
|
||||
if response is not None:
|
||||
driver = response.request.meta['driver']
|
||||
elif drivers is not None:
|
||||
driver = drivers
|
||||
|
||||
if login_users is not None:
|
||||
# 解析 redis 中 账号密码
|
||||
user_list = [json.loads(u.decode()) for u in login_users]
|
||||
self.login_user = random.choice(user_list)
|
||||
|
||||
if self.name == 'FacebookUserSpider':
|
||||
self.facebook_login(driver)
|
||||
elif self.name == 'TwitterUserSpider' or self.name == 'TwitterUserInfoSpider' :
|
||||
self.twitter_login(driver)
|
||||
elif self.name == 'wechat_links_fetcher':
|
||||
self.wechat_links_login(driver)
|
||||
|
||||
|
||||
time.sleep(10) # 等待登录完成(可优化为显式等待)
|
||||
|
||||
return driver
|
||||
|
||||
"""
|
||||
FaceBook 登录 获取cookie
|
||||
"""
|
||||
def facebook_login(self, driver):
|
||||
# 打开登录页
|
||||
driver.maximize_window() # 注意:原代码中有空格!
|
||||
time.sleep(3)
|
||||
driver.get(self.url)
|
||||
|
||||
driver.find_element_by_xpath(
|
||||
'//input[@name="email"]').send_keys(self.login_user['uid'])
|
||||
driver.find_element_by_xpath(
|
||||
'//input[@name="pass"]').send_keys(self.login_user['pwd'])
|
||||
driver.find_element_by_xpath('//button[@name="login"]').click()
|
||||
time.sleep(10)
|
||||
logger.info(f"Logged in to {self.name} as {self.login_user['uid']}")
|
||||
|
||||
"""
|
||||
Twitter 登录 获取cookie
|
||||
"""
|
||||
def twitter_login(self, driver):
|
||||
# 打开登录页
|
||||
driver.maximize_window() # 注意:原代码中有空格!
|
||||
time.sleep(3)
|
||||
driver.get(self.url)
|
||||
|
||||
# 隐藏指纹
|
||||
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
delete navigator.__proto__.webdriver;
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||
'''
|
||||
})
|
||||
wait = WebDriverWait(driver, 15)
|
||||
# 2. 通过 JS 打开第二个标签页(新 Tab)
|
||||
time.sleep(5)
|
||||
driver.execute_script("window.open('');")
|
||||
driver.execute_script("window.open('https://x.com/i/flow/login', '_blank');")
|
||||
|
||||
# 3. 获取所有标签页句柄
|
||||
handles = driver.window_handles # [handle1, handle2]
|
||||
|
||||
# 4. 切换到第二个标签页(可选)
|
||||
driver.switch_to.window(handles[1])
|
||||
|
||||
logger.info(f"login as user {self.login_user['uid']}")
|
||||
# time.sleep(random.uniform(1.5, 3.0))
|
||||
# driver.find_element_by_xpath("//input").send_keys(login_user['uid'])
|
||||
# 等待并定位用户名输入框
|
||||
username_input = wait.until(
|
||||
EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[autocomplete="username"]'))
|
||||
)
|
||||
|
||||
# 模拟真人逐字输入(带随机延迟)
|
||||
username = self.login_user['uid']
|
||||
for char in username:
|
||||
username_input.send_keys(char)
|
||||
time.sleep(random.uniform(0.05, 0.2)) # 每个字符间隔 50~200ms
|
||||
|
||||
time.sleep(random.uniform(0.3, 0.8)) # 输入完后稍作停顿
|
||||
|
||||
# 尝试点击 "Next" 按钮(主逻辑:带文本的按钮)
|
||||
try:
|
||||
next_button = wait.until(
|
||||
EC.element_to_be_clickable(
|
||||
(By.XPATH, "//button[.//span[contains(text(), 'Next') or contains(text(), '下一步')]]")
|
||||
)
|
||||
)
|
||||
body = driver.find_element(By.TAG_NAME, "body")
|
||||
ActionChains(driver).move_to_element_with_offset(body, 100, 100).perform()
|
||||
time.sleep(0.5)
|
||||
# 模拟鼠标移动到按钮并点击
|
||||
actions = ActionChains(driver)
|
||||
actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
|
||||
|
||||
except Exception as e:
|
||||
logger.info("主 Next 按钮未找到,尝试备用定位方式")
|
||||
try:
|
||||
# 备用:通过 role 定位第二个 button
|
||||
next_button = driver.find_element(By.XPATH, "//button[@role='button'][2]")
|
||||
actions = ActionChains(driver)
|
||||
actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
|
||||
except Exception as e2:
|
||||
logger.error(f"两种方式均无法点击 Next 按钮: {e}, {e2}")
|
||||
raise
|
||||
time.sleep(random.uniform(1.5, 5.0)) # 等待页面加载
|
||||
try:
|
||||
logger.info("输入手机号验证...")
|
||||
driver.find_element_by_xpath("//input[@name='text']").send_keys("+8619962025641")
|
||||
# driver.find_element_by_xpath("//button[@data-testid='ocfEnterTextNextButton']").click()
|
||||
driver.find_element_by_xpath(driver.find_element_by_xpath("//button[.//span[text()='下一步']]")).click()
|
||||
time.sleep(random.uniform(1.5, 5.0)) # 等待页面加载
|
||||
except Exception:
|
||||
logger.info("无需输入手机号验证")
|
||||
driver.find_element_by_xpath("//input[@name='password']").send_keys(self.login_user['pwd'])
|
||||
driver.find_element_by_xpath("//button[@data-testid='LoginForm_Login_Button']").click()
|
||||
time.sleep(random.uniform(1.5, 5.0)) # 等待页面加载
|
||||
try:
|
||||
driver.find_element_by_xpath("//button[@data-testid='confirmationSheetConfirm']").click()
|
||||
time.sleep(random.uniform(1.5, 10.0)) # 等待页面加载
|
||||
except:
|
||||
time.sleep(5)
|
||||
logger.info(f"Logged in to {self.name} as {self.login_user['uid']}")
|
||||
|
||||
def wechat_links_login(self, driver):
|
||||
driver.maximize_window()
|
||||
driver.get(self.url)
|
||||
print("等待打开登录后的页面...")
|
||||
while True:
|
||||
delay = random.randint(5, 11)
|
||||
time.sleep(delay)
|
||||
if 'token=' in driver.current_url:
|
||||
print("登录成功!")
|
||||
logger.info(f"Logged in to {self.name}")
|
||||
break
|
||||
|
||||
@ -1,10 +1,67 @@
|
||||
import datetime
|
||||
import time
|
||||
|
||||
from datetime import datetime, timezone, timedelta
|
||||
import re
|
||||
|
||||
def get_current_timestamp():
|
||||
return int(time.time() * 1000)
|
||||
|
||||
def str_to_timestamp(dt_str: str, tz_offset: int = 8) -> int:
|
||||
"""
|
||||
将时间字符串转为 Unix 时间戳(秒)
|
||||
|
||||
支持格式:
|
||||
- 'YYYY-MM-DD HH:MM'
|
||||
- 'YYYY-MM-DD HH:MM:SS'
|
||||
- 以及包含额外文本的混合字符串(如:"2026-02-27 20:11·头条新锐创作者")
|
||||
|
||||
Args:
|
||||
dt_str: 时间字符串(会自动提取其中的时间部分)
|
||||
tz_offset: 时区偏移(小时),中国用 8
|
||||
|
||||
Returns:
|
||||
整数时间戳
|
||||
|
||||
Raises:
|
||||
ValueError: 无法提取有效时间格式时抛出
|
||||
"""
|
||||
# 去除首尾空格
|
||||
dt_str = dt_str.strip()
|
||||
|
||||
# 使用正则表达式提取时间部分(匹配 YYYY-MM-DD HH:MM 或 YYYY-MM-DD HH:MM:SS)
|
||||
time_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}(?::\d{2})?)'
|
||||
match = re.search(time_pattern, dt_str)
|
||||
|
||||
if not match:
|
||||
raise ValueError(f"无法从字符串中提取有效时间格式: {dt_str}")
|
||||
|
||||
# 获取匹配到的时间字符串
|
||||
time_str = match.group(1)
|
||||
|
||||
# 根据格式解析
|
||||
try:
|
||||
if len(time_str) == 16: # 'YYYY-MM-DD HH:MM'
|
||||
dt = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
|
||||
elif len(time_str) == 19: # 'YYYY-MM-DD HH:MM:SS'
|
||||
dt = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
|
||||
else:
|
||||
# 尝试自动解析
|
||||
for fmt in ['%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S']:
|
||||
try:
|
||||
dt = datetime.strptime(time_str, fmt)
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
else:
|
||||
raise ValueError(f"无法解析的时间格式: {time_str}")
|
||||
except ValueError as e:
|
||||
raise ValueError(f"时间格式解析失败: {time_str}") from e
|
||||
|
||||
# 创建时区
|
||||
tz = timezone(timedelta(hours=tz_offset))
|
||||
|
||||
# 返回时间戳
|
||||
return int(dt.replace(tzinfo=tz).timestamp())
|
||||
|
||||
|
||||
def get_time_stamp(date_str):
|
||||
try:
|
||||
|
||||
94
spiders/MediaSpiders/MediaSpiders/utils/traslate_utils.py
Normal file
94
spiders/MediaSpiders/MediaSpiders/utils/traslate_utils.py
Normal file
@ -0,0 +1,94 @@
|
||||
from MediaSpiders.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
|
||||
import requests
|
||||
import time
|
||||
from typing import List, Tuple, Optional
|
||||
from langdetect import detect, LangDetectException
|
||||
|
||||
def normalize_newlines(text: str) -> str:
|
||||
"""将 \r\n 和 \r 统一转换为 \n"""
|
||||
if not text:
|
||||
return text
|
||||
return text.replace('\r\n', '\n').replace('\r', '\n')
|
||||
|
||||
|
||||
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
|
||||
"""翻译单段文本,失败返回 None"""
|
||||
if not text or not text.strip():
|
||||
return ""
|
||||
|
||||
payload = {
|
||||
"text": text[:MAX_TEXT_LENGTH],
|
||||
"source_lang": source_lang,
|
||||
"target_lang": target_lang
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(TRANSLATE_API_URL, json=payload, timeout=10)
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
return result.get("translated_text")
|
||||
except Exception as e:
|
||||
print(f"⚠️ 翻译失败: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def translate_content_with_paragraphs(content: str) -> str:
|
||||
"""
|
||||
按段落翻译内容,支持容错:
|
||||
- 某段失败 → 跳过该段(保留空行或原文)
|
||||
- 返回拼接后的完整内容
|
||||
"""
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
# 标准化换行符
|
||||
content = normalize_newlines(content)
|
||||
paragraphs = content.split('\n')
|
||||
translated_paragraphs = []
|
||||
|
||||
for para in paragraphs:
|
||||
if not para.strip():
|
||||
# 保留空行
|
||||
translated_paragraphs.append("")
|
||||
continue
|
||||
|
||||
trans = translate_single(para)
|
||||
if trans is None:
|
||||
# 段落翻译失败:跳过该段(可选:保留原文或留空)
|
||||
print(f" ⚠️ 段落翻译失败,跳过: {para[:30]}...")
|
||||
translated_paragraphs.append("") # 或 append(para) 保留原文
|
||||
else:
|
||||
translated_paragraphs.append(trans)
|
||||
|
||||
time.sleep(REQUEST_DELAY)
|
||||
|
||||
return '\n'.join(translated_paragraphs)
|
||||
|
||||
|
||||
# ================== 数据库操作 ==================
|
||||
|
||||
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
|
||||
update_query = """
|
||||
UPDATE indeximos
|
||||
SET es_title = % s, es_content = % s
|
||||
WHERE es_sid = % s
|
||||
"""
|
||||
cursor.execute(update_query, (new_title, new_content, es_sid))
|
||||
|
||||
|
||||
def needs_translation(text: str) -> bool:
|
||||
"""
|
||||
判断文本是否需要翻译:
|
||||
- 如果检测到语言是 'zh'(中文),则不需要翻译,返回 False
|
||||
- 否则需要翻译,返回 True
|
||||
- 若无法检测(如空文本、纯符号等)
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return False # 空文本无需翻译
|
||||
|
||||
try:
|
||||
lang = detect(text.strip())
|
||||
return lang != 'zh-cn'
|
||||
except LangDetectException:
|
||||
# 无法检测语言(如全是数字、标点等),保守起见视为需要翻译
|
||||
return True
|
||||
@ -2,147 +2,322 @@ import json
|
||||
import random
|
||||
import time
|
||||
from math import ceil
|
||||
import logging as logger
|
||||
|
||||
from selenium.webdriver.common.by import By
|
||||
import redis
|
||||
import requests
|
||||
from msedge.selenium_tools import Edge
|
||||
from msedge.selenium_tools import EdgeOptions
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
|
||||
WECHAT_USER_TYPE
|
||||
from MediaSpiders.utils.http_utils import http_post, UA
|
||||
|
||||
edge_options = EdgeOptions()
|
||||
edge_options.use_chromium = True
|
||||
driver = Edge(executable_path='msedgedriver.exe', options=edge_options)
|
||||
chrome_options = Options()
|
||||
# 指定 chrome.exe 的完整路径
|
||||
chrome_options.binary_location = "D:/chrome-win64/chrome.exe"
|
||||
# chrome_options.use_chromium = True
|
||||
driver = webdriver.Chrome(
|
||||
executable_path=r"D:\chromedriver.exe",
|
||||
options=chrome_options
|
||||
)
|
||||
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||
"source": """
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
})
|
||||
"""
|
||||
"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
|
||||
})
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD)
|
||||
|
||||
if __name__ == "__main__":
|
||||
count_per_account = 200
|
||||
total_count = 0
|
||||
driver.maximize_window()
|
||||
query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
|
||||
post_body = {
|
||||
'userType': WECHAT_USER_TYPE,
|
||||
'userFlag': 0
|
||||
}
|
||||
account_rsp = json.loads(
|
||||
http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
|
||||
official_accounts = []
|
||||
if account_rsp['code'] == 200:
|
||||
official_accounts = account_rsp['content']
|
||||
driver.get('https://mp.weixin.qq.com/')
|
||||
print("等待打开登录后的页面...")
|
||||
while True:
|
||||
delay = random.randint(5, 11)
|
||||
time.sleep(delay)
|
||||
if 'token=' in driver.current_url:
|
||||
print("登录成功!")
|
||||
break
|
||||
break_flag = False
|
||||
token_index = driver.current_url.rfind('token=')
|
||||
token = driver.current_url[token_index + 6:]
|
||||
print(f'获取 token 成功!当前 token 为 {token}')
|
||||
raw_cookies = driver.get_cookies()
|
||||
cookies = {}
|
||||
for c in raw_cookies:
|
||||
cookies[c['name']] = c['value']
|
||||
print(f'获取 cookie 成功!')
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
|
||||
'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
|
||||
f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
|
||||
}
|
||||
for account_line in official_accounts:
|
||||
|
||||
def parse_cookie_string(cookie_str):
|
||||
"""解析 cookie 字符串为 dict"""
|
||||
cookie_dict = {}
|
||||
for item in cookie_str.split(';'):
|
||||
if '=' in item:
|
||||
name, value = item.split('=', 1)
|
||||
cookie_dict[name.strip()] = value.strip()
|
||||
return cookie_dict
|
||||
|
||||
|
||||
def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
|
||||
"""
|
||||
智能添加 cookie:先试目标域名,失败则试父域,再失败则跳过
|
||||
"""
|
||||
# 微信核心 cookie 必须用 mp.weixin.qq.com
|
||||
wechat_critical = ['wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token']
|
||||
|
||||
# 腾讯通用 cookie 可尝试父域
|
||||
tencent_common = ['ptui_loginuin', 'RK', 'ptcz', 'ua_id']
|
||||
|
||||
# 策略 1: 核心 cookie → 精确域名
|
||||
if name in wechat_critical:
|
||||
domains_to_try = [target_domain]
|
||||
# 策略 2: 腾讯通用 cookie → 先试目标域,再试父域
|
||||
elif name in tencent_common:
|
||||
domains_to_try = [target_domain, '.weixin.qq.com', '.qq.com']
|
||||
# 策略 3: 其他 cookie → 默认 host-only(不传 domain)
|
||||
else:
|
||||
domains_to_try = [None, target_domain]
|
||||
|
||||
for domain in domains_to_try:
|
||||
cookie = {
|
||||
'name': name,
|
||||
'value': value,
|
||||
'path': '/',
|
||||
'secure': True
|
||||
}
|
||||
if domain:
|
||||
cookie['domain'] = domain
|
||||
|
||||
try:
|
||||
if break_flag:
|
||||
break
|
||||
start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000)
|
||||
if 'updateTime' in account_line:
|
||||
start_timestamp = account_line['updateTime']
|
||||
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000))
|
||||
account = account_line['userName']
|
||||
search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \
|
||||
f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1'
|
||||
print(f"开始搜索公众号“{account}”...")
|
||||
time.sleep(3 + random.random())
|
||||
response = requests.get(search_account_api, cookies=cookies, headers=headers)
|
||||
rsp_body = json.loads(response.text)
|
||||
index_end = ceil(count_per_account / 5)
|
||||
if 'list' in rsp_body:
|
||||
matched_account = {}
|
||||
matched_account_flag = False
|
||||
for item in rsp_body['list']:
|
||||
if item['nickname'] == account:
|
||||
matched_account_flag = True
|
||||
matched_account = item
|
||||
break
|
||||
if not matched_account_flag:
|
||||
print(f"未找到公众号“{account}”")
|
||||
continue
|
||||
fake_id = matched_account['fakeid']
|
||||
update_time_flag = True # 用于记录获取到的历史列表是否已经超出最早的时间限制
|
||||
next_start_timestamp = int(time.time() * 1000)
|
||||
for index in range(index_end):
|
||||
if update_time_flag:
|
||||
if next_start_timestamp - start_timestamp < 12 * 3600 * 1000:
|
||||
print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接,本次获取结束")
|
||||
break_flag = True
|
||||
else:
|
||||
fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
|
||||
f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
|
||||
f'&lang=zh_CN&f=json&ajax=1'
|
||||
print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
|
||||
time.sleep(3 + random.random())
|
||||
article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
|
||||
article_rsp_body = json.loads(article_response.text)
|
||||
if 'app_msg_list' in article_rsp_body:
|
||||
for article in article_rsp_body['app_msg_list']:
|
||||
title = article['title']
|
||||
link = article['link']
|
||||
update_time = article['update_time'] * 1000
|
||||
if update_time > start_timestamp:
|
||||
total_count += 1
|
||||
time_str = time.strftime("%Y-%m-%d %H:%M:%S",
|
||||
time.localtime(update_time / 1000))
|
||||
print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
|
||||
f"发表的文章《{title}》,链接地址:{link}")
|
||||
redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}", link)
|
||||
else:
|
||||
update_time_flag = False
|
||||
break
|
||||
else:
|
||||
print(json.dumps(article_rsp_body, ensure_ascii=False))
|
||||
if 'base_resp' in article_rsp_body:
|
||||
if article_rsp_body['base_resp']['err_msg'] == "freq control":
|
||||
print("接口频率限制,稍后再试,本次获取结束")
|
||||
break_flag = True
|
||||
break
|
||||
if not break_flag:
|
||||
# 本循环内,只有12小时内扫过码以及接口频率限制退出,会导致 break_flag 为 True,这两种情况都不需要更新扫码状态
|
||||
next_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(next_start_timestamp / 1000))
|
||||
account_line['updateTime'] = next_start_timestamp
|
||||
http_post(SOCIAL_USER_UPDATE_API,
|
||||
data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
|
||||
headers={'User-Agent': UA, "Content-Type": "application/json"}
|
||||
)
|
||||
print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
|
||||
else:
|
||||
print(json.dumps(rsp_body, ensure_ascii=False))
|
||||
if 'base_resp' in rsp_body:
|
||||
if rsp_body['base_resp']['err_msg'] == "freq control":
|
||||
print("接口频率限制,稍后再试,本次获取结束")
|
||||
break_flag = True
|
||||
break
|
||||
driver.add_cookie(cookie)
|
||||
# logger.debug(f"✓ {name} added with domain={domain or 'host-only'}")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(repr(e))
|
||||
redis_client.close()
|
||||
driver.quit()
|
||||
if 'invalid cookie domain' in str(e):
|
||||
continue # 尝试下一个 domain
|
||||
else:
|
||||
# logger.warning(f"✗ {name} failed: {e}")
|
||||
return False
|
||||
return False # 所有 domain 都失败
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cookie_list = redis_client.lrange("MediaSpiders:WeChatLinksFetcher_Cookies", 0, -1)
|
||||
cookie_parts = [
|
||||
item.decode('utf-8') if isinstance(item, bytes) else str(item)
|
||||
for item in cookie_list
|
||||
]
|
||||
|
||||
# 标记是否需要手动登录
|
||||
need_manual_login = True
|
||||
current_cookie = None
|
||||
|
||||
if not cookie_parts:
|
||||
logger.warning("Redis 中没有可用的 cookie,需要手动登录")
|
||||
need_manual_login = True
|
||||
else:
|
||||
# 尝试使用 Redis 中的 cookie 登录
|
||||
for item in cookie_parts:
|
||||
current_cookie = item
|
||||
try:
|
||||
driver.delete_all_cookies()
|
||||
driver.get('https://mp.weixin.qq.com/')
|
||||
time.sleep(2)
|
||||
|
||||
cookie_string = item
|
||||
cookie_dict = parse_cookie_string(cookie_string)
|
||||
|
||||
success_count = 0
|
||||
for name, value in cookie_dict.items():
|
||||
if add_cookie_smart(driver, name, value):
|
||||
success_count += 1
|
||||
else:
|
||||
logger.warning(f"跳过 cookie: {name}")
|
||||
|
||||
logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie")
|
||||
|
||||
# 验证 cookie 是否有效
|
||||
driver.refresh()
|
||||
time.sleep(5)
|
||||
|
||||
# 检查是否登录成功 - 通过检查 URL 中是否包含 token 或页面元素
|
||||
current_url = driver.current_url
|
||||
if 'token=' in current_url:
|
||||
logger.info("使用 Redis 中的 cookie 登录成功")
|
||||
need_manual_login = False
|
||||
|
||||
else:
|
||||
# 二次验证:检查页面上是否有登录状态相关的元素
|
||||
try:
|
||||
# 检查是否有用户头像或用户名元素
|
||||
driver.find_element(By.CSS_SELECTOR,
|
||||
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname")
|
||||
logger.info("通过页面元素验证,登录成功")
|
||||
need_manual_login = False
|
||||
|
||||
except:
|
||||
logger.warning("Cookie 登录失败,尝试下一个 cookie 或手动登录")
|
||||
except Exception as e:
|
||||
logger.error(f"使用 cookie 登录时出错: {str(e)}")
|
||||
continue
|
||||
|
||||
# 如果自动登录失败,进行手动登录
|
||||
if need_manual_login:
|
||||
logger.info("所有 cookie 均无效,启动手动登录流程")
|
||||
try:
|
||||
driver.delete_all_cookies()
|
||||
driver.get('https://mp.weixin.qq.com/')
|
||||
time.sleep(2)
|
||||
|
||||
# 等待用户手动登录
|
||||
logger.info("请在浏览器中手动完成登录(扫描二维码)")
|
||||
logger.info("登录成功后,程序将自动继续执行")
|
||||
|
||||
# 设置最长等待时间(例如 120 秒)
|
||||
max_wait_time = 120
|
||||
start_time = time.time()
|
||||
logged_in = False
|
||||
|
||||
while time.time() - start_time < max_wait_time:
|
||||
current_url = driver.current_url
|
||||
if 'token=' in current_url:
|
||||
logged_in = True
|
||||
logger.info("手动登录成功!")
|
||||
break
|
||||
|
||||
# 检查页面元素
|
||||
try:
|
||||
driver.find_element(By.CSS_SELECTOR,
|
||||
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname")
|
||||
logged_in = True
|
||||
logger.info("通过页面元素确认手动登录成功!")
|
||||
break
|
||||
except:
|
||||
time.sleep(2)
|
||||
|
||||
if not logged_in:
|
||||
logger.error(f"等待 {max_wait_time} 秒后仍未登录成功,程序终止")
|
||||
raise Exception("手动登录超时")
|
||||
|
||||
# 获取新的 cookie
|
||||
raw_cookies = driver.get_cookies()
|
||||
new_cookie_dict = {}
|
||||
for c in raw_cookies:
|
||||
new_cookie_dict[c['name']] = c['value']
|
||||
|
||||
# 将字典转换为字符串格式
|
||||
new_cookie_string = "; ".join([f"{k}={v}" for k, v in new_cookie_dict.items()])
|
||||
|
||||
# 更新 Redis 中的 cookie
|
||||
logger.info("更新 Redis 中的 cookie")
|
||||
|
||||
# 删除旧的 cookie
|
||||
redis_client.delete("MediaSpiders:WeChatLinksFetcher_Cookies")
|
||||
|
||||
# 添加新的 cookie
|
||||
redis_client.lpush("MediaSpiders:WeChatLinksFetcher_Cookies", new_cookie_string)
|
||||
|
||||
current_cookie = new_cookie_string
|
||||
logger.info("Redis cookie 更新成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"手动登录过程出错: {str(e)}")
|
||||
raise
|
||||
|
||||
count_per_account = 200
|
||||
total_count = 0
|
||||
break_flag = False
|
||||
|
||||
token_index = driver.current_url.rfind('token=')
|
||||
token = driver.current_url[token_index + 6:]
|
||||
print(f'获取 token 成功!当前 token 为 {token}')
|
||||
raw_cookies = driver.get_cookies()
|
||||
cookies = {}
|
||||
for c in raw_cookies:
|
||||
cookies[c['name']] = c['value']
|
||||
print(f'获取 cookie 成功!')
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
|
||||
'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
|
||||
f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
|
||||
}
|
||||
query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
|
||||
post_body = {
|
||||
'userType': WECHAT_USER_TYPE,
|
||||
'userFlag': 0
|
||||
}
|
||||
account_rsp = json.loads(
|
||||
http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
|
||||
official_accounts = []
|
||||
if account_rsp['code'] == 200:
|
||||
official_accounts = account_rsp['content']
|
||||
for account_line in official_accounts:
|
||||
try:
|
||||
if break_flag:
|
||||
break
|
||||
start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000)
|
||||
if 'updateTime' in account_line:
|
||||
start_timestamp = account_line['updateTime']
|
||||
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000))
|
||||
account = account_line['userName']
|
||||
search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \
|
||||
f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1'
|
||||
print(f"开始搜索公众号“{account}”...")
|
||||
time.sleep(3 + random.random())
|
||||
response = requests.get(search_account_api, cookies=cookies, headers=headers)
|
||||
rsp_body = json.loads(response.text)
|
||||
index_end = ceil(count_per_account / 5)
|
||||
if 'list' in rsp_body:
|
||||
matched_account = {}
|
||||
matched_account_flag = False
|
||||
for item in rsp_body['list']:
|
||||
if item['nickname'] == account:
|
||||
matched_account_flag = True
|
||||
matched_account = item
|
||||
break
|
||||
if not matched_account_flag:
|
||||
print(f"未找到公众号“{account}”")
|
||||
continue
|
||||
fake_id = matched_account['fakeid']
|
||||
update_time_flag = True # 用于记录获取到的历史列表是否已经超出最早的时间限制
|
||||
next_start_timestamp = int(time.time() * 1000)
|
||||
for index in range(index_end):
|
||||
if update_time_flag:
|
||||
if next_start_timestamp - start_timestamp < 12 * 3600 * 1000:
|
||||
print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接,本次获取结束")
|
||||
break_flag = True
|
||||
else:
|
||||
fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
|
||||
f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
|
||||
f'&lang=zh_CN&f=json&ajax=1'
|
||||
print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
|
||||
time.sleep(3 + random.random())
|
||||
article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
|
||||
article_rsp_body = json.loads(article_response.text)
|
||||
if 'app_msg_list' in article_rsp_body:
|
||||
for article in article_rsp_body['app_msg_list']:
|
||||
title = article['title']
|
||||
link = article['link']
|
||||
update_time = article['update_time'] * 1000
|
||||
if update_time > start_timestamp:
|
||||
total_count += 1
|
||||
time_str = time.strftime("%Y-%m-%d %H:%M:%S",
|
||||
time.localtime(update_time / 1000))
|
||||
print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
|
||||
f"发表的文章《{title}》,链接地址:{link}")
|
||||
redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}",
|
||||
link)
|
||||
else:
|
||||
update_time_flag = False
|
||||
break
|
||||
else:
|
||||
print(json.dumps(article_rsp_body, ensure_ascii=False))
|
||||
if 'base_resp' in article_rsp_body:
|
||||
err_msg = article_rsp_body['base_resp']['err_msg']
|
||||
if err_msg == "freq control" or err_msg == "invalid session":
|
||||
print("接口频率限制,稍后再试,本次获取结束")
|
||||
break_flag = True
|
||||
break
|
||||
|
||||
if not break_flag:
|
||||
# 本循环内,只有12小时内扫过码以及接口频率限制退出,会导致 break_flag 为 True,这两种情况都不需要更新扫码状态
|
||||
next_start_time = time.strftime("%Y-%m-%d %H:%M:%S",
|
||||
time.localtime(next_start_timestamp / 1000))
|
||||
account_line['updateTime'] = next_start_timestamp
|
||||
http_post(SOCIAL_USER_UPDATE_API,
|
||||
data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
|
||||
headers={'User-Agent': UA, "Content-Type": "application/json"}
|
||||
)
|
||||
print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
|
||||
else:
|
||||
print(json.dumps(rsp_body, ensure_ascii=False))
|
||||
if 'base_resp' in rsp_body:
|
||||
if rsp_body['base_resp']['err_msg'] == "freq control":
|
||||
print("接口频率限制,稍后再试,本次获取结束")
|
||||
break_flag = True
|
||||
break
|
||||
except Exception as e:
|
||||
print(repr(e))
|
||||
redis_client.close()
|
||||
driver.quit()
|
||||
|
||||
@ -0,0 +1,353 @@
|
||||
import time
|
||||
import logging as logger
|
||||
|
||||
from selenium.webdriver.common.by import By
|
||||
import redis
|
||||
import requests
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD
|
||||
|
||||
# Chrome setup for the cookie-capture workflow (Windows-specific paths).
chrome_options = Options()
# Full path to the chrome.exe binary; adjust per machine.
chrome_options.binary_location = r"D:\chrome-win64\chrome.exe"
# chrome_options.use_chromium = True
# NOTE(review): the executable_path keyword was removed in Selenium 4, so this
# call style implies Selenium 3.x — confirm the pinned selenium version.
driver = webdriver.Chrome(
    executable_path=r"D:\chromedriver-win64\chromedriver.exe",
    options=chrome_options
)
# Inject a script before any page code runs to hide navigator.webdriver,
# reducing automation detection.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})

# Redis connection shared by all helpers below (decoded str responses).
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD, decode_responses=True)
# Redis list key holding the captured cookie strings.
COOKIE_KEY = "MediaSpiders:WeChatLinksFetcher_Cookies"
|
||||
|
||||
|
||||
def parse_cookie_string(cookie_str):
    """Parse a raw ``name=value; name=value`` cookie string into a dict.

    Segments without an ``=`` are ignored; names and values are stripped
    of surrounding whitespace. On duplicate names the last one wins.
    """
    pairs = (segment.split('=', 1)
             for segment in cookie_str.split(';')
             if '=' in segment)
    return {name.strip(): value.strip() for name, value in pairs}
|
||||
|
||||
|
||||
def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
    """
    Add a cookie to the browser, trying candidate domains in order.

    WeChat core session cookies must be scoped to mp.weixin.qq.com,
    Tencent-wide cookies may fall back to parent domains, and anything
    else is tried host-only first (no explicit domain key).

    Returns True on the first successful add; False when every candidate
    domain is rejected or a non-domain error occurs.
    """
    # Core WeChat session cookies: exact domain only.
    wechat_critical = ['wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token']
    # Tencent-wide cookies: parent domains are acceptable fallbacks.
    tencent_common = ['ptui_loginuin', 'RK', 'ptcz', 'ua_id']

    if name in wechat_critical:
        candidates = [target_domain]
    elif name in tencent_common:
        candidates = [target_domain, '.weixin.qq.com', '.qq.com']
    else:
        # Host-only first (omit the 'domain' key), then the explicit target.
        candidates = [None, target_domain]

    for candidate in candidates:
        payload = {
            'name': name,
            'value': value,
            'path': '/',
            'secure': True
        }
        if candidate:
            payload['domain'] = candidate
        try:
            driver.add_cookie(payload)
            return True
        except Exception as exc:
            if 'invalid cookie domain' not in str(exc):
                # Any failure other than a rejected domain is final.
                return False
            # Domain rejected: fall through to the next candidate.
    return False  # every candidate domain was rejected
|
||||
|
||||
|
||||
def is_cookie_exists(cookie_str):
    """
    Check whether an equivalent cookie is already stored in Redis.

    Two cookies are treated as the same session when at least two of the
    key identity fields match, or when slave_sid (the most unique marker)
    is identical.

    Returns:
        (exists, index): index is the position in the Redis list, -1 if
        not found or on error.
    """
    key_fields = ('wxuin', 'slave_sid', 'slave_user', 'bizuin')
    try:
        candidate = parse_cookie_string(cookie_str)
        stored_cookies = redis_client.lrange(COOKIE_KEY, 0, -1)

        for position, stored_str in enumerate(stored_cookies):
            try:
                stored = parse_cookie_string(stored_str)

                # Count key identity fields present in both and equal.
                shared = [field for field in key_fields
                          if field in candidate and field in stored
                          and candidate[field] == stored[field]]
                if len(shared) >= 2:
                    return True, position

                # slave_sid alone is distinctive enough to call it a match.
                if ('slave_sid' in candidate and 'slave_sid' in stored
                        and candidate['slave_sid'] == stored['slave_sid']):
                    return True, position
            except Exception as e:
                # A malformed stored entry must not abort the scan.
                logger.warning(f"解析现有cookie时出错: {e}")

        return False, -1
    except Exception as e:
        logger.error(f"判断cookie是否存在时出错: {e}")
        return False, -1
|
||||
|
||||
|
||||
def save_cookie_to_redis(cookie_str, force_save=False):
    """
    Persist a cookie string to the Redis cookie list with de-duplication.

    Args:
        cookie_str: raw cookie string to store.
        force_save: when True, overwrite the stored duplicate in place.

    Returns:
        bool: True when the cookie was written (added or updated).
    """
    try:
        exists, idx = is_cookie_exists(cookie_str)

        if not exists:
            # Brand-new session: append it.
            redis_client.rpush(COOKIE_KEY, cookie_str)
            logger.info(f"已添加新cookie,当前总数: {redis_client.llen(COOKIE_KEY)}")
            return True

        if force_save:
            # Replace the stored duplicate with the fresh value.
            redis_client.lset(COOKIE_KEY, idx, cookie_str)
            logger.info(f"已更新现有cookie (索引: {idx})")
            return True

        logger.info(f"Cookie已存在 (索引: {idx}),跳过保存")
        return False
    except Exception as e:
        logger.error(f"保存cookie到Redis失败: {e}")
        return False
|
||||
|
||||
def cookie_dict_to_string(cookie_dict):
    """Serialize a {name: value} mapping to a '; '-joined cookie string."""
    parts = []
    for name, value in cookie_dict.items():
        parts.append(f"{name}={value}")
    return '; '.join(parts)
|
||||
|
||||
def manual_login_and_get_cookie():
    """
    Drive a manual QR-code login on mp.weixin.qq.com and collect cookies.

    Opens the WeChat MP home page in the shared Selenium driver, waits up
    to 120 s for the operator to scan the QR code (success is detected by
    ``token=`` appearing in the URL), then reads the browser cookies.

    Returns:
        dict with keys 'cookie_dict', 'cookie_string', 'token' and
        'raw_cookies', or None when login or cookie collection fails.
    """
    from urllib.parse import urlparse, parse_qs

    logger.info("开始手动扫码登录流程...")

    try:
        driver.get("https://mp.weixin.qq.com/")
        time.sleep(3)

        # Wait for the page skeleton before inspecting the URL.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Already logged in when the URL carries a token.
        if "token=" in driver.current_url:
            logger.info("检测到已登录状态,直接获取cookie")
        else:
            logger.info("请手动扫描二维码登录...")
            logger.info("等待登录完成...")
            try:
                # Wait for the URL to carry a token (operator scans the QR).
                WebDriverWait(driver, 120).until(
                    lambda d: "token=" in d.current_url
                )
                logger.info("检测到登录成功!")
                time.sleep(3)
            except Exception:
                logger.error("等待登录超时")
                return None

        cookies = driver.get_cookies()
        if not cookies:
            logger.error("未获取到cookies")
            return None

        cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
        cookie_string = cookie_dict_to_string(cookie_dict)

        # BUGFIX: the previous slice-based extraction (everything after the
        # last 'token=') kept any trailing query parameters in the token.
        # Parse the URL's query string properly instead.
        token = None
        if "token=" in driver.current_url:
            parsed = urlparse(driver.current_url)
            token_values = (parse_qs(parsed.query).get('token')
                            or parse_qs(parsed.fragment).get('token'))
            if token_values:
                token = token_values[0]
            else:
                # Fallback: previous behaviour for non-query token placement.
                token = driver.current_url[driver.current_url.rfind('token=') + 6:]
            logger.info(f"获取到token: {token}")

        logger.info(f"获取到 {len(cookie_dict)} 个cookie")

        return {
            'cookie_dict': cookie_dict,
            'cookie_string': cookie_string,
            'token': token,
            'raw_cookies': cookies
        }

    except Exception as e:
        logger.error(f"手动登录失败: {e}")
        return None
|
||||
|
||||
|
||||
def verify_cookie_valid(cookie_dict, token=None):
    """
    Probe a login-protected WeChat MP endpoint to check whether the
    cookies still carry a valid session.

    Args:
        cookie_dict: cookie name → value mapping.
        token: login token; currently unused by the probe, kept for API
            compatibility.

    Returns:
        True when the endpoint answers like an authenticated session,
        False when it clearly rejects the cookies. A 200 JSON body with
        no 'base_resp' falls through and returns None (falsy), matching
        the original behaviour.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://mp.weixin.qq.com/',
        }

        # Endpoint that requires a live session.
        test_api = 'https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN'

        response = requests.get(test_api, cookies=cookie_dict, headers=headers, timeout=10)

        if response.status_code != 200:
            logger.warning(f"cookie验证请求失败: {response.status_code}")
            return False

        # FIX: was a bare `except:` (also swallows KeyboardInterrupt etc.);
        # narrowed to Exception with the same practical behaviour.
        try:
            data = response.json()
            if 'base_resp' in data:
                err_msg = data['base_resp'].get('err_msg', '')
                if err_msg in ['ok', '']:
                    logger.info("cookie验证有效")
                    return True
                logger.warning(f"cookie验证返回错误: {err_msg}")
                return False
        except Exception:
            # Non-JSON (or unexpected shape): cannot prove invalidity.
            logger.info("cookie可能有效")
            return True

    except Exception as e:
        logger.error(f"验证cookie时出错: {e}")
        return False
|
||||
|
||||
|
||||
def main(type):
    """
    End-to-end cookie capture: manual login, validity probe, and
    de-duplicated persistence to Redis.

    Args:
        type: 'y' to overwrite an already-stored duplicate cookie,
              anything else to keep the stored copy.
    """
    logger.info("微信公众号Cookie获取工具")

    try:
        # Step 1: interactive QR login.
        result = manual_login_and_get_cookie()
        if not result:
            logger.error("获取cookie失败")
            return

        cookie_string = result['cookie_string']
        cookie_dict = result['cookie_dict']
        token = result['token']

        # Step 2: best-effort validity probe (the cookie is saved regardless).
        logger.info("正在验证cookie有效性...")
        if not verify_cookie_valid(cookie_dict, token):
            logger.warning("cookie可能无效,但仍会保存")

        # Step 3: de-duplicated persistence.
        logger.info("正在检查cookie是否已存在...")
        exists, idx = is_cookie_exists(cookie_string)

        if exists:
            logger.info(f"Cookie已存在 (索引: {idx})")
            if type == 'y':
                if save_cookie_to_redis(cookie_string, force_save=True):
                    logger.info("已覆盖更新cookie")
            else:
                logger.info("取消保存")
        else:
            if save_cookie_to_redis(cookie_string):
                logger.info("新cookie保存成功")

        # Step 4: report the stored cookie count.
        total_cookies = redis_client.llen(COOKIE_KEY)
        logger.info(f"当前Redis中cookie总数: {total_cookies}")

    except KeyboardInterrupt:
        logger.info("用户中断程序")
    except Exception as e:
        logger.error(f"程序执行出错: {e}")
    finally:
        driver.quit()
        logger.info("程序结束")
|
||||
|
||||
if __name__ == "__main__":
    # Overwrite any cookie already stored in Redis for the same session?
    # 'y': overwrite, 'n': keep the stored copy.
    # NOTE(review): 'type' shadows the builtin of the same name.
    type = 'y'
    # Run the capture workflow.
    main(type)
|
||||
|
||||
|
||||
@ -4,7 +4,20 @@ import sys
|
||||
|
||||
from scrapy.cmdline import execute
|
||||
|
||||
"""
|
||||
命令行启动:
|
||||
1、 Win CMD
|
||||
D:\dev\code\PythonCode\osc\spiders\MediaSpiders\.venv\Scripts\activate.bat
|
||||
scrapy crawl FacebookUserSpider -a params="{}"
|
||||
|
||||
2、Windows PowerShell
|
||||
D:\dev\code\PythonCode\osc\spiders\MediaSpiders\.venv\Scripts\Activate.ps1
|
||||
scrapy crawl FacebookUserSpider -a params="{}"
|
||||
"""
|
||||
|
||||
# Make the project root importable before invoking scrapy programmatically.
dirpath = os.path.dirname(os.path.abspath(__file__))

sys.path.append(dirpath)
# Equivalent to: scrapy crawl FacebookUserSpider -a params="{}"
execute(['scrapy', 'crawl', 'FacebookUserSpider', '-a', 'params={}'])
# execute(['scrapy', 'crawl', 'hot_search_spider', '-a', 'params={}'])
# NOTE(review): scrapy.cmdline.execute normally terminates the process when
# the crawl finishes, so the second execute() below is likely unreachable —
# confirm whether both spiders are ever expected to run.
execute(['scrapy', 'crawl', 'WechatLinksFetcherSpider', '-a', 'params={}'])
|
||||
@ -19,6 +19,16 @@ from ShipSpiders.utils.http_utils import http_post
|
||||
from ShipSpiders.utils.time_utils import to_unix_timestamp
|
||||
|
||||
|
||||
def cookie_dict_to_str(dict_cookie):
    """Join Selenium-style cookies ([{'name': ..., 'value': ...}, ...])
    into a ``name=value;name=value`` header string.

    Returns an empty string for an empty cookie list.
    """
    # str.join is linear and avoids the manual trailing-';' trimming of the
    # previous string-concatenation loop.
    return ";".join(f"{c['name']}={c['value']}" for c in dict_cookie)
|
||||
|
||||
|
||||
class TrackpointsSpider(scrapy.Spider):
|
||||
name = 'shipxy_track'
|
||||
settings = get_project_settings()
|
||||
@ -71,21 +81,22 @@ class TrackpointsSpider(scrapy.Spider):
|
||||
self.driver.get('https://www.shipxy.com/Home/Login')
|
||||
time.sleep(2)
|
||||
logger.info('Logging in with user_id and password...')
|
||||
pwdbutton = self.driver.find_element_by_xpath("//a[text()='密码登录']")
|
||||
pwdbutton.click()
|
||||
pwd_button = self.driver.find_element_by_xpath("//a[text()='密码登录']")
|
||||
pwd_button.click()
|
||||
time.sleep(0.5)
|
||||
userName = self.driver.find_element_by_id('userName')
|
||||
userPWD = self.driver.find_element_by_id('userPWD')
|
||||
user_name = self.driver.find_element_by_id('userName')
|
||||
user_pwd = self.driver.find_element_by_id('userPWD')
|
||||
button = self.driver.find_element_by_id('loginBtn')
|
||||
userName.send_keys(self.settings['SHIPXY_LOGIN_ACCOUNT'])
|
||||
userPWD.send_keys(self.settings['SHIPXY_LOGIN_PASSWD'])
|
||||
user_name.send_keys(self.settings['SHIPXY_LOGIN_ACCOUNT'])
|
||||
user_pwd.send_keys(self.settings['SHIPXY_LOGIN_PASSWD'])
|
||||
button.click()
|
||||
time.sleep(5)
|
||||
self.driver.get('https://www.shipxy.com/IHS')
|
||||
logger.info('Logged in! Updating cookies...')
|
||||
self.driver.get('https://www.shipxy.com/')
|
||||
logger.info(f"Logged in as {self.settings['SHIPXY_LOGIN_ACCOUNT']}, Updating cookies...")
|
||||
except:
|
||||
pass
|
||||
self.dict_cookie = self.driver.get_cookies()
|
||||
logger.info(self.dict_cookie)
|
||||
|
||||
# logger.info('Getting normal ship trackpoints...')
|
||||
# slat, elat = -90 * 1000000, 90 * 1000000
|
||||
@ -128,41 +139,68 @@ class TrackpointsSpider(scrapy.Spider):
|
||||
mmsi = sensitive_target['targetValue']
|
||||
track_api = f'https://www.shipxy.com/Ship/GetTrackAll' \
|
||||
f'?shipid={mmsi}&btime={btime}&etime={etime}&limit=1&enc=0'
|
||||
get_md5_databody = {
|
||||
"shipid": mmsi,
|
||||
"btime": btime,
|
||||
"etime": etime,
|
||||
"limit": "1",
|
||||
"enc": "0"
|
||||
}
|
||||
md5_token = self.get_md5_token_from_webpage(get_md5_databody)
|
||||
cookie_str = cookie_dict_to_str(self.dict_cookie)
|
||||
yield scrapy.Request(url=track_api, callback=self.parse_sensitive_ship, cookies=self.dict_cookie,
|
||||
meta={'mmsi': mmsi})
|
||||
meta={'mmsi': mmsi},
|
||||
headers={
|
||||
# "Cookie": cookie_str,
|
||||
"S": md5_token['sign'],
|
||||
"T": md5_token['timestamp']
|
||||
})
|
||||
|
||||
def get_md5_token_from_webpage(self, data):
    # Ask the loaded shipxy page to sign the request parameters with its own
    # obfuscated in-page JS routine (window.R0VOQ1NJR04). ``data`` is
    # interpolated via its Python repr, which happens to be a valid JS object
    # literal for plain str/int values — TODO confirm for values containing
    # quotes. The JS result is expected to carry 'sign' and 'timestamp' keys
    # (used by the caller to build the S/T request headers).
    js_script = f'return window.R0VOQ1NJR04({data});'
    result = self.driver.execute_script(js_script)
    return result
|
||||
|
||||
def decode_track(self, data):
    # Decode the encrypted track payload in-page using shipxy's own
    # analyseAisTrack() helper and return its .data field.
    # NOTE(review): ``data`` is embedded in a double-quoted JS string, so it
    # must not itself contain '"' — confirm the upstream payload format.
    js_script = f'return analyseAisTrack("{data}").data;'
    result = self.driver.execute_script(js_script)
    return result
|
||||
|
||||
def parse_sensitive_ship(self, response):
|
||||
mmsi = response.meta['mmsi']
|
||||
rsp_obj = json.loads(response.text)
|
||||
if rsp_obj['status'] != 0:
|
||||
# logger.info('[SENSITIVE SHIP] No track data of sensitive ship MMSI: %s' % mmsi)
|
||||
logger.info('[SENSITIVE SHIP] No track data of sensitive ship MMSI: %s' % mmsi)
|
||||
logger.info(response.text)
|
||||
return
|
||||
data = rsp_obj['data']
|
||||
tracks = []
|
||||
track_decode_api = self.settings['TRACK_DECODE_SERVICE']
|
||||
retry_times = 1
|
||||
while retry_times <= 3:
|
||||
try:
|
||||
decode_data = requests.post(track_decode_api, data=data)
|
||||
if decode_data.content == b'500': # 解码器返回错误值为 b'500'
|
||||
logger.warning(
|
||||
"解析服务错误!重启服务中... 第 %d 次" % retry_times)
|
||||
time.sleep(3)
|
||||
retry_times += 1
|
||||
continue
|
||||
else:
|
||||
tracks = json.loads(decode_data.content)
|
||||
break
|
||||
except:
|
||||
logger.warning(
|
||||
"解析失败!3 秒后重试第 %d 次..." % retry_times)
|
||||
time.sleep(3)
|
||||
retry_times += 1
|
||||
|
||||
if retry_times > 3:
|
||||
logger.warning(
|
||||
"[SENSITIVE SHIP] 数据丢失 MMSI: %s" % mmsi)
|
||||
return
|
||||
# logger.info(f">>>>>>>>>>>>> data: {data}")
|
||||
# tracks = []
|
||||
# track_decode_api = self.settings['TRACK_DECODE_SERVICE']
|
||||
# retry_times = 1
|
||||
# while retry_times <= 3:
|
||||
# try:
|
||||
# decode_data = requests.post(track_decode_api, data=data)
|
||||
# if decode_data.content == b'500': # 解码器返回错误值为 b'500'
|
||||
# logger.warning(
|
||||
# "解析服务错误!重启服务中... 第 %d 次" % retry_times)
|
||||
# time.sleep(3)
|
||||
# retry_times += 1
|
||||
# continue
|
||||
# else:
|
||||
# tracks = json.loads(decode_data.content)
|
||||
# break
|
||||
# except:
|
||||
# logger.warning(
|
||||
# "解析失败!3 秒后重试第 %d 次..." % retry_times)
|
||||
# time.sleep(3)
|
||||
# retry_times += 1
|
||||
#
|
||||
# if retry_times > 3:
|
||||
# logger.warning(
|
||||
# "[SENSITIVE SHIP] 数据丢失 MMSI: %s" % mmsi)
|
||||
# return
|
||||
tracks = self.decode_track(data)
|
||||
logger.info('[SENSITIVE SHIP] MMSI %s 共 %d 个轨迹点 ' % (mmsi, len(tracks)))
|
||||
for track in tracks:
|
||||
last_time = track['utc'] * 1000
|
||||
|
||||
@ -0,0 +1,175 @@
|
||||
import time
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
import pymysql
|
||||
import requests
|
||||
|
||||
# ================== Configuration ==================

# Database connection settings.
# SECURITY NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a secrets store.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
    'autocommit': False  # transactions are committed manually in main()
}

# Translation API endpoint (replace with your own server IP or domain).
TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"

# Only rows loaded after this time are processed (format: YYYY-MM-DD HH:MM:SS).
LOADTIME_AFTER = "2026-01-16 10:40:00"

# Source sites whose rows should be translated.
TARGET_SRCNAMES = [
    'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA==',
    'http://www.kcna.kp/kp/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf',
    'https://energynow.com/category/press_releases/',
    'https://www.fao.org/newsroom/en'  # add your site here
]

# Delay between API requests (seconds) to avoid rate limiting.
REQUEST_DELAY = 1

# Maximum text length per request (must match the API's own limit).
MAX_TEXT_LENGTH = 5000
|
||||
|
||||
|
||||
def normalize_newlines(text: str) -> str:
    """Normalize CRLF and bare CR line endings to LF.

    Falsy input (empty string or None) is returned unchanged.
    """
    if text:
        return text.replace('\r\n', '\n').replace('\r', '\n')
    return text
|
||||
|
||||
|
||||
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one text segment through the HTTP translation API.

    Returns the translated string, "" for blank input, or None when the
    request fails (callers treat None as a per-segment failure).
    """
    if not text or not text.strip():
        return ""

    request_body = {
        "text": text[:MAX_TEXT_LENGTH],  # API enforces this length cap
        "source_lang": source_lang,
        "target_lang": target_lang
    }

    try:
        resp = requests.post(TRANSLATE_API_URL, json=request_body, timeout=10)
        resp.raise_for_status()
        return resp.json().get("translated_text")
    except Exception as e:
        print(f"⚠️ 翻译失败: {e}")
        return None
|
||||
|
||||
|
||||
def translate_content_with_paragraphs(content: str) -> str:
    """
    Translate content paragraph by paragraph with per-segment fault
    tolerance: a failed paragraph becomes an empty line, blank lines are
    preserved, and the result is re-joined with '\n'.
    """
    if not content:
        return ""

    lines_out = []
    for paragraph in normalize_newlines(content).split('\n'):
        if not paragraph.strip():
            # Keep blank lines as-is (no API call, no delay).
            lines_out.append("")
            continue

        translated = translate_single(paragraph)
        if translated is None:
            # Segment failed: drop it (alternatively keep the original).
            print(f" ⚠️ 段落翻译失败,跳过: {paragraph[:30]}...")
            lines_out.append("")
        else:
            lines_out.append(translated)

        # Throttle between API calls.
        time.sleep(REQUEST_DELAY)

    return '\n'.join(lines_out)
|
||||
|
||||
|
||||
# ================== 数据库操作 ==================
|
||||
|
||||
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Write the translated title/content back to a row (no commit here —
    the caller controls the transaction).

    Args:
        cursor: open DB-API cursor.
        es_sid: primary key of the row to update.
        new_title: translated title.
        new_content: translated content.
    """
    # FIX: placeholders were spelled '% s' (with a space). That only works by
    # accident of CPython %-formatting's space flag; use the canonical '%s'
    # marker that DB-API paramstyle 'format' (pymysql) documents.
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
|
||||
|
||||
|
||||
# ================== 主逻辑 ==================
|
||||
|
||||
def main():
    """Batch job: translate pending rows and persist the results.

    Selects rows newer than LOADTIME_AFTER from the configured source
    sites whose es_title is still empty, translates title and content via
    the HTTP API, and commits all updates in a single transaction
    (rolled back and re-raised on any error).
    """
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()

    try:
        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
        query = f"""
        SELECT es_sid, es_urltitle, es_urlcontent
        FROM indeximos
        WHERE es_loadtime > %s
        AND (es_title IS NULL OR TRIM(es_title) = '')
        AND es_srcname IN ({placeholders})
        AND LENGTH(es_video) > 5
        """
        cursor.execute(query, [LOADTIME_AFTER] + TARGET_SRCNAMES)
        records: List[Tuple] = cursor.fetchall()

        total = len(records)
        print(f"✅ 共找到 {total} 条待翻译记录")
        if not records:
            return

        success_count = 0
        for idx, (es_sid, urltitle, urlcontent) in enumerate(records, 1):
            print(f"\n[{idx}/{total}] 处理 es_sid={es_sid}")
            start_time = time.time()

            # Title first; a failed title skips the whole row.
            title_trans = translate_single(urltitle) if urltitle else ""
            if title_trans is None:
                print(" → 标题翻译失败,跳过整条")
                continue

            # Body paragraph by paragraph (per-segment fault tolerance).
            content_trans = translate_content_with_paragraphs(urlcontent)

            update_record(cursor, es_sid, title_trans, content_trans)
            success_count += 1

            elapsed = time.time() - start_time
            print(f" ✅ 翻译成功 | 耗时: {elapsed:.2f}s | 标题: {title_trans[:30]}...")

        conn.commit()
        print(f"\n🎉 完成!成功翻译 {success_count} / {total} 条记录")

    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
|
||||
|
||||
|
||||
# Script entry point: run the batch translation job.
if __name__ == "__main__":
    main()
|
||||
@ -19,6 +19,7 @@ class WebsiteSpiderItem(scrapy.Item):
|
||||
es_extname = scrapy.Field()
|
||||
es_channel = scrapy.Field()
|
||||
es_groupname = scrapy.Field()
|
||||
es_title = scrapy.Field()
|
||||
es_urltitle = scrapy.Field()
|
||||
es_urltopic = scrapy.Field()
|
||||
es_lasttime = scrapy.Field()
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
syntax = "proto3";
|
||||
|
||||
message EsSets //es<EFBFBD><EFBFBD>
|
||||
message EsSets
|
||||
{
|
||||
repeated Es Es = 1;
|
||||
}
|
||||
@ -8,78 +8,79 @@ message EsSets //es<65><73>
|
||||
|
||||
message Es
|
||||
{
|
||||
string es_sid = 1; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_subjectId = 2; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>id
|
||||
string es_hkey = 3; //URLΨһ<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_pkey = 4; //<EFBFBD><EFBFBD>URL<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_startid = 5; //<EFBFBD><EFBFBD>ʼ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_urlname = 6; //URL<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_sitename = 7; //<EFBFBD><EFBFBD>վ<EFBFBD><EFBFBD>
|
||||
string es_extname = 8; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_channel = 9; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><EFBFBD>
|
||||
string es_groupname = 10; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_urltitle = 11; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD>ñ<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_urltopic = 12; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҳ<title><EFBFBD>ñ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ı<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_lasttime = 13; //<EFBFBD>ɼ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_loadtime = 14; //<EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD>䣨ʵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ES<EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD>䣩
|
||||
string es_urldate = 15; //<EFBFBD><EFBFBD><EFBFBD>µķ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_urltime = 16; //<EFBFBD><EFBFBD><EFBFBD>µķ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_srcname = 17; //<EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD><EFBFBD><EFBFBD>Դ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȱʧ<EFBFBD><EFBFBD>
|
||||
string es_authors = 18; //<EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ߣ<EFBFBD><EFBFBD><EFBFBD>ȱʧ<EFBFBD><EFBFBD>
|
||||
string es_district = 19; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>µĵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȱʧ<EFBFBD><EFBFBD>
|
||||
string es_catalog = 20; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_catalog1 = 21; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_catalog2 = 22; //<EFBFBD><EFBFBD><EFBFBD>¶<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_keywords = 23; //<EFBFBD><EFBFBD><EFBFBD>¹ؼ<EFBFBD><EFBFBD>ʣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>½<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ģ<EFBFBD>
|
||||
string es_abstract = 24; //<EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD>ժҪ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>½<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ģ<EFBFBD>
|
||||
string es_simflag = 25; //<EFBFBD>ظ<EFBFBD><EFBFBD><EFBFBD>ǣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>֮<EFBFBD>ظ<EFBFBD><EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD>HKEY
|
||||
string es_simrank = 26; //<EFBFBD><EFBFBD><EFBFBD>ƶ<EFBFBD><EFBFBD><EFBFBD>ֵ
|
||||
string es_urlimage = 27; //ͼƬ<EFBFBD><EFBFBD>ַ
|
||||
string es_imageflag = 28; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͼƬ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ŀ
|
||||
string es_tableflag = 29; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ŀ
|
||||
string es_doclength = 30; //<EFBFBD><EFBFBD><EFBFBD>ij<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_content = 31; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݣ<EFBFBD><EFBFBD><EFBFBD>ͼƬ<EFBFBD><EFBFBD>
|
||||
string es_urlcontent = 32; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͼƬ<EFBFBD><EFBFBD>
|
||||
string es_bbsnum = 33; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_pagelevel = 34; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʼҳ<EFBFBD>濪ʼ<EFBFBD>IJ<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_urllevel = 35; //<EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>Ŀ¼<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_simhash = 36; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>simhashֵ
|
||||
string es_ip = 37; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ip
|
||||
string es_heat = 38; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȶ<EFBFBD>
|
||||
string es_similaritycount = 39; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_similarity = 40; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>id
|
||||
string es_similaritytime = 41; //<EFBFBD><EFBFBD><EFBFBD>ƶȼ<EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>
|
||||
string es_emotion = 42; //<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_warningtime = 43; //Ԥ<EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>
|
||||
string es_carriertype = 44; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_commentcount = 45; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_forwardcount = 46; //ת<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_positiveWords = 47; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_negativeWords = 48; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_negativeProbability = 49; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_reportinfo = 50; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>ϱ<EFBFBD><EFBFBD><EFBFBD>Ϣ
|
||||
string es_attention = 51; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>ע
|
||||
string es_warning = 52; //<EFBFBD>Ƿ<EFBFBD>Ԥ<EFBFBD><EFBFBD>
|
||||
string es_readsign = 53; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>Ѷ<EFBFBD>
|
||||
string es_briefing = 54; //<EFBFBD>Ƿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_warning_word = 55; //Ԥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_attentiontime = 56; //<EFBFBD><EFBFBD>עʱ<EFBFBD><EFBFBD>
|
||||
string es_collection = 57; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>ղ<EFBFBD>
|
||||
string es_attachment = 58; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_userid = 59;//number,<EFBFBD>û<EFBFBD>id<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>罻ý<EFBFBD><EFBFBD><EFBFBD>˻<EFBFBD>)
|
||||
string es_contenttype = 60;//string,<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Post<EFBFBD><EFBFBD><EFBFBD>ͣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>status<EFBFBD><EFBFBD>link<EFBFBD><EFBFBD>photo<EFBFBD><EFBFBD>video<EFBFBD><EFBFBD>event<EFBFBD><EFBFBD>music<EFBFBD><EFBFBD>note<EFBFBD><EFBFBD>offer<EFBFBD><EFBFBD>album<EFBFBD>ȣ<EFBFBD>
|
||||
string es_likecount = 61;//number,<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_links = 62;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>ַ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD>ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>ַ
|
||||
string es_reactioncount = 63;//number,<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_linkdesc = 64;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD>post <EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊ<EFBFBD><EFBFBD><EFBFBD>ӣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>һЩ<EFBFBD><EFBFBD>Ϣ
|
||||
string es_repostuid = 65;//number<EFBFBD><EFBFBD>ת<EFBFBD><EFBFBD>ԭ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ߵ<EFBFBD>ID
|
||||
string es_repostuname =66;//string<EFBFBD><EFBFBD>ת<EFBFBD><EFBFBD>ԭ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ߵ<EFBFBD>name
|
||||
string es_repostid = 67;//string<EFBFBD><EFBFBD>ת<EFBFBD><EFBFBD>ԭ<EFBFBD><EFBFBD>ID
|
||||
string es_tags = 68;//string<EFBFBD><EFBFBD><EFBFBD>ἰ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_mentionsaccount = 69;//string<EFBFBD><EFBFBD><EFBFBD>ἰ<EFBFBD>˺<EFBFBD>
|
||||
string es_video = 70;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_isrepost = 71;//boolean<EFBFBD><EFBFBD><EFBFBD>Ƿ<EFBFBD>ת<EFBFBD><EFBFBD>
|
||||
string es_lang = 72;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_client = 73;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͻ<EFBFBD><EFBFBD><EFBFBD>
|
||||
string es_sid = 1;
|
||||
string es_subjectId = 2;
|
||||
string es_hkey = 3;
|
||||
string es_pkey = 4;
|
||||
string es_startid = 5;
|
||||
string es_urlname = 6;
|
||||
string es_sitename = 7;
|
||||
string es_extname = 8;
|
||||
string es_channel = 9;
|
||||
string es_groupname = 10;
|
||||
string es_urltitle = 11;
|
||||
string es_urltopic = 12;
|
||||
string es_lasttime = 13;
|
||||
string es_loadtime = 14;
|
||||
string es_urldate = 15;
|
||||
string es_urltime = 16;
|
||||
string es_srcname = 17;
|
||||
string es_authors = 18;
|
||||
string es_district = 19;
|
||||
string es_catalog = 20;
|
||||
string es_catalog1 = 21;
|
||||
string es_catalog2 = 22;
|
||||
string es_keywords = 23;
|
||||
string es_abstract = 24;
|
||||
string es_simflag = 25;
|
||||
string es_simrank = 26;
|
||||
string es_urlimage = 27;
|
||||
string es_imageflag = 28;
|
||||
string es_tableflag = 29;
|
||||
string es_doclength = 30;
|
||||
string es_content = 31;
|
||||
string es_urlcontent = 32;
|
||||
string es_bbsnum = 33;
|
||||
string es_pagelevel = 34;
|
||||
string es_urllevel = 35;
|
||||
string es_simhash = 36;
|
||||
string es_ip = 37;
|
||||
string es_heat = 38;
|
||||
string es_similaritycount = 39;
|
||||
string es_similarity = 40;
|
||||
string es_similaritytime = 41;
|
||||
string es_emotion = 42;
|
||||
string es_warningtime = 43;
|
||||
string es_carriertype = 44;
|
||||
string es_commentcount = 45;
|
||||
string es_forwardcount = 46;
|
||||
string es_positiveWords = 47;
|
||||
string es_negativeWords = 48;
|
||||
string es_negativeProbability = 49;
|
||||
string es_reportinfo = 50;
|
||||
string es_attention = 51;
|
||||
string es_warning = 52;
|
||||
string es_readsign = 53;
|
||||
string es_briefing = 54;
|
||||
string es_warning_word = 55;
|
||||
string es_attentiontime = 56;
|
||||
string es_collection = 57;
|
||||
string es_attachment = 58;
|
||||
string es_userid = 59;
|
||||
string es_contenttype = 60;
|
||||
string es_likecount = 61;
|
||||
string es_links = 62;
|
||||
string es_reactioncount = 63;
|
||||
string es_linkdesc = 64;
|
||||
string es_repostuid = 65;
|
||||
string es_repostuname =66;
|
||||
string es_repostid = 67;
|
||||
string es_tags = 68;
|
||||
string es_mentionsaccount = 69;
|
||||
string es_video = 70;
|
||||
string es_isrepost = 71;
|
||||
string es_lang = 72;
|
||||
string es_client = 73;
|
||||
string es_snapshot = 74;
|
||||
string es_title = 75;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -18,7 +18,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
package='',
|
||||
syntax='proto3',
|
||||
serialized_options=None,
|
||||
serialized_pb=b'\n\x08\x45s.proto\"\x19\n\x06\x45sSets\x12\x0f\n\x02\x45s\x18\x01 \x03(\x0b\x32\x03.Es\"\xba\x0c\n\x02\x45s\x12\x0e\n\x06\x65s_sid\x18\x01 \x01(\t\x12\x14\n\x0c\x65s_subjectId\x18\x02 \x01(\t\x12\x0f\n\x07\x65s_hkey\x18\x03 \x01(\t\x12\x0f\n\x07\x65s_pkey\x18\x04 \x01(\t\x12\x12\n\nes_startid\x18\x05 \x01(\t\x12\x12\n\nes_urlname\x18\x06 \x01(\t\x12\x13\n\x0b\x65s_sitename\x18\x07 \x01(\t\x12\x12\n\nes_extname\x18\x08 \x01(\t\x12\x12\n\nes_channel\x18\t \x01(\t\x12\x14\n\x0c\x65s_groupname\x18\n \x01(\t\x12\x13\n\x0b\x65s_urltitle\x18\x0b \x01(\t\x12\x13\n\x0b\x65s_urltopic\x18\x0c \x01(\t\x12\x13\n\x0b\x65s_lasttime\x18\r \x01(\t\x12\x13\n\x0b\x65s_loadtime\x18\x0e \x01(\t\x12\x12\n\nes_urldate\x18\x0f \x01(\t\x12\x12\n\nes_urltime\x18\x10 \x01(\t\x12\x12\n\nes_srcname\x18\x11 \x01(\t\x12\x12\n\nes_authors\x18\x12 \x01(\t\x12\x13\n\x0b\x65s_district\x18\x13 \x01(\t\x12\x12\n\nes_catalog\x18\x14 \x01(\t\x12\x13\n\x0b\x65s_catalog1\x18\x15 \x01(\t\x12\x13\n\x0b\x65s_catalog2\x18\x16 \x01(\t\x12\x13\n\x0b\x65s_keywords\x18\x17 \x01(\t\x12\x13\n\x0b\x65s_abstract\x18\x18 \x01(\t\x12\x12\n\nes_simflag\x18\x19 \x01(\t\x12\x12\n\nes_simrank\x18\x1a \x01(\t\x12\x13\n\x0b\x65s_urlimage\x18\x1b \x01(\t\x12\x14\n\x0c\x65s_imageflag\x18\x1c \x01(\t\x12\x14\n\x0c\x65s_tableflag\x18\x1d \x01(\t\x12\x14\n\x0c\x65s_doclength\x18\x1e \x01(\t\x12\x12\n\nes_content\x18\x1f \x01(\t\x12\x15\n\res_urlcontent\x18 \x01(\t\x12\x11\n\tes_bbsnum\x18! 
\x01(\t\x12\x14\n\x0c\x65s_pagelevel\x18\" \x01(\t\x12\x13\n\x0b\x65s_urllevel\x18# \x01(\t\x12\x12\n\nes_simhash\x18$ \x01(\t\x12\r\n\x05\x65s_ip\x18% \x01(\t\x12\x0f\n\x07\x65s_heat\x18& \x01(\t\x12\x1a\n\x12\x65s_similaritycount\x18\' \x01(\t\x12\x15\n\res_similarity\x18( \x01(\t\x12\x19\n\x11\x65s_similaritytime\x18) \x01(\t\x12\x12\n\nes_emotion\x18* \x01(\t\x12\x16\n\x0e\x65s_warningtime\x18+ \x01(\t\x12\x16\n\x0e\x65s_carriertype\x18, \x01(\t\x12\x17\n\x0f\x65s_commentcount\x18- \x01(\t\x12\x17\n\x0f\x65s_forwardcount\x18. \x01(\t\x12\x18\n\x10\x65s_positiveWords\x18/ \x01(\t\x12\x18\n\x10\x65s_negativeWords\x18\x30 \x01(\t\x12\x1e\n\x16\x65s_negativeProbability\x18\x31 \x01(\t\x12\x15\n\res_reportinfo\x18\x32 \x01(\t\x12\x14\n\x0c\x65s_attention\x18\x33 \x01(\t\x12\x12\n\nes_warning\x18\x34 \x01(\t\x12\x13\n\x0b\x65s_readsign\x18\x35 \x01(\t\x12\x13\n\x0b\x65s_briefing\x18\x36 \x01(\t\x12\x17\n\x0f\x65s_warning_word\x18\x37 \x01(\t\x12\x18\n\x10\x65s_attentiontime\x18\x38 \x01(\t\x12\x15\n\res_collection\x18\x39 \x01(\t\x12\x15\n\res_attachment\x18: \x01(\t\x12\x11\n\tes_userid\x18; \x01(\t\x12\x16\n\x0e\x65s_contenttype\x18< \x01(\t\x12\x14\n\x0c\x65s_likecount\x18= \x01(\t\x12\x10\n\x08\x65s_links\x18> \x01(\t\x12\x18\n\x10\x65s_reactioncount\x18? \x01(\t\x12\x13\n\x0b\x65s_linkdesc\x18@ \x01(\t\x12\x14\n\x0c\x65s_repostuid\x18\x41 \x01(\t\x12\x16\n\x0e\x65s_repostuname\x18\x42 \x01(\t\x12\x13\n\x0b\x65s_repostid\x18\x43 \x01(\t\x12\x0f\n\x07\x65s_tags\x18\x44 \x01(\t\x12\x1a\n\x12\x65s_mentionsaccount\x18\x45 \x01(\t\x12\x10\n\x08\x65s_video\x18\x46 \x01(\t\x12\x13\n\x0b\x65s_isrepost\x18G \x01(\t\x12\x0f\n\x07\x65s_lang\x18H \x01(\t\x12\x11\n\tes_client\x18I \x01(\t\x12\x13\n\x0b\x65s_snapshot\x18J \x01(\tb\x06proto3'
|
||||
serialized_pb=b'\n\x08\x45s.proto\"\x19\n\x06\x45sSets\x12\x0f\n\x02\x45s\x18\x01 \x03(\x0b\x32\x03.Es\"\xcc\x0c\n\x02\x45s\x12\x0e\n\x06\x65s_sid\x18\x01 \x01(\t\x12\x14\n\x0c\x65s_subjectId\x18\x02 \x01(\t\x12\x0f\n\x07\x65s_hkey\x18\x03 \x01(\t\x12\x0f\n\x07\x65s_pkey\x18\x04 \x01(\t\x12\x12\n\nes_startid\x18\x05 \x01(\t\x12\x12\n\nes_urlname\x18\x06 \x01(\t\x12\x13\n\x0b\x65s_sitename\x18\x07 \x01(\t\x12\x12\n\nes_extname\x18\x08 \x01(\t\x12\x12\n\nes_channel\x18\t \x01(\t\x12\x14\n\x0c\x65s_groupname\x18\n \x01(\t\x12\x13\n\x0b\x65s_urltitle\x18\x0b \x01(\t\x12\x13\n\x0b\x65s_urltopic\x18\x0c \x01(\t\x12\x13\n\x0b\x65s_lasttime\x18\r \x01(\t\x12\x13\n\x0b\x65s_loadtime\x18\x0e \x01(\t\x12\x12\n\nes_urldate\x18\x0f \x01(\t\x12\x12\n\nes_urltime\x18\x10 \x01(\t\x12\x12\n\nes_srcname\x18\x11 \x01(\t\x12\x12\n\nes_authors\x18\x12 \x01(\t\x12\x13\n\x0b\x65s_district\x18\x13 \x01(\t\x12\x12\n\nes_catalog\x18\x14 \x01(\t\x12\x13\n\x0b\x65s_catalog1\x18\x15 \x01(\t\x12\x13\n\x0b\x65s_catalog2\x18\x16 \x01(\t\x12\x13\n\x0b\x65s_keywords\x18\x17 \x01(\t\x12\x13\n\x0b\x65s_abstract\x18\x18 \x01(\t\x12\x12\n\nes_simflag\x18\x19 \x01(\t\x12\x12\n\nes_simrank\x18\x1a \x01(\t\x12\x13\n\x0b\x65s_urlimage\x18\x1b \x01(\t\x12\x14\n\x0c\x65s_imageflag\x18\x1c \x01(\t\x12\x14\n\x0c\x65s_tableflag\x18\x1d \x01(\t\x12\x14\n\x0c\x65s_doclength\x18\x1e \x01(\t\x12\x12\n\nes_content\x18\x1f \x01(\t\x12\x15\n\res_urlcontent\x18 \x01(\t\x12\x11\n\tes_bbsnum\x18! 
\x01(\t\x12\x14\n\x0c\x65s_pagelevel\x18\" \x01(\t\x12\x13\n\x0b\x65s_urllevel\x18# \x01(\t\x12\x12\n\nes_simhash\x18$ \x01(\t\x12\r\n\x05\x65s_ip\x18% \x01(\t\x12\x0f\n\x07\x65s_heat\x18& \x01(\t\x12\x1a\n\x12\x65s_similaritycount\x18\' \x01(\t\x12\x15\n\res_similarity\x18( \x01(\t\x12\x19\n\x11\x65s_similaritytime\x18) \x01(\t\x12\x12\n\nes_emotion\x18* \x01(\t\x12\x16\n\x0e\x65s_warningtime\x18+ \x01(\t\x12\x16\n\x0e\x65s_carriertype\x18, \x01(\t\x12\x17\n\x0f\x65s_commentcount\x18- \x01(\t\x12\x17\n\x0f\x65s_forwardcount\x18. \x01(\t\x12\x18\n\x10\x65s_positiveWords\x18/ \x01(\t\x12\x18\n\x10\x65s_negativeWords\x18\x30 \x01(\t\x12\x1e\n\x16\x65s_negativeProbability\x18\x31 \x01(\t\x12\x15\n\res_reportinfo\x18\x32 \x01(\t\x12\x14\n\x0c\x65s_attention\x18\x33 \x01(\t\x12\x12\n\nes_warning\x18\x34 \x01(\t\x12\x13\n\x0b\x65s_readsign\x18\x35 \x01(\t\x12\x13\n\x0b\x65s_briefing\x18\x36 \x01(\t\x12\x17\n\x0f\x65s_warning_word\x18\x37 \x01(\t\x12\x18\n\x10\x65s_attentiontime\x18\x38 \x01(\t\x12\x15\n\res_collection\x18\x39 \x01(\t\x12\x15\n\res_attachment\x18: \x01(\t\x12\x11\n\tes_userid\x18; \x01(\t\x12\x16\n\x0e\x65s_contenttype\x18< \x01(\t\x12\x14\n\x0c\x65s_likecount\x18= \x01(\t\x12\x10\n\x08\x65s_links\x18> \x01(\t\x12\x18\n\x10\x65s_reactioncount\x18? \x01(\t\x12\x13\n\x0b\x65s_linkdesc\x18@ \x01(\t\x12\x14\n\x0c\x65s_repostuid\x18\x41 \x01(\t\x12\x16\n\x0e\x65s_repostuname\x18\x42 \x01(\t\x12\x13\n\x0b\x65s_repostid\x18\x43 \x01(\t\x12\x0f\n\x07\x65s_tags\x18\x44 \x01(\t\x12\x1a\n\x12\x65s_mentionsaccount\x18\x45 \x01(\t\x12\x10\n\x08\x65s_video\x18\x46 \x01(\t\x12\x13\n\x0b\x65s_isrepost\x18G \x01(\t\x12\x0f\n\x07\x65s_lang\x18H \x01(\t\x12\x11\n\tes_client\x18I \x01(\t\x12\x13\n\x0b\x65s_snapshot\x18J \x01(\t\x12\x10\n\x08\x65s_title\x18K \x01(\tb\x06proto3'
|
||||
)
|
||||
|
||||
|
||||
@ -580,6 +580,13 @@ _ES = _descriptor.Descriptor(
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='es_title', full_name='Es.es_title', index=74,
|
||||
number=75, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
@ -593,7 +600,7 @@ _ES = _descriptor.Descriptor(
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=40,
|
||||
serialized_end=1634,
|
||||
serialized_end=1652,
|
||||
)
|
||||
|
||||
_ESSETS.fields_by_name['Es'].message_type = _ES
|
||||
|
||||
@ -12,8 +12,9 @@ SCHEDULER_PERSIST = True
|
||||
SELENIUM_DRIVER_NAME = 'firefox'
|
||||
SELENIUM_DRIVER_EXECUTABLE_PATH = [
|
||||
'http://10.55.13.121:28095',
|
||||
'http://10.55.13.108:28095',
|
||||
# 'http://10.55.13.108:28095',
|
||||
'http://10.55.13.3:28095',
|
||||
'http://74.121.148.204:28095'
|
||||
]
|
||||
SELENIUM_DRIVER_ARGUMENTS = ['-headless'] # '--headless' if using chrome instead of firefox
|
||||
SELENIUM_DRIVER_PREFERENCES = {
|
||||
@ -26,8 +27,10 @@ PER_BATCH_IP_USE_TIMES = 5 # 代理中间件每次从ip池获取一批ip,定
|
||||
|
||||
# REDIS_HOST = '38.54.94.107'
|
||||
# REDIS_PORT = '28097'
|
||||
REDIS_HOST = '10.55.13.3'
|
||||
REDIS_PORT = '7379'
|
||||
# REDIS_HOST = '10.55.13.3'
|
||||
# REDIS_PORT = '7379'
|
||||
REDIS_HOST = '107.182.191.3'
|
||||
REDIS_PORT = 7379
|
||||
REDIS_PWD = 'jlkj-841-2-redis'
|
||||
REDIS_PARAMS = {
|
||||
'password': 'jlkj-841-2-redis',
|
||||
@ -164,3 +167,10 @@ ITEM_PIPELINES = {
|
||||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||||
'WebsiteSpider.pipelines.ProtobufSavePipeline': 300,
|
||||
}
|
||||
|
||||
############################## 翻译
|
||||
MAX_TEXT_LENGTH = 5999
|
||||
# 翻译 API 地址(替换为你的服务器 IP 或域名)
|
||||
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
|
||||
# 单次请求间隔(秒),避免 API 被限流
|
||||
REQUEST_DELAY = 1
|
||||
|
||||
@ -5,7 +5,9 @@ import re
|
||||
import scrapy
|
||||
import validators
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
import redis
|
||||
|
||||
from WebsiteSpider.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD
|
||||
from WebsiteSpider.scrapy_selenium import SeleniumRequest
|
||||
from WebsiteSpider.utils.http_utils import build_url
|
||||
from WebsiteSpider.utils.parser_utils import parse_item_from_response
|
||||
@ -18,7 +20,8 @@ class WebsiteInfoCommonSpider(RedisSpider):
|
||||
super(WebsiteInfoCommonSpider, self).__init__(*args, **kwargs)
|
||||
json_params = json.loads(params)
|
||||
self.name = 'WebSite_' + json_params['clusterName']
|
||||
self.redis_client = None
|
||||
self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
|
||||
password=REDIS_PWD)
|
||||
if 'job_id' in json_params:
|
||||
self.job_id = json_params['job_id']
|
||||
|
||||
@ -36,7 +39,7 @@ class WebsiteInfoCommonSpider(RedisSpider):
|
||||
# 根据url特征判断是否为内容页,若是则解析文本内容
|
||||
detail_page_reg = parse_rule['detailPageReg']
|
||||
if detail_page_reg == "" or re.search(detail_page_reg, response.url) is not None:
|
||||
yield_flag, webpage_item = parse_item_from_response(response, parse_rule)
|
||||
yield_flag, webpage_item = parse_item_from_response(response, parse_rule, self.redis_client)
|
||||
if yield_flag:
|
||||
yield webpage_item
|
||||
|
||||
|
||||
@ -89,7 +89,9 @@ def get_format_time(pattern, time_str):
|
||||
date = result.group(1)
|
||||
time_t = result.group(2)
|
||||
date = date.replace('/', '-').replace(".", "-").replace(
|
||||
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(' ', '-').replace('--', '-')
|
||||
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(
|
||||
"년", "-").replace("월", "-").replace("일", "").replace(
|
||||
' ', '-').replace('--', '-')
|
||||
date_array = date.split('-')
|
||||
for i in range(len(date_array)):
|
||||
if (date_array[i].endswith('st') or
|
||||
@ -135,7 +137,7 @@ def get_format_time(pattern, time_str):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
||||
a = ['July 26, 2024 12:53 PM']
|
||||
a = ['Wed, 12/03/2025 - 12:00']
|
||||
for _ in a:
|
||||
print(get_time_stamp(_))
|
||||
# print(get_time_stamp(_, {r"(\w+ \d+, \d{4})\D*(\d+:\d+)\D*": ['%B-%d-%Y %H:%M:%S']}))
|
||||
# print(get_time_stamp(_, {r"(\d{2}.\d{2}.\d{4})\D*(\d{2}\d{2}\d{2})*\D*": ['%d-%m-%Y %H:%M:%S']}))
|
||||
|
||||
@ -20,11 +20,11 @@ def http_get(url):
|
||||
return rsp
|
||||
|
||||
|
||||
def http_post(url, data, headers=None):
|
||||
def http_post(url, data, headers=None, timeout=60):
|
||||
if headers:
|
||||
rsp = requests.post(url, data=data, headers=headers)
|
||||
rsp = requests.post(url, data=data, headers=headers, timeout=timeout)
|
||||
else:
|
||||
rsp = requests.post(url, data=data, headers={'User-Agent': ua})
|
||||
rsp = requests.post(url, data=data, headers={'User-Agent': ua}, timeout=timeout)
|
||||
return rsp
|
||||
|
||||
|
||||
|
||||
@ -9,9 +9,10 @@ from scrapy.loader import ItemLoader
|
||||
from WebsiteSpider.items import WebsiteSpiderItem
|
||||
from WebsiteSpider.utils.date_utils import transfer_time_zone, get_time_stamp
|
||||
from WebsiteSpider.utils.http_utils import filter_html_tags, build_url
|
||||
from WebsiteSpider.utils.traslate_utils import translate_single, translate_content_with_paragraphs, update_record
|
||||
|
||||
|
||||
def parse_item_from_response(response, parse_rule):
|
||||
def parse_item_from_response(response, parse_rule, redis_client):
|
||||
current_url = response.url
|
||||
allowed_domains = parse_rule['allowDomain'].split(';')
|
||||
mapping = parse_rule['fieldMappings']
|
||||
@ -116,6 +117,7 @@ def parse_item_from_response(response, parse_rule):
|
||||
logger.info("urltime: %s" % webpage_item['es_urltime'])
|
||||
except KeyError:
|
||||
logger.info('时间解析失败,当前页面url: %s' % response.url)
|
||||
|
||||
time_parse_rule = None
|
||||
if 'dateReg' in mapping:
|
||||
time_parse_rule = {
|
||||
@ -155,4 +157,27 @@ def parse_item_from_response(response, parse_rule):
|
||||
logger.info('时间无法解析,解析规则是:' + mapping['es_urltime'])
|
||||
if filter_VIP_content:
|
||||
logger.info('当前内容是VIP文章,并不完整,已经过滤。')
|
||||
if yield_flag:
|
||||
try:
|
||||
# 1. 从 Redis 获取原始数据
|
||||
raw_urls = redis_client.lrange('WebsiteSpider:translate_sites', 0, -1)
|
||||
translate_list = [
|
||||
url_bytes.decode('utf-8').strip()
|
||||
for url_bytes in raw_urls
|
||||
if url_bytes and url_bytes.decode('utf-8').strip()
|
||||
]
|
||||
if webpage_item['es_srcname'] in translate_list:
|
||||
# 翻译标题
|
||||
webpage_item['es_abstract'] = translate_single(webpage_item['es_urltitle'])
|
||||
if webpage_item['es_abstract'] is None:
|
||||
logger.warning(" → 标题翻译失败,跳过整条")
|
||||
else:
|
||||
logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}")
|
||||
# 翻译内容(按段落,容错)
|
||||
no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False)
|
||||
webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content)
|
||||
logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}")
|
||||
except Exception as e:
|
||||
logger.error(repr(e))
|
||||
|
||||
return yield_flag, webpage_item
|
||||
|
||||
@ -3,6 +3,7 @@ import logging as logger
|
||||
import os
|
||||
import random
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
from selenium.webdriver import DesiredCapabilities
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
@ -25,10 +26,15 @@ def check_session(drive_path):
|
||||
api = drive_path + '/graphql'
|
||||
post_body = '{"query": "{ grid { maxSession, sessionCount } }"}'
|
||||
try:
|
||||
response = http_post(api, post_body)
|
||||
# 添加超时控制,1分钟 = 60秒
|
||||
response = http_post(api, post_body, timeout=60)
|
||||
data_body = json.loads(response.content.decode())
|
||||
session_info = data_body['data']['grid']
|
||||
return session_info
|
||||
except requests.exceptions.Timeout as e:
|
||||
logger.error("获取地址为 {} 的 Selenium 信息超时(超过5分钟):".format(drive_path))
|
||||
logger.error(repr(e))
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.warning("获取地址为 {} 的 Selenium 信息失败,错误信息是:".format(drive_path))
|
||||
logger.warning(repr(e))
|
||||
|
||||
78
spiders/WebsiteSpider/WebsiteSpider/utils/traslate_utils.py
Normal file
78
spiders/WebsiteSpider/WebsiteSpider/utils/traslate_utils.py
Normal file
@ -0,0 +1,78 @@
|
||||
from WebsiteSpider.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
|
||||
import requests
|
||||
import time
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
def normalize_newlines(text: str) -> str:
    r"""Normalize line endings: turn ``\r\n`` and bare ``\r`` into ``\n``.

    Falsy input (``""`` or ``None``) is returned unchanged.
    """
    if text:
        # Replace the two-character Windows ending first so it never
        # leaves behind a stray carriage return.
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
    return text
|
||||
|
||||
|
||||
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one chunk of text through the HTTP translation API.

    Returns the translated string on success, ``""`` for blank input,
    and ``None`` when the request fails (network error, non-2xx status,
    or a response without the expected field).
    """
    if not text or not text.strip():
        return ""

    # The remote service limits payload size, so clip the input text.
    request_body = {
        "text": text[:MAX_TEXT_LENGTH],
        "source_lang": source_lang,
        "target_lang": target_lang,
    }

    try:
        rsp = requests.post(TRANSLATE_API_URL, json=request_body, timeout=10)
        rsp.raise_for_status()
        # Missing key yields None, which callers treat as a failure.
        return rsp.json().get("translated_text")
    except Exception as e:
        print(f"⚠️ 翻译失败: {e}")
        return None
|
||||
|
||||
|
||||
def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* paragraph by paragraph, with fault tolerance.

    A paragraph whose translation fails is replaced by an empty line
    instead of aborting the whole document; blank lines are preserved.
    Returns the translated paragraphs re-joined with ``\\n``.
    """
    if not content:
        return ""

    results = []
    for para in normalize_newlines(content).split('\n'):
        if not para.strip():
            # Keep blank lines so the paragraph layout survives.
            results.append("")
            continue

        translated = translate_single(para)
        if translated is None:
            # Failed paragraph: drop it (leave an empty line) rather
            # than fail the whole document.
            print(f" ⚠️ 段落翻译失败,跳过: {para[:30]}...")
            results.append("")
        else:
            results.append(translated)

        # Throttle between API calls to avoid rate limiting.
        time.sleep(REQUEST_DELAY)

    return '\n'.join(results)
|
||||
|
||||
|
||||
# ================== 数据库操作 ==================
|
||||
|
||||
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Persist a translated title/content pair back to the ``indeximos`` table.

    Uses DB-API parameter binding so values are escaped by the driver
    (never string interpolation).

    :param cursor: an open DB-API cursor; the caller owns commit/close
    :param es_sid: primary key of the row to update
    :param new_title: translated title to store in ``es_title``
    :param new_content: translated body to store in ``es_content``
    """
    # Fix: the placeholders were written as "% s"; drivers that scan for
    # the literal "%s" token (e.g. psycopg2) reject that spelling, so use
    # the standard "%s" form.
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: translate a multi-paragraph Russian news sample
    # through the remote API (requires network access to the service).
    sample_text = "ВСУ провалили наступление на Сумском и Харьковском направлениях, сообщили РИА Новости в силовых структурах. В результате слаженных действий российских бойцов контратаки отражены, а противник обращен в бегство. Введенные ЕС ограничения на передвижения российских дипломатов противоречат Венской конвенции о дипломатических сношениях и мешают нормальной работе дипмиссий. Об этом заявил РИА Новости посол России в Бельгии Денис Гончар. Вице-президент США Джей Ди Вэнс посетит с визитом Армению и Азербайджан. Поездка в Ереван состоится 9-10 февраля, в Баку – 10-11 февраля. В Вашингтон Вэнс вернется \"в среду вечером\", сообщает его пресс-пул. Либерально-демократическая партия под руководством премьер-министра Японии Санаэ Такаити победила на выборах в ключевую нижнюю палату парламента. Представители ЛДП получат 316 из 465 мандатов и смогут проводить законопроекты, даже если они не получат поддержки верхней палаты, где партия не имеет большинства. В России самая низкая безработица в странах \"Большой двадцатки\", выяснило РИА Новости, изучив данные национальных статслужб по итогам 2025 года. Уровень безработицы в России в декабре составил 2,2 процента, что на одну десятую процента ниже показателя 2024 года."
    print(translate_content_with_paragraphs(sample_text))
|
||||
@ -9,4 +9,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(dirpath)
|
||||
|
||||
if __name__ == "__main__":
|
||||
execute(['scrapy', 'crawl', 'website_info_common', '-a', 'params={"job_id":"801","clusterName":"star_4"}'])
|
||||
execute(['scrapy', 'crawl', 'Website_report_list', '-a', 'params={"job_id":"801","clusterName":"star_4"}'])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user