Compare commits

main...remote-dsp

No commits in common. "main" and "remote-dsp" have entirely different histories.

83 changed files with 3665 additions and 7805 deletions

4
.gitignore vendored
View File

@@ -8,7 +8,6 @@ __pycache__/
# Distribution / packaging # Distribution / packaging
.Python .Python
.vscode/ .vscode/
/.venv/
env/ env/
build/ build/
develop-eggs/ develop-eggs/
@@ -64,5 +63,4 @@ target/
._* ._*
node_modules/ node_modules/
.arts/ .arts/
.jlsp/ .jlsp/
*.iml

View File

@@ -1,21 +0,0 @@
# 使用官方 Python 3.8.2 slim 镜像(精简版)
FROM python:3.8.2-slim
# 设置工作目录
WORKDIR /app
# 安装依赖前先复制 requirements利用 Docker 缓存)
COPY requirements.txt .
# 升级 pip 并安装依赖(使用国内源加速,可选)
RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/
# 复制应用代码
COPY . .
# 暴露端口
EXPOSE 5000
# 启动命令(使用 gunicorn 提升生产性能)
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "60", "main:app"]

View File

@@ -1,57 +0,0 @@
# main.py
from flask import Flask, request, jsonify
from translate import translate_text
app = Flask(__name__)
@app.route('/translate', methods=['POST'])
def translate_api():
"""
多语言翻译接口
请求体示例:
{
"text": "Hello world",
"source_lang": "en", // 可选默认 auto
"target_lang": "zh" // 可选默认 zh
}
"""
data = request.get_json()
if not data or 'text' not in data:
return jsonify({"error": "缺少参数 'text'"}), 400
text = data['text']
source_lang = data.get('source_lang', 'auto')
target_lang = data.get('target_lang', 'zh')
result = translate_text(text, source_lang, target_lang)
if result['success']:
return jsonify({
"translated_text": result['translated_text'],
"source_lang": source_lang,
"target_lang": target_lang
})
else:
return jsonify({"error": result['error']}), 400
@app.route('/health', methods=['GET'])
def health_check():
return jsonify({"status": "ok", "service": "baidu-translate"})
@app.route('/', methods=['GET'])
def index():
return jsonify({
"message": "Baidu Translate API Service",
"endpoints": {
"translate": "POST /translate",
"health": "GET /health"
}
})
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=False)

View File

@@ -1,3 +0,0 @@
Flask==2.3.3
requests==2.31.0
gunicorn==21.2.0

View File

@@ -1,26 +0,0 @@
#!/bin/bash
# 构建镜像
echo "正在构建 Docker 镜像..."
docker build -t baidu-translate-api:latest .
# 停止并删除旧容器(如果存在)
if [ "$(docker ps -q -f name=baidu-translate)" ]; then
echo "停止旧容器..."
docker stop baidu-translate
fi
if [ "$(docker ps -aq -f status=exited -f name=baidu-translate)" ]; then
echo "删除旧容器..."
docker rm baidu-translate
fi
# 启动新容器
echo "启动容器..."
docker run -d \
--name baidu-translate \
-p 28081:5000 \
--restart unless-stopped \
baidu-translate-api:latest
echo "服务已启动!访问 http://<服务器IP>:5000/health"

View File

@@ -1,26 +0,0 @@
# settings.py
# 百度翻译 API 配置
BAIDU_APP_ID = "20200811000539778"
BAIDU_SECRET_KEY = "uK9IyUhuEWX3PIqN75iC"
TIMEOUT = 10
MAX_TEXT_LENGTH = 100
# 百度语言代码映射ISO 639-1 → Baidu Code
BAIDU_LANG_MAP = {
'zh': 'zh',
'en': 'en',
'ko': 'kor',
'ja': 'jp',
'fr': 'fra',
'es': 'spa',
'ru': 'ru',
'de': 'de',
'pt': 'pt',
'it': 'it',
'ar': 'ara',
'th': 'th',
'vi': 'vie',
# 可继续扩展
}

View File

@@ -1,71 +0,0 @@
# translate.py
import hashlib
import random
import requests
import settings
def iso_to_baidu_lang(iso_code: str) -> str:
"""将 ISO 639-1 语言代码转换为百度翻译所需代码"""
code = settings.BAIDU_LANG_MAP.get(iso_code.lower())
if code is None:
raise ValueError(f"不支持的语言代码: {iso_code}")
return code
def translate_text(text: str, source_lang: str = "auto", target_lang: str = "zh") -> dict:
"""
使用百度翻译 API 进行多语言翻译
:param text: 原文
:param source_lang: 源语言 ISO 代码 'ko', 'en'默认 'auto' 自动检测
:param target_lang: 目标语言 ISO 代码默认 'zh'
:return: {'success': bool, 'translated_text': str, 'error': str (optional)}
"""
if not text or not text.strip():
return {"success": False, "error": "输入文本为空"}
try:
from_lang = "auto" if source_lang == "auto" else iso_to_baidu_lang(source_lang)
to_lang = iso_to_baidu_lang(target_lang)
except ValueError as e:
return {"success": False, "error": str(e)}
q = text[:settings.MAX_TEXT_LENGTH]
try:
salt = random.randint(32768, 65536)
sign_str = settings.BAIDU_APP_ID + q + str(salt) + settings.BAIDU_SECRET_KEY
sign = hashlib.md5(sign_str.encode()).hexdigest()
payload = {
'q': q,
'from': from_lang,
'to': to_lang,
'appid': settings.BAIDU_APP_ID,
'salt': salt,
'sign': sign
}
response = requests.post(
"https://fanyi-api.baidu.com/api/trans/vip/translate",
data=payload,
timeout=settings.TIMEOUT
)
response.raise_for_status()
result = response.json()
if 'error_code' in result:
error_msg = f"百度API错误 {result.get('error_code')}: {result.get('error_msg', '')}"
return {"success": False, "error": error_msg}
if 'trans_result' not in result or not result['trans_result']:
return {"success": False, "error": "翻译结果为空"}
translated = result['trans_result'][0]['dst']
return {"success": True, "translated_text": translated}
except requests.exceptions.RequestException as e:
return {"success": False, "error": f"网络请求失败: {str(e)}"}
except Exception as e:
return {"success": False, "error": f"未知错误: {str(e)}"}

View File

@@ -1,23 +1,199 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<module version="4"> <module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="FacetManager"> <component name="FacetManager">
<facet type="web" name="Web"> <facet type="web" name="Web">
<configuration> <configuration>
<webroots /> <webroots />
</configuration> </configuration>
</facet> </facet>
<facet type="jpa" name="JPA">
<configuration>
<setting name="validation-enabled" value="true" />
<setting name="provider-name" value="Hibernate" />
<datasource-mapping>
<factory-entry name="entityManagerFactory" />
</datasource-mapping>
<naming-strategy-map />
</configuration>
</facet>
<facet type="Spring" name="Spring"> <facet type="Spring" name="Spring">
<configuration /> <configuration />
</facet> </facet>
</component> </component>
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: org.jetbrains:annotations:26.0.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.18" level="project" />
<orderEntry type="library" name="Maven: com.alibaba:fastjson:1.2.75" level="project" />
<orderEntry type="library" name="Maven: org.projectlombok:lombok:1.18.16" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-web:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-logging:2.4.1" level="project" />
<orderEntry type="library" name="Maven: ch.qos.logback:logback-classic:1.2.3" level="project" />
<orderEntry type="library" name="Maven: ch.qos.logback:logback-core:1.2.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.30" level="project" />
<orderEntry type="library" name="Maven: jakarta.annotation:jakarta.annotation-api:1.3.5" level="project" />
<orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.27" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-json:2.4.1" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.11.3" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.11.3" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.module:jackson-module-parameter-names:2.11.3" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-tomcat:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.tomcat.embed:tomcat-embed-core:9.0.41" level="project" />
<orderEntry type="library" name="Maven: org.glassfish:jakarta.el:3.0.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.tomcat.embed:tomcat-embed-websocket:9.0.41" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-web:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-beans:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-webmvc:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-aop:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-expression:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-data-elasticsearch:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework.data:spring-data-elasticsearch:4.1.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-tx:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.data:spring-data-commons:2.4.2" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:transport-netty4-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-buffer:4.1.55.Final" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-codec:4.1.55.Final" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-codec-http:4.1.55.Final" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-common:4.1.55.Final" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-handler:4.1.55.Final" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-resolver:4.1.55.Final" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-transport:4.1.55.Final" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.client:elasticsearch-rest-high-level-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-core:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-secure-sm:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-x-content:7.7.0" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-smile:2.11.3" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.11.3" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:2.11.3" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-geo:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-core:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-analyzers-common:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-backward-codecs:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-grouping:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-highlighter:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-join:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-memory:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-misc:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-queries:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-queryparser:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-sandbox:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-spatial-extras:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-spatial3d:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-suggest:8.5.1" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch:elasticsearch-cli:7.7.0" level="project" />
<orderEntry type="library" name="Maven: com.carrotsearch:hppc:0.8.1" level="project" />
<orderEntry type="library" name="Maven: joda-time:joda-time:2.10.4" level="project" />
<orderEntry type="library" name="Maven: com.tdunning:t-digest:3.2" level="project" />
<orderEntry type="library" name="Maven: org.hdrhistogram:HdrHistogram:2.1.9" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch:jna:4.5.1" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.client:elasticsearch-rest-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpclient:4.5.13" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.14" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpasyncclient:4.1.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore-nio:4.4.14" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.15" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:mapper-extras-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:parent-join-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:aggs-matrix-stats-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:rank-eval-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:lang-mustache-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: com.github.spullara.mustache.java:compiler:0.9.6" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-core:2.11.3" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-stream:3.0.7.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-validation:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.hibernate.validator:hibernate-validator:6.1.6.Final" level="project" />
<orderEntry type="library" name="Maven: jakarta.validation:jakarta.validation-api:2.0.2" level="project" />
<orderEntry type="library" name="Maven: org.jboss.logging:jboss-logging:3.4.1.Final" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml:classmate:1.5.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-messaging:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-core:5.4.2" level="project" />
<orderEntry type="library" name="Maven: io.projectreactor:reactor-core:3.4.1" level="project" />
<orderEntry type="library" name="Maven: org.reactivestreams:reactive-streams:1.0.3" level="project" />
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-jmx:5.4.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.retry:spring-retry:1.3.0" level="project" />
<orderEntry type="library" name="Maven: javax.annotation:javax.annotation-api:1.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-function-context:3.0.9.RELEASE" level="project" />
<orderEntry type="library" name="Maven: net.jodah:typetools:0.6.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-function-core:3.0.9.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-stream-binder-kafka:3.0.7.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework.cloud:spring-cloud-stream-binder-kafka-core:3.0.7.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-kafka:5.4.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.kafka:kafka-clients:2.6.0" level="project" />
<orderEntry type="library" name="Maven: com.github.luben:zstd-jni:1.4.4-7" level="project" />
<orderEntry type="library" name="Maven: org.lz4:lz4-java:1.7.1" level="project" />
<orderEntry type="library" name="Maven: org.xerial.snappy:snappy-java:1.1.7.3" level="project" />
<orderEntry type="library" name="Maven: org.springframework.kafka:spring-kafka:2.6.4" level="project" />
<orderEntry type="library" name="Maven: com.google.protobuf:protobuf-java:3.11.4" level="project" />
<orderEntry type="library" name="Maven: com.squareup.okhttp3:okhttp:4.8.1" level="project" />
<orderEntry type="library" name="Maven: com.squareup.okio:okio:2.7.0" level="project" />
<orderEntry type="library" name="Maven: org.jetbrains.kotlin:kotlin-stdlib-common:1.4.21" level="project" />
<orderEntry type="library" name="Maven: org.jetbrains.kotlin:kotlin-stdlib:1.4.21" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.boot:spring-boot-starter-test:2.4.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.boot:spring-boot-test:2.4.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.boot:spring-boot-test-autoconfigure:2.4.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.jayway.jsonpath:json-path:2.4.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:json-smart:2.3" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:accessors-smart:1.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.ow2.asm:asm:5.0.4" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: jakarta.xml.bind:jakarta.xml.bind-api:2.3.3" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: jakarta.activation:jakarta.activation-api:1.2.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.assertj:assertj-core:3.18.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest:2.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter:5.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-params:5.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-engine:5.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.platform:junit-platform-engine:1.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-core:3.6.28" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy:1.10.18" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy-agent:1.10.18" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.objenesis:objenesis:3.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-junit-jupiter:3.6.28" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.skyscreamer:jsonassert:1.5.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.vaadin.external.google:android-json:0.0.20131108.vaadin1" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-core:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-jcl:5.3.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.springframework:spring-test:5.3.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.xmlunit:xmlunit-core:2.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.cloud:spring-cloud-stream-test-support:3.0.7.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-autoconfigure:2.4.1" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-databind:2.11.3" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-annotations:2.11.3" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.springframework.kafka:spring-kafka-test:2.6.4" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-context:5.3.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka-clients:test:2.6.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka-streams:2.6.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:connect-json:2.6.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:connect-api:2.6.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.rocksdb:rocksdbjni:5.18.4" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka-streams-test-utils:2.6.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka_2.13:2.6.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.fasterxml.jackson.module:jackson-module-scala_2.13:2.11.3" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.fasterxml.jackson.module:jackson-module-paranamer:2.11.3" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.thoughtworks.paranamer:paranamer:2.8" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.11.3" level="project" />
<orderEntry type="library" name="Maven: net.sf.jopt-simple:jopt-simple:5.0.4" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.yammer.metrics:metrics-core:2.2.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang.modules:scala-collection-compat_2.13:2.1.6" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang.modules:scala-java8-compat_2.13:0.9.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang:scala-library:2.13.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.scala-lang:scala-reflect:2.13.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.typesafe.scala-logging:scala-logging_2.13:3.9.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.zookeeper:zookeeper:3.5.8" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.zookeeper:zookeeper-jute:3.5.8" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.yetus:audience-annotations:0.5.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: io.netty:netty-transport-native-epoll:4.1.55.Final" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: io.netty:netty-transport-native-unix-common:4.1.55.Final" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: commons-cli:commons-cli:1.4" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apache.kafka:kafka_2.13:test:2.6.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-api:5.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.apiguardian:apiguardian-api:1.1.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.opentest4j:opentest4j:1.2.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.platform:junit-platform-commons:1.7.0" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-devtools:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot:2.4.1" level="project" />
<orderEntry type="library" scope="RUNTIME" name="Maven: mysql:mysql-connector-java:8.0.22" level="project" />
</component>
</module> </module>

View File

@@ -77,7 +77,6 @@
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.springframework.cloud</groupId> <groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-stream-test-support</artifactId> <artifactId>spring-cloud-stream-test-support</artifactId>
@@ -95,39 +94,15 @@
<optional>true</optional> <optional>true</optional>
<scope>true</scope> <scope>true</scope>
</dependency> </dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<dependency> <dependency>
<groupId>mysql</groupId> <groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId> <artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope> <scope>runtime</scope>
</dependency> </dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>3.10.0</version> <!-- 或使用最新版本 -->
</dependency>
<!-- Spring Data JPA -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- Apache POI for Excel (xlsx) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.4</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
</dependency>
<dependency>
<groupId>com.jcraft</groupId>
<artifactId>jsch</artifactId>
<version>0.1.55</version> <!-- 推荐使用 0.1.55+ -->
</dependency>
</dependencies> </dependencies>
<dependencyManagement> <dependencyManagement>

View File

@@ -3,10 +3,9 @@ package com.jsc.dsp;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration; import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.scheduling.annotation.EnableScheduling; import org.springframework.scheduling.annotation.EnableScheduling;
@SpringBootApplication @SpringBootApplication(exclude = DataSourceAutoConfiguration.class)
@EnableScheduling @EnableScheduling
public class DspApplication { public class DspApplication {

View File

@@ -16,6 +16,18 @@ import java.util.concurrent.Executors;
@Component @Component
public class Configuration { public class Configuration {
@Value("${es.ip}")
String esIp;
@Value("${es.port}")
Integer esPort;
@Value("${es.username}")
String esUsername;
@Value("${es.password}")
String esPassword;
@Bean @Bean
public JacksonJsonParser getJacksonParser() { public JacksonJsonParser getJacksonParser() {
return new JacksonJsonParser(); return new JacksonJsonParser();
@@ -36,4 +48,8 @@ public class Configuration {
return Executors.newFixedThreadPool(4); return Executors.newFixedThreadPool(4);
} }
@Bean
public RestHighLevelClient esClient() {
return EsUtils.getElasticsearchClient(esIp, esPort, esUsername, esPassword);
}
} }

View File

@@ -1,77 +0,0 @@
package com.jsc.dsp.controller;
import com.alibaba.fastjson.JSONObject;
import com.jsc.dsp.model.ReturnT;
import com.jsc.dsp.utils.ExportAndUploadUtils;
import com.jsc.dsp.utils.DatabaseConnector;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import javax.annotation.Resource;
@RestController
@RequestMapping("/export")
public class ExportController {
@Resource
DatabaseConnector databaseConnector;
@Resource
ExportAndUploadUtils exportAndUploadUtils;
@PostMapping("/exportExcel")
public ReturnT<String> exportExcel(@RequestBody JSONObject object) {
try {
String startTime = object.getString("startTime");
databaseConnector.exportToXlsx(startTime);
return new ReturnT<>(200, "", "");
} catch (Exception e) {
return new ReturnT<>(500, e.getMessage(), "");
}
}
@PostMapping("/exportTwitterExcel")
public ReturnT<String> triggerTwitterTask(@RequestBody JSONObject object) {
try {
String startTime = object.getString("startTime");
databaseConnector.twitterToXlsx(startTime);
return new ReturnT<>(200, "", "");
} catch (Exception e) {
return new ReturnT<>(500, e.getMessage(), "");
}
}
@PostMapping("/exportHotSearchExcel")
public ReturnT<String> exportHotSearchExcel(@RequestBody JSONObject object) {
try {
String startTime = object.getString("startTime");
databaseConnector.hotSearchToXlsx(startTime);
return new ReturnT<>(200, "", "");
} catch (Exception e) {
return new ReturnT<>(500, e.getMessage(), "");
}
}
@PostMapping("/triggerTask")
public ReturnT<String> triggerTask() {
try {
new Thread(() -> exportAndUploadUtils.exportNewsDataAndUpload()).start();
return new ReturnT<>(200, "", "");
} catch (Exception e) {
return new ReturnT<>(500, e.getMessage(), "");
}
}
@PostMapping("/triggerHotSearchTask")
public ReturnT<String> triggerHotSearchTask() {
try {
new Thread(() -> exportAndUploadUtils.exportHotSearchAndUpload()).start();
return new ReturnT<>(200, "", "");
} catch (Exception e) {
return new ReturnT<>(500, e.getMessage(), "");
}
}
}

View File

@@ -1,10 +0,0 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.Config;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
@Repository
public interface ConfigRepository extends JpaRepository<Config, Integer> {
Config findFirstByConfigName(String configName);
}

View File

@@ -1,12 +0,0 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.EsDataHotSearchView;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
import java.util.List;
@Repository
public interface EsDataHotSearchRepository extends JpaRepository<EsDataHotSearchView, String> {
List<EsDataHotSearchView> findAllByEsLoadtimeAfter(String loadtime);
}

View File

@@ -1,12 +0,0 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.EsDataNewsView;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
import java.util.List;
@Repository
public interface EsDataNewsRepository extends JpaRepository<EsDataNewsView, String> {
List<EsDataNewsView> findAllByEsLoadtimeAfter(String loadtime);
}

View File

@@ -1,12 +0,0 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.EsDataTwitterView;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
import java.util.List;
/**
 * JPA repository for twitter rows exported from Elasticsearch.
 */
@Repository
public interface EsDataTwitterRepository extends JpaRepository<EsDataTwitterView, String> {
// Rows whose load time is strictly after the given value (string-compared; assumes a sortable timestamp format — TODO confirm).
List<EsDataTwitterView> findAllByEsLoadtimeAfter(String loadtime);
}

View File

@ -1,9 +0,0 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.Indeximos;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
/**
 * JPA repository for {@link Indeximos} documents keyed by their es_sid string.
 * No derived queries — callers use the inherited CRUD operations (e.g. saveAll).
 */
@Repository
public interface IndeximosRepository extends JpaRepository<Indeximos, String> {
}

View File

@ -1,15 +0,0 @@
package com.jsc.dsp.model;
import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
/**
 * Simple key/value configuration row. Table name is derived from the entity
 * name by the JPA naming strategy (no explicit @Table).
 * Lombok @Data generates the getters/setters used by ConfigService.
 */
@Entity
@Data
public class Config {
@Id
Integer id;
// Lookup key used by ConfigRepository.findFirstByConfigName.
String configName;
String configValue;
}

View File

@ -1,38 +0,0 @@
package com.jsc.dsp.model;
import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
/**
 * JPA entity mapped onto the es_data_hot_search table.
 * All columns are kept as strings, mirroring the raw ES export.
 * Lombok @Data supplies getters/setters/equals/hashCode.
 */
@Entity
@Data
@Table(name = "es_data_hot_search")
public class EsDataHotSearchView {
@Id
// Primary key — presumably the ES document id; confirm against the loader.
private String esSid;
private String esUrltime;
private String esCarriertype;
private String esSitename;
private String esSimrank;
private String esUrltitle;
private String esUrlcontent;
private String esUrlname;
private String esHkey;
private String esLasttime;
// Heat score kept as a string, as exported.
private String esHeat;
// Load timestamp; used by findAllByEsLoadtimeAfter in the repository.
private String esLoadtime;
}

View File

@ -1,38 +0,0 @@
package com.jsc.dsp.model;
import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
/**
 * JPA entity mapped onto the es_data_news table.
 * Columns mirror the raw ES export; everything is a string except esDoclength.
 * Fields are package-private; Lombok @Data supplies public accessors.
 */
@Entity
@Data
@Table(name = "es_data_news")
public class EsDataNewsView {
@Id
// Primary key — presumably the ES document id; confirm against the loader.
String esSid;
String esAuthors;
String esCarriertype;
String esCatalog;
String esCollection;
// Only numeric column in this view.
Float esDoclength;
String esLang;
String esLasttime;
String esLinks;
// Load timestamp; used by findAllByEsLoadtimeAfter in the repository.
String esLoadtime;
String esSitename;
String esSrcname;
String esUrlcontent;
String esUrlcontentTranslate;
String esUrlimage;
String esUrlname;
String esUrltime;
String esUrltitle;
String esUrltitleTranslate;
String esAbstract;
String esKeywords;
String file;
String esHkey;
String esUrltopic;
}

View File

@ -1,54 +0,0 @@
package com.jsc.dsp.model;
import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
/**
 * JPA entity mapped onto the es_data_twitter table.
 * All columns are strings, mirroring the raw ES export.
 * NOTE(review): the id is esUrltime, not a document id — multiple posts sharing
 * a timestamp would collide; confirm this is intentional.
 */
@Entity
@Data
@Table(name = "es_data_twitter")
public class EsDataTwitterView {
@Id
private String esUrltime;
private String esAuthors;
private String esCarriertype;
private String esSitename;
private String esUrlcontent;
private String esUrlcontentTranslate;
private String esUrlname;
private String esUrltitle;
private String esUrltitleTranslate;
private String esVideo;
private String esExtname;
private String esIsrepost;
private String esCatalog1;
// Engagement counters kept as strings, as exported.
private String esForwardcount;
private String esLikecount;
private String esCommentcount;
private String esHkey;
private String esUrlimage;
private String esUserid;
// Load timestamp; used by findAllByEsLoadtimeAfter in the repository.
private String esLoadtime;
}

View File

@ -2,17 +2,10 @@ package com.jsc.dsp.model;
import lombok.Data; import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import java.io.Serializable; import java.io.Serializable;
@Entity
@Data @Data
@Table(name = "indeximos")
public class Indeximos implements Serializable { public class Indeximos implements Serializable {
@Id
String es_sid;
String es_abstract; String es_abstract;
String es_annex; String es_annex;
String es_attachment; String es_attachment;
@ -63,6 +56,7 @@ public class Indeximos implements Serializable {
String es_repostuid; String es_repostuid;
String es_repostuname; String es_repostuname;
String es_rultopic; String es_rultopic;
String es_sid;
String es_simhash; String es_simhash;
String es_similarity; String es_similarity;
String es_similaritycount; String es_similaritycount;

View File

@ -1,29 +0,0 @@
package com.jsc.dsp.service;
import com.jsc.dsp.dao.ConfigRepository;
import com.jsc.dsp.model.Config;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
 * Read/write access to named configuration entries stored via {@link ConfigRepository}.
 */
@Service
public class ConfigService {
    @Resource
    ConfigRepository configRepository;

    /**
     * Returns the value of the named config entry.
     *
     * @throws IllegalArgumentException if no entry with that name exists
     *         (previously this dereferenced null and threw a bare NPE)
     */
    public String getConfigValueByName(String configName) {
        return requireConfig(configName).getConfigValue();
    }

    /**
     * Looks up a config entry by name.
     *
     * @return the entry, or null when no row matches (repository contract)
     */
    public Config getConfigByName(String configName) {
        return configRepository.findFirstByConfigName(configName);
    }

    /**
     * Updates the value of an existing config entry and persists it.
     *
     * @throws IllegalArgumentException if no entry with that name exists
     */
    public void setConfigValueByName(String configName, String configValue) {
        Config config = requireConfig(configName);
        config.setConfigValue(configValue);
        configRepository.save(config);
    }

    // Shared null guard: fail with a descriptive message instead of an NPE deep in a caller.
    private Config requireConfig(String configName) {
        Config config = getConfigByName(configName);
        if (config == null) {
            throw new IllegalArgumentException("No config entry named '" + configName + "'");
        }
        return config;
    }
}

View File

@ -6,7 +6,6 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.json.JacksonJsonParser; import org.springframework.boot.json.JacksonJsonParser;
import org.springframework.cloud.stream.annotation.EnableBinding; import org.springframework.cloud.stream.annotation.EnableBinding;
import org.springframework.cloud.stream.annotation.StreamListener; import org.springframework.cloud.stream.annotation.StreamListener;
@ -22,7 +21,6 @@ import java.util.concurrent.Executors;
@Component @Component
@EnableBinding(FileDlBinding.class) @EnableBinding(FileDlBinding.class)
@ConditionalOnProperty(name = "switch.enable-file-dl-service", havingValue = "true", matchIfMissing = true)
public class FileDlService extends StreamService { public class FileDlService extends StreamService {
@Autowired @Autowired
@ -80,14 +78,31 @@ public class FileDlService extends StreamService {
} }
int dlResult = fileUtils.downloadFromUrl(fileURL, protoSavePath); int dlResult = fileUtils.downloadFromUrl(fileURL, protoSavePath);
if (dlResult == 1) { if (dlResult == 1) {
File transferPath = new File(protoSavePath); File transferPath = new File(transferBackupPath);
File[] files = transferPath.listFiles(); File[] files = transferPath.listFiles();
if (files != null && files.length > 0) { if (files != null && files.length > 0) {
for (File transferFile : files) { for (File transferFile : files) {
if (transferFile.getName().endsWith(".tar.gz")) { if (transferFile.getName().endsWith(".tar.gz")) {
fileUtils.moveFileToBackupFolder(transferFile, keepBackupFile); if (transferFile.getName().startsWith("attach")) {
try {
fileUtils.UnzipTarGzip(transferFile.getAbsolutePath(), nginxPath);
logger.info("Unzip attachments " + transferFile.getName());
} catch (Exception e) {
logger.error("Unzip error!");
}
} else {
try {
fileUtils.UnzipTarGzip(transferFile.getAbsolutePath(), fileUnzipPath);
logger.info("Unzip " + transferFile.getName());
} catch (Exception e) {
logger.error("Unzip error!");
}
}
} }
fileUtils.moveFileToBackupFolder(transferFile, keepBackupFile);
} }
// Runnable upload2Ceph = () -> fileUtils.uploadToCeph(fileUnzipPath);
// pool.execute(upload2Ceph);
} }
} else if (dlResult == 0) { } else if (dlResult == 0) {
logger.error("File " + fileName + " download failure"); logger.error("File " + fileName + " download failure");

View File

@ -12,14 +12,12 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.json.JacksonJsonParser; import org.springframework.boot.json.JacksonJsonParser;
import org.springframework.cloud.stream.annotation.EnableBinding; import org.springframework.cloud.stream.annotation.EnableBinding;
import org.springframework.cloud.stream.annotation.StreamListener; import org.springframework.cloud.stream.annotation.StreamListener;
import org.springframework.messaging.support.MessageBuilder; import org.springframework.messaging.support.MessageBuilder;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.OutputStream; import java.io.OutputStream;
@ -32,7 +30,6 @@ import java.util.Map;
@Component @Component
@EnableBinding(ProtobufBinding.class) @EnableBinding(ProtobufBinding.class)
@ConditionalOnProperty(name = "switch.enable-protobuf-service", havingValue = "true", matchIfMissing = true)
public class ProtobufService extends StreamService { public class ProtobufService extends StreamService {
@Autowired @Autowired
@ -44,6 +41,9 @@ public class ProtobufService extends StreamService {
@Value("${custom.proto_save_path}") @Value("${custom.proto_save_path}")
String protoSavePath; String protoSavePath;
@Value("${custom.transfer_backup_path}")
String transferBackupPath;
@Value("${custom.keep_backup_file}") @Value("${custom.keep_backup_file}")
String keepBackupFile; String keepBackupFile;
@ -55,7 +55,7 @@ public class ProtobufService extends StreamService {
private final Logger logger = LogManager.getLogger(ProtobufService.class.getName()); private final Logger logger = LogManager.getLogger(ProtobufService.class.getName());
@Resource @Autowired
private ProtobufBinding source; private ProtobufBinding source;
@Override @Override
@ -169,7 +169,7 @@ public class ProtobufService extends StreamService {
} }
logger.debug("protobuf done"); logger.debug("protobuf done");
// 转移备份目录的todist文件 // 转移备份目录的todist文件
File transferPath = new File(protoSavePath); File transferPath = new File(transferBackupPath);
File[] files = transferPath.listFiles(); File[] files = transferPath.listFiles();
if (files != null && files.length > 0) { if (files != null && files.length > 0) {
for (File transferFile : files) { for (File transferFile : files) {

View File

@ -1,25 +1,31 @@
package com.jsc.dsp.service; package com.jsc.dsp.service;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.protobuf.Descriptors.FieldDescriptor; import com.google.protobuf.Descriptors.FieldDescriptor;
import com.jsc.dsp.binding.StorageBinding; import com.jsc.dsp.binding.StorageBinding;
import com.jsc.dsp.model.Indeximos; import com.jsc.dsp.model.Indeximos;
import com.jsc.dsp.proto.EsOuterClass.Es; import com.jsc.dsp.proto.EsOuterClass.Es;
import com.jsc.dsp.proto.EsOuterClass.EsSets; import com.jsc.dsp.proto.EsOuterClass.EsSets;
import com.jsc.dsp.utils.DatabaseConnector; import com.jsc.dsp.utils.DBUtils;
import com.jsc.dsp.utils.EsUtils;
import com.jsc.dsp.utils.FileUtils;
import com.jsc.dsp.utils.StringUtils; import com.jsc.dsp.utils.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.bulk.BulkRequest; import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.json.JacksonJsonParser; import org.springframework.boot.json.JacksonJsonParser;
import org.springframework.cloud.stream.annotation.EnableBinding; import org.springframework.cloud.stream.annotation.EnableBinding;
import org.springframework.cloud.stream.annotation.StreamListener; import org.springframework.cloud.stream.annotation.StreamListener;
import org.springframework.messaging.support.MessageBuilder; import org.springframework.messaging.support.MessageBuilder;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
@ -27,7 +33,6 @@ import java.util.Base64.Decoder;
@Component @Component
@EnableBinding(StorageBinding.class) @EnableBinding(StorageBinding.class)
@ConditionalOnProperty(name = "switch.enable-storage-service", havingValue = "true", matchIfMissing = true)
public class StorageService extends StreamService { public class StorageService extends StreamService {
@Autowired @Autowired
@ -39,24 +44,42 @@ public class StorageService extends StreamService {
@Autowired @Autowired
JacksonJsonParser jsonParser; JacksonJsonParser jsonParser;
@Value("${es.ip}")
String esIp;
@Value("${es.port}")
Integer esPort;
@Value("${es.username}")
String esUsername;
@Value("${es.password}")
String esPassword;
@Value("${es.index}")
String esIndex;
@Value("${custom.dev-mode}") @Value("${custom.dev-mode}")
boolean devMode; boolean devMode;
@Value("${custom.local-file-storage-path}") @Value("${custom.local-file-storage-path}")
String localFileStoragePath; String localFileStoragePath;
@Value("${custom.websiteWhiteList}") @Value("${db.driver}")
String websiteWhiteListString; String dbDriver;
@Value("${custom.twitterWhiteList}") @Value("${db.url}")
String twitterWhiteListString; String dbUrl;
@Value("${db.user}")
String dbUser;
@Value("${db.password}")
String dbPassword;
@Resource
DatabaseConnector databaseConnector;
private final Logger logger = LogManager.getLogger(StorageService.class.getName()); private final Logger logger = LogManager.getLogger(StorageService.class.getName());
@Override @Override
public void sendMessage(byte[] msg) { public void sendMessage(byte[] msg) {
source.StorageOutput().send(MessageBuilder.withPayload(msg).build()); source.StorageOutput().send(MessageBuilder.withPayload(msg).build());
@ -68,10 +91,8 @@ public class StorageService extends StreamService {
@Override @Override
@StreamListener(StorageBinding.STORAGE_PIPELINE_IN) @StreamListener(StorageBinding.STORAGE_PIPELINE_IN)
public void receiveMessage(Object payload) { public void receiveMessage(Object payload) {
List<String> websiteWhiteList = Arrays.asList(websiteWhiteListString.split(";"));
List<String> twitterWhiteList = Arrays.asList(twitterWhiteListString.split(";"));
String tempString; String tempString;
ObjectMapper objectMapper = new ObjectMapper();
try { try {
tempString = new String(base64.decode(payload.toString()), StandardCharsets.UTF_8); tempString = new String(base64.decode(payload.toString()), StandardCharsets.UTF_8);
Map<String, Object> data = jsonParser.parseMap(tempString); Map<String, Object> data = jsonParser.parseMap(tempString);
@ -80,6 +101,7 @@ public class StorageService extends StreamService {
if ("public_info_data_".equals(protoName)) { if ("public_info_data_".equals(protoName)) {
EsSets.Builder esSetsBuilder = EsSets.newBuilder(); EsSets.Builder esSetsBuilder = EsSets.newBuilder();
EsSets esSets = EsSets.parseFrom(data.get("content").toString().getBytes(StandardCharsets.ISO_8859_1)); EsSets esSets = EsSets.parseFrom(data.get("content").toString().getBytes(StandardCharsets.ISO_8859_1));
List<Object> localStorageItems = new ArrayList<>();
List<Indeximos> dbStorageItems = new ArrayList<>(); List<Indeximos> dbStorageItems = new ArrayList<>();
BulkRequest bulkRequest = new BulkRequest(); BulkRequest bulkRequest = new BulkRequest();
bulkRequest.timeout("5s"); bulkRequest.timeout("5s");
@ -89,7 +111,7 @@ public class StorageService extends StreamService {
Map<FieldDescriptor, Object> fieldsMap = es.getAllFields(); Map<FieldDescriptor, Object> fieldsMap = es.getAllFields();
Indeximos indeximos = new Indeximos(); Indeximos indeximos = new Indeximos();
for (FieldDescriptor key : fieldsMap.keySet()) { for (FieldDescriptor key : fieldsMap.keySet()) {
boolean hasField = databaseConnector.hasField(Indeximos.class, key.getName()); boolean hasField = DBUtils.hasField(Indeximos.class, key.getName());
if (!hasField) { if (!hasField) {
continue; continue;
} }
@ -108,7 +130,7 @@ public class StorageService extends StreamService {
} else { } else {
Field field = indeximos.getClass().getDeclaredField(key.getName()); Field field = indeximos.getClass().getDeclaredField(key.getName());
field.setAccessible(true); field.setAccessible(true);
String fieldType = databaseConnector.getFieldType(Indeximos.class, key.getName()); String fieldType = DBUtils.getFieldType(Indeximos.class, key.getName());
if (fieldType.contains("Float")) { if (fieldType.contains("Float")) {
field.set(indeximos, Float.valueOf(value)); field.set(indeximos, Float.valueOf(value));
} else { } else {
@ -116,97 +138,68 @@ public class StorageService extends StreamService {
} }
} }
} }
// 只导出白名单站点的数据 String uuid = UUID.randomUUID().toString().replaceAll("-", "");
if (websiteWhiteList.contains(indeximos.getEs_sitename())) { String es_urlname = indeximos.getEs_urlname();
logger.info("开始处理站点【" + indeximos.getEs_sitename() + "】的数据入库流程"); if (!es_urlname.isEmpty()) {
String uuid = UUID.randomUUID().toString().replaceAll("-", ""); // 根据urlname生成固定的UUID避免重复入库相同的文章
String es_urlname = indeximos.getEs_urlname(); UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes());
if (!es_urlname.isEmpty()) { uuid = _uuid.toString().replaceAll("-", "");
// 根据urlname生成固定的UUID避免重复入库相同的文章 }
UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes()); indeximos.setEs_sid(uuid);
uuid = _uuid.toString().replaceAll("-", ""); indeximos.setEs_links(indeximos.getEs_links());
} indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis()));
indeximos.setEs_urltitle(indeximos.getEs_urltitle().trim()); builder.setEsSid(uuid);
indeximos.setEs_sid(uuid); for (Field f : indeximos.getClass().getDeclaredFields()) {
indeximos.setEs_links(indeximos.getEs_links()); f.setAccessible(true);
indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis())); //判断字段是否为空并且对象属性中的基本都会转为对象类型来判断
builder.setEsSid(uuid); if (f.get(indeximos) == null) {
for (Field f : indeximos.getClass().getDeclaredFields()) { String fieldType = DBUtils.getFieldType(Indeximos.class, f.getName());
f.setAccessible(true); if (fieldType.contains("Float")) {
//判断字段是否为空并且对象属性中的基本都会转为对象类型来判断 f.set(indeximos, 0.0f);
if (f.get(indeximos) == null) { } else {
String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName()); if (!dateFields.contains(f.getName())) {
if (fieldType.contains("Float")) { f.set(indeximos, "");
f.set(indeximos, 0.0f);
} else {
if (!dateFields.contains(f.getName())) {
f.set(indeximos, "");
}
} }
} }
} }
}
IndexRequest indexRequest = new IndexRequest(esIndex);
indexRequest.id(indeximos.getEs_sid());
indexRequest.source(objectMapper.writeValueAsString(indeximos), XContentType.JSON);
bulkRequest.add(indexRequest);
Es es_temp = builder.build();
esSetsBuilder.addEs(es_temp);
List<String> localizedOption = JSON.parseArray(indeximos.getEs_urltopic(), String.class);
if (indeximos.getEs_carriertype().equals("wechat")) {
dbStorageItems.add(indeximos); dbStorageItems.add(indeximos);
} }
if (indeximos.getEs_carriertype().equals("media") && twitterWhiteList.contains(indeximos.getEs_authors())) { if (localizedOption != null && localizedOption.size() > 0) {
logger.info("开始处理推特用户【" + indeximos.getEs_authors() + "】的数据入库流程"); //本地存储用
String uuid = UUID.randomUUID().toString().replaceAll("-", ""); if (localizedOption.contains("json")) {
String es_urlname = indeximos.getEs_urlname(); localStorageItems.add(indeximos);
if (!es_urlname.isEmpty()) {
// 根据urlname生成固定的UUID避免重复入库相同的文章
UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes());
uuid = _uuid.toString().replaceAll("-", "");
} }
indeximos.setEs_sid(uuid); //入库MySQL
indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis())); if (localizedOption.contains("mysql")) {
builder.setEsSid(uuid); dbStorageItems.add(indeximos);
for (Field f : indeximos.getClass().getDeclaredFields()) {
f.setAccessible(true);
//判断字段是否为空并且对象属性中的基本都会转为对象类型来判断
if (f.get(indeximos) == null) {
String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName());
if (fieldType.contains("Float")) {
f.set(indeximos, 0.0f);
} else {
if (!dateFields.contains(f.getName())) {
f.set(indeximos, "");
}
}
}
} }
dbStorageItems.add(indeximos);
}
if (indeximos.getEs_carriertype().equals("hot_search")) {
logger.info("开始处理热搜【" + indeximos.getEs_sitename() + "】的数据入库流程");
String uuid = UUID.randomUUID().toString().replaceAll("-", "");
String es_urlname = indeximos.getEs_urlname();
if (!es_urlname.isEmpty()) {
// 根据urlname生成固定的UUID避免重复入库相同的文章
UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes());
uuid = _uuid.toString().replaceAll("-", "");
}
indeximos.setEs_sid(uuid);
indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis()));
builder.setEsSid(uuid);
for (Field f : indeximos.getClass().getDeclaredFields()) {
f.setAccessible(true);
//判断字段是否为空并且对象属性中的基本都会转为对象类型来判断
if (f.get(indeximos) == null) {
String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName());
if (fieldType.contains("Float")) {
f.set(indeximos, 0.0f);
} else {
if (!dateFields.contains(f.getName())) {
f.set(indeximos, "");
}
}
}
}
dbStorageItems.add(indeximos);
} }
} }
EsUtils.EsSaveBulkRequest(esIp, esPort, esUsername, esPassword, bulkRequest);
if (localStorageItems.size() > 0) {
String entityItemsString = JSON.toJSONString(localStorageItems);
String entityFileFullPath = localFileStoragePath + esIndex + "_" + System.currentTimeMillis() + ".json";
if (FileUtils.saveStringToFile(entityItemsString, entityFileFullPath)) {
logger.info("Local file store to " + entityFileFullPath);
} else {
logger.error("Local file store error!");
}
}
if (dbStorageItems.size() > 0) { if (dbStorageItems.size() > 0) {
databaseConnector.insertIntoDB(dbStorageItems); if (DBUtils.insertIntoDB(dbDriver, dbUrl, dbUser, dbPassword, dbStorageItems)) {
logger.info("Store to MySQL Database");
} else {
logger.error("MySQL Database Storage error!");
}
} }
data.put("content", new String(esSetsBuilder.build().toByteArray(), StandardCharsets.ISO_8859_1)); data.put("content", new String(esSetsBuilder.build().toByteArray(), StandardCharsets.ISO_8859_1));
} }

View File

@ -1,36 +0,0 @@
package com.jsc.dsp.task;
import com.jsc.dsp.utils.ExportAndUploadUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
/**
 * Scheduled jobs that export collected data and upload it.
 * Enabled by default; disabled when switch.auto-export-and-upload=false.
 * Cron expressions come from configuration so ops can retune without a rebuild.
 */
@Component
@ConditionalOnProperty(name = "switch.auto-export-and-upload", havingValue = "true", matchIfMissing = true)
public class AutoUpload {
@Resource
ExportAndUploadUtils exportAndUploadUtils;
// NOTE(review): injected but unused in this class — presumably consumed by the utils; confirm.
@Value("${custom.ftpUploadPath}")
String ftpUploadPath;
// Each job simply delegates to the shared export/upload utility.
@Scheduled(cron = "${custom.exportNewsTaskSchedule}")
public void exportNewsDataAndUpload() {
exportAndUploadUtils.exportNewsDataAndUpload();
}
@Scheduled(cron = "${custom.exportTwitterTaskSchedule}")
public void exportTwitterDataAndUpload() {
exportAndUploadUtils.exportTwitterDataAndUpload();
}
@Scheduled(cron = "${custom.exportHotSearchTaskSchedule}")
public void exportHotSearchAndUpload() {
exportAndUploadUtils.exportHotSearchAndUpload();
}
}

View File

@ -0,0 +1,135 @@
package com.jsc.dsp.utils;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jsc.dsp.model.SearchAggregation;
import com.jsc.dsp.model.TargetSocial;
import com.jsc.dsp.model.TargetWebsite;
import java.util.Date;
import java.util.Map;
import java.util.logging.Logger;
import static com.jsc.dsp.utils.EsUtils.performAggregationSearch;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.kafka.support.LogIfLevelEnabled;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
/**
 * Scheduled patrol jobs: aggregate per-site / per-account document counts from
 * Elasticsearch and push the check results to the target-management HTTP APIs.
 *
 * Fixes over the previous revision: failures are routed through the class's own
 * logger instead of e.printStackTrace(), and an InterruptedException raised by
 * the inter-call sleep re-asserts the thread's interrupt flag.
 */
@Component
public class AutoPatroller {
    private final Logger logger = Logger.getLogger(this.getClass().getName());
    // Pause (ms) between consecutive update calls so we don't flood the management API.
    long updateInterval = 1500L;
    @Value("${custom.websiteQueryAPI}")
    String websiteQueryAPI;
    @Value("${custom.websiteUpdateAPI}")
    String websiteUpdateAPI;
    @Value("${custom.socialQueryAPI}")
    String socialQueryAPI;
    @Value("${custom.socialUpdateAPI}")
    String socialUpdateAPI;
    @Value("${es.ip}")
    String esIp;
    @Value("${es.port}")
    Integer esPort;
    @Value("${es.username}")
    String esUsername;
    @Value("${es.password}")
    String esPassword;

    /** Every 3 hours at :45 — patrol news sites. */
    @Scheduled(cron = "0 45 0/3 * * *")
    public void checkNewsSite() {
        checkWebsite("es_sitename", "es_carriertype", "news");
    }

    /** Every 3 hours at :15 — patrol wechat accounts (social type code "5"). */
    @Scheduled(cron = "0 15 1/3 * * *")
    public void checkWechat() {
        checkSocial("es_authors", "es_carriertype", "wechat", "5");
    }

    /** Every 4 hours — patrol article sites. */
    @Scheduled(cron = "0 0 2/4 * * *")
    public void checkArticleSite() {
        checkWebsite("es_sitename", "es_carriertype", "article");
    }

    /**
     * Patrols websites of one carrier type: fetches ES aggregations keyed by
     * site name, then updates each registered site's check info via HTTP.
     *
     * @param aggFieldName    ES field to aggregate on (site name)
     * @param queryFieldName  ES field to filter on (carrier type)
     * @param queryFieldValue carrier type value, also sent to the query API
     */
    public void checkWebsite(String aggFieldName, String queryFieldName, String queryFieldValue) {
        try {
            Map<String, SearchAggregation> searchAggregationMap = performAggregationSearch(
                    esIp, esPort, esUsername, esPassword, aggFieldName, queryFieldName, queryFieldValue);
            JSONObject dataObject = new JSONObject();
            dataObject.put("carrierType", queryFieldValue);
            String rsp = HttpUtils.post(websiteQueryAPI, dataObject);
            JSONObject rspObj = JSON.parseObject(rsp);
            if (rspObj.getIntValue("code") == 200) {
                JSONArray rspArr = rspObj.getJSONArray("content");
                for (Object obj : rspArr) {
                    TargetWebsite targetWebsite = JSONObject.parseObject(obj.toString(), TargetWebsite.class);
                    String siteName = targetWebsite.getSiteName();
                    if (searchAggregationMap.containsKey(siteName)) {
                        SearchAggregation checkInfo = searchAggregationMap.get(siteName);
                        targetWebsite.setCheckTotalNum(checkInfo.getCount());
                        targetWebsite.setCheckLastTime(checkInfo.getLastTime());
                        targetWebsite.setCheckUpdateTime(new Date());
                        String updateRsp = HttpUtils.post(websiteUpdateAPI, targetWebsite);
                        JSONObject updateRspObj = JSONObject.parseObject(updateRsp);
                        if (updateRspObj.getIntValue("code") != 200) {
                            logger.warning("更新站点【" + siteName + "】巡检信息失败");
                        }
                        // Throttle successive update calls.
                        Thread.sleep(updateInterval);
                    }
                }
            }
        } catch (Exception e) {
            // Was e.printStackTrace(): route failures into the normal log stream.
            if (e instanceof InterruptedException) {
                // Re-assert the interrupt so the scheduler thread can wind down promptly.
                Thread.currentThread().interrupt();
            }
            logger.warning("checkWebsite failed: " + e);
        }
        logger.info("站点巡检完毕");
    }

    /**
     * Patrols social accounts of one carrier type: fetches ES aggregations keyed
     * by account name, then updates each registered account's check info via HTTP.
     *
     * @param aggFieldName    ES field to aggregate on (account name)
     * @param queryFieldName  ES field to filter on (carrier type)
     * @param queryFieldValue carrier type value
     * @param socialTypeCode  account type code sent to the social query API
     */
    public void checkSocial(String aggFieldName, String queryFieldName, String queryFieldValue, String socialTypeCode) {
        try {
            Map<String, SearchAggregation> searchAggregationMap = performAggregationSearch(
                    esIp, esPort, esUsername, esPassword, aggFieldName, queryFieldName, queryFieldValue);
            TargetSocial postData = new TargetSocial();
            postData.setUserFlag("0");
            postData.setUserType(socialTypeCode);
            String rsp = HttpUtils.post(socialQueryAPI, postData);
            JSONObject rspObj = JSON.parseObject(rsp);
            if (rspObj.getIntValue("code") == 200) {
                JSONArray rspArr = rspObj.getJSONArray("content");
                for (Object obj : rspArr) {
                    TargetSocial targetSocial = JSONObject.parseObject(obj.toString(), TargetSocial.class);
                    String userName = targetSocial.getUserName();
                    if (searchAggregationMap.containsKey(userName)) {
                        SearchAggregation checkInfo = searchAggregationMap.get(userName);
                        targetSocial.setCheckTotalNum(checkInfo.getCount());
                        targetSocial.setCheckLastTime(checkInfo.getLastTime());
                        targetSocial.setCheckUpdateTime(new Date());
                        String updateRsp = HttpUtils.post(socialUpdateAPI, targetSocial);
                        JSONObject updateRspObj = JSONObject.parseObject(updateRsp);
                        if (updateRspObj.getIntValue("code") != 200) {
                            logger.warning("更新账号【" + userName + "】巡检信息失败");
                        }
                        // Throttle successive update calls.
                        Thread.sleep(updateInterval);
                    }
                }
            }
        } catch (Exception e) {
            // Was e.printStackTrace(): route failures into the normal log stream.
            if (e instanceof InterruptedException) {
                // Re-assert the interrupt so the scheduler thread can wind down promptly.
                Thread.currentThread().interrupt();
            }
            logger.warning("checkSocial failed: " + e);
        }
        logger.info("社交帐号巡检完毕");
    }
}

View File

@ -0,0 +1,131 @@
package com.jsc.dsp.utils;
import com.alibaba.fastjson.JSONArray;
import com.jsc.dsp.model.Indeximos;
import java.io.File;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import java.util.logging.Logger;
/**
 * Plain-JDBC helpers for persisting {@link Indeximos} rows, plus small
 * reflection utilities used when mapping protobuf fields onto the entity.
 *
 * Fixes over the previous revision: insertIntoDB used to recreate its
 * PreparedStatement on every loop iteration, leaking each earlier statement
 * un-executed so only the LAST row was ever written; it also NPE'd when a
 * float column was null, and escaped only single quotes (backslashes slipped
 * through, corrupting values). All three are corrected below.
 */
public class DBUtils {
    // Cached connection shared across calls; re-established after a failed batch.
    // NOTE(review): static and unsynchronized — confirm callers are single-threaded.
    public static Connection conn = null;
    // Columns stored as numbers; everything else is quoted as a string literal.
    private static final List<String> floatFields = Arrays.asList("es_doclength", "es_negativeProbability", "es_simrank");
    private static final Logger logger = Logger.getLogger("com.jsc.dsp.utils.DBUtils");

    /**
     * Opens a JDBC connection with the given driver class and credentials.
     *
     * @return the connection, or null when the driver is missing or the connect fails
     */
    public static Connection getConnection(String driver, String url, String user, String password) {
        try {
            Class.forName(driver);
            return DriverManager.getConnection(url, user, password);
        } catch (ClassNotFoundException | SQLException e) {
            logger.warning("Cannot get DB connection!");
            logger.warning(e.getMessage());
            return null;
        }
    }

    // Reflectively reads every declared field of the entity through its getter
    // (Lombok @Data generates them), producing a field-name -> value map.
    private static Map<String, Object> getObjectMap(Indeximos object) {
        Map<String, Object> resultMap = new HashMap<>();
        for (Field field : object.getClass().getDeclaredFields()) {
            String fieldName = field.getName();
            String getter = "get" + fieldName.substring(0, 1).toUpperCase() + fieldName.substring(1);
            try {
                Method method = object.getClass().getMethod(getter);
                resultMap.put(fieldName, method.invoke(object));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return resultMap;
    }

    // Renders one value as a SQL literal. String values are escaped for both
    // backslash and single quote; null/empty become SQL NULL.
    // NOTE(review): string-built SQL — parameterized statements would be safer
    // if any of these values can ever come from untrusted input.
    private static String toSqlLiteral(String key, Object value) {
        if (floatFields.contains(key)) {
            return value == null ? "null" : value.toString();
        }
        if (value != null && value.toString().length() > 0) {
            return "'" + value.toString().replace("\\", "\\\\").replace("'", "\\'") + "'";
        }
        return "null";
    }

    /**
     * Writes every entity in objectList into the indeximos table via
     * {@code REPLACE INTO}, batching all rows on a single Statement.
     *
     * @return true when the whole batch executed; false when no connection could
     *         be obtained or the batch failed (the connection is then reopened
     *         for the next attempt)
     */
    public static boolean insertIntoDB(String driver, String url, String user, String password, List<Indeximos> objectList) {
        if (conn == null) {
            conn = getConnection(driver, url, user, password);
        }
        if (conn == null) {
            return false;
        }
        // try-with-resources closes the statement even on failure (the old code leaked one per row).
        try (Statement stmt = conn.createStatement()) {
            for (Indeximos object : objectList) {
                Map<String, Object> objectMap = getObjectMap(object);
                List<String> keys = new ArrayList<>();
                List<String> values = new ArrayList<>();
                for (Map.Entry<String, Object> entry : objectMap.entrySet()) {
                    keys.add(entry.getKey());
                    values.add(toSqlLiteral(entry.getKey(), entry.getValue()));
                }
                stmt.addBatch("REPLACE INTO indeximos(" + String.join(", ", keys) + ") VALUES("
                        + String.join(", ", values) + ")");
            }
            stmt.executeBatch();
            return true;
        } catch (SQLException e) {
            logger.warning("Fail to insert data to Database");
            logger.warning(e.getMessage());
            // Assume the cached connection is broken; reopen it for the next call.
            conn = getConnection(driver, url, user, password);
            return false;
        }
    }

    /**
     * Returns true when {@code clazz} declares a field with the given name.
     */
    public static boolean hasField(Class<?> clazz, String fieldName) {
        try {
            clazz.getDeclaredField(fieldName);
            return true;
        } catch (NoSuchFieldException e) {
            // Absence is an expected outcome, not an error.
            return false;
        }
    }

    /**
     * Returns the fully-qualified type name of the named declared field,
     * or the empty string when the field does not exist.
     */
    public static String getFieldType(Class<?> clazz, String fieldName) {
        try {
            return clazz.getDeclaredField(fieldName).getType().getName();
        } catch (NoSuchFieldException e) {
            return "";
        }
    }

    // Ad-hoc manual loader. SECURITY NOTE(review): credentials and host are
    // hard-coded here — move them to configuration before this leaves dev use.
    public static void main(String[] args) {
        List<Indeximos> objectList = JSONArray.parseArray(FileUtils.readContentFromFile(
                "D:/data/local-storage/indeximos_1700030748332.json"), Indeximos.class);
        insertIntoDB(
                "com.mysql.cj.jdbc.Driver",
                "jdbc:mysql://8.130.95.27:28089/dsp",
                "root",
                "passok123A",
                objectList);
    }
}

View File

@ -1,590 +0,0 @@
package com.jsc.dsp.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jsc.dsp.dao.EsDataHotSearchRepository;
import com.jsc.dsp.dao.EsDataNewsRepository;
import com.jsc.dsp.dao.EsDataTwitterRepository;
import com.jsc.dsp.dao.IndeximosRepository;
import com.jsc.dsp.model.EsDataHotSearchView;
import com.jsc.dsp.model.EsDataNewsView;
import com.jsc.dsp.model.EsDataTwitterView;
import com.jsc.dsp.model.Indeximos;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import java.util.ArrayList;
@Service
public class DatabaseConnector {

    // Spring-injected JPA repositories for the four data sources exported below.
    @Resource
    IndeximosRepository indeximosRepository;
    @Resource
    EsDataNewsRepository esDataNewsRepository;
    @Resource
    EsDataTwitterRepository esDataTwitterRepository;
    @Resource
    EsDataHotSearchRepository esDataHotSearchRepository;

    // Output directories for the generated .xlsx files, from application config.
    @Value("${custom.newsExcelOutputPath}")
    String newsExcelOutputPath;
    @Value("${custom.twitterExcelOutputPath}")
    String twitterExcelOutputPath;
    @Value("${custom.hotSearchExcelOutputPath}")
    String hotSearchExcelOutputPath;

    // Shared Jackson mapper used by extractFilenamesFromJsonArray (ObjectMapper is thread-safe for reads).
    private static final ObjectMapper objectMapper = new ObjectMapper();
    private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());

    /**
     * Persists the given records via the Indeximos repository.
     * Failures are logged and swallowed (best-effort insert; never throws).
     *
     * @param objectList records to save; passed straight to {@code saveAll}
     */
    public void insertIntoDB(List<Indeximos> objectList) {
        try {
            indeximosRepository.saveAll(objectList);
        } catch (Exception e) {
            logger.warn("Fail to insert data to Database");
            logger.warn(e.getMessage());
        }
    }

    /**
     * Returns whether {@code clazz} directly declares a field named {@code fieldName}.
     */
    public boolean hasField(Class<?> clazz, String fieldName) {
        try {
            clazz.getDeclaredField(fieldName);
            return true;
        } catch (NoSuchFieldException e) {
            return false;
        }
    }

    /**
     * Returns the fully-qualified type name of a directly declared field,
     * or "" when the field does not exist.
     */
    public String getFieldType(Class<?> clazz, String fieldName) {
        try {
            Field field = clazz.getDeclaredField(fieldName);
            return field.getType().getName();
        } catch (NoSuchFieldException e) {
            return "";
        }
    }

    /**
     * Exports news records loaded after {@code startTime} to an .xlsx file named
     * data_news-&lt;yyyyMMdd&gt;-001.xlsx under {@code newsExcelOutputPath}.
     * Rows whose file path is null/too short are skipped; long text columns are
     * truncated to Excel-safe lengths. All exceptions are printed, never thrown.
     *
     * @param startTime lower bound (exclusive per repository query) on es_loadtime
     */
    public void exportToXlsx(String startTime) {
        try {
            Path dirPath = Paths.get(newsExcelOutputPath);
            if (!Files.exists(dirPath)) {
                Files.createDirectories(dirPath);
            }
            String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
            String fileName = "data_news-" + timestamp + "-001.xlsx";
            Path filePath = dirPath.resolve(fileName);
            List<EsDataNewsView> esDataNewsViewList = esDataNewsRepository.findAllByEsLoadtimeAfter(startTime);
            if (!esDataNewsViewList.isEmpty()) {
                // Obtain the view's fields via reflection; these drive the generated Excel header.
                Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields();
                try (Workbook workbook = new XSSFWorkbook();
                     ByteArrayOutputStream out = new ByteArrayOutputStream()) {
                    Sheet sheet = workbook.createSheet("data");
                    // Build the header row with a highlighted cell style.
                    Row headerRow = sheet.createRow(0);
                    CellStyle headerStyle = workbook.createCellStyle();
                    headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
                    headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
                    for (int i = 0; i < fields.length; i++) {
                        Cell cell = headerRow.createCell(i);
                        String formField = formField(fields[i]);
                        cell.setCellValue(formField);
                        cell.setCellStyle(headerStyle);
                    }
                    // Fill data rows.
                    int rowNum = 1;
                    for (EsDataNewsView item : esDataNewsViewList) {
                        // Skip rows without a usable attachment path; otherwise strip the
                        // leading path segment up to the first separator (keep relative name).
                        if (item.getFile() == null || item.getFile().length() < 5) {
                            continue;
                        } else {
                            String fileFullPath = item.getFile();
                            int i = fileFullPath.indexOf(File.separator);
                            item.setFile(fileFullPath.substring(i + 1));
                        }
                        Row row = sheet.createRow(rowNum++);
                        logger.debug("导出excel第" + rowNum + "");
                        row.createCell(0).setCellValue(item.getEsSid());
                        row.createCell(1).setCellValue(item.getEsAuthors());
                        row.createCell(2).setCellValue(item.getEsCarriertype());
                        row.createCell(3).setCellValue(item.getEsCatalog());
                        row.createCell(4).setCellValue(item.getEsCollection());
                        row.createCell(5).setCellValue(item.getEsDoclength());
                        row.createCell(6).setCellValue(item.getEsLang());
                        row.createCell(7).setCellValue(item.getEsLasttime());
                        // NOTE(review): unlike twitterToXlsx, these getters are dereferenced
                        // without null checks — a null esLinks/esUrlcontent would NPE. Confirm
                        // the news view guarantees non-null text columns.
                        if (item.getEsLinks().length() > 10000) {
                            row.createCell(8).setCellValue(item.getEsLinks().substring(0, 10000));
                        } else {
                            row.createCell(8).setCellValue(item.getEsLinks());
                        }
                        row.createCell(9).setCellValue(item.getEsLoadtime());
                        row.createCell(10).setCellValue(item.getEsSitename());
                        row.createCell(11).setCellValue(item.getEsSrcname());
                        if (item.getEsUrlcontent().length() > 30000) {
                            row.createCell(12).setCellValue(item.getEsUrlcontent().substring(0, 30000));
                        } else {
                            row.createCell(12).setCellValue(item.getEsUrlcontent());
                        }
                        if (item.getEsUrlcontentTranslate().length() > 30000) {
                            row.createCell(13).setCellValue(item.getEsUrlcontentTranslate().substring(0, 30000));
                        } else {
                            row.createCell(13).setCellValue(item.getEsUrlcontentTranslate());
                        }
                        row.createCell(14).setCellValue(item.getEsUrlimage());
                        row.createCell(15).setCellValue(item.getEsUrlname());
                        row.createCell(16).setCellValue(item.getEsUrltime());
                        row.createCell(17).setCellValue(item.getEsUrltitle());
                        row.createCell(18).setCellValue(item.getEsUrltitleTranslate());
                        row.createCell(19).setCellValue(item.getEsAbstract());
                        row.createCell(20).setCellValue(item.getEsKeywords());
                        row.createCell(21).setCellValue(item.getFile());
                        row.createCell(22).setCellValue(item.getEsHkey());
                        row.createCell(23).setCellValue(item.getEsUrltopic());
                    }
                    logger.info("完成excel数据写入" + rowNum + "");
                    // Auto-size all columns to fit content.
                    for (int i = 0; i < fields.length; i++) {
                        sheet.autoSizeColumn(i);
                    }
                    // The in-memory write to `out` is discarded; only the file write is used.
                    workbook.write(out);
                    try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
                        workbook.write(fos);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                logger.info("excel导出完成");
            } else logger.info("获取数据为空excel未导出");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Exports twitter records loaded after {@code startTime} to an .xlsx file named
     * data_twitter-&lt;yyyyMMdd&gt;-001.xlsx under {@code twitterExcelOutputPath}.
     * Every text column is null-guarded and truncated at 10000 characters; media
     * columns are reduced to bare filenames via extractFilenamesFromJsonArray.
     *
     * @param startTime lower bound on es_loadtime for the repository query
     */
    public void twitterToXlsx(String startTime) {
        try {
            Path dirPath = Paths.get(twitterExcelOutputPath);
            if (!Files.exists(dirPath)) {
                Files.createDirectories(dirPath);
            }
            String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
            String fileName = "data_twitter-" + timestamp + "-001.xlsx";
            Path filePath = dirPath.resolve(fileName);
            List<EsDataTwitterView> esDataNewsViewList = esDataTwitterRepository.findAllByEsLoadtimeAfter(startTime);
            if (!esDataNewsViewList.isEmpty()) {
                // Obtain the view's fields via reflection; these drive the generated Excel header.
                Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields();
                try (Workbook workbook = new XSSFWorkbook();
                     ByteArrayOutputStream out = new ByteArrayOutputStream()) {
                    Sheet sheet = workbook.createSheet("data");
                    // Build the header row with a highlighted cell style.
                    Row headerRow = sheet.createRow(0);
                    CellStyle headerStyle = workbook.createCellStyle();
                    headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
                    headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
                    for (int i = 0; i < fields.length; i++) {
                        Cell cell = headerRow.createCell(i);
                        String formField = formField(fields[i]);
                        cell.setCellValue(formField);
                        cell.setCellStyle(headerStyle);
                    }
                    // Fill data rows.
                    int rowNum = 1;
                    for (EsDataTwitterView item : esDataNewsViewList) {
                        Row row = sheet.createRow(rowNum++);
                        logger.debug("导出excel第" + rowNum + "");
                        // 0: esUrltime
                        row.createCell(0).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : "");
                        // 1: esAuthors
                        row.createCell(1).setCellValue(item.getEsAuthors() != null ? item.getEsAuthors() : "");
                        // 2: esCarriertype
                        row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : "");
                        // 3: esSitename
                        row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : "");
                        // 4: esUrlcontent (truncated to 10000 chars)
                        String esUrlcontent = item.getEsUrlcontent();
                        if (esUrlcontent != null && esUrlcontent.length() > 10000) {
                            row.createCell(4).setCellValue(esUrlcontent.substring(0, 10000));
                        } else {
                            row.createCell(4).setCellValue(esUrlcontent != null ? esUrlcontent : "");
                        }
                        // 5: esUrlcontentTranslate (truncated to 10000 chars)
                        String esUrlcontentTranslate = item.getEsUrlcontentTranslate();
                        if (esUrlcontentTranslate != null && esUrlcontentTranslate.length() > 10000) {
                            row.createCell(5).setCellValue(esUrlcontentTranslate.substring(0, 10000));
                        } else {
                            row.createCell(5).setCellValue(esUrlcontentTranslate != null ? esUrlcontentTranslate : "");
                        }
                        // 6: esUrlname
                        row.createCell(6).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : "");
                        // 7: esUrltitle (truncated to 10000 chars)
                        String esUrltitle = item.getEsUrltitle();
                        if (esUrltitle != null && esUrltitle.length() > 10000) {
                            row.createCell(7).setCellValue(esUrltitle.substring(0, 10000));
                        } else {
                            row.createCell(7).setCellValue(esUrltitle != null ? esUrltitle : "");
                        }
                        // 8: es_urltitle_translate (truncated to 10000 chars)
                        String esUrltitleTranslate = item.getEsUrltitleTranslate();
                        if (esUrltitleTranslate != null && esUrltitleTranslate.length() > 10000) {
                            row.createCell(8).setCellValue(esUrltitleTranslate.substring(0, 10000));
                        } else {
                            row.createCell(8).setCellValue(esUrltitleTranslate != null ? esUrltitleTranslate : "");
                        }
                        // 9: esVideo — reduce the JSON media array to comma-separated filenames
                        String videoFilenames = extractFilenamesFromJsonArray(item.getEsVideo());
                        row.createCell(9).setCellValue(videoFilenames);
                        // 10: esExtname
                        row.createCell(10).setCellValue(item.getEsExtname() != null ? item.getEsExtname() : "");
                        // 11: esIsrepost
                        row.createCell(11).setCellValue(item.getEsIsrepost() != null ? item.getEsIsrepost() : "");
                        // 12: esCatalog1
                        row.createCell(12).setCellValue(item.getEsCatalog1() != null ? item.getEsCatalog1() : "");
                        // 13: esForwardcount
                        row.createCell(13).setCellValue(item.getEsForwardcount() != null ? item.getEsForwardcount() : "");
                        // 14: esLikecount
                        row.createCell(14).setCellValue(item.getEsLikecount() != null ? item.getEsLikecount() : "");
                        // 15: esCommentcount
                        row.createCell(15).setCellValue(item.getEsCommentcount() != null ? item.getEsCommentcount() : "");
                        // 16: esHkey
                        row.createCell(16).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : "");
                        // 17: esUrlimage — reduce the JSON media array to comma-separated filenames
                        String imageFilenames = extractFilenamesFromJsonArray(item.getEsUrlimage());
                        row.createCell(17).setCellValue(imageFilenames);
                        // 18: esUserid
                        row.createCell(18).setCellValue(item.getEsUserid() != null ? item.getEsUserid() : "");
                        // 19: esLoadtime
                        row.createCell(19).setCellValue(item.getEsLoadtime() != null ? item.getEsLoadtime() : "");
                    }
                    logger.info("完成excel数据写入" + rowNum + "");
                    // Auto-size all columns to fit content.
                    for (int i = 0; i < fields.length; i++) {
                        sheet.autoSizeColumn(i);
                    }
                    // The in-memory write to `out` is discarded; only the file write is used.
                    workbook.write(out);
                    try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
                        workbook.write(fos);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                logger.info("excel导出完成");
            } else logger.info("获取数据为空excel未导出");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a Java field name to its snake_case column header
     * (e.g. esUrltitle -> es_urltitle) via the project StringUtils helper.
     */
    private String formField(Field field) {
        String fieldString = field.getName();
        return StringUtils.camelToSnake(fieldString);
    }

    /**
     * Extracts bare filenames from a JSON array string of objects with a "path"
     * attribute, e.g. "[{'path':'http://x/y/a.jpg?sz=1'}]" -> "a.jpg".
     * Single quotes are normalized to double quotes before parsing; query
     * strings are stripped. Returns "" for null/blank/unparseable input or
     * when the JSON root is not an array (parse errors are swallowed).
     *
     * @param jsonStr raw JSON-array text as stored in the view column
     * @return comma-separated filenames, possibly empty
     */
    public String extractFilenamesFromJsonArray(String jsonStr) {
        if (jsonStr == null || jsonStr.trim().isEmpty()) {
            return "";
        }
        try {
            JsonNode array = objectMapper.readTree(jsonStr.replace("'", "\"").trim());
            if (!array.isArray()) {
                return "";
            }
            List<String> filenames = new ArrayList<>();
            for (JsonNode node : array) {
                if (node.has("path")) {
                    String url = node.get("path").asText().trim();
                    if (!url.isEmpty()) {
                        // Extract the filename; URLs with query parameters are supported.
                        String filename = url.split("\\?")[0]; // drop everything after '?'
                        filename = filename.substring(filename.lastIndexOf('/') + 1);
                        if (!filename.isEmpty()) {
                            filenames.add(filename);
                        }
                    }
                }
            }
            return String.join(",", filenames);
        } catch (Exception e) {
            // On parse failure return empty (or the raw input, if ever needed for debugging).
            return ""; // alternatively: return jsonStr; for debugging
        }
    }

    /**
     * Exports hot-search records loaded after {@code startTime} to an .xlsx file
     * named data_hot_search-&lt;yyyyMMdd&gt;-001.xlsx under {@code hotSearchExcelOutputPath}.
     *
     * @param startTime lower bound on es_loadtime for the repository query
     */
    public void hotSearchToXlsx(String startTime) {
        try {
            Path dirPath = Paths.get(hotSearchExcelOutputPath);
            if (!Files.exists(dirPath)) {
                Files.createDirectories(dirPath);
            }
            String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
            String fileName = "data_hot_search-" + timestamp + "-001.xlsx";
            Path filePath = dirPath.resolve(fileName);
            List<EsDataHotSearchView> esDataHotSearchViewList = esDataHotSearchRepository.findAllByEsLoadtimeAfter(startTime);
            if (!esDataHotSearchViewList.isEmpty()) {
                // Obtain the view's fields via reflection; these drive the generated Excel header.
                Field[] fields = esDataHotSearchViewList.get(0).getClass().getDeclaredFields();
                try (Workbook workbook = new XSSFWorkbook();
                     ByteArrayOutputStream out = new ByteArrayOutputStream()) {
                    Sheet sheet = workbook.createSheet("data");
                    // Build the header row with a highlighted cell style.
                    Row headerRow = sheet.createRow(0);
                    CellStyle headerStyle = workbook.createCellStyle();
                    headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
                    headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
                    for (int i = 0; i < fields.length; i++) {
                        Cell cell = headerRow.createCell(i);
                        String formField = formField(fields[i]);
                        cell.setCellValue(formField);
                        cell.setCellStyle(headerStyle);
                    }
                    // Fill data rows.
                    int rowNum = 1;
                    for (EsDataHotSearchView item : esDataHotSearchViewList) {
                        Row row = sheet.createRow(rowNum++);
                        logger.debug("导出excel第" + rowNum + "");
                        // 0: esSid
                        row.createCell(0).setCellValue(item.getEsSid() != null ? item.getEsSid() : "");
                        // 1: esUrltime
                        row.createCell(1).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : "");
                        // 2: esCarriertype
                        row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : "");
                        // 3: esSitename
                        row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : "");
                        // 4: esSimrank — stored as a float-like string, rendered as its integer part
                        row.createCell(4).setCellValue(item.getEsSimrank() != null ? String.valueOf(Float.valueOf(item.getEsSimrank()).intValue()) : "");
                        // 5: esUrltitle (truncated to 10000 chars)
                        String esUrltitle = item.getEsUrltitle();
                        if (esUrltitle != null && esUrltitle.length() > 10000) {
                            row.createCell(5).setCellValue(esUrltitle.substring(0, 10000));
                        } else {
                            row.createCell(5).setCellValue(esUrltitle != null ? esUrltitle : "");
                        }
                        // 6: esUrlcontent (truncated to 10000 chars)
                        String esUrlcontent = item.getEsUrlcontent();
                        if (esUrlcontent != null && esUrlcontent.length() > 10000) {
                            row.createCell(6).setCellValue(esUrlcontent.substring(0, 10000));
                        } else {
                            row.createCell(6).setCellValue(esUrlcontent != null ? esUrlcontent : "");
                        }
                        // 7: esUrlname
                        row.createCell(7).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : "");
                        // 8: esHkey
                        row.createCell(8).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : "");
                        // 9: esLasttime
                        // NOTE(review): esLasttime is fed through the JSON filename extractor;
                        // for a plain timestamp string this yields "" — confirm intent.
                        String esLasttime = extractFilenamesFromJsonArray(item.getEsLasttime());
                        row.createCell(9).setCellValue(esLasttime);
                        // 10: esHeat
                        row.createCell(10).setCellValue(item.getEsHeat() != null ? item.getEsHeat() : "");
                        // 11: esLoadtime
                        // NOTE(review): same concern as column 9 — extractor applied to a timestamp column.
                        String esLoadtime = extractFilenamesFromJsonArray(item.getEsLoadtime());
                        row.createCell(11).setCellValue(esLoadtime);
                    }
                    logger.info("完成excel数据写入" + rowNum + "");
                    // Auto-size all columns to fit content.
                    for (int i = 0; i < fields.length; i++) {
                        sheet.autoSizeColumn(i);
                    }
                    // The in-memory write to `out` is discarded; only the file write is used.
                    workbook.write(out);
                    try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
                        workbook.write(fos);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                logger.info("excel导出完成");
            } else logger.info("获取数据为空excel未导出");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * News export (original comment).
     * NOTE(review): despite its name and original "news export" comment, this
     * body is an exact copy of {@link #hotSearchToXlsx(String)} — same
     * hot-search repository, same output path and file name. It looks like
     * leftover test scaffolding; confirm whether it can be removed.
     */
    public void exportToXlsxTest(String startTime) {
        try {
            Path dirPath = Paths.get(hotSearchExcelOutputPath);
            if (!Files.exists(dirPath)) {
                Files.createDirectories(dirPath);
            }
            String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
            String fileName = "data_hot_search-" + timestamp + "-001.xlsx";
            Path filePath = dirPath.resolve(fileName);
            List<EsDataHotSearchView> esDataHotSearchViewList = esDataHotSearchRepository.findAllByEsLoadtimeAfter(startTime);
            if (!esDataHotSearchViewList.isEmpty()) {
                // Obtain the view's fields via reflection; these drive the generated Excel header.
                Field[] fields = esDataHotSearchViewList.get(0).getClass().getDeclaredFields();
                try (Workbook workbook = new XSSFWorkbook();
                     ByteArrayOutputStream out = new ByteArrayOutputStream()) {
                    Sheet sheet = workbook.createSheet("data");
                    // Build the header row with a highlighted cell style.
                    Row headerRow = sheet.createRow(0);
                    CellStyle headerStyle = workbook.createCellStyle();
                    headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
                    headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
                    for (int i = 0; i < fields.length; i++) {
                        Cell cell = headerRow.createCell(i);
                        String formField = formField(fields[i]);
                        cell.setCellValue(formField);
                        cell.setCellStyle(headerStyle);
                    }
                    // Fill data rows.
                    int rowNum = 1;
                    for (EsDataHotSearchView item : esDataHotSearchViewList) {
                        Row row = sheet.createRow(rowNum++);
                        logger.debug("导出excel第" + rowNum + "");
                        // 0: esSid
                        row.createCell(0).setCellValue(item.getEsSid() != null ? item.getEsSid() : "");
                        // 1: esUrltime
                        row.createCell(1).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : "");
                        // 2: esCarriertype
                        row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : "");
                        // 3: esSitename
                        row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : "");
                        // 4: esSimrank — stored as a float-like string, rendered as its integer part
                        row.createCell(4).setCellValue(item.getEsSimrank() != null ? String.valueOf(Float.valueOf(item.getEsSimrank()).intValue()) : "");
                        // 5: esUrltitle (truncated to 10000 chars)
                        String esUrltitle = item.getEsUrltitle();
                        if (esUrltitle != null && esUrltitle.length() > 10000) {
                            row.createCell(5).setCellValue(esUrltitle.substring(0, 10000));
                        } else {
                            row.createCell(5).setCellValue(esUrltitle != null ? esUrltitle : "");
                        }
                        // 6: esUrlcontent (truncated to 10000 chars)
                        String esUrlcontent = item.getEsUrlcontent();
                        if (esUrlcontent != null && esUrlcontent.length() > 10000) {
                            row.createCell(6).setCellValue(esUrlcontent.substring(0, 10000));
                        } else {
                            row.createCell(6).setCellValue(esUrlcontent != null ? esUrlcontent : "");
                        }
                        // 7: esUrlname
                        row.createCell(7).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : "");
                        // 8: esHkey
                        row.createCell(8).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : "");
                        // 9: esLasttime
                        // NOTE(review): JSON filename extractor applied to a timestamp column — confirm intent.
                        String esLasttime = extractFilenamesFromJsonArray(item.getEsLasttime());
                        row.createCell(9).setCellValue(esLasttime);
                        // 10: esHeat
                        row.createCell(10).setCellValue(item.getEsHeat() != null ? item.getEsHeat() : "");
                        // 11: esLoadtime
                        // NOTE(review): same concern as column 9.
                        String esLoadtime = extractFilenamesFromJsonArray(item.getEsLoadtime());
                        row.createCell(11).setCellValue(esLoadtime);
                    }
                    logger.info("完成excel数据写入" + rowNum + "");
                    // Auto-size all columns to fit content.
                    for (int i = 0; i < fields.length; i++) {
                        sheet.autoSizeColumn(i);
                    }
                    // The in-memory write to `out` is discarded; only the file write is used.
                    workbook.write(out);
                    try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
                        workbook.write(fos);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                logger.info("excel导出完成");
            } else logger.info("获取数据为空excel未导出");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

View File

@ -1,425 +0,0 @@
package com.jsc.dsp.utils;
import com.jsc.dsp.service.ConfigService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@Component
public class ExportAndUploadUtils {

    @Resource
    DatabaseConnector databaseConnector;
    @Resource
    FTPConnector ftpConnector;
    @Resource
    SFTPConnector sftpConnector;
    @Resource
    ConfigService configService;

    private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());
    private static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
    // NOTE(review): SimpleDateFormat is not thread-safe and this instance is shared
    // statically; fine if these export jobs never run concurrently — confirm.
    private static final SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT);

    @Value("${custom.newsExcelOutputPath}")
    String newsExcelOutputPath;
    @Value("${custom.twitterExcelOutputPath}")
    String twitterExcelOutputPath;
    @Value("${custom.hotSearchExcelOutputPath}")
    String hotSearchExcelOutputPath;
    @Value("${custom.backupFilePath}")
    String backupFilePath;
    @Value("${custom.pagesOutputPath}")
    String pagesOutputPath;
    @Value("${custom.ftpUploadPath}")
    String ftpUploadPath;

    /**
     * Export task originally scheduled for 08:00 every Monday and Friday
     * (scheduling is configured elsewhere). Exports news Excel + PDFs since the
     * persisted "last_loadtime" watermark, advances the watermark, then zips the
     * output directory and uploads the archive.
     */
    public void exportNewsDataAndUpload() {
        logger.info("开始导出excel和pdf数据...");
        String lastLoadTime = configService.getConfigValueByName("last_loadtime");
        String currentLoadTime = StringUtils.DateToString(new Date());
        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        databaseConnector.exportToXlsx(lastLoadTime);
        copyPagesFiles(lastLoadTime, currentLoadTime);
        // Watermark is advanced before the upload; a failed upload will not be retried for this window.
        configService.setConfigValueByName("last_loadtime", currentLoadTime);
        String zipFileName = "data_news-" + timestamp + "-001.zip";
        String zipFileFullName = backupFilePath + File.separator + zipFileName;
        String remoteZipPath = ftpUploadPath + "/" + zipFileName;
        zipAndUploadDirectory(newsExcelOutputPath, zipFileFullName, remoteZipPath);
    }

    /**
     * Exports twitter Excel data since the "twitter_last_loadtime" watermark,
     * merges extracted media archives into the output directory, advances the
     * watermark, then zips and uploads the result.
     */
    public void exportTwitterDataAndUpload() {
        logger.info("开始导出twitter excel数据...");
        String twitterLastLoadTime = configService.getConfigValueByName("twitter_last_loadtime");
        String currentLoadTime = StringUtils.DateToString(new Date());
        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        databaseConnector.twitterToXlsx(twitterLastLoadTime);
        unzipAndMoveVideosImages(twitterLastLoadTime, currentLoadTime);
        configService.setConfigValueByName("twitter_last_loadtime", currentLoadTime);
        String zipFileName = "data_twitter-" + timestamp + "-001.zip";
        String zipFileFullName = backupFilePath + File.separator + zipFileName;
        String remoteZipPath = ftpUploadPath + "/" + zipFileName;
        zipAndUploadDirectory(twitterExcelOutputPath, zipFileFullName, remoteZipPath);
    }

    /**
     * Exports hot-search Excel data since the "hot_search_last_loadtime"
     * watermark, advances the watermark, then zips and uploads the result.
     */
    public void exportHotSearchAndUpload() {
        logger.info("开始导出百度热搜 excel数据...");
        String hotSearchLastLoadTime = configService.getConfigValueByName("hot_search_last_loadtime");
        String currentLoadTime = StringUtils.DateToString(new Date());
        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        databaseConnector.hotSearchToXlsx(hotSearchLastLoadTime);
        configService.setConfigValueByName("hot_search_last_loadtime", currentLoadTime);
        String zipFileName = "data_hot_search-" + timestamp + "-001.zip";
        String zipFileFullName = backupFilePath + File.separator + zipFileName;
        String remoteZipPath = ftpUploadPath + "/" + zipFileName;
        zipAndUploadDirectory(hotSearchExcelOutputPath, zipFileFullName, remoteZipPath);
    }

    /**
     * Zips the given directory to a local archive and uploads it.
     * NOTE(review): the upload goes through sftpConnector even though an
     * ftpConnector is also injected — confirm SFTP is the intended transport.
     *
     * @param sourceDirPath local source directory to archive, e.g. /data/reports
     * @param localZipPath  local ZIP destination, e.g. /backup/archives/reports_20251224.zip
     * @param remoteZipPath remote target path, e.g. /ftp/backups/reports_20251224.zip
     */
    public void zipAndUploadDirectory(String sourceDirPath, String localZipPath, String remoteZipPath) {
        Path sourceDir = Paths.get(sourceDirPath);
        if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
            logger.error("源目录不存在或不是一个目录: {}", sourceDirPath);
            return;
        }
        Path localZipFile = Paths.get(localZipPath);
        Path zipParent = localZipFile.getParent();
        if (zipParent != null && !Files.exists(zipParent)) {
            try {
                Files.createDirectories(zipParent);
                logger.debug("创建 ZIP 父目录: {}", zipParent);
            } catch (IOException e) {
                logger.error("无法创建 ZIP 父目录: {}", zipParent, e);
                return;
            }
        }
        // Pack the directory into the local ZIP (this also clears the source files — see zipDirectory).
        try {
            zipDirectory(sourceDir, localZipFile.toFile());
        } catch (IOException e) {
            logger.error("打包目录失败: {}", sourceDirPath, e);
            return;
        }
        // Upload the ZIP file.
        try (InputStream zipInputStream = Files.newInputStream(localZipFile)) {
            boolean uploaded = sftpConnector.uploadFile(zipInputStream, remoteZipPath);
            if (uploaded) {
                logger.info("ZIP 文件上传成功 - 本地: {}, FTP: {}", localZipPath, remoteZipPath);
            } else {
                logger.error("ZIP 文件上传失败 - FTP: {}", remoteZipPath);
            }
        } catch (IOException e) {
            logger.error("读取本地 ZIP 文件失败: {}", localZipPath, e);
        }
        // Note: localZipFile is deliberately NOT deleted here; the caller decides whether to keep or clean it.
    }

    /**
     * Recursively packs a directory into a ZIP file, then deletes every regular
     * file under the source directory (the directory structure is kept).
     * NOTE(review): Files.walk streams here are never closed — wrap them in
     * try-with-resources to avoid leaking directory handles.
     *
     * @param sourceDir directory to pack
     * @param zipFile   output ZIP file
     * @throws IOException on packing failure
     */
    private void zipDirectory(Path sourceDir, File zipFile) throws IOException {
        try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(zipFile))) {
            Files.walk(sourceDir)
                    .filter(path -> !Files.isDirectory(path)) // files only
                    .forEach(path -> {
                        ZipEntry zipEntry = new ZipEntry(sourceDir.relativize(path).toString());
                        try {
                            zipOut.putNextEntry(zipEntry);
                            Files.copy(path, zipOut);
                            zipOut.closeEntry();
                        } catch (IOException e) {
                            throw new RuntimeException("打包文件失败: " + path, e);
                        }
                    });
        }
        logger.info("目录打包完成: {} -> {}", sourceDir, zipFile.getAbsolutePath());
        try {
            Files.walk(sourceDir)
                    .sorted(Comparator.reverseOrder()) // children before parents, though only files are deleted
                    .filter(path -> !Files.isDirectory(path)) // delete files only
                    .forEach(path -> {
                        try {
                            Files.delete(path);
                            logger.debug("已删除文件: {}", path);
                        } catch (IOException e) {
                            logger.warn("无法删除文件: {}", path, e);
                        }
                    });
            logger.info("源目录已清空(仅删除文件,保留目录结构): {}", sourceDir);
        } catch (IOException e) {
            logger.error("清空源目录时发生错误", e);
            // Even if cleanup fails, the ZIP exists and the upload proceeds; throw here if
            // the business rule requires cleanup to succeed.
        }
    }

    /**
     * Extracts the previous day's media archives and moves their contents into
     * the twitter export directory.
     * NOTE(review): the archive root "D:/data/dbzq_backup" is hard-coded —
     * consider moving it to configuration.
     *
     * @param startTime business window start, format yyyy-MM-dd HH:mm:ss (kept for interface compatibility, not used)
     * @param endTime   business window end, format yyyy-MM-dd HH:mm:ss; the archive date is endTime minus one day
     */
    public void unzipAndMoveVideosImages(String startTime, String endTime) {
        logger.info("开始处理存档文件: startTime={}, endTime={}", startTime, endTime);
        try {
            // 1. Compute the date one day before endTime.
            LocalDate archiveDate = parseEndDate(endTime).minusDays(1);
            String dateStr = archiveDate.format(DateTimeFormatter.ISO_DATE); // yyyy-MM-dd
            // 2. Build the archive directory path: D:/data/dbzq_backup/{yyyy}/{yyyy-MM}/{yyyy-MM-dd}
            String year = String.valueOf(archiveDate.getYear());
            String yearMonth = archiveDate.format(DateTimeFormatter.ofPattern("yyyy-MM"));
            Path archiveBaseDir = Paths.get("D:/data/dbzq_backup", year, yearMonth, dateStr);
            if (!Files.exists(archiveBaseDir) || !Files.isDirectory(archiveBaseDir)) {
                logger.error("存档目录不存在: {}", archiveBaseDir);
                throw new FileNotFoundException("存档目录不存在: " + archiveBaseDir);
            }
            logger.info("使用存档目录: {}", archiveBaseDir);
            // 3. Ensure the output directory exists.
            Path outputDir = Paths.get(twitterExcelOutputPath);
            Files.createDirectories(outputDir);
            logger.info("输出目录: {}", outputDir);
            // 4. Process video archives (image_data_plane_*.tar.gz).
            processArchiveFiles(
                    archiveBaseDir,
                    "image_data_plane_",
                    "videos",
                    outputDir
            );
            // 5. Process image archives (image_data_ship_*.tar.gz).
            processArchiveFiles(
                    archiveBaseDir,
                    "image_data_ship_",
                    "images",
                    outputDir
            );
            logger.info("存档文件处理完成: {}", dateStr);
        } catch (Exception e) {
            logger.error("存档处理失败 [endTime={}]", endTime, e);
            throw new RuntimeException("存档处理异常: " + e.getMessage(), e);
        }
    }

    /**
     * Parses the end-time string into a LocalDate, accepting several common formats.
     * NOTE(review): inside the loop the pattern-specific formatter is built but
     * never used — every attempt parses the first 10 chars as ISO date, so the
     * pattern list is effectively decorative.
     */
    private LocalDate parseEndDate(String endTime) {
        // Try common timestamp formats.
        String[] patterns = {
                "yyyy-MM-dd HH:mm:ss",
                "yyyy-MM-dd'T'HH:mm:ss",
                "yyyy-MM-dd HH:mm",
                "yyyy-MM-dd"
        };
        for (String pattern : patterns) {
            try {
                DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern);
                return LocalDate.parse(endTime.substring(0, 10), DateTimeFormatter.ISO_DATE); // take the date part directly
            } catch (Exception ignored) {
                // fall through to the next format
            }
        }
        // Last resort: split on whitespace and parse the date part.
        try {
            return LocalDate.parse(endTime.trim().split("\\s+")[0]); // date part only
        } catch (DateTimeParseException e) {
            throw new IllegalArgumentException("无法解析 endTime 格式: " + endTime +
                    ",支持格式: yyyy-MM-dd[ HH:mm:ss]");
        }
    }

    /**
     * Extracts every matching tar.gz archive into a shared temp directory, then
     * flattens all extracted files into {@code outputDir/targetDirName}
     * (no subdirectories kept; same-name files overwrite).
     * NOTE(review): the Files.list stream is not closed — wrap in try-with-resources.
     *
     * @param archiveDir    archive directory to scan
     * @param filePrefix    archive filename prefix, e.g. "image_data_plane_"
     * @param targetDirName destination subdirectory name, e.g. "videos"
     * @param outputDir     output root directory
     */
    private void processArchiveFiles(Path archiveDir, String filePrefix,
                                     String targetDirName, Path outputDir) throws IOException {
        // Find all matching tar.gz archives.
        List<Path> tarFiles = Files.list(archiveDir)
                .filter(path -> Files.isRegularFile(path)
                        && path.getFileName().toString().startsWith(filePrefix)
                        && path.getFileName().toString().endsWith(".tar.gz"))
                .sorted() // sort by name to fix processing order
                .collect(Collectors.toList());
        if (tarFiles.isEmpty()) {
            logger.warn("未找到 {} 开头的压缩包: {}", filePrefix, archiveDir);
            return;
        }
        logger.info("找到 {} 个 {} 压缩包: {}", tarFiles.size(), filePrefix,
                tarFiles.stream().map(Path::getFileName).collect(Collectors.toList()));
        // Create one temp directory into which all archives are merged.
        Path tempMergeDir = Files.createTempDirectory("archive_merge_");
        logger.debug("创建临时合并目录: {}", tempMergeDir);
        try {
            // Step 1: extract every tar.gz into the temp directory.
            int totalFiles = 0;
            for (Path tarFile : tarFiles) {
                logger.info("解压压缩包: {}", tarFile.getFileName());
                totalFiles += FileUtils.extractTarGz(tarFile.toFile(), tempMergeDir.toFile());
            }
            if (totalFiles == 0) {
                logger.warn("解压后未发现任何文件,跳过移动: {}", filePrefix);
                return;
            }
            logger.info("共解压 {} 个文件到临时目录", totalFiles);
            // Step 2: flatten-move all files into the target directory (overwriting duplicates).
            Path targetPath = outputDir.resolve(targetDirName);
            Files.createDirectories(targetPath); // ensure destination exists
            int movedCount = FileUtils.flattenAndMoveFiles(tempMergeDir, targetPath);
            logger.info("成功平铺移动 {} 个文件到: {}", movedCount, targetPath);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always clean up the temp directory.
            try {
                FileUtils.deleteDirectory(tempMergeDir);
                logger.debug("已清理临时目录: {}", tempMergeDir);
            } catch (Exception e) {
                logger.warn("清理临时目录失败: {}", tempMergeDir, e);
            }
        }
    }

    /**
     * Copies every PDF under {@code pagesOutputPath} whose creation time falls
     * within [startTime, endTime] into a "pdf" subdirectory of the news export
     * directory, preserving the relative directory structure.
     * NOTE(review): file creationTime is unreliable on some Linux filesystems —
     * consider lastModifiedTime (the commented alternative below) if this runs
     * off Windows.
     *
     * @param startTime window start, format yyyy-MM-dd HH:mm:ss
     * @param endTime   window end, same format
     */
    public void copyPagesFiles(String startTime, String endTime) {
        try {
            logger.info("开始复制PDF...");
            // Parse the time window.
            Date start = sdf.parse(startTime);
            Date end = sdf.parse(endTime);
            // Source directory.
            Path sourceDir = Paths.get(pagesOutputPath);
            if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
                logger.error("源目录不存在或不是目录: " + pagesOutputPath);
                return;
            }
            // Destination: a "pdf" subdirectory under the news Excel output path.
            Path targetBaseDir = Paths.get(newsExcelOutputPath);
            Path targetPdfDir = targetBaseDir.resolve("pdf");
            // Ensure the destination exists.
            if (!Files.exists(targetPdfDir)) {
                Files.createDirectories(targetPdfDir);
            }
            // Walk all PDF files in the source tree.
            Files.walk(sourceDir)
                    .filter(path -> !Files.isDirectory(path))
                    .filter(path -> path.toString().toLowerCase().endsWith(".pdf"))
                    .forEach(path -> {
                        try {
                            // Read the file creation time (note: Linux/macOS may not support creationTime).
                            BasicFileAttributes attrs = Files.readAttributes(path, BasicFileAttributes.class);
                            FileTime creationTime = attrs.creationTime();
                            Date fileCreationDate = new Date(creationTime.toMillis());
                            // If creationTime is unreliable on this system, switch to lastModifiedTime:
                            // Date fileCreationDate = new Date(Files.getLastModifiedTime(path).toMillis());
                            // Copy only files whose timestamp is inside the window (inclusive).
                            if (!fileCreationDate.before(start) && !fileCreationDate.after(end)) {
                                // Preserve the relative path under the destination.
                                Path relativePath = sourceDir.relativize(path);
                                Path targetPath = targetPdfDir.resolve(relativePath);
                                // Ensure the destination subdirectory exists.
                                Path targetParent = targetPath.getParent();
                                if (targetParent != null && !Files.exists(targetParent)) {
                                    Files.createDirectories(targetParent);
                                }
                                // Copy, overwriting any existing file.
                                Files.copy(path, targetPath, StandardCopyOption.REPLACE_EXISTING);
                                logger.info("已复制文件: " + path + " -> " + targetPath);
                            }
                        } catch (IOException e) {
                            logger.error("处理文件时出错: " + path + " - " + e.getMessage());
                        }
                    });
            logger.info("PDF 文件复制完成,目标目录: " + targetPdfDir.toAbsolutePath());
        } catch (ParseException e) {
            logger.error("时间格式解析错误,请确保使用格式: " + DATE_FORMAT);
            e.printStackTrace();
        } catch (IOException e) {
            logger.error("IO 错误: " + e.getMessage());
            e.printStackTrace();
        }
    }
}

View File

@ -1,108 +0,0 @@
package com.jsc.dsp.utils;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPReply;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
@Component
public class FTPConnector {

    Logger log = LoggerFactory.getLogger(this.getClass().getName());

    // FTP connection settings injected from application configuration.
    @Value("${ftp.host}")
    String host;
    @Value("${ftp.port}")
    Integer port;
    @Value("${ftp.username}")
    String username;
    @Value("${ftp.password}")
    String password;
    @Value("${ftp.timeout}")
    Integer timeout;

    /**
     * Uploads the given stream to {@code remotePath} on the configured FTP
     * server, creating intermediate remote directories as needed.
     * The input stream is always closed, and the client is logged out and
     * disconnected, before returning. Never throws; failures are logged.
     *
     * @param inputStream data to upload; consumed and closed by this method
     * @param remotePath  remote file path, '/'-separated
     * @return {@code true} if the server accepted the file, {@code false} otherwise
     */
    public boolean uploadFile(InputStream inputStream, String remotePath) {
        FTPClient ftpClient = new FTPClient();
        try {
            // FIX: the connect timeout must be configured BEFORE connect();
            // previously it was set afterwards and never applied to the
            // connection attempt.
            ftpClient.setConnectTimeout(timeout);
            // Connect to the FTP server.
            ftpClient.connect(host, port);
            ftpClient.setSoTimeout(timeout);
            // FIX: verify the connect reply and the login() result directly.
            // Previously login's boolean return was ignored and the reply code
            // was only inspected after later commands, masking login failures.
            if (!FTPReply.isPositiveCompletion(ftpClient.getReplyCode())
                    || !ftpClient.login(username, password)) {
                ftpClient.disconnect();
                log.error("FTP 登录失败");
                return false;
            }
            // Binary transfer mode, so text-mode translation cannot corrupt the file.
            ftpClient.setFileType(FTP.BINARY_FILE_TYPE);
            // Passive mode for NAT/firewall environments.
            ftpClient.enterLocalPassiveMode();
            // Create intermediate directories if the path contains any.
            createDirectories(ftpClient, remotePath);
            // Upload the file.
            boolean success = ftpClient.storeFile(remotePath, inputStream);
            if (success) {
                log.info("文件上传成功: {}", remotePath);
            } else {
                log.error("FTP 上传失败,错误码: {}", ftpClient.getReplyCode());
            }
            return success;
        } catch (IOException e) {
            log.error("FTP 上传异常: {}", e.getMessage(), e);
            return false;
        } finally {
            // Always release the stream and the connection.
            try {
                if (inputStream != null) {
                    inputStream.close();
                }
                if (ftpClient.isConnected()) {
                    ftpClient.logout();
                    ftpClient.disconnect();
                }
            } catch (IOException e) {
                log.warn("关闭 FTP 连接时出错", e);
            }
        }
    }

    /**
     * Recursively creates the remote parent directories of
     * {@code remoteFilePath} (the final path segment is treated as the file
     * name and is not created).
     */
    private void createDirectories(FTPClient ftpClient, String remoteFilePath) throws IOException {
        String[] pathParts = remoteFilePath.split("/");
        StringBuilder currentPath = new StringBuilder();
        for (int i = 0; i < pathParts.length - 1; i++) {
            if (!pathParts[i].isEmpty()) {
                currentPath.append("/").append(pathParts[i]);
                // Try to cd into the directory; create it when that fails.
                if (!ftpClient.changeWorkingDirectory(currentPath.toString())) {
                    boolean made = ftpClient.makeDirectory(currentPath.toString());
                    if (made) {
                        log.debug("创建 FTP 目录: {}", currentPath);
                    }
                    ftpClient.changeWorkingDirectory(currentPath.toString());
                }
            }
        }
    }
}

View File

@ -1,25 +1,23 @@
package com.jsc.dsp.utils; package com.jsc.dsp.utils;
import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import java.io.*; import java.io.*;
import java.net.HttpURLConnection; import java.net.HttpURLConnection;
import java.net.URL; import java.net.URL;
import java.nio.file.*; import java.nio.file.Files;
import java.nio.file.attribute.BasicFileAttributes; import java.nio.file.Path;
import java.util.*; import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger; import java.nio.file.StandardCopyOption;
import java.util.stream.Collectors; import java.util.Calendar;
import java.util.stream.Stream; import java.util.Date;
import java.util.HashSet;
import org.slf4j.Logger; import java.util.logging.Logger;
import org.slf4j.LoggerFactory;
public class FileUtils { public class FileUtils {
private final Logger logger = LoggerFactory.getLogger(this.getClass().getName()); private final Logger logger = Logger.getLogger(this.getClass().getName());
public FileUtils() { public FileUtils() {
} }
@ -81,7 +79,7 @@ public class FileUtils {
public int downloadFromUrl(String urlStr, String savePath) { public int downloadFromUrl(String urlStr, String savePath) {
try { try {
if (downloadedFileSet.contains(urlStr)) { if (downloadedFileSet.contains(urlStr)) {
logger.warn("File exist from " + urlStr); logger.warning("File exist from " + urlStr);
return 2; return 2;
} }
String[] urlCascade = urlStr.split("/"); String[] urlCascade = urlStr.split("/");
@ -185,180 +183,6 @@ public class FileUtils {
} }
} }
/**
 * Extracts a .tar.gz archive into destDir and returns the number of regular
 * files written (directories are not counted).
 * <p>
 * Empty-named entries, symbolic links and device entries are skipped, as are
 * entries that would escape destDir (path-traversal, e.g. "../../etc/passwd").
 * (Fix: removed a duplicated javadoc block and a redundant
 * {@code catch (IOException e) { throw e; }}.)
 *
 * @param tarFile the .tar.gz archive to read
 * @param destDir target directory; created when missing
 * @return number of extracted regular files
 * @throws IOException when the destination cannot be created or the archive cannot be read
 */
public static int extractTarGz(File tarFile, File destDir) throws IOException {
    if (!destDir.exists() && !destDir.mkdirs()) {
        throw new IOException("无法创建目标目录: " + destDir.getAbsolutePath());
    }
    int fileCount = 0;
    try (FileInputStream fis = new FileInputStream(tarFile);
         BufferedInputStream bis = new BufferedInputStream(fis);
         GzipCompressorInputStream gzIn = new GzipCompressorInputStream(bis);
         TarArchiveInputStream tarIn = new TarArchiveInputStream(gzIn)) {
        TarArchiveEntry entry;
        while ((entry = tarIn.getNextTarEntry()) != null) {
            // Skip empty names, symlinks and special device files.
            if (entry.getName().trim().isEmpty()
                    || entry.isSymbolicLink()
                    || entry.isCharacterDevice()
                    || entry.isBlockDevice()) {
                continue;
            }
            // Path-traversal guard: the resolved entry must stay inside destDir.
            Path entryPath = destDir.toPath().resolve(entry.getName()).normalize();
            if (!entryPath.startsWith(destDir.toPath().normalize())) {
                continue;
            }
            if (entry.isDirectory()) {
                Files.createDirectories(entryPath);
            } else {
                // Ensure the parent chain exists even when the archive has no
                // explicit directory entries.
                Files.createDirectories(entryPath.getParent());
                Files.copy(tarIn, entryPath, StandardCopyOption.REPLACE_EXISTING);
                fileCount++;
            }
        }
        return fileCount;
    }
}
/**
 * Recursively deletes a directory tree (files and subdirectories included).
 * Does nothing when the path does not exist.
 */
public static void deleteDirectory(Path path) throws IOException {
    if (!Files.exists(path)) return;
    try (Stream<Path> tree = Files.walk(path)) {
        // Deepest paths first, so every directory is empty by the time it is deleted.
        List<Path> deletionOrder = tree.sorted(Comparator.reverseOrder()).collect(Collectors.toList());
        for (Path entry : deletionOrder) {
            Files.delete(entry);
        }
    }
}
/**
 * Recursively moves every file under sourceDir to the same relative location
 * under targetDir, creating parent directories as needed. Existing targets
 * are overwritten and file attributes are preserved. A missing or non-directory
 * sourceDir is silently ignored.
 */
public static void moveAllFilesRecursively(Path sourceDir, Path targetDir) throws IOException {
    if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
        return;
    }
    try (Stream<Path> tree = Files.walk(sourceDir)) {
        // Sorted order guarantees parent paths are handled before children.
        List<Path> regularFiles = tree
                .filter(candidate -> !Files.isDirectory(candidate))
                .sorted()
                .collect(Collectors.toList());
        for (Path source : regularFiles) {
            // Mirror the file's position relative to sourceDir under targetDir.
            Path destination = targetDir.resolve(sourceDir.relativize(source));
            Files.createDirectories(destination.getParent());
            Files.move(source, destination,
                    StandardCopyOption.REPLACE_EXISTING,
                    StandardCopyOption.COPY_ATTRIBUTES);
        }
    }
}
/**
 * Recursively walks the source directory and moves every regular file into
 * the target directory WITHOUT preserving the directory structure; files with
 * the same name overwrite each other.
 *
 * @param sourceDir source directory (e.g. a temporary extraction directory)
 * @param targetDir target directory, e.g. D:/output/twitter/videos; created when missing
 * @return number of files successfully moved
 */
public static int flattenAndMoveFiles(Path sourceDir, Path targetDir) throws Exception {
    if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
        return 0;
    }
    // Robustness: make sure the flat target directory exists up front.
    Files.createDirectories(targetDir);
    AtomicInteger movedCount = new AtomicInteger(0);
    Map<String, Path> duplicateFiles = new HashMap<>(); // overwritten names, for diagnostics
    try (Stream<Path> walkStream = Files.walk(sourceDir)) {
        walkStream
                .filter(Files::isRegularFile) // regular files only
                .forEach(file -> {
                    try {
                        String fileName = file.getFileName().toString();
                        Path targetFile = targetDir.resolve(fileName);
                        // Record same-name collisions before overwriting.
                        if (Files.exists(targetFile)) {
                            duplicateFiles.put(fileName, file);
                        }
                        // Bug fix: REPLACE_EXISTING was missing, so a duplicate
                        // name threw FileAlreadyExistsException (swallowed below)
                        // instead of overwriting as the contract promises.
                        Files.move(file, targetFile, StandardCopyOption.REPLACE_EXISTING);
                        movedCount.incrementAndGet();
                    } catch (Exception e) {
                        // Best-effort per-file semantics: keep moving the rest.
                        e.printStackTrace();
                    }
                });
    } catch (UncheckedIOException e) {
        throw e.getCause() instanceof IOException ? (IOException) e.getCause() : new IOException(e);
    }
    return movedCount.get();
}
/**
 * Empties a directory while keeping the directory itself. Subdirectories are
 * removed recursively via {@link #deleteDirectory(Path)}; a non-existent
 * directory is silently ignored.
 */
public static void cleanDirectory(Path dir) throws IOException {
    if (!Files.exists(dir)) return;
    try (DirectoryStream<Path> children = Files.newDirectoryStream(dir)) {
        for (Path child : children) {
            if (Files.isDirectory(child)) {
                deleteDirectory(child); // recurse into subdirectories
            } else {
                Files.delete(child);
            }
        }
    }
}
public static void main(String[] args) { public static void main(String[] args) {
saveStringToFile("{\"aaa\":\"测试测试testtest\"}", "E:/yuxin/test.json"); saveStringToFile("{\"aaa\":\"测试测试testtest\"}", "E:/yuxin/test.json");
} }

View File

@ -1,138 +0,0 @@
package com.jsc.dsp.utils;
import com.jcraft.jsch.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
@Component
public class SFTPConnector {
    private static final Logger log = LoggerFactory.getLogger(SFTPConnector.class);

    @Value("${sftp.host}")
    private String host;
    @Value("${sftp.port:22}") // default SFTP port 22
    private Integer port;
    @Value("${sftp.username}")
    private String username;
    @Value("${sftp.password}") // password auth; prefer private-key auth in production
    private String password;
    @Value("${sftp.timeout:30000}")
    private Integer timeout; // milliseconds
    @Value("${sftp.strictHostKeyChecking:false}") // false is only acceptable in test environments
    private boolean strictHostKeyChecking;

    /**
     * Uploads a file to the SFTP server using password authentication.
     *
     * @param inputStream source stream; this method takes ownership and closes it
     * @param remotePath  absolute remote path, e.g. /upload/2024/file.pdf
     * @return true on success, false on any connection/transfer error (logged)
     */
    public boolean uploadFile(InputStream inputStream, String remotePath) {
        Session session = null;
        ChannelSftp channelSftp = null;
        try {
            // 1. Initialise the JSch session.
            JSch jsch = new JSch();
            session = jsch.getSession(username, host, port);
            session.setPassword(password);
            session.setTimeout(timeout);
            // 2. SSH options. Security note: production must enable
            //    StrictHostKeyChecking and provision known_hosts.
            Properties config = new Properties();
            config.put("StrictHostKeyChecking", String.valueOf(strictHostKeyChecking));
            session.setConfig(config);
            // 3. Connect and open the SFTP channel.
            session.connect();
            channelSftp = (ChannelSftp) session.openChannel("sftp");
            channelSftp.connect(timeout);
            // 4. Make sure the target directory chain exists.
            ensureDirectoryExists(channelSftp, remotePath);
            // 5. Upload (JSch reads the stream fully but does not close it).
            channelSftp.put(inputStream, remotePath);
            log.info("SFTP 文件上传成功: {}", remotePath);
            return true;
        } catch (JSchException | SftpException e) {
            log.error("SFTP 上传失败 [host={}, path={}]: {}", host, remotePath, e.getMessage(), e);
            return false;
        } catch (Exception e) {
            log.error("SFTP 上传异常 [path={}]: {}", remotePath, e.getMessage(), e);
            return false;
        } finally {
            // 6. Cleanup order matters: stream first, then channel, then session.
            closeQuietly(inputStream);
            if (channelSftp != null && channelSftp.isConnected()) {
                try {
                    channelSftp.disconnect();
                } catch (Exception e) {
                    log.warn("关闭 SFTP 通道异常", e);
                }
            }
            if (session != null && session.isConnected()) {
                session.disconnect();
            }
        }
    }

    /**
     * Recursively creates the remote directory chain for a file path
     * (the final path segment is treated as the file name, not a directory).
     *
     * @param sftp       open SFTP channel
     * @param remotePath full remote file path including the file name
     * @throws SftpException when a directory cannot be created
     */
    private void ensureDirectoryExists(ChannelSftp sftp, String remotePath) throws SftpException {
        String dirPath = extractDirectory(remotePath);
        if ("/".equals(dirPath)) return;
        String[] dirs = dirPath.split("/");
        StringBuilder current = new StringBuilder();
        for (String dir : dirs) {
            if (dir.isEmpty()) continue;
            current.append("/").append(dir);
            try {
                sftp.cd(current.toString()); // try to enter the directory
            } catch (SftpException e) {
                sftp.mkdir(current.toString()); // missing: create, then enter
                sftp.cd(current.toString());
            }
        }
    }

    /**
     * Extracts the directory part of a full path: /a/b/file.txt -> /a/b.
     * A path with no slash (or only a leading one) yields "/".
     */
    private String extractDirectory(String path) {
        int lastSlash = path.lastIndexOf('/');
        return (lastSlash <= 0) ? "/" : path.substring(0, lastSlash);
    }

    /**
     * Closes an input stream, logging (at debug) and swallowing any IOException.
     */
    private void closeQuietly(InputStream is) {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                log.debug("关闭输入流时忽略异常", e);
            }
        }
    }
}

View File

@ -116,29 +116,6 @@ public class StringUtils {
return wordList; return wordList;
} }
/**
 * Converts a camelCase identifier to snake_case.
 * <p>
 * Acronym runs are kept together: an underscore is inserted only at the start
 * of a new word — after a lower-case character, or before the last capital of
 * an upper-case run (e.g. "HTTPServer" -> "http_server"). Null and empty
 * inputs are returned unchanged.
 */
public static String camelToSnake(String camel) {
    if (camel == null || camel.isEmpty()) {
        return camel;
    }
    StringBuilder snake = new StringBuilder();
    snake.append(Character.toLowerCase(camel.charAt(0)));
    for (int idx = 1; idx < camel.length(); idx++) {
        char current = camel.charAt(idx);
        if (!Character.isUpperCase(current)) {
            snake.append(current);
            continue;
        }
        boolean prevIsLower = !Character.isUpperCase(camel.charAt(idx - 1));
        boolean nextIsLower = idx + 1 < camel.length()
                && Character.isLowerCase(camel.charAt(idx + 1));
        // Word boundary: after a non-upper char, or before an acronym's last capital.
        if (prevIsLower || nextIsLower) {
            snake.append('_');
        }
        snake.append(Character.toLowerCase(current));
    }
    return snake.toString();
}
public static void main(String[] args) { public static void main(String[] args) {
initFilterMap("http://39.98.151.140:28081/api/open/wordBank/queryAll"); initFilterMap("http://39.98.151.140:28081/api/open/wordBank/queryAll");
} }

View File

@ -1,94 +0,0 @@
package com.jsc.dsp.utils;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.google.protobuf.Descriptors;
import com.google.protobuf.GeneratedMessageV3;
import com.google.protobuf.InvalidProtocolBufferException;
import com.jsc.dsp.proto.EsOuterClass;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Map;
/**
 * Fallback converter: turns protobuf {@code EsSets} messages into JSON using
 * FastJSON plus descriptor reflection (no protobuf-java-util dependency).
 */
public class TodistParseUtil {

    /**
     * Serialises an {@link EsOuterClass.EsSets} message to pretty-printed JSON
     * of the form {"es": [...]}, one JSON object per repeated Es entry.
     */
    public static String protobufToJson(EsOuterClass.EsSets esSets) {
        JSONObject root = new JSONObject();
        // Flatten the repeated "Es" field into a JSON array.
        JSONArray esArray = new JSONArray();
        for (EsOuterClass.Es es : esSets.getEsList()) {
            esArray.add(messageToJson(es));
        }
        root.put("es", esArray);
        return JSON.toJSONString(root, true); // pretty format
    }

    /**
     * Converts a single protobuf message to a JSONObject field by field.
     * Only fields that are actually set appear (getAllFields semantics).
     */
    private static JSONObject messageToJson(GeneratedMessageV3 message) {
        JSONObject json = new JSONObject();
        Map<Descriptors.FieldDescriptor, Object> fields = message.getAllFields();
        for (Map.Entry<Descriptors.FieldDescriptor, Object> entry : fields.entrySet()) {
            Descriptors.FieldDescriptor field = entry.getKey();
            Object value = entry.getValue();
            if (field.isRepeated()) {
                JSONArray array = new JSONArray();
                if (value instanceof Iterable) {
                    for (Object item : (Iterable<?>) value) {
                        array.add(convertFieldValue(item));
                    }
                }
                json.put(field.getName(), array);
            } else {
                json.put(field.getName(), convertFieldValue(value));
            }
        }
        return json;
    }

    /**
     * Nested messages are converted recursively; protobuf scalar values are
     * returned as-is (FastJSON serialises them directly).
     */
    private static Object convertFieldValue(Object value) {
        if (value instanceof GeneratedMessageV3) {
            return messageToJson((GeneratedMessageV3) value);
        }
        return value;
    }

    /** Manual smoke test: parse a .todist protobuf dump and print it as JSON. */
    public static void main(String[] args) {
        String filePath = "C:/Users/yuxin/Documents/xwechat_files/wxid_dtvj9sibla0d21_9cb3/msg/file/2026-02/public_info_data_1770264282958.todist";
        try {
            // 1. Read the whole dump into memory.
            byte[] data = Files.readAllBytes(Paths.get(filePath));
            // 2. Protobuf deserialisation.
            EsOuterClass.EsSets esSets = EsOuterClass.EsSets.parseFrom(data);
            System.out.println("✅ 成功解析 EsSets,共" + esSets.getEsCount() + " 条记录");
            // 3. Convert to JSON.
            String json = protobufToJson(esSets);
            // 4. Print formatted output.
            // Bug fix: the header previously used "/n" (a literal slash + n)
            // instead of the "\n" newline escape.
            System.out.println("\n📄 JSON Output:");
            System.out.println(json);
        } catch (InvalidProtocolBufferException e) {
            System.err.println("❌ Protobuf 解析失败: " + e.getMessage());
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("❌ 文件读取失败: " + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.err.println("❌ 未知错误: " + e.getMessage());
            e.printStackTrace();
        }
    }
}

View File

@ -1,19 +1,18 @@
server: server:
port: 18084 port: 8084
servlet:
context-path: /dsp
spring: spring:
cloud: cloud:
stream: stream:
kafka: kafka:
binder: binder:
brokers: 47.113.231.200:9092 brokers: 47.113.231.200:9092
zkNodes: 47.113.231.200:2181
auto-create-topics: true auto-create-topics: true
healthTimeout: 600 healthTimeout: 600
bindings: bindings:
file_dl_pipeline_in: file_dl_pipeline_in:
destination: stream-file-dl destination: stream-file-dl
group: file-dl-test group: file-dl
consumer: consumer:
pollTimeout: 60 pollTimeout: 60
file_dl_pipeline_out: file_dl_pipeline_out:
@ -21,7 +20,7 @@ spring:
content-type: text/plain content-type: text/plain
protobuf_pipeline_in: protobuf_pipeline_in:
destination: stream-protobuf destination: stream-protobuf
group: protobuf-test group: protobuf
consumer: consumer:
pollTimeout: 60 pollTimeout: 60
protobuf_pipeline_out: protobuf_pipeline_out:
@ -29,7 +28,7 @@ spring:
content-type: text/plain content-type: text/plain
storage_pipeline_in: storage_pipeline_in:
destination: stream-db destination: stream-db
group: db-test group: db
consumer: consumer:
pollTimeout: 60 pollTimeout: 60
storage_pipeline_out: storage_pipeline_out:
@ -44,64 +43,38 @@ spring:
records: 10 records: 10
interval: interval:
ms: 3600000 ms: 3600000
datasource:
url: jdbc:mysql://47.113.231.200:28089/dsp?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true
username: root
password: passok123A
driver-class-name: com.mysql.cj.jdbc.Driver
jpa:
database-platform: org.hibernate.dialect.MySQL8Dialect
show-sql: true
topics: topics:
stream-protobuf: com.jsc.dsp.service.ProtobufService stream-protobuf: com.jsc.dsp.service.ProtobufService
stream-db: com.jsc.dsp.service.StorageService stream-db: com.jsc.dsp.service.StorageService
stream-file-dl: com.jsc.dsp.service.FileDlService stream-file-dl: com.jsc.dsp.service.FileDlService
# 本地调试时这几个开关设置为 false es:
switch: ip: 8.130.95.27
enable-storage-service: false port: 28087
enable-file-dl-service: false username: elastic
enable-protobuf-service: false password: passok123A
auto-export-and-upload: false index: indeximos
type: default
ftp: ceph:
host: 144.34.185.108 aws-access-key: JH8OF0D9ZJYYXBFYB5OD
port: 21 aws-secret-key: FuptELjiPQOQNR6tPOVL777n3dGe3bZCDJphyiz0
username: jsc-2b endpoint: 192.168.1.16:28090
password: 1234qwer% db:
timeout: 5000 driver: com.mysql.cj.jdbc.Driver
passive-mode: true url: jdbc:mysql://8.130.95.27:28089/dsp
user: root
sftp: password: passok123A
host: 74.121.148.204
port: 22
username: root
password: NSgRMhIXL6gp
custom: custom:
dev-mode: false dev-mode: false
filter-words-query-api: http://47.115.228.133:28081/api/open/wordBank/queryAll filter-words-query-api: http://47.115.228.133:28081/api/open/wordBank/queryAll
filter-words-update-interval-ms: 3600000 filter-words-update-interval-ms: 3600000
local-file-storage-path: D:/data/local-storage/ local-file-storage-path: E:/data/local-storage/
proto_save_path: D:/data/spider_data/proto/ proto_save_path: D:/data/spider_data/proto/
transfer_backup_path: D:/data/transfer_backup/ transfer_backup_path: E:/data/transfer_backup/
file_unzip_path: D:/html-full/ file_unzip_path: E:/html-full/
keep_backup_file: D:/data/dbzq_backup/ keep_backup_file: E:/data/dbzq_backup/
nginx_path: D:/OSC-3.0/app/osdp_board/html/ nginx_path: E:/OSC-3.0/app/osdp_board/html/
websiteQueryAPI: http://47.115.228.133:28081/api/open/target/website/queryAllInfo websiteQueryAPI: http://47.115.228.133:28081/api/open/target/website/queryAllInfo
websiteUpdateAPI: http://47.115.228.133:28081/api/open/target/website/update websiteUpdateAPI: http://47.115.228.133:28081/api/open/target/website/update
socialQueryAPI: http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy=id&shuffleResult=false socialQueryAPI: http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy=id&shuffleResult=false
socialUpdateAPI: http://47.115.228.133:28081/api/open/target/social/update socialUpdateAPI: http://47.115.228.133:28081/api/open/target/social/update
websiteWhiteList: 能源界(国内信息);能源界(国际信息);中国能源新闻网;新华能源网;中国能源网(能源战略);中国农网(三农要闻);中国经济网(三农经济);中华粮网(粮食安全);美国之音(中国版面);美国之音(中美关系);美国之音(台海两岸版面);美国之音(港澳版面);看中国(看大陆版面);看中国(重点新闻);德国之声(中国报道);纽约时报中文网(中国版面);大纪元(一周大陆新闻);EnergyNow;联合国粮农组织;路透社(中国版面);朝中社;劳动新闻;美国农业部食品和营养服务局;布鲁金斯学会(亚太版面);俄罗斯新闻社;美国能源部;朝鲜新闻;联邦能源管理委员会;华盛顿邮报;ChinaAid;美国战略与国际研究中心;美国外交关系委员会;美国兰德;国际危机组织;美国国务院东亚与太平洋事务局;俄罗斯卫星通讯社;尤里·列瓦达分析中心;塔斯社;韩国外交部
twitterWhiteList: nytchinese;YesterdayBigcat;takaichi_sanae;yonhapcn;VOAChinese;ChineseWSJ;whyyoutouzhele;Jaemyung_Lee
newsExcelOutputPath: D:/data/output/upload
twitterExcelOutputPath: D:/data/output/twitter
hotSearchExcelOutputPath: D:/data/output/hotSearch
backupFilePath: D:/data/output/backup
pagesOutputPath: D:/data/output/pdf
ftpUploadPath: /home/jsc-2b
exportNewsTaskSchedule: "0 30 8 * * 1,2,3,4,5,6,7"
exportTwitterTaskSchedule: "0 30 6 * * 1,2,3,4,5,6,7"
exportHotSearchTaskSchedule: "0 0 20 * * 1,2,3,4,5,6,7"

View File

@ -8,7 +8,7 @@
<contextName>logback</contextName> <contextName>logback</contextName>
<!-- name的值是变量的名称value的值时变量定义的值。通过定义的值会被插入到logger上下文中。定义后可以使“${}”来使用变量。 --> <!-- name的值是变量的名称value的值时变量定义的值。通过定义的值会被插入到logger上下文中。定义后可以使“${}”来使用变量。 -->
<property name="log.path" value="D:/dsp-logs" /> <property name="log.path" value="E:/dsp-logs" />
<!--0. 日志格式和颜色渲染 --> <!--0. 日志格式和颜色渲染 -->
<!-- 彩色日志依赖的渲染类 --> <!-- 彩色日志依赖的渲染类 -->
@ -27,7 +27,7 @@
<encoder> <encoder>
<Pattern>${CONSOLE_LOG_PATTERN}</Pattern> <Pattern>${CONSOLE_LOG_PATTERN}</Pattern>
<!-- 设置字符集 --> <!-- 设置字符集 -->
<charset>UTF-8</charset> <charset>GBK</charset>
</encoder> </encoder>
</appender> </appender>

View File

@ -1,348 +0,0 @@
import logging
import os
import queue
import threading
import time
from datetime import datetime
import random
import pymysql
from tqdm import tqdm
from save_page_as_pdf import PDFSaver
from save_remote_as_mhtml import RemoteMHTMLSaver
from save_page_as_mhtml import MHTMLSaver
import tldextract
# 配置日志
from save_remote_as_pdf import RemotePDFSaver
# Logging setup: mirror every record to stdout and to pdf_downloader.log.
# NOTE(review): the FileHandler is created without an explicit encoding —
# on Windows this falls back to the locale encoding; confirm UTF-8 is intended.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('pdf_downloader.log')
    ]
)
logger = logging.getLogger(__name__)  # module-wide logger
# =============== MySQL configuration ===============
MYSQL_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
    'autocommit': False  # transactions are committed manually
}
# =========================================
# Tuning parameters
BATCH_SIZE = 500   # rows fetched per DB page
MAX_WORKERS = 1    # download worker threads
TIMEOUT = 10       # page-load timeout passed to the saver (seconds)
PDF_OUTPUT_DIR = 'D:/data/output/pdf'
# Minimum size for a PDF to count as valid.
# NOTE(review): value is 5 KB but the original comment said 80KB — confirm
# which threshold is intended.
MIN_PDF_SIZE = 5 * 1024
MHTML_OUTPUT_DIR = 'D:/data/output/mhtml'
os.makedirs(PDF_OUTPUT_DIR, exist_ok=True)
running = True                 # main-loop switch; never cleared in this file
running_interval_seconds = 10  # pause between polling rounds (seconds)
# Hosts that should not be rendered by the local Selenium instance.
skip_host_name = [
    'epochtimes.com',
    'secretchina.com',
    # 'rodong.rep.kp',
    # 'kcna.kp'
]
class PDFDownloader:
    """Batch downloader that renders article URLs from MySQL into PDF files.

    Rows come from the ``indeximos`` table (``es_video`` NULL/'-1' and
    ``es_loadtime`` newer than the stored ``last_loadtime`` config value).
    Worker threads render each URL with Selenium and write the resulting file
    path — or an error code ('0', '-1', '-2') — back into ``es_video``.
    """

    def __init__(self):
        self.db_lock = threading.Lock()  # serialises DB writes across workers
        self.db_connection = None        # lazily opened pymysql connection
        self.task_queue = queue.Queue(maxsize=MAX_WORKERS * 3)  # bounded: throttles the producer
        # NOTE(review): these counters are updated from worker threads without
        # a lock; with MAX_WORKERS > 1 the counts may drift — confirm.
        self.processed_count = 0
        self.success_count = 0
        self.fail_count = 0
        self.small_file_count = 0  # results below MIN_PDF_SIZE
        self.last_loadtime = self.get_last_loadtime()
        self.total_rows = self.get_total_rows()
        self.start_time = time.time()
        self.skip_hosts = []       # domains that produced a too-small PDF this run
        self.local_handler = None  # lazily created PDFSaver
        self.remote_handler = None # lazily created RemotePDFSaver (currently unused)

    # Opens the pymysql connection described by MYSQL_CONFIG (manual commit).
    def get_db_connection(self):
        self.db_connection = pymysql.connect(
            host=MYSQL_CONFIG['host'],
            port=MYSQL_CONFIG['port'],
            user=MYSQL_CONFIG['user'],
            password=MYSQL_CONFIG['password'],
            database=MYSQL_CONFIG['database'],
            charset='utf8mb4',
            autocommit=False
        )

    def get_total_rows(self):
        """Count rows still needing a PDF (es_video NULL/'-1', newer than last_loadtime)."""
        if self.db_connection is None:
            self.get_db_connection()
        cursor = self.db_connection.cursor()
        cursor.execute(
            "SELECT COUNT(*) FROM indeximos "
            "WHERE (es_video IS NULL OR es_video IN ('-1')) "
            "AND es_loadtime > %s", self.last_loadtime
        )
        return cursor.fetchone()[0]

    def get_last_loadtime(self):
        """Read the last export timestamp from the `config` table."""
        if self.db_connection is None:
            self.get_db_connection()
        cursor = self.db_connection.cursor()
        cursor.execute(
            "SELECT config_value FROM config "
            "WHERE config_name = 'last_loadtime' "
        )
        return cursor.fetchone()[0]

    def use_remote_selenium(self, url):
        """Return True when the URL contains a host listed in skip_host_name."""
        for host in skip_host_name:
            if host in url:
                return True
        return False

    def format_pdf_filename(self, row):
        """Build '<title>_<YYYYmmdd_HHMMSS>_<site>.pdf' under PDF_OUTPUT_DIR.

        row layout (from fetch_data_batch): (es_sid, es_urlname, es_urltitle,
        es_urltime, es_sitename, es_authors).
        """
        es_urltitle = row[2] or 'untitled'
        es_urltime = str(row[3]) or '19700101_000000'
        es_sitename = row[4] or 'anonymous'

        # Strip characters Windows rejects in filenames and cap the length.
        def clean_filename(text):
            if not text:
                return ''
            invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
            for char in invalid_chars:
                text = text.replace(char, '_')
            return text.strip()[:100]

        try:
            dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
            es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
        except:
            # Unparseable timestamp -> epoch placeholder.
            es_urltime_fix = '19700101_000000'
        filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.pdf"
        return os.path.join(PDF_OUTPUT_DIR, filename)

    def format_mhtml_filename(self, row):
        """Build the .mhtml output path for a row (same naming as the PDF variant).

        NOTE(review): this joins PDF_OUTPUT_DIR, not MHTML_OUTPUT_DIR — the
        .mhtml files land in the PDF directory. Confirm whether that is intended.
        """
        es_urltitle = row[2] or 'untitled'
        es_urltime = str(row[3]) or '19700101_000000'
        es_sitename = row[4] or 'anonymous'

        def clean_filename(text):
            if not text:
                return ''
            invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
            for char in invalid_chars:
                text = text.replace(char, '_')
            return text.strip()[:100]

        try:
            dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
            es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
        except:
            es_urltime_fix = '19700101_000000'
        filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.mhtml"
        return os.path.join(PDF_OUTPUT_DIR, filename)

    def fetch_data_batch(self, offset):
        """Fetch one page of pending rows, ordered by es_urltime.

        NOTE(review): OFFSET-based paging over a result set that shrinks as
        rows are updated can skip records between pages — confirm acceptable.
        """
        if self.db_connection is None:
            self.get_db_connection()
        cursor = self.db_connection.cursor()
        cursor.execute(
            "SELECT es_sid, es_urlname, es_urltitle, es_urltime, es_sitename, es_authors FROM indeximos "
            "WHERE (es_video IS NULL OR es_video IN ('-1')) "
            "AND es_loadtime > %s "
            "ORDER BY es_urltime LIMIT %s OFFSET %s",
            (self.last_loadtime, BATCH_SIZE, offset)
        )
        return cursor.fetchall()

    def update_file_status(self, es_sid, status, retry=3):
        """Write `status` into es_video (either the output file path or an
        error code), retrying up to `retry` times under the shared DB lock."""
        for attempt in range(retry):
            try:
                with self.db_lock:
                    if self.db_connection is None:
                        self.get_db_connection()
                    cursor = self.db_connection.cursor()
                    cursor.execute(
                        "UPDATE indeximos SET es_video = %s WHERE es_sid = %s",
                        (status, es_sid))
                    self.db_connection.commit()
                return True
            except Exception as e:
                if attempt == retry - 1:
                    logger.error(f"更新数据库失败(es_sid={es_sid}): {e}")
                    return False
                time.sleep(1)

    def extract_main_domain(self, url):
        """Return the registrable domain (e.g. 'example.com') of a URL."""
        extracted = tldextract.extract(url)
        # Combine the registered (main) domain.
        main_domain = f"{extracted.domain}.{extracted.suffix}"
        return main_domain

    def download_worker(self):
        """Worker loop: pull rows off the queue and render them to PDF until a
        None sentinel arrives; records the outcome in the DB for each row."""
        while True:
            try:
                task = self.task_queue.get(timeout=1)
                if task is None:
                    # Sentinel: shut this worker down.
                    break
                row = task
                url = row[1]
                # Domains that already produced a too-small file this run are
                # skipped (counted as small files) without touching the DB.
                if self.extract_main_domain(url) in self.skip_hosts:
                    self.small_file_count += 1
                    self.processed_count += 1
                    self.task_queue.task_done()
                    print(f"小文件规避,暂时跳过URL:{url}")
                    continue
                output_file = self.format_pdf_filename(row)  # formatted target path
                try:
                    os.makedirs(os.path.dirname(output_file), exist_ok=True)
                    # Dispatch the download.
                    if self.use_remote_selenium(url):
                        # Remote rendering is currently disabled: matching rows
                        # are simply skipped (the remote saver code is kept
                        # below for reference).
                        self.processed_count += 1
                        self.task_queue.task_done()
                        continue
                        # if self.remote_handler is None:
                        #     self.remote_handler = RemotePDFSaver()
                        # success = self.remote_handler.save_as_pdf(
                        #     url=url,
                        #     output_path=output_file,
                        #     timeout=TIMEOUT
                        # )
                    else:
                        if self.local_handler is None:
                            self.local_handler = PDFSaver(headless=False)
                        success = self.local_handler.save_as_pdf(
                            url=url,
                            output_path=output_file,
                            timeout=TIMEOUT,
                            wait_time=5
                        )
                    # Validate the result on disk.
                    if success and os.path.exists(output_file):
                        file_size = os.path.getsize(output_file)
                        if file_size >= MIN_PDF_SIZE:  # acceptable size
                            # Store the file path itself as the status value.
                            self.update_file_status(row[0], output_file)
                            self.success_count += 1
                        else:  # too small: treat as a failed render
                            self.update_file_status(row[0], '-2')
                            self.small_file_count += 1
                            logger.warning(f"文件过小({file_size}字节): {output_file}")
                            try:
                                os.remove(output_file)
                                # Avoid this domain for the rest of the run.
                                self.skip_hosts.append(self.extract_main_domain(url))
                            except:
                                pass
                    else:  # download failed
                        self.update_file_status(row[0], '0')
                        self.fail_count += 1
                        if os.path.exists(output_file):
                            try:
                                os.remove(output_file)
                            except:
                                pass
                except Exception as e:
                    logger.error(f"下载出现异常(es_sid={row[0]}, url={url}): {str(e)}")
                    self.update_file_status(row[0], '-1')
                    self.fail_count += 1
                self.processed_count += 1
                self.task_queue.task_done()
            except queue.Empty:
                # Queue idle: poll again (workers exit only via the sentinel).
                continue

    def run(self):
        """Start the worker threads, page rows out of the DB into the queue,
        and display progress; blocks until every queued row is processed."""
        threads = []
        # Spawn the worker threads.
        for _ in range(MAX_WORKERS):
            t = threading.Thread(target=self.download_worker)
            t.start()
            threads.append(t)
        # Progress bar over the estimated total.
        with tqdm(total=self.total_rows, desc="处理进度", unit="条") as pbar:
            offset = 0
            while True:
                batch = self.fetch_data_batch(offset)
                if not batch:
                    break
                # Shuffle within the page so one slow host doesn't cluster.
                batch_list = list(batch)
                random.shuffle(batch_list)
                batch = tuple(batch_list)
                for row in batch:
                    self.task_queue.put(row)
                pbar.update(len(batch))
                pbar.set_postfix({
                    '成功': self.success_count,
                    '失败': self.fail_count,
                    '小文件': self.small_file_count,
                    '速度': f"{self.processed_count / (time.time() - self.start_time):.1f}条/秒"
                })
                offset += BATCH_SIZE
        self.task_queue.join()
        # One None sentinel per worker so each exits its loop.
        for _ in range(MAX_WORKERS):
            self.task_queue.put(None)
        for t in threads:
            t.join()
        total_time = time.time() - self.start_time
        print(f"\n处理完成! 总计: {self.total_rows}条")
        print(f"成功: {self.success_count}条, 失败: {self.fail_count}条, 小文件: {self.small_file_count}条")
        print(f"总耗时: {total_time:.2f}秒, 平均速度: {self.total_rows / total_time:.2f}条/秒")

    def terminate(self):
        """Shut down any Selenium drivers and close the DB connection."""
        if self.local_handler:
            self.local_handler.quit()
        if self.remote_handler:
            self.remote_handler.quit()
        self.db_connection.close()
if __name__ == "__main__":
    # Poll for new work forever (`running` is never cleared in this file).
    while running:
        try:
            downloader = PDFDownloader()
            # Bug fix: the count was previously taken from a throwaway
            # PDFDownloader() instance, which opened a second DB connection
            # every iteration and never closed it. Reuse the real instance.
            print(f"开始处理,总记录数: {downloader.get_total_rows()}")
            downloader.run()
            print(f"运行完成,暂停{running_interval_seconds}秒后开始下一次运行...")
            downloader.terminate()
            time.sleep(running_interval_seconds)
        except Exception as e:
            # Best-effort loop: report and try again next round.
            print(repr(e))

View File

@ -1,141 +0,0 @@
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# Logging setup: mirror every record to stdout and to mhtml_saver.log (UTF-8).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('mhtml_saver.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)  # module-wide logger
class MHTMLSaver:
    """Renders a URL in Chrome via Selenium and captures it as an MHTML
    snapshot using the DevTools ``Page.captureSnapshot`` command."""

    def __init__(self, headless=True):
        # NOTE(review): the `headless` parameter is accepted but ignored —
        # '--headless=new' is always added below. Confirm whether intended.
        logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
        service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
        # Chrome options
        chrome_options = Options()
        chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        chrome_options.add_argument('--save-page-as-mhtml')  # enable MHTML support
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        # Window size can also be given at launch (supported by some versions).
        chrome_options.add_argument('--window-size=1920,1080')
        # Hide WebDriver fingerprints.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        # Hide "navigator.webdriver".
        # NOTE(review): the two experimental options below duplicate the ones
        # set a few lines above — harmless, but confirm and deduplicate.
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """
        Save a web page as an MHTML file.

        :param url: target page URL
        :param output_path: output path (.mhtml); derived from the domain when None
        :param timeout: page-load timeout in seconds
        :param wait_time: extra wait after load, for dynamic content to render
        :return: absolute path of the saved file
        :raises Exception: re-raised after logging when the capture fails
        """
        if output_path is None:
            # Default name: first label of the host, e.g. "example.mhtml".
            parsed = urlparse(url)
            domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.mhtml"
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        try:
            # Apply the load timeout.
            self.driver.set_page_load_timeout(timeout)
            # Inject an anti-detection script into every new document.
            # NOTE(review): the injected JS references `originalQuery`, which
            # is never defined in that scope — confirm the snippet works.
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': '''
                delete navigator.__proto__.webdriver;
                window.navigator.permissions.query = (parameters) => {
                    return parameters.name === 'notifications' ?
                        Promise.resolve({ state: Notification.permission }) :
                        originalQuery(parameters);
                };
                '''
            })
            # Window size is (re)applied after driver.get().
            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)
            # Give dynamic content time to render (tunable).
            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)
            # Key step: capture the MHTML via the CDP command.
            logger.info("正在生成 MHTML 快照...")
            result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
            # result['data'] is the MHTML document as a plain string.
            mhtml_content = result['data']
            # Write in text mode, UTF-8, no newline translation.
            with open(output_path, 'w', encoding='utf-8', newline='') as f:
                f.write(mhtml_content)
            # Sanity-check the written file.
            file_size = os.path.getsize(output_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")
            logger.info(f"✅ MHTML 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
            return os.path.abspath(output_path)
        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        """Close the browser and release the WebDriver."""
        if self.driver:
            self.driver.quit()
            logger.info("浏览器已关闭")
# ===== 测试入口 =====
if __name__ == "__main__":
# 示例 URL可替换为你自己的
test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
saver = MHTMLSaver(headless=True)
try:
output_file = saver.save_as_mhtml(
url=test_url,
output_path="example.mhtml",
timeout=30,
wait_time=5
)
print(f"\n🎉 成功保存 MHTML 文件: {output_file}")
except Exception as e:
print(f"\n💥 保存失败: {e}")
finally:
saver.quit()

View File

@ -1,145 +0,0 @@
import base64
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(),
logging.FileHandler('pdf_saver.log', encoding='utf-8')
]
)
logger = logging.getLogger(__name__)
class PDFSaver:
    """Render web pages to PDF with local headless Chrome via CDP Page.printToPDF."""

    def __init__(self, headless=True):
        """Start a Chrome driver configured to look like a regular browser.

        :param headless: run Chrome in the new headless mode when True
        """
        logger.info("正在初始化 Chrome WebDriver自动匹配版本...")
        # NOTE(review): driver path is hard-coded — consider making it configurable.
        service = ChromeService(executable_path="D:/chromedriver.exe")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f'--user-agent={user_agent}')
        # Hide the "controlled by automated software" banner / automation switches.
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_experimental_option('prefs', {
            'intl.accept_languages': 'zh-CN,zh,en'
        })
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.page_load_strategy = 'eager'
        # Note: PDF printing does not need --save-page-as-mhtml.
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
        """
        Render *url* and write it to *output_path* as a PDF.

        :param url: target page URL
        :param output_path: output path (.pdf); derived from the domain when None
        :param timeout: page-load timeout in seconds
        :param wait_time: extra settle time after load, for dynamic content
        :param print_options: optional overrides for CDP Page.printToPDF
        :return: absolute path of the saved file
        :raises RuntimeError: when an empty file was produced
        """
        if output_path is None:
            parsed = urlparse(url)
            domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.pdf"
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        # A4 defaults; caller-supplied options take precedence.
        default_print_options = {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,    # A4 width (inches)
            'paperHeight': 11.69,  # A4 height (inches)
        }
        if print_options:
            default_print_options.update(print_options)
        try:
            self.driver.set_page_load_timeout(timeout)
            # Hide automation fingerprints. Bug fix: the original captured
            # `originalQuery` AFTER overriding permissions.query, so every
            # non-notification query recursed into the override forever.
            # Capture (and bind) the native function first, then override.
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': '''
                    const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
                    delete navigator.__proto__.webdriver;
                    window.navigator.permissions.query = (parameters) => {
                        return parameters.name === 'notifications' ?
                            Promise.resolve({ state: Notification.permission }) :
                            originalQuery(parameters);
                    };
                '''
            })
            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            self.driver.set_window_size(1920, 1080)
            logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
            time.sleep(wait_time)
            logger.info("正在生成 PDF...")
            result = self.driver.execute_cdp_cmd('Page.printToPDF', default_print_options)
            # result['data'] is the Base64-encoded PDF payload.
            pdf_data = base64.b64decode(result['data'])
            with open(output_path, 'wb') as f:
                f.write(pdf_data)
            file_size = os.path.getsize(output_path)
            if file_size == 0:
                raise RuntimeError("生成了空文件")
            logger.info(f"✅ PDF 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
            return os.path.abspath(output_path)
        except Exception as e:
            logger.error(f"❌ 保存失败: {e}")
            raise

    def quit(self):
        """Close the browser if it was started."""
        if self.driver:
            self.driver.quit()
            logger.info("浏览器已关闭")
# ===== Manual test entry point =====
if __name__ == "__main__":
    test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
    saver = PDFSaver(headless=True)
    try:
        output_file = saver.save_as_pdf(
            url=test_url,
            output_path="example.pdf",
            timeout=30,
            wait_time=5
        )
        print(f"\n🎉 成功保存 PDF 文件: {output_file}")
    except Exception as e:
        print(f"\n💥 保存失败: {e}")
    finally:
        # Always release the browser, even when saving failed.
        saver.quit()

View File

@ -1,190 +0,0 @@
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
TimeoutException,
SessionNotCreatedException,
InvalidSessionIdException
)
from selenium.webdriver.chrome.options import Options
# Module-level logger; INFO keeps retry/progress messages visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RemoteMHTMLSaver:
    """Save web pages as MHTML via a remote Selenium hub, with automatic
    session rebuild and bounded retries on WebDriver failures."""

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2
    ):
        """
        Initialise the remote MHTML saver (supports automatic session rebuild).

        :param remote_url: remote Selenium hub address
        :param headless: run Chrome headless
        :param max_retries: maximum retries for one save operation
        :param retry_delay: seconds to wait before each retry
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.driver = None
        self._init_driver()

    def _build_chrome_options(self):
        """Build the (reusable) Chrome options for the remote session."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    def _init_driver(self):
        """Create (or re-create) the remote WebDriver session, trying up to 3 times.

        :raises RuntimeError: when the remote Selenium service is unreachable
        """
        if self.driver:
            try:
                self.driver.quit()
            except Exception:
                pass  # ignore failures while closing the stale session
        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        for attempt in range(3):
            try:
                self.driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject anti-bot-detection tweaks before any page script runs.
                self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
                logger.info("✅ 远程 WebDriver 会话创建成功")
                return
            except Exception as e:
                logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
                if attempt < 2:
                    time.sleep(2)
                else:
                    raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")

    def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
        """
        Save *url* as MHTML, with retries and session rebuild on WebDriver errors.

        The output path defaults to "<domain>.mhtml". On final failure any
        partial output file is removed and RuntimeError is raised.

        :return: absolute path of the written file
        """
        if output_path is None:
            domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.mhtml"
        if not output_path.lower().endswith('.mhtml'):
            output_path += '.mhtml'
        last_exception = None
        for retry in range(self.max_retries + 1):
            try:
                # Rebuild the driver if a previous round dropped it.
                if not self.driver:
                    self._init_driver()
                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)
                logger.info("生成 MHTML 快照...")
                result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
                mhtml_content = result['data']
                # CDP returns the MHTML as text; write it out as UTF-8.
                with open(output_path, 'w', encoding='utf-8', newline='') as f:
                    f.write(mhtml_content)
                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    raise RuntimeError("生成了空文件")
                logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
                return os.path.abspath(output_path)
            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    logger.info("正在重建 WebDriver 会话...")
                    self._init_driver()
                    time.sleep(self.retry_delay)
                else:
                    logger.error("达到最大重试次数,放弃")
                    break
            except TimeoutException as e:
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    break
            except Exception as e:
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break  # non-WebDriver error: do not retry
        # All retries failed: remove any partial output before raising.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(f"保存失败({type(last_exception).__name__}: {last_exception}")

    def quit(self):
        """Explicitly close the browser session (safe to call repeatedly)."""
        if self.driver:
            try:
                self.driver.quit()
                logger.info("WebDriver 会话已关闭")
            except Exception:
                pass
            self.driver = None

    def __del__(self):
        # Best-effort cleanup if the caller forgot to call quit().
        self.quit()
# ===== Manual test =====
if __name__ == "__main__":
    saver = RemoteMHTMLSaver(
        remote_url="http://144.34.185.108:28098/wd/hub",  # ← replace with your server's public IP
        headless=True
    )
    try:
        saver.save_as_mhtml(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.mhtml"
        )
    except Exception as e:
        print(f"❌ 失败: {e}")
    saver.quit()

View File

@ -1,201 +0,0 @@
import base64
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
TimeoutException,
SessionNotCreatedException,
InvalidSessionIdException
)
from selenium.webdriver.chrome.options import Options
# Module-level logger; INFO keeps retry/progress messages visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RemotePDFSaver:
    """Save web pages as PDF via a remote Selenium hub, with automatic
    session rebuild and bounded retries on WebDriver failures."""

    def __init__(
        self,
        remote_url="http://144.34.185.108:28098/wd/hub",
        headless=True,
        max_retries=3,
        retry_delay=2,
        print_options=None
    ):
        """
        Initialise the remote PDF saver (supports automatic session rebuild).

        :param remote_url: remote Selenium hub address
        :param headless: run Chrome headless
        :param max_retries: maximum retries for one save operation
        :param retry_delay: seconds to wait before each retry
        :param print_options: CDP Page.printToPDF options (A4 portrait defaults)
        """
        self.remote_url = remote_url
        self.headless = headless
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.print_options = print_options or {
            'landscape': False,
            'displayHeaderFooter': False,
            'printBackground': True,
            'preferCSSPageSize': True,
            'paperWidth': 8.27,    # A4 width (inches)
            'paperHeight': 11.69,  # A4 height (inches)
        }
        self.driver = None
        self._init_driver()

    def _build_chrome_options(self):
        """Build the (reusable) Chrome options for the remote session."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
        )
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    def _init_driver(self):
        """Create (or re-create) the remote WebDriver session, trying up to 3 times.

        :raises RuntimeError: when the remote Selenium service is unreachable
        """
        if self.driver:
            try:
                self.driver.quit()
            except Exception:
                pass  # ignore failures while closing the stale session
        logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
        for attempt in range(3):
            try:
                self.driver = webdriver.Remote(
                    command_executor=self.remote_url,
                    options=self._build_chrome_options()
                )
                # Inject anti-bot-detection tweaks before any page script runs.
                self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                    'source': '''
                        delete navigator.__proto__.webdriver;
                        window.chrome = { runtime: {} };
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['zh-CN', 'zh', 'en']
                        });
                    '''
                })
                logger.info("✅ 远程 WebDriver 会话创建成功")
                return
            except Exception as e:
                logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
                if attempt < 2:
                    time.sleep(2)
                else:
                    raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")

    def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5):
        """
        Save *url* as PDF, with retries and session rebuild on WebDriver errors.

        The output path defaults to "<domain>.pdf". On final failure any
        partial output file is removed and RuntimeError is raised.

        :return: absolute path of the written file
        """
        if output_path is None:
            domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
            output_path = f"{domain}.pdf"
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'
        last_exception = None
        for retry in range(self.max_retries + 1):
            try:
                # Rebuild the driver if a previous round dropped it.
                if not self.driver:
                    self._init_driver()
                self.driver.set_page_load_timeout(timeout)
                logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
                self.driver.get(url)
                time.sleep(wait_time)
                logger.info("生成 PDF...")
                result = self.driver.execute_cdp_cmd('Page.printToPDF', self.print_options)
                # result['data'] is Base64; decode and write binary.
                pdf_data = base64.b64decode(result['data'])
                with open(output_path, 'wb') as f:
                    f.write(pdf_data)
                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    raise RuntimeError("生成了空文件")
                logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
                return os.path.abspath(output_path)
            except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
                last_exception = e
                logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    logger.info("正在重建 WebDriver 会话...")
                    self._init_driver()
                    time.sleep(self.retry_delay)
                else:
                    logger.error("达到最大重试次数,放弃")
                    break
            except TimeoutException as e:
                last_exception = e
                logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
                if retry < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    break
            except Exception as e:
                last_exception = e
                logger.error(f"未知错误 (retry {retry + 1}): {e}")
                break  # non-WebDriver error: do not retry
        # All retries failed: remove any partial output before raising.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(f"保存失败({type(last_exception).__name__}: {last_exception}")

    def quit(self):
        """Explicitly close the browser session (safe to call repeatedly)."""
        if self.driver:
            try:
                self.driver.quit()
                logger.info("WebDriver 会话已关闭")
            except Exception:
                pass
            self.driver = None

    def __del__(self):
        # Best-effort cleanup if the caller forgot to call quit().
        self.quit()
# ===== Manual test =====
if __name__ == "__main__":
    saver = RemotePDFSaver(
        remote_url="http://144.34.185.108:28098/wd/hub",  # ← replace with your server's public IP
        headless=True
    )
    try:
        saver.save_as_pdf(
            url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
            output_path="remote_example2.pdf"
        )
    except Exception as e:
        print(f"❌ 失败: {e}")
    saver.quit()

View File

@ -1,119 +0,0 @@
import pymysql
from typing import Dict, List, Tuple, Optional
# ================== Configuration ==================
# NOTE(review): credentials are hard-coded and committed — move to env vars/secrets.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
}
# Only selects which es_srcname rows get processed (no replacement is done any more).
TARGET_SRCNAMES: List[str] = [
    "http://www.kcna.kp/cn/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf",
    # Add the site names you need to process here.
]
# ================== 工具函数 ==================
def get_suffix_32(url: str) -> Optional[str]:
    """Return the last 32 characters of *url*, or None if it is empty/too short."""
    if url and len(url) >= 32:
        return url[-32:]
    return None
def find_foreign_by_suffix(cursor, suffix: str, exclude_id: int) -> Optional[Tuple[str, str]]:
    """Find one foreign-language row whose es_urlname ends with *suffix*.

    Excludes the row identified by *exclude_id* so a record never matches
    itself.

    :param cursor: open pymysql cursor
    :param suffix: 32-character URL suffix used as the match key
    :param exclude_id: es_sid of the Chinese record being matched
    :return: (es_urltitle, es_urlcontent) or None when nothing matches
    """
    query = """
        SELECT es_urltitle, es_urlcontent
        FROM indeximos
        WHERE
            es_sid != %s
            AND es_urlname IS NOT NULL
            AND CHAR_LENGTH(es_urlname) >= 32
            AND RIGHT(es_urlname, 32) = %s
        LIMIT 1
    """
    cursor.execute(query, (exclude_id, suffix))
    # fetchone() already returns None when no row matches; the original
    # "result if result else None" wrapper was redundant.
    return cursor.fetchone()
def update_chinese_record(cursor, record_id: int, title: str, content: str):
    """Write *title* and *content* into es_title / es_content for one row."""
    sql = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(sql, (title, content, record_id))
# ================== 主逻辑 ==================
def main():
    """Copy title/content from the matching foreign row onto each Chinese row.

    Rows are paired by the last 32 characters of their es_urlname. All
    updates happen in one transaction: committed on success, rolled back
    on any error.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定任何目标 es_srcname程序退出。")
        return
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    try:
        # Load every Chinese record for the configured source sites.
        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
        query = f"""
        SELECT es_sid, es_srcname, es_urlname
        FROM indeximos
        WHERE es_srcname IN ({placeholders})
        AND es_urlname IS NOT NULL
        AND es_urlname != ''
        AND es_loadtime > '2026-01-16 10:40:00'
        """
        cursor.execute(query, TARGET_SRCNAMES)
        records = cursor.fetchall()
        total = len(records)
        print(f"共加载 {total} 条来自 {TARGET_SRCNAMES} 的记录用于匹配...")
        updated_count = 0
        skipped_short = 0
        for idx, (record_id, es_srcname, es_urlname) in enumerate(records, 1):
            suffix = get_suffix_32(es_urlname)
            if suffix is None:
                # URL shorter than 32 chars: no match key can be built.
                skipped_short += 1
                continue
            foreign_data = find_foreign_by_suffix(cursor, suffix, record_id)
            if foreign_data:
                title, content = foreign_data
                update_chinese_record(cursor, record_id, title, content)
                updated_count += 1
                print(f"[{idx}/{total}] ✅ 已更新 ID={record_id} | src={es_srcname}")
        conn.commit()
        print("\n" + "=" * 50)
        print(f"✅ 匹配完成!")
        print(f" - 成功更新: {updated_count}")
        print(f" - 因 URL 长度 <32 跳过: {skipped_short}")
        print(f" - 总处理: {total}")
    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误,已回滚: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,160 +0,0 @@
import pymysql
import jieba
from collections import Counter
from typing import List, Tuple, Set
# ================== Configuration ==================
# NOTE(review): credentials are hard-coded and committed — move to env vars/secrets.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
}
# Chinese source sites (es_srcname) whose rows should be processed.
TARGET_SRCNAMES: List[str] = [
    "http://www.rodong.rep.kp/cn/index.php?MUBAMUAxQA==",
    # Add your sites here.
]
# Foreign-language counterpart site. NOTE(review): not referenced in this
# chunk — confirm whether it is still needed.
FOREIGN_SRCNAME = 'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA=='
# Similarity threshold (keyword overlap ratio); 0.3–0.6 recommended.
SIMILARITY_THRESHOLD = 0.3
# ================== 文本相似度函数 ==================
def extract_keywords(text: str) -> Set[str]:
    """Tokenise *text* with jieba and keep purely alphabetic words of length >= 2."""
    if not text:
        return set()
    return {token for token in jieba.lcut(text) if len(token) >= 2 and token.isalpha()}
def keyword_overlap_similarity(title1: str, title2: str) -> float:
    """Keyword-overlap ratio of two titles: |kw1 ∩ kw2| / max(|kw1|, |kw2|)."""
    kw1 = extract_keywords(title1)
    kw2 = extract_keywords(title2)
    if not kw1 and not kw2:
        # Neither title yielded keywords: only identical raw strings count.
        return 1.0 if title1 == title2 else 0.0
    if not kw1 or not kw2:
        return 0.0
    return len(kw1 & kw2) / max(len(kw1), len(kw2))
# ================== 数据库操作 ==================
def get_chinese_records(cursor) -> List[Tuple]:
    """Fetch Chinese rows from the configured sites that are eligible for matching."""
    if not TARGET_SRCNAMES:
        return []
    placeholders = ','.join('%s' for _ in TARGET_SRCNAMES)
    sql = f"""
    SELECT es_sid, es_srcname, es_urlname, es_urltitle, es_urltime
    FROM indeximos
    WHERE es_srcname IN ({placeholders})
    AND es_urltitle IS NOT NULL AND TRIM(es_urltitle) != ''
    AND es_urltime IS NOT NULL
    AND es_loadtime > '2026-01-16 10:40:00'
    """
    cursor.execute(sql, TARGET_SRCNAMES)
    return cursor.fetchall()
def get_foreign_candidates_by_time(cursor, pub_time) -> List[Tuple]:
    """Fetch all foreign rows published at *pub_time* that carry a translated
    title and non-null content."""
    sql = """
    SELECT es_sid, es_title, es_urltitle, es_urlcontent
    FROM indeximos
    WHERE es_urltime = %s
    AND es_title IS NOT NULL AND TRIM(es_title) != ''
    AND es_urlcontent IS NOT NULL
    AND es_loadtime > '2026-01-16 10:40:00'
    """
    cursor.execute(sql, (pub_time,))
    return cursor.fetchall()
def update_chinese_record(cursor, record_id: int, new_title: str, content: str):
    """Write *new_title* and *content* into es_title / es_content for one row."""
    sql = """
    UPDATE indeximos
    SET es_title = %s, es_content = %s
    WHERE es_sid = %s
    """
    cursor.execute(sql, (new_title, content, record_id))
# ================== 主逻辑 ==================
def main():
    """Pair Chinese rows with same-timestamp foreign rows via title keyword overlap.

    For each Chinese record the best-scoring candidate published at the same
    es_urltime is chosen; if its overlap score reaches SIMILARITY_THRESHOLD,
    the foreign title/content is copied onto the Chinese row. One transaction:
    commit on success, rollback on error.
    """
    if not TARGET_SRCNAMES:
        print("⚠️ 未指定目标站点,退出。")
        return
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    try:
        chinese_records = get_chinese_records(cursor)
        total = len(chinese_records)
        print(f"共加载 {total} 条中文记录用于匹配...")
        matched_count = 0
        for idx, (cid, srcname, urlname, zh_title, pub_time) in enumerate(chinese_records, 1):
            print(f"\n[{idx}/{total}] ID={cid}, 时间={pub_time}, 标题='{zh_title[:30]}...'")
            # Candidates must share the exact publication timestamp.
            candidates = get_foreign_candidates_by_time(cursor, pub_time)
            if not candidates:
                print(" → 无同时间且有翻译标题的外文记录")
                continue
            best_score = 0.0
            best_candidate = None
            for fid, trans_title, ori_title, content in candidates:
                # Skip self-matches (should not happen, but be safe).
                if fid == cid:
                    continue
                score = keyword_overlap_similarity(zh_title, trans_title)
                print(f" 候选ID={fid} | 翻译标题='{trans_title[:30]}...' | 重合度={score:.3f}")
                if score > best_score:
                    best_score = score
                    best_candidate = (ori_title, content)
            if best_candidate and best_score >= SIMILARITY_THRESHOLD:
                final_title, final_content = best_candidate
                update_chinese_record(cursor, cid, final_title, final_content)
                matched_count += 1
                print(f" ✅ 匹配成功! 重合度={best_score:.3f}")
            else:
                print(f" ❌ 未达阈值(最高相似度={best_score:.3f}")
        conn.commit()
        print("\n" + "=" * 50)
        print(f"✅ 匹配完成!成功关联 {matched_count} / {total} 条记录。")
    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误,已回滚: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,172 +0,0 @@
import time
from typing import List, Tuple, Optional
import pymysql
import requests
# ================== Configuration ==================
# Database connection.
# NOTE(review): credentials are hard-coded and committed — move to env vars/secrets.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
    'autocommit': False  # transactions are committed manually in main()
}
# Translation API endpoint (replace with your server IP or domain).
TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"
# Only rows loaded after this timestamp (YYYY-MM-DD HH:MM:SS) are processed.
LOADTIME_AFTER = "2026-02-10 11:59:00"
# Target site list.
TARGET_SRCNAMES = [
    'https://www.38north.org/'  # add your sites here
]
# Delay between API requests (seconds) to avoid rate limiting.
REQUEST_DELAY = 1
# Maximum text length per request (must match the API's limit).
MAX_TEXT_LENGTH = 5000
def normalize_newlines(text: str) -> str:
    """Unify \\r\\n and \\r line endings to \\n; pass falsy input through unchanged."""
    if not text:
        return text
    normalized = text
    for ending in ('\r\n', '\r'):
        normalized = normalized.replace(ending, '\n')
    return normalized
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """POST one chunk of text to the translate API.

    Returns the translated string, "" for blank input, or None on any failure.
    """
    if not text or not text.strip():
        return ""
    payload = {
        "text": text[:MAX_TEXT_LENGTH],
        "source_lang": source_lang,
        "target_lang": target_lang,
    }
    try:
        response = requests.post(TRANSLATE_API_URL, json=payload, timeout=10)
        response.raise_for_status()
        return response.json().get("translated_text")
    except Exception as e:
        print(f"⚠️ 翻译失败: {e}")
        return None
def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* paragraph by paragraph (split on \\n), tolerating failures.

    Blank lines are preserved; a paragraph whose translation fails is replaced
    with an empty line so the overall layout survives. Returns the re-joined text.
    """
    if not content:
        return ""
    # Standardise line endings before splitting into paragraphs.
    paragraphs = normalize_newlines(content).split('\n')
    translated_paragraphs = []
    for para in paragraphs:
        if not para.strip():
            # Keep blank lines as-is.
            translated_paragraphs.append("")
            continue
        trans = translate_single(para)
        if trans is None:
            # Paragraph failed: skip it (alternatively keep the original text).
            print(f" ⚠️ 段落翻译失败,跳过: {para[:30]}...")
            translated_paragraphs.append("")
        else:
            translated_paragraphs.append(trans)
        time.sleep(REQUEST_DELAY)
    return '\n'.join(translated_paragraphs)
# ================== 数据库操作 ==================
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Persist a translated title/content pair for one indeximos row.

    Note: the translated title is stored in es_abstract (not es_title),
    matching how the rest of this script uses the schema.
    """
    # Bug fix: the placeholders were written "% s" (with a space). That only
    # works by accident via the %-format space flag; pymysql expects "%s".
    update_query = """
        UPDATE indeximos
        SET es_abstract = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
# ================== 主逻辑 ==================
def main():
    """Translate untranslated rows (title + paragraph-wise content) and save them.

    Selects rows with empty es_content loaded after LOADTIME_AFTER, translates
    the title (skipping the whole row if that fails) and then the body, and
    commits everything in one transaction at the end.
    """
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    try:
        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
        # NOTE(review): the srcname filter below is commented out with SQL "--",
        # but the f-string still interpolates %s placeholders into that line and
        # pymysql substitutes into the whole string — params still supplies
        # TARGET_SRCNAMES, so the counts happen to match and this works by
        # accident. Confirm before editing either the query or params.
        query = f"""
        SELECT es_sid, es_urltitle, es_urlcontent
        FROM indeximos
        WHERE es_loadtime > %s
        AND (es_content IS NULL OR TRIM(es_content) = '')
        -- AND es_srcname IN ({placeholders})
        AND LENGTH(es_video) > 5
        """
        params = [LOADTIME_AFTER] + TARGET_SRCNAMES
        cursor.execute(query, params)
        records: List[Tuple] = cursor.fetchall()
        total = len(records)
        print(f"✅ 共找到 {total} 条待翻译记录")
        if total == 0:
            return
        success_count = 0
        for idx, (es_sid, urltitle, urlcontent) in enumerate(records, 1):
            print(f"\n[{idx}/{total}] 处理 es_sid={es_sid}")
            start_time = time.time()
            # Translate the title first; skip the entire row on failure.
            title_trans = translate_single(urltitle) if urltitle else ""
            if title_trans is None:
                print(" → 标题翻译失败,跳过整条")
                continue
            # Translate the body paragraph by paragraph (fault-tolerant).
            content_trans = translate_content_with_paragraphs(urlcontent)
            # Write the pair back to the database.
            update_record(cursor, es_sid, title_trans, content_trans)
            success_count += 1
            elapsed = time.time() - start_time
            print(f" ✅ 翻译成功 | 耗时: {elapsed:.2f}s | 标题: {title_trans[:30]}...")
        conn.commit()
        print(f"\n🎉 完成!成功翻译 {success_count} / {total} 条记录")
    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,4 +1,4 @@
l# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import sys import sys
import platform import platform

View File

@ -170,31 +170,3 @@ class TelegramMember(scrapy.Item):
role = scrapy.Field() # 成员角色c-创建者、a-管理员、u-普通成员 role = scrapy.Field() # 成员角色c-创建者、a-管理员、u-普通成员
mobile = scrapy.Field() # 成员手机号 mobile = scrapy.Field() # 成员手机号
profile_photo = scrapy.Field() # 头像 blob profile_photo = scrapy.Field() # 头像 blob
class TwitterUserInfoItem(scrapy.Item):
    """Scrapy Item mapping 1:1 onto the twitter_user_info database table."""
    crawl_time = scrapy.Field()             # DATETIME - when the data was crawled
    is_newest = scrapy.Field()              # TINYINT(1) - whether this row is the latest snapshot
    platform_type = scrapy.Field()          # VARCHAR(20) - platform type
    user_id = scrapy.Field()                # BIGINT UNSIGNED - unique Twitter user ID
    username = scrapy.Field()               # VARCHAR(50) - handle (the part after @)
    nickname = scrapy.Field()               # VARCHAR(100) - display name
    user_url = scrapy.Field()               # VARCHAR(255) - profile page URL
    user_link = scrapy.Field()              # VARCHAR(255) - user link
    avatar_url = scrapy.Field()             # VARCHAR(500) - original avatar URL
    avatar_path = scrapy.Field()            # VARCHAR(255) - local avatar path
    backgroud_image_url = scrapy.Field()    # VARCHAR(255) - original background image URL (name typo kept to match DB column)
    background_image_path = scrapy.Field()  # VARCHAR(255) - background image path
    intro = scrapy.Field()                  # TEXT - bio / introduction
    city = scrapy.Field()                   # VARCHAR(100) - city
    join_date = scrapy.Field()              # DATETIME - account join date
    signature = scrapy.Field()              # VARCHAR(255) - user signature
    tags = scrapy.Field()                   # VARCHAR(255) - tags: official rep / media entity / celebrity
    post_count = scrapy.Field()             # INT UNSIGNED - tweet count
    is_verified = scrapy.Field()            # VARCHAR(10) - verified flag ("True"/"False")
    follow_count = scrapy.Field()           # INT UNSIGNED - following count
    fans_count = scrapy.Field()             # INT UNSIGNED - follower count
    image_urls = scrapy.Field()             # presumably consumed by the image pipeline — TODO confirm

View File

@ -7,9 +7,6 @@ import logging
import os import os
import tarfile import tarfile
import time import time
from scrapy.exceptions import DropItem
import uuid
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
@ -268,179 +265,3 @@ class TelegramDataSaveToMySQL(object):
except pymysql.err.DataError as de: except pymysql.err.DataError as de:
logging.error(repr(de)) logging.error(repr(de))
return item return item
class TwitterUserDataSaveToMySQL(object):
    """Pipeline that upserts TwitterUserInfoItem rows into dsp.twitter_user_info.

    A stable UUID (uuid5 of the Twitter user_id) serves as primary key so the
    same user always maps to the same row; existing rows are updated only when
    at least one tracked field actually changed.
    """

    def __init__(self):
        self.db = None
        self.cursor = None
        # Fields that differ between the DB row and the incoming item.
        # Bug fix: this list is now reset for every item (see _needs_update);
        # previously it accumulated across items, so one changed field made
        # every subsequent item look modified and wrote stale field names.
        self.update_fields = []

    def open_spider(self, spider):
        # NOTE(review): credentials are hard-coded — move to settings/env vars.
        self.db = pymysql.connect(host='47.113.231.200', port=28089, user='root', passwd='passok123A',
                                  db='dsp', charset='utf8mb4')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        if self.cursor:
            self.cursor.close()
        if self.db:
            self.db.close()

    def process_item(self, item, spider):
        # Only handle Twitter user items; pass everything else through untouched.
        if item.__class__.__name__ != 'TwitterUserInfoItem':
            return item
        self.table_name = "twitter_user_info"
        self.extract_avatar_and_background_paths(item)
        try:
            user_id = item.get('user_id')
            if not user_id:
                logging.warning("缺少 user_id跳过处理。")
                return item
            # 1. Derive a stable UUID from user_id (namespace + string).
            stable_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"twitter_user_{user_id}"))
            # 2. Check whether the row already exists.
            existing = self._select_by_uuid(stable_uuid)
            if existing:
                # 3./4. Update only when some tracked field differs.
                if self._needs_update(existing, item):
                    self._update_item(stable_uuid, item)
                    logging.info(f"用户 {user_id} 数据已更新。")
                else:
                    logging.debug(f"用户 {user_id} 数据无变化,跳过更新。")
            else:
                # 5. Insert a brand-new row keyed by the stable UUID.
                self._insert_item_with_uuid(stable_uuid, item)
                logging.info(f"用户 {user_id} 新数据已插入。")
        except Exception as e:
            spider.logger.error(f"处理用户数据失败 (user_id={item.get('user_id')}): {e}")
            raise DropItem(f"Database error: {e}")
        return item

    def _select_by_uuid(self, record_uuid):
        """Fetch the full row keyed by *record_uuid* as a dict, or None."""
        sql = f"SELECT * FROM dsp.{self.table_name} WHERE id = %s"
        self.cursor.execute(sql, (record_uuid,))
        row = self.cursor.fetchone()
        if row:
            columns = [desc[0] for desc in self.cursor.description]
            return dict(zip(columns, row))
        return None

    def _needs_update(self, db_record, item):
        """Compare DB row and item, record the differing fields, report whether any exist."""
        # Reset the per-item difference list (bug fix: previously accumulated forever).
        self.update_fields = []
        for field in item.fields:
            # Bookkeeping / volatile fields are never compared.
            if field in ['id', 'created_at', 'updated_at', 'image_urls', 'crawl_time', 'join_date']:
                continue
            item_val = item.get(field)
            db_val = db_record.get(field)
            # Normalise empties: None and '' compare as equal.
            if item_val is None or item_val == '':
                item_val = None
            if db_val is None or db_val == '':
                db_val = None
            if item_val != db_val:
                self.update_fields.append(field)
        return len(self.update_fields) > 0

    def _update_item(self, record_uuid, item):
        """Write the changed, non-empty fields plus updated_at for the given row."""
        set_clauses = []
        set_values = []
        for field in self.update_fields:
            if field in ['id', 'created_at', 'image_urls']:
                continue
            value = item.get(field)
            # Never overwrite existing data with empty values.
            if value is None or value == '':
                continue
            set_clauses.append(f"{field} = %s")
            set_values.append(value)
        if not set_clauses:
            return
        set_values.append(record_uuid)  # WHERE id = %s
        sql = f"UPDATE dsp.{self.table_name} SET {', '.join(set_clauses)}, updated_at = CURRENT_TIMESTAMP WHERE id = %s"
        self.cursor.execute(sql, set_values)
        self.db.commit()

    def _insert_item_with_uuid(self, record_uuid, item):
        """Insert a new row whose primary key is the stable UUID."""
        cols = ['id']
        vals = [record_uuid]
        for field in item.fields:
            if field in ['image_urls', 'id']:
                continue
            value = item.get(field)
            # avatar_path may arrive as dict / list / str depending on the image pipeline.
            if field == 'avatar_path':
                if isinstance(value, list) and len(value) > 0:
                    value = value[0].get('path', '') if isinstance(value[0], dict) else str(value[0])
                elif isinstance(value, dict):
                    value = value.get('path', '')
                else:
                    value = str(value) if value else ''
            # Skip None and empty strings entirely.
            if value is None or value == '':
                continue
            cols.append(field)
            vals.append(value)
        # Bug fix: 'id' is always present, so the old "if not cols" check could
        # never fire; require at least one real data column besides the key.
        if len(cols) <= 1:
            logging.warning("没有有效的字段可供插入。")
            return
        placeholders = ', '.join(['%s'] * len(cols))
        cols_str = ', '.join(cols)
        sql = f"INSERT INTO dsp.twitter_user_info ({cols_str}) VALUES ({placeholders})"
        try:
            self.cursor.execute(sql, vals)
            self.db.commit()
        except pymysql.err.IntegrityError as ie:
            self.db.rollback()
            logging.debug(f"数据重复,已跳过插入:{ie}")
        except pymysql.err.DataError as de:
            self.db.rollback()
            logging.error(f"数据格式错误(如字段超长、类型不匹配等):{de}")
            raise
        except Exception as e:
            self.db.rollback()
            logging.error(f"数据库操作发生未知错误:{e}")
            raise

    def extract_avatar_and_background_paths(self, item):
        """Split the image-pipeline result list into avatar and background paths."""
        value = item.get('avatar_path', [])
        if not isinstance(value, list):
            value = []

        def get_path(val):
            return val.get('path', '') if isinstance(val, dict) else str(val)

        item['avatar_path'] = get_path(value[0]) if len(value) > 0 else None
        item['background_image_path'] = get_path(value[1]) if len(value) > 1 else None

View File

@ -59,24 +59,7 @@ class SeleniumMiddleware:
# Edge in headless mode # Edge in headless mode
edge_options = EdgeOptions() edge_options = EdgeOptions()
edge_options.use_chromium = True edge_options.use_chromium = True
# edge_options.add_argument("--headless") self.driver = Edge(executable_path='MicrosoftWebDriver.exe', options=edge_options)
# 隐藏“受自动化软件控制”提示栏
edge_options.add_argument('--disable-blink-features=AutomationControlled')
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# 禁用自动化扩展
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument(
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0")
edge_options.add_argument("--window-size=1920,1080")
# 设置浏览器的 高级偏好设置
prefs = {
# "profile.managed_default_content_settings.images": 2, # 禁用图片加载:2 表示“禁止”1 表示“允许”
"credentials_enable_service": False, # 禁用保存密码提示
"profile.password_manager_enabled": False # 禁用密码管理器
}
edge_options.add_experimental_option("prefs", prefs)
self.driver = Edge(executable_path=r"D:\msedgedriver.exe", options=edge_options)
@classmethod @classmethod
def from_crawler(cls, crawler): def from_crawler(cls, crawler):
@ -110,7 +93,7 @@ class SeleniumMiddleware:
self.proxy_count = 0 self.proxy_count = 0
ip = request.meta['proxy'].split(':')[1][2:] ip = request.meta['proxy'].split(':')[1][2:]
port = int(request.meta['proxy'].split(':')[2]) port = int(request.meta['proxy'].split(':')[2])
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0' user_agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
self.driver.get("about:config") self.driver.get("about:config")
script = ''' script = '''
var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch); var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);

View File

@ -3,7 +3,6 @@
BOT_NAME = 'MediaSpiders' BOT_NAME = 'MediaSpiders'
LOG_LEVEL = 'INFO' LOG_LEVEL = 'INFO'
# LOG_LEVEL = 'DEBUG'
SPIDER_MODULES = ['MediaSpiders.spiders'] SPIDER_MODULES = ['MediaSpiders.spiders']
NEWSPIDER_MODULE = 'MediaSpiders.spiders' NEWSPIDER_MODULE = 'MediaSpiders.spiders'
@ -35,12 +34,6 @@ MYSQL_DB_USER = 'root'
MYSQL_DB_PASSWD = 'passok123A' MYSQL_DB_PASSWD = 'passok123A'
MYSQL_DB_SCHEMA = 'oscm' MYSQL_DB_SCHEMA = 'oscm'
TWITTER_USER_MYSQL_DB_HOST = '47.113.231.200'
TWITTER_USER_MYSQL_DB_PORT = 28089
TWITTER_USER_MYSQL_DB_USER = 'root'
TWITTER_USER_MYSQL_DB_PASSWD = 'passok123A'
TWITTER_USER_MYSQL_DB_SCHEMA = 'dsp'
CRAWL_JOB_UPDATE_API = 'http://47.115.228.133:28081/api/open/crawljob' CRAWL_JOB_UPDATE_API = 'http://47.115.228.133:28081/api/open/crawljob'
WORD_BANK_QUERY_API = 'http://47.115.228.133:28081/api/open/wordBank/queryAll' WORD_BANK_QUERY_API = 'http://47.115.228.133:28081/api/open/wordBank/queryAll'
RULES_PARSER_QUERY_API = 'http://47.115.228.133:28081/api/rules/parser/queryPageable/0/1' RULES_PARSER_QUERY_API = 'http://47.115.228.133:28081/api/rules/parser/queryPageable/0/1'
@ -51,7 +44,6 @@ BATCH_SAVE_SIZE = 5
TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter' TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter' FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
LINKEDIN_FILTER_KEY = 'URL_Filter:MediaSpiders:Linkedin_Filter'
YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter' YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter' WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter' WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'
@ -59,7 +51,6 @@ FLICKR_FILTER_KEY = 'URL_Filter:MediaSpiders:Flickr_Filter'
TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter' TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter' FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
LINKEDIN_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Linkedin_Filter'
YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter' YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter' WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter' WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'
@ -67,16 +58,10 @@ FLICKR_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Flickr_Filter'
WECHAT_LINKS_KEY = "MediaSpiders:Wechat_links" WECHAT_LINKS_KEY = "MediaSpiders:Wechat_links"
# TWITTER_BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" # old
TWITTER_API_KEY = "JFY7dt"
TWITTER_BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAAO8MTQEAAAAAQWidbP34N0nykDnUEDweEpyRgsc%3Dxt0hX1whV1hlmbMsStkB7ZU3pjXOINOCh2DMPoIAwljwrOWgvE"
TWITTER_ACCESS_TOKEN = "1294829483816398849-gscLJCEF9ZObZJikjCmjXtxoW6YVWu"
TWITTER_ACCESS_TOKEN_SECRET = "1XvTHZXzN0JBQulTBOvCTgXVPzVGYWe50zH1r4qXLper3"
SOCIAL_USER_QUERY_ALL_API = "http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy={sortBy}&shuffleResult={shuffleResult}" SOCIAL_USER_QUERY_ALL_API = "http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy={sortBy}&shuffleResult={shuffleResult}"
SOCIAL_USER_UPDATE_API = "http://47.115.228.133:28081/api/open/target/social/update" SOCIAL_USER_UPDATE_API = "http://47.115.228.133:28081/api/open/target/social/update"
WEIBO_USER_TYPE = 0 WEIBO_USER_TYPE = 0
TWITTER_USER_TYPE = 21 TWITTER_USER_TYPE = 1
FACEBOOK_USER_TYPE = 2 FACEBOOK_USER_TYPE = 2
YOUTUBE_USER_TYPE = 3 YOUTUBE_USER_TYPE = 3
FLICKR_USER_TYPE = 4 FLICKR_USER_TYPE = 4
@ -88,31 +73,21 @@ TWITTER_URL_KEY = 'MediaSpiders:Twitter_URL_Key'
TWITTER_PID_KEY = '' TWITTER_PID_KEY = ''
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db'] KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
# CUSTOM_USER_AGENT = [
# 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
# 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
# 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
# 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
# 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0',
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E',
# 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
# ]
CUSTOM_USER_AGENT = [ CUSTOM_USER_AGENT = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0', 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.1958', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 OPR/117.0.0.', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.3' 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
] ]
# 部署在外网采集fb时使用selenium_chrome # 部署在外网采集fb时使用selenium_chrome
SELENIUM_DRIVER_NAME = 'chrome' SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
SELENIUM_DRIVER_ARGUMENTS = [ SELENIUM_DRIVER_ARGUMENTS = [
'--headless', '--headless',
'--no-sandbox', '--no-sandbox',
@ -209,14 +184,6 @@ EXTENSIONS = {
'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501 'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
} }
############################## 翻译
MAX_TEXT_LENGTH = 100
# 翻译 API 地址(替换为你的服务器 IP 或域名)
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
# 单次请求间隔(秒),避免 API 被限流
REQUEST_DELAY = 1
# Enable or disable extensions # Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html # See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = { # EXTENSIONS = {

View File

@ -1,369 +0,0 @@
# -*- coding: utf-8 -*-
import json
import hashlib
import re
import time
import random
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
TimeoutException,
StaleElementReferenceException,
WebDriverException
)
import scrapy
from scrapy_selenium import SeleniumRequest
from MediaSpiders.items import MediaspidersItem
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.exceptions import CloseSpider
from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_current_timestamp
class BaiduHotSearchSprder(scrapy.Spider):
    """Spider for the Baidu realtime hot-search board (top.baidu.com).

    Renders the board with scrapy-selenium, scrolls so lazily-loaded entries
    attach to the DOM, then yields one MediaspidersItem per hot-search entry
    (title, rank, heat, description, detail URL).  Falls back through several
    CSS/XPath selector strategies and retries the page load up to
    ``max_retries`` times before shutting down via CloseSpider.

    NOTE(review): the class/spider name keeps the original "Sprder" spelling
    on purpose — external schedulers address the spider by this name.
    """

    name = 'BaiduHotSearchSprder'
    comment_urls = []  # unused here; kept for interface parity with sibling spiders
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/twitter',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
            # 'MediaSpiders.pipelines.TwitterUserDataSaveToMySQL': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
        },
        'DOWNLOADER_MIDDLEWARES': {},
        'BATCH_SAVE_SIZE': 50
    }
    # Single board URL — a plain string (not the usual Scrapy list);
    # start_requests() uses it directly.
    start_urls = 'https://top.baidu.com/board?tab=realtime'

    def __init__(self, params=None, *args, **kwargs):
        """Accept an optional JSON ``params`` string carrying job_id / max_items.

        Malformed JSON (or a non-numeric max_items) is logged and defaults
        are kept.
        """
        super(BaiduHotSearchSprder, self).__init__(*args, **kwargs)
        self.job_id = None
        self.collected_items = 0
        self.max_items = 50  # hard cap on collected entries, guards against endless loops
        self.retry_count = 0
        self.max_retries = 3
        if params:
            try:
                json_params = json.loads(params)
                if 'job_id' in json_params:
                    self.job_id = json_params['job_id']
                if 'max_items' in json_params:
                    self.max_items = int(json_params['max_items'])
            except Exception as e:
                self.logger.error(f"解析参数失败: {str(e)}")

    def start_requests(self):
        """Issue the initial Selenium-rendered request for the board page."""
        self.logger.info(f"开始爬取百度热搜任务ID: {self.job_id if self.job_id else 'N/A'}")
        # One timestamp per crawl run, reused as es_urltime on every item.
        self.url_time = get_current_timestamp()
        yield SeleniumRequest(
            url=self.start_urls,
            callback=self.parse,
            meta={'retry_count': 0},
            wait_time=5
        )

    def parse(self, response):
        """Parse the rendered board and yield one item per hot-search entry.

        Retries (with a refresh) when no entries are found, and re-issues the
        request with a back-off on WebDriver failures.  Raises CloseSpider
        once ``max_retries`` is exhausted.
        """
        self.logger.info("开始解析百度热搜数据...")
        driver = response.request.meta['driver']
        try:
            # Fix the window size so entries are not hidden off-screen.
            driver.set_window_size(1400, 1000)
            # Load through the driver so cookies are set on the main domain.
            driver.get(self.start_urls)
            # Wait for the main list to render.
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".category-wrap_iQLoo, .board-item"))
                )
                self.logger.info("页面主要内容加载完成")
            except TimeoutException:
                self.logger.warning("等待主要内容超时,尝试直接处理可用元素")
            # Scroll so lazily-loaded entries attach to the DOM.
            self._scroll_page(driver)
            # Multiple selector strategies for resilience against layout changes.
            hot_search_items = self._get_hot_search_items(driver)
            if not hot_search_items:
                self.logger.error("未找到任何热搜项,检查页面结构是否发生变化")
                self.retry_count += 1
                if self.retry_count <= self.max_retries:
                    self.logger.info(f"重试第 {self.retry_count}/{self.max_retries} 次")
                    driver.refresh()
                    # BUG FIX: ``parse`` is a generator, so the original
                    # ``return SeleniumRequest(...)`` only set the
                    # StopIteration value — Scrapy never scheduled the retry.
                    # The request must be yielded.
                    yield SeleniumRequest(
                        url=self.start_urls,
                        callback=self.parse,
                        meta={'retry_count': self.retry_count},
                        dont_filter=True,
                        wait_time=5
                    )
                    return
                else:
                    self.logger.error("达到最大重试次数,终止爬虫")
                    raise CloseSpider("页面结构可能已更改,无法提取数据")
            self.logger.info(f"找到 {len(hot_search_items)} 个热搜项")
            # Process each hot-search entry.
            for index, item in enumerate(hot_search_items):
                try:
                    hot_search_item = self._extract_hot_search_data(item, driver)
                    if hot_search_item:
                        self.collected_items += 1
                        self.logger.info(f"成功提取第 {self.collected_items} 条数据: {hot_search_item['es_urltitle']}")
                        yield hot_search_item
                except StaleElementReferenceException:
                    self.logger.warning(f"第 {index + 1} 项元素已失效,跳过处理")
                    continue
                except Exception as e:
                    self.logger.error(f"处理第 {index + 1} 项时出错: {str(e)}", exc_info=True)
                    continue
            self.logger.info(f"本次爬取共收集 {self.collected_items} 条有效数据")
        except CloseSpider:
            # BUG FIX: the generic Exception handler below used to swallow
            # CloseSpider, so the spider never actually stopped — re-raise it.
            raise
        except WebDriverException as e:
            self.logger.error(f"WebDriver异常: {str(e)}", exc_info=True)
            if 'retry_count' not in response.meta or response.meta['retry_count'] < self.max_retries:
                retry_count = response.meta.get('retry_count', 0) + 1
                self.logger.info(f"尝试重新请求,重试次数: {retry_count}")
                yield SeleniumRequest(
                    url=self.start_urls,
                    callback=self.parse,
                    meta={'retry_count': retry_count},
                    dont_filter=True,
                    wait_time=5 + retry_count * 2  # linear back-off per retry
                )
        except Exception as e:
            self.logger.error(f"处理页面时发生未预期错误: {str(e)}", exc_info=True)
        finally:
            # Placeholder: the driver is owned by the middleware, nothing to
            # clean up here.
            pass

    def _scroll_page(self, driver):
        """Scroll the page downward in steps (then back to top) so lazy content loads."""
        try:
            scroll_pause_time = 1
            screen_height = driver.execute_script("return window.screen.height;")
            scrolls = 5
            for i in range(scrolls):
                driver.execute_script(f"window.scrollTo(0, {screen_height * i});")
                # Randomised pause makes the scrolling look less bot-like.
                time.sleep(scroll_pause_time * (1 + random.random()))
            # Back to the top before extraction starts.
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(scroll_pause_time)
            self.logger.info("页面滚动完成,确保元素加载")
        except Exception as e:
            self.logger.warning(f"页面滚动时发生异常: {str(e)}")

    def _get_hot_search_items(self, driver):
        """Return the hot-search entry elements, trying several selector strategies.

        CSS selectors are attempted first, then XPath patterns; returns an
        empty list when nothing matches.
        """
        selectors = [
            '.category-wrap_iQLoo.horizontal_1eKyQ',  # primary selector
            '.board-item',  # fallback selector
            'div[data-index]'  # attribute-based selector
        ]
        for selector in selectors:
            try:
                items = driver.find_elements(By.CSS_SELECTOR, selector)
                if items and len(items) > 0:
                    self.logger.info(f"使用选择器 '{selector}' 成功找到 {len(items)} 个元素")
                    return items
            except Exception as e:
                self.logger.debug(f"选择器 '{selector}' 失败: {str(e)}")
        # All CSS selectors failed — fall back to XPath.
        try:
            xpath_patterns = [
                '//div[contains(@class, "category-wrap") and contains(@class, "horizontal")]',
                '//div[contains(@class, "board-item")]',
                '//div[@data-index]'
            ]
            for xpath in xpath_patterns:
                items = driver.find_elements(By.XPATH, xpath)
                if items and len(items) > 0:
                    self.logger.info(f"使用XPath '{xpath}' 成功找到 {len(items)} 个元素")
                    return items
        except Exception as e:
            self.logger.debug(f"XPath策略失败: {str(e)}")
        return []

    def _extract_hot_search_data(self, item, driver):
        """Extract one hot-search entry into a MediaspidersItem.

        All fields are collected into locals first and the item is filled in
        a single place at the end.  Returns None when no usable title is
        found.
        """
        # Selector constants
        TITLE_SELECTORS = ['.c-single-text-ellipsis', '.title_dIF3B']
        RANK_SELECTORS = ['.index_1Ew5p', '.hot-index_1Bl1a']
        LINK_SELECTORS = [
            '.look-more_3oNWC',
            'a[href*="www.baidu.com/s?"]',
            '.hot-desc_1m_jR a',
            '.content_3Kk0y a'
        ]
        DESC_SELECTORS = [
            '.hot-desc_1m_jR.large_nSuFU',
            '.hot-desc_1m_jR.small_Uvkd3',
            '.desc_3CT34',
            '.content_3Kk0y'
        ]
        HOT_NUM_SELECTOR = '.hot-index_1Bl1a'

        # Helper: first *displayed* element matching any selector, else None.
        def find_visible_element(selectors, context=item):
            for selector in selectors:
                try:
                    elem = context.find_element(By.CSS_SELECTOR, selector)
                    if elem.is_displayed():
                        return elem
                except Exception as e:
                    self.logger.debug(f"选择器 '{selector}' 未匹配: {str(e)}")
            return None

        def clean_text(text, remove_phrases=("查看更多>", "查看更多", "查看全文", "展开全文")):
            """Strip boilerplate phrases and collapse every whitespace run."""
            if not text:
                return ""
            for phrase in remove_phrases:
                text = text.replace(phrase, "")
            # \s+ also collapses special spaces such as \xa0
            text = re.sub(r'\s+', ' ', text).strip()
            return text.strip()

        # Helper: coerce protocol-relative links to https; drop data:/junk URLs.
        def normalize_url(url):
            if not url or not isinstance(url, str):
                return ""
            url = url.strip()
            if url.startswith('//'):
                url = 'https:' + url
            if url.startswith('data:') or not url.startswith(('http://', 'https://')):
                return ""
            return url

        # 1. Title — mandatory; bail out without it.
        title_elem = find_visible_element(TITLE_SELECTORS)
        if not title_elem:
            self.logger.warning("标题元素未找到,跳过该项")
            return None
        title = clean_text(title_elem.text)
        if not title:
            self.logger.warning("标题内容为空,跳过该项")
            return None

        # Base fields
        now_ms = get_current_timestamp()
        site_name = '百度热搜'
        carrier_type = 'hot_search'
        hkey = get_str_md5(title)

        # Rank — defaults to the running collection count.
        rank = str(self.collected_items)
        rank_elem = find_visible_element(RANK_SELECTORS)
        if rank_elem:
            extracted_rank = clean_text(rank_elem.text)
            if extracted_rank:
                rank = extracted_rank

        # Link and SID
        url_href = ""
        link_elem = find_visible_element(LINK_SELECTORS)
        if link_elem:
            raw_href = link_elem.get_attribute('href')
            url_href = normalize_url(raw_href) if raw_href else ""
        if url_href:
            sid = hashlib.md5(url_href.lower().encode('utf-8')).hexdigest()
        else:
            # No link — derive an id from title + timestamp instead.
            fallback_seed = f"no_link_{title}_{now_ms}"
            sid = hashlib.md5(fallback_seed.encode('utf-8')).hexdigest()

        # Heat value (thousands separators stripped before the digit check)
        heat = 0
        try:
            hot_elem = item.find_element(By.CSS_SELECTOR, HOT_NUM_SELECTOR)
            hot_val = clean_text(hot_elem.text).replace(',', '')
            if hot_val.isdigit():
                heat = int(hot_val)
        except Exception as e:
            self.logger.debug(f"热度提取失败: {str(e)}")

        # Description
        desc = ""
        desc_elem = find_visible_element(DESC_SELECTORS)
        if desc_elem:
            desc = clean_text(desc_elem.text)

        # Detail-page URL: the "查看更多" anchor inside the description container.
        detail_url = ""
        try:
            desc_container = item.find_element(By.CSS_SELECTOR, '.hot-desc_1m_jR')
            detail_link = desc_container.find_element(By.CSS_SELECTOR, 'a.look-more_3oNWC[href*="www.baidu.com/s?"]')
            raw_href = detail_link.get_attribute('href') or ""
            detail_url = normalize_url(raw_href)
            self.logger.debug(f"成功提取详情页URL: {detail_url}")
        except Exception as e:
            self.logger.debug(f"提取详情页URL失败: {str(e)}")
            # Keep the empty string; downstream may fall back to the main link.

        # Single assignment point for the item.
        hot_search_item = MediaspidersItem()
        hot_search_item['es_sitename'] = site_name
        hot_search_item['es_urltitle'] = title
        hot_search_item['es_urlcontent'] = desc
        hot_search_item['es_carriertype'] = carrier_type
        hot_search_item['es_urltime'] = self.url_time
        hot_search_item['es_lasttime'] = now_ms
        hot_search_item['es_loadtime'] = now_ms
        hot_search_item['es_hkey'] = hkey
        hot_search_item['es_simrank'] = rank
        hot_search_item['es_heat'] = heat
        hot_search_item['es_sid'] = sid
        hot_search_item['es_urlname'] = detail_url
        return hot_search_item

View File

@ -11,7 +11,6 @@ from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from MediaSpiders.items import MediaspidersItem from MediaSpiders.items import MediaspidersItem
from MediaSpiders.spiders.TwitterUserInfoSpider import form_cookie_dict
from MediaSpiders.utils.http_utils import http_post from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.string_utils import get_str_md5 from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
@ -67,34 +66,21 @@ class FacebookSpider(scrapy.Spider):
logger.info("login facebook") logger.info("login facebook")
driver = response.request.meta['driver'] driver = response.request.meta['driver']
driver.maximize_window() driver.maximize_window()
# 访问主域,再设 Cookie driver.get('https://m.facebook.com/')
driver.get("https://www.facebook.com/") time.sleep(3)
time.sleep(2) # 获取采集登录账号并登录
login_users = self.redis_client.smembers('MediaSpiders:Facebook_login_accounts')
# 添加 Cookie确保 domain 是 .facebook.com user_list = []
cookie_string = self.redis_client.get("MediaSpiders:Facebook_Cookies").decode() for u in login_users:
cookie_dict = form_cookie_dict(cookie_string) # 你已有此函数 user_list.append(json.loads(u.decode()))
login_user = random.choice(user_list)
# 转换为 Selenium 所需格式(必须含 domain 和 path driver.find_element_by_xpath(
cookies_to_add = [] '//input[@name="email"]').send_keys(login_user['uid'])
for name, value in cookie_dict.items(): driver.find_element_by_xpath(
cookies_to_add.append({ '//input[@name="pass"]').send_keys(login_user['pwd'])
'name': name, driver.find_element_by_xpath('//button[@name="login"]').click()
'value': value, time.sleep(10)
'domain': '.facebook.com', logger.info("login as %s" % login_user['uid'])
'path': '/',
'secure': True
})
for cookie in cookies_to_add:
try:
driver.add_cookie(cookie)
except Exception as e:
logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
driver.refresh()
time.sleep(5)
# 获取待采集目标账号,并逐个请求 # 获取待采集目标账号,并逐个请求
account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API'] account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
account_query_api = account_query_api.format(sortBy="id", shuffleResult="true") account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
@ -102,11 +88,9 @@ class FacebookSpider(scrapy.Spider):
'userType': self.settings['FACEBOOK_USER_TYPE'], 'userType': self.settings['FACEBOOK_USER_TYPE'],
'userFlag': 0 'userFlag': 0
} }
account_rsp = json.loads( account_rsp = json.loads(
http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text) http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
all_user_info = [] all_user_info = []
if account_rsp['code'] == 200: if account_rsp['code'] == 200:
all_user_info = account_rsp['content'] all_user_info = account_rsp['content']
logger.info('GET %s users' % account_rsp['message']) logger.info('GET %s users' % account_rsp['message'])
@ -123,7 +107,6 @@ class FacebookSpider(scrapy.Spider):
time.sleep(5) time.sleep(5)
last_page_articles_count = 0 last_page_articles_count = 0
logger.info("Current URL: {}".format(current_url)) logger.info("Current URL: {}".format(current_url))
#
current_page_articles = driver.find_elements_by_xpath( current_page_articles = driver.find_elements_by_xpath(
"//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count) "//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count)
items = self.get_article(current_page_articles, uid, driver) items = self.get_article(current_page_articles, uid, driver)
@ -146,95 +129,51 @@ class FacebookSpider(scrapy.Spider):
for article in articles: for article in articles:
item = MediaspidersItem() item = MediaspidersItem()
try: try:
# === 用户名:从 h2 下的 b/span 或直接 span 提取 === uname = article.find_element_by_xpath('.//h2//strong/span').text
try: article_url = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]").get_attribute('href')
uname = article.find_element_by_xpath('.//h2//b//span').text
except:
try:
uname = article.find_element_by_xpath('.//h2//span[@dir="auto"]').text
except:
uname = uid
# === 文章链接和时间:从包含 /posts/pfbid 的 a 标签提取 ===
post_link_elem = article.find_element_by_xpath(".//a[contains(@href,'/posts/pfbid')]")
article_url = post_link_elem.get_attribute('href')
article_url = article_url.split("?")[0] article_url = article_url.split("?")[0]
article_time = post_link_elem.text # 时间文本直接在 a 标签内
# === 展开全文(如有)===
try: try:
clickable_fields = article.find_elements_by_xpath(".//div[@role='button']") clickable_fields = article.find_elements_by_xpath(".//div[@role='button']")
if len(clickable_fields) > 0: if len(clickable_fields) > 0:
for cf in clickable_fields: for cf in clickable_fields:
cf_text = cf.text cf_text = cf.text
if cf_text is not None and ("展开" in cf_text or "See more" in cf_text): if cf_text is not None and cf_text == "展开":
cf.click() cf.click()
time.sleep(1)
break break
except Exception as e: except Exception as e:
logger.debug(repr(e)) logger.debug(repr(e))
article_text_lines = article.find_elements_by_xpath(".//div[@data-ad-preview='message']")
# === 正文内容:使用 data-ad-rendering-role="story_message" === text_info = []
try: for line in article_text_lines:
article_text_lines = article.find_elements_by_xpath( text_info.append(line.text)
".//div[@data-ad-rendering-role='story_message']") article_text = "".join(text_info)
text_info = [] article_time = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]/span").text
for line in article_text_lines:
text_info.append(line.text)
article_text = "".join(text_info)
except:
article_text = ""
# === 时间戳处理 ===
logger.info(f"article_time: {article_time}") logger.info(f"article_time: {article_time}")
article_time = get_time_stamp(article_time) article_time = get_time_stamp(
article_time) # 这里的 article_time 必须是中文模式下的时间比如“1天”、“5小时”等需要登陆Facebook后切换语言
logger.info(f"urltime: {article_time}") logger.info(f"urltime: {article_time}")
# === 图片提取 ===
img_urls = [] img_urls = []
try: imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img") for img in imgs:
for img in imgs: img_urls.append(img.get_attribute("src"))
src = img.get_attribute("src")
if src and "emoji" not in src: # 过滤 emoji 图片
img_urls.append(src)
except:
pass
# === 视频(暂留空)===
video_urls = [] video_urls = []
article_id = get_str_md5(article_text)
# === 互动数据:点赞、评论、转发 ===
like_count = 0 like_count = 0
comment_count = 0 comment_count = 0
forward_count = 0 forward_count = 0
like_count_str = article.find_element_by_xpath(
".//div[@data-visualcompletion='ignore-dynamic']//span[@aria-hidden='true']").text
comment_and_forward_element = article.find_elements_by_xpath(".//div[@tabindex='0']//span[@dir='auto']")
try: try:
# 点赞数:通过 aria-label 匹配 if like_count_str:
like_label_elem = article.find_element_by_xpath( like_count = int(like_count_str.replace(",", ""))
".//div[@aria-label and contains(@aria-label, '赞:')]") if len(comment_and_forward_element) > 1:
like_label = like_label_elem.get_attribute("aria-label") comment_count_str = comment_and_forward_element[0].text
import re forward_count_str = comment_and_forward_element[1].text
like_match = re.search(r'(\d+)', like_label) comment_count = int(comment_count_str.replace(",", ""))
if like_match: forward_count = int(forward_count_str.replace(",", ""))
like_count = int(like_match.group(1))
except:
pass
try:
# 评论和转发:通常在 toolbar 内的两个 span 中
stat_spans = article.find_elements_by_xpath(
".//div[@role='toolbar']//span[@class='xt0b8zv x135b78x']")
if len(stat_spans) >= 2:
comment_count = int(stat_spans[0].text.replace(",", "")) if stat_spans[0].text.replace(",",
"").isdigit() else 0
forward_count = int(stat_spans[1].text.replace(",", "")) if stat_spans[1].text.replace(",",
"").isdigit() else 0
except: except:
logger.warning("获取点赞/评论/转发数量异常") logger.warning("获取点赞/评论/转发数量异常")
# === 填充 Item ===
article_id = get_str_md5(article_text)
item['es_sid'] = str(article_id) item['es_sid'] = str(article_id)
item['es_hkey'] = str(article_id) item['es_hkey'] = str(article_id)
item['es_content'] = str(article_text).replace('查看翻译', '') item['es_content'] = str(article_text).replace('查看翻译', '')
@ -254,23 +193,17 @@ class FacebookSpider(scrapy.Spider):
item['es_sitename'] = 'facebook' item['es_sitename'] = 'facebook'
item['es_srcname'] = 'facebook' item['es_srcname'] = 'facebook'
item['es_carriertype'] = 'media' item['es_carriertype'] = 'media'
# 判重
# === 判重逻辑 ===
if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000: if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0: if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0:
logger.info("跳过已采集内容") logger.info("跳过已采集内容")
continue continue
if item['es_urlcontent'].endswith('展开'): if item['es_urlcontent'].endswith('展开'):
logger.info("跳过未展开的内容") logger.info("跳过未展开的内容")
continue continue
article_items.append(item) article_items.append(item)
except Exception as e: except Exception as e:
logger.debug("解析单条帖子失败: %s" % repr(e)) logger.debug(repr(e))
continue
logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items))) logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items)))
return article_items return article_items

View File

@ -1,417 +0,0 @@
# -*- coding: utf-8 -*-
import json
import logging
import time
from typing import List, Dict, Any, Optional
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.date_utils import get_time_stamp
from MediaSpiders.utils.hot_search_json_parser import url_response
from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_current_timestamp
class HotSearchRedisSpider(scrapy.Spider):
    """Selenium spider that harvests Toutiao hot-board events.

    For each hot entry it opens the entry's URL, looks for an "事件脉络"
    (event timeline) section, and emits MediaspidersItem records through
    the Protobuf pipeline configured in ``custom_settings``.
    """
    name = 'HotSearchRedisSpider'
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/twitter',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_ship_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
        }
    }
    # Constants
    TOUTIAO_HOT_URL = 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
    # NOTE(review): named BAIDU_URL but points at Toutiao — confirm intent.
    BAIDU_URL = 'https://www.toutiao.com/'
    PAGE_LOAD_TIMEOUT = 10
    ELEMENT_WAIT_TIMEOUT = 5
    MAX_NEWS_PER_HOT = 1
    MAX_HOT_ITEMS = 15
    # Selector definitions
    # URL_SELECTORS = [
    #     '.card-render-wrapper a'
    #     # '.l-content a',
    #     # '.feed-card-wtt-l p a',
    #     # '.feed-card-article-l a'
    # ]
    AUTHOR_SELECTORS = [
        "//div[@class='author-info']/div[@class='desc']/a[@class='name']",
        "//div[@class='user-info']/a[@class='user-name']"
    ]
    CONTENT_SELECTORS = [
        "//div[@class='article-content']//p",
        "//article/div[@class='weitoutiao-html']"
    ]
    TIME_SELECTORS = [
        "//p[@class='abstract']/span[@class='time']",
        "//div[@class='article-meta']/span[1]"
    ]
    # Text patterns that mark boilerplate to filter out
    SKIP_PATTERNS = ['版权', '声明', '邮箱', '记者', '编辑', '来源', '投稿', '责任编辑']
    def __init__(self, params=None, *args, **kwargs):
        """Optionally parse a JSON ``params`` string for totalNum / authorization / job_id.

        Malformed JSON (or a non-numeric ``totalNum``) is logged and the
        defaults (0 / None / None) are kept.
        """
        super().__init__(*args, **kwargs)
        # Capture-time stamp, reused for emitted items across the whole run.
        self.url_time = get_current_timestamp()
        self.total_num = 0
        self.authorization = None
        self.job_id = None
        if params:
            try:
                json_params = json.loads(params)
                self.total_num = int(json_params.get('totalNum', 0))
                self.authorization = json_params.get('authorization')
                self.job_id = json_params.get('job_id')
            except (json.JSONDecodeError, ValueError) as e:
                self.logger.error(f"解析参数失败: {e}")
def start_requests(self):
"""开始请求"""
yield SeleniumRequest(
url=self.BAIDU_URL,
callback=self.parse_parent,
wait_time=self.PAGE_LOAD_TIMEOUT
)
    def parse_parent(self, response):
        """Parse the portal landing page: fetch hot entries and fan out per entry.

        Entries without a usable URL are logged and skipped; the rest are
        delegated to ``_process_hot_item``.
        """
        driver = response.request.meta['driver']
        # The hot list comes from the JSON API, capped at MAX_HOT_ITEMS.
        hot_items = self._fetch_hot_items()[:self.MAX_HOT_ITEMS]
        # hot_items = []
        # hot_items.append({
        #     "fake_url": "https://www.toutiao.com/trending/7612920230477565459/?rank=14&log_from=4dda3d0c958f48_1772529869512",
        #     'hot_id': '76132246866893fda27',
        #     'hot_value': 5432429101,
        #     'hot_word': '伊朗反击最初两天650民美士兵伤亡'
        # })
        for hot_item in hot_items:
            if not hot_item.get('fake_url'):
                self.logger.warning(f"热点 {hot_item['hot_word']} 无有效URL跳过")
                continue
            yield from self._process_hot_item(driver, hot_item)
    def _fetch_hot_items(self) -> List[Dict[str, Any]]:
        """Fetch the hot-board JSON feed and map each row to a hot-item dict.

        Returns an empty list on any overall failure; per-row parse errors
        are logged and the row is skipped.
        """
        try:
            rsp_body = url_response(self.TOUTIAO_HOT_URL)
            if rsp_body.get('status') != "success":
                self.logger.error("获取热点数据失败")
                return []
            result_array = []
            for line in rsp_body.get('data', []):
                try:
                    result_array.append({
                        "hot_id": line.get('ClusterIdStr', ''),
                        "hot_word": line.get('Title', ''),
                        "hot_value": int(line.get('HotValue', 0)),
                        "fake_url": line.get('Url', '')
                    })
                except Exception as e:
                    self.logger.error(f"解析热点数据失败: {e}")
                    self.logger.debug(f"问题数据: {line}")
            return result_array
        except Exception as e:
            self.logger.error(f"获取热点数据异常: {e}")
            return []
    def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
        """Yield items for one hot entry: event details first, then its timeline.

        Failures are logged per hot item so remaining entries still run.
        """
        try:
            yield from self._get_event_details(driver, hot_item)
            # Load the entry's landing page in the shared driver.
            driver.get(hot_item['fake_url'])
            self._wait_for_page_load(driver)
            # Collect the section headings and look for an "事件脉络" block.
            context_list = driver.find_elements(By.XPATH, "//div[@class='block-title']")
            self.logger.info(f"context_list:{context_list}")
            for context in context_list:
                block_title = context.text.strip()
                if block_title == "事件脉络":
                    yield from self._get_event_timeline(context, driver, hot_item)
                    continue
        except Exception as e:
            self.logger.error(f"处理热点 '{hot_item['hot_word']}' 失败: {e}")
def _get_event_timeline(self, context, driver, hot_item: Dict[str, Any]):
    """Expand the "事件脉络" (event timeline) card, scrape every time node
    (title + content), and yield one aggregated MediaspidersItem.

    Args:
        context: the block-title WebElement whose text was "事件脉络".
        driver: the shared Selenium WebDriver.
        hot_item: dict with hot_id / hot_word / fake_url from the hot board.
    """
    self.logger.info("开始采集事件脉络...")
    # Locate the "load more" button.
    # NOTE(review): find_elements(...)[0] raises IndexError when the button
    # is absent — only the caller's broad except catches it.
    button_element = context.find_elements(By.XPATH,
                                           "//div[@class='timeline-card-wrapper']/div[@class='load-more']/button")[
        0]
    # Try a normal click first
    try:
        button_element.click()
    except ElementClickInterceptedException:
        # If the button is covered, fall back to a JavaScript click
        self.logger.info("使用JavaScript点击按钮")
        driver.execute_script("arguments[0].click();", button_element)
    # Wait for the expanded content to load
    time.sleep(2)
    # Collect every timeline node currently in the DOM
    event_list = context.find_elements(By.XPATH,
                                       "//div[@class='timeline-card-wrapper']/div[@class='time-nodes']/div[@class='time-node']")
    self.logger.info(f"找到 {len(event_list)} 个事件")
    url_content = ''
    for idx, even in enumerate(event_list, 1):
        try:
            # Title (and time) of this node
            title_elem = even.find_element(By.XPATH, ".//div[@class='title']")
            title_text = title_elem.text
            # Body text of this node
            content_element = even.find_element(By.XPATH, ".//a[@class='content']/p")
            content = content_element.text.strip()
            # Check for a "最新" (latest) badge and strip it from the content
            try:
                tag_element = content_element.find_element(By.XPATH, ".//span[@class='tag']")
                tag = tag_element.text.strip()  # e.g. "最新"
                # Remove the badge text from the content (if present)
                content = content.replace(tag, "").strip()
            except:
                tag = ""
            # Concatenate title and content, entries separated by newlines
            if url_content:
                url_content += "\n"  # separator after already-accumulated entries
            # Append this entry's title and content
            url_content += f"{title_text}\n{content}"
            self.logger.info(f"已添加第{idx}条: {title_text[:20]}...")
        except Exception as e:
            self.logger.error(f"处理第{idx}个事件时出错: {e}")
            continue
    timeNow = get_current_timestamp()
    # Build the aggregated item: one item per timeline, keyed by the hot URL
    event_timeline_item = MediaspidersItem()
    event_timeline_item['es_carriertype'] = 'news'
    event_timeline_item['es_srcname'] = 'https://www.toutiao.com/'
    event_timeline_item['es_sitename'] = '今日头条'
    event_timeline_item['es_sid'] = get_str_md5(hot_item["fake_url"])
    event_timeline_item['es_urltitle'] = hot_item['hot_word']
    event_timeline_item['es_authors'] = ''
    event_timeline_item['es_urlcontent'] = url_content
    event_timeline_item['es_urltime'] = timeNow
    event_timeline_item['es_lasttime'] = timeNow
    event_timeline_item['es_urlname'] = hot_item["fake_url"]
    event_timeline_item['es_hkey'] = hot_item['hot_id']
    event_timeline_item['es_urltopic'] = hot_item['hot_word']
    event_timeline_item['es_video'] = ''
    yield event_timeline_item
    self.logger.info(f"事件脉络-采集成功 '{hot_item['hot_word']}'{hot_item['fake_url']}")
def _get_event_details(self, driver, hot_item: Dict[str, Any]):
    """Collect detail pages for one hot item.

    If the hot URL is a board page (no "article" in it), extract up to
    MAX_NEWS_PER_HOT card links and crawl each; otherwise treat the URL
    itself as the detail page. Items are yielded by _process_news_page.
    """
    self.logger.info(f"开始采集事件详情-{hot_item['hot_word']}: {hot_item['fake_url']}")
    hot_url = hot_item['fake_url']
    driver.get(hot_url)
    self._wait_for_page_load(driver)
    # Board page -> harvest card links; article page -> crawl it directly
    if "article" not in hot_url:
        cards = driver.find_elements(
            By.XPATH, "//div[@class='block-content']/div[@class='card-render-wrapper']")
        news_urls_array = []
        for card in cards[:self.MAX_NEWS_PER_HOT]:
            # Bug fix: the original appended `news_url` even when this card
            # had no (or an invalid) link, which reused the previous card's
            # URL or raised NameError on the first card; and `break` on one
            # bad card abandoned all remaining cards. Skip bad cards instead.
            try:
                element = card.find_element(By.CSS_SELECTOR, '.card-render-wrapper a')
            except NoSuchElementException:
                continue
            url = element.get_attribute('href')
            if url and url.startswith(('http://', 'https://')):
                news_urls_array.append(url)
    else:
        news_urls_array = [hot_url]
    try:
        # Crawl every collected detail page
        for url in news_urls_array:
            yield from self._process_news_page(driver, url, hot_item)
    except Exception as e:
        self.logger.error(f"获取事件详情卡片失败: {e}")
def _wait_for_page_load(self, driver, timeout: int = None):
    """Settle briefly, then wait up to `timeout` seconds for <body> to exist.

    Falls back to PAGE_LOAD_TIMEOUT when no timeout is given; logs a warning
    (does not raise) when the wait times out.
    """
    effective_timeout = timeout or self.PAGE_LOAD_TIMEOUT
    time.sleep(2)  # fixed settle time for dynamically rendered content
    waiter = WebDriverWait(driver, effective_timeout)
    try:
        waiter.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except TimeoutException:
        self.logger.warning("页面加载超时")
def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
    """Crawl a single news page (video or article) and yield one item.

    Video pages ("video" in the URL) use dedicated selectors and keep the
    URL itself as content; other pages go through the generic extractors.
    Yields nothing when the page has no usable content or the URL is empty.
    """
    try:
        # Bug fix: the original routed empty URLs into the video branch
        # (via `or not news_url`) and then called driver.get('') — skip
        # them up front instead.
        if not news_url:
            self.logger.warning("新闻链接为空,跳过")
            return
        if "video" in news_url.lower():
            driver.get(news_url)
            self._wait_for_page_load(driver)
            title = driver.find_elements(By.XPATH, "//div[@class='ttp-video-extras-title']/h1")[0].get_attribute('title')
            time_text = driver.find_elements(By.XPATH, "//div[@class='meta-info']/span[@class='publish-time']")[0].text.replace("发布于 ", "").strip()
            author = driver.find_elements(By.XPATH, "//div[@class='author-info']/a[@class='author-name']")[0].text.strip()
            content = news_url  # video pages store the URL as the content
            # Bug fix: `url_time` was unbound (NameError at item build) when
            # the publish time was missing or unparsable; fall back the same
            # way _extract_time does.
            url_time = getattr(self, 'url_time', None) or get_current_timestamp()
            if time_text:
                try:
                    url_time = get_time_stamp(time_text)
                except Exception as e:
                    self.logger.debug(f"时间转换失败: {time_text}, {e}")
        else:
            driver.get(news_url)
            self._wait_for_page_load(driver)
            # Headline: fall back to the hot word when it cannot be found
            try:
                title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
            except Exception:
                # Consistency fix: use the spider logger, not the root logger
                self.logger.error('标题采集失败,已使用热搜名称...')
                title = hot_item['hot_word']
            # Generic page extraction
            author = self._extract_text(driver, self.AUTHOR_SELECTORS)
            content = self._extract_content(driver)
            url_time = self._extract_time(driver)
        if not content:
            self.logger.warning(f"页面无有效内容: {news_url}")
            return
        # Build the item
        even_details_item = MediaspidersItem()
        even_details_item['es_carriertype'] = 'news'
        even_details_item['es_srcname'] = 'https://www.toutiao.com/'
        even_details_item['es_sitename'] = '今日头条'
        even_details_item['es_sid'] = get_str_md5(news_url)
        even_details_item['es_urltitle'] = title
        even_details_item['es_authors'] = author
        even_details_item['es_urlcontent'] = content
        even_details_item['es_urltime'] = url_time
        even_details_item['es_lasttime'] = url_time
        even_details_item['es_urlname'] = news_url
        even_details_item['es_hkey'] = hot_item['hot_id']
        even_details_item['es_urltopic'] = hot_item['hot_word']
        even_details_item['es_video'] = ''
        yield even_details_item
        self.logger.info(f"事件详情-采集成功 '{hot_item['hot_word']}'{news_url}")
    except Exception as e:
        self.logger.error(f"处理新闻页面失败 {news_url}: {e}")
def _extract_text(self, context, selectors: List[str]) -> Optional[str]:
    """Return the stripped text of the first visible, non-empty element
    matching any of the given XPath selectors, or None if nothing matches."""
    for xpath in selectors:
        try:
            for candidate in context.find_elements(By.XPATH, xpath):
                if not candidate.is_displayed():
                    continue
                stripped = candidate.text.strip()
                if stripped:
                    return stripped
        except Exception as e:
            self.logger.debug(f"选择器 '{xpath}' 未匹配: {e}")
    return None
def _extract_content(self, driver) -> str:
    """Extract the article body by trying each CONTENT_SELECTORS XPath in
    turn; returns paragraphs joined with newlines, or "" when nothing found."""
    try:
        time.sleep(2)  # wait for lazily-loaded content
        content_lines = []
        for selector in self.CONTENT_SELECTORS:
            try:
                paragraphs = driver.find_elements(By.XPATH, selector)
                for p in paragraphs:
                    # Bug fix: the original did `p[0].text` for the
                    # '.weitoutiao-html' selector, but `p` is a WebElement
                    # (not a list), so indexing raised TypeError and silently
                    # dropped that selector's content via the debug except.
                    text = p.text.strip()
                    if text != '':
                        content_lines.append(text)
            except Exception as e:
                self.logger.debug(f"选择器 '{selector}' 提取失败: {e}")
        return '\n'.join(content_lines) if content_lines else ""
    except Exception as e:
        self.logger.error(f"提取内容失败: {e}")
        return ""
def _is_valid_content(self, text: str) -> bool:
"""验证内容是否有效"""
if not text or len(text) <= 10:
return False
return not any(pattern in text for pattern in self.SKIP_PATTERNS)
def _extract_time(self, driver) -> Optional[int]:
    """Parse the publish time from the page via TIME_SELECTORS; fall back
    to self.url_time when absent or unparsable."""
    raw_time = self._extract_text(driver, self.TIME_SELECTORS)
    if raw_time:
        try:
            return get_time_stamp(raw_time)
        except Exception as e:
            self.logger.debug(f"时间转换失败: {raw_time}, {e}")
    return self.url_time

View File

@ -1,408 +0,0 @@
# -*- coding: utf-8 -*-
# 标准库
from datetime import datetime, timedelta
import json
import logging as logger
import re
import time
import hashlib
import redis
import scrapy
from redisbloom.client import Client
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_current_timestamp
class LinkedinSpider(scrapy.Spider):
    """Selenium-driven spider that logs into LinkedIn using cookies stored
    in Redis, scrapes the recent-activity feed of configured accounts into
    MediaspidersItem objects, and optionally crawls post comments."""
    name = 'LinkedinUserSpider'
    # Queue of posts whose comments still need crawling.
    # NOTE(review): class-level mutable list — shared across all instances.
    comment_urls = []
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/linkedin',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
        },
        'DOWNLOAD_DELAY': 2,
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
        }
    }

    def __init__(self, params=None, *args, **kwargs):
        """Accept a JSON string of runtime parameters (expects 'job_id').

        NOTE(review): json.loads(None) raises TypeError — despite the
        default, `params` is effectively required.
        """
        super(LinkedinSpider, self).__init__(*args, **kwargs)
        json_params = json.loads(params)
        logger.info(json_params)
        self.crawl_comment = False
        self.redis_client = None
        self.bloom_filter = None
        self.simhash_filter_key = None
        if 'job_id' in json_params:
            self.job_id = json_params['job_id']

    def start_requests(self):
        """Connect Redis / RedisBloom clients, then bootstrap Selenium with
        a neutral page before the real login in parse()."""
        self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                        password=self.settings['REDIS_PWD'])
        self.bloom_filter = Client(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                   password=self.settings['REDIS_PWD'])
        self.simhash_filter_key = self.settings['LINKEDIN_SIMHASH_FILTER_KEY']
        yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)

    def parse(self, response):
        """Attach stored cookies to a fresh LinkedIn session, then scrape each
        target user's recent-activity feed and yield post items."""
        logger.info("login linkedin")
        driver = response.request.meta['driver']
        driver.maximize_window()
        # Visit the main domain first, then set cookies
        driver.get("https://www.linkedin.com/")
        time.sleep(2)
        # Add cookies (domain must be .linkedin.com)
        cookie_string = self.redis_client.get("MediaSpiders:Linkedin_Cookies").decode()
        cookie_dict = self.form_cookie_dict(cookie_string)  # existing helper below
        # Convert to the format Selenium requires (must include domain and path)
        cookies_to_add = []
        for name, value in cookie_dict.items():
            cookies_to_add.append({
                'name': name,
                'value': value,
                'domain': '.linkedin.com',
                'path': '/',
                'secure': True
            })
        for cookie in cookies_to_add:
            try:
                driver.add_cookie(cookie)
            except Exception as e:
                logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
        driver.refresh()
        time.sleep(5)
        # Fetch target accounts and request them one by one
        # account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
        # account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
        # post_data = {
        #     'userType': self.settings['FACEBOOK_USER_TYPE'],
        #     'userFlag': 0
        # }
        #
        # account_rsp = json.loads(
        #     http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
        # all_user_info = []
        # if account_rsp['code'] == 200:
        #     all_user_info = account_rsp['content']
        #     logger.info('GET %s users' % account_rsp['message'])
        # driver.set_window_size(1920, 1080)
        # NOTE(review): hard-coded single test account — the API-driven
        # account list above is commented out.
        all_user_info = [
            {'id': 87, 'userFlag': '0', 'userName': 'andrewyng', 'userType': '2', 'userUid': 'USForcesJapan.J'}]
        for user_info in all_user_info:
            user_name = user_info['userName']
            # Fix 2: trailing space removed from the URL
            current_url = f'https://www.linkedin.com/in/{user_name}/recent-activity/all/'
            driver.get(current_url)
            time.sleep(5)
            # Fix 3: incremental scrolling instead of fixed-coordinate clicks
            self.smart_scroll(driver, max_scrolls=5)
            # XPath targets modern LinkedIn dynamic feed cards
            current_page_articles = driver.find_elements(
                By.XPATH,
                "//div[contains(@class, 'feed-shared-update-v2')]"
            )
            logger.info(f"Found {len(current_page_articles)} articles for {user_name}")
            items = self.get_linkedin_articles(current_page_articles, user_name, user_info['userUid'])
            for item in items:
                # Queue posts with comments for the (optional) comment pass
                if item.get('es_commentcount', 0) > 0:
                    self.comment_urls.append({
                        'url': item['es_urlname'],
                        'article_id': item['es_sid'],
                        'article_author': item['es_authors'],
                        'article_text': item['es_urlcontent']
                    })
                logger.info(f"用户 {item['es_userid']} 发文: {item['es_urlcontent'][:50]}...")
                yield item
        # Comment crawling must be designed separately (LinkedIn comments
        # require a click to expand)
        if self.crawl_comment and self.comment_urls:
            comment_url = self.comment_urls.pop()
            yield SeleniumRequest(
                url=comment_url['url'],
                callback=self.linkedin_comment_parse,
                meta={'article_id': comment_url['article_id'], 'driver': driver}
            )

    def smart_scroll(self, driver, max_scrolls=5):
        """Scroll to the bottom up to `max_scrolls` times, stopping early when
        the page height no longer grows (no new content loaded)."""
        last_height = driver.execute_script("return document.body.scrollHeight")
        for i in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # wait for dynamic loading
            # Check whether new content was appended
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                logger.info(f"滚动条 {i + 1}:未加载新内容,停止")
                break
            last_height = new_height
            logger.info(f"滚动条 {i + 1}:加载了新内容到高度 {new_height}")

    def get_linkedin_articles(self, articles, Uname, uid):
        """Convert feed-card WebElements into MediaspidersItem objects.

        Args:
            articles: list of feed-card WebElements.
            Uname: fallback author name when the card shows none.
            uid: the account's userUid, stored as es_userid.
        Returns:
            list of populated items (deduplicated and non-empty only).
        """
        article_items = []
        for idx, article in enumerate(articles):
            try:
                # === 1. Author name ===
                try:
                    author_elem = article.find_element(By.XPATH,
                                                       ".//span[contains(@class, 'update-components-actor__title')]//span[@aria-hidden='true']")
                    uname = author_elem.text.strip()
                except:
                    uname = Uname
                # === 2. Publish time (relative time -> absolute timestamp) ===
                try:
                    time_elem = article.find_element(By.XPATH,
                                                     ".//span[contains(@class, 'update-components-actor__sub-description')]")
                    # NOTE(review): str.split('') with an empty separator
                    # raises ValueError — a separator character (likely '•')
                    # appears to have been lost; confirm against the page.
                    relative_time = time_elem.text.split('')[0].strip()  # e.g. "1 个月前"
                    article_time = self.parse_linkedin_relative_time(relative_time)
                except Exception as e:
                    logger.warning(f"Time parse failed: {e}")
                    article_time = get_current_timestamp() - 86400000  # default: 24h ago
                # === 3. Body text (handles posts with a "see more" button) ===
                try:
                    # Extract body (merge multiple paragraphs)
                    content_parts = article.find_elements(By.XPATH,
                                                          ".//div[contains(@class, 'update-components-text')]//span[@dir='ltr']")
                    article_text = " ".join([p.text for p in content_parts if p.text.strip()])
                except:
                    article_text = ""
                # === 4. Post link (derived from the data-urn attribute) ===
                try:
                    # Build the canonical feed-update URL
                    activity_urn = article.get_attribute("data-urn")
                    url_name = f"https://www.linkedin.com/feed/update/{activity_urn}"
                except:
                    # NOTE(review): this fallback assigns `article_url`, but
                    # the item below reads `url_name` — on this path
                    # `url_name` is unbound and building the item raises
                    # NameError (then swallowed by the outer except).
                    article_url = f"https://www.linkedin.com/in/{uname}/"
                # === 5. Images ===
                img_urls = []
                try:
                    img_urls = [
                        img.get_attribute('data-delayed-url').strip()
                        for img in
                        article.find_elements(By.XPATH, ".//img[contains(@class, 'update-components-image__image')]")
                        if img.get_attribute('data-delayed-url')
                    ]
                except:
                    pass
                # === 6. Engagement counts (parsed from visible text) ===
                like_count = comment_count = forward_count = 0
                try:
                    # Likes
                    like_btn = article.find_element(By.XPATH,
                                                    ".//span[contains(@class, 'social-details-social-counts')]").text
                    like_count = self.extract_number(like_btn)
                    # Comments
                    comment_btn = article.find_element(By.XPATH, ".//button[contains(@aria-label, '评论')]").text
                    comment_count = self.extract_number(comment_btn)
                    # Reposts
                    repost_btn = article.find_element(By.XPATH,
                                                      ".//button[contains(@aria-label, '转发')]").text
                    forward_count = self.extract_number(repost_btn)
                except Exception as e:
                    logger.debug(f"Interaction count parse failed: {e}")
                try:
                    # === 6b. LinkedIn-specific "degree" badge: locate the
                    # span containing e.g. "• 3 度+".
                    # NOTE(review): contains(., '') is always true in XPath —
                    # a literal character (likely '•') seems to have been
                    # lost from the predicate; confirm against the original.
                    degree_span = article.find_element(
                        By.XPATH,
                        "//span[@aria-hidden='true' and contains(., '') and contains(., '度+')]"
                    )
                    degree_text = degree_span.text.strip()
                except Exception as e:
                    degree_text = ""
                es_content = article_text.replace('[Original text:]', '').strip()
                # === 7. Build the item ===
                article_id = get_str_md5(f"{uname}{article_text}{article_time}")
                item = MediaspidersItem()
                item['es_sid'] = article_id
                item['es_hkey'] = article_id
                item['es_content'] = es_content
                item['es_urlcontent'] = es_content
                item['es_urltime'] = article_time  # note: str if converted upstream, otherwise a millisecond timestamp
                item['es_lasttime'] = get_current_timestamp()
                item['es_loadtime'] = get_current_timestamp()
                item['es_urltitle'] = uname
                item['es_authors'] = uname
                item['es_userid'] = uid
                item['image_urls'] = img_urls
                item['file_urls'] = []
                item['es_urlname'] = url_name
                item['es_commentcount'] = comment_count
                item['es_forwardcount'] = forward_count
                item['es_likecount'] = like_count
                item['es_sitename'] = 'linkedin'
                item['es_srcname'] = 'linkedin'
                item['es_carriertype'] = 'media'
                item['es_heat'] = degree_text
                # De-duplication: only posts older than 48h go through the
                # bloom filter; newer posts are always kept.
                if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
                    if self.bloom_filter.bfAdd(self.settings['LINKEDIN_FILTER_KEY'], article_id) <= 0:
                        logger.info(f"跳过已采集内容: {article_id[:10]}...")
                        continue
                if not item['es_urlcontent']:
                    logger.warning("跳过空内容动态")
                    continue
                article_items.append(item)
                logger.debug(f"Article {idx}: {uname} - {article_text[:30]}...")
            except Exception as e:
                logger.error(f"解析动态失败 (index {idx}): {repr(e)}")
                continue
        logger.info(f"用户 {uid} 共采集 {len(article_items)} 条有效动态")
        return article_items

    def parse_linkedin_relative_time(self, text):
        """Convert a Chinese/English relative time string ("1 个月前",
        "3 hours ago") to a millisecond timestamp; months≈30 days and
        years≈365 days; unknown formats default to 24h ago."""
        now = datetime.now()
        text = text.lower().replace(' ', '')
        if '秒前' in text or 'secondsago' in text:
            seconds = int(re.search(r'\d+', text).group())
            return int((now - timedelta(seconds=seconds)).timestamp() * 1000)
        elif '分钟前' in text or 'minutesago' in text:
            minutes = int(re.search(r'\d+', text).group())
            return int((now - timedelta(minutes=minutes)).timestamp() * 1000)
        elif '小时前' in text or 'hoursago' in text:
            hours = int(re.search(r'\d+', text).group())
            return int((now - timedelta(hours=hours)).timestamp() * 1000)
        elif '天前' in text or 'daysago' in text:
            days = int(re.search(r'\d+', text).group())
            return int((now - timedelta(days=days)).timestamp() * 1000)
        elif '周前' in text or 'weeksago' in text:
            weeks = int(re.search(r'\d+', text).group())
            return int((now - timedelta(weeks=weeks)).timestamp() * 1000)
        elif '月前' in text or 'monthsago' in text:
            months = int(re.search(r'\d+', text).group())
            # Simplification: 1 month ≈ 30 days
            return int((now - timedelta(days=months * 30)).timestamp() * 1000)
        elif '年前' in text or 'yearsago' in text:
            years = int(re.search(r'\d+', text).group())
            return int((now - timedelta(days=years * 365)).timestamp() * 1000)
        else:
            return get_current_timestamp() - 86400000  # default: 24h ago

    def extract_number(self, text):
        """Extract the first integer from text like "1,234 个赞" -> 1234;
        returns 0 when no number is present."""
        try:
            num_str = re.search(r'[\d,]+', text).group().replace(',', '')
            return int(num_str)
        except:
            return 0

    def linkedin_comment_parse(self, response):
        """Expand a post's comment section, scroll to load comments, and
        yield one comment item per visible comment; then chain to the next
        queued post."""
        driver = response.meta['driver']
        article_id = response.meta['article_id']
        # Click the "comments" button to expand the comment section
        try:
            comment_btn = driver.find_element(By.XPATH,
                                              "//button[contains(@class, 'comments-comment-button')]")
            comment_btn.click()
            time.sleep(3)
        except:
            logger.warning("未找到评论按钮,跳过评论爬取")
            return
        # Scroll to load more comments
        self.smart_scroll(driver, max_scrolls=3)
        # Extract the comments
        comment_elements = driver.find_elements(By.XPATH,
                                                "//div[contains(@class, 'comments-comment-item')]")
        for comment in comment_elements:
            try:
                author = comment.find_element(By.XPATH,
                                              ".//span[contains(@class, 'comments-post-meta__name-text')]").text.strip()
                content = comment.find_element(By.XPATH,
                                               ".//span[contains(@class, 'comments-comment-item-content')]").text.strip()
                comment_id = get_str_md5(f"{author}{content}")
                item = MediaspidersItem()
                item['es_sid'] = comment_id
                item['es_hkey'] = article_id
                item['es_content'] = content
                item['es_authors'] = author
                item['es_userid'] = author
                item['es_urltime'] = get_current_timestamp()
                item['es_sitename'] = 'linkedin'
                item['es_srcname'] = 'linkedin_comment'
                item['es_carriertype'] = 'comment'
                yield item
            except:
                continue
        # Continue with the next queued post's comments
        if self.comment_urls:
            next_comment = self.comment_urls.pop()
            yield SeleniumRequest(
                url=next_comment['url'],
                callback=self.linkedin_comment_parse,
                meta={'article_id': next_comment['article_id'], 'driver': driver}
            )

    def form_cookie_dict(self, cookie_str: str) -> dict:
        """Parse a raw "k1=v1; k2=v2" cookie header string into a dict,
        stripping an optional "Cookie:" prefix and surrounding quotes."""
        # Strip the prefix (handles both English and Chinese-style colons)
        for prefix in ["Cookie:", "Cookie"]:
            if cookie_str.startswith(prefix):
                cookie_str = cookie_str[len(prefix):].strip()
                break
        cookie_dict = {}
        for item in cookie_str.split(';'):
            item = item.strip()
            if not item or '=' not in item:
                continue
            name, value = item.split('=', 1)  # split on the first '=' only
            name, value = name.strip(), value.strip()
            # Remove double quotes around the value (Selenium does not want them)
            if value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            cookie_dict[name] = value
        return cookie_dict

View File

@ -1,182 +0,0 @@
# -*- coding: utf-8 -*-
from datetime import datetime
from datetime import timezone, timedelta
import json
import logging as logger
import random
import re
import time
from urllib import parse
import redis
import scrapy
from scrapy_selenium import SeleniumRequest
from MediaSpiders.items import MediaspidersItem, TwitterUserInfoItem
from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.login_utils import login
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
def form_cookie_dict(cookie_string):
    """Parse a raw "k1=v1; k2=v2" cookie header string into a dict.

    Bug fixes over the original: splits on the FIRST '=' only, so values
    that themselves contain '=' (e.g. base64 padding) are preserved; skips
    empty or malformed fragments instead of raising IndexError.
    """
    cookie_dict = {}
    for fragment in cookie_string.split(';'):
        fragment = fragment.strip()
        if '=' not in fragment:
            continue  # skip empty / malformed fragments
        name, value = fragment.split('=', 1)
        cookie_dict[name.strip()] = value.strip()
    return cookie_dict
class TwitterSpider(scrapy.Spider):
    """Spider that fetches Twitter/X user-profile info through the GraphQL
    UserByScreenName endpoint, authenticated with cookies stored in Redis,
    and yields TwitterUserInfoItem objects persisted to MySQL."""
    name = 'TwitterUserInfoSpider'
    custom_settings = {
        'PROTO_SAVE_FILE_NAME': 'public_twitter_user_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/twitteruserinfo',
        'IMAGES_RESULT_FIELD': 'avatar_path',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'MediaSpiders.pipelines.TwitterUserDataSaveToMySQL': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
        }
    }

    def __init__(self, params=None, *args, **kwargs):
        """Accept an optional JSON string with totalNum / authorization /
        job_id overrides."""
        super(TwitterSpider, self).__init__(*args, **kwargs)
        self.total_num = 100
        # NOTE(review): hard-coded bearer token in source — consider moving
        # it to configuration / a secret store.
        self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
        # userUid -> tag label for the known target accounts
        self.tags = {
            "620632841": "媒体实体",  # NYT Chinese
            "1714100357582770176": "媒体实体",  # account "昨天"
            "218434058": "官方代表",  # Takaichi Sanae
            "121669059": "媒体实体",  # yonhapnews
            "8149482": "媒体实体",  # VOA Chinese
            "46574977": "媒体实体",  # WSJ Chinese
            "1260553941714186241": "名人",  # "Teacher Li is not your teacher"
            "106379129": "官方代表",  # Lee Jae-myung
        }
        if params:
            json_params = json.loads(params)
            if 'totalNum' in json_params:
                self.total_num = int(json_params['totalNum'])
            if 'authorization' in json_params:
                self.authorization = json_params['authorization']
            if 'job_id' in json_params:
                self.job_id = json_params['job_id']

    def start_requests(self):
        """Bootstrap Selenium with a neutral page; real setup happens in
        login_twitter()."""
        yield SeleniumRequest(url='https://www.google.com/', callback=self.login_twitter)

    def login_twitter(self, response):
        """Build authenticated API headers from cookies stored in Redis, fetch
        the target-account list, and issue one GraphQL request per account."""
        self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                        password=self.settings['REDIS_PWD'])
        self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
        cookie_string = None
        # Fetch login accounts for crawling
        login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
        # Try automated web login to obtain cookies; on failure, fall back to
        # existing cookies in Redis (automated path currently disabled)
        # try:
        #
        #     driver = login().login_with_selenium(
        #         'https://x.com/i/flow/login',
        #         self.name,
        #         login_users=login_users,
        #         response=response
        #     )
        #     cookies = driver.get_cookies()
        #     # use cookie 'ct0' as x-csrf-token and 'gt' as x-guest-token
        #     self.cookie_dict = {}
        #     for cookie in cookies:
        #         self.cookie_dict[cookie['name']] = cookie['value']
        # except Exception as e:
        #     logger.info("自动化获取cookies失败")
        cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
        self.cookie_dict = form_cookie_dict(cookie_string)
        ct0 = self.cookie_dict.get('ct0')
        if not ct0:
            # Without the CSRF token the API calls cannot succeed — abort
            logger.error("redis中cookie缺失ct0 (CSRF token)")
            return
        self.header = {
            'Host': 'api.twitter.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'content-type': 'application/json',
            'authorization': self.authorization,
            'Origin': 'https://twitter.com',
            'Cookie': cookie_string,
            'X-Csrf-Token': ct0
        }
        self.filter_key = self.settings['TWITTER_FILTER_KEY']
        self.pid_key = self.settings['TWITTER_PID_KEY']
        # NOTE(review): url_key is fetched but not used — the GraphQL URL
        # below hard-codes the query id; presumably it was meant to be
        # interpolated here. Confirm intent.
        url_key = self.redis_client.get("MediaSpiders:Twitter_URL_Key").decode()
        account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
        account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
        post_data = {
            'userType': self.settings['TWITTER_USER_TYPE'],
            'userFlag': 0
        }
        account_rsp = json.loads(
            http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
        all_user_info = []
        if account_rsp['code'] == 200:
            all_user_info = account_rsp['content']
        for user_info in all_user_info:
            graphql_url = f'https://x.com/i/api/graphql/-oaLodhGbbnzJBACb1kk2Q/UserByScreenName?variables=%7B%22screen_name%22%3A%22{user_info["userName"]}%22%2C%22withGrokTranslatedBio%22%3Afalse%7D&features=%7B%22hidden_profile_subscriptions_enabled%22%3Atrue%2C%22profile_label_improvements_pcf_label_in_post_enabled%22%3Atrue%2C%22responsive_web_profile_redirect_enabled%22%3Afalse%2C%22rweb_tipjar_consumption_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22subscriptions_verification_info_is_identity_verified_enabled%22%3Atrue%2C%22subscriptions_verification_info_verified_since_enabled%22%3Atrue%2C%22highlights_tweets_tab_ui_enabled%22%3Atrue%2C%22responsive_web_twitter_article_notes_tab_enabled%22%3Atrue%2C%22subscriptions_feature_can_gift_premium%22%3Atrue%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%7D&fieldToggles=%7B%22withPayments%22%3Afalse%2C%22withAuxiliaryUserLabels%22%3Atrue%7D'
            # NOTE(review): hard-coded local proxy in request meta — verify
            # this is intended outside the development environment.
            yield scrapy.Request(url=graphql_url, callback=self.parse,
                                 meta={
                                     'uid': user_info['userUid'],
                                     'uname': user_info['userName'],
                                     'proxy': 'http://127.0.0.1:10809',
                                 },
                                 cookies=self.cookie_dict, headers=self.header)

    def parse(self, response):
        """Parse the UserByScreenName GraphQL JSON into a TwitterUserInfoItem."""
        uid = response.request.meta['uid']
        uname = response.request.meta['uname']
        try:
            rsp = json.loads(response.text)
            entries = []  # NOTE(review): unused variable
            instructions = rsp['data']['user']['result']
            item = TwitterUserInfoItem()
            item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item['is_newest'] = 1
            item['platform_type'] = "Twitter"
            item['user_id'] = int(instructions['rest_id'])
            item['nickname'] = instructions['core']['name']
            item['username'] = instructions['core']['screen_name']
            item['user_url'] = f'https://x.com/{uname}'
            item['user_link'] = f'https://x.com/{uname}'
            item['avatar_url'] = instructions['avatar']['image_url']
            item['intro'] = instructions['legacy']['description']
            item['city'] = instructions.get('location', {}).get('location', '').strip()
            item['backgroud_image_url'] = instructions.get('legacy', {}).get('profile_banner_url', '')
            item['image_urls'] = [
                instructions['avatar']['image_url'],
                instructions.get('legacy', {}).get('profile_banner_url', '').strip()
            ]
            try:
                # Convert the account creation time to a datetime
                # NOTE(review): adds an 8-hour offset yet labels the result
                # as UTC — verify the intended timezone handling.
                ts = get_time_stamp(
                    str(instructions['core']['created_at'])) + 8 * 3600 * 1000
                dt = datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
                item['join_date'] = dt.strftime('%Y-%m-%d %H:%M:%S')  # e.g. '2012-06-28 12:25:01'
            except (ValueError, KeyError) as e:
                item['join_date'] = None  # or log it
                # NOTE(review): str + exception raises TypeError here (should
                # be str(e)); the outer bare except then swallows it and the
                # whole item is lost.
                logger.error('时间转换失败:' + e)
            item['signature'] = instructions.get('legacy', {}).get('description', '').strip() or instructions.get('profile_bio', {}).get(
                'description', '').strip()
            item['post_count'] = instructions['legacy']['statuses_count']
            item['follow_count'] = instructions['legacy']['friends_count']
            item['fans_count'] = instructions['legacy']['followers_count']
            item['is_verified'] = str(instructions['is_blue_verified'])
            # NOTE(review): raises KeyError for any uid not in self.tags
            # (swallowed by the bare except below).
            item['tags'] = self.tags[uid]
            verified_type = instructions.get('verification', {}).get('verified_type', None)  # verification type (currently unused)
            yield item
        except:
            # NOTE(review): bare except hides the real failure cause —
            # consider logging the exception itself.
            self.logger.error("解析response错误")

View File

@ -1,7 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import json
import logging as logger import logging as logger
import random
import re import re
import time
from urllib import parse from urllib import parse
import redis import redis
@ -10,18 +12,7 @@ from scrapy_selenium import SeleniumRequest
from MediaSpiders.items import MediaspidersItem from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.http_utils import http_post from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.login_utils import login
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
def form_cookie_dict(cookie_string):
cookie_string_list = cookie_string.split(';')
cookie_dict = {}
for cookie in cookie_string_list:
key = cookie.split('=')[0].replace(' ', '')
cookie_dict[key] = cookie.split('=')[1]
return cookie_dict
class TwitterSpider(scrapy.Spider): class TwitterSpider(scrapy.Spider):
@ -35,8 +26,8 @@ class TwitterSpider(scrapy.Spider):
'IMAGES_RESULT_FIELD': 'es_urlimage', 'IMAGES_RESULT_FIELD': 'es_urlimage',
'FILES_STORE': r'/usr/local/videos', 'FILES_STORE': r'/usr/local/videos',
'FILES_RESULT_FIELD': 'es_video', 'FILES_RESULT_FIELD': 'es_video',
'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称 'ZIP_FILE_NAME': 'image_data_publicinfo_',
'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称 'FILE_ZIP_FILE_NAME': 'image_data_plane_',
'ITEM_PIPELINES': { 'ITEM_PIPELINES': {
'scrapy.pipelines.images.ImagesPipeline': 2, 'scrapy.pipelines.images.ImagesPipeline': 2,
'scrapy.pipelines.files.FilesPipeline': 1, 'scrapy.pipelines.files.FilesPipeline': 1,
@ -47,7 +38,6 @@ class TwitterSpider(scrapy.Spider):
# 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544, # 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
# 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545, # 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
# 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700, # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
} }
} }
@ -72,45 +62,59 @@ class TwitterSpider(scrapy.Spider):
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
password=self.settings['REDIS_PWD']) password=self.settings['REDIS_PWD'])
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY'] self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
logger.info("login twitter")
driver = response.request.meta['driver']
driver.maximize_window()
driver.get('https://twitter.com/i/flow/login')
time.sleep(5)
# 获取采集登录账号并登录 # 获取采集登录账号并登录
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts') login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
# 从redis中 使用已有cookies否则自动化登录网页获取cookies user_list = []
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode() for u in login_users:
ct0 = None user_list.append(json.loads(u.decode()))
if cookie_string: login_user = random.choice(user_list)
self.cookie_dict = form_cookie_dict(cookie_string) logger.info(f"login as user {login_user['uid']}")
# 5. 构建 headers driver.find_element_by_xpath("//input").send_keys(login_user['uid'])
ct0 = self.cookie_dict.get('ct0') try:
if not ct0: next_button = driver.find_element_by_xpath("//div[@role='button'][2]")
logger.error("redis中cookie缺失ct0 (CSRF token)") next_button.click()
return except Exception:
else: logger.info("点击“下一步”的button元素")
try: next_button = driver.find_element_by_xpath("//button[@role='button'][2]")
next_button.click()
driver = login().login_with_selenium( time.sleep(5)
'https://x.com/i/flow/login', try:
self.name, logger.info("输入手机号验证...")
login_users=login_users, driver.find_element_by_xpath("//input[@name='text']").send_keys("+8619962025641")
response=response driver.find_element_by_xpath("//button[@data-testid='ocfEnterTextNextButton']").click()
) time.sleep(5)
cookies = driver.get_cookies() except Exception:
# 取cookie中的ct0为x-csrf-token取gt为x-guest-token logger.info("无需输入手机号验证")
self.cookie_dict = {} driver.find_element_by_xpath("//input[@name='password']").send_keys(login_user['pwd'])
for cookie in cookies: driver.find_element_by_xpath("//button[@data-testid='LoginForm_Login_Button']").click()
self.cookie_dict[cookie['name']] = cookie['value'] time.sleep(5)
except Exception as e: try:
logger.info("自动化获取cookies失败") driver.find_element_by_xpath("//button[@data-testid='confirmationSheetConfirm']").click()
time.sleep(10)
except:
time.sleep(5)
cookies = driver.get_cookies()
# cookies = json.loads(response.text)['cookies']
# 取cookie中的ct0为x-csrf-token取gt为x-guest-token
self.cookie_dict = {}
for cookie in cookies:
self.cookie_dict[cookie['name']] = cookie['value']
self.header = { self.header = {
'Host': 'api.twitter.com', 'Host': 'api.twitter.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
'Accept': '*/*', 'Accept': '*/*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'content-type': 'application/json', 'content-type': 'application/json',
'authorization': self.authorization, 'authorization': self.authorization,
# 'x-twitter-active-user': 'yes',
'Origin': 'https://twitter.com', 'Origin': 'https://twitter.com',
'Cookie': cookie_string, 'Connection': 'keep-alive',
'X-Csrf-Token': ct0 'X-Csrf-Token': self.cookie_dict['ct0']
} }
self.filter_key = self.settings['TWITTER_FILTER_KEY'] self.filter_key = self.settings['TWITTER_FILTER_KEY']
self.pid_key = self.settings['TWITTER_PID_KEY'] self.pid_key = self.settings['TWITTER_PID_KEY']
@ -127,14 +131,9 @@ class TwitterSpider(scrapy.Spider):
if account_rsp['code'] == 200: if account_rsp['code'] == 200:
all_user_info = account_rsp['content'] all_user_info = account_rsp['content']
for user_info in all_user_info: for user_info in all_user_info:
graphql_url = f'https://x.com/i/api/graphql/{url_key}/UserTweets?variables=%7B%22userId%22%3A%22{user_info["userUid"]}%22%2C%22count%22%3A20%2C%22includePromotedContent%22%3Atrue%2C%22withQuickPromoteEligibilityTweetFields%22%3Atrue%2C%22withVoice%22%3Atrue%7D&features=%7B%22rweb_video_screen_enabled%22%3Afalse%2C%22profile_label_improvements_pcf_label_in_post_enabled%22%3Atrue%2C%22responsive_web_profile_redirect_enabled%22%3Afalse%2C%22rweb_tipjar_consumption_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22premium_content_api_read_enabled%22%3Afalse%2C%22communities_web_enable_tweet_community_results_fetch%22%3Atrue%2C%22c9s_tweet_anatomy_moderator_badge_enabled%22%3Atrue%2C%22responsive_web_grok_analyze_button_fetch_trends_enabled%22%3Afalse%2C%22responsive_web_grok_analyze_post_followups_enabled%22%3Atrue%2C%22responsive_web_jetfuel_frame%22%3Atrue%2C%22responsive_web_grok_share_attachment_enabled%22%3Atrue%2C%22responsive_web_grok_annotations_enabled%22%3Afalse%2C%22articles_preview_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3Atrue%2C%22view_counts_everywhere_api_enabled%22%3Atrue%2C%22longform_notetweets_consumption_enabled%22%3Atrue%2C%22responsive_web_twitter_article_tweet_consumption_enabled%22%3Atrue%2C%22tweet_awards_web_tipping_enabled%22%3Afalse%2C%22responsive_web_grok_show_grok_translated_post%22%3Afalse%2C%22responsive_web_grok_analysis_button_from_backend%22%3Atrue%2C%22post_ctas_fetch_enabled%22%3Afalse%2C%22creator_subscriptions_quote_tweet_preview_enabled%22%3Afalse%2C%22freedom_of_speech_not_reach_fetch_enabled%22%3Atrue%2C%22standardized_nudges_misinfo%22%3Atrue%2C%22tweet_with_visibility_results_prefer_gql_limited_actions_p
olicy_enabled%22%3Atrue%2C%22longform_notetweets_rich_text_read_enabled%22%3Atrue%2C%22longform_notetweets_inline_media_enabled%22%3Atrue%2C%22responsive_web_grok_image_annotation_enabled%22%3Atrue%2C%22responsive_web_grok_imagine_annotation_enabled%22%3Atrue%2C%22responsive_web_grok_community_note_auto_translation_is_enabled%22%3Afalse%2C%22responsive_web_enhance_cards_enabled%22%3Afalse%7D&fieldToggles=%7B%22withArticlePlainText%22%3Afalse%7D' graphql_url = f'https://twitter.com/i/api/graphql/{url_key}/UserTweets?variables=%7B%22userId%22%3A%22{user_info["userUid"]}%22%2C%22count%22%3A20%2C%22includePromotedContent%22%3Atrue%2C%22withQuickPromoteEligibilityTweetFields%22%3Atrue%2C%22withVoice%22%3Atrue%2C%22withV2Timeline%22%3Atrue%7D&features=%7B%22rweb_lists_timeline_redesign_enabled%22%3Atrue%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22tweetypie_unmention_optimization_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3Atrue%2C%22view_counts_everywhere_api_enabled%22%3Atrue%2C%22longform_notetweets_consumption_enabled%22%3Atrue%2C%22responsive_web_twitter_article_tweet_consumption_enabled%22%3Afalse%2C%22tweet_awards_web_tipping_enabled%22%3Afalse%2C%22freedom_of_speech_not_reach_fetch_enabled%22%3Atrue%2C%22standardized_nudges_misinfo%22%3Atrue%2C%22tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled%22%3Atrue%2C%22longform_notetweets_rich_text_read_enabled%22%3Atrue%2C%22longform_notetweets_inline_media_enabled%22%3Atrue%2C%22responsive_web_media_download_video_enabled%22%3Afalse%2C%22responsive_web_enhance_cards_enabled%22%3Afalse%7D&fieldToggles=%7B%22withArticleRichContentSta
te%22%3Afalse%7D'
yield scrapy.Request(graphql_url, callback=self.parse,
yield scrapy.Request(url=graphql_url, callback=self.parse, meta={'uid': user_info['userUid'], 'currentCount': 0},
meta={
'uid': user_info['userUid'],
# 'proxy': 'http://127.0.0.1:10808',
'currentCount': 0
},
cookies=self.cookie_dict, headers=self.header) cookies=self.cookie_dict, headers=self.header)
def parse(self, response): def parse(self, response):
@ -165,65 +164,34 @@ class TwitterSpider(scrapy.Spider):
result = entry['content']['itemContent']['tweet_results']['result'] result = entry['content']['itemContent']['tweet_results']['result']
item['es_userid'] = result['core']['user_results']['result']['rest_id'] item['es_userid'] = result['core']['user_results']['result']['rest_id']
item['es_hkey'] = result['rest_id'] item['es_hkey'] = result['rest_id']
item['es_district'] = result['core']['user_results']['result']['location'] item['es_district'] = result['core']['user_results']['result']['legacy']['location']
screen_name = result['core']['user_results']['result']['core']['screen_name'] screen_name = result['core']['user_results']['result']['legacy']['screen_name']
user_name = result['core']['user_results']['result']['core']['name'] user_name = result['core']['user_results']['result']['legacy']['name']
item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id']) item['es_urlname'] = 'https://twitter.com/%s/status/%s' % (screen_name, result['rest_id'])
item['es_authors'] = screen_name item['es_authors'] = screen_name
item['es_extname'] = user_name item['es_extname'] = user_name
device_html = result['source']
device_type = re.search(r'>([^<]+)</a>', device_html).group(1)
legacy = result['legacy'] legacy = result['legacy']
author_full_text = legacy['full_text'] author_full_text = legacy['full_text']
created_at = legacy['created_at'] created_at = legacy['created_at']
# 评论、转发、点赞数量
item['es_commentcount'] = legacy['reply_count'] item['es_commentcount'] = legacy['reply_count']
item['es_forwardcount'] = legacy['retweet_count'] item['es_forwardcount'] = legacy['retweet_count']
item['es_likecount'] = legacy['favorite_count'] item['es_likecount'] = legacy['favorite_count']
# 评论+ 转发+ 点赞数量 TODO
interaction_count = legacy['reply_count'] + legacy['retweet_count'] + legacy['favorite_count']
# 语种
lang = legacy['lang']
# 推文话题 、 提及
topic = legacy['entities']['hashtags']
mentions = legacy['entities']['user_mentions']
item['es_lasttime'] = get_current_timestamp() item['es_lasttime'] = get_current_timestamp()
item['es_loadtime'] = get_current_timestamp() item['es_loadtime'] = get_current_timestamp()
item['es_urltime'] = get_time_stamp( item['es_urltime'] = get_time_stamp(
str(created_at)) + 8 * 3600 * 1000 # TW默认使用的是零时区转换为北京时间 str(created_at)) + 8 * 3600 * 1000 # TW默认使用的是零时区转换为北京时间
if 'quoted_status_result' in result: if 'quoted_status_result' in result:
item['es_isrepost'] = '1' item['es_isrepost'] = 'yes'
item['es_urltitle'] = author_full_text item['es_urltitle'] = author_full_text
item['es_catalog1'] = author_full_text item['es_catalog1'] = author_full_text
# 判断是否需要翻译
if needs_translation(author_full_text):
item['es_catalog2'] = translate_single(author_full_text)
else:
item['es_catalog2'] = ''
legacy = result['quoted_status_result']['result']['legacy'] legacy = result['quoted_status_result']['result']['legacy']
original_tweet = result['quoted_status_result']['result']['rest_id']
self.logger.info('采集引用推文原文信息') self.logger.info('采集引用推文原文信息')
elif 'retweeted_status_result' in legacy: elif 'retweeted_status_result' in legacy:
item['es_isrepost'] = '1' item['es_isrepost'] = 'yes'
legacy = legacy['retweeted_status_result']['result']['legacy'] legacy = legacy['retweeted_status_result']['result']['legacy']
original_tweet = result['retweeted_status_result']['result']['rest_id']
self.logger.info('采集转发推文原文信息') self.logger.info('采集转发推文原文信息')
else: item['es_content'] = legacy['full_text']
item['es_isrepost'] = '0'
original_tweet = ''
self.logger.info('采集原文信息')
item['es_urlcontent'] = legacy['full_text'] item['es_urlcontent'] = legacy['full_text']
# 获取文本
url_content = legacy['full_text']
# 判断是否需要翻译
if needs_translation(url_content):
item['es_content'] = translate_content_with_paragraphs(url_content)
else:
item['es_content'] = ''
# 下载图片 # 下载图片
image_url_list = [] image_url_list = []
if 'entities' in legacy and 'media' in legacy['entities']: if 'entities' in legacy and 'media' in legacy['entities']:

View File

@ -1,309 +0,0 @@
# -*- coding: utf-8 -*-
import json
import logging as logger
import random
import time
from math import ceil
import redis
import requests
import scrapy
from scrapy_selenium import SeleniumRequest
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
WECHAT_USER_TYPE
from MediaSpiders.utils.http_utils import http_post, UA
from MediaSpiders.utils.time_utils import get_current_timestamp
class WechatLinksFetcherSpider(scrapy.Spider):
    """Scrapy spider that logs into the WeChat Official Accounts platform
    (mp.weixin.qq.com) with cookies stored in Redis, searches each configured
    official account, and collects links of recently published articles into
    a Redis set for downstream spiders.
    """
    name = 'WechatLinksFetcherSpider'
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/twitter',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_ship_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
        }
    }

    # Constants (timeouts in seconds; caps on how much is collected per run)
    PAGE_LOAD_TIMEOUT = 10
    ELEMENT_WAIT_TIMEOUT = 5
    MAX_NEWS_PER_HOT = 6
    MAX_HOT_ITEMS = 10

    # Text patterns that mark boilerplate paragraphs to be filtered out
    # (copyright notices, editor/source credits, contact addresses, ...)
    SKIP_PATTERNS = ['版权', '声明', '邮箱', '记者', '编辑', '来源', '投稿', '责任编辑']

    def __init__(self, params=None, *args, **kwargs):
        """Initialize the spider.

        :param params: optional JSON string with 'totalNum', 'authorization'
                       and 'job_id' keys; parse failures are logged, not raised.
        """
        super().__init__(*args, **kwargs)
        self.url_time = get_current_timestamp()
        self.total_num = 0
        self.authorization = None
        self.job_id = None
        self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD)
        if params:
            try:
                json_params = json.loads(params)
                self.total_num = int(json_params.get('totalNum', 0))
                self.authorization = json_params.get('authorization')
                self.job_id = json_params.get('job_id')
            except (json.JSONDecodeError, ValueError) as e:
                self.logger.error(f"解析参数失败: {e}")

    def start_requests(self):
        """Kick off the initial request through scrapy-selenium so that a
        live WebDriver is attached to the response."""
        yield SeleniumRequest(
            url='https://mp.weixin.qq.com/',
            callback=self.parse,
        )

    def parse(self, response):
        """Iterate over stored cookie strings, log into mp.weixin.qq.com with
        each one, then for up to 10 configured official accounts page through
        their article lists and push fresh links into Redis.

        Control flow notes:
        - ``break_flag`` stops the per-account loop early on rate limiting
          ("freq control"), invalid sessions, or when the account was already
          fetched within the last 12 hours.
        - ``update_time_flag`` stops paging an account once an article older
          than the account's last-fetch watermark is seen.
        """
        driver = response.request.meta['driver']
        cookies_key = "MediaSpiders:WeChatLinksFetcher_Cookies"
        cookie_list = self.redis_client.lrange(cookies_key, 0, -1)
        cookie_parts = [
            item.decode('utf-8') if isinstance(item, bytes) else str(item)
            for item in cookie_list
        ]
        # Walk the cookie pool, keeping the index so a bad cookie can be
        # removed from the Redis list later.
        for cookie_index, item in enumerate(cookie_parts):
            try:
                driver.delete_all_cookies()
                driver.get('https://mp.weixin.qq.com/')
                time.sleep(2)
                cookie_string = item
                cookie_dict = parse_cookie_string(cookie_string)
                success_count = 0
                for name, value in cookie_dict.items():
                    if add_cookie_smart(driver, name, value):
                        success_count += 1
                    else:
                        logger.warning(f"跳过 cookie: {name}")
                logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie (索引: {cookie_index})")
                # Reload so the server validates the freshly injected cookies.
                driver.refresh()
                time.sleep(5)
            except Exception as e:
                logger.error(f"使用 cookie 登录时出错: {str(e)}")
                continue
            count_per_account = 200
            total_count = 0
            break_flag = False
            # A successful login redirects to a URL carrying a 'token=' query
            # parameter, which all subsequent API calls require.
            token_index = driver.current_url.rfind('token=')
            token = driver.current_url[token_index + 6:]
            logger.info(f'获取 token 成功!当前 token 为 {token}')
            raw_cookies = driver.get_cookies()
            cookies = {}
            for c in raw_cookies:
                cookies[c['name']] = c['value']
            logger.info(f'获取 cookie 成功!')
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
                'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
                           f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
            }
            # Fetch the list of official accounts to crawl from the backend.
            query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
            post_body = {
                'userType': WECHAT_USER_TYPE,
                'userFlag': 0
            }
            account_rsp = json.loads(
                http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
            official_accounts = []
            if account_rsp['code'] == 200:
                official_accounts = account_rsp['content'][:10]
            for account_line in official_accounts:
                try:
                    if break_flag:
                        break
                    # Default watermark: 500 days back; overridden by the
                    # account's stored 'updateTime' when present.
                    start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000)
                    if 'updateTime' in account_line:
                        start_timestamp = account_line['updateTime']
                    start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000))
                    account = account_line['userName']
                    search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \
                                         f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1'
                    logger.info(f"开始搜索公众号“{account}”...")
                    # Random jitter between requests to avoid rate limiting.
                    time.sleep(3 + random.random())
                    response = requests.get(search_account_api, cookies=cookies, headers=headers)
                    rsp_body = json.loads(response.text)
                    # Article list is paged 5 at a time.
                    index_end = ceil(count_per_account / 5)
                    if 'list' in rsp_body:
                        # Require an exact nickname match among search results.
                        matched_account = {}
                        matched_account_flag = False
                        for item in rsp_body['list']:
                            if item['nickname'] == account:
                                matched_account_flag = True
                                matched_account = item
                                break
                        if not matched_account_flag:
                            logger.info(f"未找到公众号“{account}")
                            continue
                        fake_id = matched_account['fakeid']
                        # Tracks whether the fetched history is still newer
                        # than the earliest allowed timestamp.
                        update_time_flag = True
                        next_start_timestamp = int(time.time() * 1000)
                        for index in range(index_end):
                            if update_time_flag:
                                # Skip accounts already fetched within 12h.
                                if next_start_timestamp - start_timestamp < 12 * 3600 * 1000:
                                    logger.info(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接本次获取结束")
                                    break_flag = True
                                else:
                                    fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
                                                        f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
                                                        f'&lang=zh_CN&f=json&ajax=1'
                                    logger.info(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
                                    time.sleep(3 + random.random())
                                    article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
                                    article_rsp_body = json.loads(article_response.text)
                                    if 'app_msg_list' in article_rsp_body:
                                        for article in article_rsp_body['app_msg_list']:
                                            title = article['title']
                                            link = article['link']
                                            # API returns seconds; convert to ms.
                                            update_time = article['update_time'] * 1000
                                            if update_time > start_timestamp:
                                                total_count += 1
                                                time_str = time.strftime("%Y-%m-%d %H:%M:%S",
                                                                         time.localtime(update_time / 1000))
                                                logger.info(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
                                                            f"发表的文章《{title}》,链接地址:{link}")
                                                self.redis_client.sadd(
                                                    f"MediaSpiders:Wechat_links:{account_line['id']}",
                                                    link)
                                            else:
                                                # Reached an already-seen
                                                # article: stop paging.
                                                update_time_flag = False
                                                break
                                    else:
                                        logger.info(json.dumps(article_rsp_body, ensure_ascii=False))
                                        if 'base_resp' in article_rsp_body:
                                            err_msg = article_rsp_body['base_resp']['err_msg']
                                            if err_msg == "freq control" or err_msg == "invalid session":
                                                logger.info("接口频率限制,稍后再试,本次获取结束")
                                                break_flag = True
                                                # Drop the cookie that just failed.
                                                self._remove_invalid_cookie(cookies_key, cookie_index)
                                                break
                        if not break_flag:
                            # Only the 12h-already-fetched and rate-limit exits
                            # set break_flag; neither of those should advance
                            # the account's fetch watermark.
                            next_start_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                                            time.localtime(next_start_timestamp / 1000))
                            account_line['updateTime'] = next_start_timestamp
                            http_post(SOCIAL_USER_UPDATE_API,
                                      data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
                                      headers={'User-Agent': UA, "Content-Type": "application/json"}
                                      )
                            logger.info(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
                    else:
                        logger.info(json.dumps(rsp_body, ensure_ascii=False))
                        if 'base_resp' in rsp_body:
                            if rsp_body['base_resp']['err_msg'] == "freq control":
                                logger.info("接口频率限制,稍后再试,本次获取结束")
                                break_flag = True
                                # Drop the cookie that just failed.
                                self._remove_invalid_cookie(cookies_key, cookie_index)
                                break
                except Exception as e:
                    logger.info(repr(e))
        self.redis_client.close()
        driver.quit()

    def _remove_invalid_cookie(self, cookies_key, cookie_index):
        """Remove the cookie at ``cookie_index`` from the Redis list.

        Redis lists cannot be deleted by index directly, so the entry is
        first overwritten with a sentinel and then removed by value.
        """
        try:
            # Option 1: mark the slot, then remove the marked entry.
            self.redis_client.lset(cookies_key, cookie_index, "__invalid__")
            self.redis_client.lrem(cookies_key, 1, "__invalid__")
            logger.info(f"已删除无效的cookie索引: {cookie_index}")
            # Option 2 (disabled): drop the whole list when all cookies are bad.
            # cookie_count = self.redis_client.llen(cookies_key)
            # if cookie_count <= 1:
            #     self.redis_client.delete(cookies_key)
            #     logger.info(f"已删除所有cookies: {cookies_key}")
        except Exception as e:
            logger.error(f"删除cookie失败: {e}")
def parse_cookie_string(cookie_str):
    """Parse a raw ``Cookie`` header string into a name -> value dict.

    Segments without an ``=`` are ignored; only the first ``=`` in each
    segment separates name from value, and both sides are whitespace-trimmed.
    """
    pairs = {}
    for chunk in cookie_str.split(';'):
        key, sep, val = chunk.partition('=')
        if sep:
            pairs[key.strip()] = val.strip()
    return pairs
def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
    """Add a single cookie to the driver, falling back through candidate
    domains until one is accepted.

    Strategy by cookie name:
    - WeChat-critical cookies must be set on the exact target domain.
    - Common Tencent cookies may fall back to the parent domains.
    - Everything else is tried host-only first (no ``domain`` key), then on
      the target domain.

    Returns True when the cookie was added, False when every candidate
    domain failed or the driver rejected it for a non-domain reason.
    """
    # WeChat core cookies only work on mp.weixin.qq.com.
    wechat_critical = ['wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token']
    # Generic Tencent cookies may be accepted on a parent domain.
    tencent_common = ['ptui_loginuin', 'RK', 'ptcz', 'ua_id']

    if name in wechat_critical:
        candidates = [target_domain]
    elif name in tencent_common:
        candidates = [target_domain, '.weixin.qq.com', '.qq.com']
    else:
        candidates = [None, target_domain]

    for candidate in candidates:
        payload = {
            'name': name,
            'value': value,
            'path': '/',
            'secure': True
        }
        if candidate:
            payload['domain'] = candidate
        try:
            driver.add_cookie(payload)
        except Exception as exc:
            if 'invalid cookie domain' in str(exc):
                continue  # try the next candidate domain
            return False  # non-domain failure: give up on this cookie
        return True
    return False  # every candidate domain was rejected

View File

@ -2,12 +2,10 @@
import json import json
import time import time
import redis
import scrapy import scrapy
from redisbloom.client import Client from redisbloom.client import Client
from MediaSpiders.items import MediaspidersItem from MediaSpiders.items import MediaspidersItem
from MediaSpiders.spiders.TwitterUserSpider import form_cookie_dict
from MediaSpiders.utils.http_utils import http_post from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.string_utils import find_text from MediaSpiders.utils.string_utils import find_text
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
@ -61,11 +59,6 @@ class WeiboSpider(scrapy.Spider):
account_rsp = json.loads( account_rsp = json.loads(
http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text) http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
self.simhash_filter_key = self.settings['WEIBO_SIMHASH_FILTER_KEY'] self.simhash_filter_key = self.settings['WEIBO_SIMHASH_FILTER_KEY']
# 从 redis 中 获取 微博所需的 cookie
cookie_string = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
password=self.settings['REDIS_PWD']).get("MediaSpiders:Weibo_Cookies").decode()
self.cookie_dict = form_cookie_dict(cookie_string)
all_user_info = [] all_user_info = []
if account_rsp['code'] == 200: if account_rsp['code'] == 200:
all_user_info = account_rsp['content'] all_user_info = account_rsp['content']
@ -74,10 +67,7 @@ class WeiboSpider(scrapy.Spider):
if uid[:6] != '107603': if uid[:6] != '107603':
uid = f'107603{uid}' uid = f'107603{uid}'
yield scrapy.Request('https://m.weibo.cn/api/container/getIndex?containerid=%s' % uid, yield scrapy.Request('https://m.weibo.cn/api/container/getIndex?containerid=%s' % uid,
callback=self.parse, callback=self.parse, meta={'currentCount': 0, 'uid': uid})
meta={'currentCount': 0, 'uid': uid},
cookies=self.cookie_dict
)
def parse(self, response): def parse(self, response):
rsp = json.loads(response.text) rsp = json.loads(response.text)

View File

@ -2,5 +2,3 @@
# #
# Please refer to the documentation for information on how to create and manage # Please refer to the documentation for information on how to create and manage
# your spiders. # your spiders.

View File

@ -1,11 +1,7 @@
import json
import redis
import scrapy import scrapy
import json
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
from MediaSpiders.utils.time_utils import get_current_timestamp from MediaSpiders.items import MediaspidersItem
class HotSearchSpider(scrapy.Spider): class HotSearchSpider(scrapy.Spider):
@ -26,68 +22,42 @@ class HotSearchSpider(scrapy.Spider):
'MediaSpiders.pipelines.ProtobufSavePipeline': 300, 'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
# 'MediaSpiders.pipelines.HotSearchSaveToMySQL': 300 # 'MediaSpiders.pipelines.HotSearchSaveToMySQL': 300
}, },
'SPIDER_MIDDLEWARES': { 'SPIDER_MIDDLEWARES': {},
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
},
'DOWNLOADER_MIDDLEWARES': {}, 'DOWNLOADER_MIDDLEWARES': {},
'BATCH_SAVE_SIZE': 50 'BATCH_SAVE_SIZE': 50
} }
start_urls = [ start_urls = [
# 'https://weibo.com/ajax/side/hotSearch', 'https://weibo.com/ajax/side/hotSearch',
'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc' 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
] ]
def __init__(self, params=None, *args, **kwargs): def __init__(self, params=None, *args, **kwargs):
super(HotSearchSpider, self).__init__(*args, **kwargs) super(HotSearchSpider, self).__init__(*args, **kwargs)
self.job_id = None
self.collected_items = []
if params: if params:
try: json_params = json.loads(params)
json_params = json.loads(params) if 'job_id' in json_params:
if 'job_id' in json_params: self.job_id = json_params['job_id']
self.job_id = json_params['job_id']
if 'max_items' in json_params:
self.max_items = int(json_params['max_items'])
except Exception as e:
self.logger.error(f"解析参数失败: {str(e)}")
def start_requests(self):
"""发起初始请求"""
self.logger.info(f"开始爬取热搜数据任务ID: {self.job_id if self.job_id else 'N/A'}")
self.url_time = get_current_timestamp()
for url in self.start_urls:
yield scrapy.Request(
url=url,
callback=self.parse
)
def parse(self, response): def parse(self, response):
result_array = [] result_array = []
try: if 'weibo.com' in response.url:
if 'weibo.com' in response.url: result_array = parse_weibo_response(response.text)
result_array = parse_weibo_response(response.text) elif 'toutiao.com' in response.url:
elif 'toutiao.com' in response.url: result_array = parse_toutiao_response(response.text)
result_array = parse_toutiao_response(response.text)
for line in result_array: for line in result_array:
hot_search_item = MediaspidersItem() hot_search_item = MediaspidersItem()
hot_search_item['es_carriertype'] = 'hot_search' hot_search_item['es_carriertype'] = 'hot_search'
hot_search_item['es_sid'] = line['id'] hot_search_item['es_sid'] = line['id']
hot_search_item['es_hkey'] = line['hot_id'] hot_search_item['es_hkey'] = line['hot_id']
hot_search_item['es_urltitle'] = line['hot_word'] hot_search_item['es_urltitle'] = line['hot_word']
hot_search_item['es_urlcontent'] = line['hot_word'] hot_search_item['es_urlcontent'] = line['hot_word']
hot_search_item['es_heat'] = line['hot_value'] hot_search_item['es_heat'] = line['hot_value']
hot_search_item['es_catalog'] = line['category'] hot_search_item['es_catalog'] = line['category']
hot_search_item['es_simrank'] = line['realtime_rank'] hot_search_item['es_simrank'] = line['realtime_rank']
hot_search_item['es_sitename'] = line['platform'] hot_search_item['es_sitename'] = line['platform']
hot_search_item['es_urltime'] = line['onboard_time'] hot_search_item['es_urltime'] = line['onboard_time']
hot_search_item['es_lasttime'] = line['crawl_time'] hot_search_item['es_lasttime'] = line['crawl_time']
hot_search_item['es_urlname'] = line['fake_url'] + "&news" hot_search_item['es_urlname'] = line['fake_url']
yield hot_search_item
yield hot_search_item
except Exception as e:
self.logger.exception(f"解析异常: {str(e)}")

View File

@ -87,9 +87,7 @@ def get_format_time(pattern, time_str):
date = result.group(1) date = result.group(1)
time_t = result.group(2) time_t = result.group(2)
date = date.replace('/', '-').replace(".", "-").replace( date = date.replace('/', '-').replace(".", "-").replace(
",", "-").replace("", "-").replace("", "-").replace("", "").replace( ",", "-").replace("", "-").replace("", "-").replace("", "").replace(' ', '-').replace('--', '-')
"", "-").replace("", "-").replace("", "").replace(
' ', '-').replace('--', '-')
date_array = date.split('-') date_array = date.split('-')
for i in range(len(date_array)): for i in range(len(date_array)):
if (date_array[i].endswith('st') or if (date_array[i].endswith('st') or
@ -130,7 +128,7 @@ def get_format_time(pattern, time_str):
if __name__ == '__main__': if __name__ == '__main__':
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日'] # a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
a = ['2026년 1월 6일 화요일 1면 [사진있음]'] a = ['06.10.2023 03:24']
for _ in a: for _ in a:
# print(get_time_stamp(_)) print(get_time_stamp(_))
print(get_time_stamp(_, {r"(\d{4}\d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']})) # print(get_time_stamp(_, {r"(\d{4}年\d{1,2}月\d{2}日)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))

View File

@ -2,13 +2,11 @@ import json
import uuid import uuid
import logging import logging
import time import time
import requests
from MediaSpiders.utils.string_utils import get_str_md5 from MediaSpiders.utils.string_utils import get_str_md5
def parse_weibo_response(rsp_body): def parse_weibo_response(rsp_str):
rsp_body = json.loads(rsp_str)
result_array = [] result_array = []
if rsp_body['ok'] == 1: if rsp_body['ok'] == 1:
realtime_data = rsp_body['data']['realtime'] realtime_data = rsp_body['data']['realtime']
@ -58,7 +56,7 @@ def parse_toutiao_response(rsp_str):
"platform": "今日头条", "platform": "今日头条",
"onboard_time": current_timestamp, "onboard_time": current_timestamp,
"crawl_time": current_timestamp, "crawl_time": current_timestamp,
"fake_url": line['Url'] "fake_url": f"https://www.toutiao.com/hot-event/hot-board/{custom_sid}"
} }
if 'InterestCategory' in line: if 'InterestCategory' in line:
result_line['category'] = ",".join(line['InterestCategory']) result_line['category'] = ",".join(line['InterestCategory'])
@ -68,11 +66,6 @@ def parse_toutiao_response(rsp_str):
logging.info(json.dumps(line, ensure_ascii=False)) logging.info(json.dumps(line, ensure_ascii=False))
return result_array return result_array
def url_response(url):
rsp_str = requests.get(url).text
return json.loads(rsp_str)
if __name__ == "__main__": if __name__ == "__main__":
# rsp_file = open("./toutiao_hot_search.json", 'r', encoding='utf-8') # rsp_file = open("./toutiao_hot_search.json", 'r', encoding='utf-8')

View File

@ -1,170 +0,0 @@
import random
import json
import time
import logging as logger
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
WECHAT_USER_TYPE
from MediaSpiders.utils.http_utils import http_post, UA
class login:
    """Selenium-based login helper for the social-media spiders.

    Picks a random account from a Redis-sourced pool and drives the
    site-specific login flow (Facebook / Twitter / WeChat official
    accounts platform), leaving the driver in a logged-in state so the
    caller can harvest cookies.
    """

    def __init__(self):
        # Site name and login URL are set per call in login_with_selenium.
        self.name = None
        self.url = None

    def login_with_selenium(self, login_url, site_name, login_users=None, response=None, drivers=None):
        """Log into *site_name* at *login_url* using a Selenium driver.

        :param login_url: login page URL
        :param site_name: spider name, e.g. 'FacebookUserSpider'
        :param login_users: iterable of JSON-encoded {'uid','pwd'} byte strings
                            (from Redis); one account is picked at random
        :param response: scrapy-selenium response carrying the driver in meta
        :param drivers: an already-created WebDriver (used when no response)
        :returns: the driver, logged in
        """
        self.name = site_name
        self.url = login_url
        logger.info(f"Starting login to {site_name}...")
        # The driver either rides on a scrapy-selenium response or is passed
        # in directly; exactly one of the two is expected.
        if response is not None:
            driver = response.request.meta['driver']
        elif drivers is not None:
            driver = drivers
        if login_users is not None:
            # Decode the Redis account pool and pick one account at random.
            user_list = [json.loads(u.decode()) for u in login_users]
            self.login_user = random.choice(user_list)
        if self.name == 'FacebookUserSpider':
            self.facebook_login(driver)
        elif self.name == 'TwitterUserSpider' or self.name == 'TwitterUserInfoSpider':
            self.twitter_login(driver)
        elif self.name == 'wechat_links_fetcher':
            self.wechat_links_login(driver)
        time.sleep(10)  # wait for login to settle (could be an explicit wait)
        return driver

    def facebook_login(self, driver):
        """Log into Facebook with the selected account and leave cookies set."""
        driver.maximize_window()
        time.sleep(3)
        driver.get(self.url)
        driver.find_element_by_xpath(
            '//input[@name="email"]').send_keys(self.login_user['uid'])
        driver.find_element_by_xpath(
            '//input[@name="pass"]').send_keys(self.login_user['pwd'])
        driver.find_element_by_xpath('//button[@name="login"]').click()
        time.sleep(10)
        logger.info(f"Logged in to {self.name} as {self.login_user['uid']}")

    def twitter_login(self, driver):
        """Log into Twitter/X with the selected account, simulating human
        typing and mouse movement, handling the optional phone-verification
        and confirmation dialogs."""
        driver.maximize_window()
        time.sleep(3)
        driver.get(self.url)
        # Mask the webdriver fingerprint before the login page scripts run.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
            delete navigator.__proto__.webdriver;
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            '''
        })
        wait = WebDriverWait(driver, 15)
        # Open the login flow in a fresh tab and switch to it.
        time.sleep(5)
        driver.execute_script("window.open('');")
        driver.execute_script("window.open('https://x.com/i/flow/login', '_blank');")
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        logger.info(f"login as user {self.login_user['uid']}")
        # Wait for the username field, then type it character by character
        # with random delays to look human.
        username_input = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[autocomplete="username"]'))
        )
        username = self.login_user['uid']
        for char in username:
            username_input.send_keys(char)
            time.sleep(random.uniform(0.05, 0.2))  # 50-200ms between keystrokes
        time.sleep(random.uniform(0.3, 0.8))
        # Primary strategy: the "Next" button located by its label text.
        try:
            next_button = wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[.//span[contains(text(), 'Next') or contains(text(), '下一步')]]")
                )
            )
            body = driver.find_element(By.TAG_NAME, "body")
            ActionChains(driver).move_to_element_with_offset(body, 100, 100).perform()
            time.sleep(0.5)
            # Move the pointer onto the button before clicking.
            actions = ActionChains(driver)
            actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
        except Exception as e:
            logger.info("主 Next 按钮未找到,尝试备用定位方式")
            try:
                # Fallback: second role=button element on the page.
                next_button = driver.find_element(By.XPATH, "//button[@role='button'][2]")
                actions = ActionChains(driver)
                actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
            except Exception as e2:
                logger.error(f"两种方式均无法点击 Next 按钮: {e}, {e2}")
                raise
        time.sleep(random.uniform(1.5, 5.0))  # wait for the next step to load
        # Optional phone-number verification step.
        try:
            logger.info("输入手机号验证...")
            driver.find_element_by_xpath("//input[@name='text']").send_keys("+8619962025641")
            # BUG FIX: the original passed a WebElement as the XPath argument
            # (find_element_by_xpath(find_element_by_xpath(...))), which can
            # never succeed; locate and click the button directly.
            driver.find_element_by_xpath("//button[.//span[text()='下一步']]").click()
            time.sleep(random.uniform(1.5, 5.0))
        except Exception:
            logger.info("无需输入手机号验证")
        driver.find_element_by_xpath("//input[@name='password']").send_keys(self.login_user['pwd'])
        driver.find_element_by_xpath("//button[@data-testid='LoginForm_Login_Button']").click()
        time.sleep(random.uniform(1.5, 5.0))
        # Optional post-login confirmation sheet.
        try:
            driver.find_element_by_xpath("//button[@data-testid='confirmationSheetConfirm']").click()
            time.sleep(random.uniform(1.5, 10.0))
        except:
            time.sleep(5)
        logger.info(f"Logged in to {self.name} as {self.login_user['uid']}")

    def wechat_links_login(self, driver):
        """Open the WeChat official-accounts platform and poll until the
        operator has completed the QR-code login (URL gains 'token=')."""
        driver.maximize_window()
        driver.get(self.url)
        print("等待打开登录后的页面...")
        while True:
            delay = random.randint(5, 11)
            time.sleep(delay)
            if 'token=' in driver.current_url:
                print("登录成功!")
                logger.info(f"Logged in to {self.name}")
                break

View File

@ -1,67 +1,10 @@
import datetime
import time import time
from datetime import datetime, timezone, timedelta
import re
def get_current_timestamp(): def get_current_timestamp():
return int(time.time() * 1000) return int(time.time() * 1000)
def str_to_timestamp(dt_str: str, tz_offset: int = 8) -> int:
    """Convert a datetime string to a Unix timestamp (seconds).

    Supported inputs:
        - 'YYYY-MM-DD HH:MM'
        - 'YYYY-MM-DD HH:MM:SS'
        - strings containing extra text around the time, e.g.
          "2026-02-27 20:11·头条新锐创作者" (the time part is extracted).

    Args:
        dt_str: input string; the time portion is extracted automatically.
        tz_offset: timezone offset in hours (8 for China Standard Time).

    Returns:
        Unix timestamp as an int (seconds since the epoch).

    Raises:
        ValueError: when no valid time pattern can be extracted or parsed.
    """
    dt_str = dt_str.strip()
    # Extract 'YYYY-MM-DD HH:MM' with an optional ':SS' suffix.
    match = re.search(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}(?::\d{2})?)', dt_str)
    if not match:
        raise ValueError(f"无法从字符串中提取有效时间格式: {dt_str}")
    time_str = match.group(1)
    # The regex guarantees exactly one of two shapes, so a length check is
    # sufficient (19 chars = seconds precision, 16 = minutes precision);
    # the original's extra fallback loop was unreachable dead code.
    fmt = '%Y-%m-%d %H:%M:%S' if len(time_str) == 19 else '%Y-%m-%d %H:%M'
    try:
        dt = datetime.strptime(time_str, fmt)
    except ValueError as e:
        raise ValueError(f"时间格式解析失败: {time_str}") from e
    # Attach the fixed-offset timezone and convert to epoch seconds.
    tz = timezone(timedelta(hours=tz_offset))
    return int(dt.replace(tzinfo=tz).timestamp())
def get_time_stamp(date_str): def get_time_stamp(date_str):
try: try:

View File

@ -1,94 +0,0 @@
from MediaSpiders.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
import requests
import time
from typing import List, Tuple, Optional
from langdetect import detect, LangDetectException
def normalize_newlines(text: str) -> str:
    """Normalize Windows (\\r\\n) and old-Mac (\\r) line endings to \\n."""
    if not text:
        return text
    # Collapse CRLF first so the lone-CR pass cannot double-convert it.
    crlf_fixed = text.replace('\r\n', '\n')
    return crlf_fixed.replace('\r', '\n')
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one chunk of text via the HTTP API; return None on failure.

    Empty/whitespace-only input short-circuits to "" without a request.
    The text is truncated to MAX_TEXT_LENGTH to match the API's limit.
    """
    if not text or not text.strip():
        return ""
    request_body = {
        "text": text[:MAX_TEXT_LENGTH],
        "source_lang": source_lang,
        "target_lang": target_lang
    }
    try:
        rsp = requests.post(TRANSLATE_API_URL, json=request_body, timeout=10)
        rsp.raise_for_status()
        return rsp.json().get("translated_text")
    except Exception as e:
        # Best-effort: callers treat None as "this chunk failed".
        print(f"⚠️ 翻译失败: {e}")
        return None
def translate_content_with_paragraphs(content: str) -> str:
    """
    Translate content paragraph by paragraph with per-paragraph fault
    tolerance: a failed paragraph becomes an empty line while the rest of
    the text is still translated. Blank lines are preserved as-is.
    Returns the re-joined full content.
    """
    if not content:
        return ""
    normalized = normalize_newlines(content)
    translated = []
    for para in normalized.split('\n'):
        if not para.strip():
            # Keep blank lines so paragraph structure survives.
            translated.append("")
            continue
        piece = translate_single(para)
        if piece is None:
            # Paragraph failed: skip it (alternatively keep the original).
            print(f"  ⚠️ 段落翻译失败,跳过: {para[:30]}...")
            translated.append("")
        else:
            translated.append(piece)
        # Throttle between API calls to avoid rate limiting.
        time.sleep(REQUEST_DELAY)
    return '\n'.join(translated)
# ================== 数据库操作 ==================
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Persist a translated title/content pair for one indeximos row.

    Args:
        cursor: an open DB-API cursor (the caller owns commit/rollback).
        es_sid: primary key of the row to update.
        new_title: translated title.
        new_content: translated content.
    """
    # Use standard pyformat placeholders. The original "% s" (with a space)
    # is not a valid DB-API placeholder token and only works by accident of
    # Python %-formatting in some drivers; other drivers reject it.
    update_query = """
        UPDATE indeximos 
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
def needs_translation(text: str) -> bool:
    """
    Decide whether text needs to be translated.

    Returns False for empty/whitespace-only input or when the detected
    language is simplified Chinese ('zh-cn'); returns True otherwise,
    including when detection fails (e.g. digits/punctuation only), which
    conservatively treats the text as needing translation.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        # Nothing to translate.
        return False
    try:
        return detect(stripped) != 'zh-cn'
    except LangDetectException:
        # Undetectable content: translate to be safe.
        return True

View File

@ -2,322 +2,147 @@ import json
import random import random
import time import time
from math import ceil from math import ceil
import logging as logger
from selenium.webdriver.common.by import By
import redis import redis
import requests import requests
from selenium import webdriver from msedge.selenium_tools import Edge
from selenium.webdriver.chrome.options import Options from msedge.selenium_tools import EdgeOptions
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \ from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, SOCIAL_USER_QUERY_ALL_API, SOCIAL_USER_UPDATE_API, \
WECHAT_USER_TYPE WECHAT_USER_TYPE
from MediaSpiders.utils.http_utils import http_post, UA from MediaSpiders.utils.http_utils import http_post, UA
chrome_options = Options() edge_options = EdgeOptions()
# 指定 chrome.exe 的完整路径 edge_options.use_chromium = True
chrome_options.binary_location = "D:/chrome-win64/chrome.exe" driver = Edge(executable_path='msedgedriver.exe', options=edge_options)
# chrome_options.use_chromium = True
driver = webdriver.Chrome(
executable_path=r"D:\chromedriver.exe",
options=chrome_options
)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})" "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
}) })
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD) redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD)
def parse_cookie_string(cookie_str):
    """Parse a 'name=value; name=value' cookie string into a dict."""
    pairs = (part.split('=', 1) for part in cookie_str.split(';') if '=' in part)
    return {name.strip(): value.strip() for name, value in pairs}
def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
    """
    Add one cookie to the browser, trying domains from most to least
    specific and giving up when every candidate is rejected.

    Domain strategy:
      * WeChat core cookies must be scoped to mp.weixin.qq.com exactly.
      * Generic Tencent cookies may fall back to the parent domains.
      * Anything else is tried host-only first, then the target domain.

    Returns:
        bool: True when the cookie was accepted under some domain.
    """
    wechat_critical = ['wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token']
    tencent_common = ['ptui_loginuin', 'RK', 'ptcz', 'ua_id']
    if name in wechat_critical:
        candidates = [target_domain]
    elif name in tencent_common:
        candidates = [target_domain, '.weixin.qq.com', '.qq.com']
    else:
        candidates = [None, target_domain]
    for candidate in candidates:
        payload = {'name': name, 'value': value, 'path': '/', 'secure': True}
        if candidate:
            payload['domain'] = candidate
        try:
            driver.add_cookie(payload)
            return True
        except Exception as exc:
            # Wrong domain: fall through to the next candidate; any other
            # error means this cookie cannot be added at all.
            if 'invalid cookie domain' not in str(exc):
                return False
    return False
if __name__ == "__main__": if __name__ == "__main__":
cookie_list = redis_client.lrange("MediaSpiders:WeChatLinksFetcher_Cookies", 0, -1) count_per_account = 200
cookie_parts = [ total_count = 0
item.decode('utf-8') if isinstance(item, bytes) else str(item) driver.maximize_window()
for item in cookie_list query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
] post_body = {
'userType': WECHAT_USER_TYPE,
# 标记是否需要手动登录 'userFlag': 0
need_manual_login = True }
current_cookie = None account_rsp = json.loads(
http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
if not cookie_parts: official_accounts = []
logger.warning("Redis 中没有可用的 cookie需要手动登录") if account_rsp['code'] == 200:
need_manual_login = True official_accounts = account_rsp['content']
else: driver.get('https://mp.weixin.qq.com/')
# 尝试使用 Redis 中的 cookie 登录 print("等待打开登录后的页面...")
for item in cookie_parts: while True:
current_cookie = item delay = random.randint(5, 11)
try: time.sleep(delay)
driver.delete_all_cookies() if 'token=' in driver.current_url:
driver.get('https://mp.weixin.qq.com/') print("登录成功!")
time.sleep(2) break
break_flag = False
cookie_string = item token_index = driver.current_url.rfind('token=')
cookie_dict = parse_cookie_string(cookie_string) token = driver.current_url[token_index + 6:]
print(f'获取 token 成功!当前 token 为 {token}')
success_count = 0 raw_cookies = driver.get_cookies()
for name, value in cookie_dict.items(): cookies = {}
if add_cookie_smart(driver, name, value): for c in raw_cookies:
success_count += 1 cookies[c['name']] = c['value']
else: print(f'获取 cookie 成功!')
logger.warning(f"跳过 cookie: {name}") headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
logger.info(f"成功添加 {success_count}/{len(cookie_dict)} 个 cookie") 'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
# 验证 cookie 是否有效 f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
driver.refresh() }
time.sleep(5) for account_line in official_accounts:
try:
# 检查是否登录成功 - 通过检查 URL 中是否包含 token 或页面元素 if break_flag:
current_url = driver.current_url break
if 'token=' in current_url: start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000)
logger.info("使用 Redis 中的 cookie 登录成功") if 'updateTime' in account_line:
need_manual_login = False start_timestamp = account_line['updateTime']
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000))
else: account = account_line['userName']
# 二次验证:检查页面上是否有登录状态相关的元素 search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \
try: f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1'
# 检查是否有用户头像或用户名元素 print(f"开始搜索公众号“{account}”...")
driver.find_element(By.CSS_SELECTOR, time.sleep(3 + random.random())
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname") response = requests.get(search_account_api, cookies=cookies, headers=headers)
logger.info("通过页面元素验证,登录成功") rsp_body = json.loads(response.text)
need_manual_login = False index_end = ceil(count_per_account / 5)
if 'list' in rsp_body:
except: matched_account = {}
logger.warning("Cookie 登录失败,尝试下一个 cookie 或手动登录") matched_account_flag = False
except Exception as e: for item in rsp_body['list']:
logger.error(f"使用 cookie 登录时出错: {str(e)}") if item['nickname'] == account:
continue matched_account_flag = True
matched_account = item
# 如果自动登录失败,进行手动登录
if need_manual_login:
logger.info("所有 cookie 均无效,启动手动登录流程")
try:
driver.delete_all_cookies()
driver.get('https://mp.weixin.qq.com/')
time.sleep(2)
# 等待用户手动登录
logger.info("请在浏览器中手动完成登录(扫描二维码)")
logger.info("登录成功后,程序将自动继续执行")
# 设置最长等待时间(例如 120 秒)
max_wait_time = 120
start_time = time.time()
logged_in = False
while time.time() - start_time < max_wait_time:
current_url = driver.current_url
if 'token=' in current_url:
logged_in = True
logger.info("手动登录成功!")
break
# 检查页面元素
try:
driver.find_element(By.CSS_SELECTOR,
".weui-desktop-account__nickname, .userinfo_nickname, .account_nickname")
logged_in = True
logger.info("通过页面元素确认手动登录成功!")
break
except:
time.sleep(2)
if not logged_in:
logger.error(f"等待 {max_wait_time} 秒后仍未登录成功,程序终止")
raise Exception("手动登录超时")
# 获取新的 cookie
raw_cookies = driver.get_cookies()
new_cookie_dict = {}
for c in raw_cookies:
new_cookie_dict[c['name']] = c['value']
# 将字典转换为字符串格式
new_cookie_string = "; ".join([f"{k}={v}" for k, v in new_cookie_dict.items()])
# 更新 Redis 中的 cookie
logger.info("更新 Redis 中的 cookie")
# 删除旧的 cookie
redis_client.delete("MediaSpiders:WeChatLinksFetcher_Cookies")
# 添加新的 cookie
redis_client.lpush("MediaSpiders:WeChatLinksFetcher_Cookies", new_cookie_string)
current_cookie = new_cookie_string
logger.info("Redis cookie 更新成功")
except Exception as e:
logger.error(f"手动登录过程出错: {str(e)}")
raise
count_per_account = 200
total_count = 0
break_flag = False
token_index = driver.current_url.rfind('token=')
token = driver.current_url[token_index + 6:]
print(f'获取 token 成功!当前 token 为 {token}')
raw_cookies = driver.get_cookies()
cookies = {}
for c in raw_cookies:
cookies[c['name']] = c['value']
print(f'获取 cookie 成功!')
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
'Referer': f'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/'
f'appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={token}&lang=zh_CN'
}
query_api = SOCIAL_USER_QUERY_ALL_API.format(sortBy="updateTime", shuffleResult="false")
post_body = {
'userType': WECHAT_USER_TYPE,
'userFlag': 0
}
account_rsp = json.loads(
http_post(query_api, json.dumps(post_body), headers={"Content-Type": "application/json"}).text)
official_accounts = []
if account_rsp['code'] == 200:
official_accounts = account_rsp['content']
for account_line in official_accounts:
try:
if break_flag:
break break
start_timestamp = int((time.time() - 500 * 24 * 3600) * 1000) if not matched_account_flag:
if 'updateTime' in account_line: print(f"未找到公众号“{account}")
start_timestamp = account_line['updateTime'] continue
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_timestamp / 1000)) fake_id = matched_account['fakeid']
account = account_line['userName'] update_time_flag = True # 用于记录获取到的历史列表是否已经超出最早的时间限制
search_account_api = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&' \ next_start_timestamp = int(time.time() * 1000)
f'query={account}&token={token}&lang=zh_CN&f=json&ajax=1' for index in range(index_end):
print(f"开始搜索公众号“{account}”...") if update_time_flag:
time.sleep(3 + random.random()) if next_start_timestamp - start_timestamp < 12 * 3600 * 1000:
response = requests.get(search_account_api, cookies=cookies, headers=headers) print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接本次获取结束")
rsp_body = json.loads(response.text) break_flag = True
index_end = ceil(count_per_account / 5) else:
if 'list' in rsp_body: fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
matched_account = {} f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
matched_account_flag = False f'&lang=zh_CN&f=json&ajax=1'
for item in rsp_body['list']: print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
if item['nickname'] == account: time.sleep(3 + random.random())
matched_account_flag = True article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
matched_account = item article_rsp_body = json.loads(article_response.text)
break if 'app_msg_list' in article_rsp_body:
if not matched_account_flag: for article in article_rsp_body['app_msg_list']:
print(f"未找到公众号“{account}") title = article['title']
continue link = article['link']
fake_id = matched_account['fakeid'] update_time = article['update_time'] * 1000
update_time_flag = True # 用于记录获取到的历史列表是否已经超出最早的时间限制 if update_time > start_timestamp:
next_start_timestamp = int(time.time() * 1000) total_count += 1
for index in range(index_end): time_str = time.strftime("%Y-%m-%d %H:%M:%S",
if update_time_flag: time.localtime(update_time / 1000))
if next_start_timestamp - start_timestamp < 12 * 3600 * 1000: print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
print(f"公众号“{account}”以及后续账号在12小时内已经扫码获取过文章链接本次获取结束") f"发表的文章《{title}》,链接地址:{link}")
break_flag = True redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}", link)
else:
fetch_article_api = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=' \
f'{index * 5}&count=5&fakeid={fake_id}&type=9&query=&token={token}' \
f'&lang=zh_CN&f=json&ajax=1'
print(f"开始获取公众号“{account}”在 {start_time} 后发表的的文章列表...")
time.sleep(3 + random.random())
article_response = requests.get(fetch_article_api, cookies=cookies, headers=headers)
article_rsp_body = json.loads(article_response.text)
if 'app_msg_list' in article_rsp_body:
for article in article_rsp_body['app_msg_list']:
title = article['title']
link = article['link']
update_time = article['update_time'] * 1000
if update_time > start_timestamp:
total_count += 1
time_str = time.strftime("%Y-%m-%d %H:%M:%S",
time.localtime(update_time / 1000))
print(f"[No. {total_count}] 获取到公众号“{account}”在 {time_str} "
f"发表的文章《{title}》,链接地址:{link}")
redis_client.sadd(f"MediaSpiders:Wechat_links:{account_line['id']}",
link)
else:
update_time_flag = False
break
else: else:
print(json.dumps(article_rsp_body, ensure_ascii=False)) update_time_flag = False
if 'base_resp' in article_rsp_body: break
err_msg = article_rsp_body['base_resp']['err_msg'] else:
if err_msg == "freq control" or err_msg == "invalid session": print(json.dumps(article_rsp_body, ensure_ascii=False))
print("接口频率限制,稍后再试,本次获取结束") if 'base_resp' in article_rsp_body:
break_flag = True if article_rsp_body['base_resp']['err_msg'] == "freq control":
break print("接口频率限制,稍后再试,本次获取结束")
break_flag = True
if not break_flag: break
# 本循环内只有12小时内扫过码以及接口频率限制退出会导致 break_flag 为 True这两种情况都不需要更新扫码状态 if not break_flag:
next_start_time = time.strftime("%Y-%m-%d %H:%M:%S", # 本循环内只有12小时内扫过码以及接口频率限制退出会导致 break_flag 为 True这两种情况都不需要更新扫码状态
time.localtime(next_start_timestamp / 1000)) next_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(next_start_timestamp / 1000))
account_line['updateTime'] = next_start_timestamp account_line['updateTime'] = next_start_timestamp
http_post(SOCIAL_USER_UPDATE_API, http_post(SOCIAL_USER_UPDATE_API,
data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'), data=json.dumps(account_line, ensure_ascii=False).encode('utf-8'),
headers={'User-Agent': UA, "Content-Type": "application/json"} headers={'User-Agent': UA, "Content-Type": "application/json"}
) )
print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}") print(f"公众号“{account}”文章获取结束,该账号下一次获取起始时间为 {next_start_time}")
else: else:
print(json.dumps(rsp_body, ensure_ascii=False)) print(json.dumps(rsp_body, ensure_ascii=False))
if 'base_resp' in rsp_body: if 'base_resp' in rsp_body:
if rsp_body['base_resp']['err_msg'] == "freq control": if rsp_body['base_resp']['err_msg'] == "freq control":
print("接口频率限制,稍后再试,本次获取结束") print("接口频率限制,稍后再试,本次获取结束")
break_flag = True break_flag = True
break break
except Exception as e: except Exception as e:
print(repr(e)) print(repr(e))
redis_client.close() redis_client.close()
driver.quit() driver.quit()

View File

@ -1,353 +0,0 @@
import time
import logging as logger
from selenium.webdriver.common.by import By
import redis
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from MediaSpiders.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD
chrome_options = Options()
# Full path to the chrome.exe binary (portable Chrome build).
chrome_options.binary_location = r"D:\chrome-win64\chrome.exe"
# chrome_options.use_chromium = True
driver = webdriver.Chrome(
    executable_path=r"D:\chromedriver-win64\chromedriver.exe",
    options=chrome_options
)
# Hide navigator.webdriver before any page script runs so the site's
# bot detection does not see an automated browser.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})
# Redis connection; decode_responses=True returns str instead of bytes.
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PWD, decode_responses=True)
# Redis list key that stores saved WeChat MP login cookie strings.
COOKIE_KEY = "MediaSpiders:WeChatLinksFetcher_Cookies"
def parse_cookie_string(cookie_str):
    """Parse a raw 'name=value; name=value' cookie header into a dict."""
    parsed = {}
    for fragment in cookie_str.split(';'):
        name, sep, value = fragment.partition('=')
        if sep:
            parsed[name.strip()] = value.strip()
    return parsed
def add_cookie_smart(driver, name, value, target_domain='mp.weixin.qq.com'):
    """
    Add a cookie to the browser session, walking a per-cookie list of
    candidate domains until one is accepted.

    Candidate order:
      * WeChat core cookies: the exact target domain only.
      * Common Tencent cookies: target domain, then the parent domains.
      * Everything else: host-only (no domain key), then the target domain.

    Returns:
        bool: True if any candidate domain accepted the cookie.
    """
    wechat_critical = ['wxuin', 'slave_sid', 'slave_user', 'bizuin', 'data_ticket', 'token']
    tencent_common = ['ptui_loginuin', 'RK', 'ptcz', 'ua_id']
    if name in wechat_critical:
        domain_plan = [target_domain]
    elif name in tencent_common:
        domain_plan = [target_domain, '.weixin.qq.com', '.qq.com']
    else:
        domain_plan = [None, target_domain]
    for dom in domain_plan:
        entry = {'name': name, 'value': value, 'path': '/', 'secure': True}
        if dom:
            entry['domain'] = dom
        try:
            driver.add_cookie(entry)
            return True
        except Exception as err:
            if 'invalid cookie domain' in str(err):
                # Try the next, less specific domain.
                continue
            # Any other failure is terminal for this cookie.
            return False
    return False
def is_cookie_exists(cookie_str):
    """
    Check whether an equivalent cookie is already stored in Redis.

    Two cookies are considered the same account when at least two of the
    WeChat identity fields match, or when 'slave_sid' (the most unique
    field) matches exactly.

    Returns:
        tuple[bool, int]: (exists, duplicate_index); index is -1 when no
        duplicate was found or an error occurred.
    """
    key_fields = ('wxuin', 'slave_sid', 'slave_user', 'bizuin')
    try:
        candidate = parse_cookie_string(cookie_str)
        stored_cookies = redis_client.lrange(COOKIE_KEY, 0, -1)
        for idx, stored in enumerate(stored_cookies):
            try:
                stored_dict = parse_cookie_string(stored)
                shared = [f for f in key_fields
                          if f in candidate and f in stored_dict
                          and candidate[f] == stored_dict[f]]
                # Two or more matching identity fields => same cookie.
                if len(shared) >= 2:
                    return True, idx
                # Fallback: slave_sid alone is unique enough.
                if 'slave_sid' in candidate and \
                        stored_dict.get('slave_sid') == candidate['slave_sid']:
                    return True, idx
            except Exception as e:
                logger.warning(f"解析现有cookie时出错: {e}")
                continue
        return False, -1
    except Exception as e:
        logger.error(f"判断cookie是否存在时出错: {e}")
        return False, -1
def save_cookie_to_redis(cookie_str, force_save=False):
    """
    Store a cookie string in Redis with de-duplication.

    Args:
        cookie_str: raw cookie string.
        force_save: overwrite the stored duplicate instead of skipping it.

    Returns:
        bool: True when the cookie was written (new or overwritten).
    """
    try:
        exists, idx = is_cookie_exists(cookie_str)
        if exists:
            if not force_save:
                logger.info(f"Cookie已存在 (索引: {idx}),跳过保存")
                return False
            # Replace the duplicate in place.
            redis_client.lset(COOKIE_KEY, idx, cookie_str)
            logger.info(f"已更新现有cookie (索引: {idx})")
            return True
        # No duplicate: append to the list.
        redis_client.rpush(COOKIE_KEY, cookie_str)
        logger.info(f"已添加新cookie当前总数: {redis_client.llen(COOKIE_KEY)}")
        return True
    except Exception as e:
        logger.error(f"保存cookie到Redis失败: {e}")
        return False
def cookie_dict_to_string(cookie_dict):
    """Serialize a cookie dict back into 'name=value; name=value' form."""
    parts = [f"{name}={val}" for name, val in cookie_dict.items()]
    return '; '.join(parts)
def manual_login_and_get_cookie():
    """
    Drive a manual QR-code login on mp.weixin.qq.com and collect cookies.

    Returns a dict with keys 'cookie_dict', 'cookie_string', 'token'
    (None when no token= appears in the URL) and 'raw_cookies', or None
    on timeout / failure. Relies on the module-level `driver`.
    """
    logger.info("开始手动扫码登录流程...")
    try:
        # Open the WeChat MP platform home page.
        driver.get("https://mp.weixin.qq.com/")
        time.sleep(3)
        # Wait for the page body to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Already logged in? The URL carries a token= parameter then.
        if "token=" in driver.current_url:
            logger.info("检测到已登录状态直接获取cookie")
        else:
            logger.info("请手动扫描二维码登录...")
            logger.info("等待登录完成...")
            # Wait for the login to complete (URL change carries the token).
            try:
                # Up to 120s for token= to appear in the URL.
                WebDriverWait(driver, 120).until(
                    lambda d: "token=" in d.current_url
                )
                logger.info("检测到登录成功!")
                time.sleep(3)
            except Exception as e:
                logger.error("等待登录超时")
                return None
        # Collect all cookies from the current session.
        cookies = driver.get_cookies()
        if not cookies:
            logger.error("未获取到cookies")
            return None
        # Flatten the Selenium cookie list into name -> value.
        cookie_dict = {}
        for cookie in cookies:
            cookie_dict[cookie['name']] = cookie['value']
        cookie_string = cookie_dict_to_string(cookie_dict)
        # Extract the token from the URL, if present.
        token = None
        if "token=" in driver.current_url:
            token_index = driver.current_url.rfind('token=')
            token = driver.current_url[token_index + 6:]
            logger.info(f"获取到token: {token}")
        logger.info(f"获取到 {len(cookie_dict)} 个cookie")
        return {
            'cookie_dict': cookie_dict,
            'cookie_string': cookie_string,
            'token': token,
            'raw_cookies': cookies
        }
    except Exception as e:
        logger.error(f"手动登录失败: {e}")
        return None
def verify_cookie_valid(cookie_dict, token=None):
    """
    Check whether a cookie dict still authenticates against mp.weixin.qq.com.

    Args:
        cookie_dict: cookie name -> value mapping.
        token: login token; currently unused by the probe request, kept
            for interface compatibility.

    Returns:
        True when the cookie looks valid, False when it is rejected or the
        probe fails; None (falsy) for a 200 JSON response without
        'base_resp' (undetermined — original behavior preserved).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': f'https://mp.weixin.qq.com/',
        }
        # Probe an endpoint that requires a logged-in session.
        test_api = f'https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN'
        response = requests.get(test_api, cookies=cookie_dict, headers=headers, timeout=10)
        if response.status_code == 200:
            # Narrowed from a bare `except:` — only a JSON-decode failure
            # should be treated as "probably still valid HTML page".
            try:
                data = response.json()
            except ValueError:
                logger.info("cookie可能有效")
                return True
            if 'base_resp' in data:
                err_msg = data['base_resp'].get('err_msg', '')
                if err_msg in ['ok', '']:
                    logger.info("cookie验证有效")
                    return True
                logger.warning(f"cookie验证返回错误: {err_msg}")
                return False
            # 200 + JSON without base_resp: undetermined.
            return None
        else:
            logger.warning(f"cookie验证请求失败: {response.status_code}")
            return False
    except Exception as e:
        logger.error(f"验证cookie时出错: {e}")
        return False
def main(type):
    """Fetch a WeChat MP cookie via manual login and persist it to Redis.

    Args:
        type: 'y' to overwrite an existing duplicate cookie, any other
            value to keep the stored one.
            NOTE(review): the parameter name shadows the builtin `type`;
            kept as-is for caller compatibility.
    """
    logger.info("微信公众号Cookie获取工具")
    try:
        # 1. Manual login to obtain the cookie.
        result = manual_login_and_get_cookie()
        if not result:
            logger.error("获取cookie失败")
            return
        cookie_string = result['cookie_string']
        cookie_dict = result['cookie_dict']
        token = result['token']
        # 2. Verify the cookie still authenticates.
        logger.info("正在验证cookie有效性...")
        is_valid = verify_cookie_valid(cookie_dict, token)
        if not is_valid:
            logger.warning("cookie可能无效但仍会保存")
        # 3. De-duplicate against Redis, then save.
        logger.info("正在检查cookie是否已存在...")
        exists, idx = is_cookie_exists(cookie_string)
        if exists:
            logger.info(f"Cookie已存在 (索引: {idx})")
            # Overwrite only when the caller opted in via `type`.
            choice = type
            if choice == 'y':
                saved = save_cookie_to_redis(cookie_string, force_save=True)
                if saved:
                    logger.info("已覆盖更新cookie")
            else:
                logger.info("取消保存")
        else:
            # Save the brand-new cookie.
            saved = save_cookie_to_redis(cookie_string)
            if saved:
                logger.info("新cookie保存成功")
        # 4. Report the current cookie list size.
        total_cookies = redis_client.llen(COOKIE_KEY)
        logger.info(f"当前Redis中cookie总数: {total_cookies}")
    except KeyboardInterrupt:
        logger.info("用户中断程序")
    except Exception as e:
        logger.error(f"程序执行出错: {e}")
    finally:
        driver.quit()
        logger.info("程序结束")
if __name__ == "__main__":
    # Overwrite an existing cookie? 'y': overwrite, 'n': keep existing.
    # NOTE(review): `type` shadows the builtin; kept for compatibility.
    type = 'y'
    # Run the main flow.
    main(type)

View File

@ -4,20 +4,7 @@ import sys
from scrapy.cmdline import execute from scrapy.cmdline import execute
"""
命令行启动
1 Win CMD
D:\dev\code\PythonCode\osc\spiders\MediaSpiders\.venv\Scripts\activate.bat
scrapy crawl FacebookUserSpider -a params="{}"
2Windows PowerShell
D:\dev\code\PythonCode\osc\spiders\MediaSpiders\.venv\Scripts\Activate.ps1
scrapy crawl FacebookUserSpider -a params="{}"
"""
dirpath = os.path.dirname(os.path.abspath(__file__)) dirpath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(dirpath) sys.path.append(dirpath)
# 等效于scrapy crawl FacebookUserSpider -a params="{}" execute(['scrapy', 'crawl', 'FacebookUserSpider', '-a', 'params={}'])
# execute(['scrapy', 'crawl', 'hot_search_spider', '-a', 'params={}'])
execute(['scrapy', 'crawl', 'WechatLinksFetcherSpider', '-a', 'params={}'])

View File

@ -19,16 +19,6 @@ from ShipSpiders.utils.http_utils import http_post
from ShipSpiders.utils.time_utils import to_unix_timestamp from ShipSpiders.utils.time_utils import to_unix_timestamp
def cookie_dict_to_str(dict_cookie):
str_cookie = ""
for line in dict_cookie:
str_cookie += line['name']
str_cookie += "="
str_cookie += line['value']
str_cookie += ";"
return str_cookie[:-1]
class TrackpointsSpider(scrapy.Spider): class TrackpointsSpider(scrapy.Spider):
name = 'shipxy_track' name = 'shipxy_track'
settings = get_project_settings() settings = get_project_settings()
@ -81,22 +71,21 @@ class TrackpointsSpider(scrapy.Spider):
self.driver.get('https://www.shipxy.com/Home/Login') self.driver.get('https://www.shipxy.com/Home/Login')
time.sleep(2) time.sleep(2)
logger.info('Logging in with user_id and password...') logger.info('Logging in with user_id and password...')
pwd_button = self.driver.find_element_by_xpath("//a[text()='密码登录']") pwdbutton = self.driver.find_element_by_xpath("//a[text()='密码登录']")
pwd_button.click() pwdbutton.click()
time.sleep(0.5) time.sleep(0.5)
user_name = self.driver.find_element_by_id('userName') userName = self.driver.find_element_by_id('userName')
user_pwd = self.driver.find_element_by_id('userPWD') userPWD = self.driver.find_element_by_id('userPWD')
button = self.driver.find_element_by_id('loginBtn') button = self.driver.find_element_by_id('loginBtn')
user_name.send_keys(self.settings['SHIPXY_LOGIN_ACCOUNT']) userName.send_keys(self.settings['SHIPXY_LOGIN_ACCOUNT'])
user_pwd.send_keys(self.settings['SHIPXY_LOGIN_PASSWD']) userPWD.send_keys(self.settings['SHIPXY_LOGIN_PASSWD'])
button.click() button.click()
time.sleep(5) time.sleep(5)
self.driver.get('https://www.shipxy.com/') self.driver.get('https://www.shipxy.com/IHS')
logger.info(f"Logged in as {self.settings['SHIPXY_LOGIN_ACCOUNT']}, Updating cookies...") logger.info('Logged in! Updating cookies...')
except: except:
pass pass
self.dict_cookie = self.driver.get_cookies() self.dict_cookie = self.driver.get_cookies()
logger.info(self.dict_cookie)
# logger.info('Getting normal ship trackpoints...') # logger.info('Getting normal ship trackpoints...')
# slat, elat = -90 * 1000000, 90 * 1000000 # slat, elat = -90 * 1000000, 90 * 1000000
@ -139,68 +128,41 @@ class TrackpointsSpider(scrapy.Spider):
mmsi = sensitive_target['targetValue'] mmsi = sensitive_target['targetValue']
track_api = f'https://www.shipxy.com/Ship/GetTrackAll' \ track_api = f'https://www.shipxy.com/Ship/GetTrackAll' \
f'?shipid={mmsi}&btime={btime}&etime={etime}&limit=1&enc=0' f'?shipid={mmsi}&btime={btime}&etime={etime}&limit=1&enc=0'
get_md5_databody = {
"shipid": mmsi,
"btime": btime,
"etime": etime,
"limit": "1",
"enc": "0"
}
md5_token = self.get_md5_token_from_webpage(get_md5_databody)
cookie_str = cookie_dict_to_str(self.dict_cookie)
yield scrapy.Request(url=track_api, callback=self.parse_sensitive_ship, cookies=self.dict_cookie, yield scrapy.Request(url=track_api, callback=self.parse_sensitive_ship, cookies=self.dict_cookie,
meta={'mmsi': mmsi}, meta={'mmsi': mmsi})
headers={
# "Cookie": cookie_str,
"S": md5_token['sign'],
"T": md5_token['timestamp']
})
def get_md5_token_from_webpage(self, data):
js_script = f'return window.R0VOQ1NJR04({data});'
result = self.driver.execute_script(js_script)
return result
def decode_track(self, data):
js_script = f'return analyseAisTrack("{data}").data;'
result = self.driver.execute_script(js_script)
return result
def parse_sensitive_ship(self, response): def parse_sensitive_ship(self, response):
mmsi = response.meta['mmsi'] mmsi = response.meta['mmsi']
rsp_obj = json.loads(response.text) rsp_obj = json.loads(response.text)
if rsp_obj['status'] != 0: if rsp_obj['status'] != 0:
logger.info('[SENSITIVE SHIP] No track data of sensitive ship MMSI: %s' % mmsi) # logger.info('[SENSITIVE SHIP] No track data of sensitive ship MMSI: %s' % mmsi)
logger.info(response.text)
return return
data = rsp_obj['data'] data = rsp_obj['data']
# logger.info(f">>>>>>>>>>>>> data: {data}") tracks = []
# tracks = [] track_decode_api = self.settings['TRACK_DECODE_SERVICE']
# track_decode_api = self.settings['TRACK_DECODE_SERVICE'] retry_times = 1
# retry_times = 1 while retry_times <= 3:
# while retry_times <= 3: try:
# try: decode_data = requests.post(track_decode_api, data=data)
# decode_data = requests.post(track_decode_api, data=data) if decode_data.content == b'500': # 解码器返回错误值为 b'500'
# if decode_data.content == b'500': # 解码器返回错误值为 b'500' logger.warning(
# logger.warning( "解析服务错误!重启服务中... 第 %d" % retry_times)
# "解析服务错误!重启服务中... 第 %d 次" % retry_times) time.sleep(3)
# time.sleep(3) retry_times += 1
# retry_times += 1 continue
# continue else:
# else: tracks = json.loads(decode_data.content)
# tracks = json.loads(decode_data.content) break
# break except:
# except: logger.warning(
# logger.warning( "解析失败3 秒后重试第 %d 次..." % retry_times)
# "解析失败3 秒后重试第 %d 次..." % retry_times) time.sleep(3)
# time.sleep(3) retry_times += 1
# retry_times += 1
# if retry_times > 3:
# if retry_times > 3: logger.warning(
# logger.warning( "[SENSITIVE SHIP] 数据丢失 MMSI: %s" % mmsi)
# "[SENSITIVE SHIP] 数据丢失 MMSI: %s" % mmsi) return
# return
tracks = self.decode_track(data)
logger.info('[SENSITIVE SHIP] MMSI %s%d 个轨迹点 ' % (mmsi, len(tracks))) logger.info('[SENSITIVE SHIP] MMSI %s%d 个轨迹点 ' % (mmsi, len(tracks)))
for track in tracks: for track in tracks:
last_time = track['utc'] * 1000 last_time = track['utc'] * 1000

View File

@ -1,175 +0,0 @@
import time
from typing import List, Tuple, Optional
import pymysql
import requests
# ================== Configuration ==================
# Database connection settings.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a secrets store before sharing/deploying.
DB_CONFIG = {
    'host': '47.113.231.200',
    'port': 28089,
    'user': 'root',
    'password': 'passok123A',
    'database': 'dsp',
    'charset': 'utf8mb4',
    'autocommit': False  # transactions are committed manually in main()
}
# Translation API endpoint (replace with your own server IP or domain).
TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"
# Only rows loaded after this timestamp are processed (YYYY-MM-DD HH:MM:SS).
LOADTIME_AFTER = "2026-01-16 10:40:00"
# Source sites whose articles should be translated.
TARGET_SRCNAMES = [
    'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA==',
    'http://www.kcna.kp/kp/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf',
    'https://energynow.com/category/press_releases/',
    'https://www.fao.org/newsroom/en'  # add your own site here
]
# Delay between API requests (seconds) to avoid being rate-limited.
REQUEST_DELAY = 1
# Maximum text length per request (must match the API's own limit).
MAX_TEXT_LENGTH = 5000
def normalize_newlines(text: str) -> str:
    """Normalize Windows (\\r\\n) and old-Mac (\\r) line endings to \\n.

    Falsy input (empty string or None) is returned unchanged.
    """
    if not text:
        return text
    unified = text.replace('\r\n', '\n')
    return unified.replace('\r', '\n')
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Translate one text segment via the HTTP translation API.

    Returns the translated string, "" for blank input, or None on any
    request/parse failure (callers treat None as "skip this segment").
    """
    if not text or not text.strip():
        return ""
    request_body = {
        "text": text[:MAX_TEXT_LENGTH],  # API rejects over-long payloads
        "source_lang": source_lang,
        "target_lang": target_lang,
    }
    try:
        rsp = requests.post(TRANSLATE_API_URL, json=request_body, timeout=10)
        rsp.raise_for_status()
        return rsp.json().get("translated_text")
    except Exception as e:
        print(f"⚠️ 翻译失败: {e}")
        return None
def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* paragraph by paragraph, tolerating failures.

    Blank lines are preserved as-is; a paragraph whose translation fails
    is replaced by an empty line so the remaining text keeps its layout.
    Returns the re-joined translated document.
    """
    if not content:
        return ""
    pieces = []
    for segment in normalize_newlines(content).split('\n'):
        if not segment.strip():
            pieces.append("")  # keep blank separator lines
            continue
        translated = translate_single(segment)
        if translated is None:
            # Failed segment: drop its text but keep the line slot.
            print(f" ⚠️ 段落翻译失败,跳过: {segment[:30]}...")
            pieces.append("")
        else:
            pieces.append(translated)
        time.sleep(REQUEST_DELAY)  # throttle to avoid API rate limits
    return '\n'.join(pieces)
# ================== 数据库操作 ==================
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
    """Persist a translated title/content pair for one `indeximos` row.

    Uses driver-side parameterized placeholders so values are escaped
    (prevents SQL injection). The caller owns the transaction commit.
    """
    # Fix: the placeholders were written as "% s" (with a space); PyMySQL
    # only documents the plain "%s" format-paramstyle marker.
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
# ================== 主逻辑 ==================
def main():
    """Fetch untranslated rows, translate them via the HTTP API, write back.

    Rows are selected from `indeximos` (newer than LOADTIME_AFTER, empty
    es_title, matching TARGET_SRCNAMES, with a non-trivial es_video). All
    updates are committed in one transaction; any exception rolls the
    whole batch back and is re-raised.
    """
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    try:
        placeholders = ','.join(['%s'] * len(TARGET_SRCNAMES))
        query = f"""
        SELECT es_sid, es_urltitle, es_urlcontent
        FROM indeximos
        WHERE es_loadtime > %s
        AND (es_title IS NULL OR TRIM(es_title) = '')
        AND es_srcname IN ({placeholders})
        AND LENGTH(es_video) > 5
        """
        cursor.execute(query, [LOADTIME_AFTER] + TARGET_SRCNAMES)
        rows: List[Tuple] = cursor.fetchall()
        total = len(rows)
        print(f"✅ 共找到 {total} 条待翻译记录")
        if total == 0:
            return
        success_count = 0
        for idx, (es_sid, urltitle, urlcontent) in enumerate(rows, 1):
            print(f"\n[{idx}/{total}] 处理 es_sid={es_sid}")
            begin = time.time()
            # Title first: a failed title aborts this whole row.
            title_trans = translate_single(urltitle) if urltitle else ""
            if title_trans is None:
                print(" → 标题翻译失败,跳过整条")
                continue
            # Content is translated paragraph-by-paragraph (fault tolerant).
            content_trans = translate_content_with_paragraphs(urlcontent)
            update_record(cursor, es_sid, title_trans, content_trans)
            success_count += 1
            elapsed = time.time() - begin
            print(f" ✅ 翻译成功 | 耗时: {elapsed:.2f}s | 标题: {title_trans[:30]}...")
        conn.commit()
        print(f"\n🎉 完成!成功翻译 {success_count} / {total} 条记录")
    except Exception as e:
        conn.rollback()
        print(f"❌ 发生错误: {e}")
        raise
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    main()

View File

@ -19,7 +19,6 @@ class WebsiteSpiderItem(scrapy.Item):
es_extname = scrapy.Field() es_extname = scrapy.Field()
es_channel = scrapy.Field() es_channel = scrapy.Field()
es_groupname = scrapy.Field() es_groupname = scrapy.Field()
es_title = scrapy.Field()
es_urltitle = scrapy.Field() es_urltitle = scrapy.Field()
es_urltopic = scrapy.Field() es_urltopic = scrapy.Field()
es_lasttime = scrapy.Field() es_lasttime = scrapy.Field()

View File

@ -1,6 +1,6 @@
syntax = "proto3"; syntax = "proto3";
message EsSets message EsSets //es<EFBFBD><EFBFBD>
{ {
repeated Es Es = 1; repeated Es Es = 1;
} }
@ -8,79 +8,78 @@ message EsSets
message Es message Es
{ {
string es_sid = 1; string es_sid = 1; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_subjectId = 2; string es_subjectId = 2; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>id
string es_hkey = 3; string es_hkey = 3; //URLΨһ<EFBFBD><EFBFBD><EFBFBD>
string es_pkey = 4; string es_pkey = 4; //<EFBFBD><EFBFBD>URL<EFBFBD><EFBFBD><EFBFBD>
string es_startid = 5; string es_startid = 5; //<EFBFBD><EFBFBD>ʼ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_urlname = 6; string es_urlname = 6; //URL<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_sitename = 7; string es_sitename = 7; //<EFBFBD><EFBFBD>վ<EFBFBD><EFBFBD>
string es_extname = 8; string es_extname = 8; //<EFBFBD><EFBFBD>׺<EFBFBD><EFBFBD>
string es_channel = 9; string es_channel = 9; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><EFBFBD>
string es_groupname = 10; string es_groupname = 10; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_urltitle = 11; string es_urltitle = 11; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD>ñ<EFBFBD><EFBFBD><EFBFBD>
string es_urltopic = 12; string es_urltopic = 12; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҳ<title><EFBFBD>ñ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ı<EFBFBD><EFBFBD><EFBFBD>
string es_lasttime = 13; string es_lasttime = 13; //<EFBFBD>ɼ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_loadtime = 14; string es_loadtime = 14; //<EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD>ʵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ES<EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD>
string es_urldate = 15; string es_urldate = 15; //<EFBFBD><EFBFBD><EFBFBD>µķ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_urltime = 16; string es_urltime = 16; //<EFBFBD><EFBFBD><EFBFBD>µķ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_srcname = 17; string es_srcname = 17; //<EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD><EFBFBD><EFBFBD>Դ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȱʧ<EFBFBD><EFBFBD>
string es_authors = 18; string es_authors = 18; //<EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ߣ<EFBFBD><EFBFBD><EFBFBD>ȱʧ<EFBFBD><EFBFBD>
string es_district = 19; string es_district = 19; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>µĵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȱʧ<EFBFBD><EFBFBD>
string es_catalog = 20; string es_catalog = 20; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_catalog1 = 21; string es_catalog1 = 21; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_catalog2 = 22; string es_catalog2 = 22; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_keywords = 23; string es_keywords = 23; //<EFBFBD><EFBFBD><EFBFBD>¹ؼ<EFBFBD><EFBFBD>ʣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>½<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ģ<EFBFBD>
string es_abstract = 24; string es_abstract = 24; //<EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD>ժҪ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>½<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ģ<EFBFBD>
string es_simflag = 25; string es_simflag = 25; //<EFBFBD>ظ<EFBFBD><EFBFBD><EFBFBD>ǣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>֮<EFBFBD>ظ<EFBFBD><EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD>HKEY
string es_simrank = 26; string es_simrank = 26; //<EFBFBD><EFBFBD><EFBFBD>ƶ<EFBFBD><EFBFBD><EFBFBD>ֵ
string es_urlimage = 27; string es_urlimage = 27; //ͼƬ<EFBFBD><EFBFBD>ַ
string es_imageflag = 28; string es_imageflag = 28; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͼƬ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ŀ
string es_tableflag = 29; string es_tableflag = 29; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ŀ
string es_doclength = 30; string es_doclength = 30; //<EFBFBD><EFBFBD><EFBFBD>ij<EFBFBD><EFBFBD><EFBFBD>
string es_content = 31; string es_content = 31; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݣ<EFBFBD><EFBFBD><EFBFBD>ͼƬ<EFBFBD><EFBFBD>
string es_urlcontent = 32; string es_urlcontent = 32; //<EFBFBD><EFBFBD>ҳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͼƬ<EFBFBD><EFBFBD>
string es_bbsnum = 33; string es_bbsnum = 33; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_pagelevel = 34; string es_pagelevel = 34; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʼҳ<EFBFBD>ʼ<EFBFBD>IJ<EFBFBD><EFBFBD><EFBFBD>
string es_urllevel = 35; string es_urllevel = 35; //<EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>Ŀ¼<EFBFBD><EFBFBD><EFBFBD>
string es_simhash = 36; string es_simhash = 36; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>simhashֵ
string es_ip = 37; string es_ip = 37; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ip
string es_heat = 38; string es_heat = 38; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȶ<EFBFBD>
string es_similaritycount = 39; string es_similaritycount = 39; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_similarity = 40; string es_similarity = 40; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>id
string es_similaritytime = 41; string es_similaritytime = 41; //<EFBFBD><EFBFBD><EFBFBD>ƶȼ<EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>
string es_emotion = 42; string es_emotion = 42; //<EFBFBD><EFBFBD><EFBFBD>
string es_warningtime = 43; string es_warningtime = 43; //Ԥ<EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>
string es_carriertype = 44; string es_carriertype = 44; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_commentcount = 45; string es_commentcount = 45; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_forwardcount = 46; string es_forwardcount = 46; //ת<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_positiveWords = 47; string es_positiveWords = 47; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_negativeWords = 48; string es_negativeWords = 48; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_negativeProbability = 49; string es_negativeProbability = 49; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_reportinfo = 50; string es_reportinfo = 50; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>ϱ<EFBFBD><EFBFBD><EFBFBD>Ϣ
string es_attention = 51; string es_attention = 51; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>ע
string es_warning = 52; string es_warning = 52; //<EFBFBD>Ƿ<EFBFBD>Ԥ<EFBFBD><EFBFBD>
string es_readsign = 53; string es_readsign = 53; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>Ѷ<EFBFBD>
string es_briefing = 54; string es_briefing = 54; //<EFBFBD>Ƿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_warning_word = 55; string es_warning_word = 55; //Ԥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_attentiontime = 56; string es_attentiontime = 56; //<EFBFBD><EFBFBD>עʱ<EFBFBD><EFBFBD>
string es_collection = 57; string es_collection = 57; //<EFBFBD>Ƿ<EFBFBD><EFBFBD>ղ<EFBFBD>
string es_attachment = 58; string es_attachment = 58; //<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_userid = 59; string es_userid = 59;//number,<EFBFBD>û<EFBFBD>id<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ý<EFBFBD><EFBFBD><EFBFBD>˻<EFBFBD>)
string es_contenttype = 60; string es_contenttype = 60;//string,<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Post<EFBFBD><EFBFBD><EFBFBD>ͣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>status<EFBFBD><EFBFBD>link<EFBFBD><EFBFBD>photo<EFBFBD><EFBFBD>video<EFBFBD><EFBFBD>event<EFBFBD><EFBFBD>music<EFBFBD><EFBFBD>note<EFBFBD><EFBFBD>offer<EFBFBD><EFBFBD>album<EFBFBD>ȣ<EFBFBD>
string es_likecount = 61; string es_likecount = 61;//number,<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_links = 62; string es_links = 62;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>ַ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD>ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>ַ
string es_reactioncount = 63; string es_reactioncount = 63;//number,<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_linkdesc = 64; string es_linkdesc = 64;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD>post <EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊ<EFBFBD><EFBFBD><EFBFBD>ӣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӵ<EFBFBD>һЩ<EFBFBD><EFBFBD>Ϣ
string es_repostuid = 65; string es_repostuid = 65;//number<EFBFBD><EFBFBD>ת<EFBFBD><EFBFBD>ԭ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ߵ<EFBFBD>ID
string es_repostuname =66; string es_repostuname =66;//string<EFBFBD><EFBFBD>ת<EFBFBD><EFBFBD>ԭ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ߵ<EFBFBD>name
string es_repostid = 67; string es_repostid = 67;//string<EFBFBD><EFBFBD>ת<EFBFBD><EFBFBD>ԭ<EFBFBD><EFBFBD>ID
string es_tags = 68; string es_tags = 68;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_mentionsaccount = 69; string es_mentionsaccount = 69;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD>˺<EFBFBD>
string es_video = 70; string es_video = 70;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_isrepost = 71; string es_isrepost = 71;//boolean<EFBFBD><EFBFBD><EFBFBD>Ƿ<EFBFBD>ת<EFBFBD><EFBFBD>
string es_lang = 72; string es_lang = 72;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
string es_client = 73; string es_client = 73;//string<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͻ<EFBFBD><EFBFBD><EFBFBD>
string es_snapshot = 74; string es_snapshot = 74;
string es_title = 75;
} }

File diff suppressed because it is too large Load Diff

View File

@ -18,7 +18,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
package='', package='',
syntax='proto3', syntax='proto3',
serialized_options=None, serialized_options=None,
serialized_pb=b'\n\x08\x45s.proto\"\x19\n\x06\x45sSets\x12\x0f\n\x02\x45s\x18\x01 \x03(\x0b\x32\x03.Es\"\xcc\x0c\n\x02\x45s\x12\x0e\n\x06\x65s_sid\x18\x01 \x01(\t\x12\x14\n\x0c\x65s_subjectId\x18\x02 \x01(\t\x12\x0f\n\x07\x65s_hkey\x18\x03 \x01(\t\x12\x0f\n\x07\x65s_pkey\x18\x04 \x01(\t\x12\x12\n\nes_startid\x18\x05 \x01(\t\x12\x12\n\nes_urlname\x18\x06 \x01(\t\x12\x13\n\x0b\x65s_sitename\x18\x07 \x01(\t\x12\x12\n\nes_extname\x18\x08 \x01(\t\x12\x12\n\nes_channel\x18\t \x01(\t\x12\x14\n\x0c\x65s_groupname\x18\n \x01(\t\x12\x13\n\x0b\x65s_urltitle\x18\x0b \x01(\t\x12\x13\n\x0b\x65s_urltopic\x18\x0c \x01(\t\x12\x13\n\x0b\x65s_lasttime\x18\r \x01(\t\x12\x13\n\x0b\x65s_loadtime\x18\x0e \x01(\t\x12\x12\n\nes_urldate\x18\x0f \x01(\t\x12\x12\n\nes_urltime\x18\x10 \x01(\t\x12\x12\n\nes_srcname\x18\x11 \x01(\t\x12\x12\n\nes_authors\x18\x12 \x01(\t\x12\x13\n\x0b\x65s_district\x18\x13 \x01(\t\x12\x12\n\nes_catalog\x18\x14 \x01(\t\x12\x13\n\x0b\x65s_catalog1\x18\x15 \x01(\t\x12\x13\n\x0b\x65s_catalog2\x18\x16 \x01(\t\x12\x13\n\x0b\x65s_keywords\x18\x17 \x01(\t\x12\x13\n\x0b\x65s_abstract\x18\x18 \x01(\t\x12\x12\n\nes_simflag\x18\x19 \x01(\t\x12\x12\n\nes_simrank\x18\x1a \x01(\t\x12\x13\n\x0b\x65s_urlimage\x18\x1b \x01(\t\x12\x14\n\x0c\x65s_imageflag\x18\x1c \x01(\t\x12\x14\n\x0c\x65s_tableflag\x18\x1d \x01(\t\x12\x14\n\x0c\x65s_doclength\x18\x1e \x01(\t\x12\x12\n\nes_content\x18\x1f \x01(\t\x12\x15\n\res_urlcontent\x18 \x01(\t\x12\x11\n\tes_bbsnum\x18! 
\x01(\t\x12\x14\n\x0c\x65s_pagelevel\x18\" \x01(\t\x12\x13\n\x0b\x65s_urllevel\x18# \x01(\t\x12\x12\n\nes_simhash\x18$ \x01(\t\x12\r\n\x05\x65s_ip\x18% \x01(\t\x12\x0f\n\x07\x65s_heat\x18& \x01(\t\x12\x1a\n\x12\x65s_similaritycount\x18\' \x01(\t\x12\x15\n\res_similarity\x18( \x01(\t\x12\x19\n\x11\x65s_similaritytime\x18) \x01(\t\x12\x12\n\nes_emotion\x18* \x01(\t\x12\x16\n\x0e\x65s_warningtime\x18+ \x01(\t\x12\x16\n\x0e\x65s_carriertype\x18, \x01(\t\x12\x17\n\x0f\x65s_commentcount\x18- \x01(\t\x12\x17\n\x0f\x65s_forwardcount\x18. \x01(\t\x12\x18\n\x10\x65s_positiveWords\x18/ \x01(\t\x12\x18\n\x10\x65s_negativeWords\x18\x30 \x01(\t\x12\x1e\n\x16\x65s_negativeProbability\x18\x31 \x01(\t\x12\x15\n\res_reportinfo\x18\x32 \x01(\t\x12\x14\n\x0c\x65s_attention\x18\x33 \x01(\t\x12\x12\n\nes_warning\x18\x34 \x01(\t\x12\x13\n\x0b\x65s_readsign\x18\x35 \x01(\t\x12\x13\n\x0b\x65s_briefing\x18\x36 \x01(\t\x12\x17\n\x0f\x65s_warning_word\x18\x37 \x01(\t\x12\x18\n\x10\x65s_attentiontime\x18\x38 \x01(\t\x12\x15\n\res_collection\x18\x39 \x01(\t\x12\x15\n\res_attachment\x18: \x01(\t\x12\x11\n\tes_userid\x18; \x01(\t\x12\x16\n\x0e\x65s_contenttype\x18< \x01(\t\x12\x14\n\x0c\x65s_likecount\x18= \x01(\t\x12\x10\n\x08\x65s_links\x18> \x01(\t\x12\x18\n\x10\x65s_reactioncount\x18? 
\x01(\t\x12\x13\n\x0b\x65s_linkdesc\x18@ \x01(\t\x12\x14\n\x0c\x65s_repostuid\x18\x41 \x01(\t\x12\x16\n\x0e\x65s_repostuname\x18\x42 \x01(\t\x12\x13\n\x0b\x65s_repostid\x18\x43 \x01(\t\x12\x0f\n\x07\x65s_tags\x18\x44 \x01(\t\x12\x1a\n\x12\x65s_mentionsaccount\x18\x45 \x01(\t\x12\x10\n\x08\x65s_video\x18\x46 \x01(\t\x12\x13\n\x0b\x65s_isrepost\x18G \x01(\t\x12\x0f\n\x07\x65s_lang\x18H \x01(\t\x12\x11\n\tes_client\x18I \x01(\t\x12\x13\n\x0b\x65s_snapshot\x18J \x01(\t\x12\x10\n\x08\x65s_title\x18K \x01(\tb\x06proto3' serialized_pb=b'\n\x08\x45s.proto\"\x19\n\x06\x45sSets\x12\x0f\n\x02\x45s\x18\x01 \x03(\x0b\x32\x03.Es\"\xba\x0c\n\x02\x45s\x12\x0e\n\x06\x65s_sid\x18\x01 \x01(\t\x12\x14\n\x0c\x65s_subjectId\x18\x02 \x01(\t\x12\x0f\n\x07\x65s_hkey\x18\x03 \x01(\t\x12\x0f\n\x07\x65s_pkey\x18\x04 \x01(\t\x12\x12\n\nes_startid\x18\x05 \x01(\t\x12\x12\n\nes_urlname\x18\x06 \x01(\t\x12\x13\n\x0b\x65s_sitename\x18\x07 \x01(\t\x12\x12\n\nes_extname\x18\x08 \x01(\t\x12\x12\n\nes_channel\x18\t \x01(\t\x12\x14\n\x0c\x65s_groupname\x18\n \x01(\t\x12\x13\n\x0b\x65s_urltitle\x18\x0b \x01(\t\x12\x13\n\x0b\x65s_urltopic\x18\x0c \x01(\t\x12\x13\n\x0b\x65s_lasttime\x18\r \x01(\t\x12\x13\n\x0b\x65s_loadtime\x18\x0e \x01(\t\x12\x12\n\nes_urldate\x18\x0f \x01(\t\x12\x12\n\nes_urltime\x18\x10 \x01(\t\x12\x12\n\nes_srcname\x18\x11 \x01(\t\x12\x12\n\nes_authors\x18\x12 \x01(\t\x12\x13\n\x0b\x65s_district\x18\x13 \x01(\t\x12\x12\n\nes_catalog\x18\x14 \x01(\t\x12\x13\n\x0b\x65s_catalog1\x18\x15 \x01(\t\x12\x13\n\x0b\x65s_catalog2\x18\x16 \x01(\t\x12\x13\n\x0b\x65s_keywords\x18\x17 \x01(\t\x12\x13\n\x0b\x65s_abstract\x18\x18 \x01(\t\x12\x12\n\nes_simflag\x18\x19 \x01(\t\x12\x12\n\nes_simrank\x18\x1a \x01(\t\x12\x13\n\x0b\x65s_urlimage\x18\x1b \x01(\t\x12\x14\n\x0c\x65s_imageflag\x18\x1c \x01(\t\x12\x14\n\x0c\x65s_tableflag\x18\x1d \x01(\t\x12\x14\n\x0c\x65s_doclength\x18\x1e \x01(\t\x12\x12\n\nes_content\x18\x1f \x01(\t\x12\x15\n\res_urlcontent\x18 \x01(\t\x12\x11\n\tes_bbsnum\x18! 
\x01(\t\x12\x14\n\x0c\x65s_pagelevel\x18\" \x01(\t\x12\x13\n\x0b\x65s_urllevel\x18# \x01(\t\x12\x12\n\nes_simhash\x18$ \x01(\t\x12\r\n\x05\x65s_ip\x18% \x01(\t\x12\x0f\n\x07\x65s_heat\x18& \x01(\t\x12\x1a\n\x12\x65s_similaritycount\x18\' \x01(\t\x12\x15\n\res_similarity\x18( \x01(\t\x12\x19\n\x11\x65s_similaritytime\x18) \x01(\t\x12\x12\n\nes_emotion\x18* \x01(\t\x12\x16\n\x0e\x65s_warningtime\x18+ \x01(\t\x12\x16\n\x0e\x65s_carriertype\x18, \x01(\t\x12\x17\n\x0f\x65s_commentcount\x18- \x01(\t\x12\x17\n\x0f\x65s_forwardcount\x18. \x01(\t\x12\x18\n\x10\x65s_positiveWords\x18/ \x01(\t\x12\x18\n\x10\x65s_negativeWords\x18\x30 \x01(\t\x12\x1e\n\x16\x65s_negativeProbability\x18\x31 \x01(\t\x12\x15\n\res_reportinfo\x18\x32 \x01(\t\x12\x14\n\x0c\x65s_attention\x18\x33 \x01(\t\x12\x12\n\nes_warning\x18\x34 \x01(\t\x12\x13\n\x0b\x65s_readsign\x18\x35 \x01(\t\x12\x13\n\x0b\x65s_briefing\x18\x36 \x01(\t\x12\x17\n\x0f\x65s_warning_word\x18\x37 \x01(\t\x12\x18\n\x10\x65s_attentiontime\x18\x38 \x01(\t\x12\x15\n\res_collection\x18\x39 \x01(\t\x12\x15\n\res_attachment\x18: \x01(\t\x12\x11\n\tes_userid\x18; \x01(\t\x12\x16\n\x0e\x65s_contenttype\x18< \x01(\t\x12\x14\n\x0c\x65s_likecount\x18= \x01(\t\x12\x10\n\x08\x65s_links\x18> \x01(\t\x12\x18\n\x10\x65s_reactioncount\x18? \x01(\t\x12\x13\n\x0b\x65s_linkdesc\x18@ \x01(\t\x12\x14\n\x0c\x65s_repostuid\x18\x41 \x01(\t\x12\x16\n\x0e\x65s_repostuname\x18\x42 \x01(\t\x12\x13\n\x0b\x65s_repostid\x18\x43 \x01(\t\x12\x0f\n\x07\x65s_tags\x18\x44 \x01(\t\x12\x1a\n\x12\x65s_mentionsaccount\x18\x45 \x01(\t\x12\x10\n\x08\x65s_video\x18\x46 \x01(\t\x12\x13\n\x0b\x65s_isrepost\x18G \x01(\t\x12\x0f\n\x07\x65s_lang\x18H \x01(\t\x12\x11\n\tes_client\x18I \x01(\t\x12\x13\n\x0b\x65s_snapshot\x18J \x01(\tb\x06proto3'
) )
@ -580,13 +580,6 @@ _ES = _descriptor.Descriptor(
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR), serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='es_title', full_name='Es.es_title', index=74,
number=75, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
], ],
extensions=[ extensions=[
], ],
@ -600,7 +593,7 @@ _ES = _descriptor.Descriptor(
oneofs=[ oneofs=[
], ],
serialized_start=40, serialized_start=40,
serialized_end=1652, serialized_end=1634,
) )
_ESSETS.fields_by_name['Es'].message_type = _ES _ESSETS.fields_by_name['Es'].message_type = _ES

View File

@ -12,9 +12,8 @@ SCHEDULER_PERSIST = True
SELENIUM_DRIVER_NAME = 'firefox' SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = [ SELENIUM_DRIVER_EXECUTABLE_PATH = [
'http://10.55.13.121:28095', 'http://10.55.13.121:28095',
# 'http://10.55.13.108:28095', 'http://10.55.13.108:28095',
'http://10.55.13.3:28095', 'http://10.55.13.3:28095',
'http://74.121.148.204:28095'
] ]
SELENIUM_DRIVER_ARGUMENTS = ['-headless'] # '--headless' if using chrome instead of firefox SELENIUM_DRIVER_ARGUMENTS = ['-headless'] # '--headless' if using chrome instead of firefox
SELENIUM_DRIVER_PREFERENCES = { SELENIUM_DRIVER_PREFERENCES = {
@ -27,10 +26,8 @@ PER_BATCH_IP_USE_TIMES = 5 # 代理中间件每次从ip池获取一批ip
# REDIS_HOST = '38.54.94.107' # REDIS_HOST = '38.54.94.107'
# REDIS_PORT = '28097' # REDIS_PORT = '28097'
# REDIS_HOST = '10.55.13.3' REDIS_HOST = '10.55.13.3'
# REDIS_PORT = '7379' REDIS_PORT = '7379'
REDIS_HOST = '107.182.191.3'
REDIS_PORT = 7379
REDIS_PWD = 'jlkj-841-2-redis' REDIS_PWD = 'jlkj-841-2-redis'
REDIS_PARAMS = { REDIS_PARAMS = {
'password': 'jlkj-841-2-redis', 'password': 'jlkj-841-2-redis',
@ -167,10 +164,3 @@ ITEM_PIPELINES = {
'scrapy.pipelines.images.ImagesPipeline': 2, 'scrapy.pipelines.images.ImagesPipeline': 2,
'WebsiteSpider.pipelines.ProtobufSavePipeline': 300, 'WebsiteSpider.pipelines.ProtobufSavePipeline': 300,
} }
############################## 翻译
MAX_TEXT_LENGTH = 5999
# 翻译 API 地址(替换为你的服务器 IP 或域名)
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
# 单次请求间隔(秒),避免 API 被限流
REQUEST_DELAY = 1

View File

@ -5,9 +5,7 @@ import re
import scrapy import scrapy
import validators import validators
from scrapy_redis.spiders import RedisSpider from scrapy_redis.spiders import RedisSpider
import redis
from WebsiteSpider.settings import REDIS_HOST, REDIS_PORT, REDIS_PWD
from WebsiteSpider.scrapy_selenium import SeleniumRequest from WebsiteSpider.scrapy_selenium import SeleniumRequest
from WebsiteSpider.utils.http_utils import build_url from WebsiteSpider.utils.http_utils import build_url
from WebsiteSpider.utils.parser_utils import parse_item_from_response from WebsiteSpider.utils.parser_utils import parse_item_from_response
@ -20,8 +18,7 @@ class WebsiteInfoCommonSpider(RedisSpider):
super(WebsiteInfoCommonSpider, self).__init__(*args, **kwargs) super(WebsiteInfoCommonSpider, self).__init__(*args, **kwargs)
json_params = json.loads(params) json_params = json.loads(params)
self.name = 'WebSite_' + json_params['clusterName'] self.name = 'WebSite_' + json_params['clusterName']
self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, self.redis_client = None
password=REDIS_PWD)
if 'job_id' in json_params: if 'job_id' in json_params:
self.job_id = json_params['job_id'] self.job_id = json_params['job_id']
@ -39,7 +36,7 @@ class WebsiteInfoCommonSpider(RedisSpider):
# 根据url特征判断是否为内容页若是则解析文本内容 # 根据url特征判断是否为内容页若是则解析文本内容
detail_page_reg = parse_rule['detailPageReg'] detail_page_reg = parse_rule['detailPageReg']
if detail_page_reg == "" or re.search(detail_page_reg, response.url) is not None: if detail_page_reg == "" or re.search(detail_page_reg, response.url) is not None:
yield_flag, webpage_item = parse_item_from_response(response, parse_rule, self.redis_client) yield_flag, webpage_item = parse_item_from_response(response, parse_rule)
if yield_flag: if yield_flag:
yield webpage_item yield webpage_item

View File

@ -89,9 +89,7 @@ def get_format_time(pattern, time_str):
date = result.group(1) date = result.group(1)
time_t = result.group(2) time_t = result.group(2)
date = date.replace('/', '-').replace(".", "-").replace( date = date.replace('/', '-').replace(".", "-").replace(
",", "-").replace("", "-").replace("", "-").replace("", "").replace( ",", "-").replace("", "-").replace("", "-").replace("", "").replace(' ', '-').replace('--', '-')
"", "-").replace("", "-").replace("", "").replace(
' ', '-').replace('--', '-')
date_array = date.split('-') date_array = date.split('-')
for i in range(len(date_array)): for i in range(len(date_array)):
if (date_array[i].endswith('st') or if (date_array[i].endswith('st') or
@ -137,7 +135,7 @@ def get_format_time(pattern, time_str):
if __name__ == '__main__': if __name__ == '__main__':
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日'] # a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
a = ['Wed, 12/03/2025 - 12:00'] a = ['July 26, 2024 12:53 PM']
for _ in a: for _ in a:
print(get_time_stamp(_)) print(get_time_stamp(_))
# print(get_time_stamp(_, {r"(\d{2}.\d{2}.\d{4})\D*(\d{2}\d{2}\d{2})*\D*": ['%d-%m-%Y %H:%M:%S']})) # print(get_time_stamp(_, {r"(\w+ \d+, \d{4})\D*(\d+:\d+)\D*": ['%B-%d-%Y %H:%M:%S']}))

View File

@ -20,11 +20,11 @@ def http_get(url):
return rsp return rsp
def http_post(url, data, headers=None, timeout=60): def http_post(url, data, headers=None):
if headers: if headers:
rsp = requests.post(url, data=data, headers=headers, timeout=timeout) rsp = requests.post(url, data=data, headers=headers)
else: else:
rsp = requests.post(url, data=data, headers={'User-Agent': ua}, timeout=timeout) rsp = requests.post(url, data=data, headers={'User-Agent': ua})
return rsp return rsp

View File

@ -9,10 +9,9 @@ from scrapy.loader import ItemLoader
from WebsiteSpider.items import WebsiteSpiderItem from WebsiteSpider.items import WebsiteSpiderItem
from WebsiteSpider.utils.date_utils import transfer_time_zone, get_time_stamp from WebsiteSpider.utils.date_utils import transfer_time_zone, get_time_stamp
from WebsiteSpider.utils.http_utils import filter_html_tags, build_url from WebsiteSpider.utils.http_utils import filter_html_tags, build_url
from WebsiteSpider.utils.traslate_utils import translate_single, translate_content_with_paragraphs, update_record
def parse_item_from_response(response, parse_rule, redis_client): def parse_item_from_response(response, parse_rule):
current_url = response.url current_url = response.url
allowed_domains = parse_rule['allowDomain'].split(';') allowed_domains = parse_rule['allowDomain'].split(';')
mapping = parse_rule['fieldMappings'] mapping = parse_rule['fieldMappings']
@ -117,7 +116,6 @@ def parse_item_from_response(response, parse_rule, redis_client):
logger.info("urltime: %s" % webpage_item['es_urltime']) logger.info("urltime: %s" % webpage_item['es_urltime'])
except KeyError: except KeyError:
logger.info('时间解析失败当前页面url: %s' % response.url) logger.info('时间解析失败当前页面url: %s' % response.url)
time_parse_rule = None time_parse_rule = None
if 'dateReg' in mapping: if 'dateReg' in mapping:
time_parse_rule = { time_parse_rule = {
@ -157,27 +155,4 @@ def parse_item_from_response(response, parse_rule, redis_client):
logger.info('时间无法解析,解析规则是:' + mapping['es_urltime']) logger.info('时间无法解析,解析规则是:' + mapping['es_urltime'])
if filter_VIP_content: if filter_VIP_content:
logger.info('当前内容是VIP文章并不完整已经过滤。') logger.info('当前内容是VIP文章并不完整已经过滤。')
if yield_flag:
try:
# 1. 从 Redis 获取原始数据
raw_urls = redis_client.lrange('WebsiteSpider:translate_sites', 0, -1)
translate_list = [
url_bytes.decode('utf-8').strip()
for url_bytes in raw_urls
if url_bytes and url_bytes.decode('utf-8').strip()
]
if webpage_item['es_srcname'] in translate_list:
# 翻译标题
webpage_item['es_abstract'] = translate_single(webpage_item['es_urltitle'])
if webpage_item['es_abstract'] is None:
logger.warning(" → 标题翻译失败,跳过整条")
else:
logger.info(f"翻译成功,标题译文长度:{len(webpage_item['es_abstract'])}")
# 翻译内容(按段落,容错)
no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False)
webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content)
logger.info(f"翻译成功,内容译文长度:{len(webpage_item['es_content'])}")
except Exception as e:
logger.error(repr(e))
return yield_flag, webpage_item return yield_flag, webpage_item

View File

@ -3,7 +3,6 @@ import logging as logger
import os import os
import random import random
import requests
from PIL import Image from PIL import Image
from selenium.webdriver import DesiredCapabilities from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.options import Options
@ -26,15 +25,10 @@ def check_session(drive_path):
api = drive_path + '/graphql' api = drive_path + '/graphql'
post_body = '{"query": "{ grid { maxSession, sessionCount } }"}' post_body = '{"query": "{ grid { maxSession, sessionCount } }"}'
try: try:
# 添加超时控制1分钟 = 600秒 response = http_post(api, post_body)
response = http_post(api, post_body, timeout=60)
data_body = json.loads(response.content.decode()) data_body = json.loads(response.content.decode())
session_info = data_body['data']['grid'] session_info = data_body['data']['grid']
return session_info return session_info
except requests.exceptions.Timeout as e:
logger.error("获取地址为 {} 的 Selenium 信息超时超过5分钟".format(drive_path))
logger.error(repr(e))
return None
except Exception as e: except Exception as e:
logger.warning("获取地址为 {} 的 Selenium 信息失败,错误信息是:".format(drive_path)) logger.warning("获取地址为 {} 的 Selenium 信息失败,错误信息是:".format(drive_path))
logger.warning(repr(e)) logger.warning(repr(e))

View File

@ -1,78 +0,0 @@
from WebsiteSpider.settings import MAX_TEXT_LENGTH, TRANSLATE_API_URL, REQUEST_DELAY
import requests
import time
from typing import List, Tuple, Optional
def normalize_newlines(text: str) -> str:
    """Collapse CRLF (\\r\\n) and bare CR (\\r) line endings into LF (\\n).

    Empty or None input is passed through untouched.
    """
    if not text:
        return text
    return text.replace('\r\n', '\n').replace('\r', '\n')
def translate_single(text: str, source_lang: str = "auto", target_lang: str = "zh") -> Optional[str]:
    """Send one text segment to the translation API.

    Blank input short-circuits to "". On any HTTP or JSON error the
    failure is logged and None is returned so callers can skip the
    segment instead of aborting the whole job.
    """
    if not text or not text.strip():
        return ""
    try:
        rsp = requests.post(
            TRANSLATE_API_URL,
            json={
                "text": text[:MAX_TEXT_LENGTH],  # respect API length cap
                "source_lang": source_lang,
                "target_lang": target_lang,
            },
            timeout=10,
        )
        rsp.raise_for_status()
        return rsp.json().get("translated_text")
    except Exception as e:
        print(f"⚠️ 翻译失败: {e}")
        return None
def translate_content_with_paragraphs(content: str) -> str:
    """Translate *content* paragraph by paragraph, tolerating per-paragraph failures.

    Line endings are normalized first, then each '\\n'-delimited paragraph is
    translated independently. Blank lines are kept as-is so the original
    layout survives; a paragraph whose translation fails is replaced with an
    empty line. A delay of REQUEST_DELAY seconds is inserted after every
    translated paragraph to throttle the API.

    Returns the re-joined translated text ("" for empty input).
    """
    if not content:
        return ""
    translated = []
    for paragraph in normalize_newlines(content).split('\n'):
        if not paragraph.strip():
            # Preserve blank lines untouched.
            translated.append("")
            continue
        result = translate_single(paragraph)
        if result is None:
            # Paragraph failed: skip it but keep a placeholder line.
            print(f" ⚠️ 段落翻译失败,跳过: {paragraph[:30]}...")
            translated.append("")
        else:
            translated.append(result)
        time.sleep(REQUEST_DELAY)
    return '\n'.join(translated)
# ================== 数据库操作 ==================
def update_record(cursor, es_sid: int, new_title: str, new_content: str) -> None:
    """Persist a translated title/content pair for one ``indeximos`` record.

    Args:
        cursor: an open DB-API cursor (caller owns commit/close).
        es_sid: primary-key value identifying the row to update.
        new_title: translated title to store in ``es_title``.
        new_content: translated content to store in ``es_content``.

    Uses parameterized ``%s`` placeholders (DB-API 'format' paramstyle) so the
    driver escapes the values. The original wrote ``% s`` with a space, which
    only works by accident with drivers that %-interpolate the query and is
    rejected by drivers that scan for the literal token ``%s``.
    """
    update_query = """
        UPDATE indeximos
        SET es_title = %s, es_content = %s
        WHERE es_sid = %s
    """
    cursor.execute(update_query, (new_title, new_content, es_sid))
if __name__ == "__main__":
    # Manual smoke test: translate a multi-sentence Russian news digest and
    # print the result. Requires the translate API (TRANSLATE_API_URL) to be
    # reachable; on failure each paragraph is skipped and an empty line kept.
    print(translate_content_with_paragraphs("ВСУ провалили наступление на Сумском и Харьковском направлениях, сообщили РИА Новости в силовых структурах. В результате слаженных действий российских бойцов контратаки отражены, а противник обращен в бегство. Введенные ЕС ограничения на передвижения российских дипломатов противоречат Венской конвенции о дипломатических сношениях и мешают нормальной работе дипмиссий. Об этом заявил РИА Новости посол России в Бельгии Денис Гончар. Вице-президент США Джей Ди Вэнс посетит с визитом Армению и Азербайджан. Поездка в Ереван состоится 9-10 февраля, в Баку 10-11 февраля. В Вашингтон Вэнс вернется \"в среду вечером\", сообщает его пресс-пул. Либерально-демократическая партия под руководством премьер-министра Японии Санаэ Такаити победила на выборах в ключевую нижнюю палату парламента. Представители ЛДП получат 316 из 465 мандатов и смогут проводить законопроекты, даже если они не получат поддержки верхней палаты, где партия не имеет большинства. В России самая низкая безработица в странах \"Большой двадцатки\", выяснило РИА Новости, изучив данные национальных статслужб по итогам 2025 года. Уровень безработицы в России в декабре составил 2,2 процента, что на одну десятую процента ниже показателя 2024 года."))

View File

@ -9,4 +9,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(dirpath) sys.path.append(dirpath)
if __name__ == "__main__": if __name__ == "__main__":
execute(['scrapy', 'crawl', 'Website_report_list', '-a', 'params={"job_id":"801","clusterName":"star_4"}']) execute(['scrapy', 'crawl', 'website_info_common', '-a', 'params={"job_id":"801","clusterName":"star_4"}'])