Compare commits

...

2 Commits

Author SHA1 Message Date
yuxin-pc
81cac59adb 实现公司部署处理的业务逻辑 2025-12-26 08:55:50 +08:00
yuxin-pc
970d86ed7d 调用selenium下载网页的方法 2025-12-26 08:54:58 +08:00
23 changed files with 1765 additions and 218 deletions

View File

@ -9,6 +9,16 @@
<facet type="Spring" name="Spring"> <facet type="Spring" name="Spring">
<configuration /> <configuration />
</facet> </facet>
<facet type="jpa" name="JPA">
<configuration>
<setting name="validation-enabled" value="true" />
<setting name="provider-name" value="Hibernate" />
<datasource-mapping>
<factory-entry name="entityManagerFactory" />
</datasource-mapping>
<naming-strategy-map />
</configuration>
</facet>
</component> </component>
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8"> <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" /> <output url="file://$MODULE_DIR$/target/classes" />
@ -20,7 +30,7 @@
</content> </content>
<orderEntry type="inheritedJdk" /> <orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: org.jetbrains:annotations:26.0.2" level="project" /> <orderEntry type="library" name="Maven: org.jetbrains:annotations:26.0.2-1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.18" level="project" /> <orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.18" level="project" />
<orderEntry type="library" name="Maven: com.alibaba:fastjson:1.2.75" level="project" /> <orderEntry type="library" name="Maven: com.alibaba:fastjson:1.2.75" level="project" />
<orderEntry type="library" name="Maven: org.projectlombok:lombok:1.18.16" level="project" /> <orderEntry type="library" name="Maven: org.projectlombok:lombok:1.18.16" level="project" />
@ -30,7 +40,6 @@
<orderEntry type="library" name="Maven: ch.qos.logback:logback-classic:1.2.3" level="project" /> <orderEntry type="library" name="Maven: ch.qos.logback:logback-classic:1.2.3" level="project" />
<orderEntry type="library" name="Maven: ch.qos.logback:logback-core:1.2.3" level="project" /> <orderEntry type="library" name="Maven: ch.qos.logback:logback-core:1.2.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.13.3" level="project" /> <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.30" level="project" /> <orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.30" level="project" />
<orderEntry type="library" name="Maven: jakarta.annotation:jakarta.annotation-api:1.3.5" level="project" /> <orderEntry type="library" name="Maven: jakarta.annotation:jakarta.annotation-api:1.3.5" level="project" />
<orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.27" level="project" /> <orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.27" level="project" />
@ -93,7 +102,6 @@
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.14" level="project" /> <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.14" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpasyncclient:4.1.4" level="project" /> <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpasyncclient:4.1.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore-nio:4.4.14" level="project" /> <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore-nio:4.4.14" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.15" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:mapper-extras-client:7.7.0" level="project" /> <orderEntry type="library" name="Maven: org.elasticsearch.plugin:mapper-extras-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:parent-join-client:7.7.0" level="project" /> <orderEntry type="library" name="Maven: org.elasticsearch.plugin:parent-join-client:7.7.0" level="project" />
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:aggs-matrix-stats-client:7.7.0" level="project" /> <orderEntry type="library" name="Maven: org.elasticsearch.plugin:aggs-matrix-stats-client:7.7.0" level="project" />
@ -106,8 +114,6 @@
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-validation:2.4.1" level="project" /> <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-validation:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.hibernate.validator:hibernate-validator:6.1.6.Final" level="project" /> <orderEntry type="library" name="Maven: org.hibernate.validator:hibernate-validator:6.1.6.Final" level="project" />
<orderEntry type="library" name="Maven: jakarta.validation:jakarta.validation-api:2.0.2" level="project" /> <orderEntry type="library" name="Maven: jakarta.validation:jakarta.validation-api:2.0.2" level="project" />
<orderEntry type="library" name="Maven: org.jboss.logging:jboss-logging:3.4.1.Final" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml:classmate:1.5.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-messaging:5.3.2" level="project" /> <orderEntry type="library" name="Maven: org.springframework:spring-messaging:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-core:5.4.2" level="project" /> <orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-core:5.4.2" level="project" />
<orderEntry type="library" name="Maven: io.projectreactor:reactor-core:3.4.1" level="project" /> <orderEntry type="library" name="Maven: io.projectreactor:reactor-core:3.4.1" level="project" />
@ -138,8 +144,8 @@
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:json-smart:2.3" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: net.minidev:json-smart:2.3" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:accessors-smart:1.2" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: net.minidev:accessors-smart:1.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.ow2.asm:asm:5.0.4" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.ow2.asm:asm:5.0.4" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: jakarta.xml.bind:jakarta.xml.bind-api:2.3.3" level="project" /> <orderEntry type="library" name="Maven: jakarta.xml.bind:jakarta.xml.bind-api:2.3.3" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: jakarta.activation:jakarta.activation-api:1.2.2" level="project" /> <orderEntry type="library" name="Maven: jakarta.activation:jakarta.activation-api:1.2.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.assertj:assertj-core:3.18.1" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.assertj:assertj-core:3.18.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest:2.2" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest:2.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter:5.7.0" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter:5.7.0" level="project" />
@ -147,7 +153,7 @@
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-engine:5.7.0" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-engine:5.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.junit.platform:junit-platform-engine:1.7.0" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.junit.platform:junit-platform-engine:1.7.0" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-core:3.6.28" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-core:3.6.28" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy:1.10.18" level="project" /> <orderEntry type="library" name="Maven: net.bytebuddy:byte-buddy:1.10.18" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy-agent:1.10.18" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy-agent:1.10.18" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.objenesis:objenesis:3.1" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.objenesis:objenesis:3.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-junit-jupiter:3.6.28" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-junit-jupiter:3.6.28" level="project" />
@ -195,5 +201,40 @@
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-devtools:2.4.1" level="project" /> <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-devtools:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot:2.4.1" level="project" /> <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot:2.4.1" level="project" />
<orderEntry type="library" scope="RUNTIME" name="Maven: mysql:mysql-connector-java:8.0.22" level="project" /> <orderEntry type="library" scope="RUNTIME" name="Maven: mysql:mysql-connector-java:8.0.22" level="project" />
<orderEntry type="library" name="Maven: commons-net:commons-net:3.10.0" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-data-jpa:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-aop:2.4.1" level="project" />
<orderEntry type="library" name="Maven: org.aspectj:aspectjweaver:1.9.6" level="project" />
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-jdbc:2.4.1" level="project" />
<orderEntry type="library" name="Maven: com.zaxxer:HikariCP:3.4.5" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-jdbc:5.3.2" level="project" />
<orderEntry type="library" name="Maven: jakarta.transaction:jakarta.transaction-api:1.3.3" level="project" />
<orderEntry type="library" name="Maven: jakarta.persistence:jakarta.persistence-api:2.2.3" level="project" />
<orderEntry type="library" name="Maven: org.hibernate:hibernate-core:5.4.25.Final" level="project" />
<orderEntry type="library" name="Maven: org.jboss.logging:jboss-logging:3.4.1.Final" level="project" />
<orderEntry type="library" name="Maven: org.javassist:javassist:3.27.0-GA" level="project" />
<orderEntry type="library" name="Maven: antlr:antlr:2.7.7" level="project" />
<orderEntry type="library" name="Maven: org.jboss:jandex:2.1.3.Final" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml:classmate:1.5.1" level="project" />
<orderEntry type="library" name="Maven: org.dom4j:dom4j:2.1.3" level="project" />
<orderEntry type="library" name="Maven: org.hibernate.common:hibernate-commons-annotations:5.1.2.Final" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jaxb:jaxb-runtime:2.3.3" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jaxb:txw2:2.3.3" level="project" />
<orderEntry type="library" name="Maven: com.sun.istack:istack-commons-runtime:3.0.11" level="project" />
<orderEntry type="library" scope="RUNTIME" name="Maven: com.sun.activation:jakarta.activation:1.2.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework.data:spring-data-jpa:2.4.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-orm:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-aspects:5.3.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.poi:poi:5.2.4" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.15" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-collections4:4.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-math3:3.6.1" level="project" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.13.0" level="project" />
<orderEntry type="library" name="Maven: com.zaxxer:SparseBitSet:1.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.poi:poi-ooxml:5.2.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.poi:poi-ooxml-lite:5.2.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.xmlbeans:xmlbeans:5.1.1" level="project" />
<orderEntry type="library" name="Maven: com.github.virtuald:curvesapi:1.08" level="project" />
</component> </component>
</module> </module>

View File

@ -94,15 +94,34 @@
<optional>true</optional> <optional>true</optional>
<scope>true</scope> <scope>true</scope>
</dependency> </dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<dependency> <dependency>
<groupId>mysql</groupId> <groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId> <artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope> <scope>runtime</scope>
</dependency> </dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>3.10.0</version> <!-- 或使用最新版本 -->
</dependency>
<!-- Spring Data JPA -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- Apache POI for Excel (xlsx) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.4</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
</dependency>
</dependencies> </dependencies>
<dependencyManagement> <dependencyManagement>

View File

@ -3,9 +3,10 @@ package com.jsc.dsp;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration; import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.scheduling.annotation.EnableScheduling; import org.springframework.scheduling.annotation.EnableScheduling;
@SpringBootApplication(exclude = DataSourceAutoConfiguration.class) @SpringBootApplication
@EnableScheduling @EnableScheduling
public class DspApplication { public class DspApplication {

View File

@ -0,0 +1,45 @@
package com.jsc.dsp;
import com.alibaba.fastjson.JSONObject;
import com.jsc.dsp.model.ReturnT;
import com.jsc.dsp.utils.AutoPatroller;
import com.jsc.dsp.utils.DatabaseConnector;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import javax.annotation.Resource;
@RestController
@RequestMapping("/test")
public class TestController {
@Resource
DatabaseConnector databaseConnector;
@Resource
AutoPatroller autoPatroller;
@PostMapping("/exportExcel")
public ReturnT<String> exportExcel(@RequestBody JSONObject object) {
try {
String startTime = object.getString("startTime");
databaseConnector.exportToXlsx(startTime);
return new ReturnT<>(200, "", "");
} catch (Exception e) {
return new ReturnT<>(500, e.getMessage(), "");
}
}
@PostMapping("/triggerExportTask")
public ReturnT<String> triggerTask() {
try {
autoPatroller.exportDataAndUpload();
return new ReturnT<>(200, "", "");
} catch (Exception e) {
return new ReturnT<>(500, e.getMessage(), "");
}
}
}

View File

@ -0,0 +1,10 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.Config;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
@Repository
public interface ConfigRepository extends JpaRepository<Config, Integer> {
Config findFirstByConfigName(String configName);
}

View File

@ -0,0 +1,12 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.EsDataNewsView;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
import java.util.List;
@Repository
public interface EsDataNewsRepository extends JpaRepository<EsDataNewsView, String> {
List<EsDataNewsView> findAllByEsLoadtimeAfter(String loadtime);
}

View File

@ -0,0 +1,9 @@
package com.jsc.dsp.dao;
import com.jsc.dsp.model.Indeximos;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
@Repository
public interface IndeximosRepository extends JpaRepository<Indeximos, String> {
}

View File

@ -0,0 +1,15 @@
package com.jsc.dsp.model;
import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
@Entity
@Data
public class Config {
@Id
Integer id;
String configName;
String configValue;
}

View File

@ -0,0 +1,34 @@
package com.jsc.dsp.model;
import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
@Entity
@Data
@Table(name = "es_data_news")
public class EsDataNewsView {
@Id
String esSid;
String esAuthors;
String esCarriertype;
String esCatalog;
String esCollection;
Float esDoclength;
String esLang;
String esLasttime;
String esLinks;
String esLoadtime;
String esSitename;
String esSrcname;
String esUrlcontent;
String esUrlimage;
String esUrlname;
String esUrltime;
String esUrltitle;
String esAbstract;
String esKeywords;
String file;
}

View File

@ -2,10 +2,17 @@ package com.jsc.dsp.model;
import lombok.Data; import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import java.io.Serializable; import java.io.Serializable;
@Entity
@Data @Data
@Table(name = "indeximos")
public class Indeximos implements Serializable { public class Indeximos implements Serializable {
@Id
String es_sid;
String es_abstract; String es_abstract;
String es_annex; String es_annex;
String es_attachment; String es_attachment;
@ -56,7 +63,6 @@ public class Indeximos implements Serializable {
String es_repostuid; String es_repostuid;
String es_repostuname; String es_repostuname;
String es_rultopic; String es_rultopic;
String es_sid;
String es_simhash; String es_simhash;
String es_similarity; String es_similarity;
String es_similaritycount; String es_similaritycount;

View File

@ -0,0 +1,29 @@
package com.jsc.dsp.service;
import com.jsc.dsp.dao.ConfigRepository;
import com.jsc.dsp.model.Config;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
@Service
public class ConfigService {
@Resource
ConfigRepository configRepository;
public String getConfigValueByName(String configName) {
return getConfigByName(configName).getConfigValue();
}
public Config getConfigByName(String configName) {
return configRepository.findFirstByConfigName(configName);
}
public void setConfigValueByName(String configName, String configValue) {
Config config = getConfigByName(configName);
config.setConfigValue(configValue);
configRepository.save(config);
}
}

View File

@ -1,23 +1,15 @@
package com.jsc.dsp.service; package com.jsc.dsp.service;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.protobuf.Descriptors.FieldDescriptor; import com.google.protobuf.Descriptors.FieldDescriptor;
import com.jsc.dsp.binding.StorageBinding; import com.jsc.dsp.binding.StorageBinding;
import com.jsc.dsp.model.Indeximos; import com.jsc.dsp.model.Indeximos;
import com.jsc.dsp.proto.EsOuterClass.Es; import com.jsc.dsp.proto.EsOuterClass.Es;
import com.jsc.dsp.proto.EsOuterClass.EsSets; import com.jsc.dsp.proto.EsOuterClass.EsSets;
import com.jsc.dsp.utils.DBUtils; import com.jsc.dsp.utils.DatabaseConnector;
import com.jsc.dsp.utils.EsUtils;
import com.jsc.dsp.utils.FileUtils;
import com.jsc.dsp.utils.StringUtils; import com.jsc.dsp.utils.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.bulk.BulkRequest; import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
@ -27,6 +19,7 @@ import org.springframework.cloud.stream.annotation.StreamListener;
import org.springframework.messaging.support.MessageBuilder; import org.springframework.messaging.support.MessageBuilder;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
@ -55,18 +48,8 @@ public class StorageService extends StreamService {
@Value("${custom.websiteWhiteList}") @Value("${custom.websiteWhiteList}")
String websiteWhiteListString; String websiteWhiteListString;
@Value("${db.driver}") @Resource
String dbDriver; DatabaseConnector databaseConnector;
@Value("${db.url}")
String dbUrl;
@Value("${db.user}")
String dbUser;
@Value("${db.password}")
String dbPassword;
private final Logger logger = LogManager.getLogger(StorageService.class.getName()); private final Logger logger = LogManager.getLogger(StorageService.class.getName());
@ -101,7 +84,7 @@ public class StorageService extends StreamService {
Map<FieldDescriptor, Object> fieldsMap = es.getAllFields(); Map<FieldDescriptor, Object> fieldsMap = es.getAllFields();
Indeximos indeximos = new Indeximos(); Indeximos indeximos = new Indeximos();
for (FieldDescriptor key : fieldsMap.keySet()) { for (FieldDescriptor key : fieldsMap.keySet()) {
boolean hasField = DBUtils.hasField(Indeximos.class, key.getName()); boolean hasField = databaseConnector.hasField(Indeximos.class, key.getName());
if (!hasField) { if (!hasField) {
continue; continue;
} }
@ -120,7 +103,7 @@ public class StorageService extends StreamService {
} else { } else {
Field field = indeximos.getClass().getDeclaredField(key.getName()); Field field = indeximos.getClass().getDeclaredField(key.getName());
field.setAccessible(true); field.setAccessible(true);
String fieldType = DBUtils.getFieldType(Indeximos.class, key.getName()); String fieldType = databaseConnector.getFieldType(Indeximos.class, key.getName());
if (fieldType.contains("Float")) { if (fieldType.contains("Float")) {
field.set(indeximos, Float.valueOf(value)); field.set(indeximos, Float.valueOf(value));
} else { } else {
@ -147,7 +130,7 @@ public class StorageService extends StreamService {
f.setAccessible(true); f.setAccessible(true);
//判断字段是否为空并且对象属性中的基本都会转为对象类型来判断 //判断字段是否为空并且对象属性中的基本都会转为对象类型来判断
if (f.get(indeximos) == null) { if (f.get(indeximos) == null) {
String fieldType = DBUtils.getFieldType(Indeximos.class, f.getName()); String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName());
if (fieldType.contains("Float")) { if (fieldType.contains("Float")) {
f.set(indeximos, 0.0f); f.set(indeximos, 0.0f);
} else { } else {
@ -161,11 +144,7 @@ public class StorageService extends StreamService {
} }
} }
if (dbStorageItems.size() > 0) { if (dbStorageItems.size() > 0) {
if (DBUtils.insertIntoDB(dbDriver, dbUrl, dbUser, dbPassword, dbStorageItems)) { databaseConnector.insertIntoDB(dbStorageItems);
logger.info("Store to MySQL Database");
} else {
logger.error("MySQL Database Storage error!");
}
} }
data.put("content", new String(esSetsBuilder.build().toByteArray(), StandardCharsets.ISO_8859_1)); data.put("content", new String(esSetsBuilder.build().toByteArray(), StandardCharsets.ISO_8859_1));
} }

View File

@ -1,55 +1,237 @@
package com.jsc.dsp.utils; package com.jsc.dsp.utils;
import com.alibaba.fastjson.JSON; import com.jsc.dsp.service.ConfigService;
import com.alibaba.fastjson.JSONArray; import org.slf4j.Logger;
import com.alibaba.fastjson.JSONObject; import org.slf4j.LoggerFactory;
import com.jsc.dsp.model.SearchAggregation;
import com.jsc.dsp.model.TargetSocial;
import com.jsc.dsp.model.TargetWebsite;
import java.util.Date;
import java.util.Map;
import java.util.logging.Logger;
import static com.jsc.dsp.utils.EsUtils.performAggregationSearch;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.kafka.support.LogIfLevelEnabled;
import org.springframework.scheduling.annotation.Scheduled; import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Comparator;
import java.util.Date;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@Component @Component
public class AutoPatroller { public class AutoPatroller {
private final Logger logger = Logger.getLogger(this.getClass().getName()); @Resource
DatabaseConnector databaseConnector;
long updateInterval = 1500L; @Resource
FTPConnector ftpConnector;
@Value("${custom.websiteQueryAPI}") @Resource
String websiteQueryAPI; ConfigService configService;
@Value("${custom.websiteUpdateAPI}") private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());
String websiteUpdateAPI;
@Value("${custom.socialQueryAPI}") private static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
String socialQueryAPI; private static final SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT);
@Value("${custom.socialUpdateAPI}")
String socialUpdateAPI;
// @Scheduled(cron = "0 45 0/3 * * *") @Value("${custom.excelOutputPath}")
// public void checkNewsSite() { String excelOutputPath;
// checkWebsite("es_sitename", "es_carriertype", "news");
// } @Value("${custom.backupFilePath}")
// String backupFilePath;
// @Scheduled(cron = "0 15 1/3 * * *")
// public void checkWechat() { @Value("${custom.pagesOutputPath}")
// checkSocial("es_authors", "es_carriertype", "wechat", "5"); String pagesOutputPath;
// }
// @Value("${custom.ftpUploadPath}")
// @Scheduled(cron = "0 0 2/4 * * *") String ftpUploadPath;
// public void checkArticleSite() {
// checkWebsite("es_sitename", "es_carriertype", "article"); @Scheduled(cron = "0 0 8 * * *")
// } public void exportDataAndUpload() {
String lastLoadTime = configService.getConfigValueByName("last_loadtime");
String currentLoadTime = StringUtils.DateToString(new Date());
databaseConnector.exportToXlsx(lastLoadTime);
packagePagesFiles(lastLoadTime, currentLoadTime);
configService.setConfigValueByName("last_loadtime", currentLoadTime);
String zipFileName = String.format("data_news-%s.zip", currentLoadTime.replace("-", "").replace(":", "").replace(" ", ""));
String zipFileFullName = backupFilePath + File.separator + zipFileName;
String remoteZipPath = ftpUploadPath + "/" + zipFileName;
zipAndUploadDirectory(excelOutputPath, zipFileFullName, remoteZipPath);
}
/**
* 将指定目录打包成 ZIP 文件保存到指定本地路径并上传到 FTP 服务器
*
* @param sourceDirPath 本地要打包的源目录路径/data/reports
* @param localZipPath 本地 ZIP 文件保存路径/backup/archives/reports_20251224.zip
* @param remoteZipPath FTP 上的目标路径/ftp/backups/reports_20251224.zip
* @return 是否上传成功
*/
public boolean zipAndUploadDirectory(String sourceDirPath, String localZipPath, String remoteZipPath) {
Path sourceDir = Paths.get(sourceDirPath);
if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
logger.error("源目录不存在或不是一个目录: {}", sourceDirPath);
return false;
}
Path localZipFile = Paths.get(localZipPath);
Path zipParent = localZipFile.getParent();
if (zipParent != null && !Files.exists(zipParent)) {
try {
Files.createDirectories(zipParent);
logger.debug("创建 ZIP 父目录: {}", zipParent);
} catch (IOException e) {
logger.error("无法创建 ZIP 父目录: {}", zipParent, e);
return false;
}
}
// 打包目录到指定本地 ZIP 路径
try {
zipDirectory(sourceDir, localZipFile.toFile());
} catch (IOException e) {
logger.error("打包目录失败: {}", sourceDirPath, e);
return false;
}
// 上传 ZIP 文件
try (InputStream zipInputStream = Files.newInputStream(localZipFile)) {
boolean uploaded = ftpConnector.uploadFile(zipInputStream, remoteZipPath);
if (uploaded) {
logger.info("ZIP 文件上传成功 - 本地: {}, FTP: {}", localZipPath, remoteZipPath);
} else {
logger.error("ZIP 文件上传失败 - FTP: {}", remoteZipPath);
}
return uploaded;
} catch (IOException e) {
logger.error("读取本地 ZIP 文件失败: {}", localZipPath, e);
return false;
}
// 注意此处不再删除 localZipFile由调用方决定是否保留或清理
}
/**
* 将目录递归打包成 ZIP 文件
*
* @param sourceDir 要打包的源目录
* @param zipFile 输出的 ZIP 文件
* @throws IOException
*/
private void zipDirectory(Path sourceDir, File zipFile) throws IOException {
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(zipFile))) {
Files.walk(sourceDir)
.filter(path -> !Files.isDirectory(path)) // 只处理文件
.forEach(path -> {
ZipEntry zipEntry = new ZipEntry(sourceDir.relativize(path).toString());
try {
zipOut.putNextEntry(zipEntry);
Files.copy(path, zipOut);
zipOut.closeEntry();
} catch (IOException e) {
throw new RuntimeException("打包文件失败: " + path, e);
}
});
}
logger.info("目录打包完成: {} -> {}", sourceDir, zipFile.getAbsolutePath());
try {
Files.walk(sourceDir)
.sorted(Comparator.reverseOrder()) // 先处理子文件/子目录再处理父目录但这里只删文件
.filter(path -> !Files.isDirectory(path)) // 只删除文件
.forEach(path -> {
try {
Files.delete(path);
logger.debug("已删除文件: {}", path);
} catch (IOException e) {
logger.warn("无法删除文件: {}", path, e);
}
});
logger.info("源目录已清空(仅删除文件,保留目录结构): {}", sourceDir);
} catch (IOException e) {
logger.error("清空源目录时发生错误", e);
// 注意即使清理失败ZIP 已生成并会继续上传根据业务决定是否抛异常
// 如果要求必须清理成功才算成功可在此 throw 异常
}
}
public void packagePagesFiles(String startTime, String endTime) {
try {
// 解析时间范围
Date start = sdf.parse(startTime);
Date end = sdf.parse(endTime);
// 确保输出目录存在
Path excelOutputDir = Paths.get(excelOutputPath);
if (!Files.exists(excelOutputDir)) {
Files.createDirectories(excelOutputDir);
}
// 构造 ZIP 文件名
String zipFileName = String.format("pdf_files_%s_to_%s.zip",
startTime.replace(":", "").replace(" ", "_"),
endTime.replace(":", "").replace(" ", "_"));
Path zipFilePath = excelOutputDir.resolve(zipFileName);
// 遍历 mhtmlOutputPath 目录
Path sourceDir = Paths.get(pagesOutputPath);
if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
System.err.println("源目录不存在或不是目录: " + pagesOutputPath);
return;
}
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(zipFilePath.toFile()))) {
Files.walk(sourceDir)
.filter(path -> !Files.isDirectory(path))
.filter(path -> path.toString().toLowerCase().endsWith(".pdf"))
.forEach(path -> {
try {
// 获取文件创建时间Windows 支持Linux/macOS 可能返回最早时间
BasicFileAttributes attrs = Files.readAttributes(path, BasicFileAttributes.class);
FileTime creationTime = attrs.creationTime(); // 注意非所有系统都支持创建时间
Date fileCreationDate = new Date(creationTime.toMillis());
// 如果系统不支持创建时间例如某些 Linux 系统可改用 lastModifiedTime
// Date fileCreationDate = new Date(Files.getLastModifiedTime(path).toMillis());
if (!fileCreationDate.before(start) && !fileCreationDate.after(end)) {
// 文件创建时间在范围内
String entryName = sourceDir.relativize(path).toString();
zipOut.putNextEntry(new ZipEntry(entryName));
try (InputStream in = Files.newInputStream(path)) {
byte[] buffer = new byte[8192];
int len;
while ((len = in.read(buffer)) > 0) {
zipOut.write(buffer, 0, len);
}
}
zipOut.closeEntry();
System.out.println("已添加文件: " + path);
}
} catch (IOException e) {
System.err.println("处理文件时出错: " + path + " - " + e.getMessage());
}
});
}
System.out.println("ZIP 打包完成: " + zipFilePath);
} catch (ParseException e) {
System.err.println("时间格式解析错误,请确保使用格式: " + DATE_FORMAT);
e.printStackTrace();
} catch (IOException e) {
System.err.println("IO 错误: " + e.getMessage());
e.printStackTrace();
}
}
} }

View File

@ -1,131 +0,0 @@
package com.jsc.dsp.utils;
import com.alibaba.fastjson.JSONArray;
import com.jsc.dsp.model.Indeximos;
import java.io.File;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.*;
import java.util.logging.Logger;
public class DBUtils {
public static Connection conn = null;
private static final List<String> floatFields = Arrays.asList("es_doclength", "es_negativeProbability", "es_simrank");
private static final Logger logger = Logger.getLogger("com.jsc.dsp.utils.DBUtils");
public static Connection getConnection(String driver, String url, String user, String password) {
try {
Class.forName(driver);
return DriverManager.getConnection(url, user, password);
} catch (ClassNotFoundException | SQLException e) {
logger.warning("Cannot get DB connection!");
logger.warning(e.getMessage());
return null;
}
}
private static Map<String, Object> getObjectMap(Indeximos object) {
Map<String, Object> resultMap = new HashMap<>();
Field[] fields = object.getClass().getDeclaredFields();
for (Field field : fields) {
String fieldName = field.getName();
String firstLetter = fieldName.substring(0, 1).toUpperCase();
String getter = "get" + firstLetter + fieldName.substring(1);
try {
Method method = object.getClass().getMethod(getter);
Object fieldValue = method.invoke(object);
resultMap.put(fieldName, fieldValue);
} catch (Exception e) {
e.printStackTrace();
}
}
return resultMap;
}
public static boolean insertIntoDB(String driver, String url, String user, String password, List<Indeximos> objectList) {
if (conn == null) {
conn = getConnection(driver, url, user, password);
}
if (conn != null) {
try {
PreparedStatement pres = null;
for (Indeximos object : objectList) {
Map<String, Object> objectMap = getObjectMap(object);
Object[] keyObjects = objectMap.keySet().toArray();
List<String> keys = new ArrayList<>();
List<String> values = new ArrayList<>();
for (Object ko : keyObjects) {
String key = ko.toString();
keys.add(key);
Object value = objectMap.get(key);
if (floatFields.contains(key)) {
values.add(value.toString());
} else {
if (value != null && value.toString().length() > 0) {
values.add("'" + value.toString().replace("'", "\\'") + "'");
} else {
values.add("null");
}
}
}
String sqlInsert = "REPLACE INTO indeximos(" + String.join(", ", keys) + ") VALUES("
+ String.join(", ", values) + ")";
pres = conn.prepareStatement(sqlInsert);
pres.addBatch();
}
if (pres != null) {
pres.executeBatch();
pres.close();
}
return true;
} catch (SQLException e) {
logger.warning("Fail to insert data to Database");
logger.warning(e.getMessage());
conn = getConnection(driver, url, user, password);
return false;
}
} else {
return false;
}
}
public static boolean hasField(Class<?> clazz, String fieldName) {
try {
clazz.getDeclaredField(fieldName);
return true;
} catch (NoSuchFieldException e) {
return false;
}
}
public static String getFieldType(Class<?> clazz, String fieldName) {
try {
Field field = clazz.getDeclaredField(fieldName);
return field.getType().getName();
} catch (NoSuchFieldException e) {
return "";
}
}
public static void main(String[] args) {
List<Indeximos> objectList = JSONArray.parseArray(FileUtils.readContentFromFile(
"D:/data/local-storage/indeximos_1700030748332.json"), Indeximos.class);
insertIntoDB(
"com.mysql.cj.jdbc.Driver",
"jdbc:mysql://8.130.95.27:28089/dsp",
"root",
"passok123A",
objectList);
}
}

View File

@ -0,0 +1,150 @@
package com.jsc.dsp.utils;
import com.jsc.dsp.dao.EsDataNewsRepository;
import com.jsc.dsp.dao.IndeximosRepository;
import com.jsc.dsp.model.EsDataNewsView;
import com.jsc.dsp.model.Indeximos;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.logging.Logger;
@Service
public class DatabaseConnector {
@Resource
IndeximosRepository indeximosRepository;
@Resource
EsDataNewsRepository esDataNewsRepository;
@Value("${custom.excelOutputPath}")
String excelOutputPath;
private final Logger logger = Logger.getLogger(this.getClass().getName());
public void insertIntoDB(List<Indeximos> objectList) {
try {
indeximosRepository.saveAll(objectList);
} catch (Exception e) {
logger.warning("Fail to insert data to Database");
logger.warning(e.getMessage());
}
}
public boolean hasField(Class<?> clazz, String fieldName) {
try {
clazz.getDeclaredField(fieldName);
return true;
} catch (NoSuchFieldException e) {
return false;
}
}
public String getFieldType(Class<?> clazz, String fieldName) {
try {
Field field = clazz.getDeclaredField(fieldName);
return field.getType().getName();
} catch (NoSuchFieldException e) {
return "";
}
}
public void exportToXlsx(String startTime) {
try {
Path dirPath = Paths.get(excelOutputPath);
if (!Files.exists(dirPath)) {
Files.createDirectories(dirPath);
}
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"));
String fileName = "data_news-" + timestamp + ".xlsx";
Path filePath = dirPath.resolve(fileName);
List<EsDataNewsView> esDataNewsViewList = esDataNewsRepository.findAllByEsLoadtimeAfter(startTime);
if (!esDataNewsViewList.isEmpty()) {
Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields();
try (Workbook workbook = new XSSFWorkbook();
ByteArrayOutputStream out = new ByteArrayOutputStream()) {
Sheet sheet = workbook.createSheet("data");
// 创建表头
Row headerRow = sheet.createRow(0);
CellStyle headerStyle = workbook.createCellStyle();
headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
for (int i = 0; i < fields.length; i++) {
Cell cell = headerRow.createCell(i);
String formField = formField(fields[i]);
cell.setCellValue(formField);
cell.setCellStyle(headerStyle);
}
// 填充数据
int rowNum = 1;
for (EsDataNewsView item : esDataNewsViewList) {
if (item.getFile() == null || item.getFile().length() < 5) {
continue;
}
Row row = sheet.createRow(rowNum++);
row.createCell(0).setCellValue(item.getEsSid());
row.createCell(1).setCellValue(item.getEsAuthors());
row.createCell(2).setCellValue(item.getEsCarriertype());
row.createCell(3).setCellValue(item.getEsCatalog());
row.createCell(4).setCellValue(item.getEsCollection());
row.createCell(5).setCellValue(item.getEsDoclength());
row.createCell(6).setCellValue(item.getEsLang());
row.createCell(7).setCellValue(item.getEsLasttime());
row.createCell(8).setCellValue(item.getEsLinks());
row.createCell(9).setCellValue(item.getEsLoadtime());
row.createCell(10).setCellValue(item.getEsSitename());
row.createCell(11).setCellValue(item.getEsSrcname());
row.createCell(12).setCellValue(item.getEsUrlcontent());
row.createCell(13).setCellValue(item.getEsUrlimage());
row.createCell(14).setCellValue(item.getEsUrlname());
row.createCell(15).setCellValue(item.getEsUrltime());
row.createCell(16).setCellValue(item.getEsUrltitle());
row.createCell(17).setCellValue(item.getEsAbstract());
row.createCell(18).setCellValue(item.getEsKeywords());
row.createCell(19).setCellValue(item.getFile());
}
// 自动调整列宽
for (int i = 0; i < fields.length; i++) {
sheet.autoSizeColumn(i);
}
workbook.write(out);
try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
workbook.write(fos);
}
} catch (IOException e) {
e.printStackTrace();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
private String formField(Field field) {
String fieldString = field.getName();
return StringUtils.camelToSnake(fieldString);
}
}

View File

@ -0,0 +1,108 @@
package com.jsc.dsp.utils;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPReply;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
@Component
public class FTPConnector {
Logger log = LoggerFactory.getLogger(this.getClass().getName());
@Value("${ftp.host}")
String host;
@Value("${ftp.port}")
Integer port;
@Value("${ftp.username}")
String username;
@Value("${ftp.password}")
String password;
@Value("${ftp.timeout}")
Integer timeout;
public boolean uploadFile(InputStream inputStream, String remotePath) {
FTPClient ftpClient = new FTPClient();
try {
// 连接 FTP 服务器
ftpClient.connect(host, port);
ftpClient.login(username, password);
ftpClient.setConnectTimeout(timeout);
ftpClient.setSoTimeout(timeout);
// 设置文件类型为二进制避免文本模式损坏文件
ftpClient.setFileType(FTP.BINARY_FILE_TYPE);
// 启用被动模式适用于 NAT/防火墙环境
ftpClient.enterLocalPassiveMode();
// 检查登录是否成功
if (!FTPReply.isPositiveCompletion(ftpClient.getReplyCode())) {
ftpClient.disconnect();
log.error("FTP 登录失败");
return false;
}
// 创建目录如果路径包含子目录
createDirectories(ftpClient, remotePath);
// 上传文件
boolean success = ftpClient.storeFile(remotePath, inputStream);
if (success) {
log.info("文件上传成功: {}", remotePath);
} else {
log.error("FTP 上传失败,错误码: {}", ftpClient.getReplyCode());
}
return success;
} catch (IOException e) {
log.error("FTP 上传异常: {}", e.getMessage(), e);
return false;
} finally {
try {
if (inputStream != null) {
inputStream.close();
}
if (ftpClient.isConnected()) {
ftpClient.logout();
ftpClient.disconnect();
}
} catch (IOException e) {
log.warn("关闭 FTP 连接时出错", e);
}
}
}
/**
* 递归创建远程目录如果路径中包含目录
*/
private void createDirectories(FTPClient ftpClient, String remoteFilePath) throws IOException {
String[] pathParts = remoteFilePath.split("/");
StringBuilder currentPath = new StringBuilder();
for (int i = 0; i < pathParts.length - 1; i++) {
if (!pathParts[i].isEmpty()) {
currentPath.append("/").append(pathParts[i]);
// 尝试切换目录如果失败则创建
if (!ftpClient.changeWorkingDirectory(currentPath.toString())) {
boolean made = ftpClient.makeDirectory(currentPath.toString());
if (made) {
log.debug("创建 FTP 目录: {}", currentPath);
}
ftpClient.changeWorkingDirectory(currentPath.toString());
}
}
}
}
}

View File

@ -116,6 +116,29 @@ public class StringUtils {
return wordList; return wordList;
} }
public static String camelToSnake(String camel) {
if (camel == null || camel.isEmpty()) {
return camel;
}
StringBuilder result = new StringBuilder();
result.append(Character.toLowerCase(camel.charAt(0)));
for (int i = 1; i < camel.length(); i++) {
char ch = camel.charAt(i);
if (Character.isUpperCase(ch)) {
// 如果前一个字符不是大写或者后一个不是小写则加下划线
char prev = camel.charAt(i - 1);
if (!Character.isUpperCase(prev) ||
(i + 1 < camel.length() && Character.isLowerCase(camel.charAt(i + 1)))) {
result.append('_');
}
result.append(Character.toLowerCase(ch));
} else {
result.append(ch);
}
}
return result.toString();
}
public static void main(String[] args) { public static void main(String[] args) {
initFilterMap("http://39.98.151.140:28081/api/open/wordBank/queryAll"); initFilterMap("http://39.98.151.140:28081/api/open/wordBank/queryAll");
} }

View File

@ -42,21 +42,34 @@ spring:
records: 10 records: 10
interval: interval:
ms: 3600000 ms: 3600000
datasource:
url: jdbc:mysql://47.113.231.200:28089/dsp?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true
username: root
password: passok123A
driver-class-name: com.mysql.cj.jdbc.Driver
jpa:
database-platform: org.hibernate.dialect.MySQL8Dialect
show-sql: true
topics: topics:
stream-protobuf: com.jsc.dsp.service.ProtobufService stream-protobuf: com.jsc.dsp.service.ProtobufService
stream-db: com.jsc.dsp.service.StorageService stream-db: com.jsc.dsp.service.StorageService
stream-file-dl: com.jsc.dsp.service.FileDlService stream-file-dl: com.jsc.dsp.service.FileDlService
switch: switch:
enable-storage-service: true enable-storage-service: false
enable-file-dl-service: false enable-file-dl-service: false
enable-protobuf-service: false enable-protobuf-service: false
db: ftp:
driver: com.mysql.cj.jdbc.Driver host: 144.34.185.108
url: jdbc:mysql://47.113.231.200:28089/dsp port: 21
user: root username: jsc-2b
password: passok123A password: 1234qwer%
timeout: 5000
passive-mode: true
custom: custom:
dev-mode: false dev-mode: false
filter-words-query-api: http://47.115.228.133:28081/api/open/wordBank/queryAll filter-words-query-api: http://47.115.228.133:28081/api/open/wordBank/queryAll
@ -72,3 +85,7 @@ custom:
socialQueryAPI: http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy=id&shuffleResult=false socialQueryAPI: http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy=id&shuffleResult=false
socialUpdateAPI: http://47.115.228.133:28081/api/open/target/social/update socialUpdateAPI: http://47.115.228.133:28081/api/open/target/social/update
websiteWhiteList: 能源界(国内信息);能源界(国际信息);中国能源新闻网;新华能源网;中国能源网(能源战略);中国农网(三农要闻);中国经济网(三农经济);中华粮网(粮食安全);美国之音(中国版面);美国之音(中美关系);美国之音(台海两岸版面);美国之音(港澳版面);看中国(看大陆版面);看中国(重点新闻);德国之声(中国报道);纽约时报中文网(中国版面);大纪元(一周大陆新闻);EnergyNow;联合国粮农组织;路透社(中国版面) websiteWhiteList: 能源界(国内信息);能源界(国际信息);中国能源新闻网;新华能源网;中国能源网(能源战略);中国农网(三农要闻);中国经济网(三农经济);中华粮网(粮食安全);美国之音(中国版面);美国之音(中美关系);美国之音(台海两岸版面);美国之音(港澳版面);看中国(看大陆版面);看中国(重点新闻);德国之声(中国报道);纽约时报中文网(中国版面);大纪元(一周大陆新闻);EnergyNow;联合国粮农组织;路透社(中国版面)
excelOutputPath: D:/data/output/upload
backupFilePath: D:/data/output/backup
pagesOutputPath: D:/data/output/pdf
ftpUploadPath: /home/jsc-2b

View File

@ -0,0 +1,322 @@
import logging
import os
import queue
import threading
import time
from datetime import datetime
import pymysql
from tqdm import tqdm
from save_page_as_pdf import PDFSaver
from save_remote_as_mhtml import RemoteMHTMLSaver
from save_page_as_mhtml import MHTMLSaver
import tldextract
# 配置日志
from save_remote_as_pdf import RemotePDFSaver
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(),
logging.FileHandler('pdf_downloader.log')
]
)
logger = logging.getLogger(__name__)
# =============== MySQL 配置 ===============
MYSQL_CONFIG = {
'host': '47.113.231.200',
'port': 28089,
'user': 'root',
'password': 'passok123A',
'database': 'dsp',
'charset': 'utf8mb4',
'autocommit': False # 手动控制事务
}
# =========================================
# 配置参数
BATCH_SIZE = 500
MAX_WORKERS = 1
TIMEOUT = 10
PDF_OUTPUT_DIR = 'D:/data/output/pdf'
MIN_PDF_SIZE = 80 * 1024 # 80KB
MHTML_OUTPUT_DIR = 'D:/data/output/mhtml'
os.makedirs(PDF_OUTPUT_DIR, exist_ok=True)
running = True
running_interval_seconds = 15
remote_host_name = [
'epochtimes.com',
# 'secretchina.com'
]
class PDFDownloader:
def __init__(self):
self.db_lock = threading.Lock()
self.task_queue = queue.Queue(maxsize=MAX_WORKERS * 3)
self.processed_count = 0
self.success_count = 0
self.fail_count = 0
self.small_file_count = 0 # 新增:统计小文件数量
self.last_loadtime = self.get_last_loadtime()
self.total_rows = self.get_total_rows()
self.start_time = time.time()
self.skip_hosts = []
self.local_handler = None
self.remote_handler = None
# 替换 MYSQL_CONFIG 中的连接方式
def get_db_connection(self):
return pymysql.connect(
host=MYSQL_CONFIG['host'],
port=MYSQL_CONFIG['port'],
user=MYSQL_CONFIG['user'],
password=MYSQL_CONFIG['password'],
database=MYSQL_CONFIG['database'],
charset='utf8mb4',
autocommit=False
)
def get_total_rows(self):
"""获取总记录数"""
with self.get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT COUNT(*) FROM indeximos "
"WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) "
"AND es_loadtime > %s", self.last_loadtime
)
return cursor.fetchone()[0]
def get_last_loadtime(self):
"""获取上次导出数据的时间"""
with self.get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT config_value FROM config "
"WHERE config_name = 'last_loadtime' "
)
return cursor.fetchone()[0]
def use_remote_selenium(self, url):
for host in remote_host_name:
if host in url:
return True
return False
def format_pdf_filename(self, row):
"""格式化PDF文件名"""
es_urltitle = row[2] or 'untitled'
es_urltime = str(row[3]) or '19700101_000000'
es_sitename = row[4] or 'anonymous'
def clean_filename(text):
if not text:
return ''
invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
for char in invalid_chars:
text = text.replace(char, '_')
return text.strip()[:100]
try:
dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
except:
es_urltime_fix = '19700101_000000'
filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.pdf"
return os.path.join(PDF_OUTPUT_DIR, filename)
def format_mhtml_filename(self, row):
"""格式化PDF文件名"""
es_urltitle = row[2] or 'untitled'
es_urltime = str(row[3]) or '19700101_000000'
es_sitename = row[4] or 'anonymous'
def clean_filename(text):
if not text:
return ''
invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
for char in invalid_chars:
text = text.replace(char, '_')
return text.strip()[:100]
try:
dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
except:
es_urltime_fix = '19700101_000000'
filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.mhtml"
return os.path.join(PDF_OUTPUT_DIR, filename)
def fetch_data_batch(self, offset):
"""分页获取数据"""
with self.get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT es_sid, es_urlname, es_urltitle, es_urltime, es_sitename, es_authors FROM indeximos "
"WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) "
"AND es_loadtime > %s "
"ORDER BY es_urltime LIMIT %s OFFSET %s",
(self.last_loadtime, BATCH_SIZE, offset)
)
return cursor.fetchall()
def update_file_status(self, es_sid, status, retry=3):
"""更新数据库状态"""
for attempt in range(retry):
try:
with self.db_lock, self.get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"UPDATE indeximos SET es_video = %s WHERE es_sid = %s",
(status, es_sid))
conn.commit()
return True
except Exception as e:
if attempt == retry - 1:
logger.error(f"更新数据库失败(es_sid={es_sid}): {e}")
return False
time.sleep(1)
def extract_main_domain(self, url):
extracted = tldextract.extract(url)
# 组合注册域名(主域名)
main_domain = f"{extracted.domain}.{extracted.suffix}"
return main_domain
def download_worker(self):
"""工作线程函数"""
while True:
try:
task = self.task_queue.get(timeout=1)
if task is None:
break
row = task
url = row[1]
if self.extract_main_domain(url) in self.skip_hosts:
self.small_file_count += 1
self.processed_count += 1
self.task_queue.task_done()
print(f"小文件规避暂时跳过URL{url}")
continue
output_file = self.format_pdf_filename(row) # 获取格式化后的文件名
try:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
# 调用下载函数
if self.use_remote_selenium(url):
if self.remote_handler is None:
self.remote_handler = RemotePDFSaver()
success = self.remote_handler.save_as_pdf(
url=url,
output_path=output_file,
timeout=TIMEOUT
)
else:
if self.local_handler is None:
self.local_handler = PDFSaver()
success = self.local_handler.save_as_pdf(
url=url,
output_path=output_file,
timeout=TIMEOUT
)
# 验证下载结果
if success and os.path.exists(output_file):
file_size = os.path.getsize(output_file)
if file_size >= MIN_PDF_SIZE: # 文件大小合格
self.update_file_status(row[0], output_file)
self.success_count += 1
else: # 文件太小
self.update_file_status(row[0], '-2')
self.small_file_count += 1
logger.warning(f"文件过小({file_size}字节): {output_file}")
try:
os.remove(output_file)
self.skip_hosts.append(self.extract_main_domain(url))
except:
pass
else: # 下载失败
self.update_file_status(row[0], '0')
self.fail_count += 1
if os.path.exists(output_file):
try:
os.remove(output_file)
except:
pass
except Exception as e:
logger.error(f"下载出现异常(es_sid={row[0]}, url={url}): {str(e)}")
self.update_file_status(row[0], '-1')
self.fail_count += 1
self.processed_count += 1
self.task_queue.task_done()
except queue.Empty:
continue
def run(self):
"""启动下载任务"""
threads = []
# 创建工作线程
for _ in range(MAX_WORKERS):
t = threading.Thread(target=self.download_worker)
t.start()
threads.append(t)
# 使用进度条显示进度
with tqdm(total=self.total_rows, desc="处理进度", unit="") as pbar:
offset = 0
while True:
batch = self.fetch_data_batch(offset)
if not batch:
break
for row in batch:
self.task_queue.put(row)
pbar.update(len(batch))
pbar.set_postfix({
'成功': self.success_count,
'失败': self.fail_count,
'小文件': self.small_file_count,
'速度': f"{self.processed_count / (time.time() - self.start_time):.1f}条/秒"
})
offset += BATCH_SIZE
self.task_queue.join()
for _ in range(MAX_WORKERS):
self.task_queue.put(None)
for t in threads:
t.join()
total_time = time.time() - self.start_time
print(f"\n处理完成! 总计: {self.total_rows}")
print(f"成功: {self.success_count}条, 失败: {self.fail_count}条, 小文件: {self.small_file_count}")
print(f"总耗时: {total_time:.2f}秒, 平均速度: {self.total_rows / total_time:.2f}条/秒")
if __name__ == "__main__":
while running:
print(f"开始处理,总记录数: {PDFDownloader().get_total_rows()}")
downloader = PDFDownloader()
downloader.run()
print(f"运行完成,暂停{running_interval_seconds}秒后开始下一次运行...")
time.sleep(running_interval_seconds)

View File

@ -0,0 +1,141 @@
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(),
logging.FileHandler('mhtml_saver.log', encoding='utf-8')
]
)
logger = logging.getLogger(__name__)
class MHTMLSaver:
def __init__(self, headless=True):
logger.info("正在初始化 Chrome WebDriver自动匹配版本...")
service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
# Chrome 选项
chrome_options = Options()
chrome_options.add_argument('--headless=new')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument(f'--user-agent={user_agent}')
chrome_options.add_argument('--save-page-as-mhtml') # 启用 MHTML 支持
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--lang=zh-CN')
chrome_options.add_experimental_option('prefs', {
'intl.accept_languages': 'zh-CN,zh,en'
})
# 或启动时指定(部分版本支持)
chrome_options.add_argument('--window-size=1920,1080')
# 隐藏 webdriver 特征
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
# 隐藏 "navigator.webdriver"
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
self.driver = webdriver.Chrome(service=service, options=chrome_options)
def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
"""
将网页保存为 MHTML 文件
:param url: 目标网页 URL
:param output_path: 输出路径.mhtml
:param timeout: 页面加载超时
:param wait_time: 页面加载后等待时间用于动态内容渲染
:return: 保存的文件绝对路径
"""
if output_path is None:
parsed = urlparse(url)
domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
output_path = f"{domain}.mhtml"
if not output_path.lower().endswith('.mhtml'):
output_path += '.mhtml'
try:
# 设置超时
self.driver.set_page_load_timeout(timeout)
# 启动后注入脚本(双重保险)
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': '''
delete navigator.__proto__.webdriver;
window.navigator.permissions.query = (parameters) => {
return parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters);
};
'''
})
# 在 driver.get() 之后设置窗口大小
logger.info(f"正在加载页面: {url}")
self.driver.get(url)
self.driver.set_window_size(1920, 1080)
# 等待页面动态内容加载(可调整)
logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
time.sleep(wait_time)
# ✅ 关键:调用 CDP 命令捕获 MHTML
logger.info("正在生成 MHTML 快照...")
result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
# ✅ result['data'] 是 Base64 编码的 MHTML 文本(实际是纯字符串)
mhtml_content = result['data']
# ✅ 以文本模式写入UTF-8
with open(output_path, 'w', encoding='utf-8', newline='') as f:
f.write(mhtml_content)
# 验证文件
file_size = os.path.getsize(output_path)
if file_size == 0:
raise RuntimeError("生成了空文件")
logger.info(f"✅ MHTML 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
return os.path.abspath(output_path)
except Exception as e:
logger.error(f"❌ 保存失败: {e}")
raise
def quit(self):
if self.driver:
self.driver.quit()
logger.info("浏览器已关闭")
# ===== 测试入口 =====
if __name__ == "__main__":
# 示例 URL可替换为你自己的
test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
saver = MHTMLSaver(headless=True)
try:
output_file = saver.save_as_mhtml(
url=test_url,
output_path="example.mhtml",
timeout=30,
wait_time=5
)
print(f"\n🎉 成功保存 MHTML 文件: {output_file}")
except Exception as e:
print(f"\n💥 保存失败: {e}")
finally:
saver.quit()

View File

@ -0,0 +1,144 @@
import base64
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(),
logging.FileHandler('pdf_saver.log', encoding='utf-8')
]
)
logger = logging.getLogger(__name__)
class PDFSaver:
def __init__(self, headless=True):
logger.info("正在初始化 Chrome WebDriver自动匹配版本...")
service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
# Chrome 选项
chrome_options = Options()
if headless:
chrome_options.add_argument('--headless=new')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument(f'--user-agent={user_agent}')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--lang=zh-CN')
chrome_options.add_experimental_option('prefs', {
'intl.accept_languages': 'zh-CN,zh,en'
})
chrome_options.add_argument('--window-size=1920,1080')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
# 注意PDF 打印不需要 --save-page-as-mhtml
self.driver = webdriver.Chrome(service=service, options=chrome_options)
def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
"""
将网页保存为 PDF 文件
:param url: 目标网页 URL
:param output_path: 输出路径.pdf
:param timeout: 页面加载超时
:param wait_time: 页面加载后等待时间用于动态内容渲染
:param print_options: PDF 打印选项可选参考 https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
:return: 保存的文件绝对路径
"""
if output_path is None:
parsed = urlparse(url)
domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
output_path = f"{domain}.pdf"
if not output_path.lower().endswith('.pdf'):
output_path += '.pdf'
# 默认打印选项(可按需调整)
default_print_options = {
'landscape': False,
'displayHeaderFooter': False,
'printBackground': True,
'preferCSSPageSize': True,
'paperWidth': 8.27, # A4 宽(英寸)
'paperHeight': 11.69, # A4 高(英寸)
}
if print_options:
default_print_options.update(print_options)
try:
self.driver.set_page_load_timeout(timeout)
# 隐藏自动化特征
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': '''
delete navigator.__proto__.webdriver;
window.navigator.permissions.query = (parameters) => {
return parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters);
};
const originalQuery = window.navigator.permissions.query;
'''
})
logger.info(f"正在加载页面: {url}")
self.driver.get(url)
self.driver.set_window_size(1920, 1080)
logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
time.sleep(wait_time)
logger.info("正在生成 PDF...")
result = self.driver.execute_cdp_cmd('Page.printToPDF', default_print_options)
# result['data'] 是 Base64 编码的 PDF
pdf_data = base64.b64decode(result['data'])
with open(output_path, 'wb') as f:
f.write(pdf_data)
file_size = os.path.getsize(output_path)
if file_size == 0:
raise RuntimeError("生成了空文件")
logger.info(f"✅ PDF 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
return os.path.abspath(output_path)
except Exception as e:
logger.error(f"❌ 保存失败: {e}")
raise
def quit(self):
if self.driver:
self.driver.quit()
logger.info("浏览器已关闭")
# ===== 测试入口 =====
if __name__ == "__main__":
test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
saver = PDFSaver(headless=True)
try:
output_file = saver.save_as_pdf(
url=test_url,
output_path="example.pdf",
timeout=30,
wait_time=5
)
print(f"\n🎉 成功保存 PDF 文件: {output_file}")
except Exception as e:
print(f"\n💥 保存失败: {e}")
finally:
saver.quit()

View File

@ -0,0 +1,190 @@
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
TimeoutException,
SessionNotCreatedException,
InvalidSessionIdException
)
from selenium.webdriver.chrome.options import Options
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RemoteMHTMLSaver:
def __init__(
self,
remote_url="http://144.34.185.108:28098/wd/hub",
headless=True,
max_retries=3,
retry_delay=2
):
"""
初始化远程 MHTML 保存器支持自动重建 session
:param remote_url: 远程 Selenium 地址
:param headless: 是否无头
:param max_retries: 单次操作最大重试次数
:param retry_delay: 重试前等待时间
"""
self.remote_url = remote_url
self.headless = headless
self.max_retries = max_retries
self.retry_delay = retry_delay
self.driver = None
self._init_driver()
def _build_chrome_options(self):
"""构建 Chrome 选项(可复用)"""
chrome_options = Options()
if self.headless:
chrome_options.add_argument('--headless=new')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')
chrome_options.add_argument(
"--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
)
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
return chrome_options
def _init_driver(self):
"""初始化或重新初始化 WebDriver"""
if self.driver:
try:
self.driver.quit()
except Exception:
pass # 忽略关闭失败
logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
for attempt in range(3):
try:
self.driver = webdriver.Remote(
command_executor=self.remote_url,
options=self._build_chrome_options()
)
# 注入反检测脚本
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': '''
delete navigator.__proto__.webdriver;
window.chrome = { runtime: {} };
Object.defineProperty(navigator, 'languages', {
get: () => ['zh-CN', 'zh', 'en']
});
'''
})
logger.info("✅ 远程 WebDriver 会话创建成功")
return
except Exception as e:
logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
if attempt < 2:
time.sleep(2)
else:
raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")
def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
"""
保存网页为 MHTML支持自动重试和 session 重建
"""
if output_path is None:
domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
output_path = f"{domain}.mhtml"
if not output_path.lower().endswith('.mhtml'):
output_path += '.mhtml'
last_exception = None
for retry in range(self.max_retries + 1):
try:
# 检查 driver 是否有效
if not self.driver:
self._init_driver()
self.driver.set_page_load_timeout(timeout)
logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
self.driver.get(url)
time.sleep(wait_time)
logger.info("生成 MHTML 快照...")
result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
mhtml_content = result['data']
# 写入本地文件
with open(output_path, 'w', encoding='utf-8', newline='') as f:
f.write(mhtml_content)
file_size = os.path.getsize(output_path)
if file_size == 0:
raise RuntimeError("生成了空文件")
logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
return os.path.abspath(output_path)
except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
last_exception = e
logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
if retry < self.max_retries:
logger.info("正在重建 WebDriver 会话...")
self._init_driver()
time.sleep(self.retry_delay)
else:
logger.error("达到最大重试次数,放弃")
break
except TimeoutException as e:
last_exception = e
logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
if retry < self.max_retries:
time.sleep(self.retry_delay)
else:
break
except Exception as e:
last_exception = e
logger.error(f"未知错误 (retry {retry + 1}): {e}")
break # 非 WebDriver 错误,不重试
# 如果所有重试失败
if os.path.exists(output_path):
try:
os.remove(output_path)
except OSError:
pass
raise RuntimeError(f"保存失败({type(last_exception).__name__}: {last_exception}")
def quit(self):
"""显式关闭浏览器"""
if self.driver:
try:
self.driver.quit()
logger.info("WebDriver 会话已关闭")
except Exception:
pass
self.driver = None
def __del__(self):
self.quit()
# ===== 测试 =====
if __name__ == "__main__":
saver = RemoteMHTMLSaver(
remote_url="http://144.34.185.108:28098/wd/hub", # ← 替换为你的云服务器公网 IP
headless=True
)
try:
saver.save_as_mhtml(
url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
output_path="remote_example2.mhtml"
)
except Exception as e:
print(f"❌ 失败: {e}")
saver.quit()

View File

@ -0,0 +1,201 @@
import base64
import logging
import os
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
TimeoutException,
SessionNotCreatedException,
InvalidSessionIdException
)
from selenium.webdriver.chrome.options import Options
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RemotePDFSaver:
def __init__(
self,
remote_url="http://144.34.185.108:28098/wd/hub",
headless=True,
max_retries=3,
retry_delay=2,
print_options=None
):
"""
初始化远程 PDF 保存器支持自动重建 session
:param remote_url: 远程 Selenium 地址
:param headless: 是否无头模式
:param max_retries: 单次操作最大重试次数
:param retry_delay: 重试前等待时间
:param print_options: PDF 打印选项参考 DevTools Protocol
"""
self.remote_url = remote_url
self.headless = headless
self.max_retries = max_retries
self.retry_delay = retry_delay
self.print_options = print_options or {
'landscape': False,
'displayHeaderFooter': False,
'printBackground': True,
'preferCSSPageSize': True,
'paperWidth': 8.27, # A4 宽(英寸)
'paperHeight': 11.69, # A4 高(英寸)
}
self.driver = None
self._init_driver()
def _build_chrome_options(self):
"""构建 Chrome 选项(可复用)"""
chrome_options = Options()
if self.headless:
chrome_options.add_argument('--headless=new')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')
chrome_options.add_argument(
"--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
)
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
return chrome_options
def _init_driver(self):
"""初始化或重新初始化 WebDriver"""
if self.driver:
try:
self.driver.quit()
except Exception:
pass # 忽略关闭失败
logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
for attempt in range(3):
try:
self.driver = webdriver.Remote(
command_executor=self.remote_url,
options=self._build_chrome_options()
)
# 注入反检测脚本
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': '''
delete navigator.__proto__.webdriver;
window.chrome = { runtime: {} };
Object.defineProperty(navigator, 'languages', {
get: () => ['zh-CN', 'zh', 'en']
});
'''
})
logger.info("✅ 远程 WebDriver 会话创建成功")
return
except Exception as e:
logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
if attempt < 2:
time.sleep(2)
else:
raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")
def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5):
"""
保存网页为 PDF支持自动重试和 session 重建
"""
if output_path is None:
domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
output_path = f"{domain}.pdf"
if not output_path.lower().endswith('.pdf'):
output_path += '.pdf'
last_exception = None
for retry in range(self.max_retries + 1):
try:
# 检查 driver 是否有效
if not self.driver:
self._init_driver()
self.driver.set_page_load_timeout(timeout)
logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
self.driver.get(url)
time.sleep(wait_time)
logger.info("生成 PDF...")
result = self.driver.execute_cdp_cmd('Page.printToPDF', self.print_options)
pdf_data = base64.b64decode(result['data'])
# 写入本地 PDF 文件(二进制)
with open(output_path, 'wb') as f:
f.write(pdf_data)
file_size = os.path.getsize(output_path)
if file_size == 0:
raise RuntimeError("生成了空文件")
logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
return os.path.abspath(output_path)
except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
last_exception = e
logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
if retry < self.max_retries:
logger.info("正在重建 WebDriver 会话...")
self._init_driver()
time.sleep(self.retry_delay)
else:
logger.error("达到最大重试次数,放弃")
break
except TimeoutException as e:
last_exception = e
logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
if retry < self.max_retries:
time.sleep(self.retry_delay)
else:
break
except Exception as e:
last_exception = e
logger.error(f"未知错误 (retry {retry + 1}): {e}")
break # 非 WebDriver 错误,不重试
# 清理失败生成的空文件
if os.path.exists(output_path):
try:
os.remove(output_path)
except OSError:
pass
raise RuntimeError(f"保存失败({type(last_exception).__name__}: {last_exception}")
def quit(self):
"""显式关闭浏览器"""
if self.driver:
try:
self.driver.quit()
logger.info("WebDriver 会话已关闭")
except Exception:
pass
self.driver = None
def __del__(self):
self.quit()
# ===== 测试 =====
if __name__ == "__main__":
saver = RemotePDFSaver(
remote_url="http://144.34.185.108:28098/wd/hub", # ← 替换为你的云服务器公网 IP
headless=True
)
try:
saver.save_as_pdf(
url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
output_path="remote_example2.pdf"
)
except Exception as e:
print(f"❌ 失败: {e}")
saver.quit()