Compare commits
No commits in common. "81cac59adbd02e828f57e3b1c33da0641555165e" and "4de86e5f403f1e7313bf2e8aa5f3a9d7d48e2738" have entirely different histories.
81cac59adb
...
4de86e5f40
57
dsp/dsp.iml
57
dsp/dsp.iml
@ -9,16 +9,6 @@
|
||||
<facet type="Spring" name="Spring">
|
||||
<configuration />
|
||||
</facet>
|
||||
<facet type="jpa" name="JPA">
|
||||
<configuration>
|
||||
<setting name="validation-enabled" value="true" />
|
||||
<setting name="provider-name" value="Hibernate" />
|
||||
<datasource-mapping>
|
||||
<factory-entry name="entityManagerFactory" />
|
||||
</datasource-mapping>
|
||||
<naming-strategy-map />
|
||||
</configuration>
|
||||
</facet>
|
||||
</component>
|
||||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
|
||||
<output url="file://$MODULE_DIR$/target/classes" />
|
||||
@ -30,7 +20,7 @@
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="library" name="Maven: org.jetbrains:annotations:26.0.2-1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.jetbrains:annotations:26.0.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.18" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.alibaba:fastjson:1.2.75" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.projectlombok:lombok:1.18.16" level="project" />
|
||||
@ -40,6 +30,7 @@
|
||||
<orderEntry type="library" name="Maven: ch.qos.logback:logback-classic:1.2.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: ch.qos.logback:logback-core:1.2.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.13.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.13.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.30" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.annotation:jakarta.annotation-api:1.3.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.27" level="project" />
|
||||
@ -102,6 +93,7 @@
|
||||
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.14" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpasyncclient:4.1.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore-nio:4.4.14" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.15" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:mapper-extras-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:parent-join-client:7.7.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.elasticsearch.plugin:aggs-matrix-stats-client:7.7.0" level="project" />
|
||||
@ -114,6 +106,8 @@
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-validation:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.hibernate.validator:hibernate-validator:6.1.6.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.validation:jakarta.validation-api:2.0.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.jboss.logging:jboss-logging:3.4.1.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml:classmate:1.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-messaging:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.integration:spring-integration-core:5.4.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: io.projectreactor:reactor-core:3.4.1" level="project" />
|
||||
@ -144,8 +138,8 @@
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:json-smart:2.3" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.minidev:accessors-smart:1.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.ow2.asm:asm:5.0.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.xml.bind:jakarta.xml.bind-api:2.3.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.activation:jakarta.activation-api:1.2.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: jakarta.xml.bind:jakarta.xml.bind-api:2.3.3" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: jakarta.activation:jakarta.activation-api:1.2.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.assertj:assertj-core:3.18.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest:2.2" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter:5.7.0" level="project" />
|
||||
@ -153,7 +147,7 @@
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.jupiter:junit-jupiter-engine:5.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.junit.platform:junit-platform-engine:1.7.0" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-core:3.6.28" level="project" />
|
||||
<orderEntry type="library" name="Maven: net.bytebuddy:byte-buddy:1.10.18" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy:1.10.18" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: net.bytebuddy:byte-buddy-agent:1.10.18" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.objenesis:objenesis:3.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.mockito:mockito-junit-jupiter:3.6.28" level="project" />
|
||||
@ -201,40 +195,5 @@
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-devtools:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot:2.4.1" level="project" />
|
||||
<orderEntry type="library" scope="RUNTIME" name="Maven: mysql:mysql-connector-java:8.0.22" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-net:commons-net:3.10.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-data-jpa:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-aop:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.aspectj:aspectjweaver:1.9.6" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-jdbc:2.4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.zaxxer:HikariCP:3.4.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-jdbc:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.transaction:jakarta.transaction-api:1.3.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: jakarta.persistence:jakarta.persistence-api:2.2.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.hibernate:hibernate-core:5.4.25.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.jboss.logging:jboss-logging:3.4.1.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.javassist:javassist:3.27.0-GA" level="project" />
|
||||
<orderEntry type="library" name="Maven: antlr:antlr:2.7.7" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.jboss:jandex:2.1.3.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.fasterxml:classmate:1.5.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.dom4j:dom4j:2.1.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.hibernate.common:hibernate-commons-annotations:5.1.2.Final" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.glassfish.jaxb:jaxb-runtime:2.3.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.glassfish.jaxb:txw2:2.3.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.sun.istack:istack-commons-runtime:3.0.11" level="project" />
|
||||
<orderEntry type="library" scope="RUNTIME" name="Maven: com.sun.activation:jakarta.activation:1.2.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework.data:spring-data-jpa:2.4.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-orm:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.springframework:spring-aspects:5.3.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.poi:poi:5.2.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.15" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-collections4:4.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-math3:3.6.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-io:commons-io:2.13.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.zaxxer:SparseBitSet:1.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.13.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.poi:poi-ooxml:5.2.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.poi:poi-ooxml-lite:5.2.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.xmlbeans:xmlbeans:5.1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.github.virtuald:curvesapi:1.08" level="project" />
|
||||
</component>
|
||||
</module>
|
||||
27
dsp/pom.xml
27
dsp/pom.xml
@ -94,34 +94,15 @@
|
||||
<optional>true</optional>
|
||||
<scope>true</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>mysql</groupId>
|
||||
<artifactId>mysql-connector-java</artifactId>
|
||||
<scope>runtime</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-net</groupId>
|
||||
<artifactId>commons-net</artifactId>
|
||||
<version>3.10.0</version> <!-- 或使用最新版本 -->
|
||||
</dependency>
|
||||
<!-- Spring Data JPA -->
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-data-jpa</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Apache POI for Excel (xlsx) -->
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi</artifactId>
|
||||
<version>5.2.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-ooxml</artifactId>
|
||||
<version>5.2.4</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<dependencyManagement>
|
||||
|
||||
@ -3,10 +3,9 @@ package com.jsc.dsp;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
import org.springframework.scheduling.annotation.EnableScheduling;
|
||||
|
||||
@SpringBootApplication
|
||||
@SpringBootApplication(exclude = DataSourceAutoConfiguration.class)
|
||||
@EnableScheduling
|
||||
|
||||
public class DspApplication {
|
||||
|
||||
@ -1,45 +0,0 @@
|
||||
package com.jsc.dsp;
|
||||
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.jsc.dsp.model.ReturnT;
|
||||
import com.jsc.dsp.utils.AutoPatroller;
|
||||
import com.jsc.dsp.utils.DatabaseConnector;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/test")
|
||||
public class TestController {
|
||||
|
||||
@Resource
|
||||
DatabaseConnector databaseConnector;
|
||||
|
||||
@Resource
|
||||
AutoPatroller autoPatroller;
|
||||
|
||||
@PostMapping("/exportExcel")
|
||||
public ReturnT<String> exportExcel(@RequestBody JSONObject object) {
|
||||
try {
|
||||
String startTime = object.getString("startTime");
|
||||
databaseConnector.exportToXlsx(startTime);
|
||||
return new ReturnT<>(200, "", "");
|
||||
} catch (Exception e) {
|
||||
return new ReturnT<>(500, e.getMessage(), "");
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping("/triggerExportTask")
|
||||
public ReturnT<String> triggerTask() {
|
||||
try {
|
||||
autoPatroller.exportDataAndUpload();
|
||||
return new ReturnT<>(200, "", "");
|
||||
} catch (Exception e) {
|
||||
return new ReturnT<>(500, e.getMessage(), "");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +0,0 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.Config;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public interface ConfigRepository extends JpaRepository<Config, Integer> {
|
||||
Config findFirstByConfigName(String configName);
|
||||
}
|
||||
@ -1,12 +0,0 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.EsDataNewsView;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Repository
|
||||
public interface EsDataNewsRepository extends JpaRepository<EsDataNewsView, String> {
|
||||
List<EsDataNewsView> findAllByEsLoadtimeAfter(String loadtime);
|
||||
}
|
||||
@ -1,9 +0,0 @@
|
||||
package com.jsc.dsp.dao;
|
||||
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public interface IndeximosRepository extends JpaRepository<Indeximos, String> {
|
||||
}
|
||||
@ -1,15 +0,0 @@
|
||||
package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
public class Config {
|
||||
@Id
|
||||
Integer id;
|
||||
String configName;
|
||||
String configValue;
|
||||
}
|
||||
@ -1,34 +0,0 @@
|
||||
package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
@Table(name = "es_data_news")
|
||||
public class EsDataNewsView {
|
||||
@Id
|
||||
String esSid;
|
||||
String esAuthors;
|
||||
String esCarriertype;
|
||||
String esCatalog;
|
||||
String esCollection;
|
||||
Float esDoclength;
|
||||
String esLang;
|
||||
String esLasttime;
|
||||
String esLinks;
|
||||
String esLoadtime;
|
||||
String esSitename;
|
||||
String esSrcname;
|
||||
String esUrlcontent;
|
||||
String esUrlimage;
|
||||
String esUrlname;
|
||||
String esUrltime;
|
||||
String esUrltitle;
|
||||
String esAbstract;
|
||||
String esKeywords;
|
||||
String file;
|
||||
}
|
||||
@ -2,17 +2,10 @@ package com.jsc.dsp.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
import java.io.Serializable;
|
||||
|
||||
@Entity
|
||||
@Data
|
||||
@Table(name = "indeximos")
|
||||
public class Indeximos implements Serializable {
|
||||
@Id
|
||||
String es_sid;
|
||||
String es_abstract;
|
||||
String es_annex;
|
||||
String es_attachment;
|
||||
@ -63,6 +56,7 @@ public class Indeximos implements Serializable {
|
||||
String es_repostuid;
|
||||
String es_repostuname;
|
||||
String es_rultopic;
|
||||
String es_sid;
|
||||
String es_simhash;
|
||||
String es_similarity;
|
||||
String es_similaritycount;
|
||||
|
||||
@ -1,29 +0,0 @@
|
||||
package com.jsc.dsp.service;
|
||||
|
||||
import com.jsc.dsp.dao.ConfigRepository;
|
||||
import com.jsc.dsp.model.Config;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
@Service
|
||||
public class ConfigService {
|
||||
|
||||
@Resource
|
||||
ConfigRepository configRepository;
|
||||
|
||||
public String getConfigValueByName(String configName) {
|
||||
return getConfigByName(configName).getConfigValue();
|
||||
}
|
||||
|
||||
public Config getConfigByName(String configName) {
|
||||
return configRepository.findFirstByConfigName(configName);
|
||||
}
|
||||
|
||||
public void setConfigValueByName(String configName, String configValue) {
|
||||
Config config = getConfigByName(configName);
|
||||
config.setConfigValue(configValue);
|
||||
configRepository.save(config);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,15 +1,23 @@
|
||||
package com.jsc.dsp.service;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.protobuf.Descriptors.FieldDescriptor;
|
||||
import com.jsc.dsp.binding.StorageBinding;
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
import com.jsc.dsp.proto.EsOuterClass.Es;
|
||||
import com.jsc.dsp.proto.EsOuterClass.EsSets;
|
||||
import com.jsc.dsp.utils.DatabaseConnector;
|
||||
import com.jsc.dsp.utils.DBUtils;
|
||||
import com.jsc.dsp.utils.EsUtils;
|
||||
import com.jsc.dsp.utils.FileUtils;
|
||||
import com.jsc.dsp.utils.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.elasticsearch.action.bulk.BulkRequest;
|
||||
import org.elasticsearch.action.index.IndexRequest;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
@ -19,7 +27,6 @@ import org.springframework.cloud.stream.annotation.StreamListener;
|
||||
import org.springframework.messaging.support.MessageBuilder;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.lang.reflect.Field;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
@ -48,8 +55,18 @@ public class StorageService extends StreamService {
|
||||
@Value("${custom.websiteWhiteList}")
|
||||
String websiteWhiteListString;
|
||||
|
||||
@Resource
|
||||
DatabaseConnector databaseConnector;
|
||||
@Value("${db.driver}")
|
||||
String dbDriver;
|
||||
|
||||
@Value("${db.url}")
|
||||
String dbUrl;
|
||||
|
||||
@Value("${db.user}")
|
||||
String dbUser;
|
||||
|
||||
@Value("${db.password}")
|
||||
String dbPassword;
|
||||
|
||||
|
||||
private final Logger logger = LogManager.getLogger(StorageService.class.getName());
|
||||
|
||||
@ -84,7 +101,7 @@ public class StorageService extends StreamService {
|
||||
Map<FieldDescriptor, Object> fieldsMap = es.getAllFields();
|
||||
Indeximos indeximos = new Indeximos();
|
||||
for (FieldDescriptor key : fieldsMap.keySet()) {
|
||||
boolean hasField = databaseConnector.hasField(Indeximos.class, key.getName());
|
||||
boolean hasField = DBUtils.hasField(Indeximos.class, key.getName());
|
||||
if (!hasField) {
|
||||
continue;
|
||||
}
|
||||
@ -103,7 +120,7 @@ public class StorageService extends StreamService {
|
||||
} else {
|
||||
Field field = indeximos.getClass().getDeclaredField(key.getName());
|
||||
field.setAccessible(true);
|
||||
String fieldType = databaseConnector.getFieldType(Indeximos.class, key.getName());
|
||||
String fieldType = DBUtils.getFieldType(Indeximos.class, key.getName());
|
||||
if (fieldType.contains("Float")) {
|
||||
field.set(indeximos, Float.valueOf(value));
|
||||
} else {
|
||||
@ -130,7 +147,7 @@ public class StorageService extends StreamService {
|
||||
f.setAccessible(true);
|
||||
//判断字段是否为空,并且对象属性中的基本都会转为对象类型来判断
|
||||
if (f.get(indeximos) == null) {
|
||||
String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName());
|
||||
String fieldType = DBUtils.getFieldType(Indeximos.class, f.getName());
|
||||
if (fieldType.contains("Float")) {
|
||||
f.set(indeximos, 0.0f);
|
||||
} else {
|
||||
@ -144,7 +161,11 @@ public class StorageService extends StreamService {
|
||||
}
|
||||
}
|
||||
if (dbStorageItems.size() > 0) {
|
||||
databaseConnector.insertIntoDB(dbStorageItems);
|
||||
if (DBUtils.insertIntoDB(dbDriver, dbUrl, dbUser, dbPassword, dbStorageItems)) {
|
||||
logger.info("Store to MySQL Database");
|
||||
} else {
|
||||
logger.error("MySQL Database Storage error!");
|
||||
}
|
||||
}
|
||||
data.put("content", new String(esSetsBuilder.build().toByteArray(), StandardCharsets.ISO_8859_1));
|
||||
}
|
||||
|
||||
@ -1,237 +1,55 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import com.jsc.dsp.service.ConfigService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.jsc.dsp.model.SearchAggregation;
|
||||
import com.jsc.dsp.model.TargetSocial;
|
||||
import com.jsc.dsp.model.TargetWebsite;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import static com.jsc.dsp.utils.EsUtils.performAggregationSearch;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.kafka.support.LogIfLevelEnabled;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.attribute.BasicFileAttributes;
|
||||
import java.nio.file.attribute.FileTime;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
@Component
|
||||
public class AutoPatroller {
|
||||
|
||||
@Resource
|
||||
DatabaseConnector databaseConnector;
|
||||
private final Logger logger = Logger.getLogger(this.getClass().getName());
|
||||
|
||||
@Resource
|
||||
FTPConnector ftpConnector;
|
||||
long updateInterval = 1500L;
|
||||
|
||||
@Resource
|
||||
ConfigService configService;
|
||||
@Value("${custom.websiteQueryAPI}")
|
||||
String websiteQueryAPI;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());
|
||||
@Value("${custom.websiteUpdateAPI}")
|
||||
String websiteUpdateAPI;
|
||||
|
||||
private static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
private static final SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT);
|
||||
@Value("${custom.socialQueryAPI}")
|
||||
String socialQueryAPI;
|
||||
|
||||
@Value("${custom.socialUpdateAPI}")
|
||||
String socialUpdateAPI;
|
||||
|
||||
@Value("${custom.excelOutputPath}")
|
||||
String excelOutputPath;
|
||||
|
||||
@Value("${custom.backupFilePath}")
|
||||
String backupFilePath;
|
||||
|
||||
@Value("${custom.pagesOutputPath}")
|
||||
String pagesOutputPath;
|
||||
|
||||
@Value("${custom.ftpUploadPath}")
|
||||
String ftpUploadPath;
|
||||
|
||||
@Scheduled(cron = "0 0 8 * * *")
|
||||
public void exportDataAndUpload() {
|
||||
String lastLoadTime = configService.getConfigValueByName("last_loadtime");
|
||||
String currentLoadTime = StringUtils.DateToString(new Date());
|
||||
databaseConnector.exportToXlsx(lastLoadTime);
|
||||
packagePagesFiles(lastLoadTime, currentLoadTime);
|
||||
configService.setConfigValueByName("last_loadtime", currentLoadTime);
|
||||
String zipFileName = String.format("data_news-%s.zip", currentLoadTime.replace("-", "").replace(":", "").replace(" ", ""));
|
||||
String zipFileFullName = backupFilePath + File.separator + zipFileName;
|
||||
String remoteZipPath = ftpUploadPath + "/" + zipFileName;
|
||||
zipAndUploadDirectory(excelOutputPath, zipFileFullName, remoteZipPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* 将指定目录打包成 ZIP 文件(保存到指定本地路径),并上传到 FTP 服务器
|
||||
*
|
||||
* @param sourceDirPath 本地要打包的源目录路径(如:/data/reports)
|
||||
* @param localZipPath 本地 ZIP 文件保存路径(如:/backup/archives/reports_20251224.zip)
|
||||
* @param remoteZipPath FTP 上的目标路径(如:/ftp/backups/reports_20251224.zip)
|
||||
* @return 是否上传成功
|
||||
*/
|
||||
public boolean zipAndUploadDirectory(String sourceDirPath, String localZipPath, String remoteZipPath) {
|
||||
Path sourceDir = Paths.get(sourceDirPath);
|
||||
if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
|
||||
logger.error("源目录不存在或不是一个目录: {}", sourceDirPath);
|
||||
return false;
|
||||
}
|
||||
|
||||
Path localZipFile = Paths.get(localZipPath);
|
||||
Path zipParent = localZipFile.getParent();
|
||||
if (zipParent != null && !Files.exists(zipParent)) {
|
||||
try {
|
||||
Files.createDirectories(zipParent);
|
||||
logger.debug("创建 ZIP 父目录: {}", zipParent);
|
||||
} catch (IOException e) {
|
||||
logger.error("无法创建 ZIP 父目录: {}", zipParent, e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// 打包目录到指定本地 ZIP 路径
|
||||
try {
|
||||
zipDirectory(sourceDir, localZipFile.toFile());
|
||||
} catch (IOException e) {
|
||||
logger.error("打包目录失败: {}", sourceDirPath, e);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 上传 ZIP 文件
|
||||
try (InputStream zipInputStream = Files.newInputStream(localZipFile)) {
|
||||
boolean uploaded = ftpConnector.uploadFile(zipInputStream, remoteZipPath);
|
||||
if (uploaded) {
|
||||
logger.info("ZIP 文件上传成功 - 本地: {}, FTP: {}", localZipPath, remoteZipPath);
|
||||
} else {
|
||||
logger.error("ZIP 文件上传失败 - FTP: {}", remoteZipPath);
|
||||
}
|
||||
return uploaded;
|
||||
} catch (IOException e) {
|
||||
logger.error("读取本地 ZIP 文件失败: {}", localZipPath, e);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 注意:此处不再删除 localZipFile,由调用方决定是否保留或清理
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 将目录递归打包成 ZIP 文件
|
||||
*
|
||||
* @param sourceDir 要打包的源目录
|
||||
* @param zipFile 输出的 ZIP 文件
|
||||
* @throws IOException
|
||||
*/
|
||||
private void zipDirectory(Path sourceDir, File zipFile) throws IOException {
|
||||
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(zipFile))) {
|
||||
Files.walk(sourceDir)
|
||||
.filter(path -> !Files.isDirectory(path)) // 只处理文件
|
||||
.forEach(path -> {
|
||||
ZipEntry zipEntry = new ZipEntry(sourceDir.relativize(path).toString());
|
||||
try {
|
||||
zipOut.putNextEntry(zipEntry);
|
||||
Files.copy(path, zipOut);
|
||||
zipOut.closeEntry();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("打包文件失败: " + path, e);
|
||||
}
|
||||
});
|
||||
}
|
||||
logger.info("目录打包完成: {} -> {}", sourceDir, zipFile.getAbsolutePath());
|
||||
try {
|
||||
Files.walk(sourceDir)
|
||||
.sorted(Comparator.reverseOrder()) // 先处理子文件/子目录,再处理父目录(但这里只删文件)
|
||||
.filter(path -> !Files.isDirectory(path)) // 只删除文件
|
||||
.forEach(path -> {
|
||||
try {
|
||||
Files.delete(path);
|
||||
logger.debug("已删除文件: {}", path);
|
||||
} catch (IOException e) {
|
||||
logger.warn("无法删除文件: {}", path, e);
|
||||
}
|
||||
});
|
||||
logger.info("源目录已清空(仅删除文件,保留目录结构): {}", sourceDir);
|
||||
} catch (IOException e) {
|
||||
logger.error("清空源目录时发生错误", e);
|
||||
// 注意:即使清理失败,ZIP 已生成并会继续上传,根据业务决定是否抛异常
|
||||
// 如果要求“必须清理成功才算成功”,可在此 throw 异常
|
||||
}
|
||||
}
|
||||
|
||||
public void packagePagesFiles(String startTime, String endTime) {
|
||||
try {
|
||||
// 解析时间范围
|
||||
Date start = sdf.parse(startTime);
|
||||
Date end = sdf.parse(endTime);
|
||||
|
||||
// 确保输出目录存在
|
||||
Path excelOutputDir = Paths.get(excelOutputPath);
|
||||
if (!Files.exists(excelOutputDir)) {
|
||||
Files.createDirectories(excelOutputDir);
|
||||
}
|
||||
|
||||
// 构造 ZIP 文件名
|
||||
String zipFileName = String.format("pdf_files_%s_to_%s.zip",
|
||||
startTime.replace(":", "").replace(" ", "_"),
|
||||
endTime.replace(":", "").replace(" ", "_"));
|
||||
Path zipFilePath = excelOutputDir.resolve(zipFileName);
|
||||
|
||||
// 遍历 mhtmlOutputPath 目录
|
||||
Path sourceDir = Paths.get(pagesOutputPath);
|
||||
if (!Files.exists(sourceDir) || !Files.isDirectory(sourceDir)) {
|
||||
System.err.println("源目录不存在或不是目录: " + pagesOutputPath);
|
||||
return;
|
||||
}
|
||||
|
||||
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(zipFilePath.toFile()))) {
|
||||
Files.walk(sourceDir)
|
||||
.filter(path -> !Files.isDirectory(path))
|
||||
.filter(path -> path.toString().toLowerCase().endsWith(".pdf"))
|
||||
.forEach(path -> {
|
||||
try {
|
||||
// 获取文件创建时间(Windows 支持,Linux/macOS 可能返回最早时间)
|
||||
BasicFileAttributes attrs = Files.readAttributes(path, BasicFileAttributes.class);
|
||||
FileTime creationTime = attrs.creationTime(); // 注意:非所有系统都支持创建时间
|
||||
Date fileCreationDate = new Date(creationTime.toMillis());
|
||||
|
||||
// 如果系统不支持创建时间(例如某些 Linux 系统),可改用 lastModifiedTime
|
||||
// Date fileCreationDate = new Date(Files.getLastModifiedTime(path).toMillis());
|
||||
|
||||
if (!fileCreationDate.before(start) && !fileCreationDate.after(end)) {
|
||||
// 文件创建时间在范围内
|
||||
String entryName = sourceDir.relativize(path).toString();
|
||||
zipOut.putNextEntry(new ZipEntry(entryName));
|
||||
try (InputStream in = Files.newInputStream(path)) {
|
||||
byte[] buffer = new byte[8192];
|
||||
int len;
|
||||
while ((len = in.read(buffer)) > 0) {
|
||||
zipOut.write(buffer, 0, len);
|
||||
}
|
||||
}
|
||||
zipOut.closeEntry();
|
||||
System.out.println("已添加文件: " + path);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
System.err.println("处理文件时出错: " + path + " - " + e.getMessage());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
System.out.println("ZIP 打包完成: " + zipFilePath);
|
||||
|
||||
} catch (ParseException e) {
|
||||
System.err.println("时间格式解析错误,请确保使用格式: " + DATE_FORMAT);
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
System.err.println("IO 错误: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
// @Scheduled(cron = "0 45 0/3 * * *")
|
||||
// public void checkNewsSite() {
|
||||
// checkWebsite("es_sitename", "es_carriertype", "news");
|
||||
// }
|
||||
//
|
||||
// @Scheduled(cron = "0 15 1/3 * * *")
|
||||
// public void checkWechat() {
|
||||
// checkSocial("es_authors", "es_carriertype", "wechat", "5");
|
||||
// }
|
||||
//
|
||||
// @Scheduled(cron = "0 0 2/4 * * *")
|
||||
// public void checkArticleSite() {
|
||||
// checkWebsite("es_sitename", "es_carriertype", "article");
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
131
dsp/src/main/java/com/jsc/dsp/utils/DBUtils.java
Normal file
131
dsp/src/main/java/com/jsc/dsp/utils/DBUtils.java
Normal file
@ -0,0 +1,131 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
|
||||
import java.io.File;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
|
||||
public class DBUtils {
|
||||
|
||||
public static Connection conn = null;
|
||||
|
||||
private static final List<String> floatFields = Arrays.asList("es_doclength", "es_negativeProbability", "es_simrank");
|
||||
|
||||
private static final Logger logger = Logger.getLogger("com.jsc.dsp.utils.DBUtils");
|
||||
|
||||
public static Connection getConnection(String driver, String url, String user, String password) {
|
||||
try {
|
||||
Class.forName(driver);
|
||||
return DriverManager.getConnection(url, user, password);
|
||||
} catch (ClassNotFoundException | SQLException e) {
|
||||
logger.warning("Cannot get DB connection!");
|
||||
logger.warning(e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, Object> getObjectMap(Indeximos object) {
|
||||
Map<String, Object> resultMap = new HashMap<>();
|
||||
Field[] fields = object.getClass().getDeclaredFields();
|
||||
for (Field field : fields) {
|
||||
String fieldName = field.getName();
|
||||
String firstLetter = fieldName.substring(0, 1).toUpperCase();
|
||||
String getter = "get" + firstLetter + fieldName.substring(1);
|
||||
try {
|
||||
Method method = object.getClass().getMethod(getter);
|
||||
Object fieldValue = method.invoke(object);
|
||||
resultMap.put(fieldName, fieldValue);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return resultMap;
|
||||
}
|
||||
|
||||
public static boolean insertIntoDB(String driver, String url, String user, String password, List<Indeximos> objectList) {
|
||||
if (conn == null) {
|
||||
conn = getConnection(driver, url, user, password);
|
||||
}
|
||||
if (conn != null) {
|
||||
try {
|
||||
PreparedStatement pres = null;
|
||||
for (Indeximos object : objectList) {
|
||||
Map<String, Object> objectMap = getObjectMap(object);
|
||||
Object[] keyObjects = objectMap.keySet().toArray();
|
||||
List<String> keys = new ArrayList<>();
|
||||
List<String> values = new ArrayList<>();
|
||||
for (Object ko : keyObjects) {
|
||||
String key = ko.toString();
|
||||
keys.add(key);
|
||||
Object value = objectMap.get(key);
|
||||
if (floatFields.contains(key)) {
|
||||
values.add(value.toString());
|
||||
} else {
|
||||
if (value != null && value.toString().length() > 0) {
|
||||
values.add("'" + value.toString().replace("'", "\\'") + "'");
|
||||
} else {
|
||||
values.add("null");
|
||||
}
|
||||
}
|
||||
}
|
||||
String sqlInsert = "REPLACE INTO indeximos(" + String.join(", ", keys) + ") VALUES("
|
||||
+ String.join(", ", values) + ")";
|
||||
pres = conn.prepareStatement(sqlInsert);
|
||||
pres.addBatch();
|
||||
}
|
||||
if (pres != null) {
|
||||
pres.executeBatch();
|
||||
pres.close();
|
||||
}
|
||||
return true;
|
||||
} catch (SQLException e) {
|
||||
logger.warning("Fail to insert data to Database");
|
||||
logger.warning(e.getMessage());
|
||||
conn = getConnection(driver, url, user, password);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean hasField(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
clazz.getDeclaredField(fieldName);
|
||||
return true;
|
||||
} catch (NoSuchFieldException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static String getFieldType(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
Field field = clazz.getDeclaredField(fieldName);
|
||||
return field.getType().getName();
|
||||
} catch (NoSuchFieldException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
List<Indeximos> objectList = JSONArray.parseArray(FileUtils.readContentFromFile(
|
||||
"D:/data/local-storage/indeximos_1700030748332.json"), Indeximos.class);
|
||||
insertIntoDB(
|
||||
"com.mysql.cj.jdbc.Driver",
|
||||
"jdbc:mysql://8.130.95.27:28089/dsp",
|
||||
"root",
|
||||
"passok123A",
|
||||
objectList);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,150 +0,0 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import com.jsc.dsp.dao.EsDataNewsRepository;
|
||||
import com.jsc.dsp.dao.IndeximosRepository;
|
||||
import com.jsc.dsp.model.EsDataNewsView;
|
||||
import com.jsc.dsp.model.Indeximos;
|
||||
import org.apache.poi.ss.usermodel.*;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Field;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.List;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
@Service
|
||||
public class DatabaseConnector {
|
||||
|
||||
@Resource
|
||||
IndeximosRepository indeximosRepository;
|
||||
|
||||
@Resource
|
||||
EsDataNewsRepository esDataNewsRepository;
|
||||
|
||||
@Value("${custom.excelOutputPath}")
|
||||
String excelOutputPath;
|
||||
|
||||
private final Logger logger = Logger.getLogger(this.getClass().getName());
|
||||
|
||||
public void insertIntoDB(List<Indeximos> objectList) {
|
||||
try {
|
||||
indeximosRepository.saveAll(objectList);
|
||||
} catch (Exception e) {
|
||||
logger.warning("Fail to insert data to Database");
|
||||
logger.warning(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean hasField(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
clazz.getDeclaredField(fieldName);
|
||||
return true;
|
||||
} catch (NoSuchFieldException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public String getFieldType(Class<?> clazz, String fieldName) {
|
||||
try {
|
||||
Field field = clazz.getDeclaredField(fieldName);
|
||||
return field.getType().getName();
|
||||
} catch (NoSuchFieldException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public void exportToXlsx(String startTime) {
|
||||
try {
|
||||
Path dirPath = Paths.get(excelOutputPath);
|
||||
if (!Files.exists(dirPath)) {
|
||||
Files.createDirectories(dirPath);
|
||||
}
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"));
|
||||
String fileName = "data_news-" + timestamp + ".xlsx";
|
||||
Path filePath = dirPath.resolve(fileName);
|
||||
|
||||
List<EsDataNewsView> esDataNewsViewList = esDataNewsRepository.findAllByEsLoadtimeAfter(startTime);
|
||||
if (!esDataNewsViewList.isEmpty()) {
|
||||
Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields();
|
||||
try (Workbook workbook = new XSSFWorkbook();
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
|
||||
Sheet sheet = workbook.createSheet("data");
|
||||
|
||||
// 创建表头
|
||||
Row headerRow = sheet.createRow(0);
|
||||
CellStyle headerStyle = workbook.createCellStyle();
|
||||
headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex());
|
||||
headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
|
||||
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
Cell cell = headerRow.createCell(i);
|
||||
String formField = formField(fields[i]);
|
||||
cell.setCellValue(formField);
|
||||
cell.setCellStyle(headerStyle);
|
||||
}
|
||||
// 填充数据
|
||||
int rowNum = 1;
|
||||
for (EsDataNewsView item : esDataNewsViewList) {
|
||||
if (item.getFile() == null || item.getFile().length() < 5) {
|
||||
continue;
|
||||
}
|
||||
Row row = sheet.createRow(rowNum++);
|
||||
row.createCell(0).setCellValue(item.getEsSid());
|
||||
row.createCell(1).setCellValue(item.getEsAuthors());
|
||||
row.createCell(2).setCellValue(item.getEsCarriertype());
|
||||
row.createCell(3).setCellValue(item.getEsCatalog());
|
||||
row.createCell(4).setCellValue(item.getEsCollection());
|
||||
row.createCell(5).setCellValue(item.getEsDoclength());
|
||||
row.createCell(6).setCellValue(item.getEsLang());
|
||||
row.createCell(7).setCellValue(item.getEsLasttime());
|
||||
row.createCell(8).setCellValue(item.getEsLinks());
|
||||
row.createCell(9).setCellValue(item.getEsLoadtime());
|
||||
row.createCell(10).setCellValue(item.getEsSitename());
|
||||
row.createCell(11).setCellValue(item.getEsSrcname());
|
||||
row.createCell(12).setCellValue(item.getEsUrlcontent());
|
||||
row.createCell(13).setCellValue(item.getEsUrlimage());
|
||||
row.createCell(14).setCellValue(item.getEsUrlname());
|
||||
row.createCell(15).setCellValue(item.getEsUrltime());
|
||||
row.createCell(16).setCellValue(item.getEsUrltitle());
|
||||
row.createCell(17).setCellValue(item.getEsAbstract());
|
||||
row.createCell(18).setCellValue(item.getEsKeywords());
|
||||
row.createCell(19).setCellValue(item.getFile());
|
||||
}
|
||||
|
||||
// 自动调整列宽
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
sheet.autoSizeColumn(i);
|
||||
}
|
||||
|
||||
workbook.write(out);
|
||||
|
||||
try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) {
|
||||
workbook.write(fos);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private String formField(Field field) {
|
||||
String fieldString = field.getName();
|
||||
return StringUtils.camelToSnake(fieldString);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,108 +0,0 @@
|
||||
package com.jsc.dsp.utils;
|
||||
|
||||
import org.apache.commons.net.ftp.FTP;
|
||||
import org.apache.commons.net.ftp.FTPClient;
|
||||
import org.apache.commons.net.ftp.FTPReply;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
@Component
|
||||
public class FTPConnector {
|
||||
|
||||
Logger log = LoggerFactory.getLogger(this.getClass().getName());
|
||||
|
||||
@Value("${ftp.host}")
|
||||
String host;
|
||||
|
||||
@Value("${ftp.port}")
|
||||
Integer port;
|
||||
|
||||
@Value("${ftp.username}")
|
||||
String username;
|
||||
|
||||
@Value("${ftp.password}")
|
||||
String password;
|
||||
|
||||
@Value("${ftp.timeout}")
|
||||
Integer timeout;
|
||||
|
||||
public boolean uploadFile(InputStream inputStream, String remotePath) {
|
||||
FTPClient ftpClient = new FTPClient();
|
||||
try {
|
||||
// 连接 FTP 服务器
|
||||
ftpClient.connect(host, port);
|
||||
ftpClient.login(username, password);
|
||||
ftpClient.setConnectTimeout(timeout);
|
||||
ftpClient.setSoTimeout(timeout);
|
||||
|
||||
// 设置文件类型为二进制(避免文本模式损坏文件)
|
||||
ftpClient.setFileType(FTP.BINARY_FILE_TYPE);
|
||||
|
||||
// 启用被动模式(适用于 NAT/防火墙环境)
|
||||
ftpClient.enterLocalPassiveMode();
|
||||
|
||||
// 检查登录是否成功
|
||||
if (!FTPReply.isPositiveCompletion(ftpClient.getReplyCode())) {
|
||||
ftpClient.disconnect();
|
||||
log.error("FTP 登录失败");
|
||||
return false;
|
||||
}
|
||||
|
||||
// 创建目录(如果路径包含子目录)
|
||||
createDirectories(ftpClient, remotePath);
|
||||
|
||||
// 上传文件
|
||||
boolean success = ftpClient.storeFile(remotePath, inputStream);
|
||||
if (success) {
|
||||
log.info("文件上传成功: {}", remotePath);
|
||||
} else {
|
||||
log.error("FTP 上传失败,错误码: {}", ftpClient.getReplyCode());
|
||||
}
|
||||
return success;
|
||||
|
||||
} catch (IOException e) {
|
||||
log.error("FTP 上传异常: {}", e.getMessage(), e);
|
||||
return false;
|
||||
} finally {
|
||||
try {
|
||||
if (inputStream != null) {
|
||||
inputStream.close();
|
||||
}
|
||||
if (ftpClient.isConnected()) {
|
||||
ftpClient.logout();
|
||||
ftpClient.disconnect();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.warn("关闭 FTP 连接时出错", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 递归创建远程目录(如果路径中包含目录)
|
||||
*/
|
||||
private void createDirectories(FTPClient ftpClient, String remoteFilePath) throws IOException {
|
||||
String[] pathParts = remoteFilePath.split("/");
|
||||
StringBuilder currentPath = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < pathParts.length - 1; i++) {
|
||||
if (!pathParts[i].isEmpty()) {
|
||||
currentPath.append("/").append(pathParts[i]);
|
||||
// 尝试切换目录,如果失败则创建
|
||||
if (!ftpClient.changeWorkingDirectory(currentPath.toString())) {
|
||||
boolean made = ftpClient.makeDirectory(currentPath.toString());
|
||||
if (made) {
|
||||
log.debug("创建 FTP 目录: {}", currentPath);
|
||||
}
|
||||
ftpClient.changeWorkingDirectory(currentPath.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -116,29 +116,6 @@ public class StringUtils {
|
||||
return wordList;
|
||||
}
|
||||
|
||||
public static String camelToSnake(String camel) {
|
||||
if (camel == null || camel.isEmpty()) {
|
||||
return camel;
|
||||
}
|
||||
StringBuilder result = new StringBuilder();
|
||||
result.append(Character.toLowerCase(camel.charAt(0)));
|
||||
for (int i = 1; i < camel.length(); i++) {
|
||||
char ch = camel.charAt(i);
|
||||
if (Character.isUpperCase(ch)) {
|
||||
// 如果前一个字符不是大写,或者后一个不是小写,则加下划线
|
||||
char prev = camel.charAt(i - 1);
|
||||
if (!Character.isUpperCase(prev) ||
|
||||
(i + 1 < camel.length() && Character.isLowerCase(camel.charAt(i + 1)))) {
|
||||
result.append('_');
|
||||
}
|
||||
result.append(Character.toLowerCase(ch));
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
initFilterMap("http://39.98.151.140:28081/api/open/wordBank/queryAll");
|
||||
}
|
||||
|
||||
@ -42,34 +42,21 @@ spring:
|
||||
records: 10
|
||||
interval:
|
||||
ms: 3600000
|
||||
datasource:
|
||||
url: jdbc:mysql://47.113.231.200:28089/dsp?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true
|
||||
username: root
|
||||
password: passok123A
|
||||
driver-class-name: com.mysql.cj.jdbc.Driver
|
||||
|
||||
jpa:
|
||||
database-platform: org.hibernate.dialect.MySQL8Dialect
|
||||
show-sql: true
|
||||
|
||||
topics:
|
||||
stream-protobuf: com.jsc.dsp.service.ProtobufService
|
||||
stream-db: com.jsc.dsp.service.StorageService
|
||||
stream-file-dl: com.jsc.dsp.service.FileDlService
|
||||
|
||||
switch:
|
||||
enable-storage-service: false
|
||||
enable-storage-service: true
|
||||
enable-file-dl-service: false
|
||||
enable-protobuf-service: false
|
||||
|
||||
ftp:
|
||||
host: 144.34.185.108
|
||||
port: 21
|
||||
username: jsc-2b
|
||||
password: 1234qwer%
|
||||
timeout: 5000
|
||||
passive-mode: true
|
||||
|
||||
db:
|
||||
driver: com.mysql.cj.jdbc.Driver
|
||||
url: jdbc:mysql://47.113.231.200:28089/dsp
|
||||
user: root
|
||||
password: passok123A
|
||||
custom:
|
||||
dev-mode: false
|
||||
filter-words-query-api: http://47.115.228.133:28081/api/open/wordBank/queryAll
|
||||
@ -85,7 +72,3 @@ custom:
|
||||
socialQueryAPI: http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy=id&shuffleResult=false
|
||||
socialUpdateAPI: http://47.115.228.133:28081/api/open/target/social/update
|
||||
websiteWhiteList: 能源界(国内信息);能源界(国际信息);中国能源新闻网;新华能源网;中国能源网(能源战略);中国农网(三农要闻);中国经济网(三农经济);中华粮网(粮食安全);美国之音(中国版面);美国之音(中美关系);美国之音(台海两岸版面);美国之音(港澳版面);看中国(看大陆版面);看中国(重点新闻);德国之声(中国报道);纽约时报中文网(中国版面);大纪元(一周大陆新闻);EnergyNow;联合国粮农组织;路透社(中国版面)
|
||||
excelOutputPath: D:/data/output/upload
|
||||
backupFilePath: D:/data/output/backup
|
||||
pagesOutputPath: D:/data/output/pdf
|
||||
ftpUploadPath: /home/jsc-2b
|
||||
@ -1,322 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import pymysql
|
||||
from tqdm import tqdm
|
||||
|
||||
from save_page_as_pdf import PDFSaver
|
||||
from save_remote_as_mhtml import RemoteMHTMLSaver
|
||||
from save_page_as_mhtml import MHTMLSaver
|
||||
import tldextract
|
||||
|
||||
# 配置日志
|
||||
from save_remote_as_pdf import RemotePDFSaver
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(),
|
||||
logging.FileHandler('pdf_downloader.log')
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============== MySQL 配置 ===============
|
||||
MYSQL_CONFIG = {
|
||||
'host': '47.113.231.200',
|
||||
'port': 28089,
|
||||
'user': 'root',
|
||||
'password': 'passok123A',
|
||||
'database': 'dsp',
|
||||
'charset': 'utf8mb4',
|
||||
'autocommit': False # 手动控制事务
|
||||
}
|
||||
# =========================================
|
||||
|
||||
# 配置参数
|
||||
BATCH_SIZE = 500
|
||||
MAX_WORKERS = 1
|
||||
TIMEOUT = 10
|
||||
PDF_OUTPUT_DIR = 'D:/data/output/pdf'
|
||||
MIN_PDF_SIZE = 80 * 1024 # 80KB
|
||||
|
||||
MHTML_OUTPUT_DIR = 'D:/data/output/mhtml'
|
||||
os.makedirs(PDF_OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
running = True
|
||||
running_interval_seconds = 15
|
||||
|
||||
remote_host_name = [
|
||||
'epochtimes.com',
|
||||
# 'secretchina.com'
|
||||
]
|
||||
|
||||
|
||||
class PDFDownloader:
|
||||
def __init__(self):
|
||||
self.db_lock = threading.Lock()
|
||||
self.task_queue = queue.Queue(maxsize=MAX_WORKERS * 3)
|
||||
self.processed_count = 0
|
||||
self.success_count = 0
|
||||
self.fail_count = 0
|
||||
self.small_file_count = 0 # 新增:统计小文件数量
|
||||
self.last_loadtime = self.get_last_loadtime()
|
||||
self.total_rows = self.get_total_rows()
|
||||
self.start_time = time.time()
|
||||
self.skip_hosts = []
|
||||
self.local_handler = None
|
||||
self.remote_handler = None
|
||||
|
||||
# 替换 MYSQL_CONFIG 中的连接方式
|
||||
def get_db_connection(self):
|
||||
return pymysql.connect(
|
||||
host=MYSQL_CONFIG['host'],
|
||||
port=MYSQL_CONFIG['port'],
|
||||
user=MYSQL_CONFIG['user'],
|
||||
password=MYSQL_CONFIG['password'],
|
||||
database=MYSQL_CONFIG['database'],
|
||||
charset='utf8mb4',
|
||||
autocommit=False
|
||||
)
|
||||
|
||||
def get_total_rows(self):
|
||||
"""获取总记录数"""
|
||||
with self.get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT COUNT(*) FROM indeximos "
|
||||
"WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) "
|
||||
"AND es_loadtime > %s", self.last_loadtime
|
||||
)
|
||||
return cursor.fetchone()[0]
|
||||
|
||||
def get_last_loadtime(self):
|
||||
"""获取上次导出数据的时间"""
|
||||
with self.get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT config_value FROM config "
|
||||
"WHERE config_name = 'last_loadtime' "
|
||||
)
|
||||
return cursor.fetchone()[0]
|
||||
|
||||
def use_remote_selenium(self, url):
|
||||
for host in remote_host_name:
|
||||
if host in url:
|
||||
return True
|
||||
return False
|
||||
|
||||
def format_pdf_filename(self, row):
|
||||
"""格式化PDF文件名"""
|
||||
es_urltitle = row[2] or 'untitled'
|
||||
es_urltime = str(row[3]) or '19700101_000000'
|
||||
es_sitename = row[4] or 'anonymous'
|
||||
|
||||
def clean_filename(text):
|
||||
if not text:
|
||||
return ''
|
||||
invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
|
||||
for char in invalid_chars:
|
||||
text = text.replace(char, '_')
|
||||
return text.strip()[:100]
|
||||
|
||||
try:
|
||||
dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
|
||||
es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
|
||||
except:
|
||||
es_urltime_fix = '19700101_000000'
|
||||
|
||||
filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.pdf"
|
||||
return os.path.join(PDF_OUTPUT_DIR, filename)
|
||||
|
||||
def format_mhtml_filename(self, row):
|
||||
"""格式化PDF文件名"""
|
||||
es_urltitle = row[2] or 'untitled'
|
||||
es_urltime = str(row[3]) or '19700101_000000'
|
||||
es_sitename = row[4] or 'anonymous'
|
||||
|
||||
def clean_filename(text):
|
||||
if not text:
|
||||
return ''
|
||||
invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
|
||||
for char in invalid_chars:
|
||||
text = text.replace(char, '_')
|
||||
return text.strip()[:100]
|
||||
|
||||
try:
|
||||
dt = datetime.strptime(es_urltime, '%Y-%m-%d %H:%M:%S')
|
||||
es_urltime_fix = dt.strftime('%Y%m%d_%H%M%S')
|
||||
except:
|
||||
es_urltime_fix = '19700101_000000'
|
||||
|
||||
filename = f"{clean_filename(es_urltitle)}_{es_urltime_fix}_{es_sitename}.mhtml"
|
||||
return os.path.join(PDF_OUTPUT_DIR, filename)
|
||||
|
||||
def fetch_data_batch(self, offset):
|
||||
"""分页获取数据"""
|
||||
with self.get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT es_sid, es_urlname, es_urltitle, es_urltime, es_sitename, es_authors FROM indeximos "
|
||||
"WHERE (es_video IS NULL OR es_video IN ('-2', '-1')) "
|
||||
"AND es_loadtime > %s "
|
||||
"ORDER BY es_urltime LIMIT %s OFFSET %s",
|
||||
(self.last_loadtime, BATCH_SIZE, offset)
|
||||
)
|
||||
return cursor.fetchall()
|
||||
|
||||
def update_file_status(self, es_sid, status, retry=3):
|
||||
"""更新数据库状态"""
|
||||
for attempt in range(retry):
|
||||
try:
|
||||
with self.db_lock, self.get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"UPDATE indeximos SET es_video = %s WHERE es_sid = %s",
|
||||
(status, es_sid))
|
||||
conn.commit()
|
||||
return True
|
||||
except Exception as e:
|
||||
if attempt == retry - 1:
|
||||
logger.error(f"更新数据库失败(es_sid={es_sid}): {e}")
|
||||
return False
|
||||
time.sleep(1)
|
||||
|
||||
def extract_main_domain(self, url):
|
||||
extracted = tldextract.extract(url)
|
||||
# 组合注册域名(主域名)
|
||||
main_domain = f"{extracted.domain}.{extracted.suffix}"
|
||||
return main_domain
|
||||
|
||||
def download_worker(self):
|
||||
"""工作线程函数"""
|
||||
while True:
|
||||
try:
|
||||
task = self.task_queue.get(timeout=1)
|
||||
if task is None:
|
||||
break
|
||||
|
||||
row = task
|
||||
url = row[1]
|
||||
if self.extract_main_domain(url) in self.skip_hosts:
|
||||
self.small_file_count += 1
|
||||
self.processed_count += 1
|
||||
self.task_queue.task_done()
|
||||
print(f"小文件规避,暂时跳过URL:{url}")
|
||||
continue
|
||||
output_file = self.format_pdf_filename(row) # 获取格式化后的文件名
|
||||
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
||||
|
||||
# 调用下载函数
|
||||
if self.use_remote_selenium(url):
|
||||
if self.remote_handler is None:
|
||||
self.remote_handler = RemotePDFSaver()
|
||||
success = self.remote_handler.save_as_pdf(
|
||||
url=url,
|
||||
output_path=output_file,
|
||||
timeout=TIMEOUT
|
||||
)
|
||||
else:
|
||||
if self.local_handler is None:
|
||||
self.local_handler = PDFSaver()
|
||||
success = self.local_handler.save_as_pdf(
|
||||
url=url,
|
||||
output_path=output_file,
|
||||
timeout=TIMEOUT
|
||||
)
|
||||
|
||||
# 验证下载结果
|
||||
if success and os.path.exists(output_file):
|
||||
file_size = os.path.getsize(output_file)
|
||||
|
||||
if file_size >= MIN_PDF_SIZE: # 文件大小合格
|
||||
self.update_file_status(row[0], output_file)
|
||||
self.success_count += 1
|
||||
else: # 文件太小
|
||||
self.update_file_status(row[0], '-2')
|
||||
self.small_file_count += 1
|
||||
logger.warning(f"文件过小({file_size}字节): {output_file}")
|
||||
try:
|
||||
os.remove(output_file)
|
||||
self.skip_hosts.append(self.extract_main_domain(url))
|
||||
except:
|
||||
pass
|
||||
else: # 下载失败
|
||||
self.update_file_status(row[0], '0')
|
||||
self.fail_count += 1
|
||||
if os.path.exists(output_file):
|
||||
try:
|
||||
os.remove(output_file)
|
||||
except:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"下载出现异常(es_sid={row[0]}, url={url}): {str(e)}")
|
||||
self.update_file_status(row[0], '-1')
|
||||
self.fail_count += 1
|
||||
|
||||
self.processed_count += 1
|
||||
self.task_queue.task_done()
|
||||
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
def run(self):
|
||||
"""启动下载任务"""
|
||||
threads = []
|
||||
|
||||
# 创建工作线程
|
||||
for _ in range(MAX_WORKERS):
|
||||
t = threading.Thread(target=self.download_worker)
|
||||
t.start()
|
||||
threads.append(t)
|
||||
|
||||
# 使用进度条显示进度
|
||||
with tqdm(total=self.total_rows, desc="处理进度", unit="条") as pbar:
|
||||
offset = 0
|
||||
while True:
|
||||
batch = self.fetch_data_batch(offset)
|
||||
if not batch:
|
||||
break
|
||||
|
||||
for row in batch:
|
||||
self.task_queue.put(row)
|
||||
|
||||
pbar.update(len(batch))
|
||||
pbar.set_postfix({
|
||||
'成功': self.success_count,
|
||||
'失败': self.fail_count,
|
||||
'小文件': self.small_file_count,
|
||||
'速度': f"{self.processed_count / (time.time() - self.start_time):.1f}条/秒"
|
||||
})
|
||||
|
||||
offset += BATCH_SIZE
|
||||
|
||||
self.task_queue.join()
|
||||
|
||||
for _ in range(MAX_WORKERS):
|
||||
self.task_queue.put(None)
|
||||
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
total_time = time.time() - self.start_time
|
||||
print(f"\n处理完成! 总计: {self.total_rows}条")
|
||||
print(f"成功: {self.success_count}条, 失败: {self.fail_count}条, 小文件: {self.small_file_count}条")
|
||||
print(f"总耗时: {total_time:.2f}秒, 平均速度: {self.total_rows / total_time:.2f}条/秒")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
while running:
|
||||
print(f"开始处理,总记录数: {PDFDownloader().get_total_rows()}")
|
||||
downloader = PDFDownloader()
|
||||
downloader.run()
|
||||
print(f"运行完成,暂停{running_interval_seconds}秒后开始下一次运行...")
|
||||
time.sleep(running_interval_seconds)
|
||||
@ -1,141 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(),
|
||||
logging.FileHandler('mhtml_saver.log', encoding='utf-8')
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MHTMLSaver:
|
||||
def __init__(self, headless=True):
|
||||
logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
|
||||
service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
|
||||
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
|
||||
|
||||
# Chrome 选项
|
||||
chrome_options = Options()
|
||||
chrome_options.add_argument('--headless=new')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument(f'--user-agent={user_agent}')
|
||||
chrome_options.add_argument('--save-page-as-mhtml') # 启用 MHTML 支持
|
||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
chrome_options.add_argument('--lang=zh-CN')
|
||||
chrome_options.add_experimental_option('prefs', {
|
||||
'intl.accept_languages': 'zh-CN,zh,en'
|
||||
})
|
||||
# 或启动时指定(部分版本支持)
|
||||
chrome_options.add_argument('--window-size=1920,1080')
|
||||
|
||||
# 隐藏 webdriver 特征
|
||||
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
# 隐藏 "navigator.webdriver"
|
||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
|
||||
self.driver = webdriver.Chrome(service=service, options=chrome_options)
|
||||
|
||||
def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
|
||||
"""
|
||||
将网页保存为 MHTML 文件
|
||||
:param url: 目标网页 URL
|
||||
:param output_path: 输出路径(.mhtml)
|
||||
:param timeout: 页面加载超时(秒)
|
||||
:param wait_time: 页面加载后等待时间(秒),用于动态内容渲染
|
||||
:return: 保存的文件绝对路径
|
||||
"""
|
||||
if output_path is None:
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
|
||||
output_path = f"{domain}.mhtml"
|
||||
|
||||
if not output_path.lower().endswith('.mhtml'):
|
||||
output_path += '.mhtml'
|
||||
|
||||
try:
|
||||
# 设置超时
|
||||
self.driver.set_page_load_timeout(timeout)
|
||||
|
||||
# 启动后注入脚本(双重保险)
|
||||
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
delete navigator.__proto__.webdriver;
|
||||
window.navigator.permissions.query = (parameters) => {
|
||||
return parameters.name === 'notifications' ?
|
||||
Promise.resolve({ state: Notification.permission }) :
|
||||
originalQuery(parameters);
|
||||
};
|
||||
'''
|
||||
})
|
||||
# 在 driver.get() 之后设置窗口大小
|
||||
|
||||
logger.info(f"正在加载页面: {url}")
|
||||
self.driver.get(url)
|
||||
self.driver.set_window_size(1920, 1080)
|
||||
|
||||
# 等待页面动态内容加载(可调整)
|
||||
logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
|
||||
time.sleep(wait_time)
|
||||
|
||||
# ✅ 关键:调用 CDP 命令捕获 MHTML
|
||||
logger.info("正在生成 MHTML 快照...")
|
||||
result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
|
||||
|
||||
# ✅ result['data'] 是 Base64 编码的 MHTML 文本(实际是纯字符串)
|
||||
mhtml_content = result['data']
|
||||
|
||||
# ✅ 以文本模式写入(UTF-8)
|
||||
with open(output_path, 'w', encoding='utf-8', newline='') as f:
|
||||
f.write(mhtml_content)
|
||||
|
||||
# 验证文件
|
||||
file_size = os.path.getsize(output_path)
|
||||
if file_size == 0:
|
||||
raise RuntimeError("生成了空文件")
|
||||
|
||||
logger.info(f"✅ MHTML 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
|
||||
return os.path.abspath(output_path)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ 保存失败: {e}")
|
||||
raise
|
||||
|
||||
def quit(self):
|
||||
if self.driver:
|
||||
self.driver.quit()
|
||||
logger.info("浏览器已关闭")
|
||||
|
||||
|
||||
# ===== 测试入口 =====
|
||||
if __name__ == "__main__":
|
||||
# 示例 URL(可替换为你自己的)
|
||||
test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
|
||||
|
||||
saver = MHTMLSaver(headless=True)
|
||||
try:
|
||||
output_file = saver.save_as_mhtml(
|
||||
url=test_url,
|
||||
output_path="example.mhtml",
|
||||
timeout=30,
|
||||
wait_time=5
|
||||
)
|
||||
print(f"\n🎉 成功保存 MHTML 文件: {output_file}")
|
||||
except Exception as e:
|
||||
print(f"\n💥 保存失败: {e}")
|
||||
finally:
|
||||
saver.quit()
|
||||
@ -1,144 +0,0 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(),
|
||||
logging.FileHandler('pdf_saver.log', encoding='utf-8')
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFSaver:
|
||||
def __init__(self, headless=True):
|
||||
logger.info("正在初始化 Chrome WebDriver(自动匹配版本)...")
|
||||
service = ChromeService(executable_path="C:/Program Files/Python38/chromedriver.exe")
|
||||
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36 Edg/143.0.3650.75"
|
||||
|
||||
# Chrome 选项
|
||||
chrome_options = Options()
|
||||
if headless:
|
||||
chrome_options.add_argument('--headless=new')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument(f'--user-agent={user_agent}')
|
||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
chrome_options.add_argument('--lang=zh-CN')
|
||||
chrome_options.add_experimental_option('prefs', {
|
||||
'intl.accept_languages': 'zh-CN,zh,en'
|
||||
})
|
||||
chrome_options.add_argument('--window-size=1920,1080')
|
||||
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
|
||||
# 注意:PDF 打印不需要 --save-page-as-mhtml
|
||||
self.driver = webdriver.Chrome(service=service, options=chrome_options)
|
||||
|
||||
def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5, print_options=None):
|
||||
"""
|
||||
将网页保存为 PDF 文件
|
||||
:param url: 目标网页 URL
|
||||
:param output_path: 输出路径(.pdf)
|
||||
:param timeout: 页面加载超时(秒)
|
||||
:param wait_time: 页面加载后等待时间(秒),用于动态内容渲染
|
||||
:param print_options: PDF 打印选项(可选),参考 https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
|
||||
:return: 保存的文件绝对路径
|
||||
"""
|
||||
if output_path is None:
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.replace('www.', '').split('.')[0] or 'page'
|
||||
output_path = f"{domain}.pdf"
|
||||
|
||||
if not output_path.lower().endswith('.pdf'):
|
||||
output_path += '.pdf'
|
||||
|
||||
# 默认打印选项(可按需调整)
|
||||
default_print_options = {
|
||||
'landscape': False,
|
||||
'displayHeaderFooter': False,
|
||||
'printBackground': True,
|
||||
'preferCSSPageSize': True,
|
||||
'paperWidth': 8.27, # A4 宽(英寸)
|
||||
'paperHeight': 11.69, # A4 高(英寸)
|
||||
}
|
||||
if print_options:
|
||||
default_print_options.update(print_options)
|
||||
|
||||
try:
|
||||
self.driver.set_page_load_timeout(timeout)
|
||||
|
||||
# 隐藏自动化特征
|
||||
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
delete navigator.__proto__.webdriver;
|
||||
window.navigator.permissions.query = (parameters) => {
|
||||
return parameters.name === 'notifications' ?
|
||||
Promise.resolve({ state: Notification.permission }) :
|
||||
originalQuery(parameters);
|
||||
};
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
'''
|
||||
})
|
||||
|
||||
logger.info(f"正在加载页面: {url}")
|
||||
self.driver.get(url)
|
||||
self.driver.set_window_size(1920, 1080)
|
||||
|
||||
logger.info(f"等待 {wait_time} 秒以确保页面完全渲染...")
|
||||
time.sleep(wait_time)
|
||||
|
||||
logger.info("正在生成 PDF...")
|
||||
result = self.driver.execute_cdp_cmd('Page.printToPDF', default_print_options)
|
||||
|
||||
# result['data'] 是 Base64 编码的 PDF
|
||||
pdf_data = base64.b64decode(result['data'])
|
||||
|
||||
with open(output_path, 'wb') as f:
|
||||
f.write(pdf_data)
|
||||
|
||||
file_size = os.path.getsize(output_path)
|
||||
if file_size == 0:
|
||||
raise RuntimeError("生成了空文件")
|
||||
|
||||
logger.info(f"✅ PDF 保存成功: {os.path.abspath(output_path)} (大小: {file_size} 字节)")
|
||||
return os.path.abspath(output_path)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ 保存失败: {e}")
|
||||
raise
|
||||
|
||||
def quit(self):
|
||||
if self.driver:
|
||||
self.driver.quit()
|
||||
logger.info("浏览器已关闭")
|
||||
|
||||
|
||||
# ===== 测试入口 =====
|
||||
if __name__ == "__main__":
|
||||
test_url = "https://cn.ultraiso.net/jiaocheng/ke-lu-guang-pan.html"
|
||||
|
||||
saver = PDFSaver(headless=True)
|
||||
try:
|
||||
output_file = saver.save_as_pdf(
|
||||
url=test_url,
|
||||
output_path="example.pdf",
|
||||
timeout=30,
|
||||
wait_time=5
|
||||
)
|
||||
print(f"\n🎉 成功保存 PDF 文件: {output_file}")
|
||||
except Exception as e:
|
||||
print(f"\n💥 保存失败: {e}")
|
||||
finally:
|
||||
saver.quit()
|
||||
@ -1,190 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import (
|
||||
WebDriverException,
|
||||
TimeoutException,
|
||||
SessionNotCreatedException,
|
||||
InvalidSessionIdException
|
||||
)
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RemoteMHTMLSaver:
|
||||
def __init__(
|
||||
self,
|
||||
remote_url="http://144.34.185.108:28098/wd/hub",
|
||||
headless=True,
|
||||
max_retries=3,
|
||||
retry_delay=2
|
||||
):
|
||||
"""
|
||||
初始化远程 MHTML 保存器(支持自动重建 session)
|
||||
:param remote_url: 远程 Selenium 地址
|
||||
:param headless: 是否无头
|
||||
:param max_retries: 单次操作最大重试次数
|
||||
:param retry_delay: 重试前等待时间(秒)
|
||||
"""
|
||||
self.remote_url = remote_url
|
||||
self.headless = headless
|
||||
self.max_retries = max_retries
|
||||
self.retry_delay = retry_delay
|
||||
self.driver = None
|
||||
self._init_driver()
|
||||
|
||||
def _build_chrome_options(self):
|
||||
"""构建 Chrome 选项(可复用)"""
|
||||
chrome_options = Options()
|
||||
if self.headless:
|
||||
chrome_options.add_argument('--headless=new')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--window-size=1920,1080')
|
||||
chrome_options.add_argument(
|
||||
"--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
|
||||
)
|
||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
return chrome_options
|
||||
|
||||
def _init_driver(self):
|
||||
"""初始化或重新初始化 WebDriver"""
|
||||
if self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
except Exception:
|
||||
pass # 忽略关闭失败
|
||||
|
||||
logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
|
||||
for attempt in range(3):
|
||||
try:
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor=self.remote_url,
|
||||
options=self._build_chrome_options()
|
||||
)
|
||||
# 注入反检测脚本
|
||||
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
delete navigator.__proto__.webdriver;
|
||||
window.chrome = { runtime: {} };
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['zh-CN', 'zh', 'en']
|
||||
});
|
||||
'''
|
||||
})
|
||||
logger.info("✅ 远程 WebDriver 会话创建成功")
|
||||
return
|
||||
except Exception as e:
|
||||
logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
|
||||
if attempt < 2:
|
||||
time.sleep(2)
|
||||
else:
|
||||
raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")
|
||||
|
||||
def save_as_mhtml(self, url, output_path=None, timeout=30, wait_time=5):
|
||||
"""
|
||||
保存网页为 MHTML,支持自动重试和 session 重建
|
||||
"""
|
||||
if output_path is None:
|
||||
domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
|
||||
output_path = f"{domain}.mhtml"
|
||||
if not output_path.lower().endswith('.mhtml'):
|
||||
output_path += '.mhtml'
|
||||
|
||||
last_exception = None
|
||||
|
||||
for retry in range(self.max_retries + 1):
|
||||
try:
|
||||
# 检查 driver 是否有效
|
||||
if not self.driver:
|
||||
self._init_driver()
|
||||
|
||||
self.driver.set_page_load_timeout(timeout)
|
||||
logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
|
||||
self.driver.get(url)
|
||||
time.sleep(wait_time)
|
||||
|
||||
logger.info("生成 MHTML 快照...")
|
||||
result = self.driver.execute_cdp_cmd('Page.captureSnapshot', {'format': 'mhtml'})
|
||||
mhtml_content = result['data']
|
||||
|
||||
# 写入本地文件
|
||||
with open(output_path, 'w', encoding='utf-8', newline='') as f:
|
||||
f.write(mhtml_content)
|
||||
|
||||
file_size = os.path.getsize(output_path)
|
||||
if file_size == 0:
|
||||
raise RuntimeError("生成了空文件")
|
||||
|
||||
logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
|
||||
return os.path.abspath(output_path)
|
||||
|
||||
except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
|
||||
last_exception = e
|
||||
logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
|
||||
if retry < self.max_retries:
|
||||
logger.info("正在重建 WebDriver 会话...")
|
||||
self._init_driver()
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
logger.error("达到最大重试次数,放弃")
|
||||
break
|
||||
|
||||
except TimeoutException as e:
|
||||
last_exception = e
|
||||
logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
|
||||
if retry < self.max_retries:
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
logger.error(f"未知错误 (retry {retry + 1}): {e}")
|
||||
break # 非 WebDriver 错误,不重试
|
||||
|
||||
# 如果所有重试失败
|
||||
if os.path.exists(output_path):
|
||||
try:
|
||||
os.remove(output_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
raise RuntimeError(f"保存失败({type(last_exception).__name__}): {last_exception}")
|
||||
|
||||
def quit(self):
|
||||
"""显式关闭浏览器"""
|
||||
if self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
logger.info("WebDriver 会话已关闭")
|
||||
except Exception:
|
||||
pass
|
||||
self.driver = None
|
||||
|
||||
def __del__(self):
|
||||
self.quit()
|
||||
|
||||
|
||||
# ===== 测试 =====
|
||||
if __name__ == "__main__":
|
||||
saver = RemoteMHTMLSaver(
|
||||
remote_url="http://144.34.185.108:28098/wd/hub", # ← 替换为你的云服务器公网 IP
|
||||
headless=True
|
||||
)
|
||||
try:
|
||||
saver.save_as_mhtml(
|
||||
url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
|
||||
output_path="remote_example2.mhtml"
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"❌ 失败: {e}")
|
||||
|
||||
saver.quit()
|
||||
@ -1,201 +0,0 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import (
|
||||
WebDriverException,
|
||||
TimeoutException,
|
||||
SessionNotCreatedException,
|
||||
InvalidSessionIdException
|
||||
)
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RemotePDFSaver:
|
||||
def __init__(
|
||||
self,
|
||||
remote_url="http://144.34.185.108:28098/wd/hub",
|
||||
headless=True,
|
||||
max_retries=3,
|
||||
retry_delay=2,
|
||||
print_options=None
|
||||
):
|
||||
"""
|
||||
初始化远程 PDF 保存器(支持自动重建 session)
|
||||
:param remote_url: 远程 Selenium 地址
|
||||
:param headless: 是否无头模式
|
||||
:param max_retries: 单次操作最大重试次数
|
||||
:param retry_delay: 重试前等待时间(秒)
|
||||
:param print_options: PDF 打印选项(参考 DevTools Protocol)
|
||||
"""
|
||||
self.remote_url = remote_url
|
||||
self.headless = headless
|
||||
self.max_retries = max_retries
|
||||
self.retry_delay = retry_delay
|
||||
self.print_options = print_options or {
|
||||
'landscape': False,
|
||||
'displayHeaderFooter': False,
|
||||
'printBackground': True,
|
||||
'preferCSSPageSize': True,
|
||||
'paperWidth': 8.27, # A4 宽(英寸)
|
||||
'paperHeight': 11.69, # A4 高(英寸)
|
||||
}
|
||||
self.driver = None
|
||||
self._init_driver()
|
||||
|
||||
def _build_chrome_options(self):
|
||||
"""构建 Chrome 选项(可复用)"""
|
||||
chrome_options = Options()
|
||||
if self.headless:
|
||||
chrome_options.add_argument('--headless=new')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--window-size=1920,1080')
|
||||
chrome_options.add_argument(
|
||||
"--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.3650.75 Safari/537.36"
|
||||
)
|
||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
return chrome_options
|
||||
|
||||
def _init_driver(self):
|
||||
"""初始化或重新初始化 WebDriver"""
|
||||
if self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
except Exception:
|
||||
pass # 忽略关闭失败
|
||||
|
||||
logger.info(f"正在创建新的远程 WebDriver 会话: {self.remote_url}")
|
||||
for attempt in range(3):
|
||||
try:
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor=self.remote_url,
|
||||
options=self._build_chrome_options()
|
||||
)
|
||||
# 注入反检测脚本
|
||||
self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
delete navigator.__proto__.webdriver;
|
||||
window.chrome = { runtime: {} };
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['zh-CN', 'zh', 'en']
|
||||
});
|
||||
'''
|
||||
})
|
||||
logger.info("✅ 远程 WebDriver 会话创建成功")
|
||||
return
|
||||
except Exception as e:
|
||||
logger.warning(f"创建 WebDriver 失败 (尝试 {attempt + 1}/3): {e}")
|
||||
if attempt < 2:
|
||||
time.sleep(2)
|
||||
else:
|
||||
raise RuntimeError(f"无法连接到远程 Selenium 服务: {e}")
|
||||
|
||||
def save_as_pdf(self, url, output_path=None, timeout=30, wait_time=5):
|
||||
"""
|
||||
保存网页为 PDF,支持自动重试和 session 重建
|
||||
"""
|
||||
if output_path is None:
|
||||
domain = urlparse(url).netloc.replace('www.', '').split('.')[0] or 'page'
|
||||
output_path = f"{domain}.pdf"
|
||||
if not output_path.lower().endswith('.pdf'):
|
||||
output_path += '.pdf'
|
||||
|
||||
last_exception = None
|
||||
|
||||
for retry in range(self.max_retries + 1):
|
||||
try:
|
||||
# 检查 driver 是否有效
|
||||
if not self.driver:
|
||||
self._init_driver()
|
||||
|
||||
self.driver.set_page_load_timeout(timeout)
|
||||
logger.info(f"[{retry + 1}/{self.max_retries + 1}] 加载页面: {url}")
|
||||
self.driver.get(url)
|
||||
time.sleep(wait_time)
|
||||
|
||||
logger.info("生成 PDF...")
|
||||
result = self.driver.execute_cdp_cmd('Page.printToPDF', self.print_options)
|
||||
pdf_data = base64.b64decode(result['data'])
|
||||
|
||||
# 写入本地 PDF 文件(二进制)
|
||||
with open(output_path, 'wb') as f:
|
||||
f.write(pdf_data)
|
||||
|
||||
file_size = os.path.getsize(output_path)
|
||||
if file_size == 0:
|
||||
raise RuntimeError("生成了空文件")
|
||||
|
||||
logger.info(f"✅ 保存成功: {os.path.abspath(output_path)} ({file_size} 字节)")
|
||||
return os.path.abspath(output_path)
|
||||
|
||||
except (WebDriverException, InvalidSessionIdException, SessionNotCreatedException) as e:
|
||||
last_exception = e
|
||||
logger.warning(f"WebDriver 异常 (retry {retry + 1}): {e}")
|
||||
if retry < self.max_retries:
|
||||
logger.info("正在重建 WebDriver 会话...")
|
||||
self._init_driver()
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
logger.error("达到最大重试次数,放弃")
|
||||
break
|
||||
|
||||
except TimeoutException as e:
|
||||
last_exception = e
|
||||
logger.warning(f"页面加载超时 (retry {retry + 1}): {e}")
|
||||
if retry < self.max_retries:
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
logger.error(f"未知错误 (retry {retry + 1}): {e}")
|
||||
break # 非 WebDriver 错误,不重试
|
||||
|
||||
# 清理失败生成的空文件
|
||||
if os.path.exists(output_path):
|
||||
try:
|
||||
os.remove(output_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
raise RuntimeError(f"保存失败({type(last_exception).__name__}): {last_exception}")
|
||||
|
||||
def quit(self):
|
||||
"""显式关闭浏览器"""
|
||||
if self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
logger.info("WebDriver 会话已关闭")
|
||||
except Exception:
|
||||
pass
|
||||
self.driver = None
|
||||
|
||||
def __del__(self):
|
||||
self.quit()
|
||||
|
||||
|
||||
# ===== 测试 =====
|
||||
if __name__ == "__main__":
|
||||
saver = RemotePDFSaver(
|
||||
remote_url="http://144.34.185.108:28098/wd/hub", # ← 替换为你的云服务器公网 IP
|
||||
headless=True
|
||||
)
|
||||
try:
|
||||
saver.save_as_pdf(
|
||||
url="https://www.epochtimes.com/gb/25/12/22/n14660274.htm",
|
||||
output_path="remote_example2.pdf"
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"❌ 失败: {e}")
|
||||
|
||||
saver.quit()
|
||||
Loading…
x
Reference in New Issue
Block a user