diff --git a/dsp/dsp.iml b/dsp/dsp.iml index 36aa587..99a49d3 100644 --- a/dsp/dsp.iml +++ b/dsp/dsp.iml @@ -1,240 +1,13 @@ - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dsp/src/main/java/com/jsc/dsp/controller/ExportController.java b/dsp/src/main/java/com/jsc/dsp/controller/ExportController.java index d1c8ba5..5f90df6 100644 --- a/dsp/src/main/java/com/jsc/dsp/controller/ExportController.java +++ b/dsp/src/main/java/com/jsc/dsp/controller/ExportController.java @@ -44,4 +44,14 @@ public class ExportController { } } + @PostMapping("/triggerTwitterTask") + public ReturnT triggerTwitterTask() { + try { + new Thread(() -> autoExportAndUpload.exportTwitterDataAndUpload()).start(); + return new ReturnT<>(200, "", ""); + } catch (Exception e) { + return new ReturnT<>(500, e.getMessage(), ""); + } + } + } diff --git a/dsp/src/main/java/com/jsc/dsp/dao/EsDataTwitterRepository.java b/dsp/src/main/java/com/jsc/dsp/dao/EsDataTwitterRepository.java new file mode 100644 index 0000000..2b61292 --- /dev/null +++ b/dsp/src/main/java/com/jsc/dsp/dao/EsDataTwitterRepository.java @@ -0,0 +1,12 @@ +package com.jsc.dsp.dao; + +import com.jsc.dsp.model.EsDataTwitterView; +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; + +import java.util.List; + +@Repository +public interface EsDataTwitterRepository extends JpaRepository { + List findAllByEsLoadtimeAfter(String loadtime); +} diff --git a/dsp/src/main/java/com/jsc/dsp/model/EsDataTwitterView.java b/dsp/src/main/java/com/jsc/dsp/model/EsDataTwitterView.java new file mode 100644 index 0000000..13d5199 --- /dev/null +++ b/dsp/src/main/java/com/jsc/dsp/model/EsDataTwitterView.java @@ -0,0 +1,54 @@ +package com.jsc.dsp.model; + +import lombok.Data; + +import javax.persistence.Entity; +import javax.persistence.Id; +import javax.persistence.Table; + +@Entity +@Data +@Table(name = "es_data_twitter") +public class EsDataTwitterView { + + @Id + private String esUrltime; + + private String esAuthors; + + private String esCarriertype; + + private String esSitename; + + private String esUrlcontent; + + private String esUrlcontentTranslate; + + private String esUrlname; + + private String esUrltitle; + + private String esUrltitleTranslate; + + private String esVideo; + + private String esExtname; + + private String esIsrepost; + + private String esCatalog1; + + private String esForwardcount; + + private String esLikecount; + + private String esCommentcount; + + private String esHkey; + + private String esUrlimage; + + private String esUserid; + + private String esLoadtime; +} \ No newline at end of file diff --git a/dsp/src/main/java/com/jsc/dsp/service/StorageService.java b/dsp/src/main/java/com/jsc/dsp/service/StorageService.java index 0a17473..968d548 100644 --- a/dsp/src/main/java/com/jsc/dsp/service/StorageService.java +++ b/dsp/src/main/java/com/jsc/dsp/service/StorageService.java @@ -48,6 +48,9 @@ public class StorageService extends StreamService { @Value("${custom.websiteWhiteList}") String websiteWhiteListString; + @Value("${custom.twitterWhiteList}") + String twitterWhiteListString; + @Resource DatabaseConnector databaseConnector; @@ -66,6 +69,8 @@ public class StorageService extends StreamService { @StreamListener(StorageBinding.STORAGE_PIPELINE_IN) public void receiveMessage(Object payload) { List websiteWhiteList = Arrays.asList(websiteWhiteListString.split(";")); + List twitterWhiteList = Arrays.asList(twitterWhiteListString.split(";")); + String tempString; try { tempString = new String(base64.decode(payload.toString()), StandardCharsets.UTF_8); @@ -111,7 +116,7 @@ public class StorageService extends StreamService { } } } - // 只导出目标站点的数据 + // 只导出白名单站点的数据 if (websiteWhiteList.contains(indeximos.getEs_sitename())) { logger.info("开始处理站点【" + indeximos.getEs_sitename() + "】的数据入库流程"); String uuid = UUID.randomUUID().toString().replaceAll("-", ""); @@ -142,6 +147,34 @@ public class StorageService extends StreamService { } dbStorageItems.add(indeximos); } + if (indeximos.getEs_carriertype().equals("media") && twitterWhiteList.contains(indeximos.getEs_authors())) { + logger.info("开始处理推特用户【" + indeximos.getEs_authors() + "】的数据入库流程"); + String uuid = UUID.randomUUID().toString().replaceAll("-", ""); + String es_urlname = indeximos.getEs_urlname(); + if (!es_urlname.isEmpty()) { + // 根据urlname生成固定的UUID,避免重复入库相同的文章 + UUID _uuid = UUID.nameUUIDFromBytes(es_urlname.getBytes()); + uuid = _uuid.toString().replaceAll("-", ""); + } + indeximos.setEs_sid(uuid); + indeximos.setEs_loadtime(StringUtils.TimestampToStringDate(System.currentTimeMillis())); + builder.setEsSid(uuid); + for (Field f : indeximos.getClass().getDeclaredFields()) { + f.setAccessible(true); + //判断字段是否为空,并且对象属性中的基本都会转为对象类型来判断 + if (f.get(indeximos) == null) { + String fieldType = databaseConnector.getFieldType(Indeximos.class, f.getName()); + if (fieldType.contains("Float")) { + f.set(indeximos, 0.0f); + } else { + if (!dateFields.contains(f.getName())) { + f.set(indeximos, ""); + } + } + } + } + dbStorageItems.add(indeximos); + } } if (dbStorageItems.size() > 0) { databaseConnector.insertIntoDB(dbStorageItems); diff --git a/dsp/src/main/java/com/jsc/dsp/utils/AutoExportAndUpload.java b/dsp/src/main/java/com/jsc/dsp/utils/AutoExportAndUpload.java index 01a3708..2e7eb71 100644 --- a/dsp/src/main/java/com/jsc/dsp/utils/AutoExportAndUpload.java +++ b/dsp/src/main/java/com/jsc/dsp/utils/AutoExportAndUpload.java @@ -77,6 +77,24 @@ public class AutoExportAndUpload { zipAndUploadDirectory(excelOutputPath, zipFileFullName, remoteZipPath); } + /** + * 每周一、三、五的早上8点,执行导出数据的任务 + */ + @Scheduled(cron = "${custom.exportTaskSchedule}") + public void exportTwitterDataAndUpload() { + logger.info("开始导出twitter excel和pdf数据..."); + String twitterLastLoadTime = configService.getConfigValueByName("twitter_last_loadtime"); + String currentLoadTime = StringUtils.DateToString(new Date()); + String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd")); + databaseConnector.twitterToXlsx(twitterLastLoadTime); + copyPagesFiles(twitterLastLoadTime, currentLoadTime); + configService.setConfigValueByName("twitter_last_loadtime", currentLoadTime); + String zipFileName = "data_twitter-" + timestamp + "-001.zip"; + String zipFileFullName = backupFilePath + File.separator + zipFileName; + String remoteZipPath = ftpUploadPath + "/" + zipFileName; + zipAndUploadDirectory(excelOutputPath, zipFileFullName, remoteZipPath); + } + /** * 将指定目录打包成 ZIP 文件(保存到指定本地路径),并上传到 FTP 服务器 * diff --git a/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java b/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java index 816c6c6..482b3f1 100644 --- a/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java +++ b/dsp/src/main/java/com/jsc/dsp/utils/DatabaseConnector.java @@ -1,8 +1,11 @@ package com.jsc.dsp.utils; +import com.fasterxml.jackson.databind.ObjectMapper; import com.jsc.dsp.dao.EsDataNewsRepository; +import com.jsc.dsp.dao.EsDataTwitterRepository; import com.jsc.dsp.dao.IndeximosRepository; import com.jsc.dsp.model.EsDataNewsView; +import com.jsc.dsp.model.EsDataTwitterView; import com.jsc.dsp.model.Indeximos; import org.apache.poi.ss.usermodel.*; import org.apache.poi.xssf.usermodel.XSSFWorkbook; @@ -23,6 +26,8 @@ import java.nio.file.Paths; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.List; +import com.fasterxml.jackson.databind.JsonNode; +import java.util.ArrayList; @Service public class DatabaseConnector { @@ -32,10 +37,14 @@ public class DatabaseConnector { @Resource EsDataNewsRepository esDataNewsRepository; + @Resource + EsDataTwitterRepository esDataTwitterRepository; @Value("${custom.excelOutputPath}") String excelOutputPath; + private static final ObjectMapper objectMapper = new ObjectMapper(); + private final Logger logger = LoggerFactory.getLogger(this.getClass().getName()); public void insertIntoDB(List objectList) { @@ -78,7 +87,7 @@ public class DatabaseConnector { List esDataNewsViewList = esDataNewsRepository.findAllByEsLoadtimeAfter(startTime); if (!esDataNewsViewList.isEmpty()) { - Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields(); + Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields(); // 通过反射获取类的成员信息,并使用这些类成员为后续生成的excel表头做准备 try (Workbook workbook = new XSSFWorkbook(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { @@ -86,7 +95,7 @@ public class DatabaseConnector { // 创建表头 Row headerRow = sheet.createRow(0); - CellStyle headerStyle = workbook.createCellStyle(); + CellStyle headerStyle = workbook.createCellStyle(); // 创建单元格 headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex()); headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND); @@ -157,9 +166,186 @@ public class DatabaseConnector { } } + + public void twitterToXlsx(String startTime) { + try { + Path dirPath = Paths.get(excelOutputPath); + if (!Files.exists(dirPath)) { + Files.createDirectories(dirPath); + } + String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd")); + String fileName = "data_twitter-" + timestamp + "-001.xlsx"; + Path filePath = dirPath.resolve(fileName); + + List esDataNewsViewList = esDataTwitterRepository.findAllByEsLoadtimeAfter(startTime); + if (!esDataNewsViewList.isEmpty()) { + Field[] fields = esDataNewsViewList.get(0).getClass().getDeclaredFields(); // 通过反射获取类的成员信息,并使用这些类成员为后续生成的excel表头做准备 + try (Workbook workbook = new XSSFWorkbook(); + ByteArrayOutputStream out = new ByteArrayOutputStream()) { + + Sheet sheet = workbook.createSheet("data"); + + // 创建表头 + Row headerRow = sheet.createRow(0); + CellStyle headerStyle = workbook.createCellStyle(); // 创建单元格 + headerStyle.setFillForegroundColor(IndexedColors.LIGHT_BLUE.getIndex()); + headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND); + + for (int i = 0; i < fields.length; i++) { + Cell cell = headerRow.createCell(i); + String formField = formField(fields[i]); + cell.setCellValue(formField); + cell.setCellStyle(headerStyle); + } + // 填充数据 + int rowNum = 1; + for (EsDataTwitterView item : esDataNewsViewList) { + Row row = sheet.createRow(rowNum++); + logger.debug("导出excel第" + rowNum + "行"); + // 0: esUrltime + row.createCell(0).setCellValue(item.getEsUrltime() != null ? item.getEsUrltime() : ""); + + // 1: esAuthors + row.createCell(1).setCellValue(item.getEsAuthors() != null ? item.getEsAuthors() : ""); + + // 2: esCarriertype + row.createCell(2).setCellValue(item.getEsCarriertype() != null ? item.getEsCarriertype() : ""); + + // 3: esSitename + row.createCell(3).setCellValue(item.getEsSitename() != null ? item.getEsSitename() : ""); + + // 4: esUrlcontent + String esUrlcontent = item.getEsUrlcontent(); + if (esUrlcontent != null && esUrlcontent.length() > 10000) { + row.createCell(4).setCellValue(esUrlcontent.substring(0, 10000)); + } else { + row.createCell(4).setCellValue(esUrlcontent != null ? esUrlcontent : ""); + } + + // 5: esUrlcontentTranslate + String esUrlcontentTranslate = item.getEsUrlcontentTranslate(); + if (esUrlcontentTranslate != null && esUrlcontentTranslate.length() > 10000) { + row.createCell(5).setCellValue(esUrlcontentTranslate.substring(0, 10000)); + } else { + row.createCell(5).setCellValue(esUrlcontentTranslate != null ? esUrlcontentTranslate : ""); + } + + // 6: esUrlname + row.createCell(6).setCellValue(item.getEsUrlname() != null ? item.getEsUrlname() : ""); + + // 7: esUrltitle + String esUrltitle = item.getEsUrltitle(); + if (esUrltitle != null && esUrltitle.length() > 10000) { + row.createCell(7).setCellValue(esUrltitle.substring(0, 10000)); + } else { + row.createCell(7).setCellValue(esUrltitle != null ? esUrltitle : ""); + } + + // 8: es_urltitle_translate + String esUrltitleTranslate = item.getEsUrltitleTranslate(); + if (esUrltitleTranslate != null && esUrltitleTranslate.length() > 10000) { + row.createCell(7).setCellValue(esUrltitleTranslate.substring(0, 10000)); + } else { + row.createCell(7).setCellValue(esUrltitleTranslate != null ? esUrltitleTranslate : ""); + } + + // 9: esVideo + String videoFilenames = extractFilenamesFromJsonArray(item.getEsVideo()); + row.createCell(9).setCellValue(videoFilenames); + + // 10: esExtname + row.createCell(10).setCellValue(item.getEsExtname() != null ? item.getEsExtname() : ""); + + // 11: esIsrepost + row.createCell(11).setCellValue(item.getEsIsrepost() != null ? item.getEsIsrepost() : ""); + + // 12: esCatalog1 + row.createCell(12).setCellValue(item.getEsCatalog1() != null ? item.getEsCatalog1() : ""); + + // 13: esForwardcount + row.createCell(13).setCellValue(item.getEsForwardcount() != null ? item.getEsForwardcount() : ""); + + // 14: esLikecount + row.createCell(14).setCellValue(item.getEsLikecount() != null ? item.getEsLikecount() : ""); + + // 15: esCommentcount + row.createCell(15).setCellValue(item.getEsCommentcount() != null ? item.getEsCommentcount() : ""); + + // 16: esHkey + row.createCell(16).setCellValue(item.getEsHkey() != null ? item.getEsHkey() : ""); + + // 17: esUrlimage + String imageFilenames = extractFilenamesFromJsonArray(item.getEsUrlimage()); + row.createCell(17).setCellValue(imageFilenames); + + // 18: esUserid + row.createCell(18).setCellValue(item.getEsUserid() != null ? item.getEsUserid() : ""); + + // 19: esLoadtime + row.createCell(19).setCellValue(item.getEsLoadtime() != null ? item.getEsLoadtime() : ""); + + } + logger.info("完成excel数据写入,共" + rowNum + "行"); + + // 自动调整列宽 + for (int i = 0; i < fields.length; i++) { + sheet.autoSizeColumn(i); + } + + workbook.write(out); + + try (FileOutputStream fos = new FileOutputStream(filePath.toFile())) { + workbook.write(fos); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + logger.info("excel导出完成!"); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private String formField(Field field) { String fieldString = field.getName(); return StringUtils.camelToSnake(fieldString); } + + + public String extractFilenamesFromJsonArray(String jsonStr) { + if (jsonStr == null || jsonStr.trim().isEmpty()) { + return ""; + } + + try { + JsonNode array = objectMapper.readTree(jsonStr.replace("'", "\"").trim()); + if (!array.isArray()) { + return ""; + } + + List filenames = new ArrayList<>(); + for (JsonNode node : array) { + if (node.has("path")) { + String url = node.get("path").asText().trim(); + if (!url.isEmpty()) { + // 提取文件名(支持带参数的 URL) + String filename = url.split("\\?")[0]; // 去掉 ? 后的参数 + filename = filename.substring(filename.lastIndexOf('/') + 1); + if (!filename.isEmpty()) { + filenames.add(filename); + } + } + } + } + + return String.join(";", filenames); + } catch (Exception e) { + // 如果解析失败,返回空或原始内容(根据需求) + return ""; // 或者 return jsonStr; 用于调试 + } + } + } diff --git a/dsp/src/main/resources/application.yml b/dsp/src/main/resources/application.yml index fe43a5a..711ad4f 100644 --- a/dsp/src/main/resources/application.yml +++ b/dsp/src/main/resources/application.yml @@ -88,6 +88,7 @@ custom: socialQueryAPI: http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy=id&shuffleResult=false socialUpdateAPI: http://47.115.228.133:28081/api/open/target/social/update websiteWhiteList: 能源界(国内信息);能源界(国际信息);中国能源新闻网;新华能源网;中国能源网(能源战略);中国农网(三农要闻);中国经济网(三农经济);中华粮网(粮食安全);美国之音(中国版面);美国之音(中美关系);美国之音(台海两岸版面);美国之音(港澳版面);看中国(看大陆版面);看中国(重点新闻);德国之声(中国报道);纽约时报中文网(中国版面);大纪元(一周大陆新闻);EnergyNow;联合国粮农组织;路透社(中国版面) + twitterWhiteList: nytchinese;YesterdayBigcat;takaichi_sanae;yonhapcn;VOAChinese;ChineseWSJ;whyyoutouzhele;Jaemyung_Lee excelOutputPath: D:/data/output/upload backupFilePath: D:/data/output/backup pagesOutputPath: D:/data/output/pdf