init
This commit is contained in:
commit
98d7406a99
158
knows-java/pom.xml
Normal file
158
knows-java/pom.xml
Normal file
@ -0,0 +1,158 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-parent</artifactId>
|
||||
<version>3.3.2</version>
|
||||
<relativePath/> <!-- lookup parent from repository -->
|
||||
</parent>
|
||||
<groupId>com.zhych</groupId>
|
||||
<artifactId>knows</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
<name>embeddings</name>
|
||||
<description>embeddings</description>
|
||||
|
||||
<properties>
|
||||
<java.version>17</java.version>
|
||||
</properties>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-web</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.12.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
<artifactId>fastjson</artifactId>
|
||||
<version>2.0.15</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>cn.hutool</groupId>
|
||||
<artifactId>hutool-all</artifactId>
|
||||
<version>5.8.25</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.squareup.okhttp3</groupId>
|
||||
<artifactId>okhttp</artifactId>
|
||||
<version>5.0.0-alpha.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.5.13</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.elasticsearch.client</groupId>
|
||||
<artifactId>elasticsearch-rest-high-level-client</artifactId>
|
||||
<version>7.17.23</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>co.elastic.clients</groupId>
|
||||
<artifactId>elasticsearch-java</artifactId>
|
||||
<version>8.13.4</version>
|
||||
</dependency>
|
||||
<!-- Version intentionally omitted: spring-boot-starter-parent manages every Jackson
     module as one set. Pinning only jackson-databind (previously 2.15.2) mixes it with
     the parent-managed jackson-core / jackson-annotations / datatype modules. -->
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
<artifactId>dashscope-sdk-java</artifactId>
|
||||
<version>2.8.3</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>2.0.24</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.tess4j</groupId>
|
||||
<artifactId>tess4j</artifactId>
|
||||
<version>5.7.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.bytedeco</groupId>
|
||||
<artifactId>opencv-platform</artifactId>
|
||||
<version>4.7.0-1.5.9</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Apache POI for Word documents -->
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi</artifactId>
|
||||
<version>5.2.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-ooxml</artifactId>
|
||||
<version>5.2.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-scratchpad</artifactId>
|
||||
<version>5.2.3</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
<includes>
|
||||
<include>**/*</include>
|
||||
</includes>
|
||||
</resource>
|
||||
</resources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<excludes>
|
||||
<exclude>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>3.0.0-M5</version>
|
||||
<configuration>
|
||||
<argLine>
|
||||
-Xmx2048m
|
||||
-Djava.library.path=${project.basedir}/lib/opencv
|
||||
</argLine>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
15
knows-java/src/main/java/cn/luckday/Application.java
Normal file
15
knows-java/src/main/java/cn/luckday/Application.java
Normal file
@ -0,0 +1,15 @@
|
||||
package cn.luckday;
|
||||
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
|
||||
@SpringBootApplication
|
||||
@ComponentScan(value = {"cn.luckday.*"})
|
||||
public class Application {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(Application.class, args);
|
||||
}
|
||||
|
||||
}
|
21
knows-java/src/main/java/cn/luckday/bean/KnowsIndex.java
Normal file
21
knows-java/src/main/java/cn/luckday/bean/KnowsIndex.java
Normal file
@ -0,0 +1,21 @@
|
||||
package cn.luckday.bean;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class KnowsIndex {
|
||||
|
||||
private String id;
|
||||
|
||||
private String file_name;
|
||||
|
||||
private String file_path;
|
||||
|
||||
private String file_type;
|
||||
|
||||
private String file_size;
|
||||
|
||||
private String content;
|
||||
|
||||
private double[] content_vec;
|
||||
}
|
13
knows-java/src/main/java/cn/luckday/bean/SearchResult.java
Normal file
13
knows-java/src/main/java/cn/luckday/bean/SearchResult.java
Normal file
@ -0,0 +1,13 @@
|
||||
package cn.luckday.bean;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class SearchResult {
|
||||
private KnowsIndex knowsIndex;
|
||||
private Double score;
|
||||
}
|
@ -0,0 +1,112 @@
|
||||
package cn.luckday.controller;
|
||||
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
import cn.luckday.llm.QwenClient;
|
||||
import com.alibaba.dashscope.aigc.generation.GenerationResult;
|
||||
import com.alibaba.dashscope.exception.InputRequiredException;
|
||||
import com.alibaba.dashscope.exception.NoApiKeyException;
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import cn.luckday.bean.SearchResult;
|
||||
import cn.luckday.embed.EmbedClient;
|
||||
import cn.luckday.embed.ReRankClient;
|
||||
import cn.luckday.llm.OllamaClient;
|
||||
import cn.luckday.service.EsDocumentService;
|
||||
import jakarta.annotation.Resource;
|
||||
import jakarta.servlet.http.HttpServletResponse;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Slf4j
|
||||
@RestController
|
||||
@RequestMapping("/knows")
|
||||
public class KnowsController {
|
||||
|
||||
@Value("${embedding.uri}")
|
||||
private String embeddingUri;
|
||||
|
||||
@Value("${embedding.api-key}")
|
||||
private String embeddingApiKey;
|
||||
|
||||
@Value("${re-rank.uri}")
|
||||
private String ReRankUri;
|
||||
|
||||
@Value("${re-rank.api-key}")
|
||||
private String ReRankApiKey;
|
||||
|
||||
@Value("${oll.uri}")
|
||||
private String ollUri;
|
||||
|
||||
@Value("${qwen.api-key}")
|
||||
private static String apiKey;
|
||||
|
||||
@Value("${qwen.model}")
|
||||
private static String model;
|
||||
|
||||
@Resource
|
||||
private EsDocumentService service;
|
||||
|
||||
@PostMapping("/process")
|
||||
public List<SearchResult> process(@RequestBody Map<String, String> dto) throws IOException {
|
||||
String keyword = dto.get("keyword");
|
||||
return service.searchVector(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, keyword));
|
||||
}
|
||||
|
||||
@PostMapping("/generate")
|
||||
public void generate(HttpServletResponse response, @RequestBody Map<String, String> dto) throws IOException, NoApiKeyException, InputRequiredException {
|
||||
String keyword = dto.get("keyword");
|
||||
List<SearchResult> searchResults = service.searchVector(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, keyword));
|
||||
List<String> contents = searchResults.stream().map(searchResult -> searchResult.getKnowsIndex().getContent()).toList();
|
||||
log.info("搜索结果searchResults: {} ", contents);
|
||||
|
||||
Object reRankPassages = "";
|
||||
if (CollUtil.isNotEmpty(searchResults)) {
|
||||
// 重排处理
|
||||
List<String> contentList = new ArrayList<>();
|
||||
searchResults.forEach(searchResult -> contentList.add(searchResult.getKnowsIndex().getContent()));
|
||||
String reRank = ReRankClient.reRank(ReRankUri, ReRankApiKey, contentList, keyword);
|
||||
log.info("重排结果reRank: {} ", reRank);
|
||||
|
||||
JSONObject jsonObject = JSON.parseObject(reRank, JSONObject.class);
|
||||
reRankPassages = jsonObject.get("rerank_passages");
|
||||
}
|
||||
|
||||
// LLM总结回答
|
||||
OllamaClient.sendMsg(response, ollUri, keyword, reRankPassages.toString());
|
||||
}
|
||||
|
||||
@PostMapping("/qwen-generate")
|
||||
public String qwen(@RequestBody Map<String, String> dto) throws IOException, NoApiKeyException, InputRequiredException {
|
||||
String keyword = dto.get("keyword");
|
||||
List<SearchResult> searchResults = service.searchVector(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, keyword));
|
||||
List<String> contents = searchResults.stream().map(searchResult -> searchResult.getKnowsIndex().getContent()).toList();
|
||||
log.info("搜索结果searchResults: {} ", contents);
|
||||
|
||||
Object reRankPassages = "";
|
||||
if (CollUtil.isNotEmpty(searchResults)) {
|
||||
// 重排处理
|
||||
List<String> contentList = new ArrayList<>();
|
||||
searchResults.forEach(searchResult -> contentList.add(searchResult.getKnowsIndex().getContent()));
|
||||
String reRank = ReRankClient.reRank(ReRankUri, ReRankApiKey, contentList, keyword);
|
||||
log.info("重排结果reRank: {} ", reRank);
|
||||
|
||||
JSONObject jsonObject = JSON.parseObject(reRank, JSONObject.class);
|
||||
reRankPassages = jsonObject.get("rerank_passages");
|
||||
}
|
||||
|
||||
// LLM总结回答
|
||||
GenerationResult result = QwenClient.sendMsg(model, apiKey, keyword, reRankPassages.toString());
|
||||
String content = result.getOutput().getChoices().get(0).getMessage().getContent();
|
||||
log.info("千问: {}", content);
|
||||
return content;
|
||||
}
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
package cn.luckday.controller;
|
||||
|
||||
import cn.luckday.service.RedFileService;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import java.util.*;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/file")
|
||||
public class RedFileController {
|
||||
|
||||
@Autowired
|
||||
private RedFileService redFileService;
|
||||
|
||||
@PostMapping("/upload")
|
||||
public ResponseEntity<?> uploadFile(@RequestParam("file") MultipartFile file) {
|
||||
redFileService.uploadFile(file);
|
||||
return ResponseEntity.ok(Map.of("message", "文件上传并解析成功"));
|
||||
}
|
||||
}
|
60
knows-java/src/main/java/cn/luckday/document/Main.java
Normal file
60
knows-java/src/main/java/cn/luckday/document/Main.java
Normal file
@ -0,0 +1,60 @@
|
||||
package cn.luckday.document;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
// 验证文件是否存在
|
||||
String pdfPath = "D:\\小红书文档\\中频\\运营经验库\\方法论\\PDF\\评论区和私信的互动指引的方法论.pdf";
|
||||
File pdfFile = new File(pdfPath);
|
||||
if (!pdfFile.exists()) {
|
||||
System.err.println("PDF文件不存在: " + pdfPath);
|
||||
return;
|
||||
}
|
||||
|
||||
// 初始化PDFParser时添加错误处理
|
||||
PDFParser parser = new PDFParser(pdfPath);
|
||||
try {
|
||||
parser.parse();
|
||||
} catch (Exception e) {
|
||||
System.err.println("PDF解析失败: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
// 获取结果
|
||||
List<String> texts = parser.getExtractedText();
|
||||
List<BufferedImage> images = parser.getExtractedImages();
|
||||
List<Table> tables = parser.getExtractedTables();
|
||||
|
||||
// // 处理Word文档
|
||||
// String wordPath = "D:\\小红书文档\\高频\\平台知识库\\已处理word\\新模式开票流程及注意事项.docx";
|
||||
// WordProcessor wordProcessor = new WordProcessor(wordPath);
|
||||
// wordProcessor.process();
|
||||
//
|
||||
// // 获取提取的文本
|
||||
// List<String> textContent = wordProcessor.getExtractedText();
|
||||
// for (String text : textContent) {
|
||||
// System.out.println(text);
|
||||
// }
|
||||
//
|
||||
// // 处理表格
|
||||
// List<XWPFTable> tables = wordProcessor.getExtractedTables();
|
||||
// for (XWPFTable table : tables) {
|
||||
// List<List<String>> tableData = wordProcessor.convertTableToList(table);
|
||||
// System.out.println("表格数据:" + tableData);
|
||||
//
|
||||
// // 导出表格为CSV
|
||||
// wordProcessor.exportTableToCSV(table, "table_output.csv");
|
||||
// }
|
||||
//
|
||||
// // 保存图片
|
||||
// wordProcessor.saveImages("output_images");
|
||||
} catch (Exception e) {
|
||||
System.err.println("程序执行出错: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
180
knows-java/src/main/java/cn/luckday/document/OCRProcessor.java
Normal file
180
knows-java/src/main/java/cn/luckday/document/OCRProcessor.java
Normal file
@ -0,0 +1,180 @@
|
||||
package cn.luckday.document;
|
||||
|
||||
import net.sourceforge.tess4j.Tesseract;
|
||||
import org.opencv.core.CvType;
|
||||
import org.opencv.core.Mat;
|
||||
import org.opencv.core.Size;
|
||||
import org.opencv.imgproc.Imgproc;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.awt.image.DataBufferByte;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
public class OCRProcessor {
|
||||
static {
|
||||
try {
|
||||
// 从资源目录加载本地库
|
||||
String libraryPath = OCRProcessor.class
|
||||
.getClassLoader()
|
||||
.getResource("native/" + System.mapLibraryName("opencv_java4110"))
|
||||
.getPath();
|
||||
|
||||
System.load(libraryPath);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private final Tesseract tesseract;
|
||||
|
||||
public OCRProcessor() {
|
||||
tesseract = new Tesseract();
|
||||
initializeTesseract();
|
||||
}
|
||||
|
||||
private void initializeTesseract() {
|
||||
try {
|
||||
// 设置Tesseract数据路径
|
||||
String tessdataPath = System.getenv("TESSDATA_PREFIX");
|
||||
if (tessdataPath == null || tessdataPath.isEmpty()) {
|
||||
tessdataPath ="D:\\study\\backend\\embeddingstoes-master\\src\\main\\resources\\ocr";
|
||||
}
|
||||
|
||||
tesseract.setDatapath(tessdataPath);
|
||||
|
||||
// 修改:使用不依赖OSD的页面分割模式
|
||||
tesseract.setPageSegMode(3);
|
||||
|
||||
// 设置语言包
|
||||
tesseract.setLanguage("chi_sim");
|
||||
|
||||
// 性能优化配置
|
||||
tesseract.setTessVariable("tessedit_create_pdf", "0");
|
||||
tesseract.setTessVariable("tessedit_create_hocr", "0");
|
||||
tesseract.setTessVariable("tessedit_write_images", "0");
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Tesseract 初始化失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public String performOCR(BufferedImage image) {
|
||||
try {
|
||||
// 基本图像验证
|
||||
if (image == null || image.getWidth() < 10 || image.getHeight() < 10) {
|
||||
throw new IllegalArgumentException("无效的图像");
|
||||
}
|
||||
|
||||
// 预处理图像
|
||||
BufferedImage processedImage = preprocessImage(image);
|
||||
|
||||
// 执行OCR
|
||||
return tesseract.doOCR(processedImage);
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println("OCR处理失败: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private BufferedImage preprocessImage(BufferedImage image) {
|
||||
try {
|
||||
Mat mat = bufferedImageToMat(image);
|
||||
|
||||
// 调整预处理步骤
|
||||
// 1. 转换为灰度图
|
||||
Mat gray = new Mat();
|
||||
Imgproc.cvtColor(mat, gray, Imgproc.COLOR_BGR2GRAY);
|
||||
|
||||
// 2. 使用OTSU二值化替代自适应阈值
|
||||
Mat binary = new Mat();
|
||||
Imgproc.threshold(gray, binary, 0, 255, Imgproc.THRESH_BINARY + Imgproc.THRESH_OTSU);
|
||||
|
||||
// 3. 添加形态学操作
|
||||
Mat kernel = Imgproc.getStructuringElement(Imgproc.MORPH_RECT, new Size(3, 3));
|
||||
Mat processed = new Mat();
|
||||
Imgproc.morphologyEx(binary, processed, Imgproc.MORPH_CLOSE, kernel);
|
||||
|
||||
// 4. 边缘增强
|
||||
Mat enhanced = new Mat();
|
||||
Imgproc.GaussianBlur(processed, enhanced, new Size(3, 3), 0);
|
||||
|
||||
return matToBufferedImage(enhanced);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return image;
|
||||
}
|
||||
}
|
||||
|
||||
private Mat bufferedImageToMat(BufferedImage image) {
|
||||
// 转换图像类型为 TYPE_3BYTE_BGR,如果需要的话
|
||||
BufferedImage convertedImage = image;
|
||||
if (image.getType() != BufferedImage.TYPE_3BYTE_BGR) {
|
||||
convertedImage = new BufferedImage(
|
||||
image.getWidth(),
|
||||
image.getHeight(),
|
||||
BufferedImage.TYPE_3BYTE_BGR);
|
||||
convertedImage.getGraphics().drawImage(image, 0, 0, null);
|
||||
}
|
||||
|
||||
// 获取图像数据
|
||||
byte[] pixels = ((DataBufferByte) convertedImage.getRaster().getDataBuffer()).getData();
|
||||
|
||||
// 创建Mat对象
|
||||
Mat mat = new Mat(
|
||||
convertedImage.getHeight(),
|
||||
convertedImage.getWidth(),
|
||||
CvType.CV_8UC3);
|
||||
mat.put(0, 0, pixels);
|
||||
|
||||
return mat;
|
||||
}
|
||||
|
||||
private BufferedImage matToBufferedImage(Mat mat) {
|
||||
// 确保mat是8位3通道或单通道
|
||||
int type = BufferedImage.TYPE_3BYTE_BGR;
|
||||
if (mat.channels() == 1) {
|
||||
type = BufferedImage.TYPE_BYTE_GRAY;
|
||||
}
|
||||
|
||||
// 获取mat的数据
|
||||
byte[] pixels = new byte[mat.channels() * mat.cols() * mat.rows()];
|
||||
mat.get(0, 0, pixels);
|
||||
|
||||
// 创建BufferedImage
|
||||
BufferedImage image = new BufferedImage(
|
||||
mat.cols(),
|
||||
mat.rows(),
|
||||
type);
|
||||
|
||||
// 设置图像数据
|
||||
byte[] targetPixels = ((DataBufferByte) image.getRaster().getDataBuffer()).getData();
|
||||
System.arraycopy(pixels, 0, targetPixels, 0, pixels.length);
|
||||
|
||||
return image;
|
||||
}
|
||||
|
||||
public void processPDF(String pdfPath) {
|
||||
try {
|
||||
// 添加内存使用监控
|
||||
Runtime runtime = Runtime.getRuntime();
|
||||
long maxMemory = runtime.maxMemory() / (1024 * 1024);
|
||||
System.out.println("最大可用内存: " + maxMemory + "MB");
|
||||
|
||||
// 原有PDF处理代码
|
||||
PDDocument document = PDDocument.load(new File(pdfPath));
|
||||
// ... existing code ...
|
||||
|
||||
// 确保资源释放
|
||||
document.close();
|
||||
} catch (OutOfMemoryError e) {
|
||||
System.err.println("内存不足: " + e.getMessage());
|
||||
// TODO 日志记录
|
||||
} catch (Exception e) {
|
||||
System.err.println("处理PDF时发生错误: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
137
knows-java/src/main/java/cn/luckday/document/PDFParser.java
Normal file
137
knows-java/src/main/java/cn/luckday/document/PDFParser.java
Normal file
@ -0,0 +1,137 @@
|
||||
package cn.luckday.document;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class PDFParser {
|
||||
private final String pdfPath;
|
||||
private PDDocument document;
|
||||
private final OCRProcessor ocrProcessor;
|
||||
private List<String> extractedText;
|
||||
private List<BufferedImage> extractedImages;
|
||||
private List<Table> extractedTables;
|
||||
|
||||
public PDFParser(String pdfPath) {
|
||||
this.pdfPath = pdfPath;
|
||||
this.ocrProcessor = new OCRProcessor();
|
||||
this.extractedText = new ArrayList<>();
|
||||
this.extractedImages = new ArrayList<>();
|
||||
this.extractedTables = new ArrayList<>();
|
||||
}
|
||||
|
||||
public void parse() {
|
||||
try {
|
||||
document = PDDocument.load(new File(pdfPath));
|
||||
|
||||
// 1. 解析文本内容
|
||||
System.out.println("=== 开始解析文本 ===");
|
||||
extractText();
|
||||
|
||||
// 2. 解析图片
|
||||
System.out.println("\n=== 开始解析图片 ===");
|
||||
extractImages();
|
||||
|
||||
// 3. 解析表格
|
||||
// System.out.println("\n=== 开始解析表格 ===");
|
||||
// extractTables();
|
||||
|
||||
document.close();
|
||||
} catch (Exception e) {
|
||||
System.err.println("PDF解析失败: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
if (document != null) {
|
||||
try {
|
||||
document.close();
|
||||
} catch (IOException ignored) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void extractText() throws IOException {
|
||||
System.out.println("正在提取PDF文本...");
|
||||
|
||||
// 只使用PDFTextStripper提取文本
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
String text = stripper.getText(document);
|
||||
System.out.println("文本内容:\n" + text);
|
||||
extractedText.add(text);
|
||||
}
|
||||
|
||||
private void extractImages() throws IOException {
|
||||
System.out.println("正在提取并处理PDF图片...");
|
||||
int imageCounter = 0;
|
||||
|
||||
for (PDPage page : document.getPages()) {
|
||||
for (COSName name : page.getResources().getXObjectNames()) {
|
||||
PDXObject object = page.getResources().getXObject(name);
|
||||
if (object instanceof PDImageXObject) {
|
||||
PDImageXObject image = (PDImageXObject) object;
|
||||
BufferedImage bImage = image.getImage();
|
||||
|
||||
// 保存图片
|
||||
String imagePath = "output_images/extracted_image_" + imageCounter + ".png";
|
||||
ImageIO.write(bImage, "PNG", new File(imagePath));
|
||||
System.out.println("已保存图片: " + imagePath);
|
||||
|
||||
// OCR处理图片
|
||||
try {
|
||||
System.out.println("正在对图片 " + imageCounter + " 进行OCR处理...");
|
||||
String imageText = ocrProcessor.performOCR(bImage);
|
||||
if (!imageText.trim().isEmpty()) {
|
||||
System.out.println("图片 " + imageCounter + " OCR结果:\n" + imageText);
|
||||
extractedText.add("【图片" + imageCounter + "文本】\n" + imageText);
|
||||
} else {
|
||||
System.out.println("图片 " + imageCounter + " 未识别出文本");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("处理图片 " + imageCounter + " 时出错: " + e.getMessage());
|
||||
}
|
||||
|
||||
extractedImages.add(bImage);
|
||||
imageCounter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("共处理 " + imageCounter + " 张图片");
|
||||
}
|
||||
|
||||
private void extractTables() {
|
||||
System.out.println("正在提取PDF表格...");
|
||||
TableDetector detector = new TableDetector(document);
|
||||
extractedTables = detector.detectTables();
|
||||
|
||||
if (extractedTables.isEmpty()) {
|
||||
System.out.println("未检测到表格");
|
||||
} else {
|
||||
System.out.println("共检测到 " + extractedTables.size() + " 个表格");
|
||||
for (int i = 0; i < extractedTables.size(); i++) {
|
||||
System.out.println("表格 " + (i + 1) + ":\n" + extractedTables.get(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Getter方法
|
||||
public List<String> getExtractedText() {
|
||||
return extractedText;
|
||||
}
|
||||
|
||||
public List<BufferedImage> getExtractedImages() {
|
||||
return extractedImages;
|
||||
}
|
||||
|
||||
public List<Table> getExtractedTables() {
|
||||
return extractedTables;
|
||||
}
|
||||
}
|
43
knows-java/src/main/java/cn/luckday/document/Table.java
Normal file
43
knows-java/src/main/java/cn/luckday/document/Table.java
Normal file
@ -0,0 +1,43 @@
|
||||
package cn.luckday.document;
|
||||
|
||||
/**
 * Plain-text table whose row and column counts are derived from its content.
 *
 * <p>Rows are the {@code \n}-separated lines of the content; the column count is the largest
 * number of whitespace-separated cells found on any line.
 */
public class Table {
    private final String content;
    private int rows;
    private int columns;

    /**
     * @param content the table text; {@code null} or empty yields zero rows and zero columns
     */
    public Table(String content) {
        this.content = content;
        analyzeStructure();
    }

    /** Computes {@link #rows} and {@link #columns} from the content. */
    private void analyzeStructure() {
        if (content == null || content.isEmpty()) {
            return;
        }

        // Split the content into lines.
        String[] lines = content.split("\n");
        rows = lines.length;

        // Column count: cells are separated by runs of spaces or tabs.
        columns = 0;
        for (String line : lines) {
            String trimmed = line.trim();
            if (trimmed.isEmpty()) {
                // "".split("\\s+") returns [""], which would count a blank line as one
                // cell; a blank line contributes no cells.
                continue;
            }
            columns = Math.max(columns, trimmed.split("\\s+").length);
        }
    }

    /** @return the number of lines in the content */
    public int getRows() {
        return rows;
    }

    /** @return the largest number of cells on any line */
    public int getColumns() {
        return columns;
    }

    @Override
    public String toString() {
        return String.format("Table{rows=%d, columns=%d, content='%s'}",
                rows, columns, content);
    }
}
|
170
knows-java/src/main/java/cn/luckday/document/TableDetector.java
Normal file
170
knows-java/src/main/java/cn/luckday/document/TableDetector.java
Normal file
@ -0,0 +1,170 @@
|
||||
package cn.luckday.document;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class TableDetector {
|
||||
private final PDDocument document;
|
||||
|
||||
public TableDetector(PDDocument document) {
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
public List<Table> detectTables() {
|
||||
List<Table> tables = new ArrayList<>();
|
||||
try {
|
||||
for (PDPage page : document.getPages()) {
|
||||
// 使用文本位置分析来检测表格
|
||||
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
||||
stripper.setSortByPosition(true);
|
||||
|
||||
// 检测表格边界
|
||||
List<Rectangle> tableRegions = detectTableRegions(page);
|
||||
|
||||
for (Rectangle region : tableRegions) {
|
||||
stripper.addRegion("table", region);
|
||||
stripper.extractRegions(page);
|
||||
String tableContent = stripper.getTextForRegion("table");
|
||||
tables.add(new Table(tableContent));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return tables;
|
||||
}
|
||||
|
||||
private List<Rectangle> detectTableRegions(PDPage page) {
|
||||
List<Rectangle> regions = new ArrayList<>();
|
||||
try {
|
||||
// 获取页面尺寸
|
||||
float pageHeight = page.getMediaBox().getHeight();
|
||||
float pageWidth = page.getMediaBox().getWidth();
|
||||
|
||||
// 使用PDFTextStripperByArea进行文本分析
|
||||
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
||||
stripper.setSortByPosition(true);
|
||||
|
||||
// 将页面划分为网格进行分析
|
||||
int gridRows = 20;
|
||||
int gridCols = 20;
|
||||
float cellHeight = pageHeight / gridRows;
|
||||
float cellWidth = pageWidth / gridCols;
|
||||
|
||||
// 存储每个网格单元的文本密度
|
||||
int[][] textDensity = new int[gridRows][gridCols];
|
||||
|
||||
// 分析每个网格单元
|
||||
for (int row = 0; row < gridRows; row++) {
|
||||
for (int col = 0; col < gridCols; col++) {
|
||||
Rectangle cell = new Rectangle(
|
||||
(int) (col * cellWidth),
|
||||
(int) (row * cellHeight),
|
||||
(int) cellWidth,
|
||||
(int) cellHeight);
|
||||
|
||||
stripper.addRegion("cell_" + row + "_" + col, cell);
|
||||
stripper.extractRegions(page);
|
||||
String cellText = stripper.getTextForRegion("cell_" + row + "_" + col);
|
||||
|
||||
// 计算文本密度
|
||||
textDensity[row][col] = cellText.trim().length();
|
||||
}
|
||||
}
|
||||
|
||||
// 检测表格区域
|
||||
List<TableRegion> potentialTables = findPotentialTables(textDensity, gridRows, gridCols);
|
||||
|
||||
// 转换检测到的区域为实际坐标
|
||||
for (TableRegion tableRegion : potentialTables) {
|
||||
Rectangle rect = new Rectangle(
|
||||
(int) (tableRegion.startCol * cellWidth),
|
||||
(int) (tableRegion.startRow * cellHeight),
|
||||
(int) ((tableRegion.endCol - tableRegion.startCol + 1) * cellWidth),
|
||||
(int) ((tableRegion.endRow - tableRegion.startRow + 1) * cellHeight));
|
||||
regions.add(rect);
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return regions;
|
||||
}
|
||||
|
||||
private List<TableRegion> findPotentialTables(int[][] textDensity, int rows, int cols) {
|
||||
List<TableRegion> tables = new ArrayList<>();
|
||||
boolean[][] visited = new boolean[rows][cols];
|
||||
|
||||
// 遍历网格寻找潜在的表格区域
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
if (!visited[i][j] && isTableCell(textDensity, i, j)) {
|
||||
TableRegion region = new TableRegion();
|
||||
expandTableRegion(textDensity, visited, i, j, region);
|
||||
if (isValidTable(region)) {
|
||||
tables.add(region);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return tables;
|
||||
}
|
||||
|
||||
private boolean isTableCell(int[][] density, int row, int col) {
|
||||
// 判断是否为表格单元格的条件
|
||||
// 1. 文本密度适中
|
||||
// 2. 周围有类似的文本密度分布
|
||||
int cellDensity = density[row][col];
|
||||
return cellDensity > 0 && cellDensity < 100; // 可调整阈值
|
||||
}
|
||||
|
||||
private void expandTableRegion(int[][] density, boolean[][] visited,
|
||||
int row, int col, TableRegion region) {
|
||||
if (row < 0 || row >= density.length ||
|
||||
col < 0 || col >= density[0].length ||
|
||||
visited[row][col] ||
|
||||
!isTableCell(density, row, col)) {
|
||||
return;
|
||||
}
|
||||
|
||||
visited[row][col] = true;
|
||||
|
||||
// 更新表格区域的边界
|
||||
region.updateBounds(row, col);
|
||||
|
||||
// 递归检查相邻单元格
|
||||
expandTableRegion(density, visited, row - 1, col, region); // 上
|
||||
expandTableRegion(density, visited, row + 1, col, region); // 下
|
||||
expandTableRegion(density, visited, row, col - 1, region); // 左
|
||||
expandTableRegion(density, visited, row, col + 1, region); // 右
|
||||
}
|
||||
|
||||
private boolean isValidTable(TableRegion region) {
|
||||
// 验证检测到的区域是否可能是表格
|
||||
int width = region.endCol - region.startCol + 1;
|
||||
int height = region.endRow - region.startRow + 1;
|
||||
|
||||
// 表格至少应该有2x2的大小
|
||||
return width >= 2 && height >= 2;
|
||||
}
|
||||
|
||||
// 表格区域数据结构
|
||||
private static class TableRegion {
|
||||
int startRow = Integer.MAX_VALUE;
|
||||
int startCol = Integer.MAX_VALUE;
|
||||
int endRow = Integer.MIN_VALUE;
|
||||
int endCol = Integer.MIN_VALUE;
|
||||
|
||||
void updateBounds(int row, int col) {
|
||||
startRow = Math.min(startRow, row);
|
||||
startCol = Math.min(startCol, col);
|
||||
endRow = Math.max(endRow, row);
|
||||
endCol = Math.max(endCol, col);
|
||||
}
|
||||
}
|
||||
}
|
287
knows-java/src/main/java/cn/luckday/document/WordProcessor.java
Normal file
287
knows-java/src/main/java/cn/luckday/document/WordProcessor.java
Normal file
@ -0,0 +1,287 @@
|
||||
package cn.luckday.document;
|
||||
|
||||
import org.apache.poi.xwpf.usermodel.*;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.hwpf.usermodel.Table;
|
||||
import org.apache.poi.hwpf.usermodel.TableRow;
|
||||
import org.apache.poi.hwpf.usermodel.TableCell;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.poi.common.usermodel.PictureType;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class WordProcessor {
|
||||
private final String filePath;
|
||||
private List<String> extractedText;
|
||||
private List<XWPFTable> extractedTables;
|
||||
private List<XWPFPicture> extractedImages;
|
||||
|
||||
public WordProcessor(String filePath) {
|
||||
this.filePath = filePath;
|
||||
this.extractedText = new ArrayList<>();
|
||||
this.extractedTables = new ArrayList<>();
|
||||
this.extractedImages = new ArrayList<>();
|
||||
}
|
||||
|
||||
public void process() {
|
||||
File file = new File(filePath);
|
||||
if (filePath.endsWith(".docx")) {
|
||||
processDocx(file);
|
||||
} else if (filePath.endsWith(".doc")) {
|
||||
processDoc(file);
|
||||
} else {
|
||||
throw new IllegalArgumentException("不支持的文件格式:" + filePath);
|
||||
}
|
||||
}
|
||||
|
||||
private void processDocx(File file) {
|
||||
try (FileInputStream fis = new FileInputStream(file);
|
||||
XWPFDocument document = new XWPFDocument(fis)) {
|
||||
|
||||
// 提取文本
|
||||
extractTextFromDocx(document);
|
||||
|
||||
// 提取表格
|
||||
extractTablesFromDocx(document);
|
||||
|
||||
// 提取图片
|
||||
extractImagesFromDocx(document);
|
||||
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private void processDoc(File file) {
|
||||
try (FileInputStream fis = new FileInputStream(file);
|
||||
POIFSFileSystem fs = new POIFSFileSystem(fis)) {
|
||||
|
||||
HWPFDocument document = new HWPFDocument(fs);
|
||||
|
||||
// 提取文本
|
||||
Range range = document.getRange();
|
||||
extractTextFromDoc(range);
|
||||
|
||||
// 提取表格
|
||||
extractTablesFromDoc(range);
|
||||
|
||||
// 提取图片(如果可能)
|
||||
extractImagesFromDoc(document);
|
||||
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private void extractTextFromDocx(XWPFDocument document) {
|
||||
// 提取段落文本
|
||||
for (XWPFParagraph paragraph : document.getParagraphs()) {
|
||||
String text = paragraph.getText().trim();
|
||||
if (!text.isEmpty()) {
|
||||
extractedText.add(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void extractTablesFromDocx(XWPFDocument document) {
|
||||
// 提取表格
|
||||
for (XWPFTable table : document.getTables()) {
|
||||
extractedTables.add(table);
|
||||
|
||||
// 处理表格内容
|
||||
for (XWPFTableRow row : table.getRows()) {
|
||||
StringBuilder rowContent = new StringBuilder();
|
||||
for (XWPFTableCell cell : row.getTableCells()) {
|
||||
rowContent.append(cell.getText()).append("\t");
|
||||
}
|
||||
extractedText.add("表格行:" + rowContent.toString().trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void extractImagesFromDocx(XWPFDocument document) {
|
||||
// 提取图片
|
||||
for (XWPFParagraph paragraph : document.getParagraphs()) {
|
||||
for (XWPFRun run : paragraph.getRuns()) {
|
||||
List<XWPFPicture> pictures = run.getEmbeddedPictures();
|
||||
extractedImages.addAll(pictures);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void extractTextFromDoc(Range range) {
|
||||
String text = range.text();
|
||||
// 按段落分割
|
||||
String[] paragraphs = text.split("\\r?\\n");
|
||||
for (String paragraph : paragraphs) {
|
||||
if (!paragraph.trim().isEmpty()) {
|
||||
extractedText.add(paragraph.trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void extractTablesFromDoc(Range range) {
|
||||
for (int i = 0; i < range.numParagraphs(); i++) {
|
||||
if (range.getParagraph(i).isInTable()) {
|
||||
Table table = range.getTable(range.getParagraph(i));
|
||||
processDocTable(table);
|
||||
// 跳过表格中的其他段落
|
||||
i += table.numParagraphs() - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void processDocTable(Table table) {
|
||||
List<List<String>> tableData = new ArrayList<>();
|
||||
for (int rowIdx = 0; rowIdx < table.numRows(); rowIdx++) {
|
||||
TableRow row = table.getRow(rowIdx);
|
||||
List<String> rowData = new ArrayList<>();
|
||||
|
||||
for (int colIdx = 0; colIdx < row.numCells(); colIdx++) {
|
||||
TableCell cell = row.getCell(colIdx);
|
||||
String cellText = cell.text().trim();
|
||||
if (cellText.endsWith("\u0007")) {
|
||||
cellText = cellText.substring(0, cellText.length() - 1);
|
||||
}
|
||||
rowData.add(cellText);
|
||||
}
|
||||
|
||||
tableData.add(rowData);
|
||||
extractedText.add("表格行:" + String.join("\t", rowData));
|
||||
}
|
||||
}
|
||||
|
||||
private void extractImagesFromDoc(HWPFDocument document) {
|
||||
// 注意:HWPF对图片的支持有限
|
||||
try {
|
||||
List<org.apache.poi.hwpf.usermodel.Picture> pictures = document.getPicturesTable().getAllPictures();
|
||||
File outputDir = new File("output_images");
|
||||
if (!outputDir.exists()) {
|
||||
outputDir.mkdirs();
|
||||
}
|
||||
|
||||
int imageCounter = 0;
|
||||
for (org.apache.poi.hwpf.usermodel.Picture picture : pictures) {
|
||||
String extension = picture.suggestFileExtension();
|
||||
String filename = String.format("doc_image_%d.%s", imageCounter++, extension);
|
||||
Path outputPath = Paths.get(outputDir.getPath(), filename);
|
||||
|
||||
// 保存图片数据
|
||||
Files.write(outputPath, picture.getContent());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println("警告:提取.doc文件中的图片时出错:" + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public void saveImages(String outputDir) {
|
||||
try {
|
||||
File dir = new File(outputDir);
|
||||
if (!dir.exists()) {
|
||||
dir.mkdirs();
|
||||
}
|
||||
|
||||
int imageCounter = 0;
|
||||
for (XWPFPicture picture : extractedImages) {
|
||||
// 获取图片数据
|
||||
byte[] pictureData = picture.getPictureData().getData();
|
||||
|
||||
// 确定图片扩展名
|
||||
String extension = getImageExtension(picture.getPictureData().getPictureType());
|
||||
String filename = String.format("image_%d.%s", imageCounter++, extension);
|
||||
|
||||
// 保存图片
|
||||
Path outputPath = Paths.get(dir.getPath(), filename);
|
||||
Files.write(outputPath, pictureData);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private String getImageExtension(int pictureType) {
|
||||
// 使用PictureType的常量来处理图片类型
|
||||
if (pictureType == PictureType.PNG.getOoxmlId()) {
|
||||
return "png";
|
||||
} else if (pictureType == PictureType.JPEG.getOoxmlId()) {
|
||||
return "jpg";
|
||||
} else if (pictureType == PictureType.GIF.getOoxmlId()) {
|
||||
return "gif";
|
||||
} else if (pictureType == PictureType.TIFF.getOoxmlId()) {
|
||||
return "tiff";
|
||||
} else if (pictureType == PictureType.BMP.getOoxmlId()) {
|
||||
return "bmp";
|
||||
} else if (pictureType == PictureType.EMF.getOoxmlId()) {
|
||||
return "emf";
|
||||
} else if (pictureType == PictureType.WMF.getOoxmlId()) {
|
||||
return "wmf";
|
||||
} else if (pictureType == PictureType.PICT.getOoxmlId()) {
|
||||
return "pict";
|
||||
} else if (pictureType == PictureType.DIB.getOoxmlId()) {
|
||||
return "dib";
|
||||
} else {
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getExtractedText() {
|
||||
return extractedText;
|
||||
}
|
||||
|
||||
public List<XWPFTable> getExtractedTables() {
|
||||
return extractedTables;
|
||||
}
|
||||
|
||||
public List<XWPFPicture> getExtractedImages() {
|
||||
return extractedImages;
|
||||
}
|
||||
|
||||
// 将表格转换为结构化数据
|
||||
public List<List<String>> convertTableToList(XWPFTable table) {
|
||||
List<List<String>> tableData = new ArrayList<>();
|
||||
|
||||
for (XWPFTableRow row : table.getRows()) {
|
||||
List<String> rowData = new ArrayList<>();
|
||||
for (XWPFTableCell cell : row.getTableCells()) {
|
||||
rowData.add(cell.getText().trim());
|
||||
}
|
||||
tableData.add(rowData);
|
||||
}
|
||||
|
||||
return tableData;
|
||||
}
|
||||
|
||||
// 导出表格为CSV格式
|
||||
public void exportTableToCSV(XWPFTable table, String outputPath) {
|
||||
try {
|
||||
StringBuilder csv = new StringBuilder();
|
||||
|
||||
for (XWPFTableRow row : table.getRows()) {
|
||||
List<String> rowData = new ArrayList<>();
|
||||
for (XWPFTableCell cell : row.getTableCells()) {
|
||||
// 处理CSV中的特殊字符
|
||||
String cellText = cell.getText().trim()
|
||||
.replace("\"", "\"\"")
|
||||
.replace(",", "\",\"");
|
||||
rowData.add("\"" + cellText + "\"");
|
||||
}
|
||||
csv.append(String.join(",", rowData)).append("\n");
|
||||
}
|
||||
|
||||
java.nio.file.Files.write(
|
||||
new File(outputPath).toPath(),
|
||||
csv.toString().getBytes());
|
||||
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
83
knows-java/src/main/java/cn/luckday/embed/EmbedClient.java
Normal file
83
knows-java/src/main/java/cn/luckday/embed/EmbedClient.java
Normal file
@ -0,0 +1,83 @@
|
||||
package cn.luckday.embed;
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import okhttp3.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
public class EmbedClient {
|
||||
|
||||
public static double[] getEmbedding(String uri, String apiKey, String inputText) throws IOException {
|
||||
OkHttpClient client = new OkHttpClient();
|
||||
|
||||
// 创建请求体
|
||||
JSONObject requestBody = new JSONObject();
|
||||
requestBody.put("input", Collections.singletonList(inputText));
|
||||
|
||||
// 创建请求
|
||||
MediaType mediaType = MediaType.parse("application/json; charset=utf-8");
|
||||
RequestBody body = RequestBody.Companion.create(requestBody.toJSONString(), mediaType);
|
||||
Request request = new Request.Builder()
|
||||
.url(uri)
|
||||
.addHeader("Authorization", "Bearer " + apiKey)
|
||||
.addHeader("Content-Type", "application/json")
|
||||
.post(body)
|
||||
.build();
|
||||
|
||||
// 发送请求
|
||||
Response response = client.newCall(request).execute();
|
||||
if (!response.isSuccessful()) {
|
||||
throw new IOException("Unexpected code " + response);
|
||||
}
|
||||
|
||||
// 解析JSON响应
|
||||
String responseBody = response.body().string();
|
||||
EmbeddingResponse embeddingResponse = JSON.parseObject(responseBody, EmbeddingResponse.class);
|
||||
|
||||
// 返回嵌入向量
|
||||
return embeddingResponse.getData().get(0).getEmbedding();
|
||||
}
|
||||
|
||||
static class EmbeddingResponse {
|
||||
private List<Data> data;
|
||||
|
||||
public List<Data> getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public void setData(List<Data> data) {
|
||||
this.data = data;
|
||||
}
|
||||
}
|
||||
|
||||
static class Data {
|
||||
private double[] embedding;
|
||||
private int index;
|
||||
private String object;
|
||||
|
||||
public double[] getEmbedding() {
|
||||
return embedding;
|
||||
}
|
||||
|
||||
public void setEmbedding(double[] embedding) {
|
||||
this.embedding = embedding;
|
||||
}
|
||||
|
||||
public int getIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
public void setIndex(int index) {
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
public String getObject() {
|
||||
return object;
|
||||
}
|
||||
|
||||
public void setObject(String object) {
|
||||
this.object = object;
|
||||
}
|
||||
}
|
||||
}
|
34
knows-java/src/main/java/cn/luckday/embed/ReRankClient.java
Normal file
34
knows-java/src/main/java/cn/luckday/embed/ReRankClient.java
Normal file
@ -0,0 +1,34 @@
|
||||
package cn.luckday.embed;
|
||||
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import okhttp3.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class ReRankClient {
|
||||
|
||||
public static String reRank(String uri, String apiKey, List<String> textsList, String query) throws IOException {
|
||||
OkHttpClient client = new OkHttpClient();
|
||||
JSONObject requestBody = new JSONObject();
|
||||
String[] texts = textsList.toArray(new String[0]);
|
||||
requestBody.put("textList", texts);
|
||||
requestBody.put("query", query);
|
||||
// 创建请求
|
||||
MediaType mediaType = MediaType.parse("application/json; charset=utf-8");
|
||||
RequestBody body = RequestBody.Companion.create(requestBody.toJSONString(), mediaType);
|
||||
Request request = new Request.Builder()
|
||||
.url(uri)
|
||||
.addHeader("Authorization", "Bearer " + apiKey)
|
||||
.addHeader("Content-Type", "application/json")
|
||||
.post(body)
|
||||
.build();
|
||||
|
||||
// 发送请求
|
||||
Response response = client.newCall(request).execute();
|
||||
if (!response.isSuccessful()) {
|
||||
throw new IOException("Unexpected code " + response);
|
||||
}
|
||||
return response.body().string();
|
||||
}
|
||||
}
|
@ -0,0 +1,46 @@
|
||||
package cn.luckday.filter;
|
||||
|
||||
import jakarta.servlet.*;
|
||||
import jakarta.servlet.annotation.WebFilter;
|
||||
import jakarta.servlet.http.HttpServletRequest;
|
||||
import jakarta.servlet.http.HttpServletResponse;
|
||||
import org.springframework.core.annotation.Order;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@Component
|
||||
@WebFilter(urlPatterns = "/*", asyncSupported = true)
|
||||
@Order(1)
|
||||
public class AccessControlFilter implements Filter {
|
||||
|
||||
@Override
|
||||
public void init(FilterConfig filterConfig) throws ServletException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException {
|
||||
HttpServletRequest httpServletRequest = (HttpServletRequest) request;
|
||||
HttpServletResponse httpServletResponse = (HttpServletResponse) response;
|
||||
|
||||
// 获取源站
|
||||
String origin = httpServletRequest.getHeader("origin");
|
||||
httpServletResponse.setHeader("Access-Control-Allow-Origin", "*");
|
||||
httpServletResponse.setHeader("Access-Control-Allow-Headers", "Content-Type,Content-Length, Authorization, Accept,X-Requested-With,cors, content-type, luck-token, userId, user, type");
|
||||
httpServletResponse.setHeader("Access-Control-Allow-Credentials", "true");
|
||||
httpServletResponse.setHeader("Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,PATCH,OPTIONS");
|
||||
httpServletResponse.setHeader("Access-Control-Max-Age", "3600");
|
||||
|
||||
if ("OPTIONS".equals(httpServletRequest.getMethod())) {
|
||||
httpServletResponse.setStatus(HttpServletResponse.SC_OK);
|
||||
} else {
|
||||
chain.doFilter(request, response);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void destroy() {
|
||||
|
||||
}
|
||||
|
||||
}
|
85
knows-java/src/main/java/cn/luckday/llm/OllamaClient.java
Normal file
85
knows-java/src/main/java/cn/luckday/llm/OllamaClient.java
Normal file
@ -0,0 +1,85 @@
|
||||
package cn.luckday.llm;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import jakarta.servlet.http.HttpServletResponse;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class OllamaClient {
|
||||
|
||||
private static Map<String, Object> PARAMS = new HashMap<>();
|
||||
private static Map<String, Object> OPTIONS = new HashMap<>();
|
||||
|
||||
static {
|
||||
OPTIONS.put("temperature", 0.3); // # 控制随机性(0-1,值越大越随机)
|
||||
OPTIONS.put("top_p", 0.5); // # 采样策略(0-1,值越小越集中)
|
||||
OPTIONS.put("max_tokens", 1024); // # 生成的最大 token 数
|
||||
|
||||
PARAMS.put("model", "deepseek-r1:32b");
|
||||
PARAMS.put("stream", true);
|
||||
PARAMS.put("options", OPTIONS);
|
||||
}
|
||||
|
||||
public static String PROMPT = "你是一个知识库,必须严格按照知识库检索的内容做最精简的回答,只回答关键信息,坚决杜绝胡编乱造,注意字数。" +
|
||||
"当所有知识库内容都与产品问题无关时,或者知识库检索到任何相关信息时,你的回答必须是“没有找到”这句话。" +
|
||||
" 以下是知识库:\n" +
|
||||
" { %content% }\n" +
|
||||
" 以上是知识库。 \n 以下是提问:";
|
||||
|
||||
public static void sendMsg(HttpServletResponse response, String uri, String query, String content) {
|
||||
try {
|
||||
// 设置SSE必要的响应头
|
||||
response.setContentType("text/event-stream");
|
||||
response.setCharacterEncoding("UTF-8");
|
||||
response.setHeader("Cache-Control", "no-cache");
|
||||
response.setHeader("Connection", "keep-alive");
|
||||
|
||||
URL url = new URL(uri);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestMethod("POST");
|
||||
conn.setRequestProperty("Accept", "text/event-stream");
|
||||
conn.setRequestProperty("Content-Type", "application/json");
|
||||
conn.setDoOutput(true);
|
||||
|
||||
PARAMS.put("prompt", PROMPT.replace("%content%", content) + query);
|
||||
String json = JSON.toJSONString(PARAMS);
|
||||
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
os.write(json.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
|
||||
if (responseCode >= HttpURLConnection.HTTP_OK && responseCode < HttpURLConnection.HTTP_USE_PROXY) {
|
||||
try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8));
|
||||
PrintWriter writer = response.getWriter()) {
|
||||
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (!line.trim().isEmpty()) {
|
||||
// 构造SSE消息格式
|
||||
writer.write("data: " + line + "\n\n");
|
||||
writer.flush();
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("Failed : HTTP error code : " + responseCode);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
|
||||
PrintWriter writer = response.getWriter();
|
||||
writer.write("data: {\"error\": \"" + e.getMessage() + "\"}\n\n");
|
||||
writer.flush();
|
||||
} catch (IOException ioe) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
45
knows-java/src/main/java/cn/luckday/llm/QwenClient.java
Normal file
45
knows-java/src/main/java/cn/luckday/llm/QwenClient.java
Normal file
@ -0,0 +1,45 @@
|
||||
package cn.luckday.llm;
|
||||
|
||||
import java.util.Arrays;
|
||||
import com.alibaba.dashscope.aigc.generation.Generation;
|
||||
import com.alibaba.dashscope.aigc.generation.GenerationParam;
|
||||
import com.alibaba.dashscope.aigc.generation.GenerationResult;
|
||||
import com.alibaba.dashscope.common.Message;
|
||||
import com.alibaba.dashscope.common.Role;
|
||||
import com.alibaba.dashscope.exception.ApiException;
|
||||
import com.alibaba.dashscope.exception.InputRequiredException;
|
||||
import com.alibaba.dashscope.exception.NoApiKeyException;
|
||||
|
||||
public class QwenClient {
|
||||
|
||||
public static GenerationResult sendMsg(String model, String apiKey, String query, String content) throws ApiException, NoApiKeyException, InputRequiredException {
|
||||
Generation gen = new Generation();
|
||||
|
||||
Message systemMsg = Message.builder()
|
||||
.role(Role.SYSTEM.getValue())
|
||||
.content("你是一个知识库,必须严格按照知识库检索的内容做最精简的回答,只回答关键信息,坚决杜绝胡编乱造,注意数字。" +
|
||||
"当所有知识库内容都与产品问题无关时,或者知识库检索到任何相关信息时,你的回答必须是“没有找到”这句话。" +
|
||||
" 以下是知识库:\n" +
|
||||
" {" + content + "}\n" +
|
||||
" 以上是知识库。")
|
||||
.build();
|
||||
|
||||
Message userMsg = Message.builder()
|
||||
.role(Role.USER.getValue())
|
||||
.content(query)
|
||||
.build();
|
||||
|
||||
GenerationParam param = GenerationParam.builder()
|
||||
.model(model)
|
||||
.messages(Arrays.asList(systemMsg, userMsg))
|
||||
.resultFormat(GenerationParam.ResultFormat.MESSAGE)
|
||||
.apiKey(apiKey)
|
||||
.topK(50)
|
||||
.temperature(0.1f)
|
||||
.topP(0.8)
|
||||
.seed(1234)
|
||||
.build();
|
||||
|
||||
return gen.call(param);
|
||||
}
|
||||
}
|
@ -0,0 +1,143 @@
|
||||
package cn.luckday.service;
|
||||
|
||||
|
||||
import co.elastic.clients.elasticsearch.ElasticsearchClient;
|
||||
import co.elastic.clients.elasticsearch._types.Script;
|
||||
import co.elastic.clients.elasticsearch._types.query_dsl.*;
|
||||
import co.elastic.clients.elasticsearch.core.IndexResponse;
|
||||
import co.elastic.clients.elasticsearch.core.SearchResponse;
|
||||
import co.elastic.clients.elasticsearch.indices.CreateIndexRequest;
|
||||
import co.elastic.clients.elasticsearch.indices.CreateIndexResponse;
|
||||
import co.elastic.clients.json.JsonData;
|
||||
import cn.luckday.bean.SearchResult;
|
||||
import cn.luckday.bean.KnowsIndex;
|
||||
import cn.luckday.embed.EmbedClient;
|
||||
import jakarta.annotation.Resource;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class EsDocumentService {
|
||||
|
||||
@Value("${embedding.uri}")
|
||||
private String embeddingUri;
|
||||
|
||||
@Value("${embedding.api-key}")
|
||||
private String embeddingApiKey;
|
||||
|
||||
@Resource
|
||||
private ElasticsearchClient client;
|
||||
|
||||
public static final String INDEX_NAME = "knows_index";
|
||||
|
||||
public static final float SIMILARITY_THRESHOLD = 0.2f;
|
||||
|
||||
/**
|
||||
* 创建索引
|
||||
* @throws IOException 异常
|
||||
*/
|
||||
public void createIndex() throws IOException {
|
||||
CreateIndexRequest request = new CreateIndexRequest.Builder()
|
||||
.index(INDEX_NAME)
|
||||
|
||||
.mappings(m -> m
|
||||
.properties("file_name", p -> p.keyword(k -> k))
|
||||
.properties("file_path", p -> p.keyword(k -> k))
|
||||
.properties("file_type", p -> p.keyword(k -> k))
|
||||
.properties("file_size", p -> p.keyword(k -> k))
|
||||
.properties("remark_vec", p -> p
|
||||
.denseVector(dv -> dv
|
||||
.dims(1024)
|
||||
.index(true)
|
||||
.similarity("cosine")
|
||||
)
|
||||
)
|
||||
.properties("remark", p -> p
|
||||
.text(t -> t)
|
||||
)
|
||||
// .properties("remark", p -> p
|
||||
// .text(t -> t.searchAnalyzer("ik_smart")
|
||||
// .analyzer("ik_smart") // 使用 IK 分词器
|
||||
// )
|
||||
// )
|
||||
)
|
||||
.build();
|
||||
|
||||
CreateIndexResponse createIndexResponse = client.indices().create(request);
|
||||
log.info("Index created: {}", createIndexResponse.acknowledged());
|
||||
}
|
||||
|
||||
/**
|
||||
* 添加数据
|
||||
* @param knowsIndexList 数据
|
||||
* @throws IOException 异常
|
||||
*/
|
||||
public void indexSellList(List<KnowsIndex> knowsIndexList) throws IOException {
|
||||
for (KnowsIndex knowsIndex : knowsIndexList) {
|
||||
knowsIndex.setContent_vec(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, knowsIndex.getContent()));
|
||||
IndexResponse response = client.index(i -> i
|
||||
.index(INDEX_NAME)
|
||||
.id(knowsIndex.getId())
|
||||
.document(knowsIndex)
|
||||
);
|
||||
log.info("Sell indexed: {}", response.id());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 检索
|
||||
*
|
||||
* @param queryVector 向量
|
||||
*/
|
||||
public List<SearchResult> searchVector(double[] queryVector) throws IOException {
|
||||
// 创建向量相似度查询
|
||||
ScriptScoreQuery scriptScoreQuery = ScriptScoreQuery.of(q -> q
|
||||
.query(QueryBuilders.matchAll().build()._toQuery())
|
||||
.script(Script.of(s -> s.inline(i -> i
|
||||
.source("double score = cosineSimilarity(params.query_vector, 'content_vec'); " +
|
||||
"score = Math.min(1.0, Math.max(0.0, score)); " + // 确保评分在[0, 1]之间
|
||||
"if (score < params.threshold) { return 0; } else { return score; }")
|
||||
.params(Map.of(
|
||||
"query_vector", JsonData.of(queryVector),
|
||||
"threshold", JsonData.of(SIMILARITY_THRESHOLD) // 将阈值作为参数传递给脚本
|
||||
))))));
|
||||
|
||||
// 创建bool查询,向量相似度查询作为should子句
|
||||
Query boolQuery = QueryBuilders.bool(b -> b
|
||||
.should(scriptScoreQuery._toQuery())
|
||||
);
|
||||
|
||||
Query functionScoreQuery = QueryBuilders.functionScore(fs -> fs
|
||||
.query(boolQuery)
|
||||
.scoreMode(FunctionScoreMode.Max)
|
||||
.boostMode(FunctionBoostMode.Replace)
|
||||
.minScore((double) SIMILARITY_THRESHOLD)
|
||||
);
|
||||
|
||||
// 执行合并后的查询
|
||||
SearchResponse<KnowsIndex> combinedSearchResponse = client.search(s -> s
|
||||
.index(INDEX_NAME)
|
||||
.query(functionScoreQuery),
|
||||
KnowsIndex.class);
|
||||
|
||||
// 处理查询的结果
|
||||
return combinedSearchResponse.hits().hits().stream()
|
||||
.map(hit -> {
|
||||
double finalScore = Objects.nonNull(hit.score()) ? hit.score() : 0.0;
|
||||
return finalScore >= SIMILARITY_THRESHOLD ? new SearchResult(hit.source(), finalScore) : null;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.sorted(Comparator.comparingDouble(SearchResult::getScore).reversed())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
135
knows-java/src/main/java/cn/luckday/service/RedFileService.java
Normal file
135
knows-java/src/main/java/cn/luckday/service/RedFileService.java
Normal file
@ -0,0 +1,135 @@
|
||||
package cn.luckday.service;
|
||||
|
||||
import cn.hutool.core.util.IdUtil;
|
||||
import cn.luckday.bean.KnowsIndex;
|
||||
import cn.luckday.embed.EmbedClient;
|
||||
import cn.luckday.document.PDFParser;
|
||||
import cn.luckday.document.WordProcessor;
|
||||
import jakarta.annotation.Resource;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFPicture;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Service
|
||||
public class RedFileService {
|
||||
private static final String TEMP_DIR = "src/main/resources/temp_uploads";
|
||||
|
||||
@Value("${embedding.uri}")
|
||||
private String embeddingUri;
|
||||
|
||||
@Value("${embedding.api-key}")
|
||||
private String embeddingApiKey;
|
||||
|
||||
@Resource
|
||||
private EsDocumentService esDocumentService;
|
||||
|
||||
public void uploadFile(MultipartFile file) {
|
||||
try {
|
||||
String projectPath = System.getProperty("user.dir");
|
||||
Path tempDirPath = Paths.get(projectPath, TEMP_DIR);
|
||||
if (!Files.exists(tempDirPath)) {
|
||||
Files.createDirectories(tempDirPath);
|
||||
}
|
||||
|
||||
// 获取文件名和扩展名
|
||||
String originalFilename = file.getOriginalFilename();
|
||||
String fileExtension = getFileExtension(originalFilename);
|
||||
|
||||
// 生成临时文件路径
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"));
|
||||
String tempFileName = timestamp + "_" + originalFilename;
|
||||
Path tempFilePath = Paths.get(projectPath, TEMP_DIR, tempFileName);
|
||||
|
||||
// 保存上传的文件
|
||||
file.transferTo(tempFilePath.toFile());
|
||||
|
||||
// 解析文件内容
|
||||
Map<String, Object> parsedContent = parseFile(tempFilePath.toString(), fileExtension);
|
||||
|
||||
// 保存到 Elasticsearch
|
||||
String text = parsedContent.get("text").toString();
|
||||
|
||||
KnowsIndex knowsIndex = new KnowsIndex();
|
||||
knowsIndex.setId(String.valueOf(IdUtil.getSnowflakeNextId()));
|
||||
knowsIndex.setContent(text);
|
||||
knowsIndex.setContent_vec(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, text));
|
||||
esDocumentService.indexSellList(Arrays.asList(knowsIndex));
|
||||
|
||||
// 清理临时文件
|
||||
Files.deleteIfExists(tempFilePath);
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private String getFileExtension(String filename) {
|
||||
if (filename == null)
|
||||
return "";
|
||||
int lastDotIndex = filename.lastIndexOf('.');
|
||||
return (lastDotIndex == -1) ? "" : filename.substring(lastDotIndex + 1).toLowerCase();
|
||||
}
|
||||
|
||||
private Map<String, Object> parseFile(String filePath, String extension) throws Exception {
|
||||
Map<String, Object> content = new HashMap<>();
|
||||
|
||||
switch (extension) {
|
||||
case "pdf":
|
||||
PDFParser pdfParser = new PDFParser(filePath);
|
||||
pdfParser.parse();
|
||||
|
||||
// 获取解析结果
|
||||
List<String> texts = pdfParser.getExtractedText();
|
||||
List<BufferedImage> images = pdfParser.getExtractedImages();
|
||||
|
||||
// 合并所有文本
|
||||
StringBuilder fullText = new StringBuilder();
|
||||
for (String text : texts) {
|
||||
fullText.append(text).append("\n");
|
||||
}
|
||||
|
||||
content.put("text", fullText.toString());
|
||||
content.put("imageCount", images.size());
|
||||
break;
|
||||
|
||||
case "docx":
|
||||
WordProcessor wordProcessor = new WordProcessor(filePath);
|
||||
wordProcessor.process();
|
||||
|
||||
List<String> extractedText = wordProcessor.getExtractedText();
|
||||
// 合并所有文本
|
||||
StringBuilder docxFullText = new StringBuilder();
|
||||
for (String text : extractedText) {
|
||||
docxFullText.append(text).append("\n");
|
||||
}
|
||||
|
||||
List<XWPFPicture> extractedImages = wordProcessor.getExtractedImages();
|
||||
content.put("text", docxFullText.toString());
|
||||
content.put("imageCount", extractedImages.size());
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("不支持的文件类型: " + extension);
|
||||
}
|
||||
|
||||
// 添加元数据
|
||||
content.put("filename", new File(filePath).getName());
|
||||
content.put("uploadTime", LocalDateTime.now().toString());
|
||||
content.put("fileType", extension);
|
||||
|
||||
return content;
|
||||
}
|
||||
}
|
32
knows-java/src/main/resources/application.yml
Normal file
32
knows-java/src/main/resources/application.yml
Normal file
@ -0,0 +1,32 @@
|
||||
server:
|
||||
port: 8899
|
||||
|
||||
spring:
|
||||
servlet:
|
||||
multipart:
|
||||
max-file-size: 10MB
|
||||
max-request-size: 10MB
|
||||
main:
|
||||
allow-bean-definition-overriding: true
|
||||
application:
|
||||
name: knows
|
||||
|
||||
elasticsearch:
|
||||
uris: 172.16.100.47:9200
|
||||
# username: elastic
|
||||
# password: 123456
|
||||
|
||||
qwen:
|
||||
api-key: sk-**********************
|
||||
model: qwen-plus
|
||||
|
||||
oll:
|
||||
uri: http://172.16.90.4:11434/api/generate
|
||||
|
||||
embedding:
|
||||
uri: http://172.16.90.4:6009/v1/embed
|
||||
api-key: sk-abcdefg1234567
|
||||
|
||||
re-rank:
|
||||
uri: http://172.16.90.4:6010/v1/reRank
|
||||
api-key: sk-abcdefg1234567
|
BIN
knows-java/src/main/resources/native/opencv_java4110.dll
Normal file
BIN
knows-java/src/main/resources/native/opencv_java4110.dll
Normal file
Binary file not shown.
BIN
knows-java/src/main/resources/ocr/chi_sim.traineddata
Normal file
BIN
knows-java/src/main/resources/ocr/chi_sim.traineddata
Normal file
Binary file not shown.
BIN
knows-java/src/main/resources/ocr/eng.traineddata
Normal file
BIN
knows-java/src/main/resources/ocr/eng.traineddata
Normal file
Binary file not shown.
BIN
knows-java/src/main/resources/ocr/osd.traineddata
Normal file
BIN
knows-java/src/main/resources/ocr/osd.traineddata
Normal file
Binary file not shown.
24
knows-java/src/test/java/cn/luckday/ApplicationTests.java
Normal file
24
knows-java/src/test/java/cn/luckday/ApplicationTests.java
Normal file
@ -0,0 +1,24 @@
|
||||
package cn.luckday;
|
||||
|
||||
import cn.luckday.service.EsDocumentService;
|
||||
import jakarta.annotation.Resource;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@SpringBootTest
|
||||
class ApplicationTests {
|
||||
|
||||
@Test
|
||||
void contextLoads() {
|
||||
}
|
||||
|
||||
@Resource
|
||||
private EsDocumentService service;
|
||||
|
||||
@Test
|
||||
void create() throws IOException {
|
||||
service.createIndex();
|
||||
}
|
||||
}
|
18
konws-python/embed/Dockerfile
Normal file
18
konws-python/embed/Dockerfile
Normal file
@ -0,0 +1,18 @@
|
||||
# 使用官方Python运行时作为父镜像
|
||||
FROM python:3.10
|
||||
|
||||
# 设置工作目录
|
||||
WORKDIR /app
|
||||
|
||||
# 将当前目录内容复制到容器的/app中
|
||||
ADD . /app
|
||||
|
||||
RUN pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
# 安装程序需要的包
|
||||
RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
# 运行时监听的端口
|
||||
EXPOSE 6009
|
||||
|
||||
# Container start command: serve the FastAPI app defined in embed.py with uvicorn
|
||||
CMD ["uvicorn", "embed:app", "--host", "0.0.0.0", "--port", "6009"]
|
76
konws-python/embed/embed.py
Normal file
76
konws-python/embed/embed.py
Normal file
@ -0,0 +1,76 @@
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, Depends, HTTPException, status
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from pydantic import BaseModel
|
||||
from sentence_transformers import SentenceTransformer, models
|
||||
|
||||
# 环境变量传入
|
||||
sk_key = os.environ.get('sk-key', 'sk-aaabbbcccdddeeefffggghhhiiijjjkkk')
|
||||
|
||||
# 创建一个FastAPI实例
|
||||
app = FastAPI()
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# 创建一个HTTPBearer实例
|
||||
security = HTTPBearer()
|
||||
# 加载预训练的 Transformer 模型
|
||||
transformer_model = models.Transformer('./m3e-large', cache_dir='./cache')
|
||||
|
||||
# 创建 Mean Pooling 层
|
||||
pooling_model = models.Pooling(transformer_model.get_word_embedding_dimension(), pooling_mode='mean')
|
||||
|
||||
# 构建 SentenceTransformer 模型
|
||||
model = SentenceTransformer(modules=[transformer_model, pooling_model])
|
||||
|
||||
|
||||
class EmbeddingRequest(BaseModel):
|
||||
input: List[str]
|
||||
|
||||
|
||||
class EmbeddingResponse(BaseModel):
|
||||
data: list
|
||||
dimension: int
|
||||
|
||||
|
||||
@app.post("/v1/embed", response_model=EmbeddingResponse)
async def get_embed(request: EmbeddingRequest, credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Return one L2-normalised embedding per input text.

    Args:
        request: Body carrying ``input``, the list of texts to embed.
        credentials: Bearer token taken from the ``Authorization`` header; it must
            equal the module-level ``sk_key``.

    Returns:
        A dict with ``data`` (a list of ``{"embedding": [...], "index": i}`` entries in
        input order) and ``dimension`` (the length of each embedding vector). For an
        empty ``input`` list ``data`` is empty and ``dimension`` is taken from the
        loaded transformer model.

    Raises:
        HTTPException: 401 when the bearer token does not match ``sk_key``.
    """
    if credentials.credentials != sk_key:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authorization code",
        )

    data = []
    for index, text in enumerate(request.input):
        vector = model.encode(text)
        norm = np.linalg.norm(vector)
        # Skip the division for an all-zero vector: dividing by 0 would produce NaN
        # values in the response payload.
        if norm > 0:
            vector = vector / norm
        data.append({"embedding": vector.tolist(), "index": index})

    if data:
        dimension = len(data[0]["embedding"])
    else:
        # No vector to measure (the original indexed embeddings[0] here and crashed
        # with IndexError). The pooling layer is built with this same size at module
        # load, so report it directly.
        dimension = transformer_model.get_word_embedding_dimension()

    return {"data": data, "dimension": dimension}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
uvicorn.run("embed:app", host='0.0.0.0', port=6009, workers=2)
|
10
konws-python/embed/requirements.txt
Normal file
10
konws-python/embed/requirements.txt
Normal file
@ -0,0 +1,10 @@
|
||||
fastapi==0.99.1
|
||||
pydantic==1.10.7
|
||||
sentence-transformers==3.3.1
|
||||
uvicorn==0.23.1
|
||||
numpy==1.24.4
|
||||
scipy==1.10.1
|
||||
scikit-learn==1.3.0
|
||||
torchvision
|
||||
torchaudio
|
||||
torch
|
18
konws-python/rerank/Dockerfile
Normal file
18
konws-python/rerank/Dockerfile
Normal file
@ -0,0 +1,18 @@
|
||||
# 使用官方Python运行时作为父镜像
|
||||
FROM python:3.10
|
||||
|
||||
# 设置工作目录
|
||||
WORKDIR /app
|
||||
|
||||
# 将当前目录内容复制到容器的/app中
|
||||
ADD . /app
|
||||
|
||||
RUN pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
# 安装程序需要的包
|
||||
RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
# 运行时监听的端口
|
||||
EXPOSE 6010
|
||||
|
||||
# Container start command: serve the FastAPI app defined in rerank.py with uvicorn
|
||||
CMD ["uvicorn", "rerank:app", "--host", "0.0.0.0", "--port", "6010"]
|
12
konws-python/rerank/requirements.txt
Normal file
12
konws-python/rerank/requirements.txt
Normal file
@ -0,0 +1,12 @@
|
||||
fastapi==0.99.1
|
||||
pydantic==1.10.7
|
||||
uvicorn==0.23.1
|
||||
tiktoken==0.4.0
|
||||
numpy==1.24.4
|
||||
scipy==1.10.1
|
||||
scikit-learn==1.5.0
|
||||
torchvision
|
||||
torchaudio
|
||||
torch
|
||||
BCEmbedding==0.1.5
|
||||
starlette~=0.27.0
|
58
konws-python/rerank/rerank.py
Normal file
58
konws-python/rerank/rerank.py
Normal file
@ -0,0 +1,58 @@
|
||||
import os
|
||||
from typing import List
|
||||
import uvicorn
|
||||
from BCEmbedding import RerankerModel
|
||||
from fastapi import FastAPI, Depends, HTTPException, status
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from pydantic import BaseModel
|
||||
from starlette.middleware.cors import CORSMiddleware
|
||||
|
||||
# 环境变量传入
|
||||
sk_key = os.environ.get('sk-key', 'sk-aaabbbcccdddeeefffggghhhiiijjjkkk...')
|
||||
|
||||
# 创建一个FastAPI实例
|
||||
app = FastAPI()
|
||||
|
||||
# 添加CORS中间件
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # 允许所有来源
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"], # 允许所有方法
|
||||
allow_headers=["*"], # 允许所有头部
|
||||
)
|
||||
|
||||
# 创建一个HTTPBearer实例
|
||||
security = HTTPBearer()
|
||||
|
||||
# 初始化模型
|
||||
model = RerankerModel(model_name_or_path="./bce-reranker-base_v1")
|
||||
|
||||
|
||||
class ReRankRequest(BaseModel):
|
||||
textList: List[str]
|
||||
query: str
|
||||
|
||||
|
||||
class ReRankResponse(BaseModel):
|
||||
rerank_passages: List[str]
|
||||
rerank_scores: List[float]
|
||||
rerank_ids: List[int]
|
||||
|
||||
|
||||
# 定义路由,处理rerank请求
|
||||
@app.post("/v1/reRank", response_model=ReRankResponse)
async def get_embeddings(request: ReRankRequest, credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Re-rank ``request.textList`` against ``request.query`` with the loaded reranker.

    Args:
        request: Body carrying ``query`` and ``textList`` (the candidate passages).
        credentials: Bearer token taken from the ``Authorization`` header; it must
            equal the module-level ``sk_key``.

    Returns:
        The dict produced by ``model.rerank``, with ``rerank_passages``,
        ``rerank_scores`` and ``rerank_ids`` always present so that it satisfies
        ``ReRankResponse``.

    Raises:
        HTTPException: 401 when the bearer token does not match ``sk_key``.
    """
    if credentials.credentials != sk_key:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authorization code",
        )

    result = dict(model.rerank(request.query, request.textList))
    # ReRankResponse requires all three lists. NOTE(review): the reranker appears to
    # return early without "rerank_ids" when the query or every passage is empty -
    # confirm against the pinned BCEmbedding version. Filling in empty lists keeps
    # that case from failing response validation.
    for key in ("rerank_passages", "rerank_scores", "rerank_ids"):
        result.setdefault(key, [])
    return result
|
||||
|
||||
|
||||
# 运行应用
|
||||
if __name__ == "__main__":
|
||||
uvicorn.run("rerank:app", host='0.0.0.0', port=6010, workers=2)
|
296
konws-web/chatbox.html
Normal file
296
konws-web/chatbox.html
Normal file
@ -0,0 +1,296 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<title>DeepSeek 32B Chat</title>
|
||||
<script src="js/marked.min.js"></script>
|
||||
<link rel="stylesheet" href="css/main.css" />
|
||||
</head>
|
||||
<body>
|
||||
<div id="chatBox">
|
||||
<div class="messages-container"></div>
|
||||
<div id="inputArea">
|
||||
<input type="text" id="userInput" placeholder="输入消息..." />
|
||||
<button onclick="sendMessage()" id="sendBtn">
|
||||
<svg class="icon" viewBox="0 0 1057 1024" xmlns="http://www.w3.org/2000/svg" width="20" height="20">
|
||||
<path
|
||||
d="M891.904 825.782857L462.482286 693.613714l429.421714-495.396571-561.517714 495.469714L0.073143 561.590857 1057.133714 0.073143 891.904 825.782857zM462.482286 1024v-231.058286l132.096 65.828572-132.096 165.156571z"
|
||||
fill="#ffffff"
|
||||
></path>
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 添加上传按钮 -->
|
||||
<div class="upload-button-container">
|
||||
<button onclick="showUploadDialog()" class="upload-btn">
|
||||
<svg viewBox="0 0 1024 1024" xmlns="http://www.w3.org/2000/svg" width="20" height="20">
|
||||
<path
|
||||
d="M1024 736s-3.4048-10.24-10.24-20.5056l-150.1696-300.3392c-6.8352-10.24-20.48-20.5056-34.1248-20.5056H706.56c-13.6448 0-23.8848 10.24-23.8848 23.9104v40.96c0 13.6448 10.2144 23.9104 23.8848 23.9104h40.96c13.6448 0 27.2896 10.24 34.1248 20.48l105.8304 215.0656c6.8352 10.1888 0 20.4544-13.6704 20.4544H706.56c-13.6448 0-23.8848 10.24-23.8848 23.9104v122.9056c0 13.6448-10.24 23.9104-23.9104 23.9104H365.2352a23.3216 23.3216 0 0 1-23.8848-23.9104v-122.9056c0-13.6448-10.24-23.9104-23.9104-23.9104H146.7648c-13.6448 0-17.0752-10.24-13.6448-20.4544l109.2352-215.0656c6.8352-10.2144 20.48-20.48 34.1248-20.48h37.5552c13.6448 0 23.9104-10.24 23.9104-23.9104v-37.5552c0-13.6448-10.24-23.8848-23.9104-23.8848h-122.88c-13.6704 0-27.3152 10.2144-34.1248 20.48L10.24 715.4944c-6.8352 10.2656-10.24 20.5056-10.24 20.5056v235.4944c0 13.6448 10.24 23.9104 23.8848 23.9104h976.2048c13.6448 0 23.9104-10.24 23.9104-23.9104V736zM300.3648 292.2752h126.2848v358.4h170.6752v-358.4h133.12c13.6448 0 17.0752-6.8352 6.8352-17.0496l-211.6352-238.9504c-6.8352-10.2144-23.8848-10.2144-30.72 0l-204.8 238.9504c-6.8096 10.2144-3.4048 17.0496 10.24 17.0496z"
|
||||
fill="#fff"
|
||||
></path>
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- 文件上传弹窗 -->
|
||||
<div id="uploadDialog" class="upload-dialog">
|
||||
<div class="upload-dialog-content">
|
||||
<span class="close-btn" onclick="closeUploadDialog()">×</span>
|
||||
<h2>上传文件</h2>
|
||||
<div class="upload-area" id="dropZone">
|
||||
<input type="file" id="fileInput" style="display: none" onchange="handleFileSelect(event)" />
|
||||
<div class="upload-placeholder" onclick="document.getElementById('fileInput').click()">
|
||||
<i class="fas fa-cloud-upload-alt"></i>
|
||||
<p>点击或拖拽文件到此处上传</p>
|
||||
<p class="supported-formats">支持的格式: PDF, DOC, DOCX</p>
|
||||
</div>
|
||||
</div>
|
||||
<div id="uploadProgress" class="upload-progress" style="display: none">
|
||||
<div class="progress-bar">
|
||||
<div class="progress-fill"></div>
|
||||
</div>
|
||||
<span class="progress-text">0%</span>
|
||||
</div>
|
||||
<div id="uploadStatus" class="upload-status"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const url = "http://localhost:8899";
|
||||
|
||||
let currentBotMessage = null;
|
||||
|
||||
// 添加消息到聊天框
|
||||
/**
 * Appends a chat bubble (avatar + content) to the messages container and scrolls
 * the container to the bottom.
 *
 * @param {string} content - Message text. User text is inserted verbatim; bot text
 *   is rendered as Markdown through `marked.parse`.
 * @param {boolean} [isUser=false] - True for a user bubble, false for a bot bubble.
 * @returns {HTMLDivElement} The newly created message element.
 */
function addMessage(content, isUser = false) {
  const messagesContainer = document.querySelector(".messages-container");

  const messageDiv = document.createElement("div");
  messageDiv.className = `message ${isUser ? "user-message" : "bot-message"}`;

  // Both avatars use page-relative paths. The bot avatar used to be the absolute
  // "/images/bot-avatar.png", which breaks whenever the page is not served from the
  // site root and disagreed with the path used by streamResponse.
  const avatar = document.createElement("img");
  avatar.className = "avatar";
  avatar.src = isUser ? "./images/user-avatar.png" : "./images/bot-avatar.png";
  avatar.alt = isUser ? "User Avatar" : "Bot Avatar";

  const messageContent = document.createElement("div");
  messageContent.className = "message-content";
  if (isUser) {
    // textContent keeps user input from being interpreted as HTML.
    messageContent.textContent = content;
  } else {
    messageContent.innerHTML = marked.parse(content);
  }

  messageDiv.appendChild(avatar);
  messageDiv.appendChild(messageContent);
  messagesContainer.appendChild(messageDiv);
  messagesContainer.scrollTop = messagesContainer.scrollHeight;
  return messageDiv;
}
|
||||
|
||||
// 修改处理流式响应的部分
|
||||
/**
 * Posts `prompt` to the `/knows/generate` endpoint and renders the streamed reply
 * into a new bot bubble, re-rendering the accumulated Markdown on every event.
 *
 * The response body is read as lines of the form `data: <json>`; each JSON payload
 * may carry a `response` text fragment and a `done` flag. The send button is
 * disabled for the duration of the request.
 *
 * @param {string} prompt - The user's message, sent as the `keyword` field.
 * @returns {Promise<void>} Resolves when the stream has ended or failed; failures
 *   are shown as a bot message and are not rethrown.
 */
async function streamResponse(prompt) {
  const sendButton = document.getElementById("sendBtn");
  const container = document.querySelector(".messages-container");
  sendButton.disabled = true;

  let accumulatedContent = "";
  let botContent = null; // content element of the bubble owned by this call

  // Parses one line of the stream and updates this call's bubble.
  const handleLine = (rawLine) => {
    const line = rawLine.trim();
    if (!line.startsWith("data: ")) return;
    try {
      // Strip the "data: " prefix and parse the JSON payload.
      const jsonData = JSON.parse(line.substring(6));
      if (jsonData.response) {
        accumulatedContent += jsonData.response;
        botContent.innerHTML = marked.parse(accumulatedContent);
      }
      if (jsonData.done && currentBotMessage === botContent) {
        currentBotMessage = null;
      }
    } catch (error) {
      console.error("解析数据失败:", error, "原始数据:", line);
    }
  };

  try {
    const response = await fetch(url + "/knows/generate", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Accept: "text/event-stream"
      },
      body: JSON.stringify({
        keyword: prompt
      })
    });

    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();

    // Build the bot bubble (avatar + content) that this reply streams into.
    const messageDiv = document.createElement("div");
    messageDiv.className = "message bot-message";

    const avatar = document.createElement("img");
    avatar.className = "avatar";
    avatar.src = "./images/bot-avatar.png";
    avatar.alt = "Bot Avatar";

    const messageContent = document.createElement("div");
    messageContent.className = "message-content";

    messageDiv.appendChild(avatar);
    messageDiv.appendChild(messageContent);
    container.appendChild(messageDiv);

    botContent = messageContent;
    currentBotMessage = botContent;

    // Network chunks do not respect line or character boundaries, so keep the
    // unfinished tail of each chunk and prepend it to the next one.
    let pending = "";
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // stream: true makes the decoder hold back an incomplete multi-byte UTF-8
      // sequence instead of emitting a replacement character.
      pending += decoder.decode(value, { stream: true });
      const lines = pending.split("\n");
      pending = lines.pop();
      lines.forEach(handleLine);

      container.scrollTop = container.scrollHeight;
    }

    // Flush whatever the decoder still buffers and process a final unterminated line.
    pending += decoder.decode();
    if (pending) {
      handleLine(pending);
      container.scrollTop = container.scrollHeight;
    }
  } catch (error) {
    console.error("请求失败:", error);
    addMessage(`[错误] ${error.message}`, false);
  } finally {
    // Always release the bubble, even when the stream ended or failed without a
    // `done` event; otherwise the next reply would overwrite this one.
    if (currentBotMessage === botContent) {
      currentBotMessage = null;
    }
    sendButton.disabled = false;
  }
}
|
||||
|
||||
// 发送消息
|
||||
/**
 * Reads the text box, shows the text as a user bubble, clears the box and streams
 * the bot reply.
 *
 * Does nothing when the input is blank or while the send button is disabled.
 *
 * @returns {Promise<void>} Resolves once the reply stream has finished.
 */
async function sendMessage() {
  const input = document.getElementById("userInput");
  const sendButton = document.getElementById("sendBtn");

  // streamResponse disables the button while a reply is in flight. The Enter-key
  // handler calls this function directly, so honour the disabled state here too,
  // otherwise a second request could start mid-stream.
  if (sendButton && sendButton.disabled) return;

  const userMessage = input.value.trim();
  if (!userMessage) return;

  addMessage(userMessage, true);
  input.value = "";

  await streamResponse(userMessage);
}
|
||||
|
||||
// 回车键发送
|
||||
document.getElementById("userInput").addEventListener("keypress", (e) => {
|
||||
if (e.key === "Enter" && !e.shiftKey) {
|
||||
e.preventDefault();
|
||||
sendMessage();
|
||||
}
|
||||
});
|
||||
|
||||
/** Makes the upload dialog visible. */
function showUploadDialog() {
  const dialog = document.getElementById("uploadDialog");
  dialog.style.display = "block";
}
|
||||
|
||||
/** Hides the upload dialog and restores it to its initial state. */
function closeUploadDialog() {
  const dialog = document.getElementById("uploadDialog");
  dialog.style.display = "none";
  resetUploadDialog();
}
|
||||
|
||||
/**
 * Restores the upload dialog to its initial state: clears the chosen file, hides
 * and zeroes the progress bar, and clears the status line.
 */
function resetUploadDialog() {
  const status = document.getElementById("uploadStatus");

  document.getElementById("fileInput").value = "";
  document.getElementById("uploadProgress").style.display = "none";
  // Zero the bar as well; otherwise a reopened dialog still shows the 100% left
  // behind by the previous upload.
  updateProgress(0);
  status.innerHTML = "";
  status.className = "upload-status";
}
|
||||
|
||||
/**
 * Change handler for the hidden file input: uploads the first selected file.
 *
 * @param {Event} event - The input's change event.
 */
function handleFileSelect(event) {
  const selected = event.target.files[0];
  if (!selected) return;
  uploadFile(selected);
}
|
||||
|
||||
/**
 * Sets the progress bar width and its text label to the given percentage.
 *
 * @param {number} percent - Value written as `<percent>%`.
 */
function updateProgress(percent) {
  const fill = document.querySelector(".progress-fill");
  const label = document.querySelector(".progress-text");
  const text = `${percent}%`;

  fill.style.width = text;
  label.textContent = text;
}
|
||||
|
||||
/**
 * Uploads `file` to `/api/file/upload` as multipart form data and reports the
 * outcome in the upload dialog.
 *
 * On success the `message` field of the JSON response is shown and the dialog
 * closes after two seconds; on any failure an error status is shown instead.
 *
 * @param {File} file - The file to send under the form field name "file".
 */
function uploadFile(file) {
  const formData = new FormData();
  formData.append("file", file);

  const progressDiv = document.getElementById("uploadProgress");
  const statusDiv = document.getElementById("uploadStatus");

  progressDiv.style.display = "block";
  updateProgress(0); // start from an empty bar even after an earlier upload
  statusDiv.textContent = "正在上传...";
  statusDiv.className = "upload-status";

  fetch(url + "/api/file/upload", {
    method: "POST",
    body: formData
  })
    .then((response) => {
      // fetch only rejects on network errors; without this check an HTTP error
      // carrying a JSON body would be reported as a successful upload.
      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
      return response.json();
    })
    .then((data) => {
      // textContent: the message comes from the server and must not be parsed as HTML.
      statusDiv.textContent = data.message;
      statusDiv.className = "upload-status success";
      updateProgress(100);
      setTimeout(() => {
        closeUploadDialog();
      }, 2000);
    })
    .catch((error) => {
      statusDiv.textContent = "上传失败: " + error.message;
      statusDiv.className = "upload-status error";
      updateProgress(0);
    });
}
|
||||
|
||||
// 添加拖拽上传支持
|
||||
const dropZone = document.getElementById("dropZone");
|
||||
|
||||
dropZone.addEventListener("dragover", (e) => {
|
||||
e.preventDefault();
|
||||
dropZone.style.borderColor = "#4CAF50";
|
||||
});
|
||||
|
||||
dropZone.addEventListener("dragleave", (e) => {
|
||||
e.preventDefault();
|
||||
dropZone.style.borderColor = "#ccc";
|
||||
});
|
||||
|
||||
dropZone.addEventListener("drop", (e) => {
|
||||
e.preventDefault();
|
||||
dropZone.style.borderColor = "#ccc";
|
||||
const file = e.dataTransfer.files[0];
|
||||
if (file) {
|
||||
uploadFile(file);
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
337
konws-web/css/main.css
Normal file
337
konws-web/css/main.css
Normal file
@ -0,0 +1,337 @@
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
padding: 0;
|
||||
background-color: #f5f5f5;
|
||||
height: 100vh;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
#chatBox {
|
||||
flex: 1;
|
||||
background: #ededed;
|
||||
padding: 20px;
|
||||
overflow-y: auto;
|
||||
margin: 50px;
|
||||
border-radius: 18px;
|
||||
position: relative;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
padding-bottom: 80px;
|
||||
scrollbar-width: none;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
||||
/* Firefox */
|
||||
-ms-overflow-style: none;
|
||||
/* IE and Edge */
|
||||
}
|
||||
|
||||
#chatBox::-webkit-scrollbar {
|
||||
display: none;
|
||||
/* Chrome, Safari, Opera */
|
||||
}
|
||||
|
||||
.messages-container {
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
padding: 20px;
|
||||
scrollbar-width: none;
|
||||
/* Firefox */
|
||||
-ms-overflow-style: none;
|
||||
/* IE and Edge */
|
||||
}
|
||||
|
||||
.messages-container::-webkit-scrollbar {
|
||||
display: none;
|
||||
/* Chrome, Safari, Opera */
|
||||
}
|
||||
|
||||
.message {
|
||||
margin: 10px 0;
|
||||
padding: 10px 15px;
|
||||
border-radius: 4px;
|
||||
max-width: 70%;
|
||||
word-wrap: break-word;
|
||||
position: relative;
|
||||
line-height: 1.5;
|
||||
font-size: 15px;
|
||||
width: max-content;
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.avatar {
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
border-radius: 50%;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.message-content {
|
||||
padding: 10px 15px;
|
||||
border-radius: 15px;
|
||||
}
|
||||
|
||||
.user-message {
|
||||
width: max-content;
|
||||
margin-left: auto;
|
||||
flex-direction: row-reverse;
|
||||
}
|
||||
|
||||
.user-message .message-content {
|
||||
background: #95ec69;
|
||||
border-radius: 15px 0 15px 15px;
|
||||
}
|
||||
|
||||
.bot-message {
|
||||
background: white;
|
||||
margin-right: auto;
|
||||
border-radius: 0 15px 15px 15px;
|
||||
}
|
||||
|
||||
#inputArea {
|
||||
position: absolute;
|
||||
bottom: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
padding: 15px;
|
||||
background: white;
|
||||
border-radius: 20px;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
||||
width: calc(100% - 100px);
|
||||
max-width: 600px;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
#userInput {
|
||||
flex-grow: 1;
|
||||
padding: 8px 12px;
|
||||
border: 1px solid #ddd;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
font-size: 15px;
|
||||
background: white;
|
||||
outline: none;
|
||||
}
|
||||
|
||||
#userInput:focus {
|
||||
border-color: #07c160;
|
||||
}
|
||||
|
||||
button {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
background: #07c160;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
transition: background 0.2s;
|
||||
font-size: 15px;
|
||||
border-radius: 50%;
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
position: absolute;
|
||||
right: 18px;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
background: #06ae56;
|
||||
}
|
||||
|
||||
button:disabled {
|
||||
background: #9fd7b5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
/* Markdown 样式优化 */
|
||||
.message pre {
|
||||
background: #f8f9fa;
|
||||
padding: 12px;
|
||||
border-radius: 4px;
|
||||
overflow-x: auto;
|
||||
margin: 8px 0;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.message code {
|
||||
font-family: Menlo, Monaco, Consolas, "Courier New", monospace;
|
||||
background: rgba(0, 0, 0, 0.05);
|
||||
padding: 2px 4px;
|
||||
border-radius: 3px;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.message p {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.message p+p {
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
/* 滚动条样式 */
|
||||
#chatBox::-webkit-scrollbar {
|
||||
width: 6px;
|
||||
}
|
||||
|
||||
#chatBox::-webkit-scrollbar-track {
|
||||
background: #f1f1f1;
|
||||
}
|
||||
|
||||
#chatBox::-webkit-scrollbar-thumb {
|
||||
background: #c1c1c1;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
#chatBox::-webkit-scrollbar-thumb:hover {
|
||||
background: #a8a8a8;
|
||||
}
|
||||
|
||||
/* 适配移动端 */
|
||||
@media (max-width: 768px) {
|
||||
body {
|
||||
max-width: 100%;
|
||||
height: 100vh;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.message {
|
||||
max-width: 85%;
|
||||
}
|
||||
|
||||
#inputArea {
|
||||
padding: 10px;
|
||||
}
|
||||
}
|
||||
|
||||
.upload-button-container {
|
||||
position: fixed;
|
||||
bottom: 20px;
|
||||
right: 20px;
|
||||
z-index: 1000;
|
||||
}
|
||||
|
||||
.upload-btn {
|
||||
background-color: #000;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 50%;
|
||||
cursor: pointer;
|
||||
font-size: 16px;
|
||||
transition: background-color 0.3s;
|
||||
bottom: 50px;
|
||||
}
|
||||
|
||||
.upload-btn:hover {
|
||||
background-color: #1a1a1a;
|
||||
}
|
||||
|
||||
.upload-dialog {
|
||||
display: none;
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: rgba(0, 0, 0, 0.5);
|
||||
z-index: 1001;
|
||||
}
|
||||
|
||||
.upload-dialog-content {
|
||||
position: relative;
|
||||
background-color: #fefefe;
|
||||
margin: 15% auto;
|
||||
padding: 20px;
|
||||
border-radius: 5px;
|
||||
width: 60%;
|
||||
max-width: 500px;
|
||||
}
|
||||
|
||||
.close-btn {
|
||||
position: absolute;
|
||||
right: 10px;
|
||||
top: 5px;
|
||||
font-size: 24px;
|
||||
cursor: pointer;
|
||||
color: #888;
|
||||
}
|
||||
|
||||
.close-btn:hover {
|
||||
color: #555;
|
||||
}
|
||||
|
||||
.upload-area {
|
||||
border: 2px dashed #ccc;
|
||||
border-radius: 5px;
|
||||
padding: 20px;
|
||||
text-align: center;
|
||||
margin: 20px 0;
|
||||
cursor: pointer;
|
||||
transition: border-color 0.3s;
|
||||
}
|
||||
|
||||
.upload-area:hover {
|
||||
border-color: #4caf50;
|
||||
}
|
||||
|
||||
.upload-placeholder {
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.upload-placeholder i {
|
||||
font-size: 48px;
|
||||
color: #4caf50;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.supported-formats {
|
||||
font-size: 12px;
|
||||
color: #888;
|
||||
margin-top: 5px;
|
||||
}
|
||||
|
||||
.upload-progress {
|
||||
margin: 15px 0;
|
||||
}
|
||||
|
||||
.progress-bar {
|
||||
width: 100%;
|
||||
height: 20px;
|
||||
background-color: #f0f0f0;
|
||||
border-radius: 10px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.progress-fill {
|
||||
width: 0%;
|
||||
height: 100%;
|
||||
background-color: #4caf50;
|
||||
transition: width 0.3s;
|
||||
}
|
||||
|
||||
.progress-text {
|
||||
display: block;
|
||||
text-align: center;
|
||||
margin-top: 5px;
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.upload-status {
|
||||
margin-top: 10px;
|
||||
text-align: center;
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.upload-status.success {
|
||||
color: #4caf50;
|
||||
}
|
||||
|
||||
.upload-status.error {
|
||||
color: #f44336;
|
||||
}
|
BIN
konws-web/images/bot-avatar.png
Normal file
BIN
konws-web/images/bot-avatar.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 3.5 KiB |
BIN
konws-web/images/user-avatar.png
Normal file
BIN
konws-web/images/user-avatar.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 4.7 KiB |
6
konws-web/js/marked.min.js
vendored
Normal file
6
konws-web/js/marked.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user