DeepSeek 32B Chat

commit 98d7406a99f5626f2cb1a5e145170a674654d820 Author: liushuang Date: Wed Mar 5 14:14:54 2025 +0800 init diff --git a/knows-java/pom.xml b/knows-java/pom.xml new file mode 100644 index 0000000..00bab2f --- /dev/null +++ b/knows-java/pom.xml @@ -0,0 +1,158 @@ + + + 4.0.0 + + org.springframework.boot + spring-boot-starter-parent + 3.3.2 + + + com.zhych + knows + 0.0.1-SNAPSHOT + embeddings + embeddings + + + 17 + + + + org.springframework.boot + spring-boot-starter-web + + + org.apache.commons + commons-lang3 + 3.12.0 + + + org.projectlombok + lombok + true + + + org.springframework.boot + spring-boot-starter-test + test + + + org.springframework.boot + spring-boot-starter-data-elasticsearch + + + com.alibaba + fastjson + 2.0.15 + compile + + + cn.hutool + hutool-all + 5.8.25 + + + com.squareup.okhttp3 + okhttp + 5.0.0-alpha.3 + + + org.apache.httpcomponents + httpclient + 4.5.13 + + + org.elasticsearch.client + elasticsearch-rest-high-level-client + 7.17.23 + + + co.elastic.clients + elasticsearch-java + 8.13.4 + + + com.fasterxml.jackson.core + jackson-databind + 2.15.2 + + + com.alibaba + dashscope-sdk-java + 2.8.3 + + + + org.apache.pdfbox + pdfbox + 2.0.24 + + + + net.sourceforge.tess4j + tess4j + 5.7.0 + + + + org.bytedeco + opencv-platform + 4.7.0-1.5.9 + + + + + org.apache.poi + poi + 5.2.3 + + + org.apache.poi + poi-ooxml + 5.2.3 + + + org.apache.poi + poi-scratchpad + 5.2.3 + + + + + + + src/main/resources + + **/* + + + + + + org.springframework.boot + spring-boot-maven-plugin + + + + org.projectlombok + lombok + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M5 + + + -Xmx2048m + -Djava.library.path=${project.basedir}/lib/opencv + + + + + + + diff --git a/knows-java/src/main/java/cn/luckday/Application.java b/knows-java/src/main/java/cn/luckday/Application.java new file mode 100644 index 0000000..fb425ff --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/Application.java @@ -0,0 +1,15 @@ +package cn.luckday; + +import 
org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.context.annotation.ComponentScan; + +@SpringBootApplication +@ComponentScan(value = {"cn.luckday.*"}) +public class Application { + + public static void main(String[] args) { + SpringApplication.run(Application.class, args); + } + +} diff --git a/knows-java/src/main/java/cn/luckday/bean/KnowsIndex.java b/knows-java/src/main/java/cn/luckday/bean/KnowsIndex.java new file mode 100644 index 0000000..88cdc3c --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/bean/KnowsIndex.java @@ -0,0 +1,21 @@ +package cn.luckday.bean; + +import lombok.Data; + +@Data +public class KnowsIndex { + + private String id; + + private String file_name; + + private String file_path; + + private String file_type; + + private String file_size; + + private String content; + + private double[] content_vec; +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/bean/SearchResult.java b/knows-java/src/main/java/cn/luckday/bean/SearchResult.java new file mode 100644 index 0000000..3111155 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/bean/SearchResult.java @@ -0,0 +1,13 @@ +package cn.luckday.bean; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@AllArgsConstructor +@NoArgsConstructor +public class SearchResult { + private KnowsIndex knowsIndex; + private Double score; +} diff --git a/knows-java/src/main/java/cn/luckday/controller/KnowsController.java b/knows-java/src/main/java/cn/luckday/controller/KnowsController.java new file mode 100644 index 0000000..e37fa99 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/controller/KnowsController.java @@ -0,0 +1,112 @@ +package cn.luckday.controller; + +import cn.hutool.core.collection.CollUtil; +import cn.luckday.llm.QwenClient; +import com.alibaba.dashscope.aigc.generation.GenerationResult; +import 
com.alibaba.dashscope.exception.InputRequiredException; +import com.alibaba.dashscope.exception.NoApiKeyException; +import com.alibaba.fastjson.JSON; +import com.alibaba.fastjson.JSONObject; +import cn.luckday.bean.SearchResult; +import cn.luckday.embed.EmbedClient; +import cn.luckday.embed.ReRankClient; +import cn.luckday.llm.OllamaClient; +import cn.luckday.service.EsDocumentService; +import jakarta.annotation.Resource; +import jakarta.servlet.http.HttpServletResponse; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +@Slf4j +@RestController +@RequestMapping("/knows") +public class KnowsController { + + @Value("${embedding.uri}") + private String embeddingUri; + + @Value("${embedding.api-key}") + private String embeddingApiKey; + + @Value("${re-rank.uri}") + private String ReRankUri; + + @Value("${re-rank.api-key}") + private String ReRankApiKey; + + @Value("${oll.uri}") + private String ollUri; + + @Value("${qwen.api-key}") + private static String apiKey; + + @Value("${qwen.model}") + private static String model; + + @Resource + private EsDocumentService service; + + @PostMapping("/process") + public List process(@RequestBody Map dto) throws IOException { + String keyword = dto.get("keyword"); + return service.searchVector(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, keyword)); + } + + @PostMapping("/generate") + public void generate(HttpServletResponse response, @RequestBody Map dto) throws IOException, NoApiKeyException, InputRequiredException { + String keyword = dto.get("keyword"); + List searchResults = 
service.searchVector(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, keyword)); + List contents = searchResults.stream().map(searchResult -> searchResult.getKnowsIndex().getContent()).toList(); + log.info("搜索结果searchResults: {} ", contents); + + Object reRankPassages = ""; + if (CollUtil.isNotEmpty(searchResults)) { + // 重排处理 + List contentList = new ArrayList<>(); + searchResults.forEach(searchResult -> contentList.add(searchResult.getKnowsIndex().getContent())); + String reRank = ReRankClient.reRank(ReRankUri, ReRankApiKey, contentList, keyword); + log.info("重排结果reRank: {} ", reRank); + + JSONObject jsonObject = JSON.parseObject(reRank, JSONObject.class); + reRankPassages = jsonObject.get("rerank_passages"); + } + + // LLM总结回答 + OllamaClient.sendMsg(response, ollUri, keyword, reRankPassages.toString()); + } + + @PostMapping("/qwen-generate") + public String qwen(@RequestBody Map dto) throws IOException, NoApiKeyException, InputRequiredException { + String keyword = dto.get("keyword"); + List searchResults = service.searchVector(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, keyword)); + List contents = searchResults.stream().map(searchResult -> searchResult.getKnowsIndex().getContent()).toList(); + log.info("搜索结果searchResults: {} ", contents); + + Object reRankPassages = ""; + if (CollUtil.isNotEmpty(searchResults)) { + // 重排处理 + List contentList = new ArrayList<>(); + searchResults.forEach(searchResult -> contentList.add(searchResult.getKnowsIndex().getContent())); + String reRank = ReRankClient.reRank(ReRankUri, ReRankApiKey, contentList, keyword); + log.info("重排结果reRank: {} ", reRank); + + JSONObject jsonObject = JSON.parseObject(reRank, JSONObject.class); + reRankPassages = jsonObject.get("rerank_passages"); + } + + // LLM总结回答 + GenerationResult result = QwenClient.sendMsg(model, apiKey, keyword, reRankPassages.toString()); + String content = result.getOutput().getChoices().get(0).getMessage().getContent(); + log.info("千问: {}", content); + 
return content; + } +} diff --git a/knows-java/src/main/java/cn/luckday/controller/RedFileController.java b/knows-java/src/main/java/cn/luckday/controller/RedFileController.java new file mode 100644 index 0000000..6c15c13 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/controller/RedFileController.java @@ -0,0 +1,22 @@ +package cn.luckday.controller; + +import cn.luckday.service.RedFileService; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; +import org.springframework.web.multipart.MultipartFile; +import java.util.*; + +@RestController +@RequestMapping("/api/file") +public class RedFileController { + + @Autowired + private RedFileService redFileService; + + @PostMapping("/upload") + public ResponseEntity uploadFile(@RequestParam("file") MultipartFile file) { + redFileService.uploadFile(file); + return ResponseEntity.ok(Map.of("message", "文件上传并解析成功")); + } +} diff --git a/knows-java/src/main/java/cn/luckday/document/Main.java b/knows-java/src/main/java/cn/luckday/document/Main.java new file mode 100644 index 0000000..5012274 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/document/Main.java @@ -0,0 +1,60 @@ +package cn.luckday.document; + +import java.io.File; +import java.util.List; +import java.awt.image.BufferedImage; + +public class Main { + public static void main(String[] args) { + try { + // 验证文件是否存在 + String pdfPath = "D:\\小红书文档\\中频\\运营经验库\\方法论\\PDF\\评论区和私信的互动指引的方法论.pdf"; + File pdfFile = new File(pdfPath); + if (!pdfFile.exists()) { + System.err.println("PDF文件不存在: " + pdfPath); + return; + } + + // 初始化PDFParser时添加错误处理 + PDFParser parser = new PDFParser(pdfPath); + try { + parser.parse(); + } catch (Exception e) { + System.err.println("PDF解析失败: " + e.getMessage()); + e.printStackTrace(); + } + + // 获取结果 + List texts = parser.getExtractedText(); + List images = parser.getExtractedImages(); + List tables = 
parser.getExtractedTables(); + + // // 处理Word文档 + // String wordPath = "D:\\小红书文档\\高频\\平台知识库\\已处理word\\新模式开票流程及注意事项.docx"; + // WordProcessor wordProcessor = new WordProcessor(wordPath); + // wordProcessor.process(); + // + // // 获取提取的文本 + // List textContent = wordProcessor.getExtractedText(); + // for (String text : textContent) { + // System.out.println(text); + // } + // + // // 处理表格 + // List tables = wordProcessor.getExtractedTables(); + // for (XWPFTable table : tables) { + // List> tableData = wordProcessor.convertTableToList(table); + // System.out.println("表格数据：" + tableData); + // + // // 导出表格为CSV + // wordProcessor.exportTableToCSV(table, "table_output.csv"); + // } + // + // // 保存图片 + // wordProcessor.saveImages("output_images"); + } catch (Exception e) { + System.err.println("程序执行出错: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/knows-java/src/main/java/cn/luckday/document/OCRProcessor.java b/knows-java/src/main/java/cn/luckday/document/OCRProcessor.java new file mode 100644 index 0000000..abef4e3 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/document/OCRProcessor.java @@ -0,0 +1,180 @@ +package cn.luckday.document; + +import net.sourceforge.tess4j.Tesseract; +import org.opencv.core.CvType; +import org.opencv.core.Mat; +import org.opencv.core.Size; +import org.opencv.imgproc.Imgproc; + +import java.awt.image.BufferedImage; +import java.io.File; +import java.awt.image.DataBufferByte; +import org.apache.pdfbox.pdmodel.PDDocument; + +public class OCRProcessor { + static { + try { + // 从资源目录加载本地库 + String libraryPath = OCRProcessor.class + .getClassLoader() + .getResource("native/" + System.mapLibraryName("opencv_java4110")) + .getPath(); + + System.load(libraryPath); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private final Tesseract tesseract; + + public OCRProcessor() { + tesseract = new Tesseract(); + initializeTesseract(); + } + + private void initializeTesseract() { + try { + // 设置Tesseract数据路径 + 
String tessdataPath = System.getenv("TESSDATA_PREFIX"); + if (tessdataPath == null || tessdataPath.isEmpty()) { + tessdataPath ="D:\\study\\backend\\embeddingstoes-master\\src\\main\\resources\\ocr"; + } + + tesseract.setDatapath(tessdataPath); + + // 修改：使用不依赖OSD的页面分割模式 + tesseract.setPageSegMode(3); + + // 设置语言包 + tesseract.setLanguage("chi_sim"); + + // 性能优化配置 + tesseract.setTessVariable("tessedit_create_pdf", "0"); + tesseract.setTessVariable("tessedit_create_hocr", "0"); + tesseract.setTessVariable("tessedit_write_images", "0"); + + } catch (Exception e) { + throw new RuntimeException("Tesseract 初始化失败: " + e.getMessage(), e); + } + } + + public String performOCR(BufferedImage image) { + try { + // 基本图像验证 + if (image == null || image.getWidth() < 10 || image.getHeight() < 10) { + throw new IllegalArgumentException("无效的图像"); + } + + // 预处理图像 + BufferedImage processedImage = preprocessImage(image); + + // 执行OCR + return tesseract.doOCR(processedImage); + + } catch (Exception e) { + System.err.println("OCR处理失败: " + e.getMessage()); + e.printStackTrace(); + return ""; + } + } + + private BufferedImage preprocessImage(BufferedImage image) { + try { + Mat mat = bufferedImageToMat(image); + + // 调整预处理步骤 + // 1. 转换为灰度图 + Mat gray = new Mat(); + Imgproc.cvtColor(mat, gray, Imgproc.COLOR_BGR2GRAY); + + // 2. 使用OTSU二值化替代自适应阈值 + Mat binary = new Mat(); + Imgproc.threshold(gray, binary, 0, 255, Imgproc.THRESH_BINARY + Imgproc.THRESH_OTSU); + + // 3. 添加形态学操作 + Mat kernel = Imgproc.getStructuringElement(Imgproc.MORPH_RECT, new Size(3, 3)); + Mat processed = new Mat(); + Imgproc.morphologyEx(binary, processed, Imgproc.MORPH_CLOSE, kernel); + + // 4. 
边缘增强 + Mat enhanced = new Mat(); + Imgproc.GaussianBlur(processed, enhanced, new Size(3, 3), 0); + + return matToBufferedImage(enhanced); + } catch (Exception e) { + e.printStackTrace(); + return image; + } + } + + private Mat bufferedImageToMat(BufferedImage image) { + // 转换图像类型为 TYPE_3BYTE_BGR，如果需要的话 + BufferedImage convertedImage = image; + if (image.getType() != BufferedImage.TYPE_3BYTE_BGR) { + convertedImage = new BufferedImage( + image.getWidth(), + image.getHeight(), + BufferedImage.TYPE_3BYTE_BGR); + convertedImage.getGraphics().drawImage(image, 0, 0, null); + } + + // 获取图像数据 + byte[] pixels = ((DataBufferByte) convertedImage.getRaster().getDataBuffer()).getData(); + + // 创建Mat对象 + Mat mat = new Mat( + convertedImage.getHeight(), + convertedImage.getWidth(), + CvType.CV_8UC3); + mat.put(0, 0, pixels); + + return mat; + } + + private BufferedImage matToBufferedImage(Mat mat) { + // 确保mat是8位3通道或单通道 + int type = BufferedImage.TYPE_3BYTE_BGR; + if (mat.channels() == 1) { + type = BufferedImage.TYPE_BYTE_GRAY; + } + + // 获取mat的数据 + byte[] pixels = new byte[mat.channels() * mat.cols() * mat.rows()]; + mat.get(0, 0, pixels); + + // 创建BufferedImage + BufferedImage image = new BufferedImage( + mat.cols(), + mat.rows(), + type); + + // 设置图像数据 + byte[] targetPixels = ((DataBufferByte) image.getRaster().getDataBuffer()).getData(); + System.arraycopy(pixels, 0, targetPixels, 0, pixels.length); + + return image; + } + + public void processPDF(String pdfPath) { + try { + // 添加内存使用监控 + Runtime runtime = Runtime.getRuntime(); + long maxMemory = runtime.maxMemory() / (1024 * 1024); + System.out.println("最大可用内存: " + maxMemory + "MB"); + + // 原有PDF处理代码 + PDDocument document = PDDocument.load(new File(pdfPath)); + // ... existing code ... 
+ + // 确保资源释放 + document.close(); + } catch (OutOfMemoryError e) { + System.err.println("内存不足: " + e.getMessage()); + // TODO 日志记录 + } catch (Exception e) { + System.err.println("处理PDF时发生错误: " + e.getMessage()); + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/document/PDFParser.java b/knows-java/src/main/java/cn/luckday/document/PDFParser.java new file mode 100644 index 0000000..1ceffa1 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/document/PDFParser.java @@ -0,0 +1,137 @@ +package cn.luckday.document; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class PDFParser { + private final String pdfPath; + private PDDocument document; + private final OCRProcessor ocrProcessor; + private List extractedText; + private List extractedImages; + private List

extractedTables; + + public PDFParser(String pdfPath) { + this.pdfPath = pdfPath; + this.ocrProcessor = new OCRProcessor(); + this.extractedText = new ArrayList<>(); + this.extractedImages = new ArrayList<>(); + this.extractedTables = new ArrayList<>(); + } + + public void parse() { + try { + document = PDDocument.load(new File(pdfPath)); + + // 1. 解析文本内容 + System.out.println("=== 开始解析文本 ==="); + extractText(); + + // 2. 解析图片 + System.out.println("\n=== 开始解析图片 ==="); + extractImages(); + + // 3. 解析表格 +// System.out.println("\n=== 开始解析表格 ==="); +// extractTables(); + + document.close(); + } catch (Exception e) { + System.err.println("PDF解析失败: " + e.getMessage()); + e.printStackTrace(); + if (document != null) { + try { + document.close(); + } catch (IOException ignored) { + } + } + } + } + + private void extractText() throws IOException { + System.out.println("正在提取PDF文本..."); + + // 只使用PDFTextStripper提取文本 + PDFTextStripper stripper = new PDFTextStripper(); + String text = stripper.getText(document); + System.out.println("文本内容：\n" + text); + extractedText.add(text); + } + + private void extractImages() throws IOException { + System.out.println("正在提取并处理PDF图片..."); + int imageCounter = 0; + + for (PDPage page : document.getPages()) { + for (COSName name : page.getResources().getXObjectNames()) { + PDXObject object = page.getResources().getXObject(name); + if (object instanceof PDImageXObject) { + PDImageXObject image = (PDImageXObject) object; + BufferedImage bImage = image.getImage(); + + // 保存图片 + String imagePath = "output_images/extracted_image_" + imageCounter + ".png"; + ImageIO.write(bImage, "PNG", new File(imagePath)); + System.out.println("已保存图片: " + imagePath); + + // OCR处理图片 + try { + System.out.println("正在对图片 " + imageCounter + " 进行OCR处理..."); + String imageText = ocrProcessor.performOCR(bImage); + if (!imageText.trim().isEmpty()) { + System.out.println("图片 " + imageCounter + " OCR结果：\n" + imageText); + extractedText.add("【图片" + imageCounter + "文本】\n" + 
imageText); + } else { + System.out.println("图片 " + imageCounter + " 未识别出文本"); + } + } catch (Exception e) { + System.err.println("处理图片 " + imageCounter + " 时出错: " + e.getMessage()); + } + + extractedImages.add(bImage); + imageCounter++; + } + } + } + System.out.println("共处理 " + imageCounter + " 张图片"); + } + + private void extractTables() { + System.out.println("正在提取PDF表格..."); + TableDetector detector = new TableDetector(document); + extractedTables = detector.detectTables(); + + if (extractedTables.isEmpty()) { + System.out.println("未检测到表格"); + } else { + System.out.println("共检测到 " + extractedTables.size() + " 个表格"); + for (int i = 0; i < extractedTables.size(); i++) { + System.out.println("表格 " + (i + 1) + ":\n" + extractedTables.get(i)); + } + } + } + + // Getter方法 + public List getExtractedText() { + return extractedText; + } + + public List getExtractedImages() { + return extractedImages; + } + + public List

getExtractedTables() { + return extractedTables; + } +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/document/Table.java b/knows-java/src/main/java/cn/luckday/document/Table.java new file mode 100644 index 0000000..83d5ff1 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/document/Table.java @@ -0,0 +1,43 @@ +package cn.luckday.document; + +public class Table { + private String content; + private int rows; + private int columns; + + public Table(String content) { + this.content = content; + analyzeStructure(); + } + + private void analyzeStructure() { + if (content == null || content.isEmpty()) { + return; + } + + // 按行分割内容 + String[] lines = content.split("\n"); + rows = lines.length; + + // 分析列数（基于空格或制表符分隔） + columns = 0; + for (String line : lines) { + String[] cells = line.trim().split("\\s+"); + columns = Math.max(columns, cells.length); + } + } + + public int getRows() { + return rows; + } + + public int getColumns() { + return columns; + } + + @Override + public String toString() { + return String.format("Table{rows=%d, columns=%d, content='%s'}", + rows, columns, content); + } +} diff --git a/knows-java/src/main/java/cn/luckday/document/TableDetector.java b/knows-java/src/main/java/cn/luckday/document/TableDetector.java new file mode 100644 index 0000000..abf5d5c --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/document/TableDetector.java @@ -0,0 +1,170 @@ +package cn.luckday.document; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.PDFTextStripperByArea; + +import java.awt.Rectangle; +import java.util.ArrayList; +import java.util.List; + +public class TableDetector { + private final PDDocument document; + + public TableDetector(PDDocument document) { + this.document = document; + } + + public List

detectTables() { + List

tables = new ArrayList<>(); + try { + for (PDPage page : document.getPages()) { + // 使用文本位置分析来检测表格 + PDFTextStripperByArea stripper = new PDFTextStripperByArea(); + stripper.setSortByPosition(true); + + // 检测表格边界 + List tableRegions = detectTableRegions(page); + + for (Rectangle region : tableRegions) { + stripper.addRegion("table", region); + stripper.extractRegions(page); + String tableContent = stripper.getTextForRegion("table"); + tables.add(new Table(tableContent)); + } + } + } catch (Exception e) { + e.printStackTrace(); + } + return tables; + } + + private List detectTableRegions(PDPage page) { + List regions = new ArrayList<>(); + try { + // 获取页面尺寸 + float pageHeight = page.getMediaBox().getHeight(); + float pageWidth = page.getMediaBox().getWidth(); + + // 使用PDFTextStripperByArea进行文本分析 + PDFTextStripperByArea stripper = new PDFTextStripperByArea(); + stripper.setSortByPosition(true); + + // 将页面划分为网格进行分析 + int gridRows = 20; + int gridCols = 20; + float cellHeight = pageHeight / gridRows; + float cellWidth = pageWidth / gridCols; + + // 存储每个网格单元的文本密度 + int[][] textDensity = new int[gridRows][gridCols]; + + // 分析每个网格单元 + for (int row = 0; row < gridRows; row++) { + for (int col = 0; col < gridCols; col++) { + Rectangle cell = new Rectangle( + (int) (col * cellWidth), + (int) (row * cellHeight), + (int) cellWidth, + (int) cellHeight); + + stripper.addRegion("cell_" + row + "_" + col, cell); + stripper.extractRegions(page); + String cellText = stripper.getTextForRegion("cell_" + row + "_" + col); + + // 计算文本密度 + textDensity[row][col] = cellText.trim().length(); + } + } + + // 检测表格区域 + List potentialTables = findPotentialTables(textDensity, gridRows, gridCols); + + // 转换检测到的区域为实际坐标 + for (TableRegion tableRegion : potentialTables) { + Rectangle rect = new Rectangle( + (int) (tableRegion.startCol * cellWidth), + (int) (tableRegion.startRow * cellHeight), + (int) ((tableRegion.endCol - tableRegion.startCol + 1) * cellWidth), + (int) ((tableRegion.endRow - 
tableRegion.startRow + 1) * cellHeight)); + regions.add(rect); + } + + } catch (Exception e) { + e.printStackTrace(); + } + return regions; + } + + private List findPotentialTables(int[][] textDensity, int rows, int cols) { + List tables = new ArrayList<>(); + boolean[][] visited = new boolean[rows][cols]; + + // 遍历网格寻找潜在的表格区域 + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + if (!visited[i][j] && isTableCell(textDensity, i, j)) { + TableRegion region = new TableRegion(); + expandTableRegion(textDensity, visited, i, j, region); + if (isValidTable(region)) { + tables.add(region); + } + } + } + } + return tables; + } + + private boolean isTableCell(int[][] density, int row, int col) { + // 判断是否为表格单元格的条件 + // 1. 文本密度适中 + // 2. 周围有类似的文本密度分布 + int cellDensity = density[row][col]; + return cellDensity > 0 && cellDensity < 100; // 可调整阈值 + } + + private void expandTableRegion(int[][] density, boolean[][] visited, + int row, int col, TableRegion region) { + if (row < 0 || row >= density.length || + col < 0 || col >= density[0].length || + visited[row][col] || + !isTableCell(density, row, col)) { + return; + } + + visited[row][col] = true; + + // 更新表格区域的边界 + region.updateBounds(row, col); + + // 递归检查相邻单元格 + expandTableRegion(density, visited, row - 1, col, region); // 上 + expandTableRegion(density, visited, row + 1, col, region); // 下 + expandTableRegion(density, visited, row, col - 1, region); // 左 + expandTableRegion(density, visited, row, col + 1, region); // 右 + } + + private boolean isValidTable(TableRegion region) { + // 验证检测到的区域是否可能是表格 + int width = region.endCol - region.startCol + 1; + int height = region.endRow - region.startRow + 1; + + // 表格至少应该有2x2的大小 + return width >= 2 && height >= 2; + } + + // 表格区域数据结构 + private static class TableRegion { + int startRow = Integer.MAX_VALUE; + int startCol = Integer.MAX_VALUE; + int endRow = Integer.MIN_VALUE; + int endCol = Integer.MIN_VALUE; + + void updateBounds(int row, int col) { + startRow = 
Math.min(startRow, row); + startCol = Math.min(startCol, col); + endRow = Math.max(endRow, row); + endCol = Math.max(endCol, col); + } + } +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/document/WordProcessor.java b/knows-java/src/main/java/cn/luckday/document/WordProcessor.java new file mode 100644 index 0000000..ed9efa2 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/document/WordProcessor.java @@ -0,0 +1,287 @@ +package cn.luckday.document; + +import org.apache.poi.xwpf.usermodel.*; +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.usermodel.Table; +import org.apache.poi.hwpf.usermodel.TableRow; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import org.apache.poi.common.usermodel.PictureType; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class WordProcessor { + private final String filePath; + private List extractedText; + private List extractedTables; + private List extractedImages; + + public WordProcessor(String filePath) { + this.filePath = filePath; + this.extractedText = new ArrayList<>(); + this.extractedTables = new ArrayList<>(); + this.extractedImages = new ArrayList<>(); + } + + public void process() { + File file = new File(filePath); + if (filePath.endsWith(".docx")) { + processDocx(file); + } else if (filePath.endsWith(".doc")) { + processDoc(file); + } else { + throw new IllegalArgumentException("不支持的文件格式：" + filePath); + } + } + + private void processDocx(File file) { + try (FileInputStream fis = new FileInputStream(file); + XWPFDocument document = new XWPFDocument(fis)) { + + // 提取文本 + extractTextFromDocx(document); + + // 提取表格 + extractTablesFromDocx(document); + + // 提取图片 + 
extractImagesFromDocx(document); + + } catch (IOException e) { + e.printStackTrace(); + } + } + + private void processDoc(File file) { + try (FileInputStream fis = new FileInputStream(file); + POIFSFileSystem fs = new POIFSFileSystem(fis)) { + + HWPFDocument document = new HWPFDocument(fs); + + // 提取文本 + Range range = document.getRange(); + extractTextFromDoc(range); + + // 提取表格 + extractTablesFromDoc(range); + + // 提取图片（如果可能） + extractImagesFromDoc(document); + + } catch (IOException e) { + e.printStackTrace(); + } + } + + private void extractTextFromDocx(XWPFDocument document) { + // 提取段落文本 + for (XWPFParagraph paragraph : document.getParagraphs()) { + String text = paragraph.getText().trim(); + if (!text.isEmpty()) { + extractedText.add(text); + } + } + } + + private void extractTablesFromDocx(XWPFDocument document) { + // 提取表格 + for (XWPFTable table : document.getTables()) { + extractedTables.add(table); + + // 处理表格内容 + for (XWPFTableRow row : table.getRows()) { + StringBuilder rowContent = new StringBuilder(); + for (XWPFTableCell cell : row.getTableCells()) { + rowContent.append(cell.getText()).append("\t"); + } + extractedText.add("表格行：" + rowContent.toString().trim()); + } + } + } + + private void extractImagesFromDocx(XWPFDocument document) { + // 提取图片 + for (XWPFParagraph paragraph : document.getParagraphs()) { + for (XWPFRun run : paragraph.getRuns()) { + List pictures = run.getEmbeddedPictures(); + extractedImages.addAll(pictures); + } + } + } + + private void extractTextFromDoc(Range range) { + String text = range.text(); + // 按段落分割 + String[] paragraphs = text.split("\\r?\\n"); + for (String paragraph : paragraphs) { + if (!paragraph.trim().isEmpty()) { + extractedText.add(paragraph.trim()); + } + } + } + + private void extractTablesFromDoc(Range range) { + for (int i = 0; i < range.numParagraphs(); i++) { + if (range.getParagraph(i).isInTable()) { + Table table = range.getTable(range.getParagraph(i)); + processDocTable(table); + // 跳过表格中的其他段落 + i += 
table.numParagraphs() - 1; + } + } + } + + private void processDocTable(Table table) { + List> tableData = new ArrayList<>(); + for (int rowIdx = 0; rowIdx < table.numRows(); rowIdx++) { + TableRow row = table.getRow(rowIdx); + List rowData = new ArrayList<>(); + + for (int colIdx = 0; colIdx < row.numCells(); colIdx++) { + TableCell cell = row.getCell(colIdx); + String cellText = cell.text().trim(); + if (cellText.endsWith("\u0007")) { + cellText = cellText.substring(0, cellText.length() - 1); + } + rowData.add(cellText); + } + + tableData.add(rowData); + extractedText.add("表格行：" + String.join("\t", rowData)); + } + } + + private void extractImagesFromDoc(HWPFDocument document) { + // 注意：HWPF对图片的支持有限 + try { + List pictures = document.getPicturesTable().getAllPictures(); + File outputDir = new File("output_images"); + if (!outputDir.exists()) { + outputDir.mkdirs(); + } + + int imageCounter = 0; + for (org.apache.poi.hwpf.usermodel.Picture picture : pictures) { + String extension = picture.suggestFileExtension(); + String filename = String.format("doc_image_%d.%s", imageCounter++, extension); + Path outputPath = Paths.get(outputDir.getPath(), filename); + + // 保存图片数据 + Files.write(outputPath, picture.getContent()); + } + } catch (Exception e) { + System.out.println("警告：提取.doc文件中的图片时出错：" + e.getMessage()); + } + } + + public void saveImages(String outputDir) { + try { + File dir = new File(outputDir); + if (!dir.exists()) { + dir.mkdirs(); + } + + int imageCounter = 0; + for (XWPFPicture picture : extractedImages) { + // 获取图片数据 + byte[] pictureData = picture.getPictureData().getData(); + + // 确定图片扩展名 + String extension = getImageExtension(picture.getPictureData().getPictureType()); + String filename = String.format("image_%d.%s", imageCounter++, extension); + + // 保存图片 + Path outputPath = Paths.get(dir.getPath(), filename); + Files.write(outputPath, pictureData); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + private String getImageExtension(int 
pictureType) { + // 使用PictureType的常量来处理图片类型 + if (pictureType == PictureType.PNG.getOoxmlId()) { + return "png"; + } else if (pictureType == PictureType.JPEG.getOoxmlId()) { + return "jpg"; + } else if (pictureType == PictureType.GIF.getOoxmlId()) { + return "gif"; + } else if (pictureType == PictureType.TIFF.getOoxmlId()) { + return "tiff"; + } else if (pictureType == PictureType.BMP.getOoxmlId()) { + return "bmp"; + } else if (pictureType == PictureType.EMF.getOoxmlId()) { + return "emf"; + } else if (pictureType == PictureType.WMF.getOoxmlId()) { + return "wmf"; + } else if (pictureType == PictureType.PICT.getOoxmlId()) { + return "pict"; + } else if (pictureType == PictureType.DIB.getOoxmlId()) { + return "dib"; + } else { + return "unknown"; + } + } + + public List getExtractedText() { + return extractedText; + } + + public List getExtractedTables() { + return extractedTables; + } + + public List getExtractedImages() { + return extractedImages; + } + + // 将表格转换为结构化数据 + public List> convertTableToList(XWPFTable table) { + List> tableData = new ArrayList<>(); + + for (XWPFTableRow row : table.getRows()) { + List rowData = new ArrayList<>(); + for (XWPFTableCell cell : row.getTableCells()) { + rowData.add(cell.getText().trim()); + } + tableData.add(rowData); + } + + return tableData; + } + + // 导出表格为CSV格式 + public void exportTableToCSV(XWPFTable table, String outputPath) { + try { + StringBuilder csv = new StringBuilder(); + + for (XWPFTableRow row : table.getRows()) { + List rowData = new ArrayList<>(); + for (XWPFTableCell cell : row.getTableCells()) { + // 处理CSV中的特殊字符 + String cellText = cell.getText().trim() + .replace("\"", "\"\"") + .replace(",", "\",\""); + rowData.add("\"" + cellText + "\""); + } + csv.append(String.join(",", rowData)).append("\n"); + } + + java.nio.file.Files.write( + new File(outputPath).toPath(), + csv.toString().getBytes()); + + } catch (IOException e) { + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git 
a/knows-java/src/main/java/cn/luckday/embed/EmbedClient.java b/knows-java/src/main/java/cn/luckday/embed/EmbedClient.java new file mode 100644 index 0000000..8f9c3f8 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/embed/EmbedClient.java @@ -0,0 +1,83 @@ +package cn.luckday.embed; +import com.alibaba.fastjson.JSON; +import com.alibaba.fastjson.JSONObject; +import okhttp3.*; + +import java.io.IOException; +import java.util.*; + +public class EmbedClient { + + public static double[] getEmbedding(String uri, String apiKey, String inputText) throws IOException { + OkHttpClient client = new OkHttpClient(); + + // 创建请求体 + JSONObject requestBody = new JSONObject(); + requestBody.put("input", Collections.singletonList(inputText)); + + // 创建请求 + MediaType mediaType = MediaType.parse("application/json; charset=utf-8"); + RequestBody body = RequestBody.Companion.create(requestBody.toJSONString(), mediaType); + Request request = new Request.Builder() + .url(uri) + .addHeader("Authorization", "Bearer " + apiKey) + .addHeader("Content-Type", "application/json") + .post(body) + .build(); + + // 发送请求 + Response response = client.newCall(request).execute(); + if (!response.isSuccessful()) { + throw new IOException("Unexpected code " + response); + } + + // 解析JSON响应 + String responseBody = response.body().string(); + EmbeddingResponse embeddingResponse = JSON.parseObject(responseBody, EmbeddingResponse.class); + + // 返回嵌入向量 + return embeddingResponse.getData().get(0).getEmbedding(); + } + + static class EmbeddingResponse { + private List data; + + public List getData() { + return data; + } + + public void setData(List data) { + this.data = data; + } + } + + static class Data { + private double[] embedding; + private int index; + private String object; + + public double[] getEmbedding() { + return embedding; + } + + public void setEmbedding(double[] embedding) { + this.embedding = embedding; + } + + public int getIndex() { + return index; + } + + public void setIndex(int 
index) { + this.index = index; + } + + public String getObject() { + return object; + } + + public void setObject(String object) { + this.object = object; + } + } +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/embed/ReRankClient.java b/knows-java/src/main/java/cn/luckday/embed/ReRankClient.java new file mode 100644 index 0000000..0752ab6 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/embed/ReRankClient.java @@ -0,0 +1,34 @@ +package cn.luckday.embed; + +import com.alibaba.fastjson.JSONObject; +import okhttp3.*; + +import java.io.IOException; +import java.util.List; + +public class ReRankClient { + + public static String reRank(String uri, String apiKey, List textsList, String query) throws IOException { + OkHttpClient client = new OkHttpClient(); + JSONObject requestBody = new JSONObject(); + String[] texts = textsList.toArray(new String[0]); + requestBody.put("textList", texts); + requestBody.put("query", query); + // 创建请求 + MediaType mediaType = MediaType.parse("application/json; charset=utf-8"); + RequestBody body = RequestBody.Companion.create(requestBody.toJSONString(), mediaType); + Request request = new Request.Builder() + .url(uri) + .addHeader("Authorization", "Bearer " + apiKey) + .addHeader("Content-Type", "application/json") + .post(body) + .build(); + + // 发送请求 + Response response = client.newCall(request).execute(); + if (!response.isSuccessful()) { + throw new IOException("Unexpected code " + response); + } + return response.body().string(); + } +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/filter/AccessControlFilter.java b/knows-java/src/main/java/cn/luckday/filter/AccessControlFilter.java new file mode 100644 index 0000000..d2ebb6a --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/filter/AccessControlFilter.java @@ -0,0 +1,46 @@ +package cn.luckday.filter; + +import jakarta.servlet.*; +import jakarta.servlet.annotation.WebFilter; +import 
jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import org.springframework.core.annotation.Order; +import org.springframework.stereotype.Component; + +import java.io.IOException; + +@Component +@WebFilter(urlPatterns = "/*", asyncSupported = true) +@Order(1) +public class AccessControlFilter implements Filter { + + @Override + public void init(FilterConfig filterConfig) throws ServletException { + } + + @Override + public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException { + HttpServletRequest httpServletRequest = (HttpServletRequest) request; + HttpServletResponse httpServletResponse = (HttpServletResponse) response; + + // 获取源站 + String origin = httpServletRequest.getHeader("origin"); + httpServletResponse.setHeader("Access-Control-Allow-Origin", "*"); + httpServletResponse.setHeader("Access-Control-Allow-Headers", "Content-Type,Content-Length, Authorization, Accept,X-Requested-With,cors, content-type, luck-token, userId, user, type"); + httpServletResponse.setHeader("Access-Control-Allow-Credentials", "true"); + httpServletResponse.setHeader("Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,PATCH,OPTIONS"); + httpServletResponse.setHeader("Access-Control-Max-Age", "3600"); + + if ("OPTIONS".equals(httpServletRequest.getMethod())) { + httpServletResponse.setStatus(HttpServletResponse.SC_OK); + } else { + chain.doFilter(request, response); + } + } + + @Override + public void destroy() { + + } + +} diff --git a/knows-java/src/main/java/cn/luckday/llm/OllamaClient.java b/knows-java/src/main/java/cn/luckday/llm/OllamaClient.java new file mode 100644 index 0000000..cb53696 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/llm/OllamaClient.java @@ -0,0 +1,85 @@ +package cn.luckday.llm; + +import com.alibaba.fastjson2.JSON; +import jakarta.servlet.http.HttpServletResponse; + +import java.io.*; +import java.net.HttpURLConnection; +import 
java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import java.util.HashMap; + +public class OllamaClient { + + private static Map PARAMS = new HashMap<>(); + private static Map OPTIONS = new HashMap<>(); + + static { + OPTIONS.put("temperature", 0.3); // # 控制随机性（0-1，值越大越随机） + OPTIONS.put("top_p", 0.5); // # 采样策略（0-1，值越小越集中） + OPTIONS.put("max_tokens", 1024); // # 生成的最大 token 数 + + PARAMS.put("model", "deepseek-r1:32b"); + PARAMS.put("stream", true); + PARAMS.put("options", OPTIONS); + } + + public static String PROMPT = "你是一个知识库，必须严格按照知识库检索的内容做最精简的回答，只回答关键信息，坚决杜绝胡编乱造，注意字数。" + + "当所有知识库内容都与产品问题无关时，或者知识库检索到任何相关信息时，你的回答必须是“没有找到”这句话。" + + " 以下是知识库：\n" + + " { %content% }\n" + + " 以上是知识库。 \n 以下是提问："; + + public static void sendMsg(HttpServletResponse response, String uri, String query, String content) { + try { + // 设置SSE必要的响应头 + response.setContentType("text/event-stream"); + response.setCharacterEncoding("UTF-8"); + response.setHeader("Cache-Control", "no-cache"); + response.setHeader("Connection", "keep-alive"); + + URL url = new URL(uri); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("POST"); + conn.setRequestProperty("Accept", "text/event-stream"); + conn.setRequestProperty("Content-Type", "application/json"); + conn.setDoOutput(true); + + PARAMS.put("prompt", PROMPT.replace("%content%", content) + query); + String json = JSON.toJSONString(PARAMS); + + try (OutputStream os = conn.getOutputStream()) { + os.write(json.getBytes(StandardCharsets.UTF_8)); + } + + int responseCode = conn.getResponseCode(); + + if (responseCode >= HttpURLConnection.HTTP_OK && responseCode < HttpURLConnection.HTTP_USE_PROXY) { + try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8)); + PrintWriter writer = response.getWriter()) { + + String line; + while ((line = br.readLine()) != null) { + if (!line.trim().isEmpty()) { + // 构造SSE消息格式 + 
writer.write("data: " + line + "\n\n"); + writer.flush(); + } + } + } + } else { + throw new RuntimeException("Failed : HTTP error code : " + responseCode); + } + } catch (Exception e) { + try { + response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); + PrintWriter writer = response.getWriter(); + writer.write("data: {\"error\": \"" + e.getMessage() + "\"}\n\n"); + writer.flush(); + } catch (IOException ioe) { + e.printStackTrace(); + } + } + } +} diff --git a/knows-java/src/main/java/cn/luckday/llm/QwenClient.java b/knows-java/src/main/java/cn/luckday/llm/QwenClient.java new file mode 100644 index 0000000..70a4e04 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/llm/QwenClient.java @@ -0,0 +1,45 @@ +package cn.luckday.llm; + +import java.util.Arrays; +import com.alibaba.dashscope.aigc.generation.Generation; +import com.alibaba.dashscope.aigc.generation.GenerationParam; +import com.alibaba.dashscope.aigc.generation.GenerationResult; +import com.alibaba.dashscope.common.Message; +import com.alibaba.dashscope.common.Role; +import com.alibaba.dashscope.exception.ApiException; +import com.alibaba.dashscope.exception.InputRequiredException; +import com.alibaba.dashscope.exception.NoApiKeyException; + +public class QwenClient { + + public static GenerationResult sendMsg(String model, String apiKey, String query, String content) throws ApiException, NoApiKeyException, InputRequiredException { + Generation gen = new Generation(); + + Message systemMsg = Message.builder() + .role(Role.SYSTEM.getValue()) + .content("你是一个知识库，必须严格按照知识库检索的内容做最精简的回答，只回答关键信息，坚决杜绝胡编乱造，注意数字。" + + "当所有知识库内容都与产品问题无关时，或者知识库检索到任何相关信息时，你的回答必须是“没有找到”这句话。" + + " 以下是知识库：\n" + + " {" + content + "}\n" + + " 以上是知识库。") + .build(); + + Message userMsg = Message.builder() + .role(Role.USER.getValue()) + .content(query) + .build(); + + GenerationParam param = GenerationParam.builder() + .model(model) + .messages(Arrays.asList(systemMsg, userMsg)) + 
.resultFormat(GenerationParam.ResultFormat.MESSAGE) + .apiKey(apiKey) + .topK(50) + .temperature(0.1f) + .topP(0.8) + .seed(1234) + .build(); + + return gen.call(param); + } +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/service/EsDocumentService.java b/knows-java/src/main/java/cn/luckday/service/EsDocumentService.java new file mode 100644 index 0000000..f883d1f --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/service/EsDocumentService.java @@ -0,0 +1,143 @@ +package cn.luckday.service; + + +import co.elastic.clients.elasticsearch.ElasticsearchClient; +import co.elastic.clients.elasticsearch._types.Script; +import co.elastic.clients.elasticsearch._types.query_dsl.*; +import co.elastic.clients.elasticsearch.core.IndexResponse; +import co.elastic.clients.elasticsearch.core.SearchResponse; +import co.elastic.clients.elasticsearch.indices.CreateIndexRequest; +import co.elastic.clients.elasticsearch.indices.CreateIndexResponse; +import co.elastic.clients.json.JsonData; +import cn.luckday.bean.SearchResult; +import cn.luckday.bean.KnowsIndex; +import cn.luckday.embed.EmbedClient; +import jakarta.annotation.Resource; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +@Slf4j +@Service +public class EsDocumentService { + + @Value("${embedding.uri}") + private String embeddingUri; + + @Value("${embedding.api-key}") + private String embeddingApiKey; + + @Resource + private ElasticsearchClient client; + + public static final String INDEX_NAME = "knows_index"; + + public static final float SIMILARITY_THRESHOLD = 0.2f; + + /** + * 创建索引 + * @throws IOException 异常 + */ + public void createIndex() throws IOException { + CreateIndexRequest request = new CreateIndexRequest.Builder() 
+ .index(INDEX_NAME) + + .mappings(m -> m + .properties("file_name", p -> p.keyword(k -> k)) + .properties("file_path", p -> p.keyword(k -> k)) + .properties("file_type", p -> p.keyword(k -> k)) + .properties("file_size", p -> p.keyword(k -> k)) + .properties("remark_vec", p -> p + .denseVector(dv -> dv + .dims(1024) + .index(true) + .similarity("cosine") + ) + ) + .properties("remark", p -> p + .text(t -> t) + ) +// .properties("remark", p -> p +// .text(t -> t.searchAnalyzer("ik_smart") +// .analyzer("ik_smart") // 使用 IK 分词器 +// ) +// ) + ) + .build(); + + CreateIndexResponse createIndexResponse = client.indices().create(request); + log.info("Index created: {}", createIndexResponse.acknowledged()); + } + + /** + * 添加数据 + * @param knowsIndexList 数据 + * @throws IOException 异常 + */ + public void indexSellList(List knowsIndexList) throws IOException { + for (KnowsIndex knowsIndex : knowsIndexList) { + knowsIndex.setContent_vec(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, knowsIndex.getContent())); + IndexResponse response = client.index(i -> i + .index(INDEX_NAME) + .id(knowsIndex.getId()) + .document(knowsIndex) + ); + log.info("Sell indexed: {}", response.id()); + } + } + + + /** + * 检索 + * + * @param queryVector 向量 + */ + public List searchVector(double[] queryVector) throws IOException { + // 创建向量相似度查询 + ScriptScoreQuery scriptScoreQuery = ScriptScoreQuery.of(q -> q + .query(QueryBuilders.matchAll().build()._toQuery()) + .script(Script.of(s -> s.inline(i -> i + .source("double score = cosineSimilarity(params.query_vector, 'content_vec'); " + + "score = Math.min(1.0, Math.max(0.0, score)); " + // 确保评分在[0, 1]之间 + "if (score < params.threshold) { return 0; } else { return score; }") + .params(Map.of( + "query_vector", JsonData.of(queryVector), + "threshold", JsonData.of(SIMILARITY_THRESHOLD) // 将阈值作为参数传递给脚本 + )))))); + + // 创建bool查询，向量相似度查询作为should子句 + Query boolQuery = QueryBuilders.bool(b -> b + .should(scriptScoreQuery._toQuery()) + ); + + Query 
functionScoreQuery = QueryBuilders.functionScore(fs -> fs + .query(boolQuery) + .scoreMode(FunctionScoreMode.Max) + .boostMode(FunctionBoostMode.Replace) + .minScore((double) SIMILARITY_THRESHOLD) + ); + + // 执行合并后的查询 + SearchResponse combinedSearchResponse = client.search(s -> s + .index(INDEX_NAME) + .query(functionScoreQuery), + KnowsIndex.class); + + // 处理查询的结果 + return combinedSearchResponse.hits().hits().stream() + .map(hit -> { + double finalScore = Objects.nonNull(hit.score()) ? hit.score() : 0.0; + return finalScore >= SIMILARITY_THRESHOLD ? new SearchResult(hit.source(), finalScore) : null; + }) + .filter(Objects::nonNull) + .sorted(Comparator.comparingDouble(SearchResult::getScore).reversed()) + .collect(Collectors.toList()); + } +} \ No newline at end of file diff --git a/knows-java/src/main/java/cn/luckday/service/RedFileService.java b/knows-java/src/main/java/cn/luckday/service/RedFileService.java new file mode 100644 index 0000000..9648952 --- /dev/null +++ b/knows-java/src/main/java/cn/luckday/service/RedFileService.java @@ -0,0 +1,135 @@ +package cn.luckday.service; + +import cn.hutool.core.util.IdUtil; +import cn.luckday.bean.KnowsIndex; +import cn.luckday.embed.EmbedClient; +import cn.luckday.document.PDFParser; +import cn.luckday.document.WordProcessor; +import jakarta.annotation.Resource; +import org.apache.poi.xwpf.usermodel.XWPFPicture; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import java.awt.image.BufferedImage; +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@Service +public class RedFileService { + private static final String TEMP_DIR = "src/main/resources/temp_uploads"; + + 
@Value("${embedding.uri}") + private String embeddingUri; + + @Value("${embedding.api-key}") + private String embeddingApiKey; + + @Resource + private EsDocumentService esDocumentService; + + public void uploadFile(MultipartFile file) { + try { + String projectPath = System.getProperty("user.dir"); + Path tempDirPath = Paths.get(projectPath, TEMP_DIR); + if (!Files.exists(tempDirPath)) { + Files.createDirectories(tempDirPath); + } + + // 获取文件名和扩展名 + String originalFilename = file.getOriginalFilename(); + String fileExtension = getFileExtension(originalFilename); + + // 生成临时文件路径 + String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")); + String tempFileName = timestamp + "_" + originalFilename; + Path tempFilePath = Paths.get(projectPath, TEMP_DIR, tempFileName); + + // 保存上传的文件 + file.transferTo(tempFilePath.toFile()); + + // 解析文件内容 + Map parsedContent = parseFile(tempFilePath.toString(), fileExtension); + + // 保存到 Elasticsearch + String text = parsedContent.get("text").toString(); + + KnowsIndex knowsIndex = new KnowsIndex(); + knowsIndex.setId(String.valueOf(IdUtil.getSnowflakeNextId())); + knowsIndex.setContent(text); + knowsIndex.setContent_vec(EmbedClient.getEmbedding(embeddingUri, embeddingApiKey, text)); + esDocumentService.indexSellList(Arrays.asList(knowsIndex)); + + // 清理临时文件 + Files.deleteIfExists(tempFilePath); + + } catch (Exception e) { + e.printStackTrace(); + } + } + + private String getFileExtension(String filename) { + if (filename == null) + return ""; + int lastDotIndex = filename.lastIndexOf('.'); + return (lastDotIndex == -1) ? 
"" : filename.substring(lastDotIndex + 1).toLowerCase(); + } + + private Map parseFile(String filePath, String extension) throws Exception { + Map content = new HashMap<>(); + + switch (extension) { + case "pdf": + PDFParser pdfParser = new PDFParser(filePath); + pdfParser.parse(); + + // 获取解析结果 + List texts = pdfParser.getExtractedText(); + List images = pdfParser.getExtractedImages(); + + // 合并所有文本 + StringBuilder fullText = new StringBuilder(); + for (String text : texts) { + fullText.append(text).append("\n"); + } + + content.put("text", fullText.toString()); + content.put("imageCount", images.size()); + break; + + case "docx": + WordProcessor wordProcessor = new WordProcessor(filePath); + wordProcessor.process(); + + List extractedText = wordProcessor.getExtractedText(); + // 合并所有文本 + StringBuilder docxFullText = new StringBuilder(); + for (String text : extractedText) { + docxFullText.append(text).append("\n"); + } + + List extractedImages = wordProcessor.getExtractedImages(); + content.put("text", docxFullText.toString()); + content.put("imageCount", extractedImages.size()); + break; + + default: + throw new IllegalArgumentException("不支持的文件类型: " + extension); + } + + // 添加元数据 + content.put("filename", new File(filePath).getName()); + content.put("uploadTime", LocalDateTime.now().toString()); + content.put("fileType", extension); + + return content; + } +} diff --git a/knows-java/src/main/resources/application.yml b/knows-java/src/main/resources/application.yml new file mode 100644 index 0000000..80fade2 --- /dev/null +++ b/knows-java/src/main/resources/application.yml @@ -0,0 +1,32 @@ +server: + port: 8899 + +spring: + servlet: + multipart: + max-file-size: 10MB + max-request-size: 10MB + main: + allow-bean-definition-overriding: true + application: + name: knows + + elasticsearch: + uris: 172.16.100.47:9200 +# username: elastic +# password: 123456 + +qwen: + api-key: sk-********************** + model: qwen-plus + +oll: + uri: 
http://172.16.90.4:11434/api/generate + +embedding: + uri: http://172.16.90.4:6009/v1/embed + api-key: sk-abcdefg1234567 + +re-rank: + uri: http://172.16.90.4:6010/v1/reRank + api-key: sk-abcdefg1234567 \ No newline at end of file diff --git a/knows-java/src/main/resources/native/opencv_java4110.dll b/knows-java/src/main/resources/native/opencv_java4110.dll new file mode 100644 index 0000000..4f11fd6 Binary files /dev/null and b/knows-java/src/main/resources/native/opencv_java4110.dll differ diff --git a/knows-java/src/main/resources/ocr/chi_sim.traineddata b/knows-java/src/main/resources/ocr/chi_sim.traineddata new file mode 100644 index 0000000..da7fa49 Binary files /dev/null and b/knows-java/src/main/resources/ocr/chi_sim.traineddata differ diff --git a/knows-java/src/main/resources/ocr/eng.traineddata b/knows-java/src/main/resources/ocr/eng.traineddata new file mode 100644 index 0000000..176dc32 Binary files /dev/null and b/knows-java/src/main/resources/ocr/eng.traineddata differ diff --git a/knows-java/src/main/resources/ocr/osd.traineddata b/knows-java/src/main/resources/ocr/osd.traineddata new file mode 100644 index 0000000..527457c Binary files /dev/null and b/knows-java/src/main/resources/ocr/osd.traineddata differ diff --git a/knows-java/src/test/java/cn/luckday/ApplicationTests.java b/knows-java/src/test/java/cn/luckday/ApplicationTests.java new file mode 100644 index 0000000..c943c75 --- /dev/null +++ b/knows-java/src/test/java/cn/luckday/ApplicationTests.java @@ -0,0 +1,24 @@ +package cn.luckday; + +import cn.luckday.service.EsDocumentService; +import jakarta.annotation.Resource; +import org.junit.jupiter.api.Test; +import org.springframework.boot.test.context.SpringBootTest; + +import java.io.IOException; + +@SpringBootTest +class ApplicationTests { + + @Test + void contextLoads() { + } + + @Resource + private EsDocumentService service; + + @Test + void create() throws IOException { + service.createIndex(); + } +} diff --git 
import os
import secrets
from typing import List

import numpy as np
import uvicorn
from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, models

# Bearer token expected from clients, read from the 'sk-key' environment variable.
# NOTE(review): the fallback is a hard-coded key; confirm deployments always set the variable.
sk_key = os.environ.get('sk-key', 'sk-aaabbbcccdddeeefffggghhhiiijjjkkk')

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Extracts the bearer token from the Authorization header.
security = HTTPBearer()

# Local transformer weights followed by mean pooling form the sentence encoder.
transformer_model = models.Transformer('./m3e-large', cache_dir='./cache')
pooling_model = models.Pooling(transformer_model.get_word_embedding_dimension(), pooling_mode='mean')
model = SentenceTransformer(modules=[transformer_model, pooling_model])


class EmbeddingRequest(BaseModel):
    # Texts to embed, one vector is returned per entry.
    input: List[str]


class EmbeddingResponse(BaseModel):
    # data: list of {"embedding": [...], "index": i}; dimension: vector length.
    data: list
    dimension: int


@app.post("/v1/embed", response_model=EmbeddingResponse)
async def get_embed(request: EmbeddingRequest, credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Return one L2-normalized embedding per input text.

    Raises:
        HTTPException: 401 when the bearer token does not match ``sk_key``.
    """
    # Constant-time comparison so the check does not leak how much of the token matched.
    if not secrets.compare_digest(credentials.credentials.encode("utf-8"), sk_key.encode("utf-8")):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authorization code",
        )

    # An empty request used to fail with IndexError on embeddings[0] (HTTP 500).
    # The dimension reported here equals the mean-pooling width set up above.
    if not request.input:
        return {"data": [], "dimension": transformer_model.get_word_embedding_dimension()}

    # Encode the whole batch in one call instead of once per text.
    vectors = np.asarray(model.encode(request.input))

    # L2-normalize each row; a zero norm is replaced by 1 so the division cannot produce NaN.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    embeddings = (vectors / norms).tolist()

    response = {
        "data": [
            {
                "embedding": embedding,
                "index": index
            } for index, embedding in enumerate(embeddings)
        ],
        "dimension": len(embeddings[0])
    }

    return response


if __name__ == "__main__":
    uvicorn.run("embed:app", host='0.0.0.0', port=6009, workers=2)
import os
import secrets
from typing import List
import uvicorn
from BCEmbedding import RerankerModel
from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pydantic import BaseModel
from starlette.middleware.cors import CORSMiddleware

# Bearer token expected from clients, read from the 'sk-key' environment variable.
# NOTE(review): the fallback is a hard-coded key; confirm deployments always set the variable.
sk_key = os.environ.get('sk-key', 'sk-aaabbbcccdddeeefffggghhhiiijjjkkk...')

app = FastAPI()

# CORS: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Extracts the bearer token from the Authorization header.
security = HTTPBearer()

# Reranker loaded from local weights.
model = RerankerModel(model_name_or_path="./bce-reranker-base_v1")


class ReRankRequest(BaseModel):
    # Candidate passages and the query they are ranked against.
    textList: List[str]
    query: str


class ReRankResponse(BaseModel):
    rerank_passages: List[str]
    rerank_scores: List[float]
    rerank_ids: List[int]


# Route handler for rerank requests. The function name is kept as-is because
# FastAPI derives the generated operation id from it.
@app.post("/v1/reRank", response_model=ReRankResponse)
async def get_embeddings(request: ReRankRequest, credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Rerank ``textList`` against ``query`` and return the model's result.

    Raises:
        HTTPException: 401 when the bearer token does not match ``sk_key``.
    """
    # Constant-time comparison so the check does not leak how much of the token matched.
    if not secrets.compare_digest(credentials.credentials.encode("utf-8"), sk_key.encode("utf-8")):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authorization code",
        )
    query = request.query
    passages = request.textList

    # Nothing to rank: answer with an empty result instead of invoking the model.
    if not passages:
        return {"rerank_passages": [], "rerank_scores": [], "rerank_ids": []}

    return model.rerank(query, passages)


if __name__ == "__main__":
    uvicorn.run("rerank:app", host='0.0.0.0', port=6010, workers=2)

+ + +

+ +

+ + +

+ × +