From d4f2c1ddb729dc6bd1e3a5863d5fb0ecdf1a7c4d Mon Sep 17 00:00:00 2001 From: thinkgem Date: Thu, 15 May 2025 14:31:01 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=96=87=E7=AB=A0=E9=99=84?= =?UTF-8?q?=E4=BB=B6=E8=AF=BB=E5=8F=96=EF=BC=8C=E5=A6=82=E6=9E=9Chtml?= =?UTF-8?q?=E9=99=84=E4=BB=B6=EF=BC=8C=E5=88=99=E4=BF=9D=E7=95=99=E6=A0=BC?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cms/ai/config/WebClientThinkConfig.java | 2 +- .../cms/ai/service/ArticleVectorStoreImpl.java | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/config/WebClientThinkConfig.java b/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/config/WebClientThinkConfig.java index c2b6a48a..6c1e1045 100644 --- a/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/config/WebClientThinkConfig.java +++ b/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/config/WebClientThinkConfig.java @@ -92,7 +92,7 @@ public class WebClientThinkConfig { } String reasoningContent = (String) delta.get("reasoning_content"); String content = (String) delta.get("content"); - if (StringUtils.isNotBlank(reasoningContent) && StringUtils.isBlank(content)) { + if (StringUtils.isNotEmpty(reasoningContent) && StringUtils.isEmpty(content)) { if (!thinkingFlag.get()) { thinkingFlag.set(true); delta.put("content", "\n" + reasoningContent); diff --git a/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java b/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java index 247eaeeb..7b9ad5cd 100644 --- a/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java +++ b/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java @@ -26,6 +26,9 @@ import jakarta.servlet.http.HttpServletRequest; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,6 +41,7 @@ import org.springframework.stereotype.Service; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.Set; @@ -151,7 +155,19 @@ public class ArticleVectorStoreImpl implements ArticleVectorStore { */ private static @NotNull String getDocumentText(InputStream is) throws IOException, TikaException { TikaConfig config = TikaConfig.getDefaultConfig(); - String content = new Tika(config).parseToString(is); + Tika tika = new Tika(config); + Metadata metadata = new Metadata(); + TikaInputStream stream = TikaInputStream.get(is); + MediaType mimetype = tika.getDetector().detect(stream, metadata); + if (mimetype != null && StringUtils.equals(mimetype.getType(), "text")) { + String text = IOUtils.toString(stream, StandardCharsets.UTF_8); + if (StringUtils.isNotBlank(text)) { + return FlexmarkHtmlConverter.builder().build().convert(text); + } else { + return text; + } + } + String content = tika.parseToString(stream, metadata); return content.lines() .map(String::strip).filter(line -> !line.isEmpty()) .reduce((a, b) -> a + System.lineSeparator() + b)