优化文章附件读取：如果是 HTML 附件，则保留其格式

This commit is contained in:
thinkgem
2025-05-15 14:31:01 +08:00
parent 375fbbe375
commit d4f2c1ddb7
2 changed files with 18 additions and 2 deletions

View File

@@ -92,7 +92,7 @@ public class WebClientThinkConfig {
} }
String reasoningContent = (String) delta.get("reasoning_content"); String reasoningContent = (String) delta.get("reasoning_content");
String content = (String) delta.get("content"); String content = (String) delta.get("content");
if (StringUtils.isNotBlank(reasoningContent) && StringUtils.isBlank(content)) { if (StringUtils.isNotEmpty(reasoningContent) && StringUtils.isEmpty(content)) {
if (!thinkingFlag.get()) { if (!thinkingFlag.get()) {
thinkingFlag.set(true); thinkingFlag.set(true);
delta.put("content", "<think>\n" + reasoningContent); delta.put("content", "<think>\n" + reasoningContent);

View File

@@ -26,6 +26,9 @@ import jakarta.servlet.http.HttpServletRequest;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -38,6 +41,7 @@ import org.springframework.stereotype.Service;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
@@ -151,7 +155,19 @@ public class ArticleVectorStoreImpl implements ArticleVectorStore {
*/ */
private static @NotNull String getDocumentText(InputStream is) throws IOException, TikaException { private static @NotNull String getDocumentText(InputStream is) throws IOException, TikaException {
TikaConfig config = TikaConfig.getDefaultConfig(); TikaConfig config = TikaConfig.getDefaultConfig();
String content = new Tika(config).parseToString(is); Tika tika = new Tika(config);
Metadata metadata = new Metadata();
TikaInputStream stream = TikaInputStream.get(is);
MediaType mimetype = tika.getDetector().detect(stream, metadata);
if (mimetype != null && StringUtils.equals(mimetype.getType(), "text")) {
String text = IOUtils.toString(stream, StandardCharsets.UTF_8);
if (StringUtils.isNotBlank(text)) {
return FlexmarkHtmlConverter.builder().build().convert(text);
} else {
return text;
}
}
String content = tika.parseToString(stream, metadata);
return content.lines() return content.lines()
.map(String::strip).filter(line -> !line.isEmpty()) .map(String::strip).filter(line -> !line.isEmpty())
.reduce((a, b) -> a + System.lineSeparator() + b) .reduce((a, b) -> a + System.lineSeparator() + b)