优化文章附件读取：如果是 HTML 附件，则保留格式

This commit is contained in:
thinkgem
2025-05-15 14:31:01 +08:00
parent 375fbbe375
commit d4f2c1ddb7
2 changed files with 18 additions and 2 deletions

View File

@@ -92,7 +92,7 @@ public class WebClientThinkConfig {
}
String reasoningContent = (String) delta.get("reasoning_content");
String content = (String) delta.get("content");
if (StringUtils.isNotBlank(reasoningContent) && StringUtils.isBlank(content)) {
if (StringUtils.isNotEmpty(reasoningContent) && StringUtils.isEmpty(content)) {
if (!thinkingFlag.get()) {
thinkingFlag.set(true);
delta.put("content", "<think>\n" + reasoningContent);

View File

@@ -26,6 +26,9 @@ import jakarta.servlet.http.HttpServletRequest;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -38,6 +41,7 @@ import org.springframework.stereotype.Service;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -151,7 +155,19 @@ public class ArticleVectorStoreImpl implements ArticleVectorStore {
*/
private static @NotNull String getDocumentText(InputStream is) throws IOException, TikaException {
TikaConfig config = TikaConfig.getDefaultConfig();
String content = new Tika(config).parseToString(is);
Tika tika = new Tika(config);
Metadata metadata = new Metadata();
TikaInputStream stream = TikaInputStream.get(is);
MediaType mimetype = tika.getDetector().detect(stream, metadata);
if (mimetype != null && StringUtils.equals(mimetype.getType(), "text")) {
String text = IOUtils.toString(stream, StandardCharsets.UTF_8);
if (StringUtils.isNotBlank(text)) {
return FlexmarkHtmlConverter.builder().build().convert(text);
} else {
return text;
}
}
String content = tika.parseToString(stream, metadata);
return content.lines()
.map(String::strip).filter(line -> !line.isEmpty())
.reduce((a, b) -> a + System.lineSeparator() + b)