增加文章附件解析,存入向量数据库。
This commit is contained in:
@@ -94,6 +94,23 @@
|
|||||||
<artifactId>flexmark-html2md-converter</artifactId>
|
<artifactId>flexmark-html2md-converter</artifactId>
|
||||||
<version>0.64.8</version>
|
<version>0.64.8</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<!-- 解析文档内容库 -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.tika</groupId>
|
||||||
|
<artifactId>tika-core</artifactId>
|
||||||
|
<version>3.1.0</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.tika</groupId>
|
||||||
|
<artifactId>tika-parsers-standard-package</artifactId>
|
||||||
|
<version>3.1.0</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-compress</artifactId>
|
||||||
|
<version>1.27.1</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
|||||||
@@ -6,13 +6,26 @@ package com.jeesite.modules.cms.ai.service;
|
|||||||
|
|
||||||
import com.jeesite.common.collect.ListUtils;
|
import com.jeesite.common.collect.ListUtils;
|
||||||
import com.jeesite.common.collect.MapUtils;
|
import com.jeesite.common.collect.MapUtils;
|
||||||
|
import com.jeesite.common.config.Global;
|
||||||
|
import com.jeesite.common.io.IOUtils;
|
||||||
import com.jeesite.common.lang.StringUtils;
|
import com.jeesite.common.lang.StringUtils;
|
||||||
import com.jeesite.common.lang.TimeUtils;
|
import com.jeesite.common.lang.TimeUtils;
|
||||||
import com.jeesite.common.utils.PageUtils;
|
import com.jeesite.common.utils.PageUtils;
|
||||||
|
import com.jeesite.common.web.http.HttpClientUtils;
|
||||||
import com.jeesite.modules.cms.entity.Article;
|
import com.jeesite.modules.cms.entity.Article;
|
||||||
import com.jeesite.modules.cms.service.ArticleVectorStore;
|
import com.jeesite.modules.cms.service.ArticleVectorStore;
|
||||||
import com.jeesite.modules.cms.utils.CmsUtils;
|
import com.jeesite.modules.cms.utils.CmsUtils;
|
||||||
|
import com.vladsch.flexmark.html.renderer.LinkType;
|
||||||
|
import com.vladsch.flexmark.html.renderer.ResolvedLink;
|
||||||
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
|
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
|
||||||
|
import com.vladsch.flexmark.html2md.converter.HtmlLinkResolver;
|
||||||
|
import com.vladsch.flexmark.html2md.converter.HtmlLinkResolverFactory;
|
||||||
|
import com.vladsch.flexmark.html2md.converter.HtmlNodeConverterContext;
|
||||||
|
import org.apache.tika.Tika;
|
||||||
|
import org.apache.tika.config.TikaConfig;
|
||||||
|
import org.apache.tika.exception.TikaException;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
import org.jetbrains.annotations.Nullable;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
@@ -22,8 +35,11 @@ import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
|
|||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CMS 文章向量库存储
|
* CMS 文章向量库存储
|
||||||
@@ -58,11 +74,68 @@ public class ArticleVectorStoreImpl implements ArticleVectorStore {
|
|||||||
metadata.put("createDate", article.getCreateDate());
|
metadata.put("createDate", article.getCreateDate());
|
||||||
metadata.put("updateBy", article.getUpdateBy());
|
metadata.put("updateBy", article.getUpdateBy());
|
||||||
metadata.put("updateDate", article.getUpdateDate());
|
metadata.put("updateDate", article.getUpdateDate());
|
||||||
|
List<String> attachmentList = ListUtils.newArrayList();
|
||||||
|
HtmlLinkResolverFactory linkResolverFactory = new HtmlLinkResolverFactory() {
|
||||||
|
@Override
|
||||||
|
public @Nullable Set<Class<?>> getAfterDependents() {
|
||||||
|
return Set.of();
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public @Nullable Set<Class<?>> getBeforeDependents() {
|
||||||
|
return Set.of();
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public boolean affectsGlobalScope() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public HtmlLinkResolver apply(HtmlNodeConverterContext htmlNodeConverterContext) {
|
||||||
|
return (node, context, resolvedLink) -> {
|
||||||
|
if ("a".equalsIgnoreCase(node.nodeName())) {
|
||||||
|
String href = node.attributes().get("href"); String url = href;
|
||||||
|
if (StringUtils.contains(url, "://")) {
|
||||||
|
try (InputStream is = HttpClientUtils.getInputStream(url, null)) {
|
||||||
|
String text = getDocumentText(is);
|
||||||
|
attachmentList.add(url + text);
|
||||||
|
} catch (IOException | TikaException e) {
|
||||||
|
logger.error(e.getMessage(), e);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
String ctxPath = Global.getCtxPath();
|
||||||
|
if (StringUtils.isNotBlank(ctxPath) && StringUtils.startsWith(url, ctxPath)){
|
||||||
|
url = url.substring(ctxPath.length());
|
||||||
|
}
|
||||||
|
try (InputStream is = IOUtils.getFileInputStream(Global.getUserfilesBaseDir(url))){
|
||||||
|
String text = getDocumentText(is);
|
||||||
|
attachmentList.add(url + text);
|
||||||
|
} catch (IOException | TikaException e) {
|
||||||
|
logger.error(e.getMessage(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new ResolvedLink(LinkType.LINK, href);
|
||||||
|
}
|
||||||
|
return resolvedLink;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* 获取文章附件中的内容
|
||||||
|
* @author ThinkGem
|
||||||
|
*/
|
||||||
|
private static @NotNull String getDocumentText(InputStream is) throws IOException, TikaException {
|
||||||
|
TikaConfig config = TikaConfig.getDefaultConfig();
|
||||||
|
String content = new Tika(config).parseToString(is);
|
||||||
|
return content.lines()
|
||||||
|
.map(String::strip).filter(line -> !line.isEmpty())
|
||||||
|
.reduce((a, b) -> a + System.lineSeparator() + b)
|
||||||
|
.orElse(StringUtils.EMPTY);
|
||||||
|
}
|
||||||
|
};
|
||||||
String content = article.getTitle() + ", " + article.getKeywords() + ", "
|
String content = article.getTitle() + ", " + article.getKeywords() + ", "
|
||||||
+ article.getDescription() + ", " + StringUtils.toMobileHtml(
|
+ article.getDescription() + ", " + FlexmarkHtmlConverter.builder()
|
||||||
article.getArticleData().getContent());
|
.linkResolverFactory(linkResolverFactory).build()
|
||||||
String markdown = FlexmarkHtmlConverter.builder().build().convert(content);
|
.convert(article.getArticleData().getContent())
|
||||||
List<Document> documents = List.of(new Document(article.getId(), markdown, metadata));
|
+ ", attachment: " + attachmentList;
|
||||||
|
List<Document> documents = List.of(new Document(article.getId(), content, metadata));
|
||||||
List<Document> splitDocuments = new TokenTextSplitter().apply(documents);
|
List<Document> splitDocuments = new TokenTextSplitter().apply(documents);
|
||||||
this.delete(article); // 删除原数据
|
this.delete(article); // 删除原数据
|
||||||
ListUtils.pageList(splitDocuments, 64, params -> {
|
ListUtils.pageList(splitDocuments, 64, params -> {
|
||||||
|
|||||||
Reference in New Issue
Block a user