diff --git a/modules/cms-ai/pom.xml b/modules/cms-ai/pom.xml index eba94c82..f39b11ad 100644 --- a/modules/cms-ai/pom.xml +++ b/modules/cms-ai/pom.xml @@ -94,6 +94,23 @@ flexmark-html2md-converter 0.64.8 + + + + org.apache.tika + tika-core + 3.1.0 + + + org.apache.tika + tika-parsers-standard-package + 3.1.0 + + + org.apache.commons + commons-compress + 1.27.1 + diff --git a/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java b/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java index 6e73b5d9..07d24ee2 100644 --- a/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java +++ b/modules/cms-ai/src/main/java/com/jeesite/modules/cms/ai/service/ArticleVectorStoreImpl.java @@ -6,13 +6,26 @@ package com.jeesite.modules.cms.ai.service; import com.jeesite.common.collect.ListUtils; import com.jeesite.common.collect.MapUtils; +import com.jeesite.common.config.Global; +import com.jeesite.common.io.IOUtils; import com.jeesite.common.lang.StringUtils; import com.jeesite.common.lang.TimeUtils; import com.jeesite.common.utils.PageUtils; +import com.jeesite.common.web.http.HttpClientUtils; import com.jeesite.modules.cms.entity.Article; import com.jeesite.modules.cms.service.ArticleVectorStore; import com.jeesite.modules.cms.utils.CmsUtils; +import com.vladsch.flexmark.html.renderer.LinkType; +import com.vladsch.flexmark.html.renderer.ResolvedLink; import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter; +import com.vladsch.flexmark.html2md.converter.HtmlLinkResolver; +import com.vladsch.flexmark.html2md.converter.HtmlLinkResolverFactory; +import com.vladsch.flexmark.html2md.converter.HtmlNodeConverterContext; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.ai.document.Document; @@ -22,8 +35,11 @@ import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import java.io.IOException; +import java.io.InputStream; import java.util.List; import java.util.Map; +import java.util.Set; /** * CMS 文章向量库存储 @@ -58,11 +74,68 @@ public class ArticleVectorStoreImpl implements ArticleVectorStore { metadata.put("createDate", article.getCreateDate()); metadata.put("updateBy", article.getUpdateBy()); metadata.put("updateDate", article.getUpdateDate()); + List attachmentList = ListUtils.newArrayList(); + HtmlLinkResolverFactory linkResolverFactory = new HtmlLinkResolverFactory() { + @Override + public @Nullable Set> getAfterDependents() { + return Set.of(); + } + @Override + public @Nullable Set> getBeforeDependents() { + return Set.of(); + } + @Override + public boolean affectsGlobalScope() { + return false; + } + @Override + public HtmlLinkResolver apply(HtmlNodeConverterContext htmlNodeConverterContext) { + return (node, context, resolvedLink) -> { + if ("a".equalsIgnoreCase(node.nodeName())) { + String href = node.attributes().get("href"); String url = href; + if (StringUtils.contains(url, "://")) { + try (InputStream is = HttpClientUtils.getInputStream(url, null)) { + String text = getDocumentText(is); + attachmentList.add(url + text); + } catch (IOException | TikaException e) { + logger.error(e.getMessage(), e); + } + } else { + String ctxPath = Global.getCtxPath(); + if (StringUtils.isNotBlank(ctxPath) && StringUtils.startsWith(url, ctxPath)){ + url = url.substring(ctxPath.length()); + } + try (InputStream is = IOUtils.getFileInputStream(Global.getUserfilesBaseDir(url))){ + String text = getDocumentText(is); + attachmentList.add(url + text); + } catch (IOException | TikaException e) { + logger.error(e.getMessage(), e); + } + } + return new ResolvedLink(LinkType.LINK, href); + } + return resolvedLink; + }; + } + /** + * 获取文章附件中的内容 + * @author ThinkGem + */ + private static @NotNull String getDocumentText(InputStream is) throws IOException, TikaException { + TikaConfig config = TikaConfig.getDefaultConfig(); + String content = new Tika(config).parseToString(is); + return content.lines() + .map(String::strip).filter(line -> !line.isEmpty()) + .reduce((a, b) -> a + System.lineSeparator() + b) + .orElse(StringUtils.EMPTY); + } + }; String content = article.getTitle() + ", " + article.getKeywords() + ", " - + article.getDescription() + ", " + StringUtils.toMobileHtml( - article.getArticleData().getContent()); - String markdown = FlexmarkHtmlConverter.builder().build().convert(content); - List documents = List.of(new Document(article.getId(), markdown, metadata)); + + article.getDescription() + ", " + FlexmarkHtmlConverter.builder() + .linkResolverFactory(linkResolverFactory).build() + .convert(article.getArticleData().getContent()) + + ", attachment: " + attachmentList; + List documents = List.of(new Document(article.getId(), content, metadata)); List splitDocuments = new TokenTextSplitter().apply(documents); this.delete(article); // 删除原数据 ListUtils.pageList(splitDocuments, 64, params -> {