Przeglądaj źródła

Merge pull request #9462 from weseek/feat/158281-save-to-vectorstore-in-html-format

feat(ai): Save file to VectorStore in HTML format
mergify[bot] 1 rok temu
rodzic
commit
b24a907308

+ 2 - 0
apps/app/package.json

@@ -197,9 +197,11 @@
     "reconnecting-websocket": "^4.4.0",
     "reconnecting-websocket": "^4.4.0",
     "redis": "^3.0.2",
     "redis": "^3.0.2",
     "rehype-katex": "^7.0.1",
     "rehype-katex": "^7.0.1",
+    "rehype-meta": "^4.0.1",
     "rehype-raw": "^7.0.0",
     "rehype-raw": "^7.0.0",
     "rehype-sanitize": "^6.0.0",
     "rehype-sanitize": "^6.0.0",
     "rehype-slug": "^6.0.0",
     "rehype-slug": "^6.0.0",
+    "rehype-stringify": "^10.0.1",
     "rehype-toc": "^3.0.2",
     "rehype-toc": "^3.0.2",
     "remark-breaks": "^4.0.0",
     "remark-breaks": "^4.0.0",
     "remark-directive": "^3.0.0",
     "remark-directive": "^3.0.0",

+ 8 - 7
apps/app/src/features/openai/server/services/openai.ts

@@ -2,6 +2,7 @@ import assert from 'node:assert';
 import { Readable, Transform } from 'stream';
 import { Readable, Transform } from 'stream';
 import { pipeline } from 'stream/promises';
 import { pipeline } from 'stream/promises';
 
 
+import type { IPagePopulatedToShowRevision } from '@growi/core';
 import { PageGrant, isPopulated } from '@growi/core';
 import { PageGrant, isPopulated } from '@growi/core';
 import type { HydratedDocument, Types } from 'mongoose';
 import type { HydratedDocument, Types } from 'mongoose';
 import mongoose from 'mongoose';
 import mongoose from 'mongoose';
@@ -20,7 +21,7 @@ import { createBatchStream } from '~/server/util/batch-stream';
 import loggerFactory from '~/utils/logger';
 import loggerFactory from '~/utils/logger';
 
 
 import { OpenaiServiceTypes } from '../../interfaces/ai';
 import { OpenaiServiceTypes } from '../../interfaces/ai';
-import { sanitizeMarkdown } from '../utils/sanitize-markdown';
+import { convertMarkdownToHtml } from '../utils/convert-markdown-to-html';
 
 
 import { getClient } from './client-delegator';
 import { getClient } from './client-delegator';
 // import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
 // import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
@@ -157,9 +158,9 @@ class OpenaiService implements IOpenaiService {
   //   }
   //   }
   // }
   // }
 
 
-  private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
-    const sanitizedMarkdown = await sanitizeMarkdown(body);
-    const file = await toFile(Readable.from(sanitizedMarkdown), `${pageId}.md`);
+  private async uploadFile(pageId: Types.ObjectId, pagePath: string, revisionBody: string): Promise<OpenAI.Files.FileObject> {
+    const convertedHtml = await convertMarkdownToHtml({ pagePath, revisionBody });
+    const file = await toFile(Readable.from(convertedHtml), `${pageId}.html`);
     const uploadedFile = await this.client.uploadFile(file);
     const uploadedFile = await this.client.uploadFile(file);
     return uploadedFile;
     return uploadedFile;
   }
   }
@@ -183,17 +184,17 @@ class OpenaiService implements IOpenaiService {
   async createVectorStoreFile(pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
   async createVectorStoreFile(pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
     const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
     const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
     const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
     const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
-    const processUploadFile = async(page: PageDocument) => {
+    const processUploadFile = async(page: HydratedDocument<PageDocument>) => {
       if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
       if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
         if (isPopulated(page.revision) && page.revision.body.length > 0) {
         if (isPopulated(page.revision) && page.revision.body.length > 0) {
-          const uploadedFile = await this.uploadFile(page._id, page.revision.body);
+          const uploadedFile = await this.uploadFile(page._id, page.path, page.revision.body);
           prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
           prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
           return;
           return;
         }
         }
 
 
         const pagePopulatedToShowRevision = await page.populateDataToShowRevision();
         const pagePopulatedToShowRevision = await page.populateDataToShowRevision();
         if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) {
         if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) {
-          const uploadedFile = await this.uploadFile(page._id, pagePopulatedToShowRevision.revision.body);
+          const uploadedFile = await this.uploadFile(page._id, page.path, pagePopulatedToShowRevision.revision.body);
           prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
           prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
         }
         }
       }
       }

+ 89 - 0
apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts

@@ -0,0 +1,89 @@
+import { dynamicImport } from '@cspell/dynamic-import';
+import type { Root, Code } from 'mdast';
+import type * as RehypeMeta from 'rehype-meta';
+import type * as RehypeStringify from 'rehype-stringify';
+import type * as RemarkParse from 'remark-parse';
+import type * as RemarkRehype from 'remark-rehype';
+import type * as Unified from 'unified';
+import type * as UnistUtilVisit from 'unist-util-visit';
+
+interface ModuleCache {
+  unified?: typeof Unified.unified;
+  visit?: typeof UnistUtilVisit.visit;
+  remarkParse?: typeof RemarkParse.default;
+  remarkRehype?: typeof RemarkRehype.default;
+  rehypeMeta?: typeof RehypeMeta.default;
+  rehypeStringify?: typeof RehypeStringify.default;
+}
+
+let moduleCache: ModuleCache = {};
+
+const initializeModules = async(): Promise<void> => {
+  if (moduleCache.unified != null
+    && moduleCache.visit != null
+    && moduleCache.remarkParse != null
+    && moduleCache.remarkRehype != null
+    && moduleCache.rehypeMeta != null
+    && moduleCache.rehypeStringify != null
+  ) {
+    return;
+  }
+
+  const [
+    { unified },
+    { visit },
+    { default: remarkParse },
+    { default: remarkRehype },
+    { default: rehypeMeta },
+    { default: rehypeStringify },
+  ] = await Promise.all([
+    dynamicImport<typeof Unified>('unified', __dirname),
+    dynamicImport<typeof UnistUtilVisit>('unist-util-visit', __dirname),
+    dynamicImport<typeof RemarkParse>('remark-parse', __dirname),
+    dynamicImport<typeof RemarkRehype>('remark-rehype', __dirname),
+    dynamicImport<typeof RehypeMeta>('rehype-meta', __dirname),
+    dynamicImport<typeof RehypeStringify>('rehype-stringify', __dirname),
+  ]);
+
+  moduleCache = {
+    unified,
+    visit,
+    remarkParse,
+    remarkRehype,
+    rehypeMeta,
+    rehypeStringify,
+  };
+};
+
+export const convertMarkdownToHtml = async({ pagePath, revisionBody }: { pagePath: string, revisionBody: string }): Promise<string> => {
+  await initializeModules();
+
+  const {
+    unified, visit, remarkParse, remarkRehype, rehypeMeta, rehypeStringify,
+  } = moduleCache;
+
+  if (unified == null || visit == null || remarkParse == null || remarkRehype == null || rehypeMeta == null || rehypeStringify == null) {
+    throw new Error('Failed to initialize required modules');
+  }
+
+  const sanitizeMarkdown = () => {
+    return (tree: Root) => {
+      visit(tree, 'code', (node: Code) => {
+        if (node.lang === 'drawio') {
+          node.value = '<!-- drawio content replaced -->';
+        }
+      });
+    };
+  };
+
+  const processor = unified()
+    .use(remarkParse)
+    .use(sanitizeMarkdown)
+    .use(remarkRehype)
+    .use(rehypeMeta, {
+      title: pagePath,
+    })
+    .use(rehypeStringify);
+
+  return processor.processSync(revisionBody).toString();
+};

+ 0 - 65
apps/app/src/features/openai/server/utils/sanitize-markdown.ts

@@ -1,65 +0,0 @@
-import { dynamicImport } from '@cspell/dynamic-import';
-import type { Root, Code } from 'mdast';
-import type * as RemarkParse from 'remark-parse';
-import type * as RemarkStringify from 'remark-stringify';
-import type * as Unified from 'unified';
-import type * as UnistUtilVisit from 'unist-util-visit';
-
-interface ModuleCache {
-  remarkParse?: typeof RemarkParse.default;
-  remarkStringify?: typeof RemarkStringify.default;
-  unified?: typeof Unified.unified;
-  visit?: typeof UnistUtilVisit.visit;
-}
-
-let moduleCache: ModuleCache = {};
-
-const initializeModules = async(): Promise<void> => {
-  if (moduleCache.remarkParse != null && moduleCache.remarkStringify != null && moduleCache.unified != null && moduleCache.visit != null) {
-    return;
-  }
-
-  const [{ default: remarkParse }, { default: remarkStringify }, { unified }, { visit }] = await Promise.all([
-    dynamicImport<typeof RemarkParse>('remark-parse', __dirname),
-    dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname),
-    dynamicImport<typeof Unified>('unified', __dirname),
-    dynamicImport<typeof UnistUtilVisit>('unist-util-visit', __dirname),
-  ]);
-
-  moduleCache = {
-    remarkParse,
-    remarkStringify,
-    unified,
-    visit,
-  };
-};
-
-export const sanitizeMarkdown = async(markdown: string): Promise<string> => {
-  await initializeModules();
-
-  const {
-    remarkParse, remarkStringify, unified, visit,
-  } = moduleCache;
-
-
-  if (remarkParse == null || remarkStringify == null || unified == null || visit == null) {
-    throw new Error('Failed to initialize required modules');
-  }
-
-  const sanitize = () => {
-    return (tree: Root) => {
-      visit(tree, 'code', (node: Code) => {
-        if (node.lang === 'drawio') {
-          node.value = '<!-- drawio content replaced -->';
-        }
-      });
-    };
-  };
-
-  const processor = unified()
-    .use(remarkParse)
-    .use(sanitize)
-    .use(remarkStringify);
-
-  return processor.processSync(markdown).toString();
-};

+ 69 - 0
pnpm-lock.yaml

@@ -597,6 +597,9 @@ importers:
       rehype-katex:
       rehype-katex:
         specifier: ^7.0.1
         specifier: ^7.0.1
         version: 7.0.1
         version: 7.0.1
+      rehype-meta:
+        specifier: ^4.0.1
+        version: 4.0.1
       rehype-raw:
       rehype-raw:
         specifier: ^7.0.0
         specifier: ^7.0.0
         version: 7.0.0
         version: 7.0.0
@@ -606,6 +609,9 @@ importers:
       rehype-slug:
       rehype-slug:
         specifier: ^6.0.0
         specifier: ^6.0.0
         version: 6.0.0
         version: 6.0.0
+      rehype-stringify:
+        specifier: ^10.0.1
+        version: 10.0.1
       rehype-toc:
       rehype-toc:
         specifier: ^3.0.2
         specifier: ^3.0.2
         version: 3.0.2
         version: 3.0.2
@@ -7328,6 +7334,9 @@ packages:
   hast-util-from-parse5@8.0.1:
   hast-util-from-parse5@8.0.1:
     resolution: {integrity: sha512-Er/Iixbc7IEa7r/XLtuG52zoqn/b3Xng/w6aZQ0xGVxzhw5xUFxcRqdPzP6yFi/4HBYRaifaI5fQ1RH8n0ZeOQ==}
     resolution: {integrity: sha512-Er/Iixbc7IEa7r/XLtuG52zoqn/b3Xng/w6aZQ0xGVxzhw5xUFxcRqdPzP6yFi/4HBYRaifaI5fQ1RH8n0ZeOQ==}
 
 
+  hast-util-from-selector@3.0.1:
+    resolution: {integrity: sha512-CA2dwcsAS6a7DNZq8HT5fNP4FzUq2PUpQpKnAtOCmfTk429jR0RtasLSMlFA1FNKd8lgfeCIAFl3/vD95be8Lg==}
+
   hast-util-has-property@3.0.0:
   hast-util-has-property@3.0.0:
     resolution: {integrity: sha512-MNilsvEKLFpV604hwfhVStK0usFY/QmM5zX16bo7EjnAEGofr5YyI37kzopBlZJkHD4t887i+q/C8/tr5Q94cA==}
     resolution: {integrity: sha512-MNilsvEKLFpV604hwfhVStK0usFY/QmM5zX16bo7EjnAEGofr5YyI37kzopBlZJkHD4t887i+q/C8/tr5Q94cA==}
 
 
@@ -7352,6 +7361,9 @@ packages:
   hast-util-select@6.0.2:
   hast-util-select@6.0.2:
     resolution: {integrity: sha512-hT/SD/d/Meu+iobvgkffo1QecV8WeKWxwsNMzcTJsKw1cKTQKSR/7ArJeURLNJF9HDjp9nVoORyNNJxrvBye8Q==}
     resolution: {integrity: sha512-hT/SD/d/Meu+iobvgkffo1QecV8WeKWxwsNMzcTJsKw1cKTQKSR/7ArJeURLNJF9HDjp9nVoORyNNJxrvBye8Q==}
 
 
+  hast-util-to-html@9.0.3:
+    resolution: {integrity: sha512-M17uBDzMJ9RPCqLMO92gNNUDuBSq10a25SDBI08iCCxmorf4Yy6sYHK57n9WAbRAAaU+DuR4W6GN9K4DFZesYg==}
+
   hast-util-to-jsx-runtime@2.3.0:
   hast-util-to-jsx-runtime@2.3.0:
     resolution: {integrity: sha512-H/y0+IWPdsLLS738P8tDnrQ8Z+dj12zQQ6WC11TIM21C8WFVoIxcqWXf2H3hiTVZjF1AWqoimGwrTWecWrnmRQ==}
     resolution: {integrity: sha512-H/y0+IWPdsLLS738P8tDnrQ8Z+dj12zQQ6WC11TIM21C8WFVoIxcqWXf2H3hiTVZjF1AWqoimGwrTWecWrnmRQ==}
 
 
@@ -7373,6 +7385,9 @@ packages:
   hastscript@8.0.0:
   hastscript@8.0.0:
     resolution: {integrity: sha512-dMOtzCEd3ABUeSIISmrETiKuyydk1w0pa+gE/uormcTpSYuaNJPbX1NU3JLyscSLjwAQM8bWMhhIlnCqnRvDTw==}
     resolution: {integrity: sha512-dMOtzCEd3ABUeSIISmrETiKuyydk1w0pa+gE/uormcTpSYuaNJPbX1NU3JLyscSLjwAQM8bWMhhIlnCqnRvDTw==}
 
 
+  hastscript@9.0.0:
+    resolution: {integrity: sha512-jzaLBGavEDKHrc5EfFImKN7nZKKBdSLIdGvCwDZ9TfzbF2ffXiov8CKE445L2Z1Ek2t/m4SKQ2j6Ipv7NyUolw==}
+
   he@1.2.0:
   he@1.2.0:
     resolution: {integrity: sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==}
     resolution: {integrity: sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==}
     hasBin: true
     hasBin: true
@@ -10297,6 +10312,9 @@ packages:
   rehype-katex@7.0.1:
   rehype-katex@7.0.1:
     resolution: {integrity: sha512-OiM2wrZ/wuhKkigASodFoo8wimG3H12LWQaH8qSPVJn9apWKFSH3YOCtbKpBorTVw/eI7cuT21XBbvwEswbIOA==}
     resolution: {integrity: sha512-OiM2wrZ/wuhKkigASodFoo8wimG3H12LWQaH8qSPVJn9apWKFSH3YOCtbKpBorTVw/eI7cuT21XBbvwEswbIOA==}
 
 
+  rehype-meta@4.0.1:
+    resolution: {integrity: sha512-nLwA17+GbtBYi3C1KSrFR8JlqXv76mz185U//xDEAYgzE3g/bSD6WKSXva1W95ttzouUCJwA09X3AQZIi3R+Nw==}
+
   rehype-raw@7.0.0:
   rehype-raw@7.0.0:
     resolution: {integrity: sha512-/aE8hCfKlQeA8LmyeyQvQF3eBiLRGNlfBJEvWH7ivp9sBqs7TNqBL5X3v157rM4IFETqDnIOO+z5M/biZbo9Ww==}
     resolution: {integrity: sha512-/aE8hCfKlQeA8LmyeyQvQF3eBiLRGNlfBJEvWH7ivp9sBqs7TNqBL5X3v157rM4IFETqDnIOO+z5M/biZbo9Ww==}
 
 
@@ -10310,6 +10328,9 @@ packages:
   rehype-slug@6.0.0:
   rehype-slug@6.0.0:
     resolution: {integrity: sha512-lWyvf/jwu+oS5+hL5eClVd3hNdmwM1kAC0BUvEGD19pajQMIzcNUd/k9GsfQ+FfECvX+JE+e9/btsKH0EjJT6A==}
     resolution: {integrity: sha512-lWyvf/jwu+oS5+hL5eClVd3hNdmwM1kAC0BUvEGD19pajQMIzcNUd/k9GsfQ+FfECvX+JE+e9/btsKH0EjJT6A==}
 
 
+  rehype-stringify@10.0.1:
+    resolution: {integrity: sha512-k9ecfXHmIPuFVI61B9DeLPN0qFHfawM6RsuX48hoqlaKSF61RskNjSm1lI8PhBEM0MRdLxVVm4WmTqJQccH9mA==}
+
   rehype-toc@3.0.2:
   rehype-toc@3.0.2:
     resolution: {integrity: sha512-DMt376+4i1KJGgHJL7Ezd65qKkJ7Eqp6JSB47BJ90ReBrohI9ufrornArM6f4oJjP2E2DVZZHufWucv/9t7GUQ==}
     resolution: {integrity: sha512-DMt376+4i1KJGgHJL7Ezd65qKkJ7Eqp6JSB47BJ90ReBrohI9ufrornArM6f4oJjP2E2DVZZHufWucv/9t7GUQ==}
     engines: {node: '>=10'}
     engines: {node: '>=10'}
@@ -12349,6 +12370,9 @@ packages:
   zwitch@2.0.2:
   zwitch@2.0.2:
     resolution: {integrity: sha512-JZxotl7SxAJH0j7dN4pxsTV6ZLXoLdGME+PsjkL/DaBrVryK9kTGq06GfKrwcSOqypP+fdXGoCHE36b99fWVoA==}
     resolution: {integrity: sha512-JZxotl7SxAJH0j7dN4pxsTV6ZLXoLdGME+PsjkL/DaBrVryK9kTGq06GfKrwcSOqypP+fdXGoCHE36b99fWVoA==}
 
 
+  zwitch@2.0.4:
+    resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==}
+
 snapshots:
 snapshots:
 
 
   '@adobe/css-tools@4.4.0': {}
   '@adobe/css-tools@4.4.0': {}
@@ -20039,6 +20063,13 @@ snapshots:
       vfile-location: 5.0.3
       vfile-location: 5.0.3
       web-namespaces: 2.0.1
       web-namespaces: 2.0.1
 
 
+  hast-util-from-selector@3.0.1:
+    dependencies:
+      '@types/hast': 3.0.4
+      css-selector-parser: 3.0.5
+      devlop: 1.1.0
+      hastscript: 9.0.0
+
   hast-util-has-property@3.0.0:
   hast-util-has-property@3.0.0:
     dependencies:
     dependencies:
       '@types/hast': 3.0.4
       '@types/hast': 3.0.4
@@ -20098,6 +20129,20 @@ snapshots:
       unist-util-visit: 5.0.0
       unist-util-visit: 5.0.0
       zwitch: 2.0.2
       zwitch: 2.0.2
 
 
+  hast-util-to-html@9.0.3:
+    dependencies:
+      '@types/hast': 3.0.4
+      '@types/unist': 3.0.3
+      ccount: 2.0.1
+      comma-separated-tokens: 2.0.2
+      hast-util-whitespace: 3.0.0
+      html-void-elements: 3.0.0
+      mdast-util-to-hast: 13.2.0
+      property-information: 6.1.1
+      space-separated-tokens: 2.0.1
+      stringify-entities: 4.0.4
+      zwitch: 2.0.4
+
   hast-util-to-jsx-runtime@2.3.0:
   hast-util-to-jsx-runtime@2.3.0:
     dependencies:
     dependencies:
       '@types/estree': 1.0.6
       '@types/estree': 1.0.6
@@ -20159,6 +20204,14 @@ snapshots:
       property-information: 6.1.1
       property-information: 6.1.1
       space-separated-tokens: 2.0.1
       space-separated-tokens: 2.0.1
 
 
+  hastscript@9.0.0:
+    dependencies:
+      '@types/hast': 3.0.4
+      comma-separated-tokens: 2.0.2
+      hast-util-parse-selector: 4.0.0
+      property-information: 6.1.1
+      space-separated-tokens: 2.0.1
+
   he@1.2.0: {}
   he@1.2.0: {}
 
 
   header-case@2.0.4:
   header-case@2.0.4:
@@ -23735,6 +23788,14 @@ snapshots:
       unist-util-visit-parents: 6.0.1
       unist-util-visit-parents: 6.0.1
       vfile: 6.0.3
       vfile: 6.0.3
 
 
+  rehype-meta@4.0.1:
+    dependencies:
+      '@types/hast': 3.0.4
+      hast-util-from-selector: 3.0.1
+      hast-util-select: 6.0.2
+      hastscript: 9.0.0
+      vfile: 6.0.3
+
   rehype-raw@7.0.0:
   rehype-raw@7.0.0:
     dependencies:
     dependencies:
       '@types/hast': 3.0.4
       '@types/hast': 3.0.4
@@ -23760,6 +23821,12 @@ snapshots:
       hast-util-to-string: 3.0.1
       hast-util-to-string: 3.0.1
       unist-util-visit: 5.0.0
       unist-util-visit: 5.0.0
 
 
+  rehype-stringify@10.0.1:
+    dependencies:
+      '@types/hast': 3.0.4
+      hast-util-to-html: 9.0.3
+      unified: 11.0.5
+
   rehype-toc@3.0.2:
   rehype-toc@3.0.2:
     dependencies:
     dependencies:
       '@jsdevtools/rehype-toc': 3.0.2
       '@jsdevtools/rehype-toc': 3.0.2
@@ -26157,3 +26224,5 @@ snapshots:
   zwitch@1.0.5: {}
   zwitch@1.0.5: {}
 
 
   zwitch@2.0.2: {}
   zwitch@2.0.2: {}
+
+  zwitch@2.0.4: {}