Parcourir la source

Merge pull request #9411 from weseek/feat/157516-remove-unnecessary-strings-from-markdown-and-save-in-vector-store

imprv(ai): Remove unnecessary strings from markdown when creating VectorStoreFile
mergify[bot] il y a 1 an
Parent
commit
25a1bec1dd

+ 4 - 1
apps/app/src/features/openai/server/services/openai.ts

@@ -19,11 +19,13 @@ import { createBatchStream } from '~/server/util/batch-stream';
 import loggerFactory from '~/utils/logger';
 import loggerFactory from '~/utils/logger';
 
 
 import { OpenaiServiceTypes } from '../../interfaces/ai';
 import { OpenaiServiceTypes } from '../../interfaces/ai';
+import { sanitizeMarkdown } from '../utils/sanitize-markdown';
 
 
 import { getClient } from './client-delegator';
 import { getClient } from './client-delegator';
 // import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
 // import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
 import { oepnaiApiErrorHandler } from './openai-api-error-handler';
 import { oepnaiApiErrorHandler } from './openai-api-error-handler';
 
 
+
 const BATCH_SIZE = 100;
 const BATCH_SIZE = 100;
 
 
 const logger = loggerFactory('growi:service:openai');
 const logger = loggerFactory('growi:service:openai');
@@ -155,7 +157,8 @@ class OpenaiService implements IOpenaiService {
   // }
   // }
 
 
   private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
   private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
-    const file = await toFile(Readable.from(body), `${pageId}.md`);
+    const sanitizedMarkdown = await sanitizeMarkdown(body);
+    const file = await toFile(Readable.from(sanitizedMarkdown), `${pageId}.md`);
     const uploadedFile = await this.client.uploadFile(file);
     const uploadedFile = await this.client.uploadFile(file);
     return uploadedFile;
     return uploadedFile;
   }
   }

+ 65 - 0
apps/app/src/features/openai/server/utils/sanitize-markdown.ts

@@ -0,0 +1,65 @@
+import { dynamicImport } from '@cspell/dynamic-import';
+import type { Root, Code } from 'mdast';
+import type * as RemarkParse from 'remark-parse';
+import type * as RemarkStringify from 'remark-stringify';
+import type * as Unified from 'unified';
+import type * as UnistUtilVisit from 'unist-util-visit';
+
+interface ModuleCache { // Holds the dynamically imported markdown-processing entry points; every field is optional until it has been loaded.
+  remarkParse?: typeof RemarkParse.default; // default export of 'remark-parse'
+  remarkStringify?: typeof RemarkStringify.default; // default export of 'remark-stringify'
+  unified?: typeof Unified.unified; // named export `unified` of 'unified'
+  visit?: typeof UnistUtilVisit.visit; // named export `visit` of 'unist-util-visit'
+}
+
+let moduleCache: ModuleCache = {}; // Module-level cache; starts empty and is reassigned as a whole by initializeModules().
+
+const initializeModules = async(): Promise<void> => { // Loads the four markdown-processing modules and stores them in moduleCache; resolves with no value.
+  if (moduleCache.remarkParse != null && moduleCache.remarkStringify != null && moduleCache.unified != null && moduleCache.visit != null) { // all four entries already cached: skip the imports
+    return;
+  }
+
+  const [{ default: remarkParse }, { default: remarkStringify }, { unified }, { visit }] = await Promise.all([ // the four imports are started together and awaited as a group; a single rejection rejects the whole call
+    dynamicImport<typeof RemarkParse>('remark-parse', __dirname), // NOTE(review): presumably dynamicImport is needed because these packages are ESM-only -- confirm
+    dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname),
+    dynamicImport<typeof Unified>('unified', __dirname),
+    dynamicImport<typeof UnistUtilVisit>('unist-util-visit', __dirname),
+  ]);
+
+  moduleCache = { // replace the cache object with the freshly loaded modules
+    remarkParse,
+    remarkStringify,
+    unified,
+    visit,
+  };
+};
+
+export const sanitizeMarkdown = async(markdown: string): Promise<string> => { // Runs `markdown` through parse -> sanitize -> stringify and returns the resulting string; the sanitize step overwrites the content of `drawio` code nodes with a placeholder.
+  await initializeModules(); // populate moduleCache before reading from it
+
+  const {
+    remarkParse, remarkStringify, unified, visit,
+  } = moduleCache;
+
+
+  if (remarkParse == null || remarkStringify == null || unified == null || visit == null) { // cache fields are optional in the type, so guard (and narrow) before use
+    throw new Error('Failed to initialize required modules');
+  }
+
+  const sanitize = () => { // plugin passed to .use(): returns a transformer that mutates the tree in place and returns nothing
+    return (tree: Root) => {
+      visit(tree, 'code', (node: Code) => { // callback is invoked for nodes matched by the 'code' test
+        if (node.lang === 'drawio') { // only nodes whose language tag is exactly 'drawio' are touched
+          node.value = '<!-- drawio content replaced -->';
+        }
+      });
+    };
+  };
+
+  const processor = unified() // built on every call from the cached modules
+    .use(remarkParse)
+    .use(sanitize)
+    .use(remarkStringify);
+
+  return processor.processSync(markdown).toString(); // synchronous run; NOTE(review): the output is re-serialized by remark-stringify, so formatting of untouched markdown may differ from the input -- confirm this is acceptable
+};