Просмотр исходного кода

Modified markdown to html conversion logic

Shun Miyazawa 1 год назад
Родитель
Коммит
f87304f15b

+ 9 - 7
apps/app/src/features/openai/server/services/openai.ts

@@ -2,6 +2,7 @@ import assert from 'node:assert';
 import { Readable, Transform } from 'stream';
 import { pipeline } from 'stream/promises';
 
+import type { IPagePopulatedToShowRevision } from '@growi/core';
 import { PageGrant, isPopulated } from '@growi/core';
 import type { HydratedDocument, Types } from 'mongoose';
 import mongoose from 'mongoose';
@@ -20,7 +21,7 @@ import { createBatchStream } from '~/server/util/batch-stream';
 import loggerFactory from '~/utils/logger';
 
 import { OpenaiServiceTypes } from '../../interfaces/ai';
-import { sanitizeMarkdown } from '../utils/sanitize-markdown';
+import { convertMarkdownToHtml } from '../utils/sanitize-markdown';
 
 import { getClient } from './client-delegator';
 // import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
@@ -157,9 +158,10 @@ class OpenaiService implements IOpenaiService {
   //   }
   // }
 
-  private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
-    const sanitizedMarkdown = await sanitizeMarkdown(body);
-    const file = await toFile(Readable.from(sanitizedMarkdown), `${pageId}.md`);
   /**
    * Converts the given page to HTML with convertMarkdownToHtml and uploads the
    * result through this.client.uploadFile, using the page id plus an .html
    * extension as the file name. (The removed version uploaded the sanitized
    * Markdown body under an .md file name instead.)
    *
    * @param page - page document whose content is converted and uploaded
    * @returns the file object returned by this.client.uploadFile
    */
+  private async uploadFile(page: HydratedDocument<PageDocument> | IPagePopulatedToShowRevision): Promise<OpenAI.Files.FileObject> {
+    const convertedHtml = await convertMarkdownToHtml(page);
     // NOTE(review): this looks like leftover debug output. It prints the whole
     // converted page to stdout on every upload; consider removing it or routing
     // it through the logger created from loggerFactory at debug level.
+    console.log('convertedHtml', convertedHtml);
+    const file = await toFile(Readable.from(convertedHtml), `${page._id}.html`);
     const uploadedFile = await this.client.uploadFile(file);
     return uploadedFile;
   }
@@ -183,17 +185,17 @@ class OpenaiService implements IOpenaiService {
   async createVectorStoreFile(pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
     const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
     const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
-    const processUploadFile = async(page: PageDocument) => {
+    const processUploadFile = async(page: HydratedDocument<PageDocument>) => {
       if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
         if (isPopulated(page.revision) && page.revision.body.length > 0) {
-          const uploadedFile = await this.uploadFile(page._id, page.revision.body);
+          const uploadedFile = await this.uploadFile(page);
           prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
           return;
         }
 
         const pagePopulatedToShowRevision = await page.populateDataToShowRevision();
         if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) {
-          const uploadedFile = await this.uploadFile(page._id, pagePopulatedToShowRevision.revision.body);
+          const uploadedFile = await this.uploadFile(pagePopulatedToShowRevision);
           prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
         }
       }

+ 53 - 12
apps/app/src/features/openai/server/utils/sanitize-markdown.ts

@@ -1,48 +1,82 @@
 import { dynamicImport } from '@cspell/dynamic-import';
+import { isPopulated } from '@growi/core';
+import type { IPagePopulatedToShowRevision } from '@growi/core/dist/interfaces';
 import type { Root, Code } from 'mdast';
+import type { HydratedDocument } from 'mongoose';
+import type * as RehypeMeta from 'rehype-meta';
+import type * as RehypeStringify from 'rehype-stringify';
 import type * as RemarkParse from 'remark-parse';
-import type * as RemarkStringify from 'remark-stringify';
+import type * as RemarkRehype from 'remark-rehype';
 import type * as Unified from 'unified';
 import type * as UnistUtilVisit from 'unist-util-visit';
 
+import type { PageDocument } from '~/server/models/page';
+
+
 /**
  * Shape of the cache that holds the dynamically imported unified, remark and
  * rehype entry points. Every field is optional because the cache starts out
  * as an empty object and is only filled by initializeModules.
  */
 interface ModuleCache {
   remarkParse?: typeof RemarkParse.default;
-  remarkStringify?: typeof RemarkStringify.default;
   unified?: typeof Unified.unified;
   visit?: typeof UnistUtilVisit.visit;
+  remarkRehype?: typeof RemarkRehype.default;
+  rehypeMeta?: typeof RehypeMeta.default;
+  rehypeStringify?: typeof RehypeStringify.default;
 }
 
 // Module-level cache of the dynamically imported modules; starts empty and is
 // replaced as a whole by initializeModules once all imports have resolved.
 let moduleCache: ModuleCache = {};
 
 /**
  * Loads the Markdown and HTML processing modules through dynamicImport and
  * stores them in moduleCache. Returns immediately, without importing
  * anything, when every required module is already cached.
  */
 const initializeModules = async(): Promise<void> => {
-  if (moduleCache.remarkParse != null && moduleCache.remarkStringify != null && moduleCache.unified != null && moduleCache.visit != null) {
   // Fast path: all modules were loaded by an earlier call.
+  if (moduleCache.remarkParse != null
+    && moduleCache.unified != null
+    && moduleCache.visit != null
+    && moduleCache.remarkRehype != null
+    && moduleCache.rehypeMeta != null
+    && moduleCache.rehypeStringify != null
+  ) {
     return;
   }
 
-  const [{ default: remarkParse }, { default: remarkStringify }, { unified }, { visit }] = await Promise.all([
   // All imports are started together. The destructuring order here must match
   // the order of the dynamicImport calls passed to Promise.all below.
+  const [
+    { default: remarkParse },
+    { unified }, { visit },
+    { default: remarkRehype },
+    { default: rehypeMeta },
+    { default: rehypeStringify },
+  ] = await Promise.all([
     dynamicImport<typeof RemarkParse>('remark-parse', __dirname),
-    dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname),
     dynamicImport<typeof Unified>('unified', __dirname),
     dynamicImport<typeof UnistUtilVisit>('unist-util-visit', __dirname),
+    dynamicImport<typeof RemarkRehype>('remark-rehype', __dirname),
+    dynamicImport<typeof RehypeMeta>('rehype-meta', __dirname),
+    dynamicImport<typeof RehypeStringify>('rehype-stringify', __dirname),
   ]);
 
   // The cache object is replaced in one assignment after every import resolved.
   moduleCache = {
     remarkParse,
-    remarkStringify,
     unified,
     visit,
+    remarkRehype,
+    rehypeMeta,
+    rehypeStringify,
   };
 };
 
-export const sanitizeMarkdown = async(markdown: string): Promise<string> => {
 /**
  * Converts the revision body of a page into an HTML string.
  *
  * The body is run through a unified pipeline of remarkParse, the local
  * sanitize plugin, remarkRehype, rehypeMeta (with the page path as title)
  * and rehypeStringify. The sanitize plugin is defined in lines that this
  * diff does not show.
  *
  * @param page - page whose populated revision body is converted
  * @returns the output of the pipeline converted to a string
  * @throws Error when a required module is still missing after initializeModules
  */
+export const convertMarkdownToHtml = async(page: HydratedDocument<PageDocument> | IPagePopulatedToShowRevision): Promise<string> => {
   await initializeModules();
 
   const {
-    remarkParse, remarkStringify, unified, visit,
+    remarkParse,
+    unified, visit,
+    remarkRehype,
+    rehypeMeta,
+    rehypeStringify,
   } = moduleCache;
 
-
-  if (remarkParse == null || remarkStringify == null || unified == null || visit == null) {
   // The cache fields are optional, so each one is checked before use.
+  if (remarkParse == null
+    || unified == null
+    || visit == null
+    || remarkRehype == null
+    || rehypeMeta == null
+    || rehypeStringify == null) {
     throw new Error('Failed to initialize required modules');
   }
 
@@ -56,10 +90,17 @@ export const sanitizeMarkdown = async(markdown: string): Promise<string> => {
     };
   };
 
+
   // revisionBody is undefined when page.revision is null or not populated.
+  const revisionBody = page.revision != null && isPopulated(page.revision) ? page.revision.body : undefined;
+
   const processor = unified()
     .use(remarkParse)
     .use(sanitize)
-    .use(remarkStringify);
+    .use(remarkRehype)
+    .use(rehypeMeta, {
+      title: page.path,
+    })
+    .use(rehypeStringify);
 
   // NOTE(review): processSync may receive undefined here. Presumably that is
   // processed as an empty document, so the output would carry only the title
   // metadata. Confirm this is intended, or fail fast on a missing body.
-  return processor.processSync(markdown).toString();
+  return processor.processSync(revisionBody).toString();
 };