Przeglądaj źródła

Merge pull request #9172 from weseek/feat/154030-rebuild-vector-store-when-updating-page

feat: Rebuild vector store when updating page
Yuki Takei 1 rok temu
rodzic
commit
9518e209c1

+ 1 - 1
apps/app/config/logger/config.dev.js

@@ -43,5 +43,5 @@ module.exports = {
   // 'growi:cli:StickyStretchableScroller': 'debug',
   // 'growi:cli:ItemsTree': 'debug',
   'growi:searchResultList': 'debug',
-
+  'growi:service:openai': 'debug',
 };

+ 2 - 2
apps/app/src/server/models/page.ts

@@ -7,6 +7,7 @@ import {
   type IPage,
   GroupType, type HasObjectId,
 } from '@growi/core';
+import type { IPagePopulatedToShowRevision } from '@growi/core/dist/interfaces';
 import { getIdForRef, isPopulated } from '@growi/core/dist/interfaces';
 import { isTopPage, hasSlash } from '@growi/core/dist/utils/page-path-utils';
 import { addTrailingSlash, normalizePath } from '@growi/core/dist/utils/path-utils';
@@ -50,10 +51,9 @@ export interface PageDocument extends IPage, Document<Types.ObjectId> {
   [x:string]: any // for obsolete methods
   getLatestRevisionBodyLength(): Promise<number | null | undefined>
   calculateAndUpdateLatestRevisionBodyLength(this: PageDocument): Promise<void>
-  populateDataToShowRevision(shouldExcludeBody?: boolean): Promise<PageDocument>
+  populateDataToShowRevision(shouldExcludeBody?: boolean): Promise<IPagePopulatedToShowRevision>
 }
 
-
 type TargetAndAncestorsResult = {
   targetAndAncestors: PageDocument[]
   rootPage: PageDocument

+ 3 - 2
apps/app/src/server/routes/apiv3/openai/rebuild-vector-store.ts

@@ -4,7 +4,7 @@ import type { ValidationChain } from 'express-validator';
 
 import type Crowi from '~/server/crowi';
 import { certifyAiService } from '~/server/middlewares/certify-ai-service';
-import { openaiService } from '~/server/service/openai/openai';
+import { getOpenaiService } from '~/server/service/openai/openai';
 import loggerFactory from '~/utils/logger';
 
 import { apiV3FormValidator } from '../../../middlewares/apiv3-form-validator';
@@ -28,7 +28,8 @@ export const rebuildVectorStoreHandlersFactory: RebuildVectorStoreFactory = (cro
     async(req: Request, res: ApiV3Response) => {
 
       try {
-        await openaiService.rebuildVectorStore();
+        const openaiService = getOpenaiService();
+        await openaiService?.rebuildVectorStoreAll();
         return res.apiv3({});
 
       }

+ 10 - 0
apps/app/src/server/routes/apiv3/page/create-page.ts

@@ -22,6 +22,7 @@ import PageTagRelation from '~/server/models/page-tag-relation';
 import { serializePageSecurely, serializeRevisionSecurely } from '~/server/models/serializers';
 import { configManager } from '~/server/service/config-manager';
 import { getTranslation } from '~/server/service/i18next';
+import { getOpenaiService } from '~/server/service/openai/openai';
 import loggerFactory from '~/utils/logger';
 
 import { apiV3FormValidator } from '../../../middlewares/apiv3-form-validator';
@@ -198,6 +199,15 @@ export const createPageHandlersFactory: CreatePageHandlersFactory = (crowi) => {
     catch (err) {
       logger.error('Failed to create subscription document', err);
     }
+
+    // Rebuild vector store file
+    try {
+      const openaiService = getOpenaiService();
+      await openaiService?.rebuildVectorStore(createdPage);
+    }
+    catch (err) {
+      logger.error('Rebuild vector store failed', err);
+    }
   }
 
   const addActivity = generateAddActivityMiddleware(crowi);

+ 11 - 0
apps/app/src/server/routes/apiv3/page/update-page.ts

@@ -18,6 +18,8 @@ import { generateAddActivityMiddleware } from '~/server/middlewares/add-activity
 import { GlobalNotificationSettingEvent } from '~/server/models/GlobalNotificationSetting';
 import type { PageDocument, PageModel } from '~/server/models/page';
 import { serializePageSecurely, serializeRevisionSecurely } from '~/server/models/serializers';
+import { configManager } from '~/server/service/config-manager';
+import { getOpenaiService } from '~/server/service/openai/openai';
 import { preNotifyService } from '~/server/service/pre-notify';
 import { normalizeLatestRevisionIfBroken } from '~/server/service/revision/normalize-latest-revision-if-broken';
 import { getYjsService } from '~/server/service/yjs';
@@ -114,6 +116,15 @@ export const updatePageHandlersFactory: UpdatePageHandlersFactory = (crowi) => {
         logger.error('Create user notification failed', err);
       }
     }
+
+    // Rebuild vector store file
+    try {
+      const openaiService = getOpenaiService();
+      await openaiService?.rebuildVectorStore(updatedPage);
+    }
+    catch (err) {
+      logger.error('Rebuild vector store failed', err);
+    }
   }
 
   const addActivity = generateAddActivityMiddleware(crowi);

+ 4 - 0
apps/app/src/server/service/openai/client-delegator/azure-openai-client-delegator.ts

@@ -22,6 +22,10 @@ export class AzureOpenaiClientDelegator implements IOpenaiClientDelegator {
     // TODO: initialize openaiVectorStoreId property
   }
 
+  async getFileList(): Promise<OpenAI.Files.FileObjectsPage> {
+    return this.client.files.list(); // delegates to the client's files.list() and returns its page of file objects
+  }
+
   async getVectorStoreFiles(): Promise<OpenAI.Beta.VectorStores.Files.VectorStoreFilesPage> {
     return this.client.beta.vectorStores.files.list(this.openaiVectorStoreId);
   }

+ 1 - 0
apps/app/src/server/service/openai/client-delegator/interfaces.ts

@@ -4,6 +4,7 @@ import type { Uploadable } from 'openai/uploads';
 export interface IOpenaiClientDelegator {
   getVectorStoreFiles(): Promise<OpenAI.Beta.VectorStores.Files.VectorStoreFilesPage>;
   deleteVectorStoreFiles(fileId: string): Promise<OpenAI.Beta.VectorStores.Files.VectorStoreFileDeleted>;
+  getFileList(): Promise<OpenAI.Files.FileObjectsPage>; // lists uploaded files (a page of file objects)
   deleteFile(fileId: string): Promise<OpenAI.Files.FileDeleted>;
   uploadAndPoll(files: Uploadable[]): Promise<OpenAI.Beta.VectorStores.FileBatches.VectorStoreFileBatch>;
 }

+ 4 - 0
apps/app/src/server/service/openai/client-delegator/openai-client-delegator.ts

@@ -36,6 +36,10 @@ export class OpenaiClientDelegator implements IOpenaiClientDelegator {
     return this.client.beta.vectorStores.files.del(this.openaiVectorStoreId, fileId);
   }
 
+  async getFileList(): Promise<OpenAI.Files.FileObjectsPage> {
+    return this.client.files.list(); // delegates to the client's files.list() and returns its page of file objects
+  }
+
   async deleteFile(fileId: string): Promise<OpenAI.Files.FileDeleted> {
     return this.client.files.del(fileId);
   }

+ 87 - 25
apps/app/src/server/service/openai/openai.ts

@@ -1,54 +1,116 @@
-import { Readable } from 'stream';
+import { Readable, Transform } from 'stream';
 
-import { PageGrant } from '@growi/core';
-import type { HydratedDocument } from 'mongoose';
+import { PageGrant, isPopulated } from '@growi/core';
+import type { HydratedDocument, Types } from 'mongoose';
 import mongoose from 'mongoose';
 import { toFile } from 'openai';
+import type { FileLike } from 'openai/uploads.mjs';
 
+import { OpenaiServiceTypes } from '~/interfaces/ai';
 import type { PageDocument, PageModel } from '~/server/models/page';
 import { configManager } from '~/server/service/config-manager';
+import { createBatchStream } from '~/server/util/batch-stream';
+import loggerFactory from '~/utils/logger';
+
 
 import { getClient } from './client-delegator';
 
+const logger = loggerFactory('growi:service:openai');
+
+// Batch size shared by the page cursor and the batch stream in rebuildVectorStoreAll
+const BATCH_SIZE = 100;
+
+// Wraps a page body in a readable stream and converts it into an uploadable file named `<pageId>.md`
+const createFileForVectorStore = async(pageId: Types.ObjectId, body: string): Promise<FileLike> => toFile(Readable.from(body), `${pageId}.md`);
+
 export interface IOpenaiService {
-  rebuildVectorStore(): Promise<void>;
+  createVectorStoreFile(pages: PageDocument[]): Promise<void>; // uploads the given pages as vector store files
+  rebuildVectorStoreAll(): Promise<void>; // creates vector store files for every public page
+  rebuildVectorStore(page: PageDocument): Promise<void>; // deletes the page's existing file, then uploads it again
 }
 class OpenaiService implements IOpenaiService {
 
-  constructor() {
-    const aiEnabled = configManager.getConfig('crowi', 'app:aiEnabled');
-    if (!aiEnabled) {
-      return;
-    }
-  }
-
   private get client() {
     const openaiServiceType = configManager.getConfig('crowi', 'app:openaiServiceType');
     return getClient({ openaiServiceType });
   }
 
-  async rebuildVectorStore() {
-    // TODO: https://redmine.weseek.co.jp/issues/154364
-
-    // Create all public pages VectorStoreFile
-    const page = mongoose.model<HydratedDocument<PageDocument>, PageModel>('Page');
-    const allPublicPages = await page.find({ grant: PageGrant.GRANT_PUBLIC }).populate('revision');
+  /** Uploads the non-empty revision body of every public page in `pages` as a `<pageId>.md` vector store file. */
+  async createVectorStoreFile(pages: PageDocument[]): Promise<void> {
+    const filesPromise: Promise<FileLike>[] = [];
+    for (const page of pages) {
+      if (page._id == null || page.grant !== PageGrant.GRANT_PUBLIC || page.revision == null) {
+        continue;
+      }
 
-    const filesPromise = allPublicPages
-      .filter(page => page.revision?.body != null && page.revision.body.length > 0)
-      .map(async(page) => {
-        const file = await toFile(Readable.from(page.revision.body), `${page._id}.md`);
-        return file;
-      });
+      // Awaited inside for...of so the file is queued before the emptiness check; populate only when needed (no duplicates)
+      const revision = isPopulated(page.revision) ? page.revision : (await page.populateDataToShowRevision()).revision;
+      if (revision != null && revision.body.length > 0) {
+        filesPromise.push(createFileForVectorStore(page._id, revision.body));
+      }
+    }
 
     if (filesPromise.length === 0) {
       return;
     }
 
     const files = await Promise.all(filesPromise);
-    await this.client.uploadAndPoll(files);
+
+    const res = await this.client.uploadAndPoll(files);
+    logger.debug('create vector store file: ', res);
+  }
+
+  /** Uploads every public page in batches of BATCH_SIZE; settles only after the whole stream has been processed. */
+  async rebuildVectorStoreAll() {
+    // TODO: https://redmine.weseek.co.jp/issues/154364
+    const Page = mongoose.model<HydratedDocument<PageDocument>, PageModel>('Page');
+    const pagesStream = Page.find({ grant: PageGrant.GRANT_PUBLIC }).populate('revision').cursor({ batch_size: BATCH_SIZE });
+    const batchStream = createBatchStream(BATCH_SIZE);
+    const createVectorStoreFile = this.createVectorStoreFile.bind(this);
+    const createVectorStoreFileStream = new Transform({
+      objectMode: true,
+      // Nothing reads from this stream, so chunks are not pushed: a full readable buffer would stall the pipe
+      transform(chunk: PageDocument[], encoding, callback) {
+        createVectorStoreFile(chunk).then(() => callback(), callback);
+      },
+    });
+
+    await new Promise<void>((resolve, reject) => {
+      // pipe() does not forward errors, so watch every stage and tear all of them down on the first failure
+      const stages: Readable[] = [pagesStream, batchStream, createVectorStoreFileStream];
+      stages.forEach(stage => stage.on('error', (err) => { stages.forEach(s => s.destroy()); reject(err); }));
+      pagesStream.pipe(batchStream).pipe(createVectorStoreFileStream).once('finish', resolve);
+    });
+  }
+
+  /** Replaces the vector store file of `page`: deletes every file named `<pageId>.md`, then uploads the page again. */
+  async rebuildVectorStore(page: PageDocument) {
+    // Await every deletion (forEach with an async callback would leave them un-awaited and their errors unhandled)
+    const files = await this.client.getFileList();
+    const staleFiles = files.data.filter(file => file.filename === `${page._id}.md`);
+    await Promise.all(staleFiles.map(async(file) => {
+      const res = await this.client.deleteFile(file.id);
+      logger.debug('delete vector store file: ', res);
+    }));
+
+    // Upload only after the stale files are gone
+    await this.createVectorStoreFile([page]);
   }
 
 }
 
-export const openaiService = new OpenaiService();
+let instance: OpenaiService;
+/** Lazily creates the shared OpenaiService; returns undefined while AI is disabled or the service type is not in OpenaiServiceTypes. */
+export const getOpenaiService = (): IOpenaiService | undefined => {
+  if (instance == null) {
+    const aiEnabled = configManager.getConfig('crowi', 'app:aiEnabled');
+    const openaiServiceType = configManager.getConfig('crowi', 'app:openaiServiceType');
+    const isAvailable = aiEnabled && openaiServiceType != null && OpenaiServiceTypes.includes(openaiServiceType);
+    if (!isAvailable) {
+      return undefined;
+    }
+    instance = new OpenaiService();
+  }
+
+  return instance;
+};

+ 15 - 1
apps/app/src/server/service/page/index.ts

@@ -43,6 +43,7 @@ import {
 import type { PageTagRelationDocument } from '~/server/models/page-tag-relation';
 import PageTagRelation from '~/server/models/page-tag-relation';
 import type { UserGroupDocument } from '~/server/models/user-group';
+import { getOpenaiService } from '~/server/service/openai/openai';
 import { createBatchStream } from '~/server/util/batch-stream';
 import { collectAncestorPaths } from '~/server/util/collect-ancestor-paths';
 import { generalXssFilter } from '~/services/general-xss-filter';
@@ -1177,6 +1178,10 @@ class PageService implements IPageService {
       duplicatedTarget = await (this.create as CreateMethod)(
         newPagePath, populatedPage?.revision?.body ?? '', user, options,
       );
+
+      // Do not await because communication with OpenAI takes time
+      const openaiService = getOpenaiService();
+      openaiService?.createVectorStoreFile([duplicatedTarget]);
     }
     this.pageEvent.emit('duplicate', page, user);
 
@@ -1402,9 +1407,18 @@ class PageService implements IPageService {
       }
     });
 
-    await Page.insertMany(newPages, { ordered: false });
+    const duplicatedPages = await Page.insertMany(newPages, { ordered: false });
+    const duplicatedPageIds = duplicatedPages.map(duplicatedPage => duplicatedPage._id);
+
     await Revision.insertMany(newRevisions, { ordered: false });
     await this.duplicateTags(pageIdMapping);
+
+    const duplicatedPagesWithPopulatedToShowRevison = await Page
+      .find({ _id: { $in: duplicatedPageIds }, grant: PageGrant.GRANT_PUBLIC }).populate('revision') as PageDocument[];
+
+    // Do not await because communication with OpenAI takes time
+    const openaiService = getOpenaiService();
+    openaiService?.createVectorStoreFile(duplicatedPagesWithPopulatedToShowRevison);
   }
 
   private async duplicateDescendantsV4(pages, user, oldPagePathPrefix, newPagePathPrefix) {