Просмотр исходного кода

Merge pull request #9197 from weseek/feat/154831-vector-store-file-relation-model

feat: VectorStoreFileRelationModel
mergify[bot] 1 год назад
Родитель
Сommit
cefd6fd59d

+ 65 - 0
apps/app/src/features/openai/server/models/vector-store-file-relation.ts

@@ -0,0 +1,65 @@
+import type { Types } from 'mongoose';
+import type mongoose from 'mongoose';
+import { type Model, type Document, Schema } from 'mongoose';
+
+import { getOrCreateModel } from '~/server/util/mongoose-utils';
+
+export interface VectorStoreFileRelation {
+  pageId: mongoose.Types.ObjectId;
+  fileIds: string[];
+}
+
+interface VectorStoreFileRelationDocument extends VectorStoreFileRelation, Document {}
+
+interface VectorStoreFileRelationModel extends Model<VectorStoreFileRelation> {
+  upsertVectorStoreFileRelations(vectorStoreFileRelations: VectorStoreFileRelation[]): Promise<void>;
+}
+
+export const prepareVectorStoreFileRelations = (
+    pageId: Types.ObjectId, fileId: string, vectorStoreFileRelations: VectorStoreFileRelation[],
+): VectorStoreFileRelation[] => {
+  const existingData = vectorStoreFileRelations.find(relation => relation.pageId.equals(pageId));
+
+  // If the data exists, add the fileId to the fileIds array
+  if (existingData != null) {
+    existingData.fileIds.push(fileId);
+  }
+  // If the data doesn't exist, create a new one and add it to the array
+  else {
+    vectorStoreFileRelations.push({
+      pageId,
+      fileIds: [fileId],
+    });
+  }
+
+  return vectorStoreFileRelations;
+};
+
+const schema = new Schema<VectorStoreFileRelationDocument, VectorStoreFileRelationModel>({
+  pageId: {
+    type: Schema.Types.ObjectId,
+    ref: 'Page',
+    required: true,
+    unique: true,
+  },
+  fileIds: [{
+    type: String,
+    required: true,
+  }],
+});
+
+schema.statics.upsertVectorStoreFileRelations = async function(vectorStoreFileRelations: VectorStoreFileRelation[]): Promise<void> {
+  await this.bulkWrite(
+    vectorStoreFileRelations.map((data) => {
+      return {
+        updateOne: {
+          filter: { pageId: data.pageId },
+          update: { $addToSet: { fileIds: { $each: data.fileIds } } },
+          upsert: true,
+        },
+      };
+    }),
+  );
+};
+
+export default getOrCreateModel<VectorStoreFileRelationDocument, VectorStoreFileRelationModel>('VectorStoreFileRelation', schema);

+ 8 - 0
apps/app/src/server/service/openai/client-delegator/azure-openai-client-delegator.ts

@@ -22,6 +22,14 @@ export class AzureOpenaiClientDelegator implements IOpenaiClientDelegator {
     // TODO: initialize openaiVectorStoreId property
   }
 
+  async uploadFile(file: Uploadable): Promise<OpenAI.Files.FileObject> {
+    return this.client.files.create({ file, purpose: 'assistants' });
+  }
+
+  async createVectorStoreFileBatch(fileIds: string[]): Promise<OpenAI.Beta.VectorStores.FileBatches.VectorStoreFileBatch> {
+    return this.client.beta.vectorStores.fileBatches.create(this.openaiVectorStoreId, { file_ids: fileIds });
+  }
+
   async getFileList(): Promise<OpenAI.Files.FileObjectsPage> {
     return this.client.files.list();
   }

+ 2 - 0
apps/app/src/server/service/openai/client-delegator/interfaces.ts

@@ -2,6 +2,8 @@ import type OpenAI from 'openai';
 import type { Uploadable } from 'openai/uploads';
 
 export interface IOpenaiClientDelegator {
+  uploadFile(file: Uploadable): Promise<OpenAI.Files.FileObject>
+  createVectorStoreFileBatch(fileIds: string[]): Promise<OpenAI.Beta.VectorStores.FileBatches.VectorStoreFileBatch>
   getVectorStoreFiles(): Promise<OpenAI.Beta.VectorStores.Files.VectorStoreFilesPage>;
   deleteVectorStoreFiles(fileId: string): Promise<OpenAI.Beta.VectorStores.Files.VectorStoreFileDeleted>;
   getFileList(): Promise<OpenAI.Files.FileObjectsPage>;

+ 8 - 0
apps/app/src/server/service/openai/client-delegator/openai-client-delegator.ts

@@ -28,6 +28,14 @@ export class OpenaiClientDelegator implements IOpenaiClientDelegator {
     this.client = new OpenAI({ apiKey });
   }
 
+  async uploadFile(file: Uploadable): Promise<OpenAI.Files.FileObject> {
+    return this.client.files.create({ file, purpose: 'assistants' });
+  }
+
+  async createVectorStoreFileBatch(fileIds: string[]): Promise<OpenAI.Beta.VectorStores.FileBatches.VectorStoreFileBatch> {
+    return this.client.beta.vectorStores.fileBatches.create(this.openaiVectorStoreId, { file_ids: fileIds });
+  }
+
   async getVectorStoreFiles(): Promise<OpenAI.Beta.VectorStores.Files.VectorStoreFilesPage> {
     return this.client.beta.vectorStores.files.list(this.openaiVectorStoreId);
   }

+ 70 - 24
apps/app/src/server/service/openai/openai.ts

@@ -3,9 +3,13 @@ import { Readable, Transform } from 'stream';
 import { PageGrant, isPopulated } from '@growi/core';
 import type { HydratedDocument, Types } from 'mongoose';
 import mongoose from 'mongoose';
+import type OpenAI from 'openai';
 import { toFile } from 'openai';
-import type { FileLike } from 'openai/uploads.mjs';
 
+import VectorStoreFileRelationModel, {
+  type VectorStoreFileRelation,
+  prepareVectorStoreFileRelations,
+} from '~/features/openai/server/models/vector-store-file-relation';
 import { OpenaiServiceTypes } from '~/interfaces/ai';
 import type { PageDocument, PageModel } from '~/server/models/page';
 import { configManager } from '~/server/service/config-manager';
@@ -19,9 +23,6 @@ const BATCH_SIZE = 100;
 
 const logger = loggerFactory('growi:service:openai');
 
-const createFileForVectorStore = async(pageId: Types.ObjectId, body: string): Promise<FileLike> => {
-  return toFile(Readable.from(body), `${pageId}.md`);
-};
 
 export interface IOpenaiService {
   createVectorStoreFile(pages: PageDocument[]): Promise<void>;
@@ -35,29 +36,83 @@ class OpenaiService implements IOpenaiService {
     return getClient({ openaiServiceType });
   }
 
-  async createVectorStoreFile(pages: PageDocument[]): Promise<void> {
-    const filesPromise: Promise<FileLike>[] = [];
-    pages.forEach(async(page) => {
+  private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
+    const file = await toFile(Readable.from(body), `${pageId}.md`);
+    const uploadedFile = await this.client.uploadFile(file);
+    return uploadedFile;
+  }
+
+  async createVectorStoreFile(pages: Array<PageDocument>): Promise<void> {
+    const preparedVectorStoreFileRelations: VectorStoreFileRelation[] = [];
+    const processUploadFile = async(page: PageDocument) => {
       if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
         if (isPopulated(page.revision) && page.revision.body.length > 0) {
-          filesPromise.push(createFileForVectorStore(page._id, page.revision.body));
+          const uploadedFile = await this.uploadFile(page._id, page.revision.body);
+          prepareVectorStoreFileRelations(page._id, uploadedFile.id, preparedVectorStoreFileRelations);
+          return;
         }
 
         const pagePopulatedToShowRevision = await page.populateDataToShowRevision();
         if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) {
-          filesPromise.push(createFileForVectorStore(page._id, pagePopulatedToShowRevision.revision.body));
+          const uploadedFile = await this.uploadFile(page._id, pagePopulatedToShowRevision.revision.body);
+          prepareVectorStoreFileRelations(page._id, uploadedFile.id, preparedVectorStoreFileRelations);
         }
       }
+    };
+
+    // Start workers to process results
+    const workers = pages.map(processUploadFile);
+
+    // Wait for all processing to complete.
+    const fileUploadResult = await Promise.allSettled(workers);
+    fileUploadResult.forEach((result) => {
+      if (result.status === 'rejected') {
+        logger.error(result.reason);
+      }
     });
 
-    if (filesPromise.length === 0) {
-      return;
+    try {
+      // Create vector store file
+      const uploadedFileIds = preparedVectorStoreFileRelations.map(data => data.fileIds).flat();
+      const createVectorStoreFileBatchResponse = await this.client.createVectorStoreFileBatch(uploadedFileIds);
+      logger.debug('Create vector store file', createVectorStoreFileBatchResponse);
+
+      // Save vector store file relation
+      await VectorStoreFileRelationModel.upsertVectorStoreFileRelations(preparedVectorStoreFileRelations);
+    }
+    catch (err) {
+      logger.error(err);
     }
 
-    const files = await Promise.all(filesPromise);
+  }
 
-    const res = await this.client.uploadAndPoll(files);
-    logger.debug('create vector store file: ', res);
+  private async deleteVectorStoreFile(page: PageDocument): Promise<void> {
+    // Delete vector store file and delete vector store file relation
+    const vectorStoreFileRelation = await VectorStoreFileRelationModel.findOne({ pageId: page._id });
+    if (vectorStoreFileRelation != null) {
+      const deletedFileIds: string[] = [];
+      for (const fileId of vectorStoreFileRelation.fileIds) {
+        try {
+          // eslint-disable-next-line no-await-in-loop
+          const deleteFileResponse = await this.client.deleteFile(fileId);
+          logger.debug('Delete vector store file', deleteFileResponse);
+          deletedFileIds.push(fileId);
+        }
+        catch (err) {
+          logger.error(err);
+        }
+      }
+
+      const undeletedFileIds = vectorStoreFileRelation.fileIds.filter(fileId => !deletedFileIds.includes(fileId));
+
+      if (undeletedFileIds.length === 0) {
+        await vectorStoreFileRelation.remove();
+        return;
+      }
+
+      vectorStoreFileRelation.fileIds = undeletedFileIds;
+      await vectorStoreFileRelation.save();
+    }
   }
 
   async rebuildVectorStoreAll() {
@@ -84,16 +139,7 @@ class OpenaiService implements IOpenaiService {
   }
 
   async rebuildVectorStore(page: PageDocument) {
-
-    // delete vector store file
-    const files = await this.client.getFileList();
-    files.data.forEach(async(file) => {
-      if (file.filename === `${page._id}.md`) {
-        const res = await this.client.deleteFile(file.id);
-        logger.debug('delete vector store file: ', res);
-      }
-    });
-
+    await this.deleteVectorStoreFile(page);
     await this.createVectorStoreFile([page]);
   }