Просмотр исходного кода

Enabled to create a specialized vectorStore

Shun Miyazawa 1 год назад
Родитель
Коммит
d703468a51
1 изменённый файл: 60 добавлено и 9 удалено
  1. 60 9
      apps/app/src/features/openai/server/services/openai.ts

+ 60 - 9
apps/app/src/features/openai/server/services/openai.ts

@@ -3,6 +3,8 @@ import { Readable, Transform } from 'stream';
 import { pipeline } from 'stream/promises';
 
 import { PageGrant, isPopulated } from '@growi/core';
+import { addTrailingSlash, normalizePath } from '@growi/core/dist/utils/path-utils';
+import escapeStringRegexp from 'escape-string-regexp';
 import type { HydratedDocument, Types } from 'mongoose';
 import mongoose from 'mongoose';
 import type OpenAI from 'openai';
@@ -42,7 +44,7 @@ export interface IOpenaiService {
   getOrCreateVectorStoreForPublicScope(): Promise<VectorStoreDocument>;
   deleteExpiredThreads(limit: number, apiCallInterval: number): Promise<void>; // for CronJob
   deleteObsolatedVectorStoreRelations(): Promise<void> // for CronJob
-  createVectorStoreFile(pages: PageDocument[]): Promise<void>;
+  createVectorStoreFile(vectorStore: VectorStoreDocument, pages: PageDocument[]): Promise<void>;
   deleteVectorStoreFile(vectorStoreRelationId: Types.ObjectId, pageId: Types.ObjectId): Promise<void>;
   deleteObsoleteVectorStoreFile(limit: number, apiCallInterval: number): Promise<void>; // for CronJob
   rebuildVectorStoreAll(): Promise<void>;
@@ -145,6 +147,22 @@ class OpenaiService implements IOpenaiService {
     return newVectorStoreDocument;
   }
 
+  /**
+   * Creates a vector store via the OpenAI client and persists a PUBLIC-scoped VectorStoreModel document for it.
+   * Failures propagate unchanged so callers receive the original error (type, message and stack).
+   */
+  private async createVectorStore(): Promise<VectorStoreDocument> {
+    // TODO: fix argument
+    const newVectorStore = await this.client.createVectorStore(VectorStoreScopeType.PUBLIC);
+
+    const newVectorStoreDocument = await VectorStoreModel.create({
+      scopeType: VectorStoreScopeType.PUBLIC,
+      vectorStoreId: newVectorStore.id,
+    }) as VectorStoreDocument;
+
+    return newVectorStoreDocument;
+  }
+
   // TODO: https://redmine.weseek.co.jp/issues/156643
   // private async uploadFileByChunks(pageId: Types.ObjectId, body: string, vectorStoreFileRelationsMap: VectorStoreFileRelationsMap) {
   //   const chunks = await splitMarkdownIntoChunks(body, 'gpt-4o');
@@ -183,8 +201,8 @@ class OpenaiService implements IOpenaiService {
     }
   }
 
-  async createVectorStoreFile(pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
-    const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
+  async createVectorStoreFile(vectorStore: VectorStoreDocument, pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
+    // const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
     const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
     const processUploadFile = async(page: HydratedDocument<PageDocument>) => {
       if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
@@ -359,17 +377,50 @@ class OpenaiService implements IOpenaiService {
   }
 
   async createAiAssistant(data: Omit<AiAssistant, 'vectorStore'>): Promise<AiAssistantDocument> {
+    // 1. Build page query conditions: a pattern ending in '/*' becomes an anchored prefix regex, any other pattern an exact match on its normalized path
+    const conditions: Array<{path: string | RegExp}> = data.pagePathPatterns.map((path) => {
+      if (path.endsWith('/*')) {
+        const basePathWithoutGlob = path.slice(0, -2); // drop the trailing '/*'
+        const pathWithTrailingSlash = addTrailingSlash(basePathWithoutGlob);
+        const startsPattern = escapeStringRegexp(pathWithTrailingSlash); // escape regex metacharacters so the path is matched literally
+
+        return { path: new RegExp(`^${startsPattern}`) };
+      }
+      return { path: normalizePath(path) };
+    });
 
+    // 2. Stream the matching pages (revision populated) in batches and prepare the per-batch upload transform
     const Page = mongoose.model<HydratedDocument<PageDocument>, PageModel>('Page');
-    const { PageQueryBuilder } = Page;
-    const builder = new PageQueryBuilder(Page.find(), false); // includeEmpty = false
+    const pagesStream = Page.find({ $or: conditions }) // NOTE(review): confirm behavior when pagePathPatterns is empty ($or: [])
+      .populate('revision')
+      .cursor({ batchSize: BATCH_SIZE });
+    const batchStream = createBatchStream(BATCH_SIZE);
 
-    builder.addConditionToListByPathsArrayWithGlob(data.pagePathPatterns);
+    const vectorStore = await this.createVectorStore(); // called unconditionally: each assistant gets its own vector store
 
-    const pages = await builder.query.exec();
+    const createVectorStoreFile = this.createVectorStoreFile.bind(this); // bound because 'this' inside transform() below refers to the Transform stream
+    const createVectorStoreFileStream = new Transform({
+      objectMode: true,
+      async transform(chunk: HydratedDocument<PageDocument>[], encoding, callback) {
+        try {
+          await createVectorStoreFile(vectorStore, chunk);
+          this.push(chunk); // forward the batch downstream
+          callback();
+        }
+        catch (error) {
+          callback(error); // report the failure to the stream instead of throwing
+        }
+      },
+    });
+
+    // 3. Run the pipeline: page cursor -> batches -> vector store file upload
+    await pipeline(pagesStream, batchStream, createVectorStoreFileStream);
+
+    // 4. Create the AI Assistant document referencing the new vector store (TODO)
+    const aiAssistant = await AiAssistantModel.create({
+      ...data, vectorStore,
+    });
 
-    const dumyVectorStoreId = '676e0d9863442b736e7ecf09';
-    const aiAssistant = await AiAssistantModel.create({ ...data, vectorStore: dumyVectorStoreId });
     return aiAssistant;
   }