Просмотр исходного кода

Merge pull request #9663 from weseek/feat/161961-create-vector-store-file-on-page-creation

feat: Create vector store file on page creation
Shun Miyazawa 1 год назад
Родитель
Коммит
cd6fad8bd5

+ 19 - 2
apps/app/src/features/openai/server/models/ai-assistant.ts

@@ -4,11 +4,13 @@ import { type Model, type Document, Schema } from 'mongoose';
 import { getOrCreateModel } from '~/server/util/mongoose-utils';
 
 import { type AiAssistant, AiAssistantShareScope, AiAssistantAccessScope } from '../../interfaces/ai-assistant';
+import { generateGlobPatterns } from '../utils/generate-glob-patterns';
 
 export interface AiAssistantDocument extends AiAssistant, Document {}
 
-type AiAssistantModel = Model<AiAssistantDocument>
-
+interface AiAssistantModel extends Model<AiAssistantDocument> { // Mongoose model type for AiAssistantDocument plus the custom statics declared below
+  findByPagePaths(pagePaths: string[]): Promise<AiAssistantDocument[]>; // resolves the assistants found for the given page paths
+}
 
 /*
  * Schema Definition
@@ -103,4 +105,19 @@ const schema = new Schema<AiAssistantDocument>(
   },
 );
 
+
+/**
+ * Finds the assistants whose `pagePathPatterns` contain at least one of the given page paths,
+ * either verbatim or as one of the glob patterns derived from a path by `generateGlobPatterns`.
+ * The `vectorStore` reference of every returned assistant is populated.
+ *
+ * @param pagePaths page paths to look up
+ * @returns the matching assistants
+ */
+schema.statics.findByPagePaths = async function(pagePaths: string[]): Promise<AiAssistantDocument[]> {
+  const derivedGlobPatterns = pagePaths.flatMap(pagePath => generateGlobPatterns(pagePath));
+
+  // Case 1: Exact match
+  const exactMatchCondition = { pagePathPatterns: { $in: pagePaths } };
+  // Case 2: Glob pattern match
+  const globMatchCondition = { pagePathPatterns: { $in: derivedGlobPatterns } };
+
+  const matchedAssistants = await this
+    .find({ $or: [exactMatchCondition, globMatchCondition] })
+    .populate('vectorStore');
+
+  return matchedAssistants;
+};
+
 export default getOrCreateModel<AiAssistantDocument, AiAssistantModel>('AiAssistant', schema);

+ 90 - 11
apps/app/src/features/openai/server/services/openai.ts

@@ -2,7 +2,7 @@ import assert from 'node:assert';
 import { Readable, Transform } from 'stream';
 import { pipeline } from 'stream/promises';
 
-import type { Lang } from '@growi/core';
+import type { IUser, Ref, Lang } from '@growi/core';
 import {
   PageGrant, getIdForRef, getIdStringForRef, isPopulated, type IUserHasId,
 } from '@growi/core';
@@ -61,7 +61,6 @@ const convertPathPatternsToRegExp = (pagePathPatterns: string[]): Array<string |
   });
 };
 
-
 export interface IOpenaiService {
   getOrCreateThread(userId: string, vectorStoreRelation: VectorStoreDocument, threadId?: string): Promise<OpenAI.Beta.Threads.Thread | undefined>;
   getThreads(vectorStoreRelationId: string): Promise<ThreadRelationDocument[]>
@@ -74,13 +73,14 @@ export interface IOpenaiService {
   getVectorStoreRelation(aiAssistantId: string): Promise<VectorStoreDocument>
   getVectorStoreRelationsByPageIds(pageId: Types.ObjectId[]): Promise<VectorStoreDocument[]>;
   createVectorStoreFile(vectorStoreRelation: VectorStoreDocument, pages: PageDocument[]): Promise<void>;
+  createVectorStoreFileOnPageCreate(pages: PageDocument[]): Promise<void>;
+  updateVectorStoreFileOnPageUpdate(page: HydratedDocument<PageDocument>): Promise<void>;
   deleteVectorStoreFile(vectorStoreRelationId: Types.ObjectId, pageId: Types.ObjectId): Promise<void>;
   deleteVectorStoreFilesByPageIds(pageIds: Types.ObjectId[]): Promise<void>;
   deleteObsoleteVectorStoreFile(limit: number, apiCallInterval: number): Promise<void>; // for CronJob
   // rebuildVectorStoreAll(): Promise<void>;
   // rebuildVectorStore(page: HydratedDocument<PageDocument>): Promise<void>;
   isAiAssistantUsable(aiAssistantId: string, user: IUserHasId): Promise<boolean>;
-  updateVectorStore(page: HydratedDocument<PageDocument>): Promise<void>;
   createAiAssistant(data: Omit<AiAssistant, 'vectorStore'>): Promise<AiAssistantDocument>;
   updateAiAssistant(aiAssistantId: string, data: Omit<AiAssistant, 'vectorStore'>): Promise<AiAssistantDocument>;
   getAccessibleAiAssistants(user: IUserHasId): Promise<AccessibleAiAssistants>
@@ -325,7 +325,7 @@ class OpenaiService implements IOpenaiService {
     // const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
     const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
     const processUploadFile = async(page: HydratedDocument<PageDocument>) => {
-      if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
+      if (page._id != null && page.revision != null) {
         if (isPopulated(page.revision) && page.revision.body.length > 0) {
           const uploadedFile = await this.uploadFile(page._id, page.path, page.revision.body);
           prepareVectorStoreFileRelations(vectorStoreRelation._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
@@ -501,13 +501,92 @@ class OpenaiService implements IOpenaiService {
   //   await pipeline(pagesStream, batchStrem, createVectorStoreFileStream);
   // }
 
-  async updateVectorStore(page: HydratedDocument<PageDocument>) {
-    const vectorStoreRelations = await this.getVectorStoreRelationsByPageIds([page._id]);
-    console.log('vectorStoreRelations', vectorStoreRelations);
-    vectorStoreRelations.forEach(async(vectorStoreRelation) => {
-      await this.deleteVectorStoreFile(vectorStoreRelation._id, page._id);
-      await this.createVectorStoreFile(vectorStoreRelation, [page]);
-    });
+  /**
+   * Narrows `pages` down to the ones the given assistant may use under its access scope.
+   *
+   * - PUBLIC_ONLY: pages with the public grant only
+   * - GROUPS: public pages plus user-group pages granted to a group related to the assistant's owner
+   * - OWNER: everything allowed for GROUPS plus owner-granted pages whose granted users include the owner
+   * - any other scope: no page at all
+   *
+   * @param aiAssistant assistant whose `accessScope` and `owner` drive the filtering
+   * @param pages candidate pages
+   * @returns the subset of `pages` allowed for the assistant
+   */
+  async filterPagesByAccessScope(aiAssistant: AiAssistantDocument, pages: HydratedDocument<PageDocument>[]) {
+    const { accessScope, owner } = aiAssistant;
+
+    // The public-only scope needs no group lookup at all
+    if (accessScope === AiAssistantAccessScope.PUBLIC_ONLY) {
+      return pages.filter(page => page.grant === PageGrant.GRANT_PUBLIC);
+    }
+
+    const isOwnerScope = accessScope === AiAssistantAccessScope.OWNER;
+    if (!isOwnerScope && accessScope !== AiAssistantAccessScope.GROUPS) {
+      return [];
+    }
+
+    // Collect the ids of every (internal and external) user group related to the owner
+    const userGroupIds = await UserGroupRelation.findAllUserGroupIdsRelatedToUser(owner);
+    const externalUserGroupIds = await ExternalUserGroupRelation.findAllUserGroupIdsRelatedToUser(owner);
+    const ownerGroupIdStrings = [...userGroupIds, ...externalUserGroupIds].map(groupId => getIdStringForRef(groupId));
+
+    return pages.filter((page) => {
+      switch (page.grant) {
+        case PageGrant.GRANT_PUBLIC:
+          return true;
+
+        case PageGrant.GRANT_USER_GROUP:
+          return page.grantedGroups.some(grantedGroup => ownerGroupIdStrings.includes(getIdStringForRef(grantedGroup.item)));
+
+        case PageGrant.GRANT_OWNER:
+          // Owner-granted pages are only usable by an assistant with the OWNER scope
+          return isOwnerScope && page.grantedUsers.some(grantedUser => getIdStringForRef(grantedUser) === getIdStringForRef(owner));
+
+        default:
+          return false;
+      }
+    });
+  }
+
+  /**
+   * Creates vector store files for newly created pages.
+   *
+   * Looks up the assistants matching the paths of `pages`, then, for each assistant with a populated
+   * vector store, passes the pages permitted by the assistant's access scope to `createVectorStoreFile`.
+   *
+   * NOTE(review): every page in `pages` is offered to every assistant returned by `findByPagePaths`,
+   * even if only one of the pages matched that assistant's patterns — confirm this is intended for multi-page calls.
+   *
+   * @param pages the created pages
+   */
+  async createVectorStoreFileOnPageCreate(pages: HydratedDocument<PageDocument>[]): Promise<void> {
+    const pagePaths = pages.map(page => page.path);
+    const aiAssistants = await AiAssistantModel.findByPagePaths(pagePaths);
+
+    if (aiAssistants.length === 0) {
+      return;
+    }
+
+    for await (const aiAssistant of aiAssistants) {
+      // Check the vector store first so that no access-scope lookups are spent on an assistant that cannot receive files
+      const vectorStoreRelation = aiAssistant.vectorStore;
+      if (vectorStoreRelation == null || !isPopulated(vectorStoreRelation)) {
+        continue;
+      }
+
+      const pagesToVectorize = await this.filterPagesByAccessScope(aiAssistant, pages);
+
+      logger.debug('--------- createVectorStoreFileOnPageCreate ---------');
+      logger.debug('AccessScopeType of aiAssistant: ', aiAssistant.accessScope);
+      logger.debug('VectorStoreFile pagePath to be created: ', pagesToVectorize.map(pageToVectorize => pageToVectorize.path));
+      logger.debug('-----------------------------------------------------');
+
+      // None of the pages is permitted under this assistant's access scope, so there is nothing to create
+      if (pagesToVectorize.length === 0) {
+        continue;
+      }
+
+      await this.createVectorStoreFile(vectorStoreRelation as VectorStoreDocument, pagesToVectorize);
+    }
+  }
+
+  /**
+   * Refreshes the vector store files of an updated page.
+   *
+   * For each assistant matching the page path that has a populated vector store, the files registered
+   * for the page are deleted first; a new file is then created only if the page is still permitted
+   * under the assistant's access scope.
+   *
+   * @param page the updated page
+   */
+  async updateVectorStoreFileOnPageUpdate(page: HydratedDocument<PageDocument>) {
+    const aiAssistants = await AiAssistantModel.findByPagePaths([page.path]);
+
+    if (aiAssistants.length === 0) {
+      return;
+    }
+
+    for await (const aiAssistant of aiAssistants) {
+      // Check the vector store first so that no access-scope lookups are spent on an assistant that cannot be updated
+      const vectorStoreRelation = aiAssistant.vectorStore;
+      if (vectorStoreRelation == null || !isPopulated(vectorStoreRelation)) {
+        continue;
+      }
+
+      const pagesToVectorize = await this.filterPagesByAccessScope(aiAssistant, [page]);
+
+      logger.debug('--------- updateVectorStoreFileOnPageUpdate ---------');
+      logger.debug('AccessScopeType of aiAssistant: ', aiAssistant.accessScope);
+      logger.debug('PagePath of VectorStoreFile to be deleted: ', page.path);
+      logger.debug('pagePath of VectorStoreFile to be created: ', pagesToVectorize.map(pageToVectorize => pageToVectorize.path));
+      logger.debug('-----------------------------------------------------');
+
+      // Delete BEFORE creating: deleteVectorStoreFile is keyed only by (vector store relation, page),
+      // so running it after the upload would presumably remove the freshly created file as well — verify against deleteVectorStoreFile
+      await this.deleteVectorStoreFile((vectorStoreRelation as VectorStoreDocument)._id, page._id);
+
+      // Do not create a new VectorStoreFile if page is changed to a permission that AiAssistant does not have access to
+      if (pagesToVectorize.length === 0) {
+        continue;
+      }
+
+      await this.createVectorStoreFile(vectorStoreRelation as VectorStoreDocument, pagesToVectorize);
+    }
   }
 
   private async createVectorStoreFileWithStream(vectorStoreRelation: VectorStoreDocument, conditions: mongoose.FilterQuery<PageDocument>): Promise<void> {

+ 48 - 0
apps/app/src/features/openai/server/utils/generate-glob-patterns.spec.ts

@@ -0,0 +1,48 @@
+import { describe, test, expect } from 'vitest';
+
+import { generateGlobPatterns } from './generate-glob-patterns';
+
+describe('generateGlobPatterns', () => {
+  // Two-level path ending with a slash: one pattern per level, nothing for the trailing slash
+  test('generates glob patterns for basic path with trailing slash', () => {
+    expect(generateGlobPatterns('/Sandbox/Bootstrap5/')).toEqual([
+      '/Sandbox/*',
+      '/Sandbox/Bootstrap5/*',
+    ]);
+  });
+
+  // Three-level path ending with a slash
+  test('generates glob patterns for multi-level path with trailing slash', () => {
+    expect(generateGlobPatterns('/user/admin/memo/')).toEqual([
+      '/user/*',
+      '/user/admin/*',
+      '/user/admin/memo/*',
+    ]);
+  });
+
+  // The deepest pattern is built from the path itself even without a trailing slash
+  test('generates glob patterns for path without trailing slash', () => {
+    expect(generateGlobPatterns('/path/to/directory')).toEqual([
+      '/path/*',
+      '/path/to/*',
+      '/path/to/directory/*',
+    ]);
+  });
+
+  // Consecutive slashes must not create empty levels
+  test('handles path with empty segments correctly', () => {
+    expect(generateGlobPatterns('/path//to///dir')).toEqual([
+      '/path/*',
+      '/path/to/*',
+      '/path/to/dir/*',
+    ]);
+  });
+});

+ 28 - 0
apps/app/src/features/openai/server/utils/generate-glob-patterns.ts

@@ -0,0 +1,28 @@
+import { pathUtils } from '@growi/core/dist/utils';
+
+/**
+ * Builds one glob pattern (`<prefix>/*`) per level of a page path,
+ * ordered from the top-level segment down to the full path.
+ * Empty segments (leading, trailing or consecutive slashes) do not produce a level.
+ *
+ * @param path page path, with or without a trailing slash
+ * @returns the glob patterns from shallowest to deepest; empty when the path has no segment
+ *
+ * @example
+ * // Input: '/Sandbox/Bootstrap5/'
+ * // Output: ['/Sandbox/*', '/Sandbox/Bootstrap5/*']
+ *
+ * // Input: '/user/admin/memo/'
+ * // Output: ['/user/*', '/user/admin/*', '/user/admin/memo/*']
+ */
+export const generateGlobPatterns = (path: string): string[] => {
+  // Drop the trailing slash, then keep only the non-empty segments
+  const pathSegments = pathUtils.removeTrailingSlash(path).split('/').filter(segment => segment.length > 0);
+
+  // The pattern for depth N joins the first N segments and appends the wildcard
+  return pathSegments.map((_segment, depth) => `/${pathSegments.slice(0, depth + 1).join('/')}/*`);
+};

+ 1 - 2
apps/app/src/server/routes/apiv3/page/create-page.ts

@@ -205,9 +205,8 @@ export const createPageHandlersFactory: CreatePageHandlersFactory = (crowi) => {
     if (isAiEnabled()) {
       const { getOpenaiService } = await import('~/features/openai/server/services/openai');
       try {
-        // TODO: https://redmine.weseek.co.jp/issues/160334
         const openaiService = getOpenaiService();
-        // await openaiService?.rebuildVectorStore(createdPage);
+        await openaiService?.createVectorStoreFileOnPageCreate([createdPage]);
       }
       catch (err) {
         logger.error('Rebuild vector store failed', err);

+ 1 - 1
apps/app/src/server/routes/apiv3/page/update-page.ts

@@ -122,7 +122,7 @@ export const updatePageHandlersFactory: UpdatePageHandlersFactory = (crowi) => {
       const { getOpenaiService } = await import('~/features/openai/server/services/openai');
       try {
         const openaiService = getOpenaiService();
-        await openaiService?.updateVectorStore(updatedPage);
+        await openaiService?.updateVectorStoreFileOnPageUpdate(updatedPage);
       }
       catch (err) {
         logger.error('Rebuild vector store failed', err);

+ 6 - 10
apps/app/src/server/service/page/index.ts

@@ -1171,12 +1171,10 @@ class PageService implements IPageService {
       );
 
       if (isAiEnabled()) {
-        // TODO: https://redmine.weseek.co.jp/issues/160336
         const { getOpenaiService } = await import('~/features/openai/server/services/openai');
-
-        // Do not await because communication with OpenAI takes time
         const openaiService = getOpenaiService();
-        // openaiService?.createVectorStoreFile([duplicatedTarget]);
+        // Do not await because communication with OpenAI takes time
+        openaiService?.createVectorStoreFileOnPageCreate([duplicatedTarget]);
       }
     }
     this.pageEvent.emit('duplicate', page, user);
@@ -1409,16 +1407,14 @@ class PageService implements IPageService {
     await Revision.insertMany(newRevisions, { ordered: false });
     await this.duplicateTags(pageIdMapping);
 
-    const duplicatedPagesWithPopulatedToShowRevison = await Page
-      .find({ _id: { $in: duplicatedPageIds }, grant: PageGrant.GRANT_PUBLIC }).populate('revision') as PageDocument[];
+    const duplicatedPagesWithPopulatedToShowRevision: HydratedDocument<PageDocument>[] = await Page
+      .find({ _id: { $in: duplicatedPageIds }, grant: PageGrant.GRANT_PUBLIC }).populate('revision');
 
     if (isAiEnabled()) {
-      // TODO: https://redmine.weseek.co.jp/issues/160336
       const { getOpenaiService } = await import('~/features/openai/server/services/openai');
-
-      // Do not await because communication with OpenAI takes time
       const openaiService = getOpenaiService();
-      // openaiService?.createVectorStoreFile(duplicatedPagesWithPopulatedToShowRevison);
+      // Do not await because communication with OpenAI takes time
+      openaiService?.createVectorStoreFileOnPageCreate(duplicatedPagesWithPopulatedToShowRevision);
     }
   }