Просмотр исходного кода

Merge pull request #9906 from weseek/feat/save-attachment-to-vector-store

feat: Save attachment to VectorStore
mergify[bot] 10 месяцев назад
Родитель
Сommit
2ef53c8165

+ 3 - 2
apps/app/package.json

@@ -73,6 +73,7 @@
     "@godaddy/terminus": "^4.9.0",
     "@google-cloud/storage": "^5.8.5",
     "@growi/core": "workspace:^",
+    "@growi/pdf-converter-client": "workspace:^",
     "@growi/pluginkit": "workspace:^",
     "@growi/presentation": "workspace:^",
     "@growi/preset-templates": "workspace:^",
@@ -82,20 +83,20 @@
     "@growi/remark-growi-directive": "workspace:^",
     "@growi/remark-lsx": "workspace:^",
     "@growi/slack": "workspace:^",
-    "@growi/pdf-converter-client": "workspace:^",
     "@keycloak/keycloak-admin-client": "^18.0.0",
     "@opentelemetry/api": "^1.9.0",
     "@opentelemetry/auto-instrumentations-node": "^0.55.1",
     "@opentelemetry/exporter-metrics-otlp-grpc": "^0.57.0",
     "@opentelemetry/exporter-trace-otlp-grpc": "^0.57.0",
     "@opentelemetry/resources": "^1.28.0",
-    "@opentelemetry/semantic-conventions": "^1.28.0",
     "@opentelemetry/sdk-metrics": "^1.28.0",
     "@opentelemetry/sdk-node": "^0.57.0",
     "@opentelemetry/sdk-trace-node": "^1.28.0",
+    "@opentelemetry/semantic-conventions": "^1.28.0",
     "@slack/web-api": "^6.2.4",
     "@slack/webhook": "^6.0.0",
     "@types/async": "^3.2.24",
+    "@types/multer": "^1.4.12",
     "JSONStream": "^1.3.5",
     "archiver": "^5.3.0",
     "array.prototype.flatmap": "^1.2.2",

+ 55 - 17
apps/app/src/features/openai/server/models/vector-store-file-relation.ts

@@ -7,6 +7,7 @@ import { getOrCreateModel } from '~/server/util/mongoose-utils';
 export interface VectorStoreFileRelation {
   vectorStoreRelationId: mongoose.Types.ObjectId;
   page: mongoose.Types.ObjectId;
+  attachment?: mongoose.Types.ObjectId;
   fileIds: string[];
   isAttachedToVectorStore: boolean;
 }
@@ -19,10 +20,21 @@ interface VectorStoreFileRelationModel extends Model<VectorStoreFileRelation> {
 }
 
 export const prepareVectorStoreFileRelations = (
-    vectorStoreRelationId: Types.ObjectId, page: Types.ObjectId, fileId: string, relationsMap: Map<string, VectorStoreFileRelation>,
+    vectorStoreRelationId: Types.ObjectId,
+    page: Types.ObjectId,
+    fileId: string,
+    relationsMap: Map<string, VectorStoreFileRelation>,
+    attachment?: Types.ObjectId,
 ): Map<string, VectorStoreFileRelation> => {
-  const pageIdStr = page.toHexString();
-  const existingData = relationsMap.get(pageIdStr);
+
+  const key = (() => {
+    if (attachment == null) {
+      return page.toHexString();
+    }
+    return page.toHexString() + attachment.toHexString();
+  })();
+
+  const existingData = relationsMap.get(key);
 
   // If the data exists, add the fileId to the fileIds array
   if (existingData != null) {
@@ -30,11 +42,12 @@ export const prepareVectorStoreFileRelations = (
   }
   // If the data doesn't exist, create a new one and add it to the map
   else {
-    relationsMap.set(pageIdStr, {
+    relationsMap.set(key, {
       vectorStoreRelationId,
       page,
       fileIds: [fileId],
       isAttachedToVectorStore: false,
+      attachment,
     });
   }
 
@@ -52,6 +65,10 @@ const schema = new Schema<VectorStoreFileRelationDocument, VectorStoreFileRelati
     ref: 'Page',
     required: true,
   },
+  attachment: {
+    type: Schema.Types.ObjectId,
+    ref: 'Attachment',
+  },
   fileIds: [{
     type: String,
     required: true,
@@ -64,22 +81,43 @@ const schema = new Schema<VectorStoreFileRelationDocument, VectorStoreFileRelati
 });
 
 // define unique compound index
-schema.index({ vectorStoreRelationId: 1, page: 1 }, { unique: true });
+schema.index({ vectorStoreRelationId: 1, page: 1, attachment: 1 }, { unique: true });
 
 schema.statics.upsertVectorStoreFileRelations = async function(vectorStoreFileRelations: VectorStoreFileRelation[]): Promise<void> {
-  await this.bulkWrite(
-    vectorStoreFileRelations.map((data) => {
-      return {
-        updateOne: {
-          filter: { page: data.page, vectorStoreRelationId: data.vectorStoreRelationId },
-          update: {
-            $addToSet: { fileIds: { $each: data.fileIds } },
-          },
-          upsert: true,
+  const upsertOps = vectorStoreFileRelations
+    .filter(data => data.attachment == null)
+    .map(data => ({
+      updateOne: {
+        filter: {
+          page: data.page,
+          vectorStoreRelationId: data.vectorStoreRelationId,
+          attachment: { $exists: false },
         },
-      };
-    }),
-  );
+        update: {
+          $addToSet: { fileIds: { $each: data.fileIds } },
+        },
+        upsert: true,
+      },
+    }));
+
+  const insertOps = vectorStoreFileRelations
+    .filter(data => data.attachment != null)
+    .map(data => ({
+      insertOne: {
+        document: {
+          vectorStoreRelationId: data.vectorStoreRelationId,
+          page: data.page,
+          attachment: data.attachment,
+          fileIds: data.fileIds,
+          isAttachedToVectorStore: data.isAttachedToVectorStore,
+        },
+      },
+    }));
+
+  const bulkOps = [...upsertOps, ...insertOps];
+  if (bulkOps.length > 0) {
+    await this.bulkWrite(bulkOps);
+  }
 };
 
 // Used when attached to VectorStore

+ 4 - 0
apps/app/src/features/openai/server/services/client-delegator/azure-openai-client-delegator.ts

@@ -78,6 +78,10 @@ export class AzureOpenaiClientDelegator implements IOpenaiClientDelegator {
     return this.client.files.create({ file, purpose: 'assistants' });
   }
 
+  async createVectorStoreFile(vectorStoreId: string, fileId: string): Promise<OpenAI.VectorStores.Files.VectorStoreFile> {
+    return this.client.vectorStores.files.create(vectorStoreId, { file_id: fileId });
+  }
+
   async createVectorStoreFileBatch(vectorStoreId: string, fileIds: string[]): Promise<OpenAI.VectorStores.FileBatches.VectorStoreFileBatch> {
     return this.client.vectorStores.fileBatches.create(vectorStoreId, { file_ids: fileIds });
   }

+ 1 - 0
apps/app/src/features/openai/server/services/client-delegator/interfaces.ts

@@ -13,6 +13,7 @@ export interface IOpenaiClientDelegator {
   createVectorStore(name: string): Promise<OpenAI.VectorStores.VectorStore>
   deleteVectorStore(vectorStoreId: string): Promise<OpenAI.VectorStores.VectorStoreDeleted>
   uploadFile(file: Uploadable): Promise<OpenAI.Files.FileObject>
+  createVectorStoreFile(vectorStoreId: string, fileId: string): Promise<OpenAI.VectorStores.Files.VectorStoreFile>
   createVectorStoreFileBatch(vectorStoreId: string, fileIds: string[]): Promise<OpenAI.VectorStores.FileBatches.VectorStoreFileBatch>
   deleteFile(fileId: string): Promise<OpenAI.Files.FileDeleted>;
   chatCompletion(body: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming): Promise<OpenAI.Chat.Completions.ChatCompletion>

+ 4 - 0
apps/app/src/features/openai/server/services/client-delegator/openai-client-delegator.ts

@@ -79,6 +79,10 @@ export class OpenaiClientDelegator implements IOpenaiClientDelegator {
     return this.client.files.create({ file, purpose: 'assistants' });
   }
 
+  async createVectorStoreFile(vectorStoreId: string, fileId: string): Promise<OpenAI.VectorStores.Files.VectorStoreFile> {
+    return this.client.vectorStores.files.create(vectorStoreId, { file_id: fileId });
+  }
+
   async createVectorStoreFileBatch(vectorStoreId: string, fileIds: string[]): Promise<OpenAI.VectorStores.FileBatches.VectorStoreFileBatch> {
     return this.client.vectorStores.fileBatches.create(vectorStoreId, { file_ids: fileIds });
   }

+ 221 - 23
apps/app/src/features/openai/server/services/openai.ts

@@ -1,5 +1,6 @@
+import fs from 'fs';
 import assert from 'node:assert';
-import { Readable, Transform } from 'stream';
+import { Readable, Transform, Writable } from 'stream';
 import { pipeline } from 'stream/promises';
 
 import type {
@@ -22,6 +23,8 @@ import VectorStoreFileRelationModel, {
   type VectorStoreFileRelation,
   prepareVectorStoreFileRelations,
 } from '~/features/openai/server/models/vector-store-file-relation';
+import type Crowi from '~/server/crowi';
+import type { IAttachmentDocument, IAttachmentModel } from '~/server/models/attachment';
 import type { PageDocument, PageModel } from '~/server/models/page';
 import UserGroupRelation from '~/server/models/user-group-relation';
 import { configManager } from '~/server/service/config-manager';
@@ -40,6 +43,7 @@ import { removeGlobPath } from '../../utils/remove-glob-path';
 import AiAssistantModel, { type AiAssistantDocument } from '../models/ai-assistant';
 import { convertMarkdownToHtml } from '../utils/convert-markdown-to-html';
 import { generateGlobPatterns } from '../utils/generate-glob-patterns';
+import { isVectorStoreCompatible } from '../utils/is-vector-store-compatible';
 
 import { getClient } from './client-delegator';
 import { openaiApiErrorHandler } from './openai-api-error-handler';
@@ -75,10 +79,8 @@ export interface IOpenaiService {
   deleteObsoletedVectorStoreRelations(): Promise<void> // for CronJob
   deleteVectorStore(vectorStoreRelationId: string): Promise<void>;
   getMessageData(threadId: string, lang?: Lang, options?: MessageListParams): Promise<OpenAI.Beta.Threads.Messages.MessagesPage>;
-  createVectorStoreFile(vectorStoreRelation: VectorStoreDocument, pages: PageDocument[]): Promise<void>;
   createVectorStoreFileOnPageCreate(pages: PageDocument[]): Promise<void>;
   updateVectorStoreFileOnPageUpdate(page: HydratedDocument<PageDocument>): Promise<void>;
-  deleteVectorStoreFile(vectorStoreRelationId: Types.ObjectId, pageId: Types.ObjectId): Promise<void>;
   deleteVectorStoreFilesByPageIds(pageIds: Types.ObjectId[]): Promise<void>;
   deleteObsoleteVectorStoreFile(limit: number, apiCallInterval: number): Promise<void>; // for CronJob
   isAiAssistantUsable(aiAssistantId: string, user: IUserHasId): Promise<boolean>;
@@ -89,12 +91,24 @@ export interface IOpenaiService {
 }
 class OpenaiService implements IOpenaiService {
 
+  private crowi: Crowi;
+
+  constructor(crowi: Crowi) {
+    this.crowi = crowi;
+
+    this.createVectorStoreFileOnUploadAttachment = this.createVectorStoreFileOnUploadAttachment.bind(this);
+    crowi.attachmentService.addAttachHandler(this.createVectorStoreFileOnUploadAttachment);
+
+    this.deleteVectorStoreFileOnDeleteAttachment = this.deleteVectorStoreFileOnDeleteAttachment.bind(this);
+    crowi.attachmentService.addDetachHandler(this.deleteVectorStoreFileOnDeleteAttachment);
+  }
+
   private get client() {
     const openaiServiceType = configManager.getConfig('openai:serviceType');
     return getClient({ openaiServiceType });
   }
 
-  async generateThreadTitle(message: string): Promise<string | null> {
+  private async generateThreadTitle(message: string): Promise<string | null> {
     const systemMessage = [
       'Create a brief title (max 5 words) from your message.',
       'Respond in the same language the user uses in their input.',
@@ -155,7 +169,7 @@ class OpenaiService implements IOpenaiService {
     }
   }
 
-  async updateThreads(aiAssistantId: string, vectorStoreId: string): Promise<void> {
+  private async updateThreads(aiAssistantId: string, vectorStoreId: string): Promise<void> {
     const threadRelations = await this.getThreadsByAiAssistantId(aiAssistantId);
     for await (const threadRelation of threadRelations) {
       try {
@@ -231,7 +245,7 @@ class OpenaiService implements IOpenaiService {
   }
 
 
-  async getVectorStoreRelationsByPageIds(pageIds: Types.ObjectId[]): Promise<VectorStoreDocument[]> {
+  private async getVectorStoreRelationsByPageIds(pageIds: Types.ObjectId[]): Promise<VectorStoreDocument[]> {
     const pipeline = [
       // Stage 1: Match documents with the given pageId
       {
@@ -309,6 +323,28 @@ class OpenaiService implements IOpenaiService {
     return uploadedFile;
   }
 
+  private async uploadFileForAttachment(fileName: string, readStream?: NodeJS.ReadableStream, filePath?: string): Promise<OpenAI.Files.FileObject> {
+    const streamSource: NodeJS.ReadableStream = (() => {
+      if (readStream != null) {
+        return readStream;
+      }
+
+      if (filePath != null) {
+        return fs.createReadStream(filePath);
+      }
+
+      throw new Error('readStream and filePath are both null');
+    })();
+
+    const uploadableFile = await toFile(
+      streamSource,
+      fileName,
+    );
+
+    const uploadedFile = await this.client.uploadFile(uploadableFile);
+    return uploadedFile;
+  }
+
   async deleteVectorStore(vectorStoreRelationId: string): Promise<void> {
     const vectorStoreDocument: VectorStoreDocument | null = await VectorStoreModel.findOne({ _id: vectorStoreRelationId, isDeleted: false });
     if (vectorStoreDocument == null) {
@@ -326,14 +362,56 @@ class OpenaiService implements IOpenaiService {
     }
   }
 
-  async createVectorStoreFile(vectorStoreRelation: VectorStoreDocument, pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
-    // const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
+  private async createVectorStoreFileWithStreamForAttachment(
+      pageId: Types.ObjectId, vectorStoreRelationId: Types.ObjectId, vectorStoreFileRelationsMap: VectorStoreFileRelationsMap,
+  ): Promise<void> {
+
+    const Attachment = mongoose.model<HydratedDocument<IAttachmentDocument>, IAttachmentModel>('Attachment');
+    const attachmentsCursor = Attachment.find({ page: pageId }).cursor();
+    const batchStream = createBatchStream(BATCH_SIZE);
+
+    const uploadFileStreamForAttachment = new Writable({
+      objectMode: true,
+      write: async(attachments: HydratedDocument<IAttachmentDocument>[], _encoding, callback) => {
+        for await (const attachment of attachments) {
+          try {
+            if (!isVectorStoreCompatible(attachment.originalName, attachment.fileFormat)) {
+              continue;
+            }
+            const readStream = await this.crowi.fileUploadService.findDeliveryFile(attachment);
+            const uploadedFileForAttachment = await this.uploadFileForAttachment(attachment.originalName, readStream);
+            prepareVectorStoreFileRelations(
+              vectorStoreRelationId, pageId, uploadedFileForAttachment.id, vectorStoreFileRelationsMap, attachment._id,
+            );
+          }
+          catch (err) {
+            logger.error(err);
+          }
+        }
+        callback();
+      },
+      final: (callback) => {
+        logger.debug('Finished uploading attachments');
+        callback();
+      },
+    });
+
+    await pipeline(attachmentsCursor, batchStream, uploadFileStreamForAttachment);
+  }
+
+  private async createVectorStoreFile(
+      vectorStoreRelation: VectorStoreDocument, pages: Array<HydratedDocument<PageDocument>>, ignoreAttachments = false,
+  ): Promise<void> {
     const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
     const processUploadFile = async(page: HydratedDocument<PageDocument>) => {
       if (page._id != null && page.revision != null) {
         if (isPopulated(page.revision) && page.revision.body.length > 0) {
           const uploadedFile = await this.uploadFile(page.revision.body, page);
           prepareVectorStoreFileRelations(vectorStoreRelation._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
+
+          if (!ignoreAttachments) {
+            await this.createVectorStoreFileWithStreamForAttachment(page._id, vectorStoreRelation._id, vectorStoreFileRelationsMap);
+          }
           return;
         }
 
@@ -341,6 +419,10 @@ class OpenaiService implements IOpenaiService {
         if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) {
           const uploadedFile = await this.uploadFile(pagePopulatedToShowRevision.revision.body, page);
           prepareVectorStoreFileRelations(vectorStoreRelation._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
+
+          if (!ignoreAttachments) {
+            await this.createVectorStoreFileWithStreamForAttachment(page._id, vectorStoreRelation._id, vectorStoreFileRelationsMap);
+          }
         }
       }
     };
@@ -413,7 +495,57 @@ class OpenaiService implements IOpenaiService {
     await VectorStoreModel.deleteMany({ _id: { $nin: currentVectorStoreRelationIds }, isDeleted: true });
   }
 
-  async deleteVectorStoreFile(vectorStoreRelationId: Types.ObjectId, pageId: Types.ObjectId, apiCallInterval?: number): Promise<void> {
+  private async deleteVectorStoreFileForAttachment(vectorStoreFileRelation: VectorStoreFileRelation): Promise<void> {
+    if (vectorStoreFileRelation.attachment == null) {
+      return;
+    }
+
+    const deleteAllAttachmentVectorStoreFileRelations = async() => {
+      await VectorStoreFileRelationModel.deleteMany({ attachment: vectorStoreFileRelation.attachment });
+    };
+
+    try {
+      // Delete entities in VectorStoreFile
+      const fileId = vectorStoreFileRelation.fileIds[0];
+      const deleteFileResponse = await this.client.deleteFile(fileId);
+      logger.debug('Delete vector store file (attachment) ', deleteFileResponse);
+
+      // Delete related VectorStoreFileRelation document
+      const attachmentId = vectorStoreFileRelation.attachment;
+      if (attachmentId != null) {
+        await deleteAllAttachmentVectorStoreFileRelations();
+      }
+    }
+    catch (err) {
+      logger.error(err);
+      await openaiApiErrorHandler(err, {
+        notFoundError: () => deleteAllAttachmentVectorStoreFileRelations(),
+      });
+    }
+  }
+
+  private async deleteVectorStoreFile(
+      vectorStoreRelationId: Types.ObjectId, pageId: Types.ObjectId, ignoreAttachments = false, apiCallInterval?: number,
+  ): Promise<void> {
+
+    if (!ignoreAttachments) {
+      // Get all VectorStoreFIleDocument (attachments) associated with the page
+      const vectorStoreFileRelationsForAttachment = await VectorStoreFileRelationModel.find({
+        vectorStoreRelationId, page: pageId, attachment: { $exists: true },
+      });
+
+      if (vectorStoreFileRelationsForAttachment.length !== 0) {
+        for await (const vectorStoreFileRelation of vectorStoreFileRelationsForAttachment) {
+          try {
+            await this.deleteVectorStoreFileForAttachment(vectorStoreFileRelation);
+          }
+          catch (err) {
+            logger.error(err);
+          }
+        }
+      }
+    }
+
     // Delete vector store file and delete vector store file relation
     const vectorStoreFileRelation = await VectorStoreFileRelationModel.findOne({ vectorStoreRelationId, page: pageId });
     if (vectorStoreFileRelation == null) {
@@ -476,7 +608,7 @@ class OpenaiService implements IOpenaiService {
     // Delete obsolete VectorStoreFile
     for await (const vectorStoreFileRelation of obsoleteVectorStoreFileRelations) {
       try {
-        await this.deleteVectorStoreFile(vectorStoreFileRelation.vectorStoreRelationId, vectorStoreFileRelation.page, apiCallInterval);
+        await this.deleteVectorStoreFile(vectorStoreFileRelation.vectorStoreRelationId, vectorStoreFileRelation.page, false, apiCallInterval);
       }
       catch (err) {
         logger.error(err);
@@ -484,10 +616,25 @@ class OpenaiService implements IOpenaiService {
     }
   }
 
-  async filterPagesByAccessScope(aiAssistant: AiAssistantDocument, pages: HydratedDocument<PageDocument>[]) {
-    const isPublicPage = (page :HydratedDocument<PageDocument>) => page.grant === PageGrant.GRANT_PUBLIC;
+  private async deleteVectorStoreFileOnDeleteAttachment(attachmentId: string) {
+    const vectorStoreFileRelation = await VectorStoreFileRelationModel.findOne({ attachment: attachmentId });
+    if (vectorStoreFileRelation == null) {
+      return;
+    }
+
+    try {
+      await this.deleteVectorStoreFileForAttachment(vectorStoreFileRelation);
+    }
+    catch (err) {
+      logger.error(err);
+    }
+  }
+
+
+  private async filterPagesByAccessScope(aiAssistant: AiAssistantDocument, pages: HydratedDocument<PageDocument>[]) {
+    const isPublicPage = (page: HydratedDocument<PageDocument>) => page.grant === PageGrant.GRANT_PUBLIC;
 
-    const isUserGroupAccessible = (page :HydratedDocument<PageDocument>, ownerUserGroupIds: string[]) => {
+    const isUserGroupAccessible = (page: HydratedDocument<PageDocument>, ownerUserGroupIds: string[]) => {
       if (page.grant !== PageGrant.GRANT_USER_GROUP) return false;
       return page.grantedGroups.some(group => ownerUserGroupIds.includes(getIdStringForRef(group.item)));
     };
@@ -576,8 +723,58 @@ class OpenaiService implements IOpenaiService {
       logger.debug('-----------------------------------------------------');
 
       // Do not create a new VectorStoreFile if page is changed to a permission that AiAssistant does not have access to
-      await this.createVectorStoreFile(vectorStoreRelation as VectorStoreDocument, pagesToVectorize);
-      await this.deleteVectorStoreFile((vectorStoreRelation as VectorStoreDocument)._id, page._id);
+      await this.deleteVectorStoreFile(
+        (vectorStoreRelation as VectorStoreDocument)._id,
+        page._id,
+        true, // ignoreAttachments = true
+      );
+      await this.createVectorStoreFile(
+        vectorStoreRelation as VectorStoreDocument,
+        pagesToVectorize,
+        true, // ignoreAttachments = true
+      );
+    }
+  }
+
+  private async createVectorStoreFileOnUploadAttachment(
+      pageId: string, attachment: HydratedDocument<IAttachmentDocument>, file: Express.Multer.File,
+  ): Promise<void> {
+    if (!isVectorStoreCompatible(file.originalname, file.mimetype)) {
+      return;
+    }
+
+    const Page = mongoose.model<HydratedDocument<PageDocument>, PageModel>('Page');
+    const page = await Page.findById(pageId);
+    if (page == null) {
+      return;
+    }
+
+    const aiAssistants = await this.findAiAssistantByPagePath([page.path], { shouldPopulateVectorStore: true });
+    if (aiAssistants.length === 0) {
+      return;
+    }
+
+    const uploadedFile = await this.uploadFileForAttachment(file.originalname, undefined, file.path);
+    logger.debug('Uploaded file', uploadedFile);
+
+    for await (const aiAssistant of aiAssistants) {
+      const pagesToVectorize = await this.filterPagesByAccessScope(aiAssistant, [page]);
+      if (pagesToVectorize.length === 0) {
+        continue;
+      }
+
+      const vectorStoreRelation = aiAssistant.vectorStore;
+      if (vectorStoreRelation == null || !isPopulated(vectorStoreRelation)) {
+        continue;
+      }
+
+      await this.client.createVectorStoreFile(vectorStoreRelation.vectorStoreId, uploadedFile.id);
+
+      const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
+      prepareVectorStoreFileRelations(vectorStoreRelation._id as Types.ObjectId, page._id, uploadedFile.id, vectorStoreFileRelationsMap, attachment._id);
+      const vectorStoreFileRelations = Array.from(vectorStoreFileRelationsMap.values());
+
+      await VectorStoreFileRelationModel.upsertVectorStoreFileRelations(vectorStoreFileRelations);
     }
   }
 
@@ -594,7 +791,7 @@ class OpenaiService implements IOpenaiService {
       objectMode: true,
       async transform(chunk: HydratedDocument<PageDocument>[], encoding, callback) {
         try {
-          logger.debug('Search results of page paths', chunk.map(page => page.path));
+          logger.debug('Target page path for VectorStoreFile generation: ', chunk.map(page => page.path));
           await createVectorStoreFile(vectorStoreRelation, chunk);
           this.push(chunk);
           callback();
@@ -905,7 +1102,7 @@ class OpenaiService implements IOpenaiService {
     return totalPageCount > limitLearnablePageCountPerAssistant;
   }
 
-  async findAiAssistantByPagePath(
+  private async findAiAssistantByPagePath(
       pagePaths: string[], options?: { shouldPopulateOwner?: boolean, shouldPopulateVectorStore?: boolean },
   ): Promise<AiAssistantDocument[]> {
 
@@ -935,15 +1132,16 @@ class OpenaiService implements IOpenaiService {
 }
 
 let instance: OpenaiService;
-export const getOpenaiService = (): IOpenaiService | undefined => {
-  if (instance != null) {
-    return instance;
-  }
-
+export const initializeOpenaiService = (crowi: Crowi): void => {
   const aiEnabled = configManager.getConfig('app:aiEnabled');
   const openaiServiceType = configManager.getConfig('openai:serviceType');
   if (aiEnabled && openaiServiceType != null && OpenaiServiceTypes.includes(openaiServiceType)) {
-    instance = new OpenaiService();
+    instance = new OpenaiService(crowi);
+  }
+};
+
+export const getOpenaiService = (): IOpenaiService | undefined => {
+  if (instance != null) {
     return instance;
   }
 

+ 46 - 0
apps/app/src/features/openai/server/utils/is-vector-store-compatible.ts

@@ -0,0 +1,46 @@
+import path from 'path';
+
+// See: https://platform.openai.com/docs/assistants/tools/file-search#supported-files
+const supportedFormats = {
+  '.c': 'text/x-c',
+  '.cpp': 'text/x-c++',
+  '.cs': 'text/x-csharp',
+  '.css': 'text/css',
+  '.doc': 'application/msword',
+  '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  '.go': 'text/x-golang',
+  '.html': 'text/html',
+  '.java': 'text/x-java',
+  '.js': 'text/javascript',
+  '.json': 'application/json',
+  '.md': 'text/markdown',
+  '.pdf': 'application/pdf',
+  '.php': 'text/x-php',
+  '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+  '.py': ['text/x-python', 'text/x-script.python'],
+  '.rb': 'text/x-ruby',
+  '.sh': 'application/x-sh',
+  '.tex': 'text/x-tex',
+  '.ts': 'application/typescript',
+  '.txt': 'text/plain',
+} as const;
+
+type SupportedExtension = keyof typeof supportedFormats;
+
+export const isVectorStoreCompatible = (originalName: string, mimeType: string): boolean => {
+  // Get extension
+  const extension = path.extname(originalName).toLowerCase();
+
+  // Check if the file extension is supported
+  if (!(extension in supportedFormats)) {
+    return false;
+  }
+
+  // Get Mime Type (At this point, file extension is confirmed to be supported, so type-safe access is possible)
+  const supportedMimeType = supportedFormats[extension as SupportedExtension];
+
+  // Check if the mimeType is supported
+  return Array.isArray(supportedMimeType)
+    ? supportedMimeType.includes(mimeType)
+    : supportedMimeType === mimeType;
+};

+ 47 - 0
apps/app/src/migrations/20250522105040-delete-old-index-for-vector-store-file-relation.js

@@ -0,0 +1,47 @@
+import mongoose from 'mongoose';
+
+import { getMongoUri, mongoOptions } from '~/server/util/mongoose-utils';
+import loggerFactory from '~/utils/logger';
+
+const logger = loggerFactory('growi:migrate:vector-store-file-relation-index-migration');
+
+async function dropIndexIfExists(db, collectionName, indexName) {
+  // check existence of the collection
+  const items = await db.listCollections({ name: collectionName }, { nameOnly: true }).toArray();
+  if (items.length === 0) {
+    return;
+  }
+
+  const collection = await db.collection(collectionName);
+  if (await collection.indexExists(indexName)) {
+    await collection.dropIndex(indexName);
+  }
+}
+
+module.exports = {
+  async up(db) {
+    logger.info('Apply migration');
+
+    await mongoose.connect(getMongoUri(), mongoOptions);
+
+    // Drop old index
+    await dropIndexIfExists(db, 'vectorstorefilerelations', 'vectorStoreRelationId_1_page_1');
+
+    // Create index
+    const collection = mongoose.connection.collection('vectorstorefilerelations');
+    await collection.createIndex({ vectorStoreRelationId: 1, page: 1, attachment: 1 }, { unique: true });
+
+  },
+  async down(db) {
+    logger.info('Rollback migration');
+
+    await mongoose.connect(getMongoUri(), mongoOptions);
+
+    // Drop new index
+    await dropIndexIfExists(db, 'vectorstorefilerelations', 'vectorStoreRelationId_1_page_1_attachment_1');
+
+    // Recreate old index
+    const collection = mongoose.connection.collection('vectorstorefilerelations');
+    await collection.createIndex({ vectorStoreRelationId: 1, page: 1 }, { unique: true });
+  },
+};

+ 10 - 1
apps/app/src/server/crowi/index.js

@@ -11,6 +11,7 @@ import next from 'next';
 import { KeycloakUserGroupSyncService } from '~/features/external-user-group/server/service/keycloak-user-group-sync';
 import { LdapUserGroupSyncService } from '~/features/external-user-group/server/service/ldap-user-group-sync';
 import { startCronIfEnabled as startOpenaiCronIfEnabled } from '~/features/openai/server/services/cron';
+import { initializeOpenaiService } from '~/features/openai/server/services/openai';
 import { checkPageBulkExportJobInProgressCronService } from '~/features/page-bulk-export/server/service/check-page-bulk-export-job-in-progress-cron';
 import instanciatePageBulkExportJobCleanUpCronService, {
   pageBulkExportJobCleanUpCronService,
@@ -177,7 +178,6 @@ Crowi.prototype.init = async function() {
   this.models = await setupModelsDependentOnCrowi(this);
   await this.setupConfigManager();
   await this.setupSessionConfig();
-  this.setupCron();
 
   // setup messaging services
   await this.setupS2sMessagingService();
@@ -223,8 +223,13 @@ Crowi.prototype.init = async function() {
     // depends on passport service
     this.setupExternalAccountService(),
     this.setupExternalUserGroupSyncService(),
+
+    // depends on AttachmentService
+    this.setupOpenaiService(),
   ]);
 
+  this.setupCron();
+
   await normalizeData();
 };
 
@@ -809,4 +814,8 @@ Crowi.prototype.setupExternalUserGroupSyncService = function() {
   this.keycloakUserGroupSyncService = new KeycloakUserGroupSyncService(this.s2sMessagingService, this.socketIoService);
 };
 
+Crowi.prototype.setupOpenaiService = function() {
+  initializeOpenaiService(this);
+};
+
 export default Crowi;

+ 3 - 2
apps/app/src/server/routes/apiv3/attachment.js

@@ -339,8 +339,9 @@ module.exports = (crowi) => {
    *          500:
    *            $ref: '#/components/responses/500'
    */
-  router.post('/', accessTokenParser, loginRequiredStrictly, excludeReadOnlyUser, uploads.single('file'), autoReap,
+  router.post('/', accessTokenParser, loginRequiredStrictly, excludeReadOnlyUser, uploads.single('file'),
     validator.retrieveAddAttachment, apiV3FormValidator, addActivity,
+    // Removed autoReap middleware to use file data in asynchronous processes. Instead, implemented file deletion after asynchronous processes complete
     async(req, res) => {
 
       const pageId = req.body.page_id;
@@ -360,7 +361,7 @@ module.exports = (crowi) => {
           return res.apiv3Err(`Forbidden to access to the page '${page.id}'`);
         }
 
-        const attachment = await attachmentService.createAttachment(file, req.user, pageId, AttachmentType.WIKI_PAGE);
+        const attachment = await attachmentService.createAttachment(file, req.user, pageId, AttachmentType.WIKI_PAGE, () => autoReap(req, res, () => {}));
 
         const result = {
           page: serializePageSecurely(page),

+ 60 - 8
apps/app/src/server/service/attachment.js

@@ -10,11 +10,23 @@ const mongoose = require('mongoose');
 // eslint-disable-next-line @typescript-eslint/no-unused-vars
 const logger = loggerFactory('growi:service:AttachmentService');
 
+const createReadStream = (filePath) => {
+  return fs.createReadStream(filePath, {
+    flags: 'r', encoding: null, fd: null, mode: '0666', autoClose: true,
+  });
+};
+
 /**
  * the service class for Attachment and file-uploader
  */
 class AttachmentService {
 
+  /** @type {Array<(pageId: string, attachment: Attachment, file: Express.Multer.File) => Promise<void>>} */
+  attachHandlers = [];
+
+  /** @type {Array<(attachmentId: string) => Promise<void>>} */
+  detachHandlers = [];
+
   /** @type {import('~/server/crowi').default} Crowi instance */
   crowi;
 
@@ -23,7 +35,7 @@ class AttachmentService {
     this.crowi = crowi;
   }
 
-  async createAttachment(file, user, pageId = null, attachmentType) {
+  async createAttachment(file, user, pageId = null, attachmentType, disposeTmpFileCallback) {
     const { fileUploadService } = this.crowi;
 
     // check limit
@@ -32,22 +44,36 @@ class AttachmentService {
       throw new Error(res.errorMessage);
     }
 
-    const fileStream = fs.createReadStream(file.path, {
-      flags: 'r', encoding: null, fd: null, mode: '0666', autoClose: true,
-    });
-
     // create an Attachment document and upload file
     let attachment;
+    let readStreamForCreateAttachmentDocument;
     try {
+      readStreamForCreateAttachmentDocument = createReadStream(file.path);
       attachment = Attachment.createWithoutSave(pageId, user, file.originalname, file.mimetype, file.size, attachmentType);
-      await fileUploadService.uploadAttachment(fileStream, attachment);
+      await fileUploadService.uploadAttachment(readStreamForCreateAttachmentDocument, attachment);
       await attachment.save();
+
+      const attachHandlerPromises = this.attachHandlers.map((handler) => {
+        return handler(pageId, attachment, file);
+      });
+
+      // Do not await, run in background
+      Promise.all(attachHandlerPromises)
+        .catch((err) => {
+          logger.error('Error while executing attach handler', err);
+        })
+        .finally(() => {
+          disposeTmpFileCallback?.(file);
+        });
     }
     catch (err) {
-      // delete temporary file
-      fs.unlink(file.path, (err) => { if (err) { logger.error('Error while deleting tmp file.') } });
+      logger.error('Error while creating attachment', err);
+      disposeTmpFileCallback?.(file);
       throw err;
     }
+    finally {
+      readStreamForCreateAttachmentDocument.destroy();
+    }
 
     return attachment;
   }
@@ -78,6 +104,16 @@ class AttachmentService {
     await fileUploadService.deleteFile(attachment);
     await attachment.remove();
 
+    const detachedHandlerPromises = this.detachHandlers.map((handler) => {
+      return handler(attachment._id);
+    });
+
+    // Do not await, run in background
+    Promise.all(detachedHandlerPromises)
+      .catch((err) => {
+        logger.error('Error while executing detached handler', err);
+      });
+
     return;
   }
 
@@ -88,6 +124,22 @@ class AttachmentService {
     return count >= 1;
   }
 
+  /**
+   * Register a handler that will be called after attachment creation
+   * @param {(pageId: string, attachment: Attachment, file: Express.Multer.File) => Promise<void>} handler
+   */
+  addAttachHandler(handler) {
+    this.attachHandlers.push(handler);
+  }
+
+  /**
+   * Register a handler that will be called before attachment deletion
+   * @param {(attachmentId: string) => Promise<void>} handler
+   */
+  addDetachHandler(handler) {
+    this.detachHandlers.push(handler);
+  }
+
 }
 
 module.exports = AttachmentService;

+ 11 - 8
pnpm-lock.yaml

@@ -304,6 +304,9 @@ importers:
       '@types/async':
         specifier: ^3.2.24
         version: 3.2.24
+      '@types/multer':
+        specifier: ^1.4.12
+        version: 1.4.12
       JSONStream:
         specifier: ^1.3.5
         version: 1.3.5
@@ -14901,7 +14904,7 @@ snapshots:
       '@aws-sdk/client-sso-oidc': 3.600.0
       '@aws-sdk/client-sts': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)
       '@aws-sdk/core': 3.598.0
-      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)
+      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))
       '@aws-sdk/middleware-host-header': 3.598.0
       '@aws-sdk/middleware-logger': 3.598.0
       '@aws-sdk/middleware-recursion-detection': 3.598.0
@@ -15009,7 +15012,7 @@ snapshots:
       '@aws-crypto/sha256-js': 5.2.0
       '@aws-sdk/client-sts': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)
       '@aws-sdk/core': 3.598.0
-      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)
+      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))
       '@aws-sdk/middleware-host-header': 3.598.0
       '@aws-sdk/middleware-logger': 3.598.0
       '@aws-sdk/middleware-recursion-detection': 3.598.0
@@ -15185,7 +15188,7 @@ snapshots:
       '@aws-crypto/sha256-js': 5.2.0
       '@aws-sdk/client-sso-oidc': 3.600.0
       '@aws-sdk/core': 3.598.0
-      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)
+      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))
       '@aws-sdk/middleware-host-header': 3.598.0
       '@aws-sdk/middleware-logger': 3.598.0
       '@aws-sdk/middleware-recursion-detection': 3.598.0
@@ -15296,7 +15299,7 @@ snapshots:
     transitivePeerDependencies:
       - aws-crt
 
-  '@aws-sdk/credential-provider-ini@3.598.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)':
+  '@aws-sdk/credential-provider-ini@3.598.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))':
     dependencies:
       '@aws-sdk/client-sts': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)
       '@aws-sdk/credential-provider-env': 3.598.0
@@ -15331,11 +15334,11 @@ snapshots:
     transitivePeerDependencies:
       - aws-crt
 
-  '@aws-sdk/credential-provider-node@3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)':
+  '@aws-sdk/credential-provider-node@3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))':
     dependencies:
       '@aws-sdk/credential-provider-env': 3.598.0
       '@aws-sdk/credential-provider-http': 3.598.0
-      '@aws-sdk/credential-provider-ini': 3.598.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)
+      '@aws-sdk/credential-provider-ini': 3.598.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))
       '@aws-sdk/credential-provider-process': 3.598.0
       '@aws-sdk/credential-provider-sso': 3.598.0(@aws-sdk/client-sso-oidc@3.600.0)
       '@aws-sdk/credential-provider-web-identity': 3.598.0(@aws-sdk/client-sts@3.600.0)
@@ -15418,8 +15421,8 @@ snapshots:
       '@aws-sdk/credential-provider-cognito-identity': 3.600.0
       '@aws-sdk/credential-provider-env': 3.598.0
       '@aws-sdk/credential-provider-http': 3.598.0
-      '@aws-sdk/credential-provider-ini': 3.598.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)
-      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0)
+      '@aws-sdk/credential-provider-ini': 3.598.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))
+      '@aws-sdk/credential-provider-node': 3.600.0(@aws-sdk/client-sso-oidc@3.600.0)(@aws-sdk/client-sts@3.600.0(@aws-sdk/client-sso-oidc@3.600.0))
       '@aws-sdk/credential-provider-process': 3.598.0
       '@aws-sdk/credential-provider-sso': 3.598.0(@aws-sdk/client-sso-oidc@3.600.0)
       '@aws-sdk/credential-provider-web-identity': 3.598.0(@aws-sdk/client-sts@3.600.0)