소스 검색

Merge pull request #11229 from growilabs/fix/153352-184094-bulk-export-attachment-removal-race

fix: Prevent bulk-export cleanup race on shared attachment
Yuki Takei 3 일 전
부모
커밋
62ddc370b0

+ 151 - 3
apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-clean-up-cron.integ.ts

@@ -1,4 +1,5 @@
 import mongoose from 'mongoose';
+import { mock } from 'vitest-mock-extended';
 
 import type Crowi from '~/server/crowi';
 import { configManager } from '~/server/service/config-manager';
@@ -36,9 +37,15 @@ vi.mock('./page-bulk-export-job-cron', () => {
 });
 
 describe('PageBulkExportJobCleanUpCronService', () => {
-  const crowi = {} as Crowi;
-  // biome-ignore lint/suspicious/noImplicitAnyLet: ignore
-  let user;
+  const removeAttachmentMock = vi.fn(() => Promise.resolve());
+  const crowi = mock<Crowi>({
+    attachmentService: {
+      removeAttachment: removeAttachmentMock,
+    },
+  });
+  let user: mongoose.HydratedDocument<
+    mongoose.InferSchemaType<typeof userSchema>
+  >;
 
   beforeAll(async () => {
     await configManager.loadConfigs();
@@ -52,6 +59,7 @@ describe('PageBulkExportJobCleanUpCronService', () => {
 
   beforeEach(async () => {
     await PageBulkExportJob.deleteMany();
+    removeAttachmentMock.mockClear();
   });
 
   describe('deleteExpiredExportJobs', () => {
@@ -177,6 +185,146 @@ describe('PageBulkExportJobCleanUpCronService', () => {
     });
   });
 
+  // Regression coverage for the race condition that left zombie job records
+  // when multiple expired jobs shared a single attachment (the duplicate-reuse
+  // path re-binds an existing attachment to a fresh job). Without the dedup,
+  // the concurrent cleanup loop calls removeAttachment per-sibling and the
+  // loser of the race throws "Attachment not found", which silently drops its
+  // job record out of the deleteMany() set.
+  describe('deleteDownloadExpiredExportJobs (shared attachment)', () => {
+    const sharedAttachmentId = new mongoose.Types.ObjectId();
+    const otherAttachmentId = new mongoose.Types.ObjectId();
+
+    beforeEach(async () => {
+      await configManager.updateConfig(
+        'app:bulkExportDownloadExpirationSeconds',
+        86400,
+      ); // 1 day
+    });
+
+    test('should call removeAttachment exactly once when multiple expired jobs share the same attachment', async () => {
+      // arrange: two expired jobs pointing at the same attachment (the
+      // duplicate-reuse path produces this shape)
+      const expiredAt = new Date(Date.now() - 86400 * 1000 - 1);
+      await PageBulkExportJob.insertMany([
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: sharedAttachmentId,
+        },
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: sharedAttachmentId,
+        },
+      ]);
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert: only one removeAttachment call for the shared attachment, and
+      // both job records are gone (no zombie left behind)
+      expect(removeAttachmentMock).toHaveBeenCalledTimes(1);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(sharedAttachmentId);
+      expect(await PageBulkExportJob.find()).toHaveLength(0);
+    });
+
+    test('should still remove distinct attachments once each when expired jobs reference different attachments', async () => {
+      // arrange: ensure the dedup does not over-merge across distinct attachments
+      const expiredAt = new Date(Date.now() - 86400 * 1000 - 1);
+      await PageBulkExportJob.insertMany([
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: sharedAttachmentId,
+        },
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: otherAttachmentId,
+        },
+      ]);
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert
+      expect(removeAttachmentMock).toHaveBeenCalledTimes(2);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(sharedAttachmentId);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(otherAttachmentId);
+      expect(await PageBulkExportJob.find()).toHaveLength(0);
+    });
+
+    test('should not call removeAttachment when an unexpired sibling job still references the attachment', async () => {
+      // arrange: one expired + one unexpired sharing the same attachment.
+      // The unexpired sibling protects the attachment from deletion.
+      await PageBulkExportJob.insertMany([
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: new Date(Date.now() - 86400 * 1000 - 1),
+          attachment: sharedAttachmentId,
+        },
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: new Date(Date.now()),
+          attachment: sharedAttachmentId,
+        },
+      ]);
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert: attachment retained, only the expired job is gone
+      expect(removeAttachmentMock).not.toHaveBeenCalled();
+      const remaining = await PageBulkExportJob.find();
+      expect(remaining).toHaveLength(1);
+      expect(remaining[0].attachment?.toString()).toBe(
+        sharedAttachmentId.toString(),
+      );
+    });
+
+    test('should delete an expired job whose removeAttachment resolves as a no-op (zombie with dangling attachment ref)', async () => {
+      // arrange: simulate the real removeAttachment idempotent contract — when
+      // the attachment metadata doc is already gone, the call resolves without
+      // throwing. The job record must still be deleteMany()-d.
+      const zombieAttachmentId = new mongoose.Types.ObjectId();
+      await PageBulkExportJob.create({
+        user,
+        page: new mongoose.Types.ObjectId(),
+        format: PageBulkExportFormat.md,
+        status: PageBulkExportJobStatus.completed,
+        completedAt: new Date(Date.now() - 86400 * 1000 - 1),
+        attachment: zombieAttachmentId,
+      });
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert
+      expect(removeAttachmentMock).toHaveBeenCalledTimes(1);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(zombieAttachmentId);
+      expect(await PageBulkExportJob.find()).toHaveLength(0);
+    });
+  });
+
   describe('deleteFailedExportJobs', () => {
     // arrange
     const jobId1 = new mongoose.Types.ObjectId();

+ 22 - 0
apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-clean-up-cron.ts

@@ -85,9 +85,31 @@ class PageBulkExportJobCleanUpCronService extends CronService {
       completedAt: { $lt: thresholdDate },
     });
 
+    // Pick one "owner" job per shared attachment. When multiple jobs in this
+    // batch reference the same attachment (e.g. the duplicate-reuse path that
+    // re-binds an existing attachment to a new job), only the owner is allowed
+    // to call removeAttachment. Without this, the concurrent cleanup loop below
+    // would issue removeAttachment for the same attachment from each sibling,
+    // and the loser of the race would throw "Attachment not found" / GridFS
+    // "File not found", leaving its job record undeleted as a permanent zombie.
+    const attachmentOwnerJobIds = new Map<string, string>();
+    for (const job of downloadExpiredExportJobs) {
+      const attachmentKey = job.attachment?.toString();
+      if (attachmentKey == null) continue;
+      if (!attachmentOwnerJobIds.has(attachmentKey)) {
+        attachmentOwnerJobIds.set(attachmentKey, job._id.toString());
+      }
+    }
+
     const cleanUp = async (job: PageBulkExportJobDocument) => {
       await pageBulkExportJobCronService?.cleanUpExportJobResources(job);
 
+      const attachmentKey = job.attachment?.toString();
+      if (attachmentKey == null) return;
+      const isAttachmentOwner =
+        attachmentOwnerJobIds.get(attachmentKey) === job._id.toString();
+      if (!isAttachmentOwner) return;
+
       const hasSameAttachmentAndDownloadNotExpired =
         await PageBulkExportJob.findOne({
           attachment: job.attachment,

+ 59 - 0
apps/app/src/server/service/attachment.spec.ts

@@ -0,0 +1,59 @@
+import { mock } from 'vitest-mock-extended';
+
+import type Crowi from '../crowi';
+import type { IAttachmentDocument } from '../models/attachment';
+import { Attachment } from '../models/attachment';
+import { AttachmentService } from './attachment';
+
+// Locks down two contracts of removeAttachment:
+// 1. Missing metadata doc is a no-op (the bulk-export cleanup cron relies on
+//    this to self-heal zombie job records without throwing).
+// 2. A genuine file-store failure propagates, so callers like the attachment
+//    delete API surface it instead of dropping the metadata doc and stranding
+//    an orphan blob.
+describe('AttachmentService.removeAttachment', () => {
+  test('should resolve without throwing when the attachment is already gone', async () => {
+    const findByIdSpy = vi
+      .spyOn(Attachment, 'findById')
+      .mockResolvedValue(null);
+    const deleteFile = vi.fn();
+    const crowi = mock<Crowi>({
+      fileUploadService: { deleteFile },
+    });
+    const service = new AttachmentService(crowi);
+
+    await expect(
+      service.removeAttachment('this-id-does-not-exist'),
+    ).resolves.toBeUndefined();
+
+    expect(deleteFile).not.toHaveBeenCalled();
+    findByIdSpy.mockRestore();
+  });
+
+  test('should propagate the error and not drop the metadata doc when the file store fails', async () => {
+    const attachmentRemove = vi.fn().mockResolvedValue(undefined);
+    const fakeAttachment = mock<IAttachmentDocument>({
+      remove: attachmentRemove,
+    });
+    const findByIdSpy = vi
+      .spyOn(Attachment, 'findById')
+      .mockResolvedValue(fakeAttachment);
+    const deleteFile = vi
+      .fn()
+      .mockRejectedValue(new Error('S3 is temporarily unavailable'));
+    const crowi = mock<Crowi>({
+      fileUploadService: { deleteFile },
+    });
+    const service = new AttachmentService(crowi);
+    service.detachHandlers = [];
+
+    await expect(service.removeAttachment('some-id')).rejects.toThrow(
+      'S3 is temporarily unavailable',
+    );
+
+    expect(deleteFile).toHaveBeenCalledTimes(1);
+    // metadata doc must survive so the blob stays referenceable for retry
+    expect(attachmentRemove).not.toHaveBeenCalled();
+    findByIdSpy.mockRestore();
+  });
+});

+ 15 - 1
apps/app/src/server/service/attachment.ts

@@ -147,10 +147,24 @@ export class AttachmentService implements IAttachmentService {
     const { fileUploadService } = this.crowi;
     const attachment = await Attachment.findById(attachmentId);
 
+    // No-op when the metadata doc is already gone. The bulk-export cleanup cron
+    // relies on this to self-heal: a job whose attachment was already removed by
+    // a previous tick (or by a concurrent remover that won an unsynchronized
+    // cross-process race) resolves cleanly here and gets deleted instead of
+    // lingering as a zombie record. Throwing would re-surface it every tick.
     if (attachment == null) {
-      throw new Error(`Attachment not found: ${attachmentId}`);
+      logger.debug(
+        `removeAttachment: attachment already gone, skipping: ${attachmentId}`,
+      );
+      return;
     }
 
+    // Intentionally NOT swallowing deleteFile errors. A genuine file-store
+    // failure (S3/GridFS outage, permission error) must propagate so callers
+    // such as the attachment delete API surface it instead of dropping the
+    // metadata doc and stranding an unreferenceable orphan blob. "File already
+    // gone" is not an error path here: the underlying stores already no-op it
+    // (see gridfs deleteFile, which warns and returns when the file is missing).
     await fileUploadService.deleteFile(attachment);
     await attachment.remove();