Quellcode durchsuchen

fix(app): prevent bulk-export cleanup race on shared attachment

When the duplicate-reuse path produces multiple completed jobs that share a
single attachment, the download-expiration cleanup cron ran cleanUp() for
each job concurrently via Promise.allSettled. Each cleanUp called
attachmentService.removeAttachment for the same attachment, and the loser
of the race threw "Attachment not found" / GridFS "File not found". That
rejection excluded the losing job from the deleteMany() set, leaving a
zombie job record whose attachment ref was already gone. Every subsequent
cron tick re-attempted the same removal and produced the same error.

Two coordinated changes:

1. cron-side dedup: in deleteDownloadExpiredExportJobs, pick a single owner
   job per attachment from the expired batch and only run removeAttachment
   on the owner. Non-owner jobs still get their per-job resources cleaned
   up and the job record dropped by deleteMany, but they no longer race
   for the shared attachment.
2. removeAttachment idempotency: treat a missing attachment as a no-op
   instead of throwing, and swallow underlying deleteFile errors while
   still removing the metadata doc. This recovers existing zombie records
   on the next cron tick and adds defence-in-depth for any other concurrent
   caller (the attachment delete API and userSchema.deleteImage both
   pre-check or fire-and-forget, so the relaxed contract is safe for them).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
tomoyuki-t vor 2 Wochen
Ursprung
Commit
ea75fdcc85

+ 147 - 1
apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-clean-up-cron.integ.ts

@@ -36,7 +36,12 @@ vi.mock('./page-bulk-export-job-cron', () => {
 });
 
 describe('PageBulkExportJobCleanUpCronService', () => {
-  const crowi = {} as Crowi;
+  const removeAttachmentMock = vi.fn(() => Promise.resolve());
+  const crowi = {
+    attachmentService: {
+      removeAttachment: removeAttachmentMock,
+    },
+  } as unknown as Crowi;
   // biome-ignore lint/suspicious/noImplicitAnyLet: ignore
   let user;
 
@@ -52,6 +57,7 @@ describe('PageBulkExportJobCleanUpCronService', () => {
 
   beforeEach(async () => {
     await PageBulkExportJob.deleteMany();
+    removeAttachmentMock.mockClear();
   });
 
   describe('deleteExpiredExportJobs', () => {
@@ -177,6 +183,146 @@ describe('PageBulkExportJobCleanUpCronService', () => {
     });
   });
 
+  // Regression coverage for the race condition that left zombie job records
+  // when multiple expired jobs shared a single attachment (the duplicate-reuse
+  // path re-binds an existing attachment to a fresh job). Without the dedup,
+  // the concurrent cleanup loop calls removeAttachment per-sibling and the
+  // loser of the race throws "Attachment not found", which silently drops its
+  // job record out of the deleteMany() set.
+  describe('deleteDownloadExpiredExportJobs (shared attachment)', () => {
+    const sharedAttachmentId = new mongoose.Types.ObjectId();
+    const otherAttachmentId = new mongoose.Types.ObjectId();
+
+    beforeEach(async () => {
+      await configManager.updateConfig(
+        'app:bulkExportDownloadExpirationSeconds',
+        86400,
+      ); // 1 day
+    });
+
+    test('should call removeAttachment exactly once when multiple expired jobs share the same attachment', async () => {
+      // arrange: two expired jobs pointing at the same attachment (the
+      // duplicate-reuse path produces this shape)
+      const expiredAt = new Date(Date.now() - 86400 * 1000 - 1);
+      await PageBulkExportJob.insertMany([
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: sharedAttachmentId,
+        },
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: sharedAttachmentId,
+        },
+      ]);
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert: only one removeAttachment call for the shared attachment, and
+      // both job records are gone (no zombie left behind)
+      expect(removeAttachmentMock).toHaveBeenCalledTimes(1);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(sharedAttachmentId);
+      expect(await PageBulkExportJob.find()).toHaveLength(0);
+    });
+
+    test('should still remove distinct attachments once each when expired jobs reference different attachments', async () => {
+      // arrange: ensure the dedup does not over-merge across distinct attachments
+      const expiredAt = new Date(Date.now() - 86400 * 1000 - 1);
+      await PageBulkExportJob.insertMany([
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: sharedAttachmentId,
+        },
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: expiredAt,
+          attachment: otherAttachmentId,
+        },
+      ]);
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert
+      expect(removeAttachmentMock).toHaveBeenCalledTimes(2);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(sharedAttachmentId);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(otherAttachmentId);
+      expect(await PageBulkExportJob.find()).toHaveLength(0);
+    });
+
+    test('should not call removeAttachment when an unexpired sibling job still references the attachment', async () => {
+      // arrange: one expired + one unexpired sharing the same attachment.
+      // The unexpired sibling protects the attachment from deletion.
+      await PageBulkExportJob.insertMany([
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: new Date(Date.now() - 86400 * 1000 - 1),
+          attachment: sharedAttachmentId,
+        },
+        {
+          user,
+          page: new mongoose.Types.ObjectId(),
+          format: PageBulkExportFormat.md,
+          status: PageBulkExportJobStatus.completed,
+          completedAt: new Date(Date.now()),
+          attachment: sharedAttachmentId,
+        },
+      ]);
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert: attachment retained, only the expired job is gone
+      expect(removeAttachmentMock).not.toHaveBeenCalled();
+      const remaining = await PageBulkExportJob.find();
+      expect(remaining).toHaveLength(1);
+      expect(remaining[0].attachment?.toString()).toBe(
+        sharedAttachmentId.toString(),
+      );
+    });
+
+    test('should delete an expired job whose removeAttachment resolves as a no-op (zombie with dangling attachment ref)', async () => {
+      // arrange: simulate the real removeAttachment idempotent contract — when
+      // the attachment metadata doc is already gone, the call resolves without
+      // throwing. The job record must still be deleteMany()-d.
+      const zombieAttachmentId = new mongoose.Types.ObjectId();
+      await PageBulkExportJob.create({
+        user,
+        page: new mongoose.Types.ObjectId(),
+        format: PageBulkExportFormat.md,
+        status: PageBulkExportJobStatus.completed,
+        completedAt: new Date(Date.now() - 86400 * 1000 - 1),
+        attachment: zombieAttachmentId,
+      });
+
+      // act
+      await pageBulkExportJobCleanUpCronService?.deleteDownloadExpiredExportJobs();
+
+      // assert
+      expect(removeAttachmentMock).toHaveBeenCalledTimes(1);
+      expect(removeAttachmentMock).toHaveBeenCalledWith(zombieAttachmentId);
+      expect(await PageBulkExportJob.find()).toHaveLength(0);
+    });
+  });
+
   describe('deleteFailedExportJobs', () => {
     // arrange
     const jobId1 = new mongoose.Types.ObjectId();

+ 22 - 0
apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-clean-up-cron.ts

@@ -85,9 +85,31 @@ class PageBulkExportJobCleanUpCronService extends CronService {
       completedAt: { $lt: thresholdDate },
     });
 
+    // Pick one "owner" job per shared attachment. When multiple jobs in this
+    // batch reference the same attachment (e.g. the duplicate-reuse path that
+    // re-binds an existing attachment to a new job), only the owner is allowed
+    // to call removeAttachment. Without this, the concurrent cleanup loop below
+    // would issue removeAttachment for the same attachment from each sibling,
+    // and the loser of the race would throw "Attachment not found" / GridFS
+    // "File not found", leaving its job record undeleted as a permanent zombie.
+    const attachmentOwnerJobIds = new Map<string, string>();
+    for (const job of downloadExpiredExportJobs) {
+      const attachmentKey = job.attachment?.toString();
+      if (attachmentKey == null) continue;
+      if (!attachmentOwnerJobIds.has(attachmentKey)) {
+        attachmentOwnerJobIds.set(attachmentKey, job._id.toString());
+      }
+    }
+
     const cleanUp = async (job: PageBulkExportJobDocument) => {
       await pageBulkExportJobCronService?.cleanUpExportJobResources(job);
 
+      const attachmentKey = job.attachment?.toString();
+      if (attachmentKey == null) return;
+      const isAttachmentOwner =
+        attachmentOwnerJobIds.get(attachmentKey) === job._id.toString();
+      if (!isAttachmentOwner) return;
+
       const hasSameAttachmentAndDownloadNotExpired =
         await PageBulkExportJob.findOne({
           attachment: job.attachment,

+ 54 - 0
apps/app/src/server/service/attachment.spec.ts

@@ -0,0 +1,54 @@
+import type Crowi from '../crowi';
+import { Attachment } from '../models/attachment';
+import { AttachmentService } from './attachment';
+
+// Locks down the idempotent contract that the bulk-export cleanup cron relies
+// on (page-bulk-export-job-clean-up-cron.ts): removing an attachment whose
+// metadata doc is already gone must resolve without throwing, otherwise the
+// surrounding Promise.allSettled rejects the cleanup and leaves the parent job
+// record undeleted as a zombie.
+describe('AttachmentService.removeAttachment', () => {
+  test('should resolve without throwing when the attachment is already gone', async () => {
+    const findByIdSpy = vi
+      .spyOn(Attachment, 'findById')
+      // biome-ignore lint/suspicious/noExplicitAny: Mongoose query shape is irrelevant to the contract under test
+      .mockResolvedValue(null as any);
+    const deleteFile = vi.fn();
+    const crowi = {
+      fileUploadService: { deleteFile },
+    } as unknown as Crowi;
+    const service = new AttachmentService(crowi);
+
+    await expect(
+      service.removeAttachment('this-id-does-not-exist'),
+    ).resolves.toBeUndefined();
+
+    expect(deleteFile).not.toHaveBeenCalled();
+    findByIdSpy.mockRestore();
+  });
+
+  test('should still drop the metadata doc when the underlying file delete throws (race with concurrent remover)', async () => {
+    const attachmentRemove = vi.fn().mockResolvedValue(undefined);
+    const fakeAttachment = { _id: 'some-id', remove: attachmentRemove };
+    const findByIdSpy = vi
+      .spyOn(Attachment, 'findById')
+      // biome-ignore lint/suspicious/noExplicitAny: Mongoose query shape is irrelevant to the contract under test
+      .mockResolvedValue(fakeAttachment as any);
+    const deleteFile = vi
+      .fn()
+      .mockRejectedValue(new Error('File not found for id some-id'));
+    const crowi = {
+      fileUploadService: { deleteFile },
+      detachHandlers: [],
+    } as unknown as Crowi;
+    const service = new AttachmentService(crowi);
+    // detachHandlers lives on the service instance, not crowi
+    service.detachHandlers = [];
+
+    await expect(service.removeAttachment('some-id')).resolves.toBeUndefined();
+
+    expect(deleteFile).toHaveBeenCalledTimes(1);
+    expect(attachmentRemove).toHaveBeenCalledTimes(1);
+    findByIdSpy.mockRestore();
+  });
+});

+ 20 - 2
apps/app/src/server/service/attachment.ts

@@ -147,11 +147,29 @@ export class AttachmentService implements IAttachmentService {
     const { fileUploadService } = this.crowi;
     const attachment = await Attachment.findById(attachmentId);
 
+    // No-op when the attachment is already gone. Two flows rely on this:
+    // (1) bulk-export cleanup may double-call this for sibling jobs that share
+    //     an attachment (the per-job loop runs concurrently), and (2) GridFS
+    //     chunks can be removed out-of-band, leaving a dangling metadata doc.
+    // Throwing here would re-surface those as cron errors on every tick.
     if (attachment == null) {
-      throw new Error(`Attachment not found: ${attachmentId}`);
+      logger.debug(
+        `removeAttachment: attachment already gone, skipping: ${attachmentId}`,
+      );
+      return;
     }
 
-    await fileUploadService.deleteFile(attachment);
+    try {
+      await fileUploadService.deleteFile(attachment);
+    } catch (err) {
+      // Same idempotency contract applies to the underlying file store: if the
+      // chunk was already deleted (e.g. a racing remove call), treat it as a
+      // no-op and continue to drop the metadata doc.
+      logger.warn(
+        `removeAttachment: deleteFile failed for ${attachmentId}, continuing to drop metadata`,
+        err,
+      );
+    }
     await attachment.remove();
 
     const detachedHandlerPromises = this.detachHandlers.map((handler) => {