// Source: weseek__growi (mirror of https://github.com/weseek/growi)
							import fs from 'node:fs';
import path from 'node:path';
import { Readable, Writable } from 'node:stream';
import { pipeline as pipelinePromise } from 'node:stream/promises';

import { OnInit } from '@tsed/common';
import { Service } from '@tsed/di';
import { Logger } from '@tsed/logger';
import { Cluster } from 'puppeteer-cluster';

/**
 * One HTML page handed from the HTML readable stream to the PDF writable stream.
 */
interface PageInfo {
  /** Content of the HTML file, read as UTF-8 text. */
  htmlString: string;
  /** Path of the HTML file the content was read from. */
  htmlFilePath: string;
}

/** Union of the property value types of an object type. */
type ValuesOf<T> = T[keyof T];

/**
 * Statuses that callers may pass to `registerOrUpdateJob`.
 */
export const JobStatusSharedWithGrowi = {
  HTML_EXPORT_IN_PROGRESS: 'HTML_EXPORT_IN_PROGRESS',
  HTML_EXPORT_DONE: 'HTML_EXPORT_DONE',
  FAILED: 'FAILED',
} as const;
export type JobStatusSharedWithGrowi = ValuesOf<
  typeof JobStatusSharedWithGrowi
>;

/**
 * Every status a job can hold inside this service: the shared statuses plus
 * `PDF_EXPORT_DONE`, which is only set by the service itself.
 */
export const JobStatus = {
  ...JobStatusSharedWithGrowi,
  PDF_EXPORT_DONE: 'PDF_EXPORT_DONE',
} as const;
export type JobStatus = ValuesOf<typeof JobStatus>;

/**
 * Bookkeeping kept per job in the service's job list.
 */
interface JobInfo {
  /** Point in time after which the job is treated as expired and removed. */
  expirationDate: Date;
  /** Current status of the job. */
  status: JobStatus;
  /**
   * HTML readable stream of the conversion pass currently running, if any.
   * It is destroyed when the job fails or expires.
   */
  currentStream?: Readable;
}

/**
 * Converts the HTML files of page bulk export jobs into PDF files.
 *
 * For every registered job a background loop periodically scans the job's HTML
 * directory, renders each file to PDF through a puppeteer cluster, writes the
 * PDF next to the job output root, and removes the converted HTML file.
 */
@Service()
class PdfConvertService implements OnInit {
  private puppeteerCluster: Cluster | undefined;

  private maxConcurrency = 1;

  /** Number of retries after the first failed conversion attempt. */
  private convertRetryLimit = 5;

  private tmpOutputRootDir = '/tmp/page-bulk-export';

  private tmpHtmlDir = `${this.tmpOutputRootDir}/html`;

  private jobList: {
    [key: string]: JobInfo;
  } = {};

  constructor(private readonly logger: Logger) {}

  async $onInit(): Promise<void> {
    await this.initPuppeteerCluster();
  }

  /**
   * Register or update job inside jobList with given jobId, expirationDate, and status.
   * If job is new, start reading html files and convert them to pdf.
   * The conversion loop runs in the background; this method does not wait for it.
   * @param jobId PageBulkExportJob ID
   * @param expirationDate expiration date of job
   * @param status status of job
   * @param appId application ID for GROWI.cloud
   */
  async registerOrUpdateJob(
    jobId: string,
    expirationDate: Date,
    status: JobStatusSharedWithGrowi,
    appId?: number,
  ): Promise<void> {
    const isJobNew = !(jobId in this.jobList);

    if (isJobNew) {
      this.jobList[jobId] = { expirationDate, status };
    } else {
      const jobInfo = this.jobList[jobId];
      jobInfo.expirationDate = expirationDate;

      // A finished or failed job keeps its terminal status
      if (!this.isJobCompleted(jobId)) {
        jobInfo.status = status;
      }
    }

    if (status === JobStatus.FAILED) {
      // Abort the conversion pass that is currently running, if any
      this.jobList[jobId].currentStream?.destroy(new Error('job failed'));
    }

    if (isJobNew && status !== JobStatus.FAILED) {
      // Intentionally not awaited: the loop handles its own errors and
      // reports failure through the job status.
      void this.readHtmlAndConvertToPdfUntilFinish(jobId, appId);
    }
  }

  /**
   * Get job status
   * @param jobId id of PageBulkExportJob
   * @returns job status; FAILED when the job is not in the job list
   */
  getJobStatus(jobId: string): JobStatus {
    if (!(jobId in this.jobList)) return JobStatus.FAILED;
    return this.jobList[jobId].status;
  }

  /**
   * Clean up job list by removing expired jobs, finished jobs, and failed jobs.
   * A conversion pass still running for a removed job is aborted.
   */
  cleanUpJobList(): void {
    const now = new Date();
    for (const [jobId, job] of Object.entries(this.jobList)) {
      if (now > job.expirationDate || this.isJobCompleted(jobId)) {
        job.currentStream?.destroy(new Error('job expired'));
        delete this.jobList[jobId];
      }
    }
  }

  /**
   * Close puppeteer cluster after waiting for it to become idle.
   */
  async closePuppeteerCluster(): Promise<void> {
    if (this.puppeteerCluster == null) {
      this.logger.info('No puppeteer cluster running for closure');
      return;
    }

    this.logger.info('Closing puppeteer cluster...');
    await this.puppeteerCluster.idle();
    await this.puppeteerCluster.close();
  }

  /**
   * Check whether a job needs no further processing.
   * @param jobId PageBulkExportJob ID
   * @returns true when the job is unknown, finished, or failed
   */
  private isJobCompleted(jobId: string): boolean {
    const job = this.jobList[jobId];
    if (job == null) return true;
    return (
      job.status === JobStatus.PDF_EXPORT_DONE ||
      job.status === JobStatus.FAILED
    );
  }

  /**
   * Read html files from shared fs path, convert them to pdf, and save them to shared fs path.
   * Repeat this until all html files are converted to pdf or job fails.
   * This method never rejects; any failure is logged and recorded as FAILED on the job.
   * @param jobId PageBulkExportJob ID
   * @param appId application ID for GROWI.cloud
   */
  private async readHtmlAndConvertToPdfUntilFinish(
    jobId: string,
    appId?: number,
  ): Promise<void> {
    while (!this.isJobCompleted(jobId)) {
      // eslint-disable-next-line no-await-in-loop
      await new Promise((resolve) => setTimeout(resolve, 10 * 1000));

      // The job may have been removed or marked as failed while waiting
      if (this.isJobCompleted(jobId)) break;

      // Hold on to the job object so that a concurrent removal from jobList
      // cannot cause a dereference of undefined below.
      const job = this.jobList[jobId];

      try {
        if (new Date() > job.expirationDate) {
          throw new Error('Job expired');
        }

        const htmlReadable = this.getHtmlReadable(jobId, appId);
        const pdfWritable = this.getPdfWritable();
        job.currentStream = htmlReadable;

        // eslint-disable-next-line no-await-in-loop
        await pipelinePromise(htmlReadable, pdfWritable);
        job.currentStream = undefined;
      } catch (err) {
        this.logger.error('Failed to convert html to pdf', err);
        job.status = JobStatus.FAILED;
        job.currentStream?.destroy(new Error('Failed to convert html to pdf'));
        break;
      }
    }
  }

  /**
   * Get readable stream that reads html files from shared fs path.
   * The file list is a snapshot taken when the stream is created. When the
   * snapshot is empty and the job status is HTML_EXPORT_DONE, the job is
   * marked as PDF_EXPORT_DONE.
   * @param jobId PageBulkExportJob ID
   * @param appId application ID for GROWI.cloud
   * @returns readable stream emitting one PageInfo object per html file
   */
  private getHtmlReadable(jobId: string, appId?: number): Readable {
    const jobHtmlDir = path.join(
      this.tmpHtmlDir,
      appId?.toString() ?? '',
      jobId,
    );
    const htmlFileEntries = fs
      .readdirSync(jobHtmlDir, { recursive: true, withFileTypes: true })
      .filter((entry) => entry.isFile());
    let index = 0;

    const jobList = this.jobList;

    return new Readable({
      objectMode: true,
      async read() {
        try {
          if (index >= htmlFileEntries.length) {
            const job = jobList[jobId];
            if (
              job != null &&
              job.status === JobStatus.HTML_EXPORT_DONE &&
              htmlFileEntries.length === 0
            ) {
              job.status = JobStatus.PDF_EXPORT_DONE;
            }
            this.push(null);
            return;
          }

          const entry = htmlFileEntries[index];
          // Advance before awaiting so the same entry is never picked twice
          index += 1;

          const htmlFilePath = path.join(entry.parentPath, entry.name);
          const htmlString = await fs.promises.readFile(htmlFilePath, 'utf-8');

          this.push({ htmlString, htmlFilePath });
        } catch (err) {
          // Surface the failure through the stream instead of leaving an
          // unhandled rejection and a stalled pipeline.
          this.destroy(err instanceof Error ? err : new Error(String(err)));
        }
      },
    });
  }

  /**
   * Get writable stream that converts html to pdf, and save it to shared fs path.
   * The html file is removed after its pdf has been written.
   * @returns writable stream consuming PageInfo objects
   */
  private getPdfWritable(): Writable {
    return new Writable({
      objectMode: true,
      write: async (pageInfo: PageInfo, encoding, callback) => {
        try {
          // Map <tmpHtmlDir>/...(.html) to <tmpOutputRootDir>/...(.pdf)
          const { htmlFilePath } = pageInfo;
          const relocatedPath = htmlFilePath.startsWith(this.tmpHtmlDir)
            ? this.tmpOutputRootDir +
              htmlFilePath.slice(this.tmpHtmlDir.length)
            : htmlFilePath;
          const fileOutputPath = relocatedPath.replace(/\.html$/, '.pdf');
          const fileOutputParentPath = this.getParentPath(fileOutputPath);

          const pdfBody = await this.convertHtmlToPdf(pageInfo.htmlString);
          await fs.promises.mkdir(fileOutputParentPath, { recursive: true });
          await fs.promises.writeFile(fileOutputPath, pdfBody);

          await fs.promises.rm(htmlFilePath, { force: true });
        } catch (err) {
          // Always settle the callback; otherwise the pipeline never finishes
          callback(err instanceof Error ? err : new Error(String(err)));
          return;
        }
        callback();
      },
    });
  }

  /**
   * Convert html to pdf. Retry up to convertRetryLimit if failed.
   * @param htmlString html to convert to pdf
   * @returns converted pdf
   * @throws when the puppeteer cluster is not initialized, or when the
   *   conversion still fails after all retries
   */
  private async convertHtmlToPdf(htmlString: string): Promise<Buffer> {
    const cluster = this.puppeteerCluster;
    if (cluster == null) {
      throw new Error('Puppeteer cluster is not initialized');
    }

    const executeConvert = async (retries: number): Promise<Buffer> => {
      try {
        // `await` is required here: without it a rejected task would bypass
        // this try/catch and no retry would ever happen.
        return await cluster.execute(htmlString);
      } catch (err) {
        if (retries > 0) {
          this.logger.error(
            'Failed to convert markdown to pdf. Retrying...',
            err,
          );
          return executeConvert(retries - 1);
        }
        throw err;
      }
    };

    return executeConvert(this.convertRetryLimit);
  }

  /**
   * Initialize puppeteer cluster and register the html-to-pdf task.
   * Does nothing when the SKIP_PUPPETEER_INIT environment variable is 'true'.
   */
  private async initPuppeteerCluster(): Promise<void> {
    if (process.env.SKIP_PUPPETEER_INIT === 'true') return;

    this.puppeteerCluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_PAGE,
      maxConcurrency: this.maxConcurrency,
      workerCreationDelay: 10000,
    });

    await this.puppeteerCluster.task(async ({ page, data: htmlString }) => {
      await page.setContent(htmlString, { waitUntil: 'domcontentloaded' });
      await page.addStyleTag({
        content: `
          body {
            font-family: 'Lato', 'IPAGothic', 'Noto Sans CJK';
          }
        `,
      });
      await page.emulateMediaType('screen');
      const pdfResult = await page.pdf({
        margin: {
          top: '100px',
          right: '50px',
          bottom: '100px',
          left: '50px',
        },
        printBackground: true,
        format: 'A4',
      });
      return pdfResult;
    });
  }

  /**
   * Get parent path from given path
   * @param targetPath target path, using '/' as separator
   * @returns parent path; '/' when the target has no parent segment
   */
  private getParentPath(targetPath: string): string {
    const parentPath = targetPath.split('/').slice(0, -1).join('/');
    if (parentPath === '' || parentPath === '/') {
      return '/';
    }
    return parentPath;
  }
}

export default PdfConvertService;