// pdf-convert.ts
  import fs from 'node:fs';
  import path from 'node:path';
  import { Readable, Writable } from 'node:stream';
  import { pipeline as pipelinePromise } from 'node:stream/promises';
  import { setTimeout as sleep } from 'node:timers/promises';

  import { OnInit } from '@tsed/common';
  import { Service } from '@tsed/di';
  import { Logger } from '@tsed/logger';
  import { Cluster } from 'puppeteer-cluster';
  /**
   * One HTML page travelling through the object-mode stream pipeline:
   * produced by the HTML readable and consumed by the PDF writable.
   */
  interface PageInfo {
    /** Full HTML source of the page, read from `htmlFilePath` as UTF-8. */
    htmlString: string;
    /** Path of the HTML file the content was read from. */
    htmlFilePath: string;
  }
  /**
   * Job statuses that callers may pass in when registering or updating a job.
   * (Presumably mirrored on the GROWI side, as the name suggests; confirm
   * against the consuming application.)
   */
  export const JobStatusSharedWithGrowi = {
    HTML_EXPORT_IN_PROGRESS: 'HTML_EXPORT_IN_PROGRESS',
    HTML_EXPORT_DONE: 'HTML_EXPORT_DONE',
    FAILED: 'FAILED',
  } as const;

  /**
   * Every status a job can have inside this service: the shared statuses
   * plus `PDF_EXPORT_DONE`, which is only ever set by the service itself.
   */
  export const JobStatus = {
    ...JobStatusSharedWithGrowi,
    PDF_EXPORT_DONE: 'PDF_EXPORT_DONE',
  } as const;

  /** Union of the string values of {@link JobStatusSharedWithGrowi}. */
  export type JobStatusSharedWithGrowi =
    (typeof JobStatusSharedWithGrowi)[keyof typeof JobStatusSharedWithGrowi];

  /** Union of the string values of {@link JobStatus}. */
  export type JobStatus = (typeof JobStatus)[keyof typeof JobStatus];
  /** Bookkeeping entry kept per registered job. */
  interface JobInfo {
    /** Point in time after which the job is treated as expired. */
    expirationDate: Date;
    /** Current status of the job. */
    status: JobStatus;
    /**
     * HTML readable of the conversion pass that is currently running, if any.
     * Kept so the pass can be aborted by destroying the stream.
     */
    currentStream?: Readable;
  }
  /**
   * Service that watches per-job HTML directories on a shared filesystem and
   * converts every HTML file found there into a PDF using a puppeteer cluster.
   */
  @Service()
  class PdfConvertService implements OnInit {
    private puppeteerCluster: Cluster | undefined;

    private maxConcurrency = 1;

    /** Number of retries after the first failed conversion attempt. */
    private convertRetryLimit = 5;

    /** Delay between two scans of a job's HTML directory, in milliseconds. */
    private pollIntervalMs = 10 * 1000;

    private tmpOutputRootDir = '/tmp/page-bulk-export';

    private tmpHtmlDir = `${this.tmpOutputRootDir}/html`;

    // Null-prototype object so that job IDs such as "toString" or "__proto__"
    // can never collide with inherited Object.prototype members.
    private jobList: {
      [key: string]: JobInfo;
    } = Object.create(null);

    constructor(private readonly logger: Logger) {}

    async $onInit(): Promise<void> {
      await this.initPuppeteerCluster();
    }

    /**
     * Register or update job inside jobList with given jobId, expirationDate, and status.
     * If job is new, start reading html files and convert them to pdf.
     * @param jobId PageBulkExportJob ID
     * @param expirationDate expiration date of job
     * @param status status of job
     * @param appId application ID for GROWI.cloud
     */
    async registerOrUpdateJob(
      jobId: string,
      expirationDate: Date,
      status: JobStatusSharedWithGrowi,
      appId?: number,
    ): Promise<void> {
      const existingJob = this.findJob(jobId);
      const isJobNew = existingJob == null;
      const jobInfo: JobInfo = existingJob ?? { expirationDate, status };

      if (isJobNew) {
        this.jobList[jobId] = jobInfo;
      } else {
        jobInfo.expirationDate = expirationDate;
        // A finished or failed job keeps its final status.
        if (!this.isJobCompleted(jobId)) {
          jobInfo.status = status;
        }
      }

      if (status === JobStatus.FAILED) {
        jobInfo.currentStream?.destroy(new Error('job failed'));
      }

      if (isJobNew && status !== JobStatus.FAILED) {
        // Deliberately not awaited: the conversion loop runs in the background.
        // Attach a handler so an unexpected failure is logged instead of
        // surfacing as an unhandled promise rejection.
        this.readHtmlAndConvertToPdfUntilFinish(jobId, appId).catch(
          (err: unknown) => {
            this.logger.error('Unexpected error in pdf conversion loop', err);
          },
        );
      }
    }

    /**
     * Get job status
     * @param jobId id of PageBulkExportJob
     * @returns job status; FAILED when the job is not registered
     */
    getJobStatus(jobId: string): JobStatus {
      return this.findJob(jobId)?.status ?? JobStatus.FAILED;
    }

    /**
     * Clean up job list by removing expired jobs, finished jobs, and failed jobs
     */
    cleanUpJobList(): void {
      const now = new Date();
      for (const [jobId, job] of Object.entries(this.jobList)) {
        if (now > job.expirationDate || this.isJobCompleted(jobId)) {
          job.currentStream?.destroy(new Error('job expired'));
          delete this.jobList[jobId];
        }
      }
    }

    /**
     * Close puppeteer cluster
     */
    async closePuppeteerCluster(): Promise<void> {
      const cluster = this.puppeteerCluster;
      if (cluster == null) {
        this.logger.info('No puppeteer cluster running for closure');
        return;
      }

      this.logger.info('Closing puppeteer cluster...');
      await cluster.idle();
      await cluster.close();
      // Drop the reference so later conversions fail fast with a clear error
      // instead of being queued on a closed cluster.
      this.puppeteerCluster = undefined;
    }

    /**
     * Normalize an arbitrary thrown value into an Error instance.
     * @param err value caught in a catch clause
     * @returns err itself when it already is an Error, otherwise a wrapping Error
     */
    private static toError(err: unknown): Error {
      return err instanceof Error
        ? err
        : new Error(`Non-Error value thrown: ${String(err)}`);
    }

    /**
     * Look up a registered job.
     * @param jobId PageBulkExportJob ID
     * @returns the job entry, or undefined when the job is not registered
     */
    private findJob(jobId: string): JobInfo | undefined {
      return this.jobList[jobId];
    }

    /**
     * @param jobId PageBulkExportJob ID
     * @returns true when the job is unknown, or its status is PDF_EXPORT_DONE or FAILED
     */
    private isJobCompleted(jobId: string): boolean {
      const job = this.findJob(jobId);
      if (job == null) return true;
      return (
        job.status === JobStatus.PDF_EXPORT_DONE ||
        job.status === JobStatus.FAILED
      );
    }

    /**
     * Read html files from shared fs path, convert them to pdf, and save them to shared fs path.
     * Repeat this until all html files are converted to pdf or job fails.
     * @param jobId PageBulkExportJob ID
     * @param appId application ID for GROWI.cloud
     */
    private async readHtmlAndConvertToPdfUntilFinish(
      jobId: string,
      appId?: number,
    ): Promise<void> {
      while (!this.isJobCompleted(jobId)) {
        // eslint-disable-next-line no-await-in-loop
        await sleep(this.pollIntervalMs);

        // While sleeping, the job may have been removed by cleanUpJobList()
        // or marked FAILED by registerOrUpdateJob(); stop in either case.
        const job = this.findJob(jobId);
        if (job == null || this.isJobCompleted(jobId)) return;

        try {
          if (new Date() > job.expirationDate) {
            throw new Error('Job expired');
          }

          const htmlReadable = this.getHtmlReadable(jobId, appId);
          const pdfWritable = this.getPdfWritable();
          job.currentStream = htmlReadable;

          // eslint-disable-next-line no-await-in-loop
          await pipelinePromise(htmlReadable, pdfWritable);
          job.currentStream = undefined;
        } catch (err) {
          this.logger.error('Failed to convert html to pdf', err);
          // Use the captured entry: the job may already be gone from jobList.
          job.status = JobStatus.FAILED;
          job.currentStream?.destroy(
            new Error('Failed to convert html to pdf'),
          );
          job.currentStream = undefined;
          break;
        }
      }
    }

    /**
     * Get readable stream that reads html files from shared fs path
     * @param jobId PageBulkExportJob ID
     * @param appId application ID for GROWI.cloud
     * @returns readable stream emitting one PageInfo per html file
     * @throws Error when the job directory resolves outside the html root,
     *   or when the directory cannot be listed
     */
    private getHtmlReadable(jobId: string, appId?: number): Readable {
      const jobHtmlDir = path.join(
        this.tmpHtmlDir,
        appId?.toString() ?? '',
        jobId,
      );

      // Converted html files are deleted afterwards, so refuse any directory
      // that is not strictly inside the html root (e.g. a jobId containing "..").
      const relativeDir = path.relative(this.tmpHtmlDir, jobHtmlDir);
      const isOutsideRoot =
        relativeDir === '' ||
        relativeDir === '..' ||
        relativeDir.startsWith(`..${path.sep}`) ||
        path.isAbsolute(relativeDir);
      if (isOutsideRoot) {
        throw new Error(`Invalid job html directory: ${jobHtmlDir}`);
      }

      const htmlFileEntries = fs
        .readdirSync(jobHtmlDir, { recursive: true, withFileTypes: true })
        .filter((entry) => entry.isFile());

      let index = 0;
      const findJob = (): JobInfo | undefined => this.findJob(jobId);

      return new Readable({
        objectMode: true,
        async read() {
          try {
            if (index >= htmlFileEntries.length) {
              // Nothing left to convert and html export has finished:
              // the whole job is done.
              const job = findJob();
              if (
                job != null &&
                job.status === JobStatus.HTML_EXPORT_DONE &&
                htmlFileEntries.length === 0
              ) {
                job.status = JobStatus.PDF_EXPORT_DONE;
              }
              this.push(null);
              return;
            }

            const entry = htmlFileEntries[index];
            // Advance before pushing so a re-entrant read() can never see
            // the same entry twice.
            index += 1;

            const htmlFilePath = path.join(entry.parentPath, entry.name);
            const htmlString = await fs.promises.readFile(
              htmlFilePath,
              'utf-8',
            );
            this.push({ htmlString, htmlFilePath });
          } catch (err) {
            // Surface the failure through the stream so the pipeline rejects.
            this.destroy(PdfConvertService.toError(err));
          }
        },
      });
    }

    /**
     * Get writable stream that converts html to pdf, and save it to shared fs path
     * @returns writable stream consuming PageInfo objects
     */
    private getPdfWritable(): Writable {
      return new Writable({
        objectMode: true,
        write: async (pageInfo: PageInfo, _encoding, callback) => {
          try {
            // Mirror the file's location below the html root into the output root.
            const relativeHtmlPath = path.relative(
              this.tmpHtmlDir,
              pageInfo.htmlFilePath,
            );
            const fileOutputPath = path
              .join(this.tmpOutputRootDir, relativeHtmlPath)
              .replace(/\.html$/, '.pdf');
            const fileOutputParentPath = this.getParentPath(fileOutputPath);

            const pdfBody = await this.convertHtmlToPdf(pageInfo.htmlString);
            await fs.promises.mkdir(fileOutputParentPath, { recursive: true });
            await fs.promises.writeFile(fileOutputPath, pdfBody);
            // The html file is removed only once its pdf has been written.
            await fs.promises.rm(pageInfo.htmlFilePath, { force: true });
          } catch (err) {
            // Always report the failure; otherwise the pipeline never settles.
            callback(PdfConvertService.toError(err));
            return;
          }
          callback();
        },
      });
    }

    /**
     * Convert html to pdf. Retry up to convertRetryLimit if failed.
     * @param htmlString html to convert to pdf
     * @returns converted pdf
     * @throws Error when the puppeteer cluster is not initialized, or the last
     *   error once all retries are used up
     */
    private async convertHtmlToPdf(htmlString: string): Promise<Buffer> {
      const cluster = this.puppeteerCluster;
      if (cluster == null) {
        throw new Error('Puppeteer cluster is not initialized');
      }

      let retriesLeft = this.convertRetryLimit;
      for (;;) {
        try {
          // `await` is required here: without it a rejection would bypass
          // this try/catch and no retry would ever happen.
          // eslint-disable-next-line no-await-in-loop
          return await cluster.execute(htmlString);
        } catch (err) {
          if (retriesLeft <= 0) throw err;
          retriesLeft -= 1;
          this.logger.error('Failed to convert html to pdf. Retrying...', err);
        }
      }
    }

    /**
     * Initialize puppeteer cluster
     */
    private async initPuppeteerCluster(): Promise<void> {
      if (process.env.SKIP_PUPPETEER_INIT === 'true') return;

      this.puppeteerCluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_PAGE,
        maxConcurrency: this.maxConcurrency,
        workerCreationDelay: 10000,
      });

      // Task run for every execute() call: render the html and print it as A4 pdf.
      await this.puppeteerCluster.task(async ({ page, data: htmlString }) => {
        await page.setContent(htmlString, { waitUntil: 'domcontentloaded' });
        await page.addStyleTag({
          content: `
            body {
              font-family: 'Lato', 'IPAGothic', 'Noto Sans CJK';
            }
          `,
        });
        await page.emulateMediaType('screen');
        const pdfResult = await page.pdf({
          margin: {
            top: '100px',
            right: '50px',
            bottom: '100px',
            left: '50px',
          },
          printBackground: true,
          format: 'A4',
        });
        return pdfResult;
      });
    }

    /**
     * Get parent path from given path
     * @param targetPath target path
     * @returns parent path; '/' when the path has no parent directory component
     */
    private getParentPath(targetPath: string): string {
      const parentPath = path.dirname(targetPath);
      // path.dirname() yields '.' for a bare file name; keep returning '/'
      // in that case, as this method always has.
      return parentPath === '.' ? '/' : parentPath;
    }
  }

  export default PdfConvertService;