markdown-splitter.ts 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. import yaml from 'js-yaml';
  2. import remarkFrontmatter from 'remark-frontmatter'; // Frontmatter processing
  3. import remarkGfm from 'remark-gfm'; // GFM processing
  4. import remarkParse from 'remark-parse';
  5. import type { Options as StringifyOptions } from 'remark-stringify';
  6. import remarkStringify from 'remark-stringify';
  7. import { unified } from 'unified';
  /**
   * A labeled piece of a split Markdown document.
   */
  export type Chunk = {
    /**
     * Identifier assigned by the splitter in this file: `<section>-heading` or
     * `<section>-content` (section numbers joined with '-'), `0-content` for content
     * that appears before the first heading, or `frontmatter`.
     */
    label: string;
    /**
     * Markdown text of the chunk; for the `frontmatter` chunk this is the parsed
     * frontmatter rendered as pretty-printed JSON.
     */
    text: string;
  };
  /**
   * Flushes buffered content into a single labeled chunk.
   *
   * The buffered entries are joined with a blank line between them and trailing
   * whitespace is removed. A chunk is appended only when something is left after
   * trimming. The buffer is always emptied in place, whether or not a chunk was added.
   * @param chunks - Destination array that receives the new chunk.
   * @param contentBuffer - Pending content entries; emptied by this call.
   * @param label - Label to attach to the chunk.
   */
  function addContentChunk(chunks: Chunk[], contentBuffer: string[], label: string) {
    // splice(0) empties the caller's array in place and hands back the removed entries.
    const drained = contentBuffer.splice(0);
    const merged = drained.join('\n\n').trimEnd();

    if (merged.length === 0) {
      return;
    }
    chunks.push({ label, text: merged });
  }
  /**
   * Advances the running section counters for a heading and renders them as a label.
   *
   * A heading deeper than the current nesting opens every missing level at 1, so
   * skipped heading levels still get a counter. A heading at the current level or a
   * shallower one discards any deeper counters and increments its own level.
   * @param sectionNumbers - One counter per heading level seen so far; modified in place.
   * @param headingDepth - The depth of the heading (e.g., # is depth 1).
   * @returns The counters joined with '-', for example `2-1`.
   */
  function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): string {
    const knownLevels = sectionNumbers.length;

    if (headingDepth > knownLevels) {
      // Going deeper: every newly opened level starts at 1.
      for (let level = knownLevels; level < headingDepth; level += 1) {
        sectionNumbers.push(1);
      }
      return sectionNumbers.join('-');
    }

    // Same level or shallower: drop deeper counters (a no-op when the depth matches
    // the current nesting), then advance the counter for this level.
    sectionNumbers.splice(headingDepth);
    sectionNumbers[headingDepth - 1] += 1;
    return sectionNumbers.join('-');
  }
  /**
   * Breaks a Markdown document into labeled chunks.
   *
   * The document is parsed with remark (YAML frontmatter and GFM enabled) and its
   * top-level nodes are walked in order:
   * - each heading becomes its own `<section>-heading` chunk;
   * - the nodes between headings are serialized back to Markdown and merged into one
   *   `<section>-content` chunk (`0-content` when they come before any heading);
   * - YAML frontmatter, when present, is emitted first as a `frontmatter` chunk
   *   containing pretty-printed JSON.
   * Section numbers follow heading depth, including non-consecutive heading levels.
   * @param markdownText - The input Markdown string.
   * @returns The labeled chunks; an empty array for non-string or whitespace-only input.
   */
  export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chunk[]> {
    const result: Chunk[] = [];

    // Nothing to split for non-string or whitespace-only input.
    if (typeof markdownText !== 'string' || markdownText.trim() === '') {
      return result;
    }

    const markdownParser = unified()
      .use(remarkParse)
      .use(remarkFrontmatter, ['yaml'])
      .use(remarkGfm); // Enable GFM extensions

    // Hyphens for list bullets and horizontal rules in the serialized output.
    const serializerOptions: StringifyOptions = { bullet: '-', rule: '-' };
    const markdownSerializer = unified()
      .use(remarkFrontmatter, ['yaml'])
      .use(remarkGfm)
      .use(remarkStringify, serializerOptions);

    const counters: number[] = [];
    const pendingContent: string[] = [];
    let activeLabel = '';
    let frontmatterData: Record<string, unknown> | null = null;

    // Emits whatever content has accumulated under the current section, if any.
    const flushPendingContent = (): void => {
      if (pendingContent.length === 0) {
        return;
      }
      const sectionPrefix = activeLabel === '' ? '0' : activeLabel;
      addContentChunk(result, pendingContent, `${sectionPrefix}-content`);
    };

    // Only top-level nodes are visited, so nested content is serialized exactly once
    // as part of its parent.
    for (const node of markdownParser.parse(markdownText).children) {
      switch (node.type) {
        case 'yaml': {
          frontmatterData = yaml.load(node.value) as Record<string, unknown>;
          break;
        }
        case 'heading': {
          // Content gathered so far belongs to the previous section.
          flushPendingContent();
          activeLabel = updateSectionNumbers(counters, node.depth);
          const headingText = markdownSerializer.stringify(node as any); // eslint-disable-line @typescript-eslint/no-explicit-any
          result.push({ label: `${activeLabel}-heading`, text: headingText.trim() });
          break;
        }
        default: {
          // Any other node is buffered until the next heading or the end of the document.
          const rendered = markdownSerializer.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
          if (rendered !== '') {
            pendingContent.push(rendered);
          }
        }
      }
    }

    // Content after the last heading (or a document without headings).
    flushPendingContent();

    if (frontmatterData) {
      // Frontmatter always leads the output, ahead of every section chunk.
      result.unshift({
        label: 'frontmatter',
        text: JSON.stringify(frontmatterData, null, 2),
      });
    }
    return result;
  }