// markdown-splitter.ts (5.1 KB)
  1. import { dynamicImport } from '@cspell/dynamic-import';
  2. import type { TiktokenModel } from 'js-tiktoken';
  3. import { encodingForModel } from 'js-tiktoken';
  4. import yaml from 'js-yaml';
  5. import type * as RemarkFrontmatter from 'remark-frontmatter';
  6. import type * as RemarkGfm from 'remark-gfm';
  7. import type * as RemarkParse from 'remark-parse';
  8. import type * as RemarkStringify from 'remark-stringify';
  9. import type * as Unified from 'unified';
  10. export type MarkdownFragment = {
  11. label: string;
  12. type: string;
  13. text: string;
  14. tokenCount: number;
  15. };
  16. /**
  17. * Updates the section numbers based on the heading depth and returns the updated section label.
  18. * Handles non-consecutive heading levels by initializing missing levels with 1.
  19. * @param sectionNumbers - The current section numbers.
  20. * @param headingDepth - The depth of the heading (e.g., # is depth 1).
  21. * @returns The updated section label.
  22. */
  23. function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): string {
  24. if (headingDepth > sectionNumbers.length) {
  25. // Initialize missing levels with 1
  26. while (sectionNumbers.length < headingDepth) {
  27. sectionNumbers.push(1);
  28. }
  29. }
  30. else if (headingDepth === sectionNumbers.length) {
  31. // Increment the last number for the same level
  32. sectionNumbers[headingDepth - 1]++;
  33. }
  34. else {
  35. // Remove deeper levels and increment the current level
  36. sectionNumbers.splice(headingDepth);
  37. sectionNumbers[headingDepth - 1]++;
  38. }
  39. return sectionNumbers.join('-');
  40. }
  41. /**
  42. * Splits Markdown text into labeled markdownFragments using remark-parse and remark-stringify,
  43. * processing each content node separately and labeling them as 1-content-1, 1-content-2, etc.
  44. * @param markdownText - The input Markdown string.
  45. * @returns An array of labeled markdownFragments.
  46. */
  47. export async function splitMarkdownIntoFragments(markdownText: string, model: TiktokenModel): Promise<MarkdownFragment[]> {
  48. const markdownFragments: MarkdownFragment[] = [];
  49. const sectionNumbers: number[] = [];
  50. let currentSectionLabel = '';
  51. const contentCounters: Record<string, number> = {};
  52. if (typeof markdownText !== 'string' || markdownText.trim() === '') {
  53. return markdownFragments;
  54. }
  55. const encoder = encodingForModel(model);
  56. const remarkParse = (await dynamicImport<typeof RemarkParse>('remark-parse', __dirname)).default;
  57. const remarkFrontmatter = (await dynamicImport<typeof RemarkFrontmatter>('remark-frontmatter', __dirname)).default;
  58. const remarkGfm = (await dynamicImport<typeof RemarkGfm>('remark-gfm', __dirname)).default;
  59. const remarkStringify = (await dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname)).default;
  60. const unified = (await dynamicImport<typeof Unified>('unified', __dirname)).unified;
  61. const parser = unified()
  62. .use(remarkParse)
  63. .use(remarkFrontmatter, ['yaml'])
  64. .use(remarkGfm); // Enable GFM extensions
  65. const stringifyOptions: RemarkStringify.Options = {
  66. bullet: '-', // Set list bullet to hyphen
  67. rule: '-', // Use hyphen for horizontal rules
  68. };
  69. const stringifier = unified()
  70. .use(remarkFrontmatter, ['yaml'])
  71. .use(remarkGfm)
  72. .use(remarkStringify, stringifyOptions);
  73. const parsedTree = parser.parse(markdownText);
  74. // Iterate over top-level nodes to prevent duplication
  75. for (const node of parsedTree.children) {
  76. if (node.type === 'yaml') {
  77. // Frontmatter block found, handle only the first instance
  78. const frontmatter = yaml.load(node.value) as Record<string, unknown>;
  79. const frontmatterText = JSON.stringify(frontmatter, null, 2);
  80. const tokenCount = encoder.encode(frontmatterText).length;
  81. markdownFragments.push({
  82. label: 'frontmatter',
  83. type: 'yaml',
  84. text: frontmatterText,
  85. tokenCount,
  86. });
  87. }
  88. else if (node.type === 'heading') {
  89. const headingDepth = node.depth;
  90. currentSectionLabel = updateSectionNumbers(sectionNumbers, headingDepth);
  91. const headingMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
  92. const tokenCount = encoder.encode(headingMarkdown).length;
  93. markdownFragments.push({
  94. label: `${currentSectionLabel}-heading`, type: node.type, text: headingMarkdown, tokenCount,
  95. });
  96. }
  97. else {
  98. // Process non-heading content individually
  99. const contentMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
  100. if (contentMarkdown !== '') {
  101. const contentCountKey = currentSectionLabel || '0';
  102. if (!contentCounters[contentCountKey]) {
  103. contentCounters[contentCountKey] = 1;
  104. }
  105. else {
  106. contentCounters[contentCountKey]++;
  107. }
  108. const contentLabel = currentSectionLabel !== ''
  109. ? `${currentSectionLabel}-content-${contentCounters[contentCountKey]}`
  110. : `0-content-${contentCounters[contentCountKey]}`;
  111. const tokenCount = encoder.encode(contentMarkdown).length;
  112. markdownFragments.push({
  113. label: contentLabel, type: node.type, text: contentMarkdown, tokenCount,
  114. });
  115. }
  116. }
  117. }
  118. return markdownFragments;
  119. }