// markdown-splitter.ts (4.6 KB)
  1. import type { TiktokenModel } from 'js-tiktoken';
  2. import { encodingForModel } from 'js-tiktoken';
  3. import yaml from 'js-yaml';
  4. import remarkFrontmatter from 'remark-frontmatter'; // Frontmatter processing
  5. import remarkGfm from 'remark-gfm'; // GFM processing
  6. import remarkParse from 'remark-parse';
  7. import type { Options as StringifyOptions } from 'remark-stringify';
  8. import remarkStringify from 'remark-stringify';
  9. import { unified } from 'unified';
  10. export type MarkdownFragment = {
  11. label: string;
  12. type: string;
  13. text: string;
  14. tokenCount: number;
  15. };
  16. /**
  17. * Updates the section numbers based on the heading depth and returns the updated section label.
  18. * Handles non-consecutive heading levels by initializing missing levels with 1.
  19. * @param sectionNumbers - The current section numbers.
  20. * @param headingDepth - The depth of the heading (e.g., # is depth 1).
  21. * @returns The updated section label.
  22. */
  23. function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): string {
  24. if (headingDepth > sectionNumbers.length) {
  25. // Initialize missing levels with 1
  26. while (sectionNumbers.length < headingDepth) {
  27. sectionNumbers.push(1);
  28. }
  29. }
  30. else if (headingDepth === sectionNumbers.length) {
  31. // Increment the last number for the same level
  32. sectionNumbers[headingDepth - 1]++;
  33. }
  34. else {
  35. // Remove deeper levels and increment the current level
  36. sectionNumbers.splice(headingDepth);
  37. sectionNumbers[headingDepth - 1]++;
  38. }
  39. return sectionNumbers.join('-');
  40. }
  41. /**
  42. * Splits Markdown text into labeled markdownFragments using remark-parse and remark-stringify,
  43. * processing each content node separately and labeling them as 1-content-1, 1-content-2, etc.
  44. * @param markdownText - The input Markdown string.
  45. * @returns An array of labeled markdownFragments.
  46. */
  47. export async function splitMarkdownIntoFragments(markdownText: string, model: TiktokenModel): Promise<MarkdownFragment[]> {
  48. const markdownFragments: MarkdownFragment[] = [];
  49. const sectionNumbers: number[] = [];
  50. let currentSectionLabel = '';
  51. const contentCounters: Record<string, number> = {};
  52. if (typeof markdownText !== 'string' || markdownText.trim() === '') {
  53. return markdownFragments;
  54. }
  55. const encoder = encodingForModel(model);
  56. const parser = unified()
  57. .use(remarkParse)
  58. .use(remarkFrontmatter, ['yaml'])
  59. .use(remarkGfm); // Enable GFM extensions
  60. const stringifyOptions: StringifyOptions = {
  61. bullet: '-', // Set list bullet to hyphen
  62. rule: '-', // Use hyphen for horizontal rules
  63. };
  64. const stringifier = unified()
  65. .use(remarkFrontmatter, ['yaml'])
  66. .use(remarkGfm)
  67. .use(remarkStringify, stringifyOptions);
  68. const parsedTree = parser.parse(markdownText);
  69. // Iterate over top-level nodes to prevent duplication
  70. for (const node of parsedTree.children) {
  71. if (node.type === 'yaml') {
  72. // Frontmatter block found, handle only the first instance
  73. const frontmatter = yaml.load(node.value) as Record<string, unknown>;
  74. const frontmatterText = JSON.stringify(frontmatter, null, 2);
  75. const tokenCount = encoder.encode(frontmatterText).length;
  76. markdownFragments.push({
  77. label: 'frontmatter',
  78. type: 'yaml',
  79. text: frontmatterText,
  80. tokenCount,
  81. });
  82. }
  83. else if (node.type === 'heading') {
  84. const headingDepth = node.depth;
  85. currentSectionLabel = updateSectionNumbers(sectionNumbers, headingDepth);
  86. const headingMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
  87. const tokenCount = encoder.encode(headingMarkdown).length;
  88. markdownFragments.push({
  89. label: `${currentSectionLabel}-heading`, type: node.type, text: headingMarkdown, tokenCount,
  90. });
  91. }
  92. else {
  93. // Process non-heading content individually
  94. const contentMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
  95. if (contentMarkdown !== '') {
  96. const contentCountKey = currentSectionLabel || '0';
  97. if (!contentCounters[contentCountKey]) {
  98. contentCounters[contentCountKey] = 1;
  99. }
  100. else {
  101. contentCounters[contentCountKey]++;
  102. }
  103. const contentLabel = currentSectionLabel !== ''
  104. ? `${currentSectionLabel}-content-${contentCounters[contentCountKey]}`
  105. : `0-content-${contentCounters[contentCountKey]}`;
  106. const tokenCount = encoder.encode(contentMarkdown).length;
  107. markdownFragments.push({
  108. label: contentLabel, type: node.type, text: contentMarkdown, tokenCount,
  109. });
  110. }
  111. }
  112. }
  113. return markdownFragments;
  114. }