markdown-token-splitter.ts 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
  2. import { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';
  3. type MarkdownFragmentGroups = MarkdownFragment[][] ;
  4. function groupMarkdownFragments(
  5. markdownFragments: MarkdownFragment[],
  6. maxToken: number,
  7. ): MarkdownFragmentGroups {
  8. const prefixes = markdownFragments.map(({ label }) => {
  9. if (label === 'frontmatter') return 'frontmatter';
  10. const match = label.match(/^\d+(?:-\d+)*/)!; // eslint-disable-line @typescript-eslint/no-non-null-assertion
  11. return match[0];
  12. });
  13. const uniquePrefixes = [...new Set(prefixes.filter(Boolean))];
  14. // Group chunks by prefix
  15. const fragmentGroupes: MarkdownFragmentGroups = [];
  16. let remainingPrefixes = [...uniquePrefixes];
  17. // Process chunks so that the total token count per level doesn't exceed maxToken
  18. while (remainingPrefixes.length > 0) {
  19. const prefix = remainingPrefixes[0]; // Get the first prefix
  20. const hasNextLevelPrefix = uniquePrefixes.some(p => p !== prefix && p.startsWith(prefix));
  21. if (!hasNextLevelPrefix) {
  22. // If there is no prefix that starts with the current prefix, group the chunks directly
  23. let matchingFragments = markdownFragments.filter(fragment => fragment.label.startsWith(prefix));
  24. // Add parent heading if it exists
  25. const parts = prefix.split('-');
  26. for (let i = 1; i < parts.length; i++) {
  27. const parentPrefix = parts.slice(0, i).join('-');
  28. const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);
  29. if (parentHeading) {
  30. matchingFragments = [parentHeading, ...matchingFragments]; // Add the heading at the front
  31. }
  32. }
  33. fragmentGroupes.push(matchingFragments);
  34. }
  35. else {
  36. // Filter chunks that start with the current prefix
  37. let matchingFragments = markdownFragments.filter(fragment => fragment.label.startsWith(prefix));
  38. // Add parent heading if it exists
  39. const parts = prefix.split('-');
  40. for (let i = 1; i < parts.length; i++) {
  41. const parentPrefix = parts.slice(0, i).join('-');
  42. const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);
  43. if (parentHeading) {
  44. matchingFragments = [parentHeading, ...matchingFragments];
  45. }
  46. }
  47. // Calculate total token count including parent headings
  48. const totalTokenCount = matchingFragments.reduce((sum, fragment) => sum + fragment.tokenCount, 0);
  49. // If the total token count doesn't exceed maxToken, group the chunks
  50. if (totalTokenCount <= maxToken) {
  51. fragmentGroupes.push(matchingFragments);
  52. remainingPrefixes = remainingPrefixes.filter(p => !p.startsWith(`${prefix}-`));
  53. }
  54. else {
  55. // If it exceeds maxToken, strictly filter chunks by the exact numeric prefix
  56. const strictMatchingFragments = markdownFragments.filter((fragment) => {
  57. const match = fragment.label.match(/^\d+(-\d+)*(?=-)/);
  58. return match && match[0] === prefix;
  59. });
  60. // Add parent heading if it exists
  61. for (let i = 1; i < parts.length; i++) {
  62. const parentPrefix = parts.slice(0, i).join('-');
  63. const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);
  64. if (parentHeading) {
  65. strictMatchingFragments.unshift(parentHeading); // Add the heading at the front
  66. }
  67. }
  68. fragmentGroupes.push(strictMatchingFragments);
  69. }
  70. }
  71. remainingPrefixes.shift();
  72. }
  73. return fragmentGroupes;
  74. }
  75. // Function to group markdown into chunks based on token count
  76. export async function splitMarkdownIntoChunks(
  77. markdownText: string,
  78. model: TiktokenModel,
  79. maxToken = 800,
  80. ): Promise<string[]> {
  81. const encoder = encodingForModel(model);
  82. // If the total token count for the entire markdown text is less than or equal to maxToken,
  83. // return the entire markdown as a single chunk.
  84. if (encoder.encode(markdownText).length <= maxToken) {
  85. return [markdownText];
  86. }
  87. // Split markdown text into chunks
  88. const markdownFragments = await splitMarkdownIntoFragments(markdownText, model);
  89. const chunks = [] as string[];
  90. // Group the chunks based on token count
  91. const fragmentGroupes = groupMarkdownFragments(markdownFragments, maxToken);
  92. fragmentGroupes.forEach((fragmentGroupe) => {
  93. // Calculate the total token count for each group
  94. const totalTokenCount = fragmentGroupe.reduce((sum, fragment) => sum + fragment.tokenCount, 0);
  95. // If the total token count doesn't exceed maxToken, combine the chunks into one
  96. if (totalTokenCount <= maxToken) {
  97. const chunk = fragmentGroupe.map((fragment, index) => {
  98. const nextFragment = fragmentGroupe[index + 1];
  99. if (nextFragment) {
  100. // If both the current and next chunks are headings, add a single newline
  101. if (fragment.type === 'heading' && nextFragment.type === 'heading') {
  102. return `${fragment.text}\n`;
  103. }
  104. // Add two newlines for other cases
  105. return `${fragment.text}\n\n`;
  106. }
  107. return fragment.text; // No newlines for the last chunk
  108. }).join('');
  109. chunks.push(chunk);
  110. }
  111. else {
  112. // If the total token count exceeds maxToken, split content
  113. const headingFragments = fragmentGroupe.filter(fragment => fragment.type === 'heading'); // Find all headings
  114. const headingText = headingFragments.map(heading => heading.text).join('\n'); // Combine headings with one newline
  115. for (const fragment of fragmentGroupe) {
  116. if (fragment.label.includes('content')) {
  117. // Combine heading and paragraph content
  118. const combinedTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0) + fragment.tokenCount;
  119. // Check if headingChunks alone exceed maxToken
  120. const headingTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0);
  121. if (headingTokenCount > maxToken / 2) {
  122. throw new Error(
  123. `Heading token count is too large. Heading token count: ${headingTokenCount}, allowed maximum: ${Math.ceil(maxToken / 2)}`,
  124. );
  125. }
  126. // If the combined token count exceeds maxToken, split the content by character count
  127. if (combinedTokenCount > maxToken) {
  128. const headingTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0);
  129. const remainingTokenCount = maxToken - headingTokenCount;
  130. // Calculate the total character count and token count
  131. const fragmentCharCount = fragment.text.length;
  132. const fragmenTokenCount = fragment.tokenCount;
  133. // Calculate the character count for splitting
  134. const charCountForSplit = Math.floor((remainingTokenCount / fragmenTokenCount) * fragmentCharCount);
  135. // Split content based on character count
  136. const splitContents = [];
  137. for (let i = 0; i < fragment.text.length; i += charCountForSplit) {
  138. splitContents.push(fragment.text.slice(i, i + charCountForSplit));
  139. }
  140. // Add each split content to the new group of chunks
  141. splitContents.forEach((splitText) => {
  142. const chunk = headingText
  143. ? `${headingText}\n\n${splitText}`
  144. : `${splitText}`;
  145. chunks.push(chunk);
  146. });
  147. }
  148. else {
  149. const chunk = `${headingText}\n\n${fragment.text}`;
  150. chunks.push(chunk);
  151. }
  152. }
  153. }
  154. }
  155. });
  156. return chunks;
  157. }