markdown-token-splitter.cjs.map 12 KB

1
  1. {"version":3,"file":"markdown-token-splitter.cjs","sources":["../../src/services/markdown-token-splitter.ts"],"sourcesContent":["import { encodingForModel, type TiktokenModel } from 'js-tiktoken';\n\nimport { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';\n\ntype MarkdownFragmentGroups = MarkdownFragment[][] ;\n\nfunction groupMarkdownFragments(\n markdownFragments: MarkdownFragment[],\n maxToken: number,\n): MarkdownFragmentGroups {\n\n const prefixes = markdownFragments.map(({ label }) => {\n if (label === 'frontmatter') return 'frontmatter';\n const match = label.match(/^\\d+(?:-\\d+)*/)!; // eslint-disable-line @typescript-eslint/no-non-null-assertion\n return match[0];\n });\n\n const uniquePrefixes = [...new Set(prefixes.filter(Boolean))];\n\n // Group chunks by prefix\n const fragmentGroupes: MarkdownFragmentGroups = [];\n let remainingPrefixes = [...uniquePrefixes];\n\n // Process chunks so that the total token count per level doesn't exceed maxToken\n while (remainingPrefixes.length > 0) {\n const prefix = remainingPrefixes[0]; // Get the first prefix\n const hasNextLevelPrefix = uniquePrefixes.some(p => p !== prefix && p.startsWith(prefix));\n\n if (!hasNextLevelPrefix) {\n // If there is no prefix that starts with the current prefix, group the chunks directly\n let matchingFragments = markdownFragments.filter(fragment => fragment.label.startsWith(prefix));\n\n // Add parent heading if it exists\n const parts = prefix.split('-');\n for (let i = 1; i < parts.length; i++) {\n const parentPrefix = parts.slice(0, i).join('-');\n const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);\n if (parentHeading) {\n matchingFragments = [parentHeading, ...matchingFragments]; // Add the heading at the front\n }\n }\n\n fragmentGroupes.push(matchingFragments);\n }\n else {\n // Filter chunks that start with the current prefix\n let matchingFragments = markdownFragments.filter(fragment => fragment.label.startsWith(prefix));\n\n // Add parent heading if it exists\n const parts = prefix.split('-');\n for (let i = 1; i < parts.length; i++) {\n const parentPrefix = parts.slice(0, i).join('-');\n const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);\n if (parentHeading) {\n matchingFragments = [parentHeading, ...matchingFragments];\n }\n }\n\n // Calculate total token count including parent headings\n const totalTokenCount = matchingFragments.reduce((sum, fragment) => sum + fragment.tokenCount, 0);\n\n // If the total token count doesn't exceed maxToken, group the chunks\n if (totalTokenCount <= maxToken) {\n fragmentGroupes.push(matchingFragments);\n remainingPrefixes = remainingPrefixes.filter(p => !p.startsWith(`${prefix}-`));\n }\n else {\n // If it exceeds maxToken, strictly filter chunks by the exact numeric prefix\n const strictMatchingFragments = markdownFragments.filter((fragment) => {\n const match = fragment.label.match(/^\\d+(-\\d+)*(?=-)/);\n return match && match[0] === prefix;\n });\n\n // Add parent heading if it exists\n for (let i = 1; i < parts.length; i++) {\n const parentPrefix = parts.slice(0, i).join('-');\n const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);\n if (parentHeading) {\n strictMatchingFragments.unshift(parentHeading); // Add the heading at the front\n }\n }\n\n fragmentGroupes.push(strictMatchingFragments);\n }\n }\n remainingPrefixes.shift();\n }\n\n return fragmentGroupes;\n}\n\n// Function to group markdown into chunks based on token count\nexport async function splitMarkdownIntoChunks(\n markdownText: string,\n model: TiktokenModel,\n maxToken = 800,\n): Promise<string[]> {\n const encoder = encodingForModel(model);\n\n // If the total token count for the entire markdown text is less than or equal to maxToken,\n // return the entire markdown as a single chunk.\n if (encoder.encode(markdownText).length <= maxToken) {\n return [markdownText];\n }\n\n // Split markdown text into chunks\n const markdownFragments = await splitMarkdownIntoFragments(markdownText, model);\n const chunks = [] as string[];\n\n // Group the chunks based on token count\n const fragmentGroupes = groupMarkdownFragments(markdownFragments, maxToken);\n\n fragmentGroupes.forEach((fragmentGroupe) => {\n // Calculate the total token count for each group\n const totalTokenCount = fragmentGroupe.reduce((sum, fragment) => sum + fragment.tokenCount, 0);\n\n // If the total token count doesn't exceed maxToken, combine the chunks into one\n if (totalTokenCount <= maxToken) {\n const chunk = fragmentGroupe.map((fragment, index) => {\n const nextFragment = fragmentGroupe[index + 1];\n if (nextFragment) {\n // If both the current and next chunks are headings, add a single newline\n if (fragment.type === 'heading' && nextFragment.type === 'heading') {\n return `${fragment.text}\\n`;\n }\n // Add two newlines for other cases\n return `${fragment.text}\\n\\n`;\n }\n return fragment.text; // No newlines for the last chunk\n }).join('');\n\n chunks.push(chunk);\n }\n else {\n // If the total token count exceeds maxToken, split content\n const headingFragments = fragmentGroupe.filter(fragment => fragment.type === 'heading'); // Find all headings\n const headingText = headingFragments.map(heading => heading.text).join('\\n'); // Combine headings with one newline\n\n for (const fragment of fragmentGroupe) {\n if (fragment.label.includes('content')) {\n // Combine heading and paragraph content\n const combinedTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0) + fragment.tokenCount;\n // Check if headingChunks alone exceed maxToken\n const headingTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0);\n\n if (headingTokenCount > maxToken / 2) {\n throw new Error(\n `Heading token count is too large. Heading token count: ${headingTokenCount}, allowed maximum: ${Math.ceil(maxToken / 2)}`,\n );\n }\n\n // If the combined token count exceeds maxToken, split the content by character count\n if (combinedTokenCount > maxToken) {\n const headingTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0);\n const remainingTokenCount = maxToken - headingTokenCount;\n\n // Calculate the total character count and token count\n const fragmentCharCount = fragment.text.length;\n const fragmenTokenCount = fragment.tokenCount;\n\n // Calculate the character count for splitting\n const charCountForSplit = Math.floor((remainingTokenCount / fragmenTokenCount) * fragmentCharCount);\n\n // Split content based on character count\n const splitContents = [];\n for (let i = 0; i < fragment.text.length; i += charCountForSplit) {\n splitContents.push(fragment.text.slice(i, i + charCountForSplit));\n }\n\n // Add each split content to the new group of chunks\n splitContents.forEach((splitText) => {\n const chunk = headingText\n ? `${headingText}\\n\\n${splitText}`\n : `${splitText}`;\n chunks.push(chunk);\n });\n }\n else {\n const chunk = `${headingText}\\n\\n${fragment.text}`;\n chunks.push(chunk);\n }\n }\n }\n }\n });\n\n return chunks;\n}\n"],"names":["groupMarkdownFragments","markdownFragments","maxToken","prefixes","label","uniquePrefixes","fragmentGroupes","remainingPrefixes","prefix","p","matchingFragments","fragment","parts","i","parentPrefix","parentHeading","sum","strictMatchingFragments","match","splitMarkdownIntoChunks","markdownText","model","encodingForModel","splitMarkdownIntoFragments","chunks","fragmentGroupe","chunk","index","nextFragment","headingFragments","headingText","heading","combinedTokenCount","headingTokenCount","remainingTokenCount","fragmentCharCount","fragmenTokenCount","charCountForSplit","splitContents","splitText"],"mappings":"oJAMA,SAASA,EACLC,EACAC,EACsB,CAExB,MAAMC,EAAWF,EAAkB,IAAI,CAAC,CAAE,MAAAG,KACpCA,IAAU,cAAsB,cACtBA,EAAM,MAAM,eAAe,EAC5B,CAAC,CACf,EAEKC,EAAiB,CAAC,GAAG,IAAI,IAAIF,EAAS,OAAO,OAAO,CAAC,CAAC,EAGtDG,EAA0C,CAAA,EAC5C,IAAAC,EAAoB,CAAC,GAAGF,CAAc,EAGnC,KAAAE,EAAkB,OAAS,GAAG,CAC7B,MAAAC,EAASD,EAAkB,CAAC,EAGlC,GAF2BF,EAAe,KAAKI,GAAKA,IAAMD,GAAUC,EAAE,WAAWD,CAAM,CAAC,EAkBnF,CAEC,IAAAE,EAAoBT,EAAkB,OAAOU,GAAYA,EAAS,MAAM,WAAWH,CAAM,CAAC,EAGxF,MAAAI,EAAQJ,EAAO,MAAM,GAAG,EAC9B,QAASK,EAAI,EAAGA,EAAID,EAAM,OAAQC,IAAK,CACrC,MAAMC,EAAeF,EAAM,MAAM,EAAGC,CAAC,EAAE,KAAK,GAAG,EACzCE,EAAgBd,EAAkB,KAAKU,GAAYA,EAAS,QAAU,GAAGG,CAAY,UAAU,EACjGC,IACkBL,EAAA,CAACK,EAAe,GAAGL,CAAiB,EAE5D,CAMA,GAHwBA,EAAkB,OAAO,CAACM,EAAKL,IAAaK,EAAML,EAAS,WAAY,CAAC,GAGzET,EACrBI,EAAgB,KAAKI,CAAiB,EAClBH,EAAAA,EAAkB,OAAYE,GAAA,CAACA,EAAE,WAAW,GAAGD,CAAM,GAAG,CAAC,MAE1E,CAEH,MAAMS,EAA0BhB,EAAkB,OAAQU,GAAa,CACrE,MAAMO,EAAQP,EAAS,MAAM,MAAM,kBAAkB,EAC9C,OAAAO,GAASA,EAAM,CAAC,IAAMV,CAAA,CAC9B,EAGD,QAASK,EAAI,EAAGA,EAAID,EAAM,OAAQC,IAAK,CACrC,MAAMC,EAAeF,EAAM,MAAM,EAAGC,CAAC,EAAE,KAAK,GAAG,EACzCE,EAAgBd,EAAkB,KAAKU,GAAYA,EAAS,QAAU,GAAGG,CAAY,UAAU,EACjGC,GACFE,EAAwB,QAAQF,CAAa,CAEjD,CAEAT,EAAgB,KAAKW,CAAuB,CAC9C,CACF,KAxDyB,CAEnB,IAAAP,EAAoBT,EAAkB,OAAOU,GAAYA,EAAS,MAAM,WAAWH,CAAM,CAAC,EAGxF,MAAAI,EAAQJ,EAAO,MAAM,GAAG,EAC9B,QAASK,EAAI,EAAGA,EAAID,EAAM,OAAQC,IAAK,CACrC,MAAMC,EAAeF,EAAM,MAAM,EAAGC,CAAC,EAAE,KAAK,GAAG,EACzCE,EAAgBd,EAAkB,KAAKU,GAAYA,EAAS,QAAU,GAAGG,CAAY,UAAU,EACjGC,IACkBL,EAAA,CAACK,EAAe,GAAGL,CAAiB,EAE5D,CAEAJ,EAAgB,KAAKI,CAAiB,CAAA,CA2CxCH,EAAkB,MAAM,CAC1B,CAEO,OAAAD,CACT,CAGA,eAAsBa,EAClBC,EACAC,EACAnB,EAAW,IACM,CAKnB,GAJgBoB,mBAAiBD,CAAK,EAI1B,OAAOD,CAAY,EAAE,QAAUlB,EACzC,MAAO,CAACkB,CAAY,EAItB,MAAMnB,EAAoB,MAAMsB,EAAAA,2BAA2BH,EAAcC,CAAK,EACxEG,EAAS,CAAA,EAKC,OAFQxB,EAAuBC,EAAmBC,CAAQ,EAE1D,QAASuB,GAAmB,CAK1C,GAHwBA,EAAe,OAAO,CAACT,EAAKL,IAAaK,EAAML,EAAS,WAAY,CAAC,GAGtET,EAAU,CAC/B,MAAMwB,EAAQD,EAAe,IAAI,CAACd,EAAUgB,IAAU,CAC9C,MAAAC,EAAeH,EAAeE,EAAQ,CAAC,EAC7C,OAAIC,EAEEjB,EAAS,OAAS,WAAaiB,EAAa,OAAS,UAChD,GAAGjB,EAAS,IAAI;AAAA,EAGlB,GAAGA,EAAS,IAAI;AAAA;AAAA,EAElBA,EAAS,IAAA,CACjB,EAAE,KAAK,EAAE,EAEVa,EAAO,KAAKE,CAAK,CAAA,KAEd,CAEH,MAAMG,EAAmBJ,EAAe,OAAmBd,GAAAA,EAAS,OAAS,SAAS,EAChFmB,EAAcD,EAAiB,IAAIE,GAAWA,EAAQ,IAAI,EAAE,KAAK;AAAA,CAAI,EAE3E,UAAWpB,KAAYc,EACrB,GAAId,EAAS,MAAM,SAAS,SAAS,EAAG,CAEhC,MAAAqB,EAAqBH,EAAiB,OAAO,CAACb,EAAKe,IAAYf,EAAMe,EAAQ,WAAY,CAAC,EAAIpB,EAAS,WAEvGsB,EAAoBJ,EAAiB,OAAO,CAACb,EAAKe,IAAYf,EAAMe,EAAQ,WAAY,CAAC,EAE3F,GAAAE,EAAoB/B,EAAW,EACjC,MAAM,IAAI,MACR,0DAA0D+B,CAAiB,sBAAsB,KAAK,KAAK/B,EAAW,CAAC,CAAC,EAAA,EAK5H,GAAI8B,EAAqB9B,EAAU,CAC3B+B,MAAAA,EAAoBJ,EAAiB,OAAO,CAACb,EAAKe,IAAYf,EAAMe,EAAQ,WAAY,CAAC,EACzFG,EAAsBhC,EAAW+B,EAGjCE,EAAoBxB,EAAS,KAAK,OAClCyB,EAAoBzB,EAAS,WAG7B0B,EAAoB,KAAK,MAAOH,EAAsBE,EAAqBD,CAAiB,EAG5FG,EAAgB,CAAA,EACtB,QAASzB,EAAI,EAAGA,EAAIF,EAAS,KAAK,OAAQE,GAAKwB,EAC7CC,EAAc,KAAK3B,EAAS,KAAK,MAAME,EAAGA,EAAIwB,CAAiB,CAAC,EAIpDC,EAAA,QAASC,GAAc,CAC7B,MAAAb,EAAQI,EACV,GAAGA,CAAW;AAAA;AAAA,EAAOS,CAAS,GAC9B,GAAGA,CAAS,GAChBf,EAAO,KAAKE,CAAK,CAAA,CAClB,CAAA,KAEE,CACG,MAAAA,EAAQ,GAAGI,CAAW;AAAA;AAAA,EAAOnB,EAAS,IAAI,GAChDa,EAAO,KAAKE,CAAK,CACnB,CACF,CAEJ,CAAA,CACD,EAEMF,CACT"}