| 1 |
- {"version":3,"file":"markdown-splitter.cjs","sources":["../../src/services/markdown-splitter.ts"],"sourcesContent":["import type { TiktokenModel } from 'js-tiktoken';\nimport { encodingForModel } from 'js-tiktoken';\nimport yaml from 'js-yaml';\nimport remarkFrontmatter from 'remark-frontmatter'; // Frontmatter processing\nimport remarkGfm from 'remark-gfm'; // GFM processing\nimport remarkParse from 'remark-parse';\nimport type { Options as StringifyOptions } from 'remark-stringify';\nimport remarkStringify from 'remark-stringify';\nimport { unified } from 'unified';\n\nexport type MarkdownFragment = {\n label: string;\n type: string;\n text: string;\n tokenCount: number;\n};\n\n/**\n * Updates the section numbers based on the heading depth and returns the updated section label.\n * Handles non-consecutive heading levels by initializing missing levels with 1.\n * @param sectionNumbers - The current section numbers.\n * @param headingDepth - The depth of the heading (e.g., # is depth 1).\n * @returns The updated section label.\n */\nfunction updateSectionNumbers(sectionNumbers: number[], headingDepth: number): string {\n if (headingDepth > sectionNumbers.length) {\n // Initialize missing levels with 1\n while (sectionNumbers.length < headingDepth) {\n sectionNumbers.push(1);\n }\n }\n else if (headingDepth === sectionNumbers.length) {\n // Increment the last number for the same level\n sectionNumbers[headingDepth - 1]++;\n }\n else {\n // Remove deeper levels and increment the current level\n sectionNumbers.splice(headingDepth);\n sectionNumbers[headingDepth - 1]++;\n }\n return sectionNumbers.join('-');\n}\n\n/**\n * Splits Markdown text into labeled markdownFragments using remark-parse and remark-stringify,\n * processing each content node separately and labeling them as 1-content-1, 1-content-2, etc.\n * @param markdownText - The input Markdown string.\n * @returns An array of labeled markdownFragments.\n */\nexport async function splitMarkdownIntoFragments(markdownText: string, model: TiktokenModel): Promise<MarkdownFragment[]> {\n const markdownFragments: MarkdownFragment[] = [];\n const sectionNumbers: number[] = [];\n let currentSectionLabel = '';\n const contentCounters: Record<string, number> = {};\n\n if (typeof markdownText !== 'string' || markdownText.trim() === '') {\n return markdownFragments;\n }\n\n const encoder = encodingForModel(model);\n\n const parser = unified()\n .use(remarkParse)\n .use(remarkFrontmatter, ['yaml'])\n .use(remarkGfm); // Enable GFM extensions\n\n const stringifyOptions: StringifyOptions = {\n bullet: '-', // Set list bullet to hyphen\n rule: '-', // Use hyphen for horizontal rules\n };\n\n const stringifier = unified()\n .use(remarkFrontmatter, ['yaml'])\n .use(remarkGfm)\n .use(remarkStringify, stringifyOptions);\n\n const parsedTree = parser.parse(markdownText);\n\n // Iterate over top-level nodes to prevent duplication\n for (const node of parsedTree.children) {\n if (node.type === 'yaml') {\n // Frontmatter block found, handle only the first instance\n const frontmatter = yaml.load(node.value) as Record<string, unknown>;\n const frontmatterText = JSON.stringify(frontmatter, null, 2);\n const tokenCount = encoder.encode(frontmatterText).length;\n markdownFragments.push({\n label: 'frontmatter',\n type: 'yaml',\n text: frontmatterText,\n tokenCount,\n });\n }\n else if (node.type === 'heading') {\n const headingDepth = node.depth;\n currentSectionLabel = updateSectionNumbers(sectionNumbers, headingDepth);\n\n const headingMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any\n const tokenCount = encoder.encode(headingMarkdown).length;\n markdownFragments.push({\n label: `${currentSectionLabel}-heading`, type: node.type, text: headingMarkdown, tokenCount,\n });\n }\n else {\n // Process non-heading content individually\n const contentMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any\n if (contentMarkdown !== '') {\n const contentCountKey = currentSectionLabel || '0';\n if (!contentCounters[contentCountKey]) {\n contentCounters[contentCountKey] = 1;\n }\n else {\n contentCounters[contentCountKey]++;\n }\n const contentLabel = currentSectionLabel !== ''\n ? `${currentSectionLabel}-content-${contentCounters[contentCountKey]}`\n : `0-content-${contentCounters[contentCountKey]}`;\n const tokenCount = encoder.encode(contentMarkdown).length;\n markdownFragments.push({\n label: contentLabel, type: node.type, text: contentMarkdown, tokenCount,\n });\n }\n }\n }\n\n return markdownFragments;\n}\n"],"names":["updateSectionNumbers","sectionNumbers","headingDepth","splitMarkdownIntoFragments","markdownText","model","markdownFragments","currentSectionLabel","contentCounters","encoder","encodingForModel","parser","unified","remarkParse","remarkFrontmatter","remarkGfm","stringifyOptions","stringifier","remarkStringify","parsedTree","node","frontmatter","yaml","frontmatterText","tokenCount","headingMarkdown","contentMarkdown","contentCountKey","contentLabel"],"mappings":"yQAwBA,SAASA,EAAqBC,EAA0BC,EAA8B,CAChF,GAAAA,EAAeD,EAAe,OAEzB,KAAAA,EAAe,OAASC,GAC7BD,EAAe,KAAK,CAAC,OAGhBC,IAAiBD,EAAe,QAMvCA,EAAe,OAAOC,CAAY,EAClCD,EAAeC,EAAe,CAAC,IAE1B,OAAAD,EAAe,KAAK,GAAG,CAChC,CAQsB,eAAAE,EAA2BC,EAAsBC,EAAmD,CACxH,MAAMC,EAAwC,CAAA,EACxCL,EAA2B,CAAA,EACjC,IAAIM,EAAsB,GAC1B,MAAMC,EAA0C,CAAA,EAEhD,GAAI,OAAOJ,GAAiB,UAAYA,EAAa,KAAA,IAAW,GACvD,OAAAE,EAGH,MAAAG,EAAUC,mBAAiBL,CAAK,EAEhCM,EAASC,EAAA,QAAA,EACZ,IAAIC,CAAW,EACf,IAAIC,EAAmB,CAAC,MAAM,CAAC,EAC/B,IAAIC,CAAS,EAEVC,EAAqC,CACzC,OAAQ,IACR,KAAM,GAAA,EAGFC,EAAcL,EAAA,QAAA,EACjB,IAAIE,EAAmB,CAAC,MAAM,CAAC,EAC/B,IAAIC,CAAS,EACb,IAAIG,EAAiBF,CAAgB,EAElCG,EAAaR,EAAO,MAAMP,CAAY,EAGjC,UAAAgB,KAAQD,EAAW,SACxB,GAAAC,EAAK,OAAS,OAAQ,CAExB,MAAMC,EAAcC,EAAK,KAAKF,EAAK,KAAK,EAClCG,EAAkB,KAAK,UAAUF,EAAa,KAAM,CAAC,EACrDG,EAAaf,EAAQ,OAAOc,CAAe,EAAE,OACnDjB,EAAkB,KAAK,CACrB,MAAO,cACP,KAAM,OACN,KAAMiB,EACN,WAAAC,CAAA,CACD,CAAA,SAEMJ,EAAK,OAAS,UAAW,CAChC,MAAMlB,EAAekB,EAAK,MACJb,EAAAP,EAAqBC,EAAgBC,CAAY,EAEvE,MAAMuB,EAAkBR,EAAY,UAAUG,CAAW,EAAE,KAAK,EAC1DI,EAAaf,EAAQ,OAAOgB,CAAe,EAAE,OACnDnB,EAAkB,KAAK,CACrB,MAAO,GAAGC,CAAmB,WAAY,KAAMa,EAAK,KAAM,KAAMK,EAAiB,WAAAD,CAAA,CAClF,CAAA,KAEE,CAEH,MAAME,EAAkBT,EAAY,UAAUG,CAAW,EAAE,KAAK,EAChE,GAAIM,IAAoB,GAAI,CAC1B,MAAMC,EAAkBpB,GAAuB,IAC1CC,EAAgBmB,CAAe,EAIlCnB,EAAgBmB,CAAe,IAH/BnB,EAAgBmB,CAAe,EAAI,EAKrC,MAAMC,EAAerB,IAAwB,GACzC,GAAGA,CAAmB,YAAYC,EAAgBmB,CAAe,CAAC,GAClE,aAAanB,EAAgBmB,CAAe,CAAC,GAC3CH,EAAaf,EAAQ,OAAOiB,CAAe,EAAE,OACnDpB,EAAkB,KAAK,CACrB,MAAOsB,EAAc,KAAMR,EAAK,KAAM,KAAMM,EAAiB,WAAAF,CAAA,CAC9D,CACH,CACF,CAGK,OAAAlB,CACT"}
|