| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- import { dynamicImport } from '@cspell/dynamic-import';
- import type { TiktokenModel } from 'js-tiktoken';
- import { encodingForModel } from 'js-tiktoken';
- import yaml from 'js-yaml';
- import type * as RemarkFrontmatter from 'remark-frontmatter';
- import type * as RemarkGfm from 'remark-gfm';
- import type * as RemarkParse from 'remark-parse';
- import type * as RemarkStringify from 'remark-stringify';
- import type * as Unified from 'unified';
/**
 * One labeled piece of a Markdown document together with its token count.
 */
export type MarkdownFragment = {
  /**
   * Identifier of the fragment within the document: `frontmatter` for a YAML
   * block, `<section>-heading` for a heading, or `<section>-content-<n>` for
   * any other top-level node (`<section>` is the dash-joined section number,
   * or `0` before the first heading).
   */
  label: string;

  /** Node type the fragment was produced from (e.g. `yaml`, `heading`, or another parsed node type). */
  type: string;

  /** Serialized text of the fragment. */
  text: string;

  /** Number of tokens the encoder produced for `text`. */
  tokenCount: number;
};
/**
 * Advances the running section counter for a newly encountered heading and
 * returns the resulting dash-joined section label (e.g. `2-1`).
 *
 * The `sectionNumbers` array is mutated in place:
 * - a heading deeper than the current depth appends `1` for every level that
 *   is missing, so skipped levels (e.g. `#` followed by `###`) are filled in;
 * - a heading at the current depth or shallower discards all deeper levels
 *   and increments the counter at its own level.
 *
 * @param sectionNumbers - The current section numbers, one entry per heading level; updated in place.
 * @param headingDepth - The depth of the heading (e.g., # is depth 1).
 * @returns The updated section label.
 */
function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): string {
  const goesDeeper = headingDepth > sectionNumbers.length;

  if (goesDeeper) {
    // Pad every level down to the heading's depth with an initial 1.
    while (sectionNumbers.length < headingDepth) {
      sectionNumbers.push(1);
    }
  } else {
    // Truncate anything below this level (a no-op when already at this
    // depth), then bump this level's counter.
    sectionNumbers.splice(headingDepth);
    sectionNumbers[headingDepth - 1]++;
  }

  return sectionNumbers.join('-');
}
/**
 * Splits Markdown text into labeled fragments, one per top-level node of the
 * parsed document, and counts the tokens of each fragment.
 *
 * Labeling scheme:
 * - a YAML frontmatter node becomes `frontmatter`; its text is the parsed YAML
 *   re-serialized as pretty-printed JSON (an empty frontmatter block yields `{}`);
 * - a heading becomes `<section>-heading`, where `<section>` is the dash-joined
 *   section number maintained by `updateSectionNumbers`;
 * - every other node that serializes to non-empty Markdown becomes
 *   `<section>-content-<n>`, numbered from 1 within its section; content that
 *   appears before the first heading uses section `0`.
 *
 * Errors raised while loading the parser modules or while parsing frontmatter
 * with `yaml.load` are not caught here and propagate to the caller.
 *
 * @param markdownText - The input Markdown string.
 * @param model - Model name handed to `encodingForModel` to select the tokenizer used for `tokenCount`.
 * @returns The fragments in document order; an empty array when `markdownText` is not a string or is blank.
 */
export async function splitMarkdownIntoFragments(markdownText: string, model: TiktokenModel): Promise<MarkdownFragment[]> {
  const markdownFragments: MarkdownFragment[] = [];

  // Runtime guard: callers outside the type system may pass non-strings.
  if (typeof markdownText !== 'string' || markdownText.trim() === '') {
    return markdownFragments;
  }

  const encoder = encodingForModel(model);

  // The remark/unified packages are loaded through dynamicImport; the five
  // loads are independent of each other, so they are started together.
  const [remarkParseModule, remarkFrontmatterModule, remarkGfmModule, remarkStringifyModule, unifiedModule] = await Promise.all([
    dynamicImport<typeof RemarkParse>('remark-parse', __dirname),
    dynamicImport<typeof RemarkFrontmatter>('remark-frontmatter', __dirname),
    dynamicImport<typeof RemarkGfm>('remark-gfm', __dirname),
    dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname),
    dynamicImport<typeof Unified>('unified', __dirname),
  ]);
  const remarkParse = remarkParseModule.default;
  const remarkFrontmatter = remarkFrontmatterModule.default;
  const remarkGfm = remarkGfmModule.default;
  const remarkStringify = remarkStringifyModule.default;
  const unified = unifiedModule.unified;

  const parser = unified()
    .use(remarkParse)
    .use(remarkFrontmatter, ['yaml'])
    .use(remarkGfm); // Enable GFM extensions

  const stringifyOptions: RemarkStringify.Options = {
    bullet: '-', // Set list bullet to hyphen
    rule: '-', // Use hyphen for horizontal rules
  };
  const stringifier = unified()
    .use(remarkFrontmatter, ['yaml'])
    .use(remarkGfm)
    .use(remarkStringify, stringifyOptions);

  // Serializes a single node back to Markdown. The stringifier is typed for a
  // whole document root, hence the cast when passing an individual child node.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const toMarkdown = (node: unknown): string => stringifier.stringify(node as any).trim();

  // Appends a fragment, counting tokens of exactly the text that is stored.
  const addFragment = (label: string, type: string, text: string): void => {
    markdownFragments.push({
      label, type, text, tokenCount: encoder.encode(text).length,
    });
  };

  const sectionNumbers: number[] = [];
  let currentSectionLabel = '';
  // Number of content fragments emitted so far, keyed by section label.
  const contentCounters = new Map<string, number>();

  const parsedTree = parser.parse(markdownText);

  // Iterate over top-level nodes only, so nested content is emitted once as
  // part of its top-level parent rather than duplicated.
  for (const node of parsedTree.children) {
    if (node.type === 'yaml') {
      // NOTE(review): yaml.load uses a safe schema by default only in
      // js-yaml >= 4 — confirm the installed version if markdownText can be untrusted.
      const loaded: unknown = yaml.load(node.value);
      // An empty frontmatter block loads as undefined, and JSON.stringify(undefined)
      // returns undefined rather than a string; serialize an empty mapping instead.
      const frontmatter = loaded === undefined ? {} : loaded;
      addFragment('frontmatter', 'yaml', JSON.stringify(frontmatter, null, 2));
    }
    else if (node.type === 'heading') {
      currentSectionLabel = updateSectionNumbers(sectionNumbers, node.depth);
      addFragment(`${currentSectionLabel}-heading`, node.type, toMarkdown(node));
    }
    else {
      // Process non-heading content individually
      const contentMarkdown = toMarkdown(node);
      if (contentMarkdown !== '') {
        // The label is still empty before the first heading; such content is
        // filed under section "0".
        const sectionKey = currentSectionLabel || '0';
        const contentIndex = (contentCounters.get(sectionKey) ?? 0) + 1;
        contentCounters.set(sectionKey, contentIndex);
        addFragment(`${sectionKey}-content-${contentIndex}`, node.type, contentMarkdown);
      }
    }
  }

  return markdownFragments;
}
|