|
|
@@ -1,176 +1,89 @@
|
|
|
-import type { Tiktoken, TiktokenModel } from '@dqbd/tiktoken';
|
|
|
-import { encoding_for_model } from '@dqbd/tiktoken'; // eslint-disable-line
|
|
|
-import type { Root, Content, Heading } from 'mdast';
|
|
|
-
|
|
|
-/**
|
|
|
- * Interface representing a section
|
|
|
- */
|
|
|
-interface Section {
|
|
|
- heading: Heading | null;
|
|
|
- content: Content[];
|
|
|
-}
|
|
|
-
|
|
|
-/**
|
|
|
- * Function to recursively split Markdown content by header sections so that each section has a token count below the specified maximum
|
|
|
- *
|
|
|
- * @param model - The name of the model to use (e.g., 'gpt-4')
|
|
|
- * @param markdownContent - The Markdown content to split
|
|
|
- * @param maxTokens - The maximum number of tokens per section (default: 100)
|
|
|
- * @returns An array of split Markdown sections
|
|
|
- */
|
|
|
-export async function splitMarkdownByTokens(
|
|
|
- model: TiktokenModel,
|
|
|
- markdownContent: string,
|
|
|
- maxTokens = 100,
|
|
|
-): Promise<string[]> {
|
|
|
- // Obtain encoding based on the model
|
|
|
- const encoding: Tiktoken = encoding_for_model(model);
|
|
|
-
|
|
|
- // Dynamically import remark-parse
|
|
|
- const remarkParse = (await import('remark-parse')).default;
|
|
|
- const remarkStringify = (await import('remark-stringify')).default;
|
|
|
- const unified = (await import('unified')).unified;
|
|
|
-
|
|
|
- // Parse Markdown into AST
|
|
|
- const processor = unified().use(remarkParse);
|
|
|
- const tree = processor.parse(markdownContent) as Root;
|
|
|
-
|
|
|
- /**
|
|
|
- * Function to stringify a node
|
|
|
- * @param node - The node to stringify
|
|
|
- * @returns The Markdown string of the node
|
|
|
- */
|
|
|
- const stringify = (node: Root): string => {
|
|
|
- return "aaa"; // eslint-disable-line
|
|
|
- };
|
|
|
-
|
|
|
- /**
|
|
|
- * Function to get the token count of a text
|
|
|
- * @param text - The text to calculate token count for
|
|
|
- * @returns The number of tokens
|
|
|
- */
|
|
|
- const getTokenCount = (text: string): number => {
|
|
|
- return encoding.encode(text).length;
|
|
|
- };
|
|
|
-
|
|
|
- /**
|
|
|
- * Function to split nodes into sections based on headers
|
|
|
- * @param nodes - The array of nodes to split
|
|
|
- * @returns An array of sections
|
|
|
- */
|
|
|
- const splitSections = (nodes: Content[]): Section[] => {
|
|
|
- const sections: Section[] = [];
|
|
|
- let currentSection: Section = { heading: null, content: [] };
|
|
|
-
|
|
|
- for (const node of nodes) {
|
|
|
- if (node.type === 'heading') {
|
|
|
- // Start a new section
|
|
|
- if (currentSection.heading || currentSection.content.length > 0) {
|
|
|
- sections.push(currentSection);
|
|
|
+import type { TiktokenModel } from 'js-tiktoken';
|
|
|
+import { encodingForModel } from 'js-tiktoken';
|
|
|
+import type { Root, Content } from 'mdast';
|
|
|
+import remarkParse from 'remark-parse';
|
|
|
+import remarkStringify from 'remark-stringify';
|
|
|
+import { unified } from 'unified';
|
|
|
+
|
|
|
+export function splitMarkdownByTokens(
|
|
|
+ markdownString: string,
|
|
|
+ maxTokens: number,
|
|
|
+ modelName: TiktokenModel,
|
|
|
+): string[] {
|
|
|
+ // Parse the markdown into an AST
|
|
|
+ const tree = unified().use(remarkParse).parse(markdownString) as Root;
|
|
|
+ const encoding = encodingForModel(modelName);
|
|
|
+
|
|
|
+ function countTokens(text: string): number {
|
|
|
+ const tokens = encoding.encode(text);
|
|
|
+ return tokens.length;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Recursively split sections
|
|
|
+ function splitSectionRecursively(nodes: Content[]): Content[][] {
|
|
|
+ const sections: Content[][] = [];
|
|
|
+ let currentSection: Content[] = [];
|
|
|
+
|
|
|
+ for (let i = 0; i < nodes.length; i++) {
|
|
|
+ const node = nodes[i];
|
|
|
+ currentSection.push(node);
|
|
|
+
|
|
|
+ const markdown = unified()
|
|
|
+ .use(remarkStringify)
|
|
|
+ .stringify({ type: 'root', children: currentSection });
|
|
|
+ const tokenCount = countTokens(markdown);
|
|
|
+
|
|
|
+ if (tokenCount > maxTokens) {
|
|
|
+ // If the token count exceeds the limit, treat the nodes up to the previous one as a section
|
|
|
+ currentSection.pop();
|
|
|
+ if (currentSection.length > 0) {
|
|
|
+ sections.push([...currentSection]);
|
|
|
+ }
|
|
|
+ // Start a new section from the current node
|
|
|
+ currentSection = [node];
|
|
|
+
|
|
|
+ // If a single node exceeds maxTokens, add it as its own section
|
|
|
+ const singleNodeMarkdown = unified()
|
|
|
+ .use(remarkStringify)
|
|
|
+ .stringify({ type: 'root', children: [node] });
|
|
|
+ const singleNodeTokenCount = countTokens(singleNodeMarkdown);
|
|
|
+ if (singleNodeTokenCount > maxTokens) {
|
|
|
+ sections.push([node]);
|
|
|
+ currentSection = [];
|
|
|
}
|
|
|
- currentSection = { heading: node as Heading, content: [] };
|
|
|
- }
|
|
|
- else {
|
|
|
- currentSection.content.push(node);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // Add the last section
|
|
|
- if (currentSection.heading || currentSection.content.length > 0) {
|
|
|
- sections.push(currentSection);
|
|
|
- }
|
|
|
-
|
|
|
- return sections;
|
|
|
- };
|
|
|
-
|
|
|
- /**
|
|
|
- * Function to recursively process sections
|
|
|
- * @param sections - The array of sections to process
|
|
|
- * @returns An array of split Markdown strings
|
|
|
- */
|
|
|
- const processSections = (sections: Section[]): string[] => {
|
|
|
- const results: string[] = [];
|
|
|
-
|
|
|
- for (const section of sections) {
|
|
|
- const nodes: Content[] = [];
|
|
|
- if (section.heading) {
|
|
|
- nodes.push(section.heading);
|
|
|
}
|
|
|
- nodes.push(...section.content);
|
|
|
-
|
|
|
- const subtree: Root = { type: 'root', children: nodes };
|
|
|
- const content = stringify(subtree);
|
|
|
- const tokenCount = getTokenCount(content);
|
|
|
|
|
|
- if (tokenCount <= maxTokens) {
|
|
|
- results.push(content);
|
|
|
- }
|
|
|
- else if (section.content.some(child => child.type === 'heading')) {
|
|
|
- // Split into subsections
|
|
|
- const subsections = splitSections(section.content);
|
|
|
- results.push(...processSections(subsections));
|
|
|
- }
|
|
|
- else {
|
|
|
- // Split by paragraphs
|
|
|
- const paragraphs = 'sass';
|
|
|
- results.push(...paragraphs);
|
|
|
+ // If it's the last node, add the section
|
|
|
+ if (i === nodes.length - 1 && currentSection.length > 0) {
|
|
|
+ sections.push([...currentSection]);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- return results;
|
|
|
- };
|
|
|
-
|
|
|
- /**
|
|
|
- * Function to split nodes by paragraphs
|
|
|
- * @param nodes - The array of nodes to split
|
|
|
- * @returns An array of split Markdown strings
|
|
|
- */
|
|
|
- const splitByParagraphs = (nodes: Content[]): string[] => {
|
|
|
- const results: string[] = [];
|
|
|
- let currentNodes: Content[] = [];
|
|
|
- let currentTokenCount = 0;
|
|
|
-
|
|
|
- for (const node of nodes) {
|
|
|
- const nodeContent = stringify({ type: 'root', children: [node] });
|
|
|
- const nodeTokenCount = getTokenCount(nodeContent);
|
|
|
-
|
|
|
- if (currentTokenCount + nodeTokenCount <= maxTokens) {
|
|
|
- currentNodes.push(node);
|
|
|
- currentTokenCount += nodeTokenCount;
|
|
|
+ // Recursively split each section
|
|
|
+ const recursivelySplitSections: Content[][] = [];
|
|
|
+ for (const section of sections) {
|
|
|
+ // If the section contains child headings, split further
|
|
|
+ const hasHeading = section.some(node => node.type === 'heading');
|
|
|
+ if (hasHeading && countTokens(unified().use(remarkStringify).stringify({ type: 'root', children: section })) > maxTokens) {
|
|
|
+ // Recursively split child nodes
|
|
|
+ const childSections = splitSectionRecursively(section);
|
|
|
+ recursivelySplitSections.push(...childSections);
|
|
|
}
|
|
|
else {
|
|
|
- if (currentNodes.length > 0) {
|
|
|
- const chunk = stringify({ type: 'root', children: currentNodes });
|
|
|
- results.push(chunk);
|
|
|
- currentNodes = [];
|
|
|
- currentTokenCount = 0;
|
|
|
- }
|
|
|
- if (nodeTokenCount > maxTokens) {
|
|
|
- // If a single node exceeds maxTokens, add it as is
|
|
|
- results.push(nodeContent);
|
|
|
- }
|
|
|
- else {
|
|
|
- currentNodes.push(node);
|
|
|
- currentTokenCount = nodeTokenCount;
|
|
|
- }
|
|
|
+ recursivelySplitSections.push(section);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (currentNodes.length > 0) {
|
|
|
- const chunk = stringify({ type: 'root', children: currentNodes });
|
|
|
- results.push(chunk);
|
|
|
- }
|
|
|
-
|
|
|
- return results;
|
|
|
- };
|
|
|
+ return recursivelySplitSections;
|
|
|
+ }
|
|
|
|
|
|
- // Get initial sections
|
|
|
- const initialSections = splitSections(tree.children);
|
|
|
- // Process sections
|
|
|
- const result = processSections(initialSections);
|
|
|
+ // Recursively split the AST's child nodes
|
|
|
+ const splitSections = splitSectionRecursively(tree.children);
|
|
|
|
|
|
- // Free the encoding
|
|
|
- encoding.free();
|
|
|
+ // Convert the split sections back into markdown strings
|
|
|
+ const markdownSections = splitSections.map(sectionNodes => unified()
|
|
|
+ .use(remarkStringify)
|
|
|
+ .stringify({ type: 'root', children: sectionNodes }));
|
|
|
|
|
|
- return result;
|
|
|
+ return markdownSections;
|
|
|
}
|