nHigashiWeseek 1 год назад
Родитель
Commit
4f5d584808

+ 11 - 11
packages/markdown-splitter/src/services/markdown-splitter.ts

@@ -8,9 +8,9 @@ import type { Options as StringifyOptions } from 'remark-stringify';
 import remarkStringify from 'remark-stringify';
 import { unified } from 'unified';
 
-export type Chunk = {
+export type MarkdownFragment = {
   label: string;
-  type?: string;
+  type: string;
   text: string;
   tokenCount: number;
 };
@@ -42,19 +42,19 @@ function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): s
 }
 
 /**
- * Splits Markdown text into labeled chunks using remark-parse and remark-stringify,
+ * Splits Markdown text into labeled markdownFragments using remark-parse and remark-stringify,
  * processing each content node separately and labeling them as 1-content-1, 1-content-2, etc.
  * @param markdownText - The input Markdown string.
- * @returns An array of labeled chunks.
+ * @returns An array of labeled markdownFragments.
  */
-export async function splitMarkdownIntoChunks(markdownText: string, model: TiktokenModel): Promise<Chunk[]> {
-  const chunks: Chunk[] = [];
+export async function splitMarkdownIntoFragments(markdownText: string, model: TiktokenModel): Promise<MarkdownFragment[]> {
+  const markdownFragments: MarkdownFragment[] = [];
   const sectionNumbers: number[] = [];
   let currentSectionLabel = '';
   const contentCounters: Record<string, number> = {};
 
   if (typeof markdownText !== 'string' || markdownText.trim() === '') {
-    return chunks;
+    return markdownFragments;
   }
 
   const encoder = encodingForModel(model);
@@ -83,7 +83,7 @@ export async function splitMarkdownIntoChunks(markdownText: string, model: Tikto
       const frontmatter = yaml.load(node.value) as Record<string, unknown>;
       const frontmatterText = JSON.stringify(frontmatter, null, 2);
       const tokenCount = encoder.encode(frontmatterText).length;
-      chunks.push({
+      markdownFragments.push({
         label: 'frontmatter',
         type: 'yaml',
         text: frontmatterText,
@@ -96,7 +96,7 @@ export async function splitMarkdownIntoChunks(markdownText: string, model: Tikto
 
       const headingMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
       const tokenCount = encoder.encode(headingMarkdown).length;
-      chunks.push({
+      markdownFragments.push({
         label: `${currentSectionLabel}-heading`, type: node.type, text: headingMarkdown, tokenCount,
       });
     }
@@ -115,12 +115,12 @@ export async function splitMarkdownIntoChunks(markdownText: string, model: Tikto
           ? `${currentSectionLabel}-content-${contentCounters[contentCountKey]}`
           : `0-content-${contentCounters[contentCountKey]}`;
         const tokenCount = encoder.encode(contentMarkdown).length;
-        chunks.push({
+        markdownFragments.push({
           label: contentLabel, type: node.type, text: contentMarkdown, tokenCount,
         });
       }
     }
   }
 
-  return chunks;
+  return markdownFragments;
 }

+ 60 - 76
packages/markdown-splitter/src/services/markdown-token-splitter.ts

@@ -1,14 +1,14 @@
 import type { TiktokenModel } from 'js-tiktoken';
 
-import { splitMarkdownIntoChunks, type Chunk } from './markdown-splitter';
+import { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';
 
-type GroupedChunks = { [prefix: string]: Chunk[] };
+type MarkdownFragmentGroups = MarkdownFragment[][] ;
 
 function assembleMarkdownRecursively(
-    chunks: Chunk[],
+    markdownFragments: MarkdownFragment[],
     maxToken: number,
-): GroupedChunks {
-  const labels = chunks.map(chunk => chunk.label);
+): MarkdownFragmentGroups {
+  const labels = markdownFragments.map(fragment => fragment.label);
 
   // Get a list of unique prefixes
   const uniquePrefixes: string[] = [...new Set(labels.map((label) => {
@@ -21,7 +21,7 @@ function assembleMarkdownRecursively(
 
 
   // Group chunks by prefix
-  const groupedChunks: GroupedChunks = {};
+  const fragmentGroupes: MarkdownFragmentGroups = [];
   let remainingPrefixes = [...uniquePrefixes];
 
   // Process chunks so that the total token count per level doesn't exceed maxToken
@@ -31,65 +31,65 @@ function assembleMarkdownRecursively(
 
     if (!hasNextLevelPrefix) {
       // If there is no prefix that starts with the current prefix, group the chunks directly
-      let strictMatchingChunks = chunks.filter(chunk => chunk.label === prefix);
+      let matchingFragments = markdownFragments.filter(fragment => fragment.label === prefix);
 
       // Add parent heading if it exists
       const parts = prefix.split('-');
       for (let i = 1; i < parts.length; i++) {
         const parentPrefix = parts.slice(0, i).join('-');
-        const parentHeading = chunks.find(chunk => chunk.label === `${parentPrefix}-heading`);
+        const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);
         if (parentHeading) {
-          strictMatchingChunks = [parentHeading, ...strictMatchingChunks]; // Add the heading at the front
+          matchingFragments = [parentHeading, ...matchingFragments]; // Add the heading at the front
         }
       }
 
-      groupedChunks[prefix] = strictMatchingChunks;
+      fragmentGroupes.push(matchingFragments);
     }
     else {
       // Filter chunks that start with the current prefix
-      let matchingChunks = chunks.filter(chunk => chunk.label.startsWith(prefix));
+      let matchingFragments = markdownFragments.filter(fragment => fragment.label.startsWith(prefix));
 
       // Add parent heading if it exists
       const parts = prefix.split('-');
       for (let i = 1; i < parts.length; i++) {
         const parentPrefix = parts.slice(0, i).join('-');
-        const parentHeading = chunks.find(chunk => chunk.label === `${parentPrefix}-heading`);
+        const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);
         if (parentHeading) {
-          matchingChunks = [parentHeading, ...matchingChunks];
+          matchingFragments = [parentHeading, ...matchingFragments];
         }
       }
 
       // Calculate total token count including parent headings
-      const totalTokenCount = matchingChunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0);
+      const totalTokenCount = matchingFragments.reduce((sum, fragment) => sum + fragment.tokenCount, 0);
 
       // If the total token count doesn't exceed maxToken, group the chunks
       if (totalTokenCount <= maxToken) {
-        groupedChunks[prefix] = matchingChunks;
+        fragmentGroupes.push(matchingFragments);
         remainingPrefixes = remainingPrefixes.filter(p => !p.startsWith(`${prefix}-`));
       }
       else {
         // If it exceeds maxToken, strictly filter chunks by the exact numeric prefix
-        const strictMatchingChunks = chunks.filter((chunk) => {
-          const match = chunk.label.match(/^\d+(-\d+)*(?=-)/);
+        const strictMatchingFragments = markdownFragments.filter((fragment) => {
+          const match = fragment.label.match(/^\d+(-\d+)*(?=-)/);
           return match && match[0] === prefix;
         });
 
         // Add parent heading if it exists
         for (let i = 1; i < parts.length; i++) {
           const parentPrefix = parts.slice(0, i).join('-');
-          const parentHeading = chunks.find(chunk => chunk.label === `${parentPrefix}-heading`);
+          const parentHeading = markdownFragments.find(fragment => fragment.label === `${parentPrefix}-heading`);
           if (parentHeading) {
-            strictMatchingChunks.unshift(parentHeading); // Add the heading at the front
+            strictMatchingFragments.unshift(parentHeading); // Add the heading at the front
           }
         }
 
-        groupedChunks[prefix] = strictMatchingChunks;
+        fragmentGroupes.push(strictMatchingFragments);
       }
     }
     remainingPrefixes.shift();
   }
 
-  return groupedChunks;
+  return fragmentGroupes;
 }
 
 // Function to group markdown into chunks based on token count
@@ -97,104 +97,88 @@ export async function assembleMarkdownIntoChunk(
     markdownText: string,
     model = 'gpt-4' as TiktokenModel,
     maxToken = 800,
-): Promise<GroupedChunks> {
+): Promise<string[]> {
   // Split markdown text into chunks
-  const chunks = await splitMarkdownIntoChunks(markdownText, model);
+  const markdownFragments = await splitMarkdownIntoFragments(markdownText, model);
+  const chunks = [] as string[];
 
   // Group the chunks based on token count
-  const groupedChunks = assembleMarkdownRecursively(chunks, maxToken);
-
-  for (const prefix of Object.keys(groupedChunks)) {
-    const chunks = groupedChunks[prefix];
+  const fragmentGroupes = assembleMarkdownRecursively(markdownFragments, maxToken);
 
+  fragmentGroupes.forEach((fragmentGroupe) => {
     // Calculate the total token count for each group
-    const totalTokenCount = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0);
+    const totalTokenCount = fragmentGroupe.reduce((sum, fragment) => sum + fragment.tokenCount, 0);
 
     // If the total token count doesn't exceed maxToken, combine the chunks into one
     if (totalTokenCount <= maxToken) {
-      const combinedContent = chunks.map((chunk, index) => {
-        const nextChunk = chunks[index + 1];
-        if (nextChunk) {
+      const chunk = fragmentGroupe.map((fragment, index) => {
+        const nextFragment = fragmentGroupe[index + 1];
+        if (nextFragment) {
           // If both the current and next chunks are headings, add a single newline
-          if (chunk.type === 'heading' && nextChunk.type === 'heading') {
-            return `${chunk.text}\n`;
+          if (fragment.type === 'heading' && nextFragment.type === 'heading') {
+            return `${fragment.text}\n`;
           }
           // Add two newlines for other cases
-          return `${chunk.text}\n\n`;
+          return `${fragment.text}\n\n`;
         }
-        return chunk.text; // No newlines for the last chunk
+        return fragment.text; // No newlines for the last chunk
       }).join('');
 
-      // Combine into one chunk while maintaining the token count
-      groupedChunks[prefix] = [{
-        label: prefix,
-        text: combinedContent,
-        tokenCount: totalTokenCount,
-      }];
+      chunks.push(chunk);
     }
     else {
       // If the total token count exceeds maxToken, split content
-      const headingChunks = chunks.filter(chunk => chunk.type === 'heading'); // Find all headings
-      const headingText = headingChunks.map(heading => heading.text).join('\n'); // Combine headings with one newline
-
-      const newGroupedChunks = []; // Create a new group of chunks
+      const headingFragments = fragmentGroupe.filter(fragment => fragment.type === 'heading'); // Find all headings
+      const headingText = headingFragments.map(heading => heading.text).join('\n'); // Combine headings with one newline
 
-      for (const chunk of chunks) {
-        if (chunk.label.includes('content')) {
+      for (const fragment of fragmentGroupe) {
+        if (fragment.label.includes('content')) {
           // Combine heading and paragraph content
-          const combinedText = `${headingText}\n\n${chunk.text}`;
-          const combinedTokenCount = headingChunks.reduce((sum, heading) => sum + heading.tokenCount, 0) + chunk.tokenCount;
+          const combinedTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0) + fragment.tokenCount;
           // Check if headingChunks alone exceed maxToken
-          const headingTokenCount = headingChunks.reduce((sum, heading) => sum + heading.tokenCount, 0);
-          if (headingTokenCount > maxToken) {
+          const headingTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0);
+
+          if (headingTokenCount > maxToken / 2) {
             console.error(`Heading token count exceeds maxToken. Heading token count: ${headingTokenCount}, maxToken: ${maxToken}`);
             break; // Exit the loop
           }
 
           // If the combined token count exceeds maxToken, split the content by character count
           if (combinedTokenCount > maxToken) {
-            const headingTokenCount = headingChunks.reduce((sum, heading) => sum + heading.tokenCount, 0);
+            const headingTokenCount = headingFragments.reduce((sum, heading) => sum + heading.tokenCount, 0);
             const remainingTokenCount = maxToken - headingTokenCount;
 
             // Calculate the total character count and token count
-            const totalCharCount = chunk.text.length;
-            const totalTokenCount = chunk.tokenCount;
+            const fragmentCharCount = fragment.text.length;
+            const fragmenTokenCount = fragment.tokenCount;
 
             // Calculate the character count for splitting
-            const charCountForSplit = Math.floor((remainingTokenCount / totalTokenCount) * totalCharCount);
+            const charCountForSplit = Math.floor((remainingTokenCount / fragmenTokenCount) * fragmentCharCount);
 
             // Split content based on character count
             const splitContents = [];
-            for (let i = 0; i < chunk.text.length; i += charCountForSplit) {
-              splitContents.push(chunk.text.slice(i, i + charCountForSplit));
+            for (let i = 0; i < fragment.text.length; i += charCountForSplit) {
+              splitContents.push(fragment.text.slice(i, i + charCountForSplit));
             }
 
             // Add each split content to the new group of chunks
-            splitContents.forEach((splitText, i) => {
-              newGroupedChunks.push({
-                label: `${chunk.label}-split-${i + 1}`,
-                text: `${headingText}\n\n${splitText}`,
-                tokenCount: remainingTokenCount,
-                type: 'split',
-              });
+            splitContents.forEach((splitText) => {
+              const chunk = headingText
+                ? `${headingText}\n\n${splitText}`
+                : `${splitText}`;
+              chunks.push(chunk);
             });
           }
           else {
-            // If the combined token count doesn't exceed maxToken, add as-is
-            newGroupedChunks.push({
-              label: chunk.label,
-              text: combinedText,
-              tokenCount: combinedTokenCount,
-              type: 'combined',
-            });
+            const chunk = headingText
+              ? `${headingText}\n\n${fragment.text}`
+              : `${fragment.text}`;
+            chunks.push(chunk);
           }
         }
       }
-
-      // Update grouped chunks with the new group
-      groupedChunks[prefix] = newGroupedChunks;
     }
-  }
+  });
 
-  return groupedChunks;
+  return chunks;
 }

+ 27 - 27
packages/markdown-splitter/test/index.spec.ts

@@ -1,17 +1,17 @@
 import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
 
-import type { Chunk } from '../src/services/markdown-splitter';
-import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
+import type { MarkdownFragment } from '../src/services/markdown-splitter';
+import { splitMarkdownIntoFragments } from '../src/services/markdown-splitter';
 
 const MODEL: TiktokenModel = 'gpt-4';
 const encoder = encodingForModel(MODEL);
 
-describe('splitMarkdownIntoChunks', () => {
+describe('splitMarkdownIntoFragments', () => {
 
   test('handles empty markdown string', async() => {
     const markdown = '';
-    const expected: Chunk[] = [];
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const expected: MarkdownFragment[] = [];
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -22,7 +22,7 @@ It spans multiple lines.
 Another paragraph.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '0-content-1',
         type: 'paragraph',
@@ -37,7 +37,7 @@ Another paragraph.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -53,7 +53,7 @@ Content under header 1.1.
 Content under header 2.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '1-heading',
         type: 'heading',
@@ -92,7 +92,7 @@ Content under header 2.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -116,7 +116,7 @@ Content of chapter 2.
 Content of section 2.1.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '0-content-1',
         type: 'paragraph',
@@ -185,7 +185,7 @@ Content of section 2.1.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -204,7 +204,7 @@ Content under header 1.2.
 Content under header 2.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '1-heading',
         type: 'heading',
@@ -255,7 +255,7 @@ Content under header 2.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -268,7 +268,7 @@ Content under header 1.
 Content under header 1.1.1.1.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '1-heading',
         type: 'heading',
@@ -295,7 +295,7 @@ Content under header 1.1.1.1.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -309,7 +309,7 @@ This is the second paragraph without a header.
 Content under header 1.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '0-content-1',
         type: 'paragraph',
@@ -336,7 +336,7 @@ Content under header 1.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -349,7 +349,7 @@ Content under header 1.
 ### Header 1.1.1
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '1-heading',
         type: 'heading',
@@ -370,7 +370,7 @@ Content under header 1.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -387,7 +387,7 @@ Another piece of content.
 Content under header 2.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '1-heading',
         type: 'heading',
@@ -426,7 +426,7 @@ Content under header 2.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -444,7 +444,7 @@ Content under header 1.
 Content under header 2.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '1-heading',
         type: 'heading',
@@ -477,7 +477,7 @@ Content under header 2.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -494,7 +494,7 @@ Additional content.
 Content under header 2.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: '1-heading',
         type: 'heading',
@@ -533,7 +533,7 @@ Content under header 2.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 
@@ -547,7 +547,7 @@ author: John Doe
 Some introductory content.
     `;
 
-    const expected: Chunk[] = [
+    const expected: MarkdownFragment[] = [
       {
         label: 'frontmatter',
         type: 'yaml',
@@ -568,7 +568,7 @@ Some introductory content.
       },
     ];
 
-    const result = await splitMarkdownIntoChunks(markdown, MODEL);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 });