nHigashiWeseek 1 год назад
Родитель
Коммит
345287f8fb

+ 2 - 4
packages/markdown-splitter/src/services/markdown-token-splitter.ts

@@ -31,7 +31,7 @@ function groupMarkdownFragments(
 
     if (!hasNextLevelPrefix) {
       // If there is no prefix that starts with the current prefix, group the chunks directly
-      let matchingFragments = markdownFragments.filter(fragment => fragment.label === prefix);
+      let matchingFragments = markdownFragments.filter(fragment => fragment.label.startsWith(prefix));
 
       // Add parent heading if it exists
       const parts = prefix.split('-');
@@ -179,9 +179,7 @@ export async function splitMarkdownIntoChunks(
             });
           }
           else {
-            const chunk = headingText
-              ? `${headingText}\n\n${fragment.text}`
-              : `${fragment.text}`;
+            const chunk = `${headingText}\n\n${fragment.text}`;
             chunks.push(chunk);
           }
         }

+ 0 - 17
packages/markdown-splitter/test/index.spec.ts

@@ -698,21 +698,4 @@ Another section with a shorter header, but enough content to ensure proper chunk
     expect(result).toHaveLength(1);
     expect(result[0]).toBe(markdownText);
   });
-
-  test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
-    const markdownWithContentBeforeHeading = `
-This is a short paragraph
-
-# Header 1
-${repeatedText}
-    `;
-
-    const maxToken = 800;
-
-    const result = await splitMarkdownIntoChunks(markdownWithContentBeforeHeading, MODEL, maxToken);
-    result.forEach((chunk) => {
-      const tokenCount = encoder.encode(chunk).length;
-      expect(tokenCount).toBeLessThanOrEqual(maxToken * 1.1);
-    });
-  });
 });