1 year ago · e0ffd83165
--- a/packages/markdown-splitter/src/services/markdown-splitter.spec.ts
+++ b/packages/markdown-splitter/src/services/markdown-splitter.spec.ts
@@ -1,7 +1,6 @@
 
															 import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
														
 
															-import type { MarkdownFragment } from '~/index';
														
 
															-import { splitMarkdownIntoChunks, splitMarkdownIntoFragments } from '~/index';
														
 
															+import { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';
														
 
															 const MODEL: TiktokenModel = 'gpt-4';
														
 
															 const encoder = encodingForModel(MODEL);
														
@@ -572,130 +571,3 @@ Some introductory content.
 
															     expect(result).toEqual(expected);
														
 
															   });
														
 
															 });
														
 
															-
														
 
															-describe('splitMarkdownIntoChunks', () => {
														
 
															-  const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);
														
 
															-  const markdown = `---
														
 
															-title: Test Document
														
 
															-author: John Doe
														
 
															----
														
 
															-
														
 
															-${repeatedText}
														
 
															-
														
 
															-# Header 1
														
 
															-
														
 
															-This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
														
 
															-This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
														
 
															-
														
 
															-## Header 1-1
														
 
															-
														
 
															-This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
														
 
															-
														
 
															-
														
 
															-### Header 1-1-1
														
 
															-
														
 
															-This is the first paragraph under header 1-1-1. The content is nested deeper,
														
 
															-making sure that the chunking algorithm works properly with multiple levels of headers.
														
 
															-
														
 
															-This is another paragraph under header 1-1-1, continuing the content at this deeper level.
														
 
															-
														
 
															-#### Header 1-1-1-1
														
 
															-
														
 
															-Now we have reached the fourth level of headers. The text here should also be properly chunked and grouped with its parent headers.
														
 
															-
														
 
															-This is another paragraph under header 1-1-1-1. It should be grouped with the correct higher-level headers.
														
 
															-
														
 
															-# Header 2
														
 
															-
														
 
															-Here is some content under header 2. This section should also be sufficiently long to ensure that the token count threshold is reached in the test.
														
 
															-
														
 
															-## Header 2-1
														
 
															-
														
 
															-${repeatedText}
														
 
															-
														
 
															-${repeatedText}
														
 
															-
														
 
															-Another sub-header under header 2 with text for testing chunking behavior. This is a fairly lengthy paragraph as well.
														
 
															-
														
 
															-We now have a fourth-level sub-header under header 2-1. This ensures that the chunking logic can handle deeply nested content.
														
 
															-
														
 
															-### Header 2-1-1
														
 
															-
														
 
															-Here is another paragraph under header 2-1-1. This paragraph is part of a more deeply nested section.
														
 
															-
														
 
															-# Header 3
														
 
															-
														
 
															-Continuing with more headers and content to make sure the markdown document is sufficiently large. This is a new header with more paragraphs under it.
														
 
															-
														
 
															-### Header 3-1
														
 
															-
														
 
															-This is a sub-header under header 3. The content here continues to grow, ensuring that the markdown is long enough to trigger multiple chunks.
														
 
															-
														
 
															-#### Header 3-1-1
														
 
															-
														
 
															-Here is a fourth-level sub-header under header 3-1. This paragraph is designed to create a larger markdown file for testing purposes.
														
 
															-`;
														
 
															-  test('Each chunk should not exceed the specified token count', async() => {
														
 
															-    const maxToken = 800;
														
 
															-    const result = await splitMarkdownIntoChunks(markdown, MODEL, maxToken);
														
 
															-
														
 
															-    result.forEach((chunk) => {
														
 
															-      const tokenCount = encoder.encode(chunk).length;
														
 
															-      expect(tokenCount).toBeLessThanOrEqual(maxToken * 1.1);
														
 
															-    });
														
 
															-  });
														
 
															-  test('Each chunk should include the relevant top-level header', async() => {
														
 
															-    const result = await splitMarkdownIntoChunks(markdown, MODEL, 800);
														
 
															-
														
 
															-    result.forEach((chunk) => {
														
 
															-      const containsHeader1 = chunk.includes('# Header 1');
														
 
															-      const containsHeader2 = chunk.includes('# Header 2');
														
 
															-      const containsHeader3 = chunk.includes('# Header 3');
														
 
															-      const doesNotContainHash = !chunk.includes('# ');
														
 
															-
														
 
															-      expect(containsHeader1 || containsHeader2 || containsHeader3 || doesNotContainHash).toBe(true);
														
 
															-    });
														
 
															-  });
														
 
															-  test('Should throw an error if a header exceeds half of maxToken size with correct error message', async() => {
														
 
															-    const maxToken = 800;
														
 
															-    const markdownWithLongHeader = `
														
 
															-# Short Header 1
														
 
															-
														
 
															-This is the first paragraph under short header 1. It contains some text for testing purposes.
														
 
															-
														
 
															-## ${repeatedText}
														
 
															-
														
 
															-This is the first paragraph under the long header. It contains text to ensure that the header length check is triggered if the header is too long.
														
 
															-
														
 
															-# Short Header 2
														
 
															-
														
 
															-Another section with a shorter header, but enough content to ensure proper chunking.
														
 
															-`;
														
 
															-
														
 
															-    try {
														
 
															-      await splitMarkdownIntoChunks(markdownWithLongHeader, MODEL, maxToken);
														
 
															-    }
														
 
															-    catch (error) {
														
 
															-      if (error instanceof Error) {
														
 
															-        expect(error.message).toContain('Heading token count is too large');
														
 
															-      }
														
 
															-      else {
														
 
															-        throw new Error('An unknown error occurred');
														
 
															-      }
														
 
															-    }
														
 
															-  });
														
 
															-
														
 
															-  test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
														
 
															-    const markdownText = `
														
 
															-    # Header 1
														
 
															-    This is a short paragraph under header 1. It contains only a few sentences to ensure that the total token count remains under the maxToken limit.
														
 
															-    `;
														
 
															-
														
 
															-    const maxToken = 800;
														
 
															-
														
 
															-    const result = await splitMarkdownIntoChunks(markdownText, MODEL, maxToken);
														
 
															-
														
 
															-    expect(result).toHaveLength(1);
														
 
															-    expect(result[0]).toBe(markdownText);
														
 
															-  });
														
 
															-});
														
--- a/packages/markdown-splitter/src/services/markdown-token-splitter.spec.ts
+++ b/packages/markdown-splitter/src/services/markdown-token-splitter.spec.ts
@@ -0,0 +1,134 @@
 
															+import type { TiktokenModel } from 'js-tiktoken';
														
 
															+import { encodingForModel } from 'js-tiktoken';
														
 
															+
														
 
															+import { splitMarkdownIntoChunks } from './markdown-token-splitter';
														
 
															+
														
 
															+const MODEL: TiktokenModel = 'gpt-4';
														
 
															+const encoder = encodingForModel(MODEL);
														
 
															+
														
 
															+describe('splitMarkdownIntoChunks', () => {
														
 
															+  const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);
														
 
															+  const markdown = `---
														
 
															+title: Test Document
														
 
															+author: John Doe
														
 
															+---
														
 
															+
														
 
															+${repeatedText}
														
 
															+
														
 
															+# Header 1
														
 
															+
														
 
															+This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
														
 
															+This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
														
 
															+
														
 
															+## Header 1-1
														
 
															+
														
 
															+This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
														
 
															+
														
 
															+
														
 
															+### Header 1-1-1
														
 
															+
														
 
															+This is the first paragraph under header 1-1-1. The content is nested deeper,
														
 
															+making sure that the chunking algorithm works properly with multiple levels of headers.
														
 
															+
														
 
															+This is another paragraph under header 1-1-1, continuing the content at this deeper level.
														
 
															+
														
 
															+#### Header 1-1-1-1
														
 
															+
														
 
															+Now we have reached the fourth level of headers. The text here should also be properly chunked and grouped with its parent headers.
														
 
															+
														
 
															+This is another paragraph under header 1-1-1-1. It should be grouped with the correct higher-level headers.
														
 
															+
														
 
															+# Header 2
														
 
															+
														
 
															+Here is some content under header 2. This section should also be sufficiently long to ensure that the token count threshold is reached in the test.
														
 
															+
														
 
															+## Header 2-1
														
 
															+
														
 
															+${repeatedText}
														
 
															+
														
 
															+${repeatedText}
														
 
															+
														
 
															+Another sub-header under header 2 with text for testing chunking behavior. This is a fairly lengthy paragraph as well.
														
 
															+
														
 
															+We now have a fourth-level sub-header under header 2-1. This ensures that the chunking logic can handle deeply nested content.
														
 
															+
														
 
															+### Header 2-1-1
														
 
															+
														
 
															+Here is another paragraph under header 2-1-1. This paragraph is part of a more deeply nested section.
														
 
															+
														
 
															+# Header 3
														
 
															+
														
 
															+Continuing with more headers and content to make sure the markdown document is sufficiently large. This is a new header with more paragraphs under it.
														
 
															+
														
 
															+### Header 3-1
														
 
															+
														
 
															+This is a sub-header under header 3. The content here continues to grow, ensuring that the markdown is long enough to trigger multiple chunks.
														
 
															+
														
 
															+#### Header 3-1-1
														
 
															+
														
 
															+Here is a fourth-level sub-header under header 3-1. This paragraph is designed to create a larger markdown file for testing purposes.
														
 
															+`;
														
 
															+  test('Each chunk should not exceed the specified token count', async() => {
														
 
															+    const maxToken = 800;
														
 
															+    const result = await splitMarkdownIntoChunks(markdown, MODEL, maxToken);
														
 
															+
														
 
															+    result.forEach((chunk) => {
														
 
															+      const tokenCount = encoder.encode(chunk).length;
														
 
															+      expect(tokenCount).toBeLessThanOrEqual(maxToken * 1.1);
														
 
															+    });
														
 
															+  });
														
 
															+  test('Each chunk should include the relevant top-level header', async() => {
														
 
															+    const result = await splitMarkdownIntoChunks(markdown, MODEL, 800);
														
 
															+
														
 
															+    result.forEach((chunk) => {
														
 
															+      const containsHeader1 = chunk.includes('# Header 1');
														
 
															+      const containsHeader2 = chunk.includes('# Header 2');
														
 
															+      const containsHeader3 = chunk.includes('# Header 3');
														
 
															+      const doesNotContainHash = !chunk.includes('# ');
														
 
															+
														
 
															+      expect(containsHeader1 || containsHeader2 || containsHeader3 || doesNotContainHash).toBe(true);
														
 
															+    });
														
 
															+  });
														
 
															+  test('Should throw an error if a header exceeds half of maxToken size with correct error message', async() => {
														
 
															+    const maxToken = 800;
														
 
															+    const markdownWithLongHeader = `
														
 
															+# Short Header 1
														
 
															+
														
 
															+This is the first paragraph under short header 1. It contains some text for testing purposes.
														
 
															+
														
 
															+## ${repeatedText}
														
 
															+
														
 
															+This is the first paragraph under the long header. It contains text to ensure that the header length check is triggered if the header is too long.
														
 
															+
														
 
															+# Short Header 2
														
 
															+
														
 
															+Another section with a shorter header, but enough content to ensure proper chunking.
														
 
															+`;
														
 
															+
														
 
															+    try {
														
 
															+      await splitMarkdownIntoChunks(markdownWithLongHeader, MODEL, maxToken);
														
 
															+    }
														
 
															+    catch (error) {
														
 
															+      if (error instanceof Error) {
														
 
															+        expect(error.message).toContain('Heading token count is too large');
														
 
															+      }
														
 
															+      else {
														
 
															+        throw new Error('An unknown error occurred');
														
 
															+      }
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
														
 
															+    const markdownText = `
														
 
															+    # Header 1
														
 
															+    This is a short paragraph under header 1. It contains only a few sentences to ensure that the total token count remains under the maxToken limit.
														
 
															+    `;
														
 
															+
														
 
															+    const maxToken = 800;
														
 
															+
														
 
															+    const result = await splitMarkdownIntoChunks(markdownText, MODEL, maxToken);
														
 
															+
														
 
															+    expect(result).toHaveLength(1);
														
 
															+    expect(result[0]).toBe(markdownText);
														
 
															+  });
														
 
															+});