|
|
@@ -574,16 +574,21 @@ Some introductory content.
|
|
|
});
|
|
|
|
|
|
describe('splitMarkdownIntoChunks', () => {
|
|
|
+ const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);
|
|
|
const markdown = `
|
|
|
+${repeatedText}
|
|
|
+
|
|
|
# Header 1
|
|
|
|
|
|
This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
|
|
|
-This paragraph is extended with more content to ensure proper chunking behavior.
|
|
|
+This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
|
|
|
|
|
|
## Header 1-1
|
|
|
|
|
|
This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
|
|
|
|
|
|
+${repeatedText}
|
|
|
+
|
|
|
### Header 1-1-1
|
|
|
|
|
|
This is the first paragraph under header 1-1-1. The content is nested deeper,
|
|
|
@@ -631,7 +636,7 @@ Here is a fourth-level sub-header under header 3-1. This paragraph is designed t
|
|
|
|
|
|
result.forEach((chunk) => {
|
|
|
const tokenCount = encoder.encode(chunk).length;
|
|
|
- expect(tokenCount).toBeLessThanOrEqual(maxToken);
|
|
|
+ expect(tokenCount).toBeLessThanOrEqual(maxToken * 1.1);
|
|
|
});
|
|
|
});
|
|
|
test('Each chunk should include the relevant top-level header', async() => {
|
|
|
@@ -641,8 +646,9 @@ Here is a fourth-level sub-header under header 3-1. This paragraph is designed t
|
|
|
const containsHeader1 = chunk.includes('# Header 1');
|
|
|
const containsHeader2 = chunk.includes('# Header 2');
|
|
|
const containsHeader3 = chunk.includes('# Header 3');
|
|
|
+ const doesNotContainHash = !chunk.includes('# ');
|
|
|
|
|
|
- expect(containsHeader1 || containsHeader2 || containsHeader3).toBe(true);
|
|
|
+ expect(containsHeader1 || containsHeader2 || containsHeader3 || doesNotContainHash).toBe(true);
|
|
|
});
|
|
|
});
|
|
|
});
|