// markdown-token-splitter.spec.ts (4.6 KB)
  1. import type { TiktokenModel } from 'js-tiktoken';
  2. import { encodingForModel } from 'js-tiktoken';
  3. import { splitMarkdownIntoChunks } from './markdown-token-splitter';
// Tokenizer model passed to splitMarkdownIntoChunks in every test and used to build the encoder below.
const MODEL: TiktokenModel = 'gpt-4';
// Encoder for MODEL; the suite uses it to count the tokens of each returned chunk.
const encoder = encodingForModel(MODEL);
  6. describe('splitMarkdownIntoChunks', () => {
  7. const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);
  8. const markdown = `---
  9. title: Test Document
  10. author: John Doe
  11. ---
  12. ${repeatedText}
  13. # Header 1
  14. This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
  15. This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
  16. ## Header 1-1
  17. This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
  18. ### Header 1-1-1
  19. This is the first paragraph under header 1-1-1. The content is nested deeper,
  20. making sure that the chunking algorithm works properly with multiple levels of headers.
  21. This is another paragraph under header 1-1-1, continuing the content at this deeper level.
  22. #### Header 1-1-1-1
  23. Now we have reached the fourth level of headers. The text here should also be properly chunked and grouped with its parent headers.
  24. This is another paragraph under header 1-1-1-1. It should be grouped with the correct higher-level headers.
  25. # Header 2
  26. Here is some content under header 2. This section should also be sufficiently long to ensure that the token count threshold is reached in the test.
  27. ## Header 2-1
  28. ${repeatedText}
  29. ${repeatedText}
  30. Another sub-header under header 2 with text for testing chunking behavior. This is a fairly lengthy paragraph as well.
  31. We now have a fourth-level sub-header under header 2-1. This ensures that the chunking logic can handle deeply nested content.
  32. ### Header 2-1-1
  33. Here is another paragraph under header 2-1-1. This paragraph is part of a more deeply nested section.
  34. # Header 3
  35. Continuing with more headers and content to make sure the markdown document is sufficiently large. This is a new header with more paragraphs under it.
  36. ### Header 3-1
  37. This is a sub-header under header 3. The content here continues to grow, ensuring that the markdown is long enough to trigger multiple chunks.
  38. #### Header 3-1-1
  39. Here is a fourth-level sub-header under header 3-1. This paragraph is designed to create a larger markdown file for testing purposes.
  40. `;
  41. test('Each chunk should not exceed the specified token count', async() => {
  42. const maxToken = 800;
  43. const result = await splitMarkdownIntoChunks(markdown, MODEL, maxToken);
  44. result.forEach((chunk) => {
  45. const tokenCount = encoder.encode(chunk).length;
  46. expect(tokenCount).toBeLessThanOrEqual(maxToken * 1.1);
  47. });
  48. });
  49. test('Each chunk should include the relevant top-level header', async() => {
  50. const result = await splitMarkdownIntoChunks(markdown, MODEL, 800);
  51. result.forEach((chunk) => {
  52. const containsHeader1 = chunk.includes('# Header 1');
  53. const containsHeader2 = chunk.includes('# Header 2');
  54. const containsHeader3 = chunk.includes('# Header 3');
  55. const doesNotContainHash = !chunk.includes('# ');
  56. expect(containsHeader1 || containsHeader2 || containsHeader3 || doesNotContainHash).toBe(true);
  57. });
  58. });
  59. test('Should throw an error if a header exceeds half of maxToken size with correct error message', async() => {
  60. const maxToken = 800;
  61. const markdownWithLongHeader = `
  62. # Short Header 1
  63. This is the first paragraph under short header 1. It contains some text for testing purposes.
  64. ## ${repeatedText}
  65. This is the first paragraph under the long header. It contains text to ensure that the header length check is triggered if the header is too long.
  66. # Short Header 2
  67. Another section with a shorter header, but enough content to ensure proper chunking.
  68. `;
  69. try {
  70. await splitMarkdownIntoChunks(markdownWithLongHeader, MODEL, maxToken);
  71. }
  72. catch (error) {
  73. if (error instanceof Error) {
  74. expect(error.message).toContain('Heading token count is too large');
  75. }
  76. else {
  77. throw new Error('An unknown error occurred');
  78. }
  79. }
  80. });
  81. test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
  82. const markdownText = `
  83. # Header 1
  84. This is a short paragraph under header 1. It contains only a few sentences to ensure that the total token count remains under the maxToken limit.
  85. `;
  86. const maxToken = 800;
  87. const result = await splitMarkdownIntoChunks(markdownText, MODEL, maxToken);
  88. expect(result).toHaveLength(1);
  89. expect(result[0]).toBe(markdownText);
  90. });
  91. });