| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718 |
- import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
- import type { MarkdownFragment } from '~/index';
- import { splitMarkdownIntoChunks, splitMarkdownIntoFragments } from '~/index';
// Tokenizer model shared by the whole suite; it is passed to the functions under test
// and used to build the reference `encoder` below.
- const MODEL: TiktokenModel = 'gpt-4';
// js-tiktoken encoder for MODEL, used to compute expected token counts in assertions.
- const encoder = encodingForModel(MODEL);
- describe('splitMarkdownIntoFragments', () => {
// An empty document is expected to produce no fragments at all.
test('handles empty markdown string', async() => {
  const noFragments: MarkdownFragment[] = [];
  const fragments = await splitMarkdownIntoFragments('', MODEL);

  expect(fragments).toEqual(noFragments);
});
// Headerless input: fragments are expected under the '0' label prefix ('0-content-N'),
// and two adjacent lines are expected to stay together in one paragraph fragment.
// NOTE(review): the expected two-paragraph split implies a blank line before
// 'Another paragraph.' in the fixture; none is visible in this view — confirm against the repo copy.
- test('handles markdown with only content and no headers', async() => {
- const markdown = `This is some content without any headers.
- It spans multiple lines.
- Another paragraph.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '0-content-1',
- type: 'paragraph',
- text: 'This is some content without any headers.\nIt spans multiple lines.',
- tokenCount: encoder.encode('This is some content without any headers.\nIt spans multiple lines.').length,
- },
- {
- label: '0-content-2',
- type: 'paragraph',
- text: 'Another paragraph.',
- tokenCount: encoder.encode('Another paragraph.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// Document that opens with a heading. Expected labels: headings are '<path>-heading' and
// their content '<path>-content-N'; the '##' under the first '#' gets path '1-1', and the
// second '#' restarts at '2'.
- test('handles markdown starting with a header', async() => {
- const markdown = `
- # Header 1
- Content under header 1.
- ## Header 1.1
- Content under header 1.1.
- # Header 2
- Content under header 2.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- tokenCount: encoder.encode('Content under header 1.').length,
- },
- {
- label: '1-1-heading',
- type: 'heading',
- text: '## Header 1.1',
- tokenCount: encoder.encode('## Header 1.1').length,
- },
- {
- label: '1-1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.1.',
- tokenCount: encoder.encode('Content under header 1.1.').length,
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Header 2',
- tokenCount: encoder.encode('# Header 2').length,
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content under header 2.',
- tokenCount: encoder.encode('Content under header 2.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// Heading levels that jump around (# -> ### -> ## -> # -> ##), preceded by headerless content.
// Expected labels: the intro is '0-content-1', the '###' directly under Chapter 1 is '1-1-1',
// and the following '##' is '1-2'.
- test('handles markdown with non-consecutive heading levels', async() => {
- const markdown = `
- Introduction without a header.
- # Chapter 1
- Content of chapter 1.
- ### Section 1.1.1
- Content of section 1.1.1.
- ## Section 1.2
- Content of section 1.2.
- # Chapter 2
- Content of chapter 2.
- ## Section 2.1
- Content of section 2.1.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '0-content-1',
- type: 'paragraph',
- text: 'Introduction without a header.',
- tokenCount: encoder.encode('Introduction without a header.').length,
- },
- {
- label: '1-heading',
- type: 'heading',
- text: '# Chapter 1',
- tokenCount: encoder.encode('# Chapter 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content of chapter 1.',
- tokenCount: encoder.encode('Content of chapter 1.').length,
- },
- {
- label: '1-1-1-heading',
- type: 'heading',
- text: '### Section 1.1.1',
- tokenCount: encoder.encode('### Section 1.1.1').length,
- },
- {
- label: '1-1-1-content-1',
- type: 'paragraph',
- text: 'Content of section 1.1.1.',
- tokenCount: encoder.encode('Content of section 1.1.1.').length,
- },
- {
- label: '1-2-heading',
- type: 'heading',
- text: '## Section 1.2',
- tokenCount: encoder.encode('## Section 1.2').length,
- },
- {
- label: '1-2-content-1',
- type: 'paragraph',
- text: 'Content of section 1.2.',
- tokenCount: encoder.encode('Content of section 1.2.').length,
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Chapter 2',
- tokenCount: encoder.encode('# Chapter 2').length,
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content of chapter 2.',
- tokenCount: encoder.encode('Content of chapter 2.').length,
- },
- {
- label: '2-1-heading',
- type: 'heading',
- text: '## Section 2.1',
- tokenCount: encoder.encode('## Section 2.1').length,
- },
- {
- label: '2-1-content-1',
- type: 'paragraph',
- text: 'Content of section 2.1.',
- tokenCount: encoder.encode('Content of section 2.1.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// A '#' followed directly by a '####': the expected label for the deep heading is '1-1-1-1',
// and the later '##' is expected as '1-2'.
- test('handles markdown with skipped heading levels', async() => {
- const markdown = `
- # Header 1
- Content under header 1.
- #### Header 1.1.1.1
- Content under header 1.1.1.1.
- ## Header 1.2
- Content under header 1.2.
- # Header 2
- Content under header 2.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- tokenCount: encoder.encode('Content under header 1.').length,
- },
- {
- label: '1-1-1-1-heading',
- type: 'heading',
- text: '#### Header 1.1.1.1',
- tokenCount: encoder.encode('#### Header 1.1.1.1').length,
- },
- {
- label: '1-1-1-1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.1.1.1.',
- tokenCount: encoder.encode('Content under header 1.1.1.1.').length,
- },
- {
- label: '1-2-heading',
- type: 'heading',
- text: '## Header 1.2',
- tokenCount: encoder.encode('## Header 1.2').length,
- },
- {
- label: '1-2-content-1',
- type: 'paragraph',
- text: 'Content under header 1.2.',
- tokenCount: encoder.encode('Content under header 1.2.').length,
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Header 2',
- tokenCount: encoder.encode('# Header 2').length,
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content under header 2.',
- tokenCount: encoder.encode('Content under header 2.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// Expects both headings to be emitted as 'heading' fragments, the second labelled '1-1-1-1'.
// NOTE(review): the title says "malformed headings", but the fixture as visible here contains
// well-formed headings that only skip levels (# then ####). The original fixture may differ
// from what is shown (e.g. in whitespace) — confirm against the repo copy.
- test('handles malformed headings', async() => {
- const markdown = `
- # Header 1
- Content under header 1.
- #### Header 1.1.1.1
- Content under header 1.1.1.1.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- tokenCount: encoder.encode('Content under header 1.').length,
- },
- {
- label: '1-1-1-1-heading',
- type: 'heading',
- text: '#### Header 1.1.1.1',
- tokenCount: encoder.encode('#### Header 1.1.1.1').length,
- },
- {
- label: '1-1-1-1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.1.1.1.',
- tokenCount: encoder.encode('Content under header 1.1.1.1.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// Two paragraphs ahead of the first heading are expected as '0-content-1' and '0-content-2',
// followed by the heading and its content under path '1'.
// NOTE(review): separate paragraph fragments imply a blank line between the two sentences in
// the fixture; none is visible in this view — confirm against the repo copy.
- test('handles multiple content blocks before any headers', async() => {
- const markdown = `
- This is the first paragraph without a header.
- This is the second paragraph without a header.
- # Header 1
- Content under header 1.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '0-content-1',
- type: 'paragraph',
- text: 'This is the first paragraph without a header.',
- tokenCount: encoder.encode('This is the first paragraph without a header.').length,
- },
- {
- label: '0-content-2',
- type: 'paragraph',
- text: 'This is the second paragraph without a header.',
- tokenCount: encoder.encode('This is the second paragraph without a header.').length,
- },
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- tokenCount: encoder.encode('Content under header 1.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// Headings with no body text: only 'heading' fragments are expected, with nested labels
// '1', '1-1' and '1-1-1' and no content fragments in between.
- test('handles markdown with only headers and no content', async() => {
- const markdown = `
- # Header 1
- ## Header 1.1
- ### Header 1.1.1
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-1-heading',
- type: 'heading',
- text: '## Header 1.1',
- tokenCount: encoder.encode('## Header 1.1').length,
- },
- {
- label: '1-1-1-heading',
- type: 'heading',
- text: '### Header 1.1.1',
- tokenCount: encoder.encode('### Header 1.1.1').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// Two consecutive content lines under '## Header 1.1' are expected to merge into a single
// paragraph fragment whose text joins them with '\n'.
- test('handles markdown with mixed content and headers', async() => {
- const markdown = `
- # Header 1
- Content under header 1.
- ## Header 1.1
- Content under header 1.1.
- Another piece of content.
- # Header 2
- Content under header 2.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- tokenCount: encoder.encode('Content under header 1.').length,
- },
- {
- label: '1-1-heading',
- type: 'heading',
- text: '## Header 1.1',
- tokenCount: encoder.encode('## Header 1.1').length,
- },
- {
- label: '1-1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.1.\nAnother piece of content.',
- tokenCount: encoder.encode('Content under header 1.1.\nAnother piece of content.').length,
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Header 2',
- tokenCount: encoder.encode('# Header 2').length,
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content under header 2.',
- tokenCount: encoder.encode('Content under header 2.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// A list following a paragraph is expected as its own fragment of type 'list'
// ('1-content-2'), with the nested item kept indented in the fragment text.
// NOTE(review): the expected text indents 'Subitem 1', but the fixture lines show no
// indentation or blank lines in this view — confirm the fixture's exact whitespace in the repo copy.
- test('preserves list indentation and reduces unnecessary line breaks', async() => {
- const markdown = `
- # Header 1
- Content under header 1.
- - Item 1
- - Subitem 1
- - Item 2
- # Header 2
- Content under header 2.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- tokenCount: encoder.encode('Content under header 1.').length,
- },
- {
- label: '1-content-2',
- type: 'list',
- text: '- Item 1\n - Subitem 1\n- Item 2',
- tokenCount: encoder.encode('- Item 1\n - Subitem 1\n- Item 2').length,
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Header 2',
- tokenCount: encoder.encode('# Header 2').length,
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content under header 2.',
- tokenCount: encoder.encode('Content under header 2.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// A fenced code block whose lines start with '#' is expected as one fragment of type 'code'
// (fences included in the text), not as headings. The content counter under Header 1 is
// expected to continue across it: '1-content-1', '1-content-2' (code), '1-content-3'.
- test('code blocks containing # are not treated as headings', async() => {
- const markdown = `
- # Header 1
- Some introductory content.
- \`\`\`
- # This is a comment with a # symbol
- Some code line
- \`\`\`
- Additional content.
- # Header 2
- Content under header 2.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Some introductory content.',
- tokenCount: encoder.encode('Some introductory content.').length,
- },
- {
- label: '1-content-2',
- type: 'code',
- text: '```\n# This is a comment with a # symbol\nSome code line\n```',
- tokenCount: encoder.encode('```\n# This is a comment with a # symbol\nSome code line\n```').length,
- },
- {
- label: '1-content-3',
- type: 'paragraph',
- text: 'Additional content.',
- tokenCount: encoder.encode('Additional content.').length,
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Header 2',
- tokenCount: encoder.encode('# Header 2').length,
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content under header 2.',
- tokenCount: encoder.encode('Content under header 2.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
// Leading YAML front matter is expected as a fragment labelled 'frontmatter' with type 'yaml'.
// Its expected text is the parsed key/value data re-serialised with JSON.stringify(…, null, 2),
// not the raw YAML source.
- test('frontmatter is processed and labeled correctly', async() => {
- const markdown = `---
- title: Test Document
- author: John Doe
- ---
- # Header 1
- Some introductory content.
- `;
- const expected: MarkdownFragment[] = [
- {
- label: 'frontmatter',
- type: 'yaml',
- text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2),
- tokenCount: encoder.encode(JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2)).length,
- },
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Some introductory content.',
- tokenCount: encoder.encode('Some introductory content.').length,
- },
- ];
- const result = await splitMarkdownIntoFragments(markdown, MODEL);
- expect(result).toEqual(expected);
- });
- });
- describe('splitMarkdownIntoChunks', () => {
// Filler: one sentence repeated 100 times, interpolated into fixtures to make sections long.
- const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);
// Shared fixture for the chunking tests below: front matter, headerless filler, and three
// top-level sections with headings nested up to four levels, several padded with `repeatedText`.
// NOTE(review): blank lines and indentation inside this template literal are not visible in
// this view — do not retype it from here; confirm against the repo copy.
- const markdown = `---
- title: Test Document
- author: John Doe
- ---
- ${repeatedText}
- # Header 1
- This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
- This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
- ## Header 1-1
- This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
- ### Header 1-1-1
- This is the first paragraph under header 1-1-1. The content is nested deeper,
- making sure that the chunking algorithm works properly with multiple levels of headers.
- This is another paragraph under header 1-1-1, continuing the content at this deeper level.
- #### Header 1-1-1-1
- Now we have reached the fourth level of headers. The text here should also be properly chunked and grouped with its parent headers.
- This is another paragraph under header 1-1-1-1. It should be grouped with the correct higher-level headers.
- # Header 2
- Here is some content under header 2. This section should also be sufficiently long to ensure that the token count threshold is reached in the test.
- ## Header 2-1
- ${repeatedText}
- ${repeatedText}
- Another sub-header under header 2 with text for testing chunking behavior. This is a fairly lengthy paragraph as well.
- We now have a fourth-level sub-header under header 2-1. This ensures that the chunking logic can handle deeply nested content.
- ### Header 2-1-1
- Here is another paragraph under header 2-1-1. This paragraph is part of a more deeply nested section.
- # Header 3
- Continuing with more headers and content to make sure the markdown document is sufficiently large. This is a new header with more paragraphs under it.
- ### Header 3-1
- This is a sub-header under header 3. The content here continues to grow, ensuring that the markdown is long enough to trigger multiple chunks.
- #### Header 3-1-1
- Here is a fourth-level sub-header under header 3-1. This paragraph is designed to create a larger markdown file for testing purposes.
- `;
// Every chunk produced from the shared fixture must fit the token budget; the assertion
// tolerates up to 10% above maxToken.
test('Each chunk should not exceed the specified token count', async() => {
  const maxToken = 800;
  const chunks = await splitMarkdownIntoChunks(markdown, MODEL, maxToken);

  for (const chunk of chunks) {
    const chunkTokens = encoder.encode(chunk);
    expect(chunkTokens.length).toBeLessThanOrEqual(maxToken * 1.1);
  }
});
// Each chunk must either mention one of the three top-level headings or contain no
// heading marker ('# ') at all.
// NOTE(review): these are plain substring checks, so '# Header 1' is also satisfied by a
// deeper heading such as '## Header 1-1' — confirm that this looseness is intended.
test('Each chunk should include the relevant top-level header', async() => {
  const topLevelHeaders = ['# Header 1', '# Header 2', '# Header 3'];
  const chunks = await splitMarkdownIntoChunks(markdown, MODEL, 800);

  for (const chunk of chunks) {
    const mentionsTopLevelHeader = topLevelHeaders.some(header => chunk.includes(header));
    const hasNoHeadingMarker = !chunk.includes('# ');
    expect(mentionsTopLevelHeader || hasNoHeadingMarker).toBe(true);
  }
});
// A heading built from `repeatedText` is, per the test title, larger than half of the token
// budget; chunking such a document is expected to fail with an error whose message contains
// 'Heading token count is too large'.
test('Should throw an error if a header exceeds half of maxToken size with correct error message', async() => {
  const maxToken = 800;
  const markdownWithLongHeader = `
# Short Header 1
This is the first paragraph under short header 1. It contains some text for testing purposes.
## ${repeatedText}
This is the first paragraph under the long header. It contains text to ensure that the header length check is triggered if the header is too long.
# Short Header 2
Another section with a shorter header, but enough content to ensure proper chunking.
`;

  // Wrapping the call in an async function turns a synchronous throw into a rejection too,
  // so both failure modes reach the `rejects` matcher.
  const attempt = async() => splitMarkdownIntoChunks(markdownWithLongHeader, MODEL, maxToken);

  // `rejects` fails the test when the promise resolves. A bare try/catch would pass
  // silently if no error were thrown, because its only assertion lives in the catch branch.
  await expect(attempt()).rejects.toThrow('Heading token count is too large');
});
// A document expected to fit within maxToken must come back as exactly one chunk that is
// strictly equal (toBe) to the input string, leading and trailing newlines included.
- test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
- const markdownText = `
- # Header 1
- This is a short paragraph under header 1. It contains only a few sentences to ensure that the total token count remains under the maxToken limit.
- `;
- const maxToken = 800;
- const result = await splitMarkdownIntoChunks(markdownText, MODEL, maxToken);
- expect(result).toHaveLength(1);
- expect(result[0]).toBe(markdownText);
- });
// Content placed before the first heading must be chunked within the same token budget as
// headed sections (the assertion tolerates up to 10% above maxToken).
// The title states what is asserted and is unique within the suite, so reporters and
// `-t` name filters can tell this case apart from the single-chunk test.
test('Each chunk should stay within the token limit when content precedes the first heading', async() => {
  const markdownWithContentBeforeHeading = `
This is a short paragraph
# Header 1
${repeatedText}
`;
  const maxToken = 800;

  const result = await splitMarkdownIntoChunks(markdownWithContentBeforeHeading, MODEL, maxToken);

  // Guard against a vacuous pass: looping over an empty result would assert nothing.
  expect(result.length).toBeGreaterThan(0);

  for (const chunk of result) {
    const tokenCount = encoder.encode(chunk).length;
    expect(tokenCount).toBeLessThanOrEqual(maxToken * 1.1);
  }
});
- });
|