| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425 |
- import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
- import type { Chunk } from '../src/services/markdown-splitter';
- import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
- import { splitMarkdownByTokens } from '../src/services/markdown-token-splitter';
- describe('splitMarkdownIntoChunks', () => { // Suite: compares the awaited result of splitMarkdownIntoChunks against hand-written Chunk[] values (label / type / text)
- test('handles empty markdown string', async() => { // An empty input string is expected to produce an empty array
- const markdown = '';
- const expected: Chunk[] = [];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles markdown with only content and no headers', async() => { // Text with no heading is expected under '0-content-N' labels
- const markdown = `This is some content without any headers.
- It spans multiple lines.
- Another paragraph.
- `;
- const expected: Chunk[] = [ // NOTE(review): two paragraph chunks are expected although the input above shows no blank line between them - confirm the literal has not lost whitespace
- {
- label: '0-content-1',
- type: 'paragraph',
- text: 'This is some content without any headers.\nIt spans multiple lines.',
- },
- {
- label: '0-content-2',
- type: 'paragraph',
- text: 'Another paragraph.',
- },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles markdown starting with a header', async() => { // Expected labels mirror the heading nesting: '1', '1-1', then '2'
- const markdown = `
- # Header 1
- Content under header 1.
- ## Header 1.1
- Content under header 1.1.
- # Header 2
- Content under header 2.
- `;
- const expected: Chunk[] = [
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
- { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
- { label: '1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.' },
- { label: '2-heading', type: 'heading', text: '# Header 2' },
- { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles markdown with non-consecutive heading levels', async() => { // A '###' directly under a '#' is expected to be labelled '1-1-1'; the '##' that follows is expected as '1-2'
- const markdown = `
- Introduction without a header.
- # Chapter 1
- Content of chapter 1.
- ### Section 1.1.1
- Content of section 1.1.1.
- ## Section 1.2
- Content of section 1.2.
- # Chapter 2
- Content of chapter 2.
- ## Section 2.1
- Content of section 2.1.
- `;
- const expected: Chunk[] = [
- {
- label: '0-content-1',
- type: 'paragraph',
- text: 'Introduction without a header.',
- },
- {
- label: '1-heading',
- type: 'heading',
- text: '# Chapter 1',
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content of chapter 1.',
- },
- {
- label: '1-1-1-heading',
- type: 'heading',
- text: '### Section 1.1.1',
- },
- {
- label: '1-1-1-content-1',
- type: 'paragraph',
- text: 'Content of section 1.1.1.',
- },
- {
- label: '1-2-heading',
- type: 'heading',
- text: '## Section 1.2',
- },
- {
- label: '1-2-content-1',
- type: 'paragraph',
- text: 'Content of section 1.2.',
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Chapter 2',
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content of chapter 2.',
- },
- {
- label: '2-1-heading',
- type: 'heading',
- text: '## Section 2.1',
- },
- {
- label: '2-1-content-1',
- type: 'paragraph',
- text: 'Content of section 2.1.',
- },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles markdown with skipped heading levels', async() => { // A '####' directly under a '#' is expected to be labelled '1-1-1-1'; the '##' that follows is expected as '1-2'
- const markdown = `
- # Header 1
- Content under header 1.
- #### Header 1.1.1.1
- Content under header 1.1.1.1.
- ## Header 1.2
- Content under header 1.2.
- # Header 2
- Content under header 2.
- `;
- const expected: Chunk[] = [
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
- { label: '1-1-1-1-heading', type: 'heading', text: '#### Header 1.1.1.1' },
- { label: '1-1-1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.1.1.' },
- { label: '1-2-heading', type: 'heading', text: '## Header 1.2' },
- { label: '1-2-content-1', type: 'paragraph', text: 'Content under header 1.2.' },
- { label: '2-heading', type: 'heading', text: '# Header 2' },
- { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles malformed headings', async() => { // NOTE(review): this input is the first four lines of the 'skipped heading levels' case and shows nothing visibly malformed - confirm the intended input
- const markdown = `
- # Header 1
- Content under header 1.
- #### Header 1.1.1.1
- Content under header 1.1.1.1.
- `;
- const expected: Chunk[] = [
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
- { label: '1-1-1-1-heading', type: 'heading', text: '#### Header 1.1.1.1' },
- { label: '1-1-1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.1.1.' },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles multiple content blocks before any headers', async() => { // Text ahead of the first heading is expected under '0-content-N' labels, one chunk per paragraph
- const markdown = `
- This is the first paragraph without a header.
- This is the second paragraph without a header.
- # Header 1
- Content under header 1.
- `;
- const expected: Chunk[] = [ // NOTE(review): two separate paragraph chunks are expected although the input above shows the lines as adjacent - confirm the literal has not lost a blank line
- {
- label: '0-content-1',
- type: 'paragraph',
- text: 'This is the first paragraph without a header.',
- },
- {
- label: '0-content-2',
- type: 'paragraph',
- text: 'This is the second paragraph without a header.',
- },
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles markdown with only headers and no content', async() => { // Headings with no body text are expected to yield heading chunks only
- const markdown = `
- # Header 1
- ## Header 1.1
- ### Header 1.1.1
- `;
- const expected: Chunk[] = [
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
- { label: '1-1-1-heading', type: 'heading', text: '### Header 1.1.1' },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('handles markdown with mixed content and headers', async() => { // Two adjacent lines under '## Header 1.1' are expected as one paragraph chunk joined by '\n'
- const markdown = `
- # Header 1
- Content under header 1.
- ## Header 1.1
- Content under header 1.1.
- Another piece of content.
- # Header 2
- Content under header 2.
- `;
- const expected: Chunk[] = [
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
- { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
- { label: '1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.\nAnother piece of content.' },
- { label: '2-heading', type: 'heading', text: '# Header 2' },
- { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('preserves list indentation and reduces unnecessary line breaks', async() => { // The list items are expected to come back as a single chunk of type 'list'
- const markdown = `
- # Header 1
- Content under header 1.
- - Item 1
- - Subitem 1
- - Item 2
- # Header 2
- Content under header 2.
- `;
- const expected: Chunk[] = [ // NOTE(review): the expected list text indents 'Subitem 1' while the input above shows no indentation - confirm the literal has not lost whitespace
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
- { label: '1-content-2', type: 'list', text: '- Item 1\n - Subitem 1\n- Item 2' },
- { label: '2-heading', type: 'heading', text: '# Header 2' },
- { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
- ];
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
- expect(result).toEqual(expected);
- });
- test('code blocks containing # are not treated as headings', async() => { // The fenced block is expected as one chunk of type 'code', fences included; its '#' line must not start a new heading
- const markdown = `
- # Header 1
- Some introductory content.
- \`\`\`
- # This is a comment with a # symbol
- Some code line
- \`\`\`
- Additional content.
- # Header 2
- Content under header 2.
- `;
- const expected: Chunk[] = [
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-content-1', type: 'paragraph', text: 'Some introductory content.' },
- { label: '1-content-2', type: 'code', text: '```\n# This is a comment with a # symbol\nSome code line\n```' },
- { label: '1-content-3', type: 'paragraph', text: 'Additional content.' },
- { label: '2-heading', type: 'heading', text: '# Header 2' },
- { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
- ];
- const result = await splitMarkdownIntoChunks(markdown);
- expect(result).toEqual(expected);
- });
- test('frontmatter is processed and labeled correctly', async() => { // The leading '---' block is expected as a chunk labelled 'frontmatter' with type 'yaml'
- const markdown = `---
- title: Test Document
- author: John Doe
- ---
- # Header 1
- Some introductory content.
- `;
- const expected: Chunk[] = [
- { label: 'frontmatter', type: 'yaml', text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2) }, // expected text is the frontmatter fields serialised as 2-space-indented JSON
- { label: '1-heading', type: 'heading', text: '# Header 1' },
- { label: '1-content-1', type: 'paragraph', text: 'Some introductory content.' },
- ];
- const result = await splitMarkdownIntoChunks(markdown);
- expect(result).toEqual(expected);
- });
- });
- describe('splitMarkdownByTokens', () => { // Suite: exercises splitMarkdownByTokens(markdown, model, limit) and the tokenCount field on the chunks it returns
- test('preserves list indentation and reduces unnecessary line breaks', async() => { // Short document: every expected chunk carries a tokenCount computed with the model's encoder
- const model: TiktokenModel = 'gpt-4';
- const markdown = `
- # Header 1
- Content under header 1.
- - Item 1
- - Subitem 1
- - Item 2
- # Header 2
- Content under header 2.
- `;
- const encoder = encodingForModel(model); // encoder for the same model name that is passed to the splitter below
- const expected: Chunk[] = [
- {
- label: '1-heading',
- type: 'heading',
- text: '# Header 1',
- tokenCount: encoder.encode('# Header 1').length,
- },
- {
- label: '1-content-1',
- type: 'paragraph',
- text: 'Content under header 1.',
- tokenCount: encoder.encode('Content under header 1.').length,
- },
- {
- label: '1-content-2',
- type: 'list',
- text: '- Item 1\n - Subitem 1\n- Item 2',
- tokenCount: encoder.encode('- Item 1\n - Subitem 1\n- Item 2').length,
- },
- {
- label: '2-heading',
- type: 'heading',
- text: '# Header 2',
- tokenCount: encoder.encode('# Header 2').length,
- },
- {
- label: '2-content-1',
- type: 'paragraph',
- text: 'Content under header 2.',
- tokenCount: encoder.encode('Content under header 2.').length,
- },
- ];
- const result = await splitMarkdownByTokens(markdown, model, 200); // 200 sits in the same argument position as maxTokens in the next test
- // NOTE(review): only the number of chunks is asserted below; the label/type/text/tokenCount values built in `expected` are never compared with `result`
- expect(result.length).toEqual(expected.length);
- });
- test('long text is split into chunks within maxTokens limit', async() => { // Long document: no returned chunk may report more than maxTokens tokens
- const model: TiktokenModel = 'gpt-4';
- const maxTokens = 200;
- const encoder = encodingForModel(model);
- // Build one long paragraph by repeating a single sentence 50 times
- const longParagraph = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. '.repeat(50);
- const markdown = `
- # Header 1
- ${longParagraph}
- ## Header 1.1
- ${longParagraph}
- ### Header 1.1.1
- ${longParagraph}
- # Header 2
- ${longParagraph}
- `;
- const result = await splitMarkdownByTokens(markdown, model, maxTokens);
- // Verify that each chunk's tokenCount is less than or equal to maxTokens
- for (const chunk of result) {
- expect(chunk.tokenCount).toBeLessThanOrEqual(maxTokens);
- }
- // Sanity check: at least one chunk must be returned (otherwise the loops in this test assert nothing)
- expect(result.length).toBeGreaterThan(0);
- // Each reported tokenCount must equal the length of a fresh encoding of the chunk text with this model's encoder
- for (const chunk of result) {
- const calculatedTokenCount = encoder.encode(chunk.text).length;
- expect(chunk.tokenCount).toEqual(calculatedTokenCount);
- }
- });
- });
|