@@ -1,12 +1,17 @@
-import type { Chunk } from '../src/services/markdown-splitter';
-import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
+import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
 
-describe('splitMarkdownIntoChunks', () => {
+import type { MarkdownFragment } from '~/index';
+import { splitMarkdownIntoChunks, splitMarkdownIntoFragments } from '~/index';
+
+const MODEL: TiktokenModel = 'gpt-4';
+const encoder = encodingForModel(MODEL);
+
+describe('splitMarkdownIntoFragments', () => {
   test('handles empty markdown string', async() => {
     const markdown = '';
-    const expected: Chunk[] = [];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+    const expected: MarkdownFragment[] = [];
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
@@ -16,13 +21,23 @@ It spans multiple lines.
 
 Another paragraph.
 `;
-    const expected: Chunk[] = [
+
+    const expected: MarkdownFragment[] = [
+      {
+        label: '0-content-1',
+        type: 'paragraph',
+        text: 'This is some content without any headers.\nIt spans multiple lines.',
+        tokenCount: encoder.encode('This is some content without any headers.\nIt spans multiple lines.').length,
+      },
       {
-        label: '0-content',
-        text: 'This is some content without any headers.\nIt spans multiple lines.\n\nAnother paragraph.',
+        label: '0-content-2',
+        type: 'paragraph',
+        text: 'Another paragraph.',
+        tokenCount: encoder.encode('Another paragraph.').length,
       },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
@@ -37,15 +52,47 @@ Content under header 1.1.
 
 # Header 2
 
 Content under header 2.
 `;
-    const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-heading', text: '## Header 1.1' },
-      { label: '1-1-content', text: 'Content under header 1.1.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+
+    const expected: MarkdownFragment[] = [
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.',
+        tokenCount: encoder.encode('Content under header 1.').length,
+      },
+      {
+        label: '1-1-heading',
+        type: 'heading',
+        text: '## Header 1.1',
+        tokenCount: encoder.encode('## Header 1.1').length,
+      },
+      {
+        label: '1-1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.1.',
+        tokenCount: encoder.encode('Content under header 1.1.').length,
+      },
+      {
+        label: '2-heading',
+        type: 'heading',
+        text: '# Header 2',
+        tokenCount: encoder.encode('# Header 2').length,
+      },
+      {
+        label: '2-content-1',
+        type: 'paragraph',
+        text: 'Content under header 2.',
+        tokenCount: encoder.encode('Content under header 2.').length,
+      },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
@@ -68,53 +115,77 @@ Content of chapter 2.
 
 ## Section 2.1
 
 Content of section 2.1.
 `;
-    const expected: Chunk[] = [
+
+    const expected: MarkdownFragment[] = [
       {
-        label: '0-content',
+        label: '0-content-1',
+        type: 'paragraph',
         text: 'Introduction without a header.',
+        tokenCount: encoder.encode('Introduction without a header.').length,
       },
       {
         label: '1-heading',
+        type: 'heading',
         text: '# Chapter 1',
+        tokenCount: encoder.encode('# Chapter 1').length,
       },
       {
-        label: '1-content',
+        label: '1-content-1',
+        type: 'paragraph',
         text: 'Content of chapter 1.',
+        tokenCount: encoder.encode('Content of chapter 1.').length,
       },
       {
         label: '1-1-1-heading',
+        type: 'heading',
         text: '### Section 1.1.1',
+        tokenCount: encoder.encode('### Section 1.1.1').length,
       },
       {
-        label: '1-1-1-content',
+        label: '1-1-1-content-1',
+        type: 'paragraph',
         text: 'Content of section 1.1.1.',
+        tokenCount: encoder.encode('Content of section 1.1.1.').length,
       },
       {
         label: '1-2-heading',
+        type: 'heading',
         text: '## Section 1.2',
+        tokenCount: encoder.encode('## Section 1.2').length,
       },
       {
-        label: '1-2-content',
+        label: '1-2-content-1',
+        type: 'paragraph',
         text: 'Content of section 1.2.',
+        tokenCount: encoder.encode('Content of section 1.2.').length,
       },
       {
         label: '2-heading',
+        type: 'heading',
         text: '# Chapter 2',
+        tokenCount: encoder.encode('# Chapter 2').length,
       },
       {
-        label: '2-content',
+        label: '2-content-1',
+        type: 'paragraph',
         text: 'Content of chapter 2.',
+        tokenCount: encoder.encode('Content of chapter 2.').length,
       },
       {
         label: '2-1-heading',
+        type: 'heading',
         text: '## Section 2.1',
+        tokenCount: encoder.encode('## Section 2.1').length,
       },
       {
-        label: '2-1-content',
+        label: '2-1-content-1',
+        type: 'paragraph',
         text: 'Content of section 2.1.',
+        tokenCount: encoder.encode('Content of section 2.1.').length,
       },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
@@ -132,17 +203,59 @@ Content under header 1.2.
 
 # Header 2
 
 Content under header 2.
 `;
-    const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
-      { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
-      { label: '1-2-heading', text: '## Header 1.2' },
-      { label: '1-2-content', text: 'Content under header 1.2.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+
+    const expected: MarkdownFragment[] = [
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.',
+        tokenCount: encoder.encode('Content under header 1.').length,
+      },
+      {
+        label: '1-1-1-1-heading',
+        type: 'heading',
+        text: '#### Header 1.1.1.1',
+        tokenCount: encoder.encode('#### Header 1.1.1.1').length,
+      },
+      {
+        label: '1-1-1-1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.1.1.1.',
+        tokenCount: encoder.encode('Content under header 1.1.1.1.').length,
+      },
+      {
+        label: '1-2-heading',
+        type: 'heading',
+        text: '## Header 1.2',
+        tokenCount: encoder.encode('## Header 1.2').length,
+      },
+      {
+        label: '1-2-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.2.',
+        tokenCount: encoder.encode('Content under header 1.2.').length,
+      },
+      {
+        label: '2-heading',
+        type: 'heading',
+        text: '# Header 2',
+        tokenCount: encoder.encode('# Header 2').length,
+      },
+      {
+        label: '2-content-1',
+        type: 'paragraph',
+        text: 'Content under header 2.',
+        tokenCount: encoder.encode('Content under header 2.').length,
+      },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
@@ -154,13 +267,35 @@ Content under header 1.
 
 #### Header 1.1.1.1
 
 Content under header 1.1.1.1.
 `;
-    const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
-      { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
+
+    const expected: MarkdownFragment[] = [
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.',
+        tokenCount: encoder.encode('Content under header 1.').length,
+      },
+      {
+        label: '1-1-1-1-heading',
+        type: 'heading',
+        text: '#### Header 1.1.1.1',
+        tokenCount: encoder.encode('#### Header 1.1.1.1').length,
+      },
+      {
+        label: '1-1-1-1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.1.1.1.',
+        tokenCount: encoder.encode('Content under header 1.1.1.1.').length,
+      },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
@@ -173,15 +308,35 @@ This is the second paragraph without a header.
 
 # Header 1
 
 Content under header 1.
 `;
-    const expected: Chunk[] = [
+
+    const expected: MarkdownFragment[] = [
       {
-        label: '0-content',
-        text: 'This is the first paragraph without a header.\n\nThis is the second paragraph without a header.',
+        label: '0-content-1',
+        type: 'paragraph',
+        text: 'This is the first paragraph without a header.',
+        tokenCount: encoder.encode('This is the first paragraph without a header.').length,
+      },
+      {
+        label: '0-content-2',
+        type: 'paragraph',
+        text: 'This is the second paragraph without a header.',
+        tokenCount: encoder.encode('This is the second paragraph without a header.').length,
+      },
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.',
+        tokenCount: encoder.encode('Content under header 1.').length,
       },
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
|
|
|
|
@@ -193,12 +348,29 @@ Content under header 1.
|
|
|
|
|
|
|
|
### Header 1.1.1
|
|
### Header 1.1.1
|
|
|
`;
|
|
`;
|
|
|
- const expected: Chunk[] = [
|
|
|
|
|
- { label: '1-heading', text: '# Header 1' },
|
|
|
|
|
- { label: '1-1-heading', text: '## Header 1.1' },
|
|
|
|
|
- { label: '1-1-1-heading', text: '### Header 1.1.1' },
|
|
|
|
|
|
|
+
|
|
|
|
|
+ const expected: MarkdownFragment[] = [
|
|
|
|
|
+ {
|
|
|
|
|
+ label: '1-heading',
|
|
|
|
|
+ type: 'heading',
|
|
|
|
|
+ text: '# Header 1',
|
|
|
|
|
+ tokenCount: encoder.encode('# Header 1').length,
|
|
|
|
|
+ },
|
|
|
|
|
+ {
|
|
|
|
|
+ label: '1-1-heading',
|
|
|
|
|
+ type: 'heading',
|
|
|
|
|
+ text: '## Header 1.1',
|
|
|
|
|
+ tokenCount: encoder.encode('## Header 1.1').length,
|
|
|
|
|
+ },
|
|
|
|
|
+ {
|
|
|
|
|
+ label: '1-1-1-heading',
|
|
|
|
|
+ type: 'heading',
|
|
|
|
|
+ text: '### Header 1.1.1',
|
|
|
|
|
+ tokenCount: encoder.encode('### Header 1.1.1').length,
|
|
|
|
|
+ },
|
|
|
];
|
|
];
|
|
|
- const result = await splitMarkdownIntoChunks(markdown); // Await the result
|
|
|
|
|
|
|
+
|
|
|
|
|
+ const result = await splitMarkdownIntoFragments(markdown, MODEL);
|
|
|
expect(result).toEqual(expected);
|
|
expect(result).toEqual(expected);
|
|
|
});
|
|
});
|
|
|
|
|
|
|
@@ -214,15 +386,47 @@ Another piece of content.
 
 # Header 2
 
 Content under header 2.
 `;
-    const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-heading', text: '## Header 1.1' },
-      { label: '1-1-content', text: 'Content under header 1.1.\nAnother piece of content.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+
+    const expected: MarkdownFragment[] = [
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.',
+        tokenCount: encoder.encode('Content under header 1.').length,
+      },
+      {
+        label: '1-1-heading',
+        type: 'heading',
+        text: '## Header 1.1',
+        tokenCount: encoder.encode('## Header 1.1').length,
+      },
+      {
+        label: '1-1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.1.\nAnother piece of content.',
+        tokenCount: encoder.encode('Content under header 1.1.\nAnother piece of content.').length,
+      },
+      {
+        label: '2-heading',
+        type: 'heading',
+        text: '# Header 2',
+        tokenCount: encoder.encode('# Header 2').length,
+      },
+      {
+        label: '2-content-1',
+        type: 'paragraph',
+        text: 'Content under header 2.',
+        tokenCount: encoder.encode('Content under header 2.').length,
+      },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
    expect(result).toEqual(expected);
   });
@@ -239,15 +443,44 @@ Content under header 1.
 
 # Header 2
 
 Content under header 2.
 `;
-    const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.\n\n- Item 1\n - Subitem 1\n- Item 2' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+
+    const expected: MarkdownFragment[] = [
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.',
+        tokenCount: encoder.encode('Content under header 1.').length,
+      },
+      {
+        label: '1-content-2',
+        type: 'list',
+        text: '- Item 1\n - Subitem 1\n- Item 2',
+        tokenCount: encoder.encode('- Item 1\n - Subitem 1\n- Item 2').length,
+      },
+      {
+        label: '2-heading',
+        type: 'heading',
+        text: '# Header 2',
+        tokenCount: encoder.encode('# Header 2').length,
+      },
+      {
+        label: '2-content-1',
+        type: 'paragraph',
+        text: 'Content under header 2.',
+        tokenCount: encoder.encode('Content under header 2.').length,
+      },
     ];
-    const result = await splitMarkdownIntoChunks(markdown); // Await the result
+
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
+
   test('code blocks containing # are not treated as headings', async() => {
     const markdown = `
 # Header 1
@@ -261,16 +494,49 @@ Additional content.
 
 Content under header 2.
 `;
-    const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Some introductory content.\n\n```\n# This is a comment with a # symbol\nSome code line\n```\n\nAdditional content.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+    const expected: MarkdownFragment[] = [
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Some introductory content.',
+        tokenCount: encoder.encode('Some introductory content.').length,
+      },
+      {
+        label: '1-content-2',
+        type: 'code',
+        text: '```\n# This is a comment with a # symbol\nSome code line\n```',
+        tokenCount: encoder.encode('```\n# This is a comment with a # symbol\nSome code line\n```').length,
+      },
+      {
+        label: '1-content-3',
+        type: 'paragraph',
+        text: 'Additional content.',
+        tokenCount: encoder.encode('Additional content.').length,
+      },
+      {
+        label: '2-heading',
+        type: 'heading',
+        text: '# Header 2',
+        tokenCount: encoder.encode('# Header 2').length,
+      },
+      {
+        label: '2-content-1',
+        type: 'paragraph',
+        text: 'Content under header 2.',
+        tokenCount: encoder.encode('Content under header 2.').length,
+      },
     ];
-    const result = await splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
+
   test('frontmatter is processed and labeled correctly', async() => {
     const markdown = `---
 title: Test Document
@@ -281,13 +547,155 @@ author: John Doe
 
 Some introductory content.
 `;
-    const expected: Chunk[] = [
-      { label: 'frontmatter', text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2) },
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Some introductory content.' },
+    const expected: MarkdownFragment[] = [
+      {
+        label: 'frontmatter',
+        type: 'yaml',
+        text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2),
+        tokenCount: encoder.encode(JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2)).length,
+      },
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+        tokenCount: encoder.encode('# Header 1').length,
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Some introductory content.',
+        tokenCount: encoder.encode('Some introductory content.').length,
+      },
     ];
-    const result = await splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoFragments(markdown, MODEL);
     expect(result).toEqual(expected);
   });
 });
+
+describe('splitMarkdownIntoChunks', () => {
+  const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);
+  const markdown = `---
+title: Test Document
+author: John Doe
+---
+
+${repeatedText}
+
+# Header 1
+
+This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
+This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
+
+## Header 1-1
+
+This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
+
+
+### Header 1-1-1
+
+This is the first paragraph under header 1-1-1. The content is nested deeper,
+making sure that the chunking algorithm works properly with multiple levels of headers.
+
+This is another paragraph under header 1-1-1, continuing the content at this deeper level.
+
+#### Header 1-1-1-1
+
+Now we have reached the fourth level of headers. The text here should also be properly chunked and grouped with its parent headers.
+
+This is another paragraph under header 1-1-1-1. It should be grouped with the correct higher-level headers.
+
+# Header 2
+
+Here is some content under header 2. This section should also be sufficiently long to ensure that the token count threshold is reached in the test.
+
+## Header 2-1
+
+${repeatedText}
+
+${repeatedText}
+
+Another sub-header under header 2 with text for testing chunking behavior. This is a fairly lengthy paragraph as well.
+
+We now have a fourth-level sub-header under header 2-1. This ensures that the chunking logic can handle deeply nested content.
+
+### Header 2-1-1
+
+Here is another paragraph under header 2-1-1. This paragraph is part of a more deeply nested section.
+
+# Header 3
+
+Continuing with more headers and content to make sure the markdown document is sufficiently large. This is a new header with more paragraphs under it.
+
+### Header 3-1
+
+This is a sub-header under header 3. The content here continues to grow, ensuring that the markdown is long enough to trigger multiple chunks.
+
+#### Header 3-1-1
+
+Here is a fourth-level sub-header under header 3-1. This paragraph is designed to create a larger markdown file for testing purposes.
+`;
+  test('Each chunk should not exceed the specified token count', async() => {
+    const maxToken = 800;
+    const result = await splitMarkdownIntoChunks(markdown, MODEL, maxToken);
+
+    result.forEach((chunk) => {
+      const tokenCount = encoder.encode(chunk).length;
+      expect(tokenCount).toBeLessThanOrEqual(maxToken * 1.1);
+    });
+  });
+  test('Each chunk should include the relevant top-level header', async() => {
+    const result = await splitMarkdownIntoChunks(markdown, MODEL, 800);
+
+    result.forEach((chunk) => {
+      const containsHeader1 = chunk.includes('# Header 1');
+      const containsHeader2 = chunk.includes('# Header 2');
+      const containsHeader3 = chunk.includes('# Header 3');
+      const doesNotContainHash = !chunk.includes('# ');
+
+      expect(containsHeader1 || containsHeader2 || containsHeader3 || doesNotContainHash).toBe(true);
+    });
+  });
+  test('Should throw an error if a header exceeds half of maxToken size with correct error message', async() => {
+    const maxToken = 800;
+    const markdownWithLongHeader = `
+# Short Header 1
+
+This is the first paragraph under short header 1. It contains some text for testing purposes.
+
+## ${repeatedText}
+
+This is the first paragraph under the long header. It contains text to ensure that the header length check is triggered if the header is too long.
+
+# Short Header 2
+
+Another section with a shorter header, but enough content to ensure proper chunking.
+`;
+
+    // Assert on the rejection directly so the test fails when nothing throws;
+    // toThrow(string) matches the message substring, as toContain did before.
+    await expect(splitMarkdownIntoChunks(markdownWithLongHeader, MODEL, maxToken))
+      .rejects
+      .toThrow('Heading token count is too large');
+  });
+
+  test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
+    const markdownText = `
+    # Header 1
+    This is a short paragraph under header 1. It contains only a few sentences to ensure that the total token count remains under the maxToken limit.
+    `;
+
+    const maxToken = 800;
+
+    const result = await splitMarkdownIntoChunks(markdownText, MODEL, maxToken);
+
+    expect(result).toHaveLength(1);
+    expect(result[0]).toBe(markdownText);
+  });
+});
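
Reviewer note: for orientation, here are the shapes these tests exercise, inferred only from the expectations above. This is a sketch; the real definitions live in '~/index' and are not part of this diff.

import type { TiktokenModel } from 'js-tiktoken';

// One fragment per markdown block, labeled by its position in the header
// hierarchy, e.g. '2-heading', '1-2-content-1', or 'frontmatter'.
interface MarkdownFragment {
  label: string;
  type: 'heading' | 'paragraph' | 'list' | 'code' | 'yaml';
  text: string;
  tokenCount: number; // encoder.encode(text).length for the chosen model
}

declare function splitMarkdownIntoFragments(
  markdown: string,
  model: TiktokenModel,
): Promise<MarkdownFragment[]>;

// Groups content into strings of roughly maxToken tokens (the tests tolerate
// a 10% overshoot), repeats the governing top-level header in each chunk, and
// returns the input unchanged as a single chunk when it already fits.
declare function splitMarkdownIntoChunks(
  markdown: string,
  model: TiktokenModel,
  maxToken: number,
): Promise<string[]>;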