1 year ago · 2ac50d56b0
--- a/packages/markdown-token-splitter/test/index.test.js
+++ b/packages/markdown-token-splitter/test/index.test.js
@@ -0,0 +1,159 @@
 
				+import { encodingForModel } from 'js-tiktoken';
			
 
				+import test from 'tape';
			
 
				+
			
 
				+import { splitMarkdownByTokens } from '../src/services/markdown-token-splitter';
			
 
				+
			
 
				+const modelName = 'gpt-3.5-turbo'; // Model name passed to both splitMarkdownByTokens and encodingForModel in every test; change it here to test against another model
			
 
				+
			
 
				+test('should split markdown into sections under the max token limit', (t) => {
			
 
				+  const markdown = `
			
 
				+# Heading 1
			
 
				+Content under heading 1.
			
 
				+
			
 
				+# Heading 2
			
 
				+Content under heading 2.
			
 
				+
			
 
				+## Subheading 2.1
			
 
				+Content under subheading 2.1.
			
 
				+
			
 
				+# Heading 3
			
 
				+Content under heading 3.
			
 
				+  `;
			
 
				+  const maxTokens = 50;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.ok(sections.length > 1, 'Sections should be more than one');
			
 
				+  sections.forEach((section) => {
			
 
				+    const tokens = encodingForModel(modelName).encode(section);
			
 
				+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
			
 
				+  });
			
 
				+
			
 
				+  t.end();
			
 
				+});
			
 
				+
			
 
				+test('should handle markdown without headings', (t) => {
			
 
				+  const markdown = 'This is a markdown without any headings. It should be returned as a single section unless it exceeds the max token limit.';
			
 
				+  const maxTokens = 100;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.equal(sections.length, 1, 'Sections should be exactly one');
			
 
				+  const tokens = encodingForModel(modelName).encode(sections[0]);
			
 
				+  t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
			
 
				+
			
 
				+  t.end();
			
 
				+});
			
 
				+
			
 
				+test('should split large content under a single heading', (t) => {
			
 
				+  const markdown = `
			
 
				+# Large Heading
			
 
				+${'Long content. '.repeat(500)}
			
 
				+  `;
			
 
				+  const maxTokens = 100;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.ok(sections.length > 1, 'Sections should be more than one');
			
 
				+  sections.forEach((section) => {
			
 
				+    const tokens = encodingForModel(modelName).encode(section);
			
 
				+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
			
 
				+  });
			
 
				+
			
 
				+  t.end();
			
 
				+});
			
 
				+
			
 
				+test('should handle empty markdown input', (t) => {
			
 
				+  const markdown = '';
			
 
				+  const maxTokens = 50;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.equal(sections.length, 0, 'Sections should be zero');
			
 
				+
			
 
				+  t.end();
			
 
				+});
			
 
				+
			
 
				+test('should handle markdown where a single node exceeds maxTokens', (t) => {
			
 
				+  const markdown = `
			
 
				+# Heading with Large Content
			
 
				+${'Very long content. '.repeat(1000)}
			
 
				+  `;
			
 
				+  const maxTokens = 10;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.ok(sections.length > 1, 'Sections should be more than one');
			
 
				+  sections.forEach((section) => {
			
 
				+    const tokens = encodingForModel(modelName).encode(section);
			
 
				+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
			
 
				+  });
			
 
				+
			
 
				+  t.end();
			
 
				+});
			
 
				+
			
 
				+test('should correctly split nested headings', (t) => {
			
 
				+  const markdown = `
			
 
				+# Heading 1
			
 
				+Content under heading 1.
			
 
				+
			
 
				+## Subheading 1.1
			
 
				+Content under subheading 1.1.
			
 
				+
			
 
				+### Sub-subheading 1.1.1
			
 
				+Content under sub-subheading 1.1.1.
			
 
				+
			
 
				+# Heading 2
			
 
				+Content under heading 2.
			
 
				+  `;
			
 
				+  const maxTokens = 30;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.ok(sections.length > 1, 'Sections should be more than one');
			
 
				+  sections.forEach((section) => {
			
 
				+    const tokens = encodingForModel(modelName).encode(section);
			
 
				+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
			
 
				+  });
			
 
				+
			
 
				+  t.end();
			
 
				+});
			
 
				+
			
 
				+test('should not split sections unnecessarily', (t) => {
			
 
				+  const markdown = `
			
 
				+# Short Heading
			
 
				+Short content.
			
 
				+  `;
			
 
				+  const maxTokens = 100;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.equal(sections.length, 1, 'Sections should be exactly one');
			
 
				+  const tokens = encodingForModel(modelName).encode(sections[0]);
			
 
				+  t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
			
 
				+
			
 
				+  t.end();
			
 
				+});
			
 
				+
			
 
				+test('should handle multiple consecutive headings', (t) => {
			
 
				+  const markdown = `
			
 
				+# Heading 1
			
 
				+
			
 
				+# Heading 2
			
 
				+
			
 
				+# Heading 3
			
 
				+
			
 
				+# Heading 4
			
 
				+  `;
			
 
				+  const maxTokens = 10;
			
 
				+
			
 
				+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
			
 
				+
			
 
				+  t.equal(sections.length, 4, 'Sections should be four');
			
 
				+  sections.forEach((section) => {
			
 
				+    const tokens = encodingForModel(modelName).encode(section);
			
 
				+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
			
 
				+  });
			
 
				+
			
 
				+  t.end();
			
 
				+});