1 год назад · 2ac50d56b0
--- a/packages/markdown-token-splitter/test/index.test.js
+++ b/packages/markdown-token-splitter/test/index.test.js
@@ -0,0 +1,159 @@
 
															+import { encodingForModel } from 'js-tiktoken';
														
 
															+import test from 'tape';
														
 
															+
														
 
															+import { splitMarkdownByTokens } from '../src/services/markdown-token-splitter';
														
 
															+
														
 
															+const modelName = 'gpt-3.5-turbo'; // Replace with the appropriate model name if needed
														
 
															+
														
 
															+test('should split markdown into sections under the max token limit', (t) => {
														
 
															+  const markdown = `
														
 
															+# Heading 1
														
 
															+Content under heading 1.
														
 
															+
														
 
															+# Heading 2
														
 
															+Content under heading 2.
														
 
															+
														
 
															+## Subheading 2.1
														
 
															+Content under subheading 2.1.
														
 
															+
														
 
															+# Heading 3
														
 
															+Content under heading 3.
														
 
															+  `;
														
 
															+  const maxTokens = 50;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.ok(sections.length > 1, 'Sections should be more than one');
														
 
															+  sections.forEach((section) => {
														
 
															+    const tokens = encodingForModel(modelName).encode(section);
														
 
															+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
														
 
															+  });
														
 
															+
														
 
															+  t.end();
														
 
															+});
														
 
															+
														
 
															+test('should handle markdown without headings', (t) => {
														
 
															+  const markdown = 'This is a markdown without any headings. It should be returned as a single section unless it exceeds the max token limit.';
														
 
															+  const maxTokens = 100;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.equal(sections.length, 1, 'Sections should be exactly one');
														
 
															+  const tokens = encodingForModel(modelName).encode(sections[0]);
														
 
															+  t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
														
 
															+
														
 
															+  t.end();
														
 
															+});
														
 
															+
														
 
															+test('should split large content under a single heading', (t) => {
														
 
															+  const markdown = `
														
 
															+# Large Heading
														
 
															+${'Long content. '.repeat(500)}
														
 
															+  `;
														
 
															+  const maxTokens = 100;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.ok(sections.length > 1, 'Sections should be more than one');
														
 
															+  sections.forEach((section) => {
														
 
															+    const tokens = encodingForModel(modelName).encode(section);
														
 
															+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
														
 
															+  });
														
 
															+
														
 
															+  t.end();
														
 
															+});
														
 
															+
														
 
															+test('should handle empty markdown input', (t) => {
														
 
															+  const markdown = '';
														
 
															+  const maxTokens = 50;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.equal(sections.length, 0, 'Sections should be zero');
														
 
															+
														
 
															+  t.end();
														
 
															+});
														
 
															+
														
 
															+test('should handle markdown where a single node exceeds maxTokens', (t) => {
														
 
															+  const markdown = `
														
 
															+# Heading with Large Content
														
 
															+${'Very long content. '.repeat(1000)}
														
 
															+  `;
														
 
															+  const maxTokens = 10;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.ok(sections.length > 1, 'Sections should be more than one');
														
 
															+  sections.forEach((section) => {
														
 
															+    const tokens = encodingForModel(modelName).encode(section);
														
 
															+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
														
 
															+  });
														
 
															+
														
 
															+  t.end();
														
 
															+});
														
 
															+
														
 
															+test('should correctly split nested headings', (t) => {
														
 
															+  const markdown = `
														
 
															+# Heading 1
														
 
															+Content under heading 1.
														
 
															+
														
 
															+## Subheading 1.1
														
 
															+Content under subheading 1.1.
														
 
															+
														
 
															+### Sub-subheading 1.1.1
														
 
															+Content under sub-subheading 1.1.1.
														
 
															+
														
 
															+# Heading 2
														
 
															+Content under heading 2.
														
 
															+  `;
														
 
															+  const maxTokens = 30;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.ok(sections.length > 1, 'Sections should be more than one');
														
 
															+  sections.forEach((section) => {
														
 
															+    const tokens = encodingForModel(modelName).encode(section);
														
 
															+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
														
 
															+  });
														
 
															+
														
 
															+  t.end();
														
 
															+});
														
 
															+
														
 
															+test('should not split sections unnecessarily', (t) => {
														
 
															+  const markdown = `
														
 
															+# Short Heading
														
 
															+Short content.
														
 
															+  `;
														
 
															+  const maxTokens = 100;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.equal(sections.length, 1, 'Sections should be exactly one');
														
 
															+  const tokens = encodingForModel(modelName).encode(sections[0]);
														
 
															+  t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
														
 
															+
														
 
															+  t.end();
														
 
															+});
														
 
															+
														
 
															+test('should handle multiple consecutive headings', (t) => {
														
 
															+  const markdown = `
														
 
															+# Heading 1
														
 
															+
														
 
															+# Heading 2
														
 
															+
														
 
															+# Heading 3
														
 
															+
														
 
															+# Heading 4
														
 
															+  `;
														
 
															+  const maxTokens = 10;
														
 
															+
														
 
															+  const sections = splitMarkdownByTokens(markdown, maxTokens, modelName);
														
 
															+
														
 
															+  t.equal(sections.length, 4, 'Sections should be four');
														
 
															+  sections.forEach((section) => {
														
 
															+    const tokens = encodingForModel(modelName).encode(section);
														
 
															+    t.ok(tokens.length <= maxTokens, 'Section tokens should be less than or equal to maxTokens');
														
 
															+  });
														
 
															+
														
 
															+  t.end();
														
 
															+});