nHigashiWeseek 1 год назад
Родитель
Commit
70a93b81f5
1 изменённый файл: 231 добавлено и 93 удалено
  1. 231 93
      packages/markdown-splitter/test/index.spec.ts

+ 231 - 93
packages/markdown-splitter/test/index.spec.ts

@@ -1,117 +1,255 @@
-import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
+// Tests for splitMarkdownIntoChunks (packages/markdown-splitter/test/index.spec.ts)
+
+import type { Chunk } from '../src/services/markdown-splitter';
+import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter'; // adjust this path to match the project structure
 
 describe('splitMarkdownIntoChunks', () => {
 
-  it('should split markdown into sections using the specified chunk size', async() => {
-    const markdown = `
-# Heading 1
-This is some content under heading 1.
-
-# Heading 2
-This is some content under heading 2.
-
-# Heading 3
-This is some content under heading 3.
-
-# Heading 4
-This is some content under heading 4.
-`;
-    const chunkSize = 60;
-    const result = await splitMarkdownIntoChunks(markdown, chunkSize);
-
-    // Expect the result to have more than one section due to chunkSize limitations
-    expect(result.length).toBeGreaterThan(1);
-    for (const section of result) {
-      expect(section.pageContent.length).toBeLessThanOrEqual(chunkSize);
-    }
+  test('handles empty markdown string', () => {
+    const markdown = '';
+    const expected: Chunk[] = [];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
   });
 
-  it('should handle markdown without headers', async() => {
-    const markdown = `
-This is some content without any headers. It should not be split unless it exceeds the chunk size.
-`;
-    const chunkSize = 100;
-    const result = await splitMarkdownIntoChunks(markdown, chunkSize);
-
-    // Since the content is short, expect no splits
-    expect(result.length).toBe(1);
-    expect(result[0].pageContent.length).toBeLessThanOrEqual(chunkSize);
+  test('handles markdown with only content and no headers', () => {
+    const markdown = `This is some content without any headers.
+It spans multiple lines.
+
+Another paragraph.
+    `;
+    const expected: Chunk[] = [
+      {
+        label: '0-content',
+        content: 'This is some content without any headers.\nIt spans multiple lines.\n\nAnother paragraph.',
+      },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
   });
 
-  it('should split large content under a single heading', async() => {
+  test('handles markdown starting with a header', () => {
     const markdown = `
-# Large Heading
-${'This is some repetitive content. '.repeat(50)}
-`;
-    const chunkSize = 100;
-    const result = await splitMarkdownIntoChunks(markdown, chunkSize);
-
-    expect(result.length).toBeGreaterThan(1);
-    for (const section of result) {
-      expect(section.pageContent.length).toBeLessThanOrEqual(chunkSize);
-    }
+# Header 1
+Content under header 1.
+
+## Header 1.1
+Content under header 1.1.
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1', content: '# Header 1' },
+      { label: '1-content', content: 'Content under header 1.' },
+      { label: '1-1', content: '## Header 1.1' },
+      { label: '1-1-content', content: 'Content under header 1.1.' },
+      { label: '2', content: '# Header 2' },
+      { label: '2-content', content: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
   });
 
-  it('should handle empty markdown input', async() => {
-    const markdown = '';
-    const chunkSize = 10;
-    const result = await splitMarkdownIntoChunks(markdown, chunkSize);
-
-    // Expect an empty result for empty markdown input
-    expect(result.length).toBe(0);
+  test('handles markdown with non-consecutive heading levels', () => {
+    const markdown = `
+Introduction without a header.
+
+# Chapter 1
+Content of chapter 1.
+
+### Section 1.1.1
+Content of section 1.1.1.
+
+## Section 1.2
+Content of section 1.2.
+
+# Chapter 2
+Content of chapter 2.
+
+## Section 2.1
+Content of section 2.1.
+    `;
+    const expected: Chunk[] = [
+      {
+        label: '0-content',
+        content: 'Introduction without a header.',
+      },
+      {
+        label: '1',
+        content: '# Chapter 1',
+      },
+      {
+        label: '1-content',
+        content: 'Content of chapter 1.',
+      },
+      {
+        label: '1-1-1',
+        content: '### Section 1.1.1',
+      },
+      {
+        label: '1-1-1-content',
+        content: 'Content of section 1.1.1.',
+      },
+      {
+        label: '1-2',
+        content: '## Section 1.2',
+      },
+      {
+        label: '1-2-content',
+        content: 'Content of section 1.2.',
+      },
+      {
+        label: '2',
+        content: '# Chapter 2',
+      },
+      {
+        label: '2-content',
+        content: 'Content of chapter 2.',
+      },
+      {
+        label: '2-1',
+        content: '## Section 2.1',
+      },
+      {
+        label: '2-1-content',
+        content: 'Content of section 2.1.',
+      },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
   });
 
-  it('should correctly split nested headings', async() => {
+  test('handles markdown with skipped heading levels', () => {
     const markdown = `
-# Heading 1
-Content under heading 1.
-
-## Subheading 1.1
-Content under subheading 1.1.
-
-# Heading 2
-Content under heading 2.
-`;
-    const chunkSize = 50;
-    const result = await splitMarkdownIntoChunks(markdown, chunkSize);
-
-    // Expect multiple sections
-    expect(result.length).toBeGreaterThan(1);
-    for (const section of result) {
-      expect(section.pageContent.length).toBeLessThanOrEqual(chunkSize);
-    }
+# Header 1
+Content under header 1.
+
+#### Header 1.1.1.1
+Content under header 1.1.1.1.
+
+## Header 1.2
+Content under header 1.2.
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1', content: '# Header 1' },
+      { label: '1-content', content: 'Content under header 1.' },
+      { label: '1-1-1-1', content: '#### Header 1.1.1.1' },
+      { label: '1-1-1-1-content', content: 'Content under header 1.1.1.1.' },
+      { label: '1-2', content: '## Header 1.2' },
+      { label: '1-2-content', content: 'Content under header 1.2.' },
+      { label: '2', content: '# Header 2' },
+      { label: '2-content', content: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
   });
 
-  it('should not split if content fits within chunk size', async() => {
+  test('handles malformed headings', () => {
     const markdown = `
-# Heading
-Short content.
-`;
-    const chunkSize = 100;
-    const result = await splitMarkdownIntoChunks(markdown, chunkSize);
-
-    // Expect the result to be a single section since the content is small
-    expect(result.length).toBe(1);
-    expect(result[0].pageContent.length).toBeLessThanOrEqual(chunkSize);
+# Header 1
+Content under header 1.
+
+#### Header 1.1.1.1
+Content under header 1.1.1.1.
+    `;
+    const expected: Chunk[] = [
+      { label: '1', content: '# Header 1' },
+      { label: '1-content', content: 'Content under header 1.' },
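+      // NOTE(review): the markdown above contains no malformed heading (e.g. a bare '### ' line); as written, this case only exercises a jump from level 1 to level 4 — TODO add a genuinely malformed heading or rename the test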
+      { label: '1-1-1-1', content: '#### Header 1.1.1.1' },
+      { label: '1-1-1-1-content', content: 'Content under header 1.1.1.1.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
   });
 
-  it('should handle multiple consecutive headers', async() => {
+  test('handles multiple content blocks before any headers', () => {
     const markdown = `
-# Heading 1
-
-# Heading 2
+This is the first paragraph without a header.
+
+This is the second paragraph without a header.
+
+# Header 1
+Content under header 1.
+    `;
+    const expected: Chunk[] = [
+      {
+        label: '0-content',
+        content: 'This is the first paragraph without a header.\n\nThis is the second paragraph without a header.',
+      },
+      { label: '1', content: '# Header 1' },
+      { label: '1-content', content: 'Content under header 1.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
 
-# Heading 3
+  test('handles markdown with only headers and no content', () => {
+    const markdown = `
+# Header 1
+
+## Header 1.1
+
+### Header 1.1.1
+    `;
+    const expected: Chunk[] = [
+      { label: '1', content: '# Header 1' },
+      { label: '1-1', content: '## Header 1.1' },
+      { label: '1-1-1', content: '### Header 1.1.1' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
 
-# Heading 4
-`;
-    const chunkSize = 50;
-    const result = await splitMarkdownIntoChunks(markdown, chunkSize);
+  test('handles markdown with mixed content and headers', () => {
+    const markdown = `
+# Header 1
+Content under header 1.
+
+## Header 1.1
+Content under header 1.1.
+Another piece of content.
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1', content: '# Header 1' },
+      { label: '1-content', content: 'Content under header 1.' },
+      { label: '1-1', content: '## Header 1.1' },
+      { label: '1-1-content', content: 'Content under header 1.1.\nAnother piece of content.' },
+      { label: '2', content: '# Header 2' },
+      { label: '2-content', content: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
 
-    // Expect each heading to be treated as a separate section
-    expect(result.length).toBeGreaterThan(1);
-    for (const section of result) {
-      expect(section.pageContent.length).toBeLessThanOrEqual(chunkSize);
-    }
+  test('preserves list indentation and reduces unnecessary line breaks', () => {
+    const markdown = `
+# Header 1
+Content under header 1.
+
+- Item 1
+  - Subitem 1
+- Item 2
+
+
+# Header 2
+Content under header 2.
+    `;
+    const expected: Chunk[] = [
+      { label: '1', content: '# Header 1' },
+      { label: '1-content', content: 'Content under header 1.\n\n- Item 1\n  - Subitem 1\n- Item 2' },
+      { label: '2', content: '# Header 2' },
+      { label: '2-content', content: 'Content under header 2.' },
+    ];
+    const result = splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
   });
+
 });