// wiki / weseek__growi
// mirror of https://github.com/weseek/growi

							import { encodingForModel, type TiktokenModel } from 'js-tiktoken';

import type { MarkdownFragment } from '../src/services/markdown-splitter';
import { splitMarkdownIntoFragments } from '../src/services/markdown-splitter';

// Tokenizer model name; it is passed to splitMarkdownIntoFragments in every test case below.
const MODEL: TiktokenModel = 'gpt-4';
// Encoder built for MODEL, used to compute the expected tokenCount of each fragment.
const encoder = encodingForModel(MODEL);

describe('splitMarkdownIntoFragments', () => {

  /**
   * Builds one expected fragment.
   *
   * The token count is derived from the very same `text` through the module-level
   * encoder, so the expected text and the tokenized text can never drift apart.
   *
   * @param label expected fragment label (e.g. `1-2-content-1`, `1-heading`, `frontmatter`)
   * @param type expected fragment type (e.g. `heading`, `paragraph`, `list`, `code`, `yaml`)
   * @param text expected fragment text
   * @returns the fragment object to compare against the splitter output
   */
  const fragment = (label: string, type: MarkdownFragment['type'], text: string): MarkdownFragment => ({
    label,
    type,
    text,
    tokenCount: encoder.encode(text).length,
  });

  test('handles empty markdown string', async() => {
    // An empty document is expected to produce no fragments at all.
    const expected: MarkdownFragment[] = [];

    expect(await splitMarkdownIntoFragments('', MODEL)).toEqual(expected);
  });

  test('handles markdown with only content and no headers', async() => {
    const input = `This is some content without any headers.
It spans multiple lines.

Another paragraph.
    `;

    // Content that is not under any heading is expected to carry the `0-` label prefix.
    const expected: MarkdownFragment[] = [
      fragment('0-content-1', 'paragraph', 'This is some content without any headers.\nIt spans multiple lines.'),
      fragment('0-content-2', 'paragraph', 'Another paragraph.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('handles markdown starting with a header', async() => {
    const input = `
# Header 1
Content under header 1.

## Header 1.1
Content under header 1.1.

# Header 2
Content under header 2.
    `;

    const expected: MarkdownFragment[] = [
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Content under header 1.'),
      fragment('1-1-heading', 'heading', '## Header 1.1'),
      fragment('1-1-content-1', 'paragraph', 'Content under header 1.1.'),
      fragment('2-heading', 'heading', '# Header 2'),
      fragment('2-content-1', 'paragraph', 'Content under header 2.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('handles markdown with non-consecutive heading levels', async() => {
    const input = `
Introduction without a header.

# Chapter 1
Content of chapter 1.

### Section 1.1.1
Content of section 1.1.1.

## Section 1.2
Content of section 1.2.

# Chapter 2
Content of chapter 2.

## Section 2.1
Content of section 2.1.
    `;

    // The h3 directly under an h1 is expected to be labelled as if the missing h2 were numbered 1.
    const expected: MarkdownFragment[] = [
      fragment('0-content-1', 'paragraph', 'Introduction without a header.'),
      fragment('1-heading', 'heading', '# Chapter 1'),
      fragment('1-content-1', 'paragraph', 'Content of chapter 1.'),
      fragment('1-1-1-heading', 'heading', '### Section 1.1.1'),
      fragment('1-1-1-content-1', 'paragraph', 'Content of section 1.1.1.'),
      fragment('1-2-heading', 'heading', '## Section 1.2'),
      fragment('1-2-content-1', 'paragraph', 'Content of section 1.2.'),
      fragment('2-heading', 'heading', '# Chapter 2'),
      fragment('2-content-1', 'paragraph', 'Content of chapter 2.'),
      fragment('2-1-heading', 'heading', '## Section 2.1'),
      fragment('2-1-content-1', 'paragraph', 'Content of section 2.1.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('handles markdown with skipped heading levels', async() => {
    const input = `
# Header 1
Content under header 1.

#### Header 1.1.1.1
Content under header 1.1.1.1.

## Header 1.2
Content under header 1.2.

# Header 2
Content under header 2.
    `;

    const expected: MarkdownFragment[] = [
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Content under header 1.'),
      fragment('1-1-1-1-heading', 'heading', '#### Header 1.1.1.1'),
      fragment('1-1-1-1-content-1', 'paragraph', 'Content under header 1.1.1.1.'),
      fragment('1-2-heading', 'heading', '## Header 1.2'),
      fragment('1-2-content-1', 'paragraph', 'Content under header 1.2.'),
      fragment('2-heading', 'heading', '# Header 2'),
      fragment('2-content-1', 'paragraph', 'Content under header 2.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('handles malformed headings', async() => {
    // The document ends right after an h4 that directly follows an h1.
    const input = `
# Header 1
Content under header 1.

#### Header 1.1.1.1
Content under header 1.1.1.1.
    `;

    const expected: MarkdownFragment[] = [
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Content under header 1.'),
      fragment('1-1-1-1-heading', 'heading', '#### Header 1.1.1.1'),
      fragment('1-1-1-1-content-1', 'paragraph', 'Content under header 1.1.1.1.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('handles multiple content blocks before any headers', async() => {
    const input = `
This is the first paragraph without a header.

This is the second paragraph without a header.

# Header 1
Content under header 1.
    `;

    const expected: MarkdownFragment[] = [
      fragment('0-content-1', 'paragraph', 'This is the first paragraph without a header.'),
      fragment('0-content-2', 'paragraph', 'This is the second paragraph without a header.'),
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Content under header 1.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('handles markdown with only headers and no content', async() => {
    const input = `
# Header 1

## Header 1.1

### Header 1.1.1
    `;

    // Only heading fragments are expected; no content fragments are listed.
    const expected: MarkdownFragment[] = [
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-1-heading', 'heading', '## Header 1.1'),
      fragment('1-1-1-heading', 'heading', '### Header 1.1.1'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('handles markdown with mixed content and headers', async() => {
    const input = `
# Header 1
Content under header 1.

## Header 1.1
Content under header 1.1.
Another piece of content.

# Header 2
Content under header 2.
    `;

    // Two adjacent lines without a blank line between them are expected to stay in one paragraph.
    const expected: MarkdownFragment[] = [
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Content under header 1.'),
      fragment('1-1-heading', 'heading', '## Header 1.1'),
      fragment('1-1-content-1', 'paragraph', 'Content under header 1.1.\nAnother piece of content.'),
      fragment('2-heading', 'heading', '# Header 2'),
      fragment('2-content-1', 'paragraph', 'Content under header 2.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('preserves list indentation and reduces unnecessary line breaks', async() => {
    const input = `
# Header 1
Content under header 1.

- Item 1
  - Subitem 1
- Item 2


# Header 2
Content under header 2.
    `;

    // The nested item keeps its two-space indent; the extra blank lines after the list are not expected in any fragment.
    const expected: MarkdownFragment[] = [
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Content under header 1.'),
      fragment('1-content-2', 'list', '- Item 1\n  - Subitem 1\n- Item 2'),
      fragment('2-heading', 'heading', '# Header 2'),
      fragment('2-content-1', 'paragraph', 'Content under header 2.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('code blocks containing # are not treated as headings', async() => {
    const input = `
# Header 1
Some introductory content.
\`\`\`
# This is a comment with a # symbol
Some code line
\`\`\`
Additional content.
# Header 2
Content under header 2.
    `;

    // The fenced block, including its `#` line, is expected as a single `code` fragment.
    const expected: MarkdownFragment[] = [
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Some introductory content.'),
      fragment('1-content-2', 'code', '```\n# This is a comment with a # symbol\nSome code line\n```'),
      fragment('1-content-3', 'paragraph', 'Additional content.'),
      fragment('2-heading', 'heading', '# Header 2'),
      fragment('2-content-1', 'paragraph', 'Content under header 2.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });

  test('frontmatter is processed and labeled correctly', async() => {
    const input = `---
title: Test Document
author: John Doe
---

# Header 1
Some introductory content.
    `;

    // The frontmatter fragment is expected to hold the YAML data as pretty-printed JSON (2-space indent).
    const frontmatterJson = JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2);

    const expected: MarkdownFragment[] = [
      fragment('frontmatter', 'yaml', frontmatterJson),
      fragment('1-heading', 'heading', '# Header 1'),
      fragment('1-content-1', 'paragraph', 'Some introductory content.'),
    ];

    expect(await splitMarkdownIntoFragments(input, MODEL)).toEqual(expected);
  });
});