Răsfoiți Sursa

rm markdown-splitter 😢

Shun Miyazawa 1 an în urmă
părinte
comite
1d9a8ea5ef

+ 0 - 573
apps/app/src/features/openai/server/services/markdown-splitter/markdown-splitter.spec.ts

@@ -1,573 +0,0 @@
-import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
-
-import { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';
-
-const MODEL: TiktokenModel = 'gpt-4';
-const encoder = encodingForModel(MODEL);
-
-describe('splitMarkdownIntoFragments', () => {
-
-  test('handles empty markdown string', async() => {
-    const markdown = '';
-    const expected: MarkdownFragment[] = [];
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles markdown with only content and no headers', async() => {
-    const markdown = `This is some content without any headers.
-It spans multiple lines.
-
-Another paragraph.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '0-content-1',
-        type: 'paragraph',
-        text: 'This is some content without any headers.\nIt spans multiple lines.',
-        tokenCount: encoder.encode('This is some content without any headers.\nIt spans multiple lines.').length,
-      },
-      {
-        label: '0-content-2',
-        type: 'paragraph',
-        text: 'Another paragraph.',
-        tokenCount: encoder.encode('Another paragraph.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles markdown starting with a header', async() => {
-    const markdown = `
-# Header 1
-Content under header 1.
-
-## Header 1.1
-Content under header 1.1.
-
-# Header 2
-Content under header 2.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.',
-        tokenCount: encoder.encode('Content under header 1.').length,
-      },
-      {
-        label: '1-1-heading',
-        type: 'heading',
-        text: '## Header 1.1',
-        tokenCount: encoder.encode('## Header 1.1').length,
-      },
-      {
-        label: '1-1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.1.',
-        tokenCount: encoder.encode('Content under header 1.1.').length,
-      },
-      {
-        label: '2-heading',
-        type: 'heading',
-        text: '# Header 2',
-        tokenCount: encoder.encode('# Header 2').length,
-      },
-      {
-        label: '2-content-1',
-        type: 'paragraph',
-        text: 'Content under header 2.',
-        tokenCount: encoder.encode('Content under header 2.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles markdown with non-consecutive heading levels', async() => {
-    const markdown = `
-Introduction without a header.
-
-# Chapter 1
-Content of chapter 1.
-
-### Section 1.1.1
-Content of section 1.1.1.
-
-## Section 1.2
-Content of section 1.2.
-
-# Chapter 2
-Content of chapter 2.
-
-## Section 2.1
-Content of section 2.1.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '0-content-1',
-        type: 'paragraph',
-        text: 'Introduction without a header.',
-        tokenCount: encoder.encode('Introduction without a header.').length,
-      },
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Chapter 1',
-        tokenCount: encoder.encode('# Chapter 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Content of chapter 1.',
-        tokenCount: encoder.encode('Content of chapter 1.').length,
-      },
-      {
-        label: '1-1-1-heading',
-        type: 'heading',
-        text: '### Section 1.1.1',
-        tokenCount: encoder.encode('### Section 1.1.1').length,
-      },
-      {
-        label: '1-1-1-content-1',
-        type: 'paragraph',
-        text: 'Content of section 1.1.1.',
-        tokenCount: encoder.encode('Content of section 1.1.1.').length,
-      },
-      {
-        label: '1-2-heading',
-        type: 'heading',
-        text: '## Section 1.2',
-        tokenCount: encoder.encode('## Section 1.2').length,
-      },
-      {
-        label: '1-2-content-1',
-        type: 'paragraph',
-        text: 'Content of section 1.2.',
-        tokenCount: encoder.encode('Content of section 1.2.').length,
-      },
-      {
-        label: '2-heading',
-        type: 'heading',
-        text: '# Chapter 2',
-        tokenCount: encoder.encode('# Chapter 2').length,
-      },
-      {
-        label: '2-content-1',
-        type: 'paragraph',
-        text: 'Content of chapter 2.',
-        tokenCount: encoder.encode('Content of chapter 2.').length,
-      },
-      {
-        label: '2-1-heading',
-        type: 'heading',
-        text: '## Section 2.1',
-        tokenCount: encoder.encode('## Section 2.1').length,
-      },
-      {
-        label: '2-1-content-1',
-        type: 'paragraph',
-        text: 'Content of section 2.1.',
-        tokenCount: encoder.encode('Content of section 2.1.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles markdown with skipped heading levels', async() => {
-    const markdown = `
-# Header 1
-Content under header 1.
-
-#### Header 1.1.1.1
-Content under header 1.1.1.1.
-
-## Header 1.2
-Content under header 1.2.
-
-# Header 2
-Content under header 2.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.',
-        tokenCount: encoder.encode('Content under header 1.').length,
-      },
-      {
-        label: '1-1-1-1-heading',
-        type: 'heading',
-        text: '#### Header 1.1.1.1',
-        tokenCount: encoder.encode('#### Header 1.1.1.1').length,
-      },
-      {
-        label: '1-1-1-1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.1.1.1.',
-        tokenCount: encoder.encode('Content under header 1.1.1.1.').length,
-      },
-      {
-        label: '1-2-heading',
-        type: 'heading',
-        text: '## Header 1.2',
-        tokenCount: encoder.encode('## Header 1.2').length,
-      },
-      {
-        label: '1-2-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.2.',
-        tokenCount: encoder.encode('Content under header 1.2.').length,
-      },
-      {
-        label: '2-heading',
-        type: 'heading',
-        text: '# Header 2',
-        tokenCount: encoder.encode('# Header 2').length,
-      },
-      {
-        label: '2-content-1',
-        type: 'paragraph',
-        text: 'Content under header 2.',
-        tokenCount: encoder.encode('Content under header 2.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles malformed headings', async() => {
-    const markdown = `
-# Header 1
-Content under header 1.
-
-#### Header 1.1.1.1
-Content under header 1.1.1.1.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.',
-        tokenCount: encoder.encode('Content under header 1.').length,
-      },
-      {
-        label: '1-1-1-1-heading',
-        type: 'heading',
-        text: '#### Header 1.1.1.1',
-        tokenCount: encoder.encode('#### Header 1.1.1.1').length,
-      },
-      {
-        label: '1-1-1-1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.1.1.1.',
-        tokenCount: encoder.encode('Content under header 1.1.1.1.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles multiple content blocks before any headers', async() => {
-    const markdown = `
-This is the first paragraph without a header.
-
-This is the second paragraph without a header.
-
-# Header 1
-Content under header 1.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '0-content-1',
-        type: 'paragraph',
-        text: 'This is the first paragraph without a header.',
-        tokenCount: encoder.encode('This is the first paragraph without a header.').length,
-      },
-      {
-        label: '0-content-2',
-        type: 'paragraph',
-        text: 'This is the second paragraph without a header.',
-        tokenCount: encoder.encode('This is the second paragraph without a header.').length,
-      },
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.',
-        tokenCount: encoder.encode('Content under header 1.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles markdown with only headers and no content', async() => {
-    const markdown = `
-# Header 1
-
-## Header 1.1
-
-### Header 1.1.1
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-1-heading',
-        type: 'heading',
-        text: '## Header 1.1',
-        tokenCount: encoder.encode('## Header 1.1').length,
-      },
-      {
-        label: '1-1-1-heading',
-        type: 'heading',
-        text: '### Header 1.1.1',
-        tokenCount: encoder.encode('### Header 1.1.1').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('handles markdown with mixed content and headers', async() => {
-    const markdown = `
-# Header 1
-Content under header 1.
-
-## Header 1.1
-Content under header 1.1.
-Another piece of content.
-
-# Header 2
-Content under header 2.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.',
-        tokenCount: encoder.encode('Content under header 1.').length,
-      },
-      {
-        label: '1-1-heading',
-        type: 'heading',
-        text: '## Header 1.1',
-        tokenCount: encoder.encode('## Header 1.1').length,
-      },
-      {
-        label: '1-1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.1.\nAnother piece of content.',
-        tokenCount: encoder.encode('Content under header 1.1.\nAnother piece of content.').length,
-      },
-      {
-        label: '2-heading',
-        type: 'heading',
-        text: '# Header 2',
-        tokenCount: encoder.encode('# Header 2').length,
-      },
-      {
-        label: '2-content-1',
-        type: 'paragraph',
-        text: 'Content under header 2.',
-        tokenCount: encoder.encode('Content under header 2.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('preserves list indentation and reduces unnecessary line breaks', async() => {
-    const markdown = `
-# Header 1
-Content under header 1.
-
-- Item 1
-  - Subitem 1
-- Item 2
-
-
-# Header 2
-Content under header 2.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Content under header 1.',
-        tokenCount: encoder.encode('Content under header 1.').length,
-      },
-      {
-        label: '1-content-2',
-        type: 'list',
-        text: '- Item 1\n  - Subitem 1\n- Item 2',
-        tokenCount: encoder.encode('- Item 1\n  - Subitem 1\n- Item 2').length,
-      },
-      {
-        label: '2-heading',
-        type: 'heading',
-        text: '# Header 2',
-        tokenCount: encoder.encode('# Header 2').length,
-      },
-      {
-        label: '2-content-1',
-        type: 'paragraph',
-        text: 'Content under header 2.',
-        tokenCount: encoder.encode('Content under header 2.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('code blocks containing # are not treated as headings', async() => {
-    const markdown = `
-# Header 1
-Some introductory content.
-\`\`\`
-# This is a comment with a # symbol
-Some code line
-\`\`\`
-Additional content.
-# Header 2
-Content under header 2.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Some introductory content.',
-        tokenCount: encoder.encode('Some introductory content.').length,
-      },
-      {
-        label: '1-content-2',
-        type: 'code',
-        text: '```\n# This is a comment with a # symbol\nSome code line\n```',
-        tokenCount: encoder.encode('```\n# This is a comment with a # symbol\nSome code line\n```').length,
-      },
-      {
-        label: '1-content-3',
-        type: 'paragraph',
-        text: 'Additional content.',
-        tokenCount: encoder.encode('Additional content.').length,
-      },
-      {
-        label: '2-heading',
-        type: 'heading',
-        text: '# Header 2',
-        tokenCount: encoder.encode('# Header 2').length,
-      },
-      {
-        label: '2-content-1',
-        type: 'paragraph',
-        text: 'Content under header 2.',
-        tokenCount: encoder.encode('Content under header 2.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-
-  test('frontmatter is processed and labeled correctly', async() => {
-    const markdown = `---
-title: Test Document
-author: John Doe
----
-
-# Header 1
-Some introductory content.
-    `;
-
-    const expected: MarkdownFragment[] = [
-      {
-        label: 'frontmatter',
-        type: 'yaml',
-        text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2),
-        tokenCount: encoder.encode(JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2)).length,
-      },
-      {
-        label: '1-heading',
-        type: 'heading',
-        text: '# Header 1',
-        tokenCount: encoder.encode('# Header 1').length,
-      },
-      {
-        label: '1-content-1',
-        type: 'paragraph',
-        text: 'Some introductory content.',
-        tokenCount: encoder.encode('Some introductory content.').length,
-      },
-    ];
-
-    const result = await splitMarkdownIntoFragments(markdown, MODEL);
-    expect(result).toEqual(expected);
-  });
-});

+ 0 - 133
apps/app/src/features/openai/server/services/markdown-splitter/markdown-splitter.ts

@@ -1,133 +0,0 @@
-import { dynamicImport } from '@cspell/dynamic-import';
-import type { TiktokenModel } from 'js-tiktoken';
-import { encodingForModel } from 'js-tiktoken';
-import yaml from 'js-yaml';
-import type * as RemarkFrontmatter from 'remark-frontmatter';
-import type * as RemarkGfm from 'remark-gfm';
-import type * as RemarkParse from 'remark-parse';
-import type * as RemarkStringify from 'remark-stringify';
-import type * as Unified from 'unified';
-
-
-export type MarkdownFragment = {
-  label: string;
-  type: string;
-  text: string;
-  tokenCount: number;
-};
-
-/**
- * Advances the running section counter for a newly encountered heading and returns its label.
- * The array is mutated in place; each index holds the counter for one heading level.
- * A heading deeper than the current nesting fills every missing level with 1,
- * so skipped levels (e.g. `#` followed by `###`) still yield a full label such as `1-1-1`.
- * @param sectionNumbers - The current section numbers (mutated).
- * @param headingDepth - The depth of the heading (e.g., # is depth 1).
- * @returns The section numbers joined with '-'.
- */
-function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): string {
-  const opensDeeperLevel = headingDepth > sectionNumbers.length;
-
-  if (opensDeeperLevel) {
-    // Pad every level between the current nesting and the new heading with 1
-    for (let level = sectionNumbers.length; level < headingDepth; level++) {
-      sectionNumbers.push(1);
-    }
-    return sectionNumbers.join('-');
-  }
-
-  // Same level or shallower: drop deeper counters (a no-op when the depth is unchanged),
-  // then advance the counter of the heading's own level
-  sectionNumbers.splice(headingDepth);
-  sectionNumbers[headingDepth - 1]++;
-
-  return sectionNumbers.join('-');
-}
-
-/**
- * Splits Markdown text into labeled fragments, one per top-level node of the parsed tree.
- *
- * Labels:
- * - `frontmatter` for a YAML frontmatter node (its text is the parsed YAML re-serialized as indented JSON)
- * - `<section>-heading` for headings, e.g. `1-2-heading`
- * - `<section>-content-<n>` for any other node, numbered per section;
- *   content that appears before the first heading uses section `0`
- *
- * @param markdownText - The input Markdown string.
- * @param model - The model whose tokenizer is used to count the tokens of each fragment.
- * @returns The labeled fragments in document order, or an empty array for blank or non-string input.
- */
-export async function splitMarkdownIntoFragments(markdownText: string, model: TiktokenModel): Promise<MarkdownFragment[]> {
-  const markdownFragments: MarkdownFragment[] = [];
-
-  if (typeof markdownText !== 'string' || markdownText.trim() === '') {
-    return markdownFragments;
-  }
-
-  const encoder = encodingForModel(model);
-
-  // The module loads do not depend on each other, so they are awaited together
-  const [
-    remarkParseModule,
-    remarkFrontmatterModule,
-    remarkGfmModule,
-    remarkStringifyModule,
-    unifiedModule,
-  ] = await Promise.all([
-    dynamicImport<typeof RemarkParse>('remark-parse', __dirname),
-    dynamicImport<typeof RemarkFrontmatter>('remark-frontmatter', __dirname),
-    dynamicImport<typeof RemarkGfm>('remark-gfm', __dirname),
-    dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname),
-    dynamicImport<typeof Unified>('unified', __dirname),
-  ]);
-  const remarkParse = remarkParseModule.default;
-  const remarkFrontmatter = remarkFrontmatterModule.default;
-  const remarkGfm = remarkGfmModule.default;
-  const remarkStringify = remarkStringifyModule.default;
-  const unified = unifiedModule.unified;
-
-  const parser = unified()
-    .use(remarkParse)
-    .use(remarkFrontmatter, ['yaml'])
-    .use(remarkGfm); // Enable GFM extensions
-
-  const stringifyOptions: RemarkStringify.Options = {
-    bullet: '-', // Set list bullet to hyphen
-    rule: '-', // Use hyphen for horizontal rules
-  };
-
-  const stringifier = unified()
-    .use(remarkFrontmatter, ['yaml'])
-    .use(remarkGfm)
-    .use(remarkStringify, stringifyOptions);
-
-  const parsedTree = parser.parse(markdownText);
-
-  const sectionNumbers: number[] = [];
-  let currentSectionLabel = '';
-  // Number of content fragments emitted so far, keyed by section label ('0' before the first heading)
-  const contentCounters: Record<string, number> = {};
-
-  // Iterate over top-level nodes only, so nested nodes are emitted once as part of their parent
-  for (const node of parsedTree.children) {
-    if (node.type === 'yaml') {
-      const frontmatter = yaml.load(node.value);
-      // An empty frontmatter block parses to undefined, and JSON.stringify(undefined) is not a string,
-      // so there is nothing to tokenize or emit
-      if (frontmatter !== undefined) {
-        const frontmatterText = JSON.stringify(frontmatter, null, 2);
-        markdownFragments.push({
-          label: 'frontmatter',
-          type: 'yaml',
-          text: frontmatterText,
-          tokenCount: encoder.encode(frontmatterText).length,
-        });
-      }
-    }
-    else if (node.type === 'heading') {
-      currentSectionLabel = updateSectionNumbers(sectionNumbers, node.depth);
-
-      const headingMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
-      markdownFragments.push({
-        label: `${currentSectionLabel}-heading`,
-        type: node.type,
-        text: headingMarkdown,
-        tokenCount: encoder.encode(headingMarkdown).length,
-      });
-    }
-    else {
-      // Process non-heading content individually
-      const contentMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
-      if (contentMarkdown !== '') {
-        const sectionKey = currentSectionLabel || '0';
-        const contentIndex = (contentCounters[sectionKey] ?? 0) + 1;
-        contentCounters[sectionKey] = contentIndex;
-
-        markdownFragments.push({
-          label: `${sectionKey}-content-${contentIndex}`,
-          type: node.type,
-          text: contentMarkdown,
-          tokenCount: encoder.encode(contentMarkdown).length,
-        });
-      }
-    }
-  }
-
-  return markdownFragments;
-}

+ 0 - 134
apps/app/src/features/openai/server/services/markdown-splitter/markdown-token-splitter.spec.ts

@@ -1,134 +0,0 @@
-import type { TiktokenModel } from 'js-tiktoken';
-import { encodingForModel } from 'js-tiktoken';
-
-import { splitMarkdownIntoChunks } from './markdown-token-splitter';
-
-const MODEL: TiktokenModel = 'gpt-4';
-const encoder = encodingForModel(MODEL);
-
-describe('splitMarkdownIntoChunks', () => {
-  const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);
-  const markdown = `---
-title: Test Document
-author: John Doe
----
-
-${repeatedText}
-
-# Header 1
-
-This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
-This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
-
-## Header 1-1
-
-This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
-
-
-### Header 1-1-1
-
-This is the first paragraph under header 1-1-1. The content is nested deeper,
-making sure that the chunking algorithm works properly with multiple levels of headers.
-
-This is another paragraph under header 1-1-1, continuing the content at this deeper level.
-
-#### Header 1-1-1-1
-
-Now we have reached the fourth level of headers. The text here should also be properly chunked and grouped with its parent headers.
-
-This is another paragraph under header 1-1-1-1. It should be grouped with the correct higher-level headers.
-
-# Header 2
-
-Here is some content under header 2. This section should also be sufficiently long to ensure that the token count threshold is reached in the test.
-
-## Header 2-1
-
-${repeatedText}
-
-${repeatedText}
-
-Another sub-header under header 2 with text for testing chunking behavior. This is a fairly lengthy paragraph as well.
-
-We now have a fourth-level sub-header under header 2-1. This ensures that the chunking logic can handle deeply nested content.
-
-### Header 2-1-1
-
-Here is another paragraph under header 2-1-1. This paragraph is part of a more deeply nested section.
-
-# Header 3
-
-Continuing with more headers and content to make sure the markdown document is sufficiently large. This is a new header with more paragraphs under it.
-
-### Header 3-1
-
-This is a sub-header under header 3. The content here continues to grow, ensuring that the markdown is long enough to trigger multiple chunks.
-
-#### Header 3-1-1
-
-Here is a fourth-level sub-header under header 3-1. This paragraph is designed to create a larger markdown file for testing purposes.
-`;
-  test('Each chunk should not exceed the specified token count', async() => {
-    const maxToken = 800;
-    // The splitter cuts oversized content by an estimated character count, hence the 10% tolerance
-    const upperBound = maxToken * 1.1;
-
-    const chunks = await splitMarkdownIntoChunks(markdown, MODEL, maxToken);
-
-    for (const chunk of chunks) {
-      expect(encoder.encode(chunk).length).toBeLessThanOrEqual(upperBound);
-    }
-  });
-  test('Each chunk should include the relevant top-level header', async() => {
-    const topLevelHeaders = ['# Header 1', '# Header 2', '# Header 3'];
-
-    const chunks = await splitMarkdownIntoChunks(markdown, MODEL, 800);
-
-    for (const chunk of chunks) {
-      // A chunk either carries one of the top-level headers or contains no header marker at all
-      const hasTopLevelHeader = topLevelHeaders.some(header => chunk.includes(header));
-      const hasNoHeaderMarker = !chunk.includes('# ');
-
-      expect(hasTopLevelHeader || hasNoHeaderMarker).toBe(true);
-    }
-  });
-  test('Should throw an error if a header exceeds half of maxToken size with correct error message', async() => {
-    const maxToken = 800;
-    const markdownWithLongHeader = `
-# Short Header 1
-
-This is the first paragraph under short header 1. It contains some text for testing purposes.
-
-## ${repeatedText}
-
-This is the first paragraph under the long header. It contains text to ensure that the header length check is triggered if the header is too long.
-
-# Short Header 2
-
-Another section with a shorter header, but enough content to ensure proper chunking.
-`;
-
-    // `rejects` fails the test when the promise resolves; a bare try/catch would pass
-    // without running any assertion if no error were thrown
-    await expect(splitMarkdownIntoChunks(markdownWithLongHeader, MODEL, maxToken))
-      .rejects
-      .toThrow('Heading token count is too large');
-  });
-
-  test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
-    const maxToken = 800;
-    const markdownText = `
-    # Header 1
-    This is a short paragraph under header 1. It contains only a few sentences to ensure that the total token count remains under the maxToken limit.
-    `;
-
-    const chunks = await splitMarkdownIntoChunks(markdownText, MODEL, maxToken);
-
-    // Exactly one chunk, and it is the untouched input text
-    expect(chunks).toEqual([markdownText]);
-  });
-});

+ 0 - 188
apps/app/src/features/openai/server/services/markdown-splitter/markdown-token-splitter.ts

@@ -1,188 +0,0 @@
-import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
-
-import { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';
-
-type MarkdownFragmentGroups = MarkdownFragment[][] ;
-
-/**
- * Groups fragments by section so that each group can later be rendered as one chunk.
- *
- * Sections are visited in document order. A section that has sub-sections is grouped together
- * with all of its descendants when their combined token count (ancestor headings included)
- * fits into `maxToken`; otherwise only the section's own fragments are grouped and each
- * sub-section is visited on its own. Every group starts with the headings of the section's
- * ancestors, outermost first. A section without sub-sections is grouped without a size check.
- *
- * @param markdownFragments - Labeled fragments in document order.
- * @param maxToken - Token budget used to decide whether sub-sections are merged into their parent.
- * @returns The fragment groups in document order.
- */
-function groupMarkdownFragments(
-    markdownFragments: MarkdownFragment[],
-    maxToken: number,
-): MarkdownFragmentGroups {
-
-  // Section a fragment belongs to: 'frontmatter', or the leading numeric part of its label
-  // (e.g. '1-2' for '1-2-content-3'); undefined when the label has neither form
-  const ownPrefixes = markdownFragments.map(({ label }): string | undefined => {
-    if (label === 'frontmatter') return 'frontmatter';
-    return label.match(/^\d+(?:-\d+)*/)?.[0];
-  });
-
-  const uniquePrefixes = [...new Set(ownPrefixes.filter((p): p is string => p != null && p !== ''))];
-
-  // The '-' delimiter keeps section '1' from being treated as an ancestor of section '10'
-  const isDescendantOf = (candidate: string, prefix: string): boolean => candidate.startsWith(`${prefix}-`);
-
-  // Headings of every ancestor section of `prefix` that exists, outermost first
-  const findParentHeadings = (prefix: string): MarkdownFragment[] => {
-    const parts = prefix.split('-');
-    const parentHeadings: MarkdownFragment[] = [];
-    for (let i = 1; i < parts.length; i++) {
-      const parentLabel = `${parts.slice(0, i).join('-')}-heading`;
-      const parentHeading = markdownFragments.find(fragment => fragment.label === parentLabel);
-      if (parentHeading) {
-        parentHeadings.push(parentHeading);
-      }
-    }
-    return parentHeadings;
-  };
-
-  const fragmentGroups: MarkdownFragmentGroups = [];
-  // Sections whose descendants were already merged into the section's own group
-  const mergedPrefixes: string[] = [];
-
-  for (const prefix of uniquePrefixes) {
-    if (mergedPrefixes.some(merged => isDescendantOf(prefix, merged))) {
-      continue;
-    }
-
-    const parentHeadings = findParentHeadings(prefix);
-    const ownFragments = markdownFragments.filter((_, index) => ownPrefixes[index] === prefix);
-    const hasSubSections = uniquePrefixes.some(p => isDescendantOf(p, prefix));
-
-    if (!hasSubSections) {
-      fragmentGroups.push([...parentHeadings, ...ownFragments]);
-      continue;
-    }
-
-    // The section together with all of its descendants
-    const subtreeFragments = markdownFragments.filter((_, index) => {
-      const ownPrefix = ownPrefixes[index];
-      return ownPrefix != null && (ownPrefix === prefix || isDescendantOf(ownPrefix, prefix));
-    });
-    const subtreeGroup = [...parentHeadings, ...subtreeFragments];
-    const totalTokenCount = subtreeGroup.reduce((sum, fragment) => sum + fragment.tokenCount, 0);
-
-    if (totalTokenCount <= maxToken) {
-      // The whole subtree fits: emit it as one group and skip its sub-sections later
-      fragmentGroups.push(subtreeGroup);
-      mergedPrefixes.push(prefix);
-    }
-    else {
-      // Too large: emit only the section's own fragments; sub-sections are visited separately
-      fragmentGroups.push([...parentHeadings, ...ownFragments]);
-    }
-  }
-
-  return fragmentGroups;
-}
-
-/**
- * Splits Markdown text into chunks sized against a token budget.
- *
- * Text whose total token count fits into `maxToken` is returned unchanged as a single chunk.
- * Otherwise the text is split into fragments, the fragments are grouped by section, and each
- * group becomes one chunk when it fits. For a group that does not fit, every non-heading
- * fragment becomes its own chunk prefixed with the group's headings; a fragment that is still
- * too large is cut by a character count estimated from its token/character ratio, so the real
- * token count of such a chunk can differ from the estimate.
- *
- * @param markdownText - The input Markdown string.
- * @param model - The model whose tokenizer is used for counting tokens.
- * @param maxToken - Token budget per chunk (default 800).
- * @returns The chunks in document order.
- * @throws Error when the headings of an oversized group take more than half of `maxToken`.
- */
-export async function splitMarkdownIntoChunks(
-    markdownText: string,
-    model: TiktokenModel,
-    maxToken = 800,
-): Promise<string[]> {
-  const encoder = encodingForModel(model);
-
-  // If the total token count for the entire markdown text is less than or equal to maxToken,
-  // return the entire markdown as a single chunk.
-  if (encoder.encode(markdownText).length <= maxToken) {
-    return [markdownText];
-  }
-
-  const markdownFragments = await splitMarkdownIntoFragments(markdownText, model);
-  const fragmentGroups = groupMarkdownFragments(markdownFragments, maxToken);
-
-  const sumTokenCounts = (fragments: MarkdownFragment[]): number => fragments.reduce((sum, fragment) => sum + fragment.tokenCount, 0);
-
-  const chunks: string[] = [];
-
-  for (const fragmentGroup of fragmentGroups) {
-    // The group fits: render it as a single chunk
-    if (sumTokenCounts(fragmentGroup) <= maxToken) {
-      const chunk = fragmentGroup.map((fragment, index) => {
-        const nextFragment = fragmentGroup[index + 1];
-        if (nextFragment == null) {
-          return fragment.text; // No newlines after the last fragment
-        }
-        // Consecutive headings are separated by a single newline, everything else by a blank line
-        const separator = fragment.type === 'heading' && nextFragment.type === 'heading' ? '\n' : '\n\n';
-        return `${fragment.text}${separator}`;
-      }).join('');
-
-      chunks.push(chunk);
-      continue;
-    }
-
-    // The group is too large: emit one chunk per non-heading fragment, each prefixed with the headings.
-    // Selecting by type (instead of by a 'content' label) keeps an oversized frontmatter fragment
-    // from being dropped.
-    const headingFragments = fragmentGroup.filter(fragment => fragment.type === 'heading');
-    const bodyFragments = fragmentGroup.filter(fragment => fragment.type !== 'heading');
-    if (bodyFragments.length === 0) {
-      continue;
-    }
-
-    const headingText = headingFragments.map(heading => heading.text).join('\n'); // Combine headings with one newline
-    const headingTokenCount = sumTokenCounts(headingFragments);
-
-    if (headingTokenCount > maxToken / 2) {
-      throw new Error(
-        `Heading token count is too large. Heading token count: ${headingTokenCount}, allowed maximum: ${Math.ceil(maxToken / 2)}`,
-      );
-    }
-
-    // Avoid a leading blank line when the group has no heading
-    const withHeadings = (body: string): string => (headingText ? `${headingText}\n\n${body}` : body);
-    const remainingTokenCount = maxToken - headingTokenCount;
-
-    for (const fragment of bodyFragments) {
-      if (headingTokenCount + fragment.tokenCount <= maxToken) {
-        chunks.push(withHeadings(fragment.text));
-        continue;
-      }
-
-      // Estimate how many characters fit into the remaining token budget from the fragment's
-      // overall token/character ratio
-      const estimatedCharCount = Math.floor((remainingTokenCount / fragment.tokenCount) * fragment.text.length);
-      // A step below 1 (or NaN) would keep the slicing loop from ever advancing
-      const charCountForSplit = estimatedCharCount >= 1 ? estimatedCharCount : 1;
-
-      for (let i = 0; i < fragment.text.length; i += charCountForSplit) {
-        chunks.push(withHeadings(fragment.text.slice(i, i + charCountForSplit)));
-      }
-    }
-  }
-
-  return chunks;
-}