Просмотр исходного кода

Update the logic for splitting Markdown into chunks

nHigashiWeseek 1 год назад
Родитель
Commit
196ab99656

+ 19 - 33
packages/markdown-splitter/src/services/markdown-splitter.ts

@@ -8,24 +8,11 @@ import { unified } from 'unified';
 
 export type Chunk = {
   label: string;
+  type: string;
   text: string;
+  tokenCount?: number;
 };
 
-/**
- * Processes and adds a new chunk to the chunks array if content is not empty.
- * Clears the contentBuffer array after processing.
- * @param chunks - The array to store processed chunks.
- * @param contentBuffer - The array of content lines to be processed.
- * @param label - The label for the content chunk.
- */
-function addContentChunk(chunks: Chunk[], contentBuffer: string[], label: string) {
-  const text = contentBuffer.join('\n\n').trimEnd();
-  if (text !== '') {
-    chunks.push({ label, text });
-  }
-  contentBuffer.length = 0; // Clear the contentBuffer array
-}
-
 /**
  * Updates the section numbers based on the heading depth and returns the updated section label.
  * Handles non-consecutive heading levels by initializing missing levels with 1.
@@ -54,7 +41,7 @@ function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): s
 
 /**
  * Splits Markdown text into labeled chunks using remark-parse and remark-stringify,
- * considering content that may start before any headers and handling non-consecutive heading levels.
+ * processing each content node separately and labeling them as 1-content-1, 1-content-2, etc.
  * @param markdownText - The input Markdown string.
  * @returns An array of labeled chunks.
  */
@@ -62,8 +49,8 @@ export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chu
   const chunks: Chunk[] = [];
   const sectionNumbers: number[] = [];
   let frontmatter: Record<string, unknown> | null = null; // Variable to store frontmatter
-  const contentBuffer: string[] = [];
   let currentSectionLabel = '';
+  const contentCounters: Record<string, number> = {};
 
   if (typeof markdownText !== 'string' || markdownText.trim() === '') {
     return chunks;
@@ -92,36 +79,35 @@ export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chu
       frontmatter = yaml.load(node.value) as Record<string, unknown>;
     }
     else if (node.type === 'heading') {
-      // Process pending content before heading
-      if (contentBuffer.length > 0) {
-        const contentLabel = currentSectionLabel !== '' ? `${currentSectionLabel}-content` : '0-content';
-        addContentChunk(chunks, contentBuffer, contentLabel);
-      }
-
       const headingDepth = node.depth;
       currentSectionLabel = updateSectionNumbers(sectionNumbers, headingDepth);
 
-      const headingMarkdown = stringifier.stringify(node as any);// eslint-disable-line @typescript-eslint/no-explicit-any
-      chunks.push({ label: `${currentSectionLabel}-heading`, text: headingMarkdown.trim() });
+      const headingMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
+      chunks.push({ label: `${currentSectionLabel}-heading`, type: node.type, text: headingMarkdown });
     }
     else {
-      // Add non-heading content to the buffer
+      // Process non-heading content individually
       const contentMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
       if (contentMarkdown !== '') {
-        contentBuffer.push(contentMarkdown);
+        const contentCountKey = currentSectionLabel || '0';
+        if (!contentCounters[contentCountKey]) {
+          contentCounters[contentCountKey] = 1;
+        }
+        else {
+          contentCounters[contentCountKey]++;
+        }
+        const contentLabel = currentSectionLabel !== ''
+          ? `${currentSectionLabel}-content-${contentCounters[contentCountKey]}`
+          : `0-content-${contentCounters[contentCountKey]}`;
+        chunks.push({ label: contentLabel, type: node.type, text: contentMarkdown });
       }
     }
   }
 
-  // Process any remaining content
-  if (contentBuffer.length > 0) {
-    const contentLabel = currentSectionLabel !== '' ? `${currentSectionLabel}-content` : '0-content';
-    addContentChunk(chunks, contentBuffer, contentLabel);
-  }
-
   if (frontmatter) {
     chunks.unshift({
       label: 'frontmatter',
+      type: 'yaml',
       text: JSON.stringify(frontmatter, null, 2),
     });
   }

+ 84 - 50
packages/markdown-splitter/test/index.spec.ts

@@ -18,8 +18,14 @@ Another paragraph.
     `;
     const expected: Chunk[] = [
       {
-        label: '0-content',
-        text: 'This is some content without any headers.\nIt spans multiple lines.\n\nAnother paragraph.',
+        label: '0-content-1',
+        type: 'paragraph',
+        text: 'This is some content without any headers.\nIt spans multiple lines.',
+      },
+      {
+        label: '0-content-2',
+        type: 'paragraph',
+        text: 'Another paragraph.',
       },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
@@ -38,12 +44,12 @@ Content under header 1.1.
 Content under header 2.
     `;
     const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-heading', text: '## Header 1.1' },
-      { label: '1-1-content', text: 'Content under header 1.1.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
+      { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
+      { label: '1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.' },
+      { label: '2-heading', type: 'heading', text: '# Header 2' },
+      { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
@@ -70,47 +76,58 @@ Content of section 2.1.
     `;
     const expected: Chunk[] = [
       {
-        label: '0-content',
+        label: '0-content-1',
+        type: 'paragraph',
         text: 'Introduction without a header.',
       },
       {
         label: '1-heading',
+        type: 'heading',
         text: '# Chapter 1',
       },
       {
-        label: '1-content',
+        label: '1-content-1',
+        type: 'paragraph',
         text: 'Content of chapter 1.',
       },
       {
         label: '1-1-1-heading',
+        type: 'heading',
         text: '### Section 1.1.1',
       },
       {
-        label: '1-1-1-content',
+        label: '1-1-1-content-1',
+        type: 'paragraph',
         text: 'Content of section 1.1.1.',
       },
       {
         label: '1-2-heading',
+        type: 'heading',
         text: '## Section 1.2',
       },
       {
-        label: '1-2-content',
+        label: '1-2-content-1',
+        type: 'paragraph',
         text: 'Content of section 1.2.',
       },
       {
         label: '2-heading',
+        type: 'heading',
         text: '# Chapter 2',
       },
       {
-        label: '2-content',
+        label: '2-content-1',
+        type: 'paragraph',
         text: 'Content of chapter 2.',
       },
       {
         label: '2-1-heading',
+        type: 'heading',
         text: '## Section 2.1',
       },
       {
-        label: '2-1-content',
+        label: '2-1-content-1',
+        type: 'paragraph',
         text: 'Content of section 2.1.',
       },
     ];
@@ -133,14 +150,14 @@ Content under header 1.2.
 Content under header 2.
     `;
     const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
-      { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
-      { label: '1-2-heading', text: '## Header 1.2' },
-      { label: '1-2-content', text: 'Content under header 1.2.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
+      { label: '1-1-1-1-heading', type: 'heading', text: '#### Header 1.1.1.1' },
+      { label: '1-1-1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.1.1.' },
+      { label: '1-2-heading', type: 'heading', text: '## Header 1.2' },
+      { label: '1-2-content-1', type: 'paragraph', text: 'Content under header 1.2.' },
+      { label: '2-heading', type: 'heading', text: '# Header 2' },
+      { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
@@ -155,10 +172,10 @@ Content under header 1.
 Content under header 1.1.1.1.
     `;
     const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
-      { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
+      { label: '1-1-1-1-heading', type: 'heading', text: '#### Header 1.1.1.1' },
+      { label: '1-1-1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.1.1.' },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
@@ -175,11 +192,25 @@ Content under header 1.
     `;
     const expected: Chunk[] = [
       {
-        label: '0-content',
-        text: 'This is the first paragraph without a header.\n\nThis is the second paragraph without a header.',
+        label: '0-content-1',
+        type: 'paragraph',
+        text: 'This is the first paragraph without a header.',
+      },
+      {
+        label: '0-content-2',
+        type: 'paragraph',
+        text: 'This is the second paragraph without a header.',
+      },
+      {
+        label: '1-heading',
+        type: 'heading',
+        text: '# Header 1',
+      },
+      {
+        label: '1-content-1',
+        type: 'paragraph',
+        text: 'Content under header 1.',
       },
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
@@ -194,9 +225,9 @@ Content under header 1.
 ### Header 1.1.1
     `;
     const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-1-heading', text: '## Header 1.1' },
-      { label: '1-1-1-heading', text: '### Header 1.1.1' },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
+      { label: '1-1-1-heading', type: 'heading', text: '### Header 1.1.1' },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
@@ -215,12 +246,12 @@ Another piece of content.
 Content under header 2.
     `;
     const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.' },
-      { label: '1-1-heading', text: '## Header 1.1' },
-      { label: '1-1-content', text: 'Content under header 1.1.\nAnother piece of content.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
+      { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
+      { label: '1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.\nAnother piece of content.' },
+      { label: '2-heading', type: 'heading', text: '# Header 2' },
+      { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
@@ -240,10 +271,11 @@ Content under header 1.
 Content under header 2.
     `;
     const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Content under header 1.\n\n- Item 1\n  - Subitem 1\n- Item 2' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
+      { label: '1-content-2', type: 'list', text: '- Item 1\n  - Subitem 1\n- Item 2' },
+      { label: '2-heading', type: 'heading', text: '# Header 2' },
+      { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
     ];
     const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
@@ -262,10 +294,12 @@ Content under header 2.
     `;
 
     const expected: Chunk[] = [
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Some introductory content.\n\n```\n# This is a comment with a # symbol\nSome code line\n```\n\nAdditional content.' },
-      { label: '2-heading', text: '# Header 2' },
-      { label: '2-content', text: 'Content under header 2.' },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-content-1', type: 'paragraph', text: 'Some introductory content.' },
+      { label: '1-content-2', type: 'code', text: '```\n# This is a comment with a # symbol\nSome code line\n```' },
+      { label: '1-content-3', type: 'paragraph', text: 'Additional content.' },
+      { label: '2-heading', type: 'heading', text: '# Header 2' },
+      { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
     ];
 
     const result = await splitMarkdownIntoChunks(markdown);
@@ -282,9 +316,9 @@ Some introductory content.
     `;
 
     const expected: Chunk[] = [
-      { label: 'frontmatter', text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2) },
-      { label: '1-heading', text: '# Header 1' },
-      { label: '1-content', text: 'Some introductory content.' },
+      { label: 'frontmatter', type: 'yaml', text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2) },
+      { label: '1-heading', type: 'heading', text: '# Header 1' },
+      { label: '1-content-1', type: 'paragraph', text: 'Some introductory content.' },
     ];
 
     const result = await splitMarkdownIntoChunks(markdown);