Просмотр исходного кода

Merge pull request #9231 from weseek/imprv/153983-155324-use-remark-to-split-markdown-into-chunks

imprv: Split markdown using remark
Yuki Takei 1 год назад
Родитель
Commit
6beafbba5b

+ 6 - 0
packages/markdown-splitter/package.json

@@ -39,5 +39,11 @@
     "react-dom": "^18.2.0"
   },
   "dependencies": {
+    "@types/js-yaml": "^4.0.9",
+    "remark-frontmatter": "^5.0.0",
+    "remark-gfm": "^4.0.0",
+    "remark-parse": "^11.0.0",
+    "remark-stringify": "^11.0.0",
+    "unified": "^11.0.0"
   }
 }

+ 78 - 54
packages/markdown-splitter/src/services/markdown-splitter.ts

@@ -1,3 +1,11 @@
+import yaml from 'js-yaml';
+import remarkFrontmatter from 'remark-frontmatter'; // Frontmatter processing
+import remarkGfm from 'remark-gfm'; // GFM processing
+import remarkParse from 'remark-parse';
+import type { Options as StringifyOptions } from 'remark-stringify';
+import remarkStringify from 'remark-stringify';
+import { unified } from 'unified';
+
 export type Chunk = {
   label: string;
   text: string;
@@ -5,101 +13,117 @@ export type Chunk = {
 
 /**
  * Processes and adds a new chunk to the chunks array if content is not empty.
- * Clears the contentLines array after processing.
- * @param chunks - The array to store chunks.
- * @param contentLines - The array of content lines.
+ * Clears the contentBuffer array after processing.
+ * @param chunks - The array to store processed chunks.
+ * @param contentBuffer - The array of content lines to be processed.
  * @param label - The label for the content chunk.
  */
-function processPendingContent(chunks: Chunk[], contentLines: string[], label: string) {
-  const text = contentLines.join('\n').trimEnd();
+function addContentChunk(chunks: Chunk[], contentBuffer: string[], label: string) {
+  const text = contentBuffer.join('\n\n').trimEnd();
   if (text !== '') {
     chunks.push({ label, text });
   }
-  contentLines.length = 0; // Clear the contentLines array
+  contentBuffer.length = 0; // Clear the contentBuffer array
 }
 
 /**
  * Updates the section numbers based on the heading depth and returns the updated section label.
  * Handles non-consecutive heading levels by initializing missing levels with 1.
  * @param sectionNumbers - The current section numbers.
- * @param depth - The depth of the heading (e.g., # is depth 1).
+ * @param headingDepth - The depth of the heading (e.g., # is depth 1).
  * @returns The updated section label.
  */
-function updateSectionNumbers(sectionNumbers: number[], depth: number): string {
-  if (depth > sectionNumbers.length) {
-    // If depth increases, initialize missing levels with 1
-    while (sectionNumbers.length < depth) {
+function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): string {
+  if (headingDepth > sectionNumbers.length) {
+    // Initialize missing levels with 1
+    while (sectionNumbers.length < headingDepth) {
       sectionNumbers.push(1);
     }
   }
-  else if (depth === sectionNumbers.length) {
-    // Same level, increment the last number
-    sectionNumbers[depth - 1]++;
+  else if (headingDepth === sectionNumbers.length) {
+    // Increment the last number for the same level
+    sectionNumbers[headingDepth - 1]++;
   }
   else {
-    // Depth decreases, remove deeper levels and increment current level
-    sectionNumbers.splice(depth);
-    sectionNumbers[depth - 1]++;
+    // Remove deeper levels and increment the current level
+    sectionNumbers.splice(headingDepth);
+    sectionNumbers[headingDepth - 1]++;
   }
   return sectionNumbers.join('-');
 }
 
 /**
- * Splits Markdown text into labeled chunks, considering content that may start before any headers
- * and handling non-consecutive heading levels. Preserves list indentation and leading spaces while
- * reducing unnecessary line breaks. Ensures that no empty line is added between sections.
- * @param markdown - The input Markdown string.
+ * Splits Markdown text into labeled chunks using remark-parse and remark-stringify,
+ * considering content that may start before any headers and handling non-consecutive heading levels.
+ * @param markdownText - The input Markdown string.
  * @returns An array of labeled chunks.
  */
-export function splitMarkdownIntoChunks(markdown: string): Chunk[] {
+export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chunk[]> {
   const chunks: Chunk[] = [];
   const sectionNumbers: number[] = [];
+  let frontmatter: Record<string, unknown> | null = null; // Variable to store frontmatter
+  const contentBuffer: string[] = [];
+  let currentSectionLabel = '';
 
-  if (typeof markdown !== 'string' || markdown.trim() === '') {
+  if (typeof markdownText !== 'string' || markdownText.trim() === '') {
     return chunks;
   }
 
-  const lines = markdown.split('\n');
-  const contentLines: string[] = [];
-  let currentLabel = '';
-  let previousLineEmpty = false;
+  const parser = unified()
+    .use(remarkParse)
+    .use(remarkFrontmatter, ['yaml'])
+    .use(remarkGfm); // Enable GFM extensions
 
-  for (const line of lines) {
-    const trimmedLine = line.trim();
+  const stringifyOptions: StringifyOptions = {
+    bullet: '-', // Set list bullet to hyphen
+    rule: '-', // Use hyphen for horizontal rules
+  };
 
-    if (trimmedLine.startsWith('#')) {
-      // Process any pending content before starting a new section
-      if (contentLines.length > 0) {
-        const contentLabel = currentLabel !== '' ? `${currentLabel}-content` : '0-content';
-        processPendingContent(chunks, contentLines, contentLabel);
-      }
+  const stringifier = unified()
+    .use(remarkFrontmatter, ['yaml'])
+    .use(remarkGfm)
+    .use(remarkStringify, stringifyOptions);
 
-      // Match heading level and text
-      const headerMatch = trimmedLine.match(/^(#+)\s+(.*)/);
-      if (headerMatch) {
-        const headingDepth = headerMatch[1].length;
-        currentLabel = updateSectionNumbers(sectionNumbers, headingDepth);
-        chunks.push({ label: `${currentLabel}-heading`, text: line });
-      }
+  const parsedTree = parser.parse(markdownText);
+
+  // Iterate over top-level nodes to prevent duplication
+  for (const node of parsedTree.children) {
+    if (node.type === 'yaml') {
+      frontmatter = yaml.load(node.value) as Record<string, unknown>;
     }
-    else if (trimmedLine === '') {
-      // Handle empty lines to avoid multiple consecutive empty lines
-      if (!previousLineEmpty && contentLines.length > 0) {
-        contentLines.push('');
-        previousLineEmpty = true;
+    else if (node.type === 'heading') {
+      // Process pending content before heading
+      if (contentBuffer.length > 0) {
+        const contentLabel = currentSectionLabel !== '' ? `${currentSectionLabel}-content` : '0-content';
+        addContentChunk(chunks, contentBuffer, contentLabel);
       }
+
+      const headingDepth = node.depth;
+      currentSectionLabel = updateSectionNumbers(sectionNumbers, headingDepth);
+
+      const headingMarkdown = stringifier.stringify(node as any);// eslint-disable-line @typescript-eslint/no-explicit-any
+      chunks.push({ label: `${currentSectionLabel}-heading`, text: headingMarkdown.trim() });
     }
     else {
-      // Add non-empty lines to the current content
-      contentLines.push(line);
-      previousLineEmpty = false;
+      // Add non-heading content to the buffer
+      const contentMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
+      if (contentMarkdown !== '') {
+        contentBuffer.push(contentMarkdown);
+      }
     }
   }
 
-  // Process any remaining content after the last line
-  if (contentLines.length > 0) {
-    const contentLabel = currentLabel !== '' ? `${currentLabel}-content` : '0-content';
-    processPendingContent(chunks, contentLines, contentLabel);
+  // Process any remaining content
+  if (contentBuffer.length > 0) {
+    const contentLabel = currentSectionLabel !== '' ? `${currentSectionLabel}-content` : '0-content';
+    addContentChunk(chunks, contentBuffer, contentLabel);
+  }
+
+  if (frontmatter) {
+    chunks.unshift({
+      label: 'frontmatter',
+      text: JSON.stringify(frontmatter, null, 2),
+    });
   }
 
   return chunks;

+ 61 - 20
packages/markdown-splitter/test/index.spec.ts

@@ -3,14 +3,14 @@ import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
 
 describe('splitMarkdownIntoChunks', () => {
 
-  test('handles empty markdown string', () => {
+  test('handles empty markdown string', async() => {
     const markdown = '';
     const expected: Chunk[] = [];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles markdown with only content and no headers', () => {
+  test('handles markdown with only content and no headers', async() => {
     const markdown = `This is some content without any headers.
 It spans multiple lines.
 
@@ -22,11 +22,11 @@ Another paragraph.
         text: 'This is some content without any headers.\nIt spans multiple lines.\n\nAnother paragraph.',
       },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles markdown starting with a header', () => {
+  test('handles markdown starting with a header', async() => {
     const markdown = `
 # Header 1
 Content under header 1.
@@ -45,11 +45,11 @@ Content under header 2.
       { label: '2-heading', text: '# Header 2' },
       { label: '2-content', text: 'Content under header 2.' },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles markdown with non-consecutive heading levels', () => {
+  test('handles markdown with non-consecutive heading levels', async() => {
     const markdown = `
 Introduction without a header.
 
@@ -114,11 +114,11 @@ Content of section 2.1.
         text: 'Content of section 2.1.',
       },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles markdown with skipped heading levels', () => {
+  test('handles markdown with skipped heading levels', async() => {
     const markdown = `
 # Header 1
 Content under header 1.
@@ -142,11 +142,11 @@ Content under header 2.
       { label: '2-heading', text: '# Header 2' },
       { label: '2-content', text: 'Content under header 2.' },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles malformed headings', () => {
+  test('handles malformed headings', async() => {
     const markdown = `
 # Header 1
 Content under header 1.
@@ -160,11 +160,11 @@ Content under header 1.1.1.1.
       { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
       { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles multiple content blocks before any headers', () => {
+  test('handles multiple content blocks before any headers', async() => {
     const markdown = `
 This is the first paragraph without a header.
 
@@ -181,11 +181,11 @@ Content under header 1.
       { label: '1-heading', text: '# Header 1' },
       { label: '1-content', text: 'Content under header 1.' },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles markdown with only headers and no content', () => {
+  test('handles markdown with only headers and no content', async() => {
     const markdown = `
 # Header 1
 
@@ -198,11 +198,11 @@ Content under header 1.
       { label: '1-1-heading', text: '## Header 1.1' },
       { label: '1-1-1-heading', text: '### Header 1.1.1' },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('handles markdown with mixed content and headers', () => {
+  test('handles markdown with mixed content and headers', async() => {
     const markdown = `
 # Header 1
 Content under header 1.
@@ -222,11 +222,11 @@ Content under header 2.
       { label: '2-heading', text: '# Header 2' },
       { label: '2-content', text: 'Content under header 2.' },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
 
-  test('preserves list indentation and reduces unnecessary line breaks', () => {
+  test('preserves list indentation and reduces unnecessary line breaks', async() => {
     const markdown = `
 # Header 1
 Content under header 1.
@@ -245,8 +245,49 @@ Content under header 2.
       { label: '2-heading', text: '# Header 2' },
       { label: '2-content', text: 'Content under header 2.' },
     ];
-    const result = splitMarkdownIntoChunks(markdown);
+    const result = await splitMarkdownIntoChunks(markdown); // Await the result
     expect(result).toEqual(expected);
   });
+  test('code blocks containing # are not treated as headings', async() => {
+    const markdown = `
+# Header 1
+Some introductory content.
+\`\`\`
+# This is a comment with a # symbol
+Some code line
+\`\`\`
+Additional content.
+# Header 2
+Content under header 2.
+    `;
+
+    const expected: Chunk[] = [
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Some introductory content.\n\n```\n# This is a comment with a # symbol\nSome code line\n```\n\nAdditional content.' },
+      { label: '2-heading', text: '# Header 2' },
+      { label: '2-content', text: 'Content under header 2.' },
+    ];
 
+    const result = await splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
+  test('frontmatter is processed and labeled correctly', async() => {
+    const markdown = `---
+title: Test Document
+author: John Doe
+---
+
+# Header 1
+Some introductory content.
+    `;
+
+    const expected: Chunk[] = [
+      { label: 'frontmatter', text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2) },
+      { label: '1-heading', text: '# Header 1' },
+      { label: '1-content', text: 'Some introductory content.' },
+    ];
+
+    const result = await splitMarkdownIntoChunks(markdown);
+    expect(result).toEqual(expected);
+  });
 });

+ 13 - 14
yarn.lock

@@ -2199,6 +2199,13 @@
 
 "@growi/markdown-splitter@link:packages/markdown-splitter":
   version "1.0.0"
+  dependencies:
+    "@types/js-yaml" "^4.0.9"
+    remark-frontmatter "^5.0.0"
+    remark-gfm "^4.0.0"
+    remark-parse "^11.0.0"
+    remark-stringify "^11.0.0"
+    unified "^11.0.0"
 
 "@growi/pluginkit@link:packages/pluginkit":
   version "1.0.1"
@@ -4518,6 +4525,11 @@
     expect "^29.0.0"
     pretty-format "^29.0.0"
 
+"@types/js-yaml@^4.0.9":
+  version "4.0.9"
+  resolved "https://registry.yarnpkg.com/@types/js-yaml/-/js-yaml-4.0.9.tgz#cd82382c4f902fed9691a2ed79ec68c5898af4c2"
+  integrity sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==
+
 "@types/json-schema@*", "@types/json-schema@^7.0.12", "@types/json-schema@^7.0.5", "@types/json-schema@^7.0.6", "@types/json-schema@^7.0.8", "@types/json-schema@^7.0.9":
   version "7.0.15"
   resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841"
@@ -18341,19 +18353,6 @@ unicode-emoji-modifier-base@^1.0.0:
   resolved "https://registry.yarnpkg.com/unicode-emoji-modifier-base/-/unicode-emoji-modifier-base-1.0.0.tgz#dbbd5b54ba30f287e2a8d5a249da6c0cef369459"
   integrity sha512-yLSH4py7oFH3oG/9K+XWrz1pSi3dfUrWEnInbxMfArOfc1+33BlGPQtLsOYwvdMy11AwUBetYuaRxSPqgkq+8g==
 
-unified@^10.1.2:
-  version "10.1.2"
-  resolved "https://registry.yarnpkg.com/unified/-/unified-10.1.2.tgz#b1d64e55dafe1f0b98bb6c719881103ecf6c86df"
-  integrity sha512-pUSWAi/RAnVy1Pif2kAoeWNBa3JVrx0MId2LASj8G+7AiHWoKZNTomq6LG326T68U7/e263X6fTdcXIy7XnF7Q==
-  dependencies:
-    "@types/unist" "^2.0.0"
-    bail "^2.0.0"
-    extend "^3.0.0"
-    is-buffer "^2.0.0"
-    is-plain-obj "^4.0.0"
-    trough "^2.0.0"
-    vfile "^5.0.0"
-
 unified@^11.0.0, unified@^11.0.3, unified@^11.0.4:
   version "11.0.5"
   resolved "https://registry.yarnpkg.com/unified/-/unified-11.0.5.tgz#f66677610a5c0a9ee90cab2b8d4d66037026d9e1"
@@ -18754,7 +18753,7 @@ vfile-message@^4.0.0:
     "@types/unist" "^3.0.0"
     unist-util-stringify-position "^4.0.0"
 
-vfile@^5.0.0, vfile@^5.1.0:
+vfile@^5.1.0:
   version "5.3.7"
   resolved "https://registry.yarnpkg.com/vfile/-/vfile-5.3.7.tgz#de0677e6683e3380fafc46544cfe603118826ab7"
   integrity sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==