1 年間前 · 5d6fe9167b
--- a/packages/markdown-splitter/src/services/markdown-splitter.ts
+++ b/packages/markdown-splitter/src/services/markdown-splitter.ts
@@ -1,25 +1,111 @@
 
				-import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
			
 
				-import type { Document } from 'langchain/document';
			
 
				/**
				 * A labeled fragment of a Markdown document produced by the splitter.
				 */
				export type Chunk = {
				  /** Section identifier, e.g. `1-2` for a heading or `1-2-content` for the text beneath it. */
				  label: string;
				  /** Text of the fragment. */
				  content: string;
				};
			
 
				+
			
 
				/**
				 * Appends a chunk to `chunks` unless the content is blank.
				 * Trailing whitespace (including newlines) is stripped before the chunk is
				 * stored; leading whitespace is left untouched.
				 * @param chunks - Destination array; mutated in place
				 * @param content - Raw text for the chunk
				 * @param label - Label to attach to the chunk
				 */
				function createChunk(chunks: Chunk[], content: string, label: string) {
				  // Drop trailing spaces and line breaks so chunks never end in blank lines.
				  const body = content.trimEnd();
				  if (body.length === 0) {
				    return;
				  }
				  chunks.push({ label, content: body });
				}
			
 
				+
			
 
				/**
				 * Advances the hierarchical section counters for a heading of the given depth
				 * and returns the resulting label (counters joined with `-`).
				 *
				 * - Deeper than the current level: every missing level is initialized to 1,
				 *   so skipped heading levels are tolerated.
				 * - Same level or shallower: deeper counters are discarded and the counter
				 *   at `depth` is incremented.
				 * @param sectionCounters - Current counters, one per heading level; mutated in place
				 * @param depth - Heading depth (`#` is 1, `##` is 2, ...)
				 * @returns The updated section label, e.g. `2-1-3`
				 */
				function updateSectionLabel(sectionCounters: number[], depth: number): string {
				  if (depth > sectionCounters.length) {
				    // Descending: open each new level (including any skipped ones) at 1.
				    for (let level = sectionCounters.length; level < depth; level += 1) {
				      sectionCounters.push(1);
				    }
				    return sectionCounters.join('-');
				  }

				  // Same level or ascending: truncate to `depth` entries (a no-op when the
				  // depth is unchanged), then bump the counter of the current level.
				  sectionCounters.splice(depth);
				  sectionCounters[depth - 1] += 1;
				  return sectionCounters.join('-');
				}
			
 
				 
			
 
				 /**
			
 
				- * Function to recursively split a markdown string by header sections (and within subsections if they exceed the specified max token count).
			
 
				- *
			
 
				- * @param markdownString - The input markdown string
			
 
				- * @param chunkSize - The chunk size for splitting (default is 1000)
			
 
				- * @returns An array of split markdown sections
			
 
				+ * Splits Markdown text into labeled chunks, considering content that may start before any headers
			
 
				+ * and handling non-consecutive heading levels. Reduces unnecessary line breaks while preserving
			
 
				+ * list indentation and leading spaces. Ensures that no empty line is added between sections.
			
 
				+ * @param markdown - The input Markdown string
			
 
				+ * @returns An array of chunks
			
 
				  */
			
 
				-export async function splitMarkdownIntoChunks(
			
 
				-    markdownString: string,
			
 
				-    chunkSize = 1000, // Default chunk size set to 1000
			
 
				-): Promise<Document[]> {
			
 
				-  const validMarkdownString = markdownString || '';
			
 
				-
			
 
				-  const mdSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
			
 
				-    chunkSize, // Use the provided chunkSize
			
 
				-    chunkOverlap: 0,
			
 
				+export function splitMarkdownIntoChunks(markdown: string): Chunk[] {
			
 
				+  const chunks: Chunk[] = [];
			
 
				+  const sectionCounters: number[] = [];
			
 
				+
			
 
				+  if (!markdown || typeof markdown !== 'string' || markdown.trim() === '') {
			
 
				+    return chunks;
			
 
				+  }
			
 
				+
			
 
				+  const lines = markdown.split('\n');
			
 
				+  let currentContent: string[] = [];
			
 
				+  let currentSectionLabel = '';
			
 
				+  let previousLineEmpty = false;
			
 
				+
			
 
				+  lines.forEach((line) => {
			
 
				+    const trimmedLine = line.trim();
			
 
				+
			
 
				+    if (trimmedLine.startsWith('#')) {
			
 
				+      if (currentContent.length > 0) {
			
 
				+        if (currentSectionLabel !== '') {
			
 
				+          const contentLabel = `${currentSectionLabel}-content`;
			
 
				+          createChunk(chunks, currentContent.join('\n'), contentLabel);
			
 
				+        }
			
 
				+        else {
			
 
				+          createChunk(chunks, currentContent.join('\n'), '0-content');
			
 
				+        }
			
 
				+        currentContent = [];
			
 
				+      }
			
 
				+
			
 
				+      const headerMatch = trimmedLine.match(/^(#+)\s+(.*)/);
			
 
				+      if (headerMatch) {
			
 
				+        const headerDepth = headerMatch[1].length;
			
 
				+        currentSectionLabel = updateSectionLabel(sectionCounters, headerDepth);
			
 
				+        createChunk(chunks, line, currentSectionLabel);
			
 
				+      }
			
 
				+    }
			
 
				+    else if (trimmedLine === '') {
			
 
				+      if (!previousLineEmpty && currentContent.length > 0) {
			
 
				+        currentContent.push('');
			
 
				+        previousLineEmpty = true;
			
 
				+      }
			
 
				+    }
			
 
				+    else {
			
 
				+      currentContent.push(line);
			
 
				+      previousLineEmpty = false;
			
 
				+    }
			
 
				   });
			
 
				 
			
 
				-  const mdDocs = await mdSplitter.createDocuments([validMarkdownString]);
			
 
				+  if (currentContent.length > 0) {
			
 
				+    if (currentSectionLabel !== '') {
			
 
				+      const contentLabel = `${currentSectionLabel}-content`;
			
 
				+      createChunk(chunks, currentContent.join('\n'), contentLabel);
			
 
				+    }
			
 
				+    else {
			
 
				+      createChunk(chunks, currentContent.join('\n'), '0-content');
			
 
				+    }
			
 
				+  }
			
 
				 
			
 
				-  return mdDocs;
			
 
				+  return chunks;
			
 
				 }