nHigashiWeseek 1 год назад
Родитель
Сommit
e6f1ffdfc5
1 измененных файлов с 34 добавлено и 14 удалено
  1. 34 14
      packages/markdown-splitter/src/services/markdown-splitter.ts

+ 34 - 14
packages/markdown-splitter/src/services/markdown-splitter.ts

@@ -1,3 +1,6 @@
+import yaml from 'js-yaml';
+import remarkFrontmatter from 'remark-frontmatter'; // Frontmatter processing
+import remarkGfm from 'remark-gfm'; // GFM processing
 import remarkParse from 'remark-parse';
 import type { Options as StringifyOptions } from 'remark-stringify';
 import remarkStringify from 'remark-stringify';
@@ -58,45 +61,55 @@ function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): s
 export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chunk[]> {
   const chunks: Chunk[] = [];
   const sectionNumbers: number[] = [];
+  let frontmatter: Record<string, unknown> | null = null; // Variable to store frontmatter
+  const contentBuffer: string[] = [];
+  let currentSectionLabel = '';
 
   if (typeof markdownText !== 'string' || markdownText.trim() === '') {
     return chunks;
   }
 
-  const parser = unified().use(remarkParse);
+  const parser = unified()
+    .use(remarkParse)
+    .use(remarkFrontmatter, ['yaml'])
+    .use(remarkGfm); // Enable GFM extensions
 
   const stringifyOptions: StringifyOptions = {
     bullet: '-', // Set list bullet to hyphen
     rule: '-', // Use hyphen for horizontal rules
   };
 
-  // Create stringifier once
-  const stringifier = unified().use(remarkStringify, stringifyOptions);
+  const stringifier = unified()
+    .use(remarkFrontmatter, ['yaml'])
+    .use(remarkGfm)
+    .use(remarkStringify, stringifyOptions);
 
   const parsedTree = parser.parse(markdownText);
 
-  const contentBuffer: string[] = [];
-  let currentSectionLabel = '';
-
-  const markdownNodes = parsedTree.children;
-
-  for (const node of markdownNodes) {
-    if (node.type === 'heading') {
+  // Iterate over top-level nodes to prevent duplication
+  for (const node of parsedTree.children) {
+    if (node.type === 'yaml') {
+      frontmatter = yaml.load(node.value) as Record<string, unknown>;
+    }
+    else if (node.type === 'heading') {
       // Process pending content before heading
       if (contentBuffer.length > 0) {
         const contentLabel = currentSectionLabel !== '' ? `${currentSectionLabel}-content` : '0-content';
         addContentChunk(chunks, contentBuffer, contentLabel);
       }
 
-      const headingDepth = node.depth as number;
+      const headingDepth = node.depth;
       currentSectionLabel = updateSectionNumbers(sectionNumbers, headingDepth);
 
-      const headingMarkdown = stringifier.stringify(node as any); // eslint-disable-line @typescript-eslint/no-explicit-any
+      const headingMarkdown = stringifier.stringify(node as any);// eslint-disable-line @typescript-eslint/no-explicit-any
       chunks.push({ label: `${currentSectionLabel}-heading`, text: headingMarkdown.trim() });
     }
     else {
-      const contentMarkdown = stringifier.stringify(node as any);// eslint-disable-line @typescript-eslint/no-explicit-any
-      contentBuffer.push(contentMarkdown.trim());
+      // Add non-heading content to the buffer
+      const contentMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
+      if (contentMarkdown !== '') {
+        contentBuffer.push(contentMarkdown);
+      }
     }
   }
 
@@ -106,5 +119,12 @@ export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chu
     addContentChunk(chunks, contentBuffer, contentLabel);
   }
 
+  if (frontmatter) {
+    chunks.unshift({
+      label: 'frontmatter',
+      text: JSON.stringify(frontmatter, null, 2),
+    });
+  }
+
   return chunks;
 }