Просмотр исходного кода

Change splitMarkdownIntoChunks to take a tiktoken model and record a token count on every chunk

nHigashiWeseek 1 год назад
Родитель
Commit
806b7843a2
1 изменённый файл: 25 добавлено и 15 удалено
  1. 25 15
      packages/markdown-splitter/src/services/markdown-splitter.ts

+ 25 - 15
packages/markdown-splitter/src/services/markdown-splitter.ts

@@ -1,3 +1,5 @@
+import type { TiktokenModel } from 'js-tiktoken';
+import { encodingForModel } from 'js-tiktoken';
 import yaml from 'js-yaml';
 import remarkFrontmatter from 'remark-frontmatter'; // Frontmatter processing
 import remarkGfm from 'remark-gfm'; // GFM processing
@@ -8,9 +10,9 @@ import { unified } from 'unified';
 
 export type Chunk = {
   label: string;
-  type: string;
+  type?: string;
   text: string;
-  tokenCount?: number;
+  tokenCount: number;
 };
 
 /**
@@ -45,10 +47,9 @@ function updateSectionNumbers(sectionNumbers: number[], headingDepth: number): s
  * @param markdownText - The input Markdown string.
  * @returns An array of labeled chunks.
  */
-export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chunk[]> {
+export async function splitMarkdownIntoChunks(markdownText: string, model: TiktokenModel): Promise<Chunk[]> {
   const chunks: Chunk[] = [];
   const sectionNumbers: number[] = [];
-  let frontmatter: Record<string, unknown> | null = null; // Variable to store frontmatter
   let currentSectionLabel = '';
   const contentCounters: Record<string, number> = {};
 
@@ -56,6 +57,8 @@ export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chu
     return chunks;
   }
 
+  const encoder = encodingForModel(model);
+
   const parser = unified()
     .use(remarkParse)
     .use(remarkFrontmatter, ['yaml'])
@@ -76,14 +79,26 @@ export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chu
   // Iterate over top-level nodes to prevent duplication
   for (const node of parsedTree.children) {
     if (node.type === 'yaml') {
-      frontmatter = yaml.load(node.value) as Record<string, unknown>;
+      // Frontmatter block found, handle only the first instance
+      const frontmatter = yaml.load(node.value) as Record<string, unknown>;
+      const frontmatterText = JSON.stringify(frontmatter, null, 2);
+      const tokenCount = encoder.encode(frontmatterText).length;
+      chunks.push({
+        label: 'frontmatter',
+        type: 'yaml',
+        text: frontmatterText,
+        tokenCount,
+      });
     }
     else if (node.type === 'heading') {
       const headingDepth = node.depth;
       currentSectionLabel = updateSectionNumbers(sectionNumbers, headingDepth);
 
       const headingMarkdown = stringifier.stringify(node as any).trim(); // eslint-disable-line @typescript-eslint/no-explicit-any
-      chunks.push({ label: `${currentSectionLabel}-heading`, type: node.type, text: headingMarkdown });
+      const tokenCount = encoder.encode(headingMarkdown).length;
+      chunks.push({
+        label: `${currentSectionLabel}-heading`, type: node.type, text: headingMarkdown, tokenCount,
+      });
     }
     else {
       // Process non-heading content individually
@@ -99,18 +114,13 @@ export async function splitMarkdownIntoChunks(markdownText: string): Promise<Chu
         const contentLabel = currentSectionLabel !== ''
           ? `${currentSectionLabel}-content-${contentCounters[contentCountKey]}`
           : `0-content-${contentCounters[contentCountKey]}`;
-        chunks.push({ label: contentLabel, type: node.type, text: contentMarkdown });
+        const tokenCount = encoder.encode(contentMarkdown).length;
+        chunks.push({
+          label: contentLabel, type: node.type, text: contentMarkdown, tokenCount,
+        });
       }
     }
   }
 
-  if (frontmatter) {
-    chunks.unshift({
-      label: 'frontmatter',
-      type: 'yaml',
-      text: JSON.stringify(frontmatter, null, 2),
-    });
-  }
-
   return chunks;
 }