Browse Source

check token count before split

nHigashiWeseek 1 year ago
parent
commit
f30647ef02

+ 9 - 1
packages/markdown-splitter/src/services/markdown-token-splitter.ts

@@ -1,4 +1,4 @@
-import type { TiktokenModel } from 'js-tiktoken';
+import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
 
 import { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';
 
@@ -98,6 +98,14 @@ export async function splitMarkdownIntoChunks(
     model: TiktokenModel,
     maxToken = 800,
 ): Promise<string[]> {
+  const encoder = encodingForModel(model);
+
+  // If the total token count for the entire markdown text is less than or equal to maxToken,
+  // return the entire markdown as a single chunk.
+  if (encoder.encode(markdownText).length <= maxToken) {
+    return [markdownText];
+  }
+
   // Split markdown text into chunks
   const markdownFragments = await splitMarkdownIntoFragments(markdownText, model);
   const chunks = [] as string[];