Bladeren bron

add markdown token splitter

nHigashiWeseek 1 jaar geleden
bovenliggende
commit
be02f0cc6a

+ 1 - 0
packages/markdown-splitter/src/index.ts

@@ -1 +1,2 @@
 export * from './services/markdown-splitter';
+export * from './services/markdown-token-splitter';

+ 72 - 0
packages/markdown-splitter/src/services/markdown-token-splitter.ts

@@ -0,0 +1,72 @@
+import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
+import type { TiktokenModel } from 'js-tiktoken';
+import { encodingForModel } from 'js-tiktoken'; // eslint-disable-line camelcase
+
+import type { Chunk } from './markdown-splitter';
+import { splitMarkdownIntoChunks } from './markdown-splitter';
+
+/**
+ * Splits Markdown text based on the number of tokens.
+ * Uses the `splitMarkdownIntoChunks` function to get initial chunks,
+ * then adjusts each chunk to ensure it's under `maxTokens`.
+ *
+ * Labels are assigned in the order: frontmatter, 0, 1, 2, ...
+ *
+ * @param markdownText - Input Markdown text.
+ * @param model - Tiktoken model to use.
+ * @param maxTokens - Maximum number of tokens per chunk.
+ * @returns An array of labeled chunks.
+ */
+export async function splitMarkdownByTokens(
+    markdownText: string,
+    model: TiktokenModel,
+    maxTokens: number,
+): Promise<Chunk[]> {
+  const initialChunks = await splitMarkdownIntoChunks(markdownText);
+  const encoder = encodingForModel(model);
+
+  // Process each chunk asynchronously
+  const chunkProcessingPromises = initialChunks.map(async(chunk) => {
+    const tokenCount = encoder.encode(chunk.text).length;
+
+    if (tokenCount > maxTokens) {
+      // Calculate the number of splits by adding 10% to the token count
+      const numSplits = Math.ceil((tokenCount * 1.1) / maxTokens);
+
+      // Calculate the average number of characters per chunk
+      const averageCharsPerChunk = Math.ceil(chunk.text.length / numSplits);
+
+      // Set the overlap amount to 5% of the average character count
+      const overlap = Math.ceil(averageCharsPerChunk * 0.05);
+
+      const splitter = new RecursiveCharacterTextSplitter({
+        chunkSize: averageCharsPerChunk,
+        chunkOverlap: overlap,
+      });
+
+      // Re-split using LangChain (await is necessary)
+      const splitTexts = await splitter.splitText(chunk.text);
+
+      return splitTexts.map((subText, index) => ({
+        label: `${chunk.label}-${index + 1}`, // Add split number to the label
+        type: chunk.type,
+        text: subText,
+        tokenCount: encoder.encode(subText).length,
+      }));
+    }
+
+    // If token count is less than or equal to maxTokens, return as is
+    return [
+      {
+        label: chunk.label,
+        type: chunk.type,
+        text: chunk.text,
+        tokenCount,
+      },
+    ];
+  });
+
+  const finalChunks = (await Promise.all(chunkProcessingPromises)).flat();
+
+  return finalChunks;
+}