Sfoglia il codice sorgente

split markdown by token count

HigashiWeseek 1 anno fa
parent
commit
55318b544e

+ 1 - 1
apps/app/next-env.d.ts

@@ -2,4 +2,4 @@
 /// <reference types="next/image-types/global" />
 
 // NOTE: This file should not be edited
-// see https://nextjs.org/docs/basic-features/typescript for more information.
+// see https://nextjs.org/docs/pages/building-your-application/configuring/typescript for more information.

+ 1 - 0
apps/app/package.json

@@ -63,6 +63,7 @@
     "@azure/identity": "^4.3.0",
     "@azure/storage-blob": "^12.16.0",
     "@browser-bunyan/console-formatted-stream": "^1.8.0",
+    "@dqbd/tiktoken": "^1.0.16",
     "@elastic/elasticsearch7": "npm:@elastic/elasticsearch@^7.17.0",
     "@elastic/elasticsearch8": "npm:@elastic/elasticsearch@^8.7.0",
     "@godaddy/terminus": "^4.9.0",

+ 174 - 0
apps/app/src/server/service/page-spritter.ts

@@ -0,0 +1,174 @@
+import type { Tiktoken, TiktokenModel } from '@dqbd/tiktoken';
+import { encoding_for_model } from '@dqbd/tiktoken'; // eslint-disable-line
+import type { Root, Content, Heading } from 'mdast';
+import remarkParse from 'remark-parse';
+import remarkStringify from 'remark-stringify';
+import { unified } from 'unified';
+
+/**
+ * Interface representing a section
+ */
+interface Section {
+  heading: Heading | null;
+  content: Content[];
+}
+
+/**
+ * Function to recursively split Markdown content by header sections so that each section has a token count below the specified maximum
+ *
+ * @param model - The name of the model to use (e.g., 'gpt-4')
+ * @param markdownContent - The Markdown content to split
+ * @param maxTokens - The maximum number of tokens per section (default: 100)
+ * @returns An array of split Markdown sections
+ */
+export function splitMarkdownByTokens(
+    model: TiktokenModel,
+    markdownContent: string,
+    maxTokens = 100,
+): string[] {
+  // Obtain encoding based on the model
+  const encoding: Tiktoken = encoding_for_model(model);
+
+  // Parse Markdown into AST
+  const processor = unified().use(remarkParse);
+  const tree = processor.parse(markdownContent) as Root;
+
+  /**
+   * Function to stringify a node
+   * @param node - The node to stringify
+   * @returns The Markdown string of the node
+   */
+  const stringify = (node: Root): string => {
+    return unified().use(remarkStringify).stringify(node);
+  };
+
+  /**
+   * Function to get the token count of a text
+   * @param text - The text to calculate token count for
+   * @returns The number of tokens
+   */
+  const getTokenCount = (text: string): number => {
+    return encoding.encode(text).length;
+  };
+
+  /**
+   * Function to split nodes into sections based on headers
+   * @param nodes - The array of nodes to split
+   * @returns An array of sections
+   */
+  const splitSections = (nodes: Content[]): Section[] => {
+    const sections: Section[] = [];
+    let currentSection: Section = { heading: null, content: [] };
+
+    for (const node of nodes) {
+      if (node.type === 'heading') {
+        // Start a new section
+        if (currentSection.heading || currentSection.content.length > 0) {
+          sections.push(currentSection);
+        }
+        currentSection = { heading: node as Heading, content: [] };
+      }
+      else {
+        currentSection.content.push(node);
+      }
+    }
+
+    // Add the last section
+    if (currentSection.heading || currentSection.content.length > 0) {
+      sections.push(currentSection);
+    }
+
+    return sections;
+  };
+
+  /**
+   * Function to recursively process sections
+   * @param sections - The array of sections to process
+   * @returns An array of split Markdown strings
+   */
+  const processSections = (sections: Section[]): string[] => {
+    const results: string[] = [];
+
+    for (const section of sections) {
+      const nodes: Content[] = [];
+      if (section.heading) {
+        nodes.push(section.heading);
+      }
+      nodes.push(...section.content);
+
+      const subtree: Root = { type: 'root', children: nodes };
+      const content = stringify(subtree);
+      const tokenCount = getTokenCount(content);
+
+      if (tokenCount <= maxTokens) {
+        results.push(content);
+      }
+      else if (section.content.some(child => child.type === 'heading')) {
+        // Split into subsections
+        const subsections = splitSections(section.content);
+        results.push(...processSections(subsections));
+      }
+      else {
+        // Split by paragraphs
+        const paragraphs = splitByParagraphs(nodes);
+        results.push(...paragraphs);
+      }
+    }
+
+    return results;
+  };
+
+  /**
+   * Function to split nodes by paragraphs
+   * @param nodes - The array of nodes to split
+   * @returns An array of split Markdown strings
+   */
+  const splitByParagraphs = (nodes: Content[]): string[] => {
+    const results: string[] = [];
+    let currentNodes: Content[] = [];
+    let currentTokenCount = 0;
+
+    for (const node of nodes) {
+      const nodeContent = stringify({ type: 'root', children: [node] });
+      const nodeTokenCount = getTokenCount(nodeContent);
+
+      if (currentTokenCount + nodeTokenCount <= maxTokens) {
+        currentNodes.push(node);
+        currentTokenCount += nodeTokenCount;
+      }
+      else {
+        if (currentNodes.length > 0) {
+          const chunk = stringify({ type: 'root', children: currentNodes });
+          results.push(chunk);
+          currentNodes = [];
+          currentTokenCount = 0;
+        }
+        if (nodeTokenCount > maxTokens) {
+          // If a single node exceeds maxTokens, add it as is
+          results.push(nodeContent);
+        }
+        else {
+          currentNodes.push(node);
+          currentTokenCount = nodeTokenCount;
+        }
+      }
+    }
+
+    if (currentNodes.length > 0) {
+      const chunk = stringify({ type: 'root', children: currentNodes });
+      results.push(chunk);
+    }
+
+    return results;
+  };
+
+  // Get initial sections
+  const initialSections = splitSections(tree.children);
+  // Process sections
+  const result = processSections(initialSections);
+
+  // Free the encoding
+  encoding.free();
+
+  return result;
+}

+ 78 - 0
apps/app/test/integration/service/page-sprit.test.ts

@@ -0,0 +1,78 @@
+import type { TiktokenModel } from '@dqbd/tiktoken';
+
+import { splitMarkdownByTokens } from '../../../src/server/service/page-spritter';
+
+describe('splitMarkdownByTokens', () => {
+  const model: TiktokenModel = 'gpt-3.5-turbo';
+
+  test('Returns without splitting when token count is below the maximum', () => {
+    const markdownContent = '# Heading\n\nThis is a test.';
+    const result = splitMarkdownByTokens(model, markdownContent, 1000);
+    expect(result).toHaveLength(1);
+    expect(result[0]).toBe(markdownContent);
+  });
+
+  test('Splits by sections when token count exceeds the maximum', () => {
+    const markdownContent = `
+# Heading1
+
+This is the content of section 1.
+
+# Heading2
+
+This is the content of section 2.
+
+# Heading3
+
+This is the content of section 3.
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 10); // Set a small maxTokens
+    expect(result).toHaveLength(3);
+    expect(result[0]).toContain('Heading1');
+    expect(result[1]).toContain('Heading2');
+    expect(result[2]).toContain('Heading3');
+  });
+
+  test('Recursively splits into subsections', () => {
+    const markdownContent = `
+# Heading1
+
+## Subheading1-1
+
+Content1-1
+
+## Subheading1-2
+
+Content1-2
+
+# Heading2
+
+Content2
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 20);
+    expect(result.length).toBeGreaterThan(2);
+    expect(result.some(chunk => chunk.includes('Subheading1-1'))).toBe(true);
+    expect(result.some(chunk => chunk.includes('Subheading1-2'))).toBe(true);
+  });
+
+  test('Splits by paragraphs', () => {
+    const markdownContent = `
+# Heading
+
+${'Long paragraph. '.repeat(50)}
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 50);
+    expect(result.length).toBeGreaterThan(1);
+  });
+
+  test('Adds a single node as is when it exceeds maxTokens', () => {
+    const markdownContent = `
+# Heading
+
+${'Very long paragraph. '.repeat(200)}
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 50);
+    expect(result).toHaveLength(1);
+    expect(result[0]).toContain('Very long paragraph.');
+  });
+});

+ 8 - 28
yarn.lock

@@ -1860,6 +1860,11 @@
   resolved "https://registry.yarnpkg.com/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz#1d572bfbbe14b7704e0ba0f39b74815b84870d70"
   integrity sha512-dBVuXR082gk3jsFp7Rd/JI4kytwGHecnCoTtXFb7DB6CNHp4rg5k1bhg0nWdLGLnOV71lmDzGQaLMy8iPLY0pw==
 
+"@dqbd/tiktoken@^1.0.16":
+  version "1.0.16"
+  resolved "https://registry.yarnpkg.com/@dqbd/tiktoken/-/tiktoken-1.0.16.tgz#374c201d84158b50babff84e77aa3504564f510b"
+  integrity sha512-4uIrs5qxAwFVFFEP507HZIZhGOsgfaEMEWDXWalr+v+XP+wJwP60EVmkZtQyQe70IsKGVkx5umBxw4NfmU0pPg==
+
 "@dual-bundle/import-meta-resolve@^4.0.0":
   version "4.1.0"
   resolved "https://registry.yarnpkg.com/@dual-bundle/import-meta-resolve/-/import-meta-resolve-4.1.0.tgz#519c1549b0e147759e7825701ecffd25e5819f7b"
@@ -16996,7 +17001,7 @@ string-template@>=1.0.0:
   resolved "https://registry.yarnpkg.com/string-template/-/string-template-1.0.0.tgz#9e9f2233dc00f218718ec379a28a5673ecca8b96"
   integrity sha1-np8iM9wA8hhxjsN5oopWc+zKi5Y=
 
-"string-width-cjs@npm:string-width@^4.2.0":
+"string-width-cjs@npm:string-width@^4.2.0", "string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
   version "4.2.3"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
   integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -17014,15 +17019,6 @@ string-width@=4.2.2:
     is-fullwidth-code-point "^3.0.0"
     strip-ansi "^6.0.0"
 
-"string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
-  version "4.2.3"
-  resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
-  integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
-  dependencies:
-    emoji-regex "^8.0.0"
-    is-fullwidth-code-point "^3.0.0"
-    strip-ansi "^6.0.1"
-
 string-width@^5.0.1, string-width@^5.1.2:
   version "5.1.2"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-5.1.2.tgz#14f8daec6d81e7221d2a357e668cab73bdbca794"
@@ -17106,7 +17102,7 @@ stringify-entities@^4.0.0:
     character-entities-html4 "^2.0.0"
     character-entities-legacy "^3.0.0"
 
-"strip-ansi-cjs@npm:strip-ansi@^6.0.1":
+"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.0, strip-ansi@^6.0.1:
   version "6.0.1"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
   integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -17120,13 +17116,6 @@ strip-ansi@^3.0.0:
   dependencies:
     ansi-regex "^2.0.0"
 
-strip-ansi@^6.0.0, strip-ansi@^6.0.1:
-  version "6.0.1"
-  resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
-  integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
-  dependencies:
-    ansi-regex "^5.0.1"
-
 strip-ansi@^7.0.1, strip-ansi@^7.1.0:
   version "7.1.0"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.1.0.tgz#d5b6568ca689d8561370b0707685d22434faff45"
@@ -18921,7 +18910,7 @@ word-wrap@^1.2.3:
   resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.3.tgz#610636f6b1f703891bd34771ccb17fb93b47079c"
   integrity sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==
 
-"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0":
+"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0:
   version "7.0.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
   integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
@@ -18939,15 +18928,6 @@ wrap-ansi@^6.2.0:
     string-width "^4.1.0"
     strip-ansi "^6.0.0"
 
-wrap-ansi@^7.0.0:
-  version "7.0.0"
-  resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
-  integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
-  dependencies:
-    ansi-styles "^4.0.0"
-    string-width "^4.1.0"
-    strip-ansi "^6.0.0"
-
 wrap-ansi@^8.1.0:
   version "8.1.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214"