Sfoglia il codice sorgente

split markdown by token count

HigashiWeseek 1 anno fa
parent
commit
55318b544e

+ 1 - 1
apps/app/next-env.d.ts

@@ -2,4 +2,4 @@
 /// <reference types="next/image-types/global" />
 
 // NOTE: This file should not be edited
-// see https://nextjs.org/docs/basic-features/typescript for more information.
+// see https://nextjs.org/docs/pages/building-your-application/configuring/typescript for more information.

+ 1 - 0
apps/app/package.json

@@ -63,6 +63,7 @@
     "@azure/identity": "^4.3.0",
     "@azure/storage-blob": "^12.16.0",
     "@browser-bunyan/console-formatted-stream": "^1.8.0",
+    "@dqbd/tiktoken": "^1.0.16",
     "@elastic/elasticsearch7": "npm:@elastic/elasticsearch@^7.17.0",
     "@elastic/elasticsearch8": "npm:@elastic/elasticsearch@^8.7.0",
     "@godaddy/terminus": "^4.9.0",

+ 174 - 0
apps/app/src/server/service/page-spritter.ts

@@ -0,0 +1,174 @@
+import type { Tiktoken, TiktokenModel } from '@dqbd/tiktoken';
+import { encoding_for_model } from '@dqbd/tiktoken'; // eslint-disable-line
+import type { Root, Content, Heading } from 'mdast';
+import remarkParse from 'remark-parse';
+import remarkStringify from 'remark-stringify';
+import { unified } from 'unified';
+
+/**
+ * Interface representing a section
+ */
+interface Section {
+  heading: Heading | null;
+  content: Content[];
+}
+
+/**
+ * Function to recursively split Markdown content by header sections so that each section has a token count below the specified maximum
+ *
+ * @param model - The name of the model to use (e.g., 'gpt-4')
+ * @param markdownContent - The Markdown content to split
+ * @param maxTokens - The maximum number of tokens per section (default: 100)
+ * @returns An array of split Markdown sections
+ */
+export function splitMarkdownByTokens(
+    model: TiktokenModel,
+    markdownContent: string,
+    maxTokens = 100,
+): string[] {
+  // Obtain encoding based on the model
+  const encoding: Tiktoken = encoding_for_model(model);
+
+  // Parse Markdown into AST
+  const processor = unified().use(remarkParse);
+  const tree = processor.parse(markdownContent) as Root;
+
+  /**
+   * Function to stringify a node
+   * @param node - The node to stringify
+   * @returns The Markdown string of the node
+   */
+  const stringify = (node: Root): string => {
+    return unified().use(remarkStringify).stringify(node);
+  };
+
+  /**
+   * Function to get the token count of a text
+   * @param text - The text to calculate token count for
+   * @returns The number of tokens
+   */
+  const getTokenCount = (text: string): number => {
+    return encoding.encode(text).length;
+  };
+
+  /**
+   * Function to split nodes into sections based on headers
+   * @param nodes - The array of nodes to split
+   * @returns An array of sections
+   */
+  const splitSections = (nodes: Content[]): Section[] => {
+    const sections: Section[] = [];
+    let currentSection: Section = { heading: null, content: [] };
+
+    for (const node of nodes) {
+      if (node.type === 'heading') {
+        // Start a new section
+        if (currentSection.heading || currentSection.content.length > 0) {
+          sections.push(currentSection);
+        }
+        currentSection = { heading: node as Heading, content: [] };
+      }
+      else {
+        currentSection.content.push(node);
+      }
+    }
+
+    // Add the last section
+    if (currentSection.heading || currentSection.content.length > 0) {
+      sections.push(currentSection);
+    }
+
+    return sections;
+  };
+
+  /**
+   * Function to recursively process sections
+   * @param sections - The array of sections to process
+   * @returns An array of split Markdown strings
+   */
+  const processSections = (sections: Section[]): string[] => {
+    const results: string[] = [];
+
+    for (const section of sections) {
+      const nodes: Content[] = [];
+      if (section.heading) {
+        nodes.push(section.heading);
+      }
+      nodes.push(...section.content);
+
+      const subtree: Root = { type: 'root', children: nodes };
+      const content = stringify(subtree);
+      const tokenCount = getTokenCount(content);
+
+      if (tokenCount <= maxTokens) {
+        results.push(content);
+      }
+      else if (section.content.some(child => child.type === 'heading')) {
+        // Split into subsections
+        const subsections = splitSections(section.content);
+        results.push(...processSections(subsections));
+      }
+      else {
+        // Split by paragraphs
+        const paragraphs = splitByParagraphs(nodes);
+        results.push(...paragraphs);
+      }
+    }
+
+    return results;
+  };
+
+  /**
+   * Function to split nodes by paragraphs
+   * @param nodes - The array of nodes to split
+   * @returns An array of split Markdown strings
+   */
+  const splitByParagraphs = (nodes: Content[]): string[] => {
+    const results: string[] = [];
+    let currentNodes: Content[] = [];
+    let currentTokenCount = 0;
+
+    for (const node of nodes) {
+      const nodeContent = stringify({ type: 'root', children: [node] });
+      const nodeTokenCount = getTokenCount(nodeContent);
+
+      if (currentTokenCount + nodeTokenCount <= maxTokens) {
+        currentNodes.push(node);
+        currentTokenCount += nodeTokenCount;
+      }
+      else {
+        if (currentNodes.length > 0) {
+          const chunk = stringify({ type: 'root', children: currentNodes });
+          results.push(chunk);
+          currentNodes = [];
+          currentTokenCount = 0;
+        }
+        if (nodeTokenCount > maxTokens) {
+          // If a single node exceeds maxTokens, add it as is
+          results.push(nodeContent);
+        }
+        else {
+          currentNodes.push(node);
+          currentTokenCount = nodeTokenCount;
+        }
+      }
+    }
+
+    if (currentNodes.length > 0) {
+      const chunk = stringify({ type: 'root', children: currentNodes });
+      results.push(chunk);
+    }
+
+    return results;
+  };
+
+  // Get initial sections
+  const initialSections = splitSections(tree.children);
+  // Process sections
+  const result = processSections(initialSections);
+
+  // Free the encoding
+  encoding.free();
+
+  return result;
+}

+ 78 - 0
apps/app/test/integration/service/page-sprit.test.ts

@@ -0,0 +1,78 @@
+import type { TiktokenModel } from '@dqbd/tiktoken';
+
+import { splitMarkdownByTokens } from '../../../src/server/service/page-spritter';
+
+describe('splitMarkdownByTokens', () => {
+  const model: TiktokenModel = 'gpt-3.5-turbo';
+
+  test('Returns without splitting when token count is below the maximum', () => {
+    const markdownContent = '# Heading\n\nThis is a test.';
+    const result = splitMarkdownByTokens(model, markdownContent, 1000);
+    expect(result).toHaveLength(1);
+    expect(result[0]).toBe(markdownContent);
+  });
+
+  test('Splits by sections when token count exceeds the maximum', () => {
+    const markdownContent = `
+# Heading1
+
+This is the content of section 1.
+
+# Heading2
+
+This is the content of section 2.
+
+# Heading3
+
+This is the content of section 3.
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 10); // Set a small maxTokens
+    expect(result).toHaveLength(3);
+    expect(result[0]).toContain('Heading1');
+    expect(result[1]).toContain('Heading2');
+    expect(result[2]).toContain('Heading3');
+  });
+
+  test('Recursively splits into subsections', () => {
+    const markdownContent = `
+# Heading1
+
+## Subheading1-1
+
+Content1-1
+
+## Subheading1-2
+
+Content1-2
+
+# Heading2
+
+Content2
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 20);
+    expect(result.length).toBeGreaterThan(2);
+    expect(result.some(chunk => chunk.includes('Subheading1-1'))).toBe(true);
+    expect(result.some(chunk => chunk.includes('Subheading1-2'))).toBe(true);
+  });
+
+  test('Splits by paragraphs', () => {
+    const markdownContent = `
+# Heading
+
+${'Long paragraph. '.repeat(50)}
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 50);
+    expect(result.length).toBeGreaterThan(1);
+  });
+
+  test('Adds a single node as is when it exceeds maxTokens', () => {
+    const markdownContent = `
+# Heading
+
+${'Very long paragraph. '.repeat(200)}
+    `;
+    const result = splitMarkdownByTokens(model, markdownContent, 50);
+    expect(result).toHaveLength(1);
+    expect(result[0]).toContain('Very long paragraph.');
+  });
+});

+ 8 - 28
yarn.lock

@@ -1860,6 +1860,11 @@
   resolved "https://registry.yarnpkg.com/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz#1d572bfbbe14b7704e0ba0f39b74815b84870d70"
   integrity sha512-dBVuXR082gk3jsFp7Rd/JI4kytwGHecnCoTtXFb7DB6CNHp4rg5k1bhg0nWdLGLnOV71lmDzGQaLMy8iPLY0pw==
 
+"@dqbd/tiktoken@^1.0.16":
+  version "1.0.16"
+  resolved "https://registry.yarnpkg.com/@dqbd/tiktoken/-/tiktoken-1.0.16.tgz#374c201d84158b50babff84e77aa3504564f510b"
+  integrity sha512-4uIrs5qxAwFVFFEP507HZIZhGOsgfaEMEWDXWalr+v+XP+wJwP60EVmkZtQyQe70IsKGVkx5umBxw4NfmU0pPg==
+
 "@dual-bundle/import-meta-resolve@^4.0.0":
   version "4.1.0"
   resolved "https://registry.yarnpkg.com/@dual-bundle/import-meta-resolve/-/import-meta-resolve-4.1.0.tgz#519c1549b0e147759e7825701ecffd25e5819f7b"
@@ -16996,7 +17001,7 @@ string-template@>=1.0.0:
   resolved "https://registry.yarnpkg.com/string-template/-/string-template-1.0.0.tgz#9e9f2233dc00f218718ec379a28a5673ecca8b96"
   integrity sha1-np8iM9wA8hhxjsN5oopWc+zKi5Y=
 
-"string-width-cjs@npm:string-width@^4.2.0":
+"string-width-cjs@npm:string-width@^4.2.0", "string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
   version "4.2.3"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
   integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -17014,15 +17019,6 @@ string-width@=4.2.2:
     is-fullwidth-code-point "^3.0.0"
     strip-ansi "^6.0.0"
 
-"string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
-  version "4.2.3"
-  resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
-  integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
-  dependencies:
-    emoji-regex "^8.0.0"
-    is-fullwidth-code-point "^3.0.0"
-    strip-ansi "^6.0.1"
-
 string-width@^5.0.1, string-width@^5.1.2:
   version "5.1.2"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-5.1.2.tgz#14f8daec6d81e7221d2a357e668cab73bdbca794"
@@ -17106,7 +17102,7 @@ stringify-entities@^4.0.0:
     character-entities-html4 "^2.0.0"
     character-entities-legacy "^3.0.0"
 
-"strip-ansi-cjs@npm:strip-ansi@^6.0.1":
+"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.0, strip-ansi@^6.0.1:
   version "6.0.1"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
   integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -17120,13 +17116,6 @@ strip-ansi@^3.0.0:
   dependencies:
     ansi-regex "^2.0.0"
 
-strip-ansi@^6.0.0, strip-ansi@^6.0.1:
-  version "6.0.1"
-  resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
-  integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
-  dependencies:
-    ansi-regex "^5.0.1"
-
 strip-ansi@^7.0.1, strip-ansi@^7.1.0:
   version "7.1.0"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.1.0.tgz#d5b6568ca689d8561370b0707685d22434faff45"
@@ -18921,7 +18910,7 @@ word-wrap@^1.2.3:
   resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.3.tgz#610636f6b1f703891bd34771ccb17fb93b47079c"
   integrity sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==
 
-"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0":
+"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0:
   version "7.0.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
   integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
@@ -18939,15 +18928,6 @@ wrap-ansi@^6.2.0:
     string-width "^4.1.0"
     strip-ansi "^6.0.0"
 
-wrap-ansi@^7.0.0:
-  version "7.0.0"
-  resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
-  integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
-  dependencies:
-    ansi-styles "^4.0.0"
-    string-width "^4.1.0"
-    strip-ansi "^6.0.0"
-
 wrap-ansi@^8.1.0:
   version "8.1.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214"