hai 1 ano · a59c4f492a
--- a/packages/page-split/.eslintignore
+++ b/packages/page-split/.eslintignore
@@ -0,0 +1,2 @@
 
				+/dist/**
			
 
				+/types/**
			
--- a/packages/page-split/.eslintrc.cjs
+++ b/packages/page-split/.eslintrc.cjs
@@ -0,0 +1,5 @@
 
				+module.exports = {
			
 
				+  extends: [
			
 
				+    'weseek/react',
			
 
				+  ],
			
 
				+};
			
--- a/packages/page-split/.gitignore
+++ b/packages/page-split/.gitignore
@@ -0,0 +1 @@
 
				+/dist
			
--- a/packages/page-split/src/index.ts
+++ b/packages/page-split/src/index.ts
@@ -0,0 +1 @@
 
				+export * from './services/page-split';
			
--- a/packages/page-split/src/services/index.ts
+++ b/packages/page-split/src/services/index.ts
@@ -1 +0,0 @@
 
				-export * from './page-split';
			
--- a/packages/page-split/src/services/page-split.ts
+++ b/packages/page-split/src/services/page-split.ts
@@ -1,176 +1,89 @@
 
				-import type { Tiktoken, TiktokenModel } from '@dqbd/tiktoken';
			
 
				-import { encoding_for_model } from '@dqbd/tiktoken'; // eslint-disable-line
			
 
				-import type { Root, Content, Heading } from 'mdast';
			
 
				-
			
 
				-/**
			
 
				- * Interface representing a section
			
 
				- */
			
 
				-interface Section {
			
 
				-  heading: Heading | null;
			
 
				-  content: Content[];
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * Function to recursively split Markdown content by header sections so that each section has a token count below the specified maximum
			
 
				- *
			
 
				- * @param model - The name of the model to use (e.g., 'gpt-4')
			
 
				- * @param markdownContent - The Markdown content to split
			
 
				- * @param maxTokens - The maximum number of tokens per section (default: 100)
			
 
				- * @returns An array of split Markdown sections
			
 
				- */
			
 
				-export async function splitMarkdownByTokens(
			
 
				-    model: TiktokenModel,
			
 
				-    markdownContent: string,
			
 
				-    maxTokens = 100,
			
 
				-): Promise<string[]> {
			
 
				-  // Obtain encoding based on the model
			
 
				-  const encoding: Tiktoken = encoding_for_model(model);
			
 
				-
			
 
				-  // Dynamically import remark-parse
			
 
				-  const remarkParse = (await import('remark-parse')).default;
			
 
				-  const remarkStringify = (await import('remark-stringify')).default;
			
 
				-  const unified = (await import('unified')).unified;
			
 
				-
			
 
				-  // Parse Markdown into AST
			
 
				-  const processor = unified().use(remarkParse);
			
 
				-  const tree = processor.parse(markdownContent) as Root;
			
 
				-
			
 
				-  /**
			
 
				-   * Function to stringify a node
			
 
				-   * @param node - The node to stringify
			
 
				-   * @returns The Markdown string of the node
			
 
				-   */
			
 
				-  const stringify = (node: Root): string => {
			
 
				-    return "aaa"; // eslint-disable-line
			
 
				-  };
			
 
				-
			
 
				-  /**
			
 
				-   * Function to get the token count of a text
			
 
				-   * @param text - The text to calculate token count for
			
 
				-   * @returns The number of tokens
			
 
				-   */
			
 
				-  const getTokenCount = (text: string): number => {
			
 
				-    return encoding.encode(text).length;
			
 
				-  };
			
 
				-
			
 
				-  /**
			
 
				-   * Function to split nodes into sections based on headers
			
 
				-   * @param nodes - The array of nodes to split
			
 
				-   * @returns An array of sections
			
 
				-   */
			
 
				-  const splitSections = (nodes: Content[]): Section[] => {
			
 
				-    const sections: Section[] = [];
			
 
				-    let currentSection: Section = { heading: null, content: [] };
			
 
				-
			
 
				-    for (const node of nodes) {
			
 
				-      if (node.type === 'heading') {
			
 
				-        // Start a new section
			
 
				-        if (currentSection.heading || currentSection.content.length > 0) {
			
 
				-          sections.push(currentSection);
			
 
				+import type { TiktokenModel } from 'js-tiktoken';
			
 
				+import { encodingForModel } from 'js-tiktoken';
			
 
				+import type { Root, Content } from 'mdast';
			
 
				+import remarkParse from 'remark-parse';
			
 
				+import remarkStringify from 'remark-stringify';
			
 
				+import { unified } from 'unified';
			
 
				+
			
 
				+export function splitMarkdownByTokens(
			
 
				+    markdownString: string,
			
 
				+    maxTokens: number,
			
 
				+    modelName: TiktokenModel,
			
 
				+): string[] {
			
 
				+  // Parse the markdown into an AST
			
 
				+  const tree = unified().use(remarkParse).parse(markdownString) as Root;
			
 
				+  const encoding = encodingForModel(modelName);
			
 
				+
			
 
				+  function countTokens(text: string): number {
			
 
				+    const tokens = encoding.encode(text);
			
 
				+    return tokens.length;
			
 
				+  }
			
 
				+
			
 
				+  // Recursively split sections
			
 
				+  function splitSectionRecursively(nodes: Content[]): Content[][] {
			
 
				+    const sections: Content[][] = [];
			
 
				+    let currentSection: Content[] = [];
			
 
				+
			
 
				+    for (let i = 0; i < nodes.length; i++) {
			
 
				+      const node = nodes[i];
			
 
				+      currentSection.push(node);
			
 
				+
			
 
				+      const markdown = unified()
			
 
				+        .use(remarkStringify)
			
 
				+        .stringify({ type: 'root', children: currentSection });
			
 
				+      const tokenCount = countTokens(markdown);
			
 
				+
			
 
				+      if (tokenCount > maxTokens) {
			
 
				+        // If the token count exceeds the limit, treat the nodes up to the previous one as a section
			
 
				+        currentSection.pop();
			
 
				+        if (currentSection.length > 0) {
			
 
				+          sections.push([...currentSection]);
			
 
				+        }
			
 
				+        // Start a new section from the current node
			
 
				+        currentSection = [node];
			
 
				+
			
 
				+        // If a single node exceeds maxTokens, add it as its own section
			
 
				+        const singleNodeMarkdown = unified()
			
 
				+          .use(remarkStringify)
			
 
				+          .stringify({ type: 'root', children: [node] });
			
 
				+        const singleNodeTokenCount = countTokens(singleNodeMarkdown);
			
 
				+        if (singleNodeTokenCount > maxTokens) {
			
 
				+          sections.push([node]);
			
 
				+          currentSection = [];
			
 
				         }
			
 
				-        currentSection = { heading: node as Heading, content: [] };
			
 
				-      }
			
 
				-      else {
			
 
				-        currentSection.content.push(node);
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    // Add the last section
			
 
				-    if (currentSection.heading || currentSection.content.length > 0) {
			
 
				-      sections.push(currentSection);
			
 
				-    }
			
 
				-
			
 
				-    return sections;
			
 
				-  };
			
 
				-
			
 
				-  /**
			
 
				-   * Function to recursively process sections
			
 
				-   * @param sections - The array of sections to process
			
 
				-   * @returns An array of split Markdown strings
			
 
				-   */
			
 
				-  const processSections = (sections: Section[]): string[] => {
			
 
				-    const results: string[] = [];
			
 
				-
			
 
				-    for (const section of sections) {
			
 
				-      const nodes: Content[] = [];
			
 
				-      if (section.heading) {
			
 
				-        nodes.push(section.heading);
			
 
				       }
			
 
				-      nodes.push(...section.content);
			
 
				-
			
 
				-      const subtree: Root = { type: 'root', children: nodes };
			
 
				-      const content = stringify(subtree);
			
 
				-      const tokenCount = getTokenCount(content);
			
 
				 
			
 
				-      if (tokenCount <= maxTokens) {
			
 
				-        results.push(content);
			
 
				-      }
			
 
				-      else if (section.content.some(child => child.type === 'heading')) {
			
 
				-        // Split into subsections
			
 
				-        const subsections = splitSections(section.content);
			
 
				-        results.push(...processSections(subsections));
			
 
				-      }
			
 
				-      else {
			
 
				-        // Split by paragraphs
			
 
				-        const paragraphs = 'sass';
			
 
				-        results.push(...paragraphs);
			
 
				+      // If it's the last node, add the section
			
 
				+      if (i === nodes.length - 1 && currentSection.length > 0) {
			
 
				+        sections.push([...currentSection]);
			
 
				       }
			
 
				     }
			
 
				 
			
 
				-    return results;
			
 
				-  };
			
 
				-
			
 
				-  /**
			
 
				-   * Function to split nodes by paragraphs
			
 
				-   * @param nodes - The array of nodes to split
			
 
				-   * @returns An array of split Markdown strings
			
 
				-   */
			
 
				-  const splitByParagraphs = (nodes: Content[]): string[] => {
			
 
				-    const results: string[] = [];
			
 
				-    let currentNodes: Content[] = [];
			
 
				-    let currentTokenCount = 0;
			
 
				-
			
 
				-    for (const node of nodes) {
			
 
				-      const nodeContent = stringify({ type: 'root', children: [node] });
			
 
				-      const nodeTokenCount = getTokenCount(nodeContent);
			
 
				-
			
 
				-      if (currentTokenCount + nodeTokenCount <= maxTokens) {
			
 
				-        currentNodes.push(node);
			
 
				-        currentTokenCount += nodeTokenCount;
			
 
				+    // Recursively split each section
			
 
				+    const recursivelySplitSections: Content[][] = [];
			
 
				+    for (const section of sections) {
			
 
				+      // If the section contains child headings, split further
			
 
				+      const hasHeading = section.some(node => node.type === 'heading');
			
 
				+      if (hasHeading && countTokens(unified().use(remarkStringify).stringify({ type: 'root', children: section })) > maxTokens) {
			
 
				+        // Recursively split child nodes
			
 
				+        const childSections = splitSectionRecursively(section);
			
 
				+        recursivelySplitSections.push(...childSections);
			
 
				       }
			
 
				       else {
			
 
				-        if (currentNodes.length > 0) {
			
 
				-          const chunk = stringify({ type: 'root', children: currentNodes });
			
 
				-          results.push(chunk);
			
 
				-          currentNodes = [];
			
 
				-          currentTokenCount = 0;
			
 
				-        }
			
 
				-        if (nodeTokenCount > maxTokens) {
			
 
				-          // If a single node exceeds maxTokens, add it as is
			
 
				-          results.push(nodeContent);
			
 
				-        }
			
 
				-        else {
			
 
				-          currentNodes.push(node);
			
 
				-          currentTokenCount = nodeTokenCount;
			
 
				-        }
			
 
				+        recursivelySplitSections.push(section);
			
 
				       }
			
 
				     }
			
 
				 
			
 
				-    if (currentNodes.length > 0) {
			
 
				-      const chunk = stringify({ type: 'root', children: currentNodes });
			
 
				-      results.push(chunk);
			
 
				-    }
			
 
				-
			
 
				-    return results;
			
 
				-  };
			
 
				+    return recursivelySplitSections;
			
 
				+  }
			
 
				 
			
 
				-  // Get initial sections
			
 
				-  const initialSections = splitSections(tree.children);
			
 
				-  // Process sections
			
 
				-  const result = processSections(initialSections);
			
 
				+  // Recursively split the AST's child nodes
			
 
				+  const splitSections = splitSectionRecursively(tree.children);
			
 
				 
			
 
				-  // Free the encoding
			
 
				-  encoding.free();
			
 
				+  // Convert the split sections back into markdown strings
			
 
				+  const markdownSections = splitSections.map(sectionNodes => unified()
			
 
				+    .use(remarkStringify)
			
 
				+    .stringify({ type: 'root', children: sectionNodes }));
			
 
				 
			
 
				-  return result;
			
 
				+  return markdownSections;
			
 
				 }
			
--- a/packages/page-split/vite.config.ts
+++ b/packages/page-split/vite.config.ts
@@ -0,0 +1,39 @@
 
				+import path from 'path';
			
 
				+
			
 
				+import react from '@vitejs/plugin-react';
			
 
				+import glob from 'glob';
			
 
				+import { nodeExternals } from 'rollup-plugin-node-externals';
			
 
				+import { defineConfig } from 'vite';
			
 
				+import dts from 'vite-plugin-dts';
			
 
				+
			
 
				+// https://vitejs.dev/config/
			
 
				+export default defineConfig({
			
 
				+  plugins: [
			
 
				+    react(),
			
 
				+    dts({ copyDtsFiles: true }),
			
 
				+    {
			
 
				+      ...nodeExternals({
			
 
				+        devDeps: true,
			
 
				+        builtinsPrefix: 'ignore',
			
 
				+      }),
			
 
				+      enforce: 'pre',
			
 
				+    },
			
 
				+  ],
			
 
				+  build: {
			
 
				+    outDir: 'dist',
			
 
				+    sourcemap: true,
			
 
				+    lib: {
			
 
				+      entry: glob.sync(path.resolve(__dirname, 'src/**/*.ts'), {
			
 
				+        ignore: '**/*.spec.ts',
			
 
				+      }),
			
 
				+      name: 'presentation-libs',
			
 
				+      formats: ['es'],
			
 
				+    },
			
 
				+    rollupOptions: {
			
 
				+      output: {
			
 
				+        preserveModules: true,
			
 
				+        preserveModulesRoot: 'src',
			
 
				+      },
			
 
				+    },
			
 
				+  },
			
 
				+});