nHigashiWeseek hai 1 ano
pai
achega
a59c4f492a

+ 2 - 0
packages/page-split/.eslintignore

@@ -0,0 +1,2 @@
+/dist/**
+/types/**

+ 5 - 0
packages/page-split/.eslintrc.cjs

@@ -0,0 +1,5 @@
+module.exports = { // ESLint configuration for this package directory
+  extends: [
+    'weseek/react', // shared preset pulled in by name; presumably resolves to the eslint-config-weseek package (react entry) - confirm
+  ],
+};

+ 1 - 0
packages/page-split/.gitignore

@@ -0,0 +1 @@
+/dist

+ 1 - 0
packages/page-split/src/index.ts

@@ -0,0 +1 @@
+export * from './services/page-split';

+ 0 - 1
packages/page-split/src/services/index.ts

@@ -1 +0,0 @@
-export * from './page-split';

+ 74 - 161
packages/page-split/src/services/page-split.ts

@@ -1,176 +1,89 @@
-import type { Tiktoken, TiktokenModel } from '@dqbd/tiktoken';
-import { encoding_for_model } from '@dqbd/tiktoken'; // eslint-disable-line
-import type { Root, Content, Heading } from 'mdast';
-
-/**
- * Interface representing a section
- */
-interface Section {
-  heading: Heading | null;
-  content: Content[];
-}
-
-/**
- * Function to recursively split Markdown content by header sections so that each section has a token count below the specified maximum
- *
- * @param model - The name of the model to use (e.g., 'gpt-4')
- * @param markdownContent - The Markdown content to split
- * @param maxTokens - The maximum number of tokens per section (default: 100)
- * @returns An array of split Markdown sections
- */
-export async function splitMarkdownByTokens(
-    model: TiktokenModel,
-    markdownContent: string,
-    maxTokens = 100,
-): Promise<string[]> {
-  // Obtain encoding based on the model
-  const encoding: Tiktoken = encoding_for_model(model);
-
-  // Dynamically import remark-parse
-  const remarkParse = (await import('remark-parse')).default;
-  const remarkStringify = (await import('remark-stringify')).default;
-  const unified = (await import('unified')).unified;
-
-  // Parse Markdown into AST
-  const processor = unified().use(remarkParse);
-  const tree = processor.parse(markdownContent) as Root;
-
-  /**
-   * Function to stringify a node
-   * @param node - The node to stringify
-   * @returns The Markdown string of the node
-   */
-  const stringify = (node: Root): string => {
-    return "aaa"; // eslint-disable-line
-  };
-
-  /**
-   * Function to get the token count of a text
-   * @param text - The text to calculate token count for
-   * @returns The number of tokens
-   */
-  const getTokenCount = (text: string): number => {
-    return encoding.encode(text).length;
-  };
-
-  /**
-   * Function to split nodes into sections based on headers
-   * @param nodes - The array of nodes to split
-   * @returns An array of sections
-   */
-  const splitSections = (nodes: Content[]): Section[] => {
-    const sections: Section[] = [];
-    let currentSection: Section = { heading: null, content: [] };
-
-    for (const node of nodes) {
-      if (node.type === 'heading') {
-        // Start a new section
-        if (currentSection.heading || currentSection.content.length > 0) {
-          sections.push(currentSection);
+import type { TiktokenModel } from 'js-tiktoken';
+import { encodingForModel } from 'js-tiktoken';
+import type { Root, Content } from 'mdast';
+import remarkParse from 'remark-parse';
+import remarkStringify from 'remark-stringify';
+import { unified } from 'unified';
+
+export function splitMarkdownByTokens( // Splits markdown into chunks, each re-serialized from consecutive top-level AST nodes and kept within maxTokens where a node boundary allows
+    markdownString: string, // markdown text to split
+    maxTokens: number, // token budget per chunk; a single node larger than this is still emitted as one chunk
+    modelName: TiktokenModel, // model name passed to encodingForModel to pick the tokenizer
+): string[] { // one markdown string per chunk, in document order
+  // Parse the markdown into an AST; only the top-level children of the root are used for splitting
+  const tree = unified().use(remarkParse).parse(markdownString) as Root;
+  const encoding = encodingForModel(modelName); // created once and reused by countTokens
+
+  function countTokens(text: string): number { // number of tokens the selected encoding produces for text
+    const tokens = encoding.encode(text);
+    return tokens.length;
+  }
+
+  // Greedily pack consecutive nodes into sections whose serialized markdown stays within maxTokens, then re-check each section in a second pass
+  function splitSectionRecursively(nodes: Content[]): Content[][] {
+    const sections: Content[][] = [];
+    let currentSection: Content[] = []; // nodes accumulated for the section being built
+
+    for (let i = 0; i < nodes.length; i++) {
+      const node = nodes[i];
+      currentSection.push(node); // tentatively add the node, then measure
+
+      const markdown = unified() // NOTE(review): the whole accumulated section is re-serialized and re-tokenized on every iteration
+        .use(remarkStringify)
+        .stringify({ type: 'root', children: currentSection });
+      const tokenCount = countTokens(markdown);
+
+      if (tokenCount > maxTokens) {
+        // Over budget: undo the tentative push and close the section formed by the nodes before this one
+        currentSection.pop();
+        if (currentSection.length > 0) {
+          sections.push([...currentSection]); // copy, because currentSection is reassigned right after
+        }
+        // Start a new section from the current node
+        currentSection = [node];
+
+        // A node that is over budget on its own becomes a one-node section and is not split any further here
+        const singleNodeMarkdown = unified()
+          .use(remarkStringify)
+          .stringify({ type: 'root', children: [node] });
+        const singleNodeTokenCount = countTokens(singleNodeMarkdown);
+        if (singleNodeTokenCount > maxTokens) {
+          sections.push([node]);
+          currentSection = []; // the oversized node was already emitted, so the next section starts empty
         }
-        currentSection = { heading: node as Heading, content: [] };
-      }
-      else {
-        currentSection.content.push(node);
-      }
-    }
-
-    // Add the last section
-    if (currentSection.heading || currentSection.content.length > 0) {
-      sections.push(currentSection);
-    }
-
-    return sections;
-  };
-
-  /**
-   * Function to recursively process sections
-   * @param sections - The array of sections to process
-   * @returns An array of split Markdown strings
-   */
-  const processSections = (sections: Section[]): string[] => {
-    const results: string[] = [];
-
-    for (const section of sections) {
-      const nodes: Content[] = [];
-      if (section.heading) {
-        nodes.push(section.heading);
       }
-      nodes.push(...section.content);
-
-      const subtree: Root = { type: 'root', children: nodes };
-      const content = stringify(subtree);
-      const tokenCount = getTokenCount(content);
 
-      if (tokenCount <= maxTokens) {
-        results.push(content);
-      }
-      else if (section.content.some(child => child.type === 'heading')) {
-        // Split into subsections
-        const subsections = splitSections(section.content);
-        results.push(...processSections(subsections));
-      }
-      else {
-        // Split by paragraphs
-        const paragraphs = 'sass';
-        results.push(...paragraphs);
+      // On the last node, flush whatever is still accumulated as the final section
+      if (i === nodes.length - 1 && currentSection.length > 0) {
+        sections.push([...currentSection]);
       }
     }
 
-    return results;
-  };
-
-  /**
-   * Function to split nodes by paragraphs
-   * @param nodes - The array of nodes to split
-   * @returns An array of split Markdown strings
-   */
-  const splitByParagraphs = (nodes: Content[]): string[] => {
-    const results: string[] = [];
-    let currentNodes: Content[] = [];
-    let currentTokenCount = 0;
-
-    for (const node of nodes) {
-      const nodeContent = stringify({ type: 'root', children: [node] });
-      const nodeTokenCount = getTokenCount(nodeContent);
-
-      if (currentTokenCount + nodeTokenCount <= maxTokens) {
-        currentNodes.push(node);
-        currentTokenCount += nodeTokenCount;
+    // Second pass: re-check every section produced above
+    const recursivelySplitSections: Content[][] = [];
+    for (const section of sections) {
+      // Re-split only when the section contains a heading node and its serialized form is still over maxTokens
+      const hasHeading = section.some(node => node.type === 'heading');
+      if (hasHeading && countTokens(unified().use(remarkStringify).stringify({ type: 'root', children: section })) > maxTokens) {
+        // NOTE(review): the loop above only emits an over-budget section for a single oversized node; if that node is a heading, this call gets identical input and looks unbounded - confirm
+        const childSections = splitSectionRecursively(section);
+        recursivelySplitSections.push(...childSections);
       }
       else {
-        if (currentNodes.length > 0) {
-          const chunk = stringify({ type: 'root', children: currentNodes });
-          results.push(chunk);
-          currentNodes = [];
-          currentTokenCount = 0;
-        }
-        if (nodeTokenCount > maxTokens) {
-          // If a single node exceeds maxTokens, add it as is
-          results.push(nodeContent);
-        }
-        else {
-          currentNodes.push(node);
-          currentTokenCount = nodeTokenCount;
-        }
+        recursivelySplitSections.push(section); // within budget, or has no heading: keep unchanged
       }
     }
 
-    if (currentNodes.length > 0) {
-      const chunk = stringify({ type: 'root', children: currentNodes });
-      results.push(chunk);
-    }
-
-    return results;
-  };
+    return recursivelySplitSections;
+  }
 
-  // Get initial sections
-  const initialSections = splitSections(tree.children);
-  // Process sections
-  const result = processSections(initialSections);
+  // Split the top-level children of the parsed AST into sections
+  const splitSections = splitSectionRecursively(tree.children);
 
-  // Free the encoding
-  encoding.free();
+  // Serialize each section back into a markdown string by wrapping its nodes in a fresh root
+  const markdownSections = splitSections.map(sectionNodes => unified()
+    .use(remarkStringify)
+    .stringify({ type: 'root', children: sectionNodes }));
 
-  return result;
+  return markdownSections;
 }

+ 39 - 0
packages/page-split/vite.config.ts

@@ -0,0 +1,39 @@
+import path from 'path';
+
+import react from '@vitejs/plugin-react';
+import glob from 'glob';
+import { nodeExternals } from 'rollup-plugin-node-externals';
+import { defineConfig } from 'vite';
+import dts from 'vite-plugin-dts';
+
+// Vite library build configuration for this package; option reference: https://vitejs.dev/config/
+export default defineConfig({
+  plugins: [
+    react(), // NOTE(review): the entry glob below only matches .ts files, so the React plugin may be unnecessary here - confirm
+    dts({ copyDtsFiles: true }), // emits type declarations alongside the build
+    {
+      ...nodeExternals({ // keeps node dependencies out of the bundle
+        devDeps: true, // presumably also treats devDependencies as external - confirm against the plugin docs
+        builtinsPrefix: 'ignore', // presumably leaves builtin module specifiers as written - confirm against the plugin docs
+      }),
+      enforce: 'pre', // plugin ordering hint: run this plugin ahead of the normal plugin phase
+    },
+  ],
+  build: {
+    outDir: 'dist',
+    sourcemap: true,
+    lib: {
+      entry: glob.sync(path.resolve(__dirname, 'src/**/*.ts'), { // every .ts file under src becomes an entry point
+        ignore: '**/*.spec.ts', // spec files are excluded from the entries
+      }),
+      name: 'presentation-libs', // NOTE(review): this config is listed under packages/page-split, so the name looks copied from another package - confirm
+      formats: ['es'], // ES module output only
+    },
+    rollupOptions: {
+      output: {
+        preserveModules: true, // keep a separate output file per source module instead of one bundle
+        preserveModulesRoot: 'src', // output paths are made relative to src
+      },
+    },
+  },
+});