1 year ago · a9f35e8a22
--- a/packages/markdown-splitter/dist/index.cjs
+++ b/packages/markdown-splitter/dist/index.cjs
@@ -1,2 +0,0 @@
 
															-"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const t=require("./services/markdown-splitter.cjs"),n=require("./services/markdown-token-splitter.cjs");exports.splitMarkdownIntoFragments=t.splitMarkdownIntoFragments;exports.splitMarkdownIntoChunks=n.splitMarkdownIntoChunks;
														
 
															-//# sourceMappingURL=index.cjs.map
														
--- a/packages/markdown-splitter/dist/index.cjs.map
+++ b/packages/markdown-splitter/dist/index.cjs.map
@@ -1 +0,0 @@
 
															-{"version":3,"file":"index.cjs","sources":[],"sourcesContent":[],"names":[],"mappings":""}
														
--- a/packages/markdown-splitter/dist/index.d.ts
+++ b/packages/markdown-splitter/dist/index.d.ts
@@ -1,2 +0,0 @@
 
															-export * from './services/markdown-splitter';
														
 
															-export * from './services/markdown-token-splitter';
														
--- a/packages/markdown-splitter/dist/index.js
+++ b/packages/markdown-splitter/dist/index.js
@@ -1,7 +0,0 @@
 
															-import { splitMarkdownIntoFragments as t } from "./services/markdown-splitter.js";
														
 
															-import { splitMarkdownIntoChunks as p } from "./services/markdown-token-splitter.js";
														
 
															-export {
														
 
															-  p as splitMarkdownIntoChunks,
														
 
															-  t as splitMarkdownIntoFragments
														
 
															-};
														
 
															-//# sourceMappingURL=index.js.map
														
--- a/packages/markdown-splitter/dist/index.js.map
+++ b/packages/markdown-splitter/dist/index.js.map
@@ -1 +0,0 @@
 
															-{"version":3,"file":"index.js","sources":[],"sourcesContent":[],"names":[],"mappings":";;"}
														
--- a/packages/markdown-splitter/dist/services/markdown-splitter.cjs
+++ b/packages/markdown-splitter/dist/services/markdown-splitter.cjs
@@ -1,2 +0,0 @@
 
															-"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const q=require("js-tiktoken"),M=require("js-yaml"),f=require("remark-frontmatter"),y=require("remark-gfm"),S=require("remark-parse"),C=require("remark-stringify"),d=require("unified");function F(t,n){if(n>t.length)for(;t.length<n;)t.push(1);else n===t.length||t.splice(n),t[n-1]++;return t.join("-")}async function w(t,n){const s=[],g=[];let i="";const c={};if(typeof t!="string"||t.trim()==="")return s;const u=q.encodingForModel(n),p=d.unified().use(S).use(f,["yaml"]).use(y),m={bullet:"-",rule:"-"},a=d.unified().use(f,["yaml"]).use(y).use(C,m),k=p.parse(t);for(const o of k.children)if(o.type==="yaml"){const r=M.load(o.value),e=JSON.stringify(r,null,2),l=u.encode(e).length;s.push({label:"frontmatter",type:"yaml",text:e,tokenCount:l})}else if(o.type==="heading"){const r=o.depth;i=F(g,r);const e=a.stringify(o).trim(),l=u.encode(e).length;s.push({label:`${i}-heading`,type:o.type,text:e,tokenCount:l})}else{const r=a.stringify(o).trim();if(r!==""){const e=i||"0";c[e]?c[e]++:c[e]=1;const l=i!==""?`${i}-content-${c[e]}`:`0-content-${c[e]}`,h=u.encode(r).length;s.push({label:l,type:o.type,text:r,tokenCount:h})}}return s}exports.splitMarkdownIntoFragments=w;
														
 
															-//# sourceMappingURL=markdown-splitter.cjs.map
														
--- a/packages/markdown-splitter/dist/services/markdown-splitter.cjs.map
+++ b/packages/markdown-splitter/dist/services/markdown-splitter.cjs.map
--- a/packages/markdown-splitter/dist/services/markdown-splitter.d.ts
+++ b/packages/markdown-splitter/dist/services/markdown-splitter.d.ts
@@ -1,15 +0,0 @@
 
															-import { TiktokenModel } from 'js-tiktoken';
														
 
															-
														
 
															-export type MarkdownFragment = {
														
 
															-    label: string;
														
 
															-    type: string;
														
 
															-    text: string;
														
 
															-    tokenCount: number;
														
 
															-};
														
 
															-/**
														
 
															- * Splits Markdown text into labeled markdownFragments using remark-parse and remark-stringify,
														
 
															- * processing each content node separately and labeling them as 1-content-1, 1-content-2, etc.
														
 
															- * @param markdownText - The input Markdown string.
														
 
															- * @returns An array of labeled markdownFragments.
														
 
															- */
														
 
															-export declare function splitMarkdownIntoFragments(markdownText: string, model: TiktokenModel): Promise<MarkdownFragment[]>;
														
--- a/packages/markdown-splitter/dist/services/markdown-splitter.js
+++ b/packages/markdown-splitter/dist/services/markdown-splitter.js
@@ -1,65 +0,0 @@
 
															-import { encodingForModel as C } from "js-tiktoken";
														
 
															-import F from "js-yaml";
														
 
															-import m from "remark-frontmatter";
														
 
															-import p from "remark-gfm";
														
 
															-import M from "remark-parse";
														
 
															-import S from "remark-stringify";
														
 
															-import { unified as u } from "unified";
														
 
															-function $(t, n) {
														
 
															-  if (n > t.length)
														
 
															-    for (; t.length < n; )
														
 
															-      t.push(1);
														
 
															-  else n === t.length || t.splice(n), t[n - 1]++;
														
 
															-  return t.join("-");
														
 
															-}
														
 
															-async function G(t, n) {
														
 
															-  const s = [], y = [];
														
 
															-  let i = "";
														
 
															-  const l = {};
														
 
															-  if (typeof t != "string" || t.trim() === "")
														
 
															-    return s;
														
 
															-  const f = C(n), g = u().use(M).use(m, ["yaml"]).use(p), d = {
														
 
															-    bullet: "-",
														
 
															-    // Set list bullet to hyphen
														
 
															-    rule: "-"
														
 
															-    // Use hyphen for horizontal rules
														
 
															-  }, a = u().use(m, ["yaml"]).use(p).use(S, d), k = g.parse(t);
														
 
															-  for (const o of k.children)
														
 
															-    if (o.type === "yaml") {
														
 
															-      const r = F.load(o.value), e = JSON.stringify(r, null, 2), c = f.encode(e).length;
														
 
															-      s.push({
														
 
															-        label: "frontmatter",
														
 
															-        type: "yaml",
														
 
															-        text: e,
														
 
															-        tokenCount: c
														
 
															-      });
														
 
															-    } else if (o.type === "heading") {
														
 
															-      const r = o.depth;
														
 
															-      i = $(y, r);
														
 
															-      const e = a.stringify(o).trim(), c = f.encode(e).length;
														
 
															-      s.push({
														
 
															-        label: `${i}-heading`,
														
 
															-        type: o.type,
														
 
															-        text: e,
														
 
															-        tokenCount: c
														
 
															-      });
														
 
															-    } else {
														
 
															-      const r = a.stringify(o).trim();
														
 
															-      if (r !== "") {
														
 
															-        const e = i || "0";
														
 
															-        l[e] ? l[e]++ : l[e] = 1;
														
 
															-        const c = i !== "" ? `${i}-content-${l[e]}` : `0-content-${l[e]}`, h = f.encode(r).length;
														
 
															-        s.push({
														
 
															-          label: c,
														
 
															-          type: o.type,
														
 
															-          text: r,
														
 
															-          tokenCount: h
														
 
															-        });
														
 
															-      }
														
 
															-    }
														
 
															-  return s;
														
 
															-}
														
 
															-export {
														
 
															-  G as splitMarkdownIntoFragments
														
 
															-};
														
 
															-//# sourceMappingURL=markdown-splitter.js.map
														
--- a/packages/markdown-splitter/dist/services/markdown-splitter.js.map
+++ b/packages/markdown-splitter/dist/services/markdown-splitter.js.map
--- a/packages/markdown-splitter/dist/services/markdown-splitter.spec.d.ts
+++ b/packages/markdown-splitter/dist/services/markdown-splitter.spec.d.ts
@@ -1 +0,0 @@
 
															-export {};
														
--- a/packages/markdown-splitter/dist/services/markdown-token-splitter.cjs
+++ b/packages/markdown-splitter/dist/services/markdown-token-splitter.cjs
@@ -1,10 +0,0 @@
 
															-"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const m=require("js-tiktoken"),P=require("./markdown-splitter.cjs");function j(l,p){const u=l.map(({label:i})=>i==="frontmatter"?"frontmatter":i.match(/^\d+(?:-\d+)*/)[0]),C=[...new Set(u.filter(Boolean))],g=[];let a=[...C];for(;a.length>0;){const i=a[0];if(C.some(s=>s!==i&&s.startsWith(i))){let s=l.filter(t=>t.label.startsWith(i));const o=i.split("-");for(let t=1;t<o.length;t++){const e=o.slice(0,t).join("-"),r=l.find(c=>c.label===`${e}-heading`);r&&(s=[r,...s])}if(s.reduce((t,e)=>t+e.tokenCount,0)<=p)g.push(s),a=a.filter(t=>!t.startsWith(`${i}-`));else{const t=l.filter(e=>{const r=e.label.match(/^\d+(-\d+)*(?=-)/);return r&&r[0]===i});for(let e=1;e<o.length;e++){const r=o.slice(0,e).join("-"),c=l.find(d=>d.label===`${r}-heading`);c&&t.unshift(c)}g.push(t)}}else{let s=l.filter(n=>n.label.startsWith(i));const o=i.split("-");for(let n=1;n<o.length;n++){const t=o.slice(0,n).join("-"),e=l.find(r=>r.label===`${t}-heading`);e&&(s=[e,...s])}g.push(s)}a.shift()}return g}async function y(l,p,u=800){if(m.encodingForModel(p).encode(l).length<=u)return[l];const g=await P.splitMarkdownIntoFragments(l,p),a=[];return j(g,u).forEach(f=>{if(f.reduce((o,n)=>o+n.tokenCount,0)<=u){const o=f.map((n,t)=>{const e=f[t+1];return e?n.type==="heading"&&e.type==="heading"?`${n.text}
														
 
															-`:`${n.text}
														
 
															-
														
 
															-`:n.text}).join("");a.push(o)}else{const o=f.filter(t=>t.type==="heading"),n=o.map(t=>t.text).join(`
														
 
															-`);for(const t of f)if(t.label.includes("content")){const e=o.reduce((c,d)=>c+d.tokenCount,0)+t.tokenCount,r=o.reduce((c,d)=>c+d.tokenCount,0);if(r>u/2)throw new Error(`Heading token count is too large. Heading token count: ${r}, allowed maximum: ${Math.ceil(u/2)}`);if(e>u){const c=o.reduce((h,k)=>h+k.tokenCount,0),d=u-c,b=t.text.length,M=t.tokenCount,x=Math.floor(d/M*b),$=[];for(let h=0;h<t.text.length;h+=x)$.push(t.text.slice(h,h+x));$.forEach(h=>{const k=n?`${n}
														
 
															-
														
 
															-${h}`:`${h}`;a.push(k)})}else{const c=`${n}
														
 
															-
														
 
															-${t.text}`;a.push(c)}}}}),a}exports.splitMarkdownIntoChunks=y;
														
 
															-//# sourceMappingURL=markdown-token-splitter.cjs.map
														
--- a/packages/markdown-splitter/dist/services/markdown-token-splitter.cjs.map
+++ b/packages/markdown-splitter/dist/services/markdown-token-splitter.cjs.map
--- a/packages/markdown-splitter/dist/services/markdown-token-splitter.d.ts
+++ b/packages/markdown-splitter/dist/services/markdown-token-splitter.d.ts
@@ -1,3 +0,0 @@
 
															-import { TiktokenModel } from 'js-tiktoken';
														
 
															-
														
 
															-export declare function splitMarkdownIntoChunks(markdownText: string, model: TiktokenModel, maxToken?: number): Promise<string[]>;
														
--- a/packages/markdown-splitter/dist/services/markdown-token-splitter.js
+++ b/packages/markdown-splitter/dist/services/markdown-token-splitter.js
@@ -1,88 +0,0 @@
 
															-import { encodingForModel as M } from "js-tiktoken";
														
 
															-import { splitMarkdownIntoFragments as P } from "./markdown-splitter.js";
														
 
															-function j(l, p) {
														
 
															-  const h = l.map(({ label: i }) => i === "frontmatter" ? "frontmatter" : i.match(/^\d+(?:-\d+)*/)[0]), C = [...new Set(h.filter(Boolean))], d = [];
														
 
															-  let a = [...C];
														
 
															-  for (; a.length > 0; ) {
														
 
															-    const i = a[0];
														
 
															-    if (C.some((s) => s !== i && s.startsWith(i))) {
														
 
															-      let s = l.filter((t) => t.label.startsWith(i));
														
 
															-      const o = i.split("-");
														
 
															-      for (let t = 1; t < o.length; t++) {
														
 
															-        const e = o.slice(0, t).join("-"), r = l.find((c) => c.label === `${e}-heading`);
														
 
															-        r && (s = [r, ...s]);
														
 
															-      }
														
 
															-      if (s.reduce((t, e) => t + e.tokenCount, 0) <= p)
														
 
															-        d.push(s), a = a.filter((t) => !t.startsWith(`${i}-`));
														
 
															-      else {
														
 
															-        const t = l.filter((e) => {
														
 
															-          const r = e.label.match(/^\d+(-\d+)*(?=-)/);
														
 
															-          return r && r[0] === i;
														
 
															-        });
														
 
															-        for (let e = 1; e < o.length; e++) {
														
 
															-          const r = o.slice(0, e).join("-"), c = l.find((g) => g.label === `${r}-heading`);
														
 
															-          c && t.unshift(c);
														
 
															-        }
														
 
															-        d.push(t);
														
 
															-      }
														
 
															-    } else {
														
 
															-      let s = l.filter((n) => n.label.startsWith(i));
														
 
															-      const o = i.split("-");
														
 
															-      for (let n = 1; n < o.length; n++) {
														
 
															-        const t = o.slice(0, n).join("-"), e = l.find((r) => r.label === `${t}-heading`);
														
 
															-        e && (s = [e, ...s]);
														
 
															-      }
														
 
															-      d.push(s);
														
 
															-    }
														
 
															-    a.shift();
														
 
															-  }
														
 
															-  return d;
														
 
															-}
														
 
															-async function F(l, p, h = 800) {
														
 
															-  if (M(p).encode(l).length <= h)
														
 
															-    return [l];
														
 
															-  const d = await P(l, p), a = [];
														
 
															-  return j(d, h).forEach((f) => {
														
 
															-    if (f.reduce((o, n) => o + n.tokenCount, 0) <= h) {
														
 
															-      const o = f.map((n, t) => {
														
 
															-        const e = f[t + 1];
														
 
															-        return e ? n.type === "heading" && e.type === "heading" ? `${n.text}
														
 
															-` : `${n.text}
														
 
															-
														
 
															-` : n.text;
														
 
															-      }).join("");
														
 
															-      a.push(o);
														
 
															-    } else {
														
 
															-      const o = f.filter((t) => t.type === "heading"), n = o.map((t) => t.text).join(`
														
 
															-`);
														
 
															-      for (const t of f)
														
 
															-        if (t.label.includes("content")) {
														
 
															-          const e = o.reduce((c, g) => c + g.tokenCount, 0) + t.tokenCount, r = o.reduce((c, g) => c + g.tokenCount, 0);
														
 
															-          if (r > h / 2)
														
 
															-            throw new Error(
														
 
															-              `Heading token count is too large. Heading token count: ${r}, allowed maximum: ${Math.ceil(h / 2)}`
														
 
															-            );
														
 
															-          if (e > h) {
														
 
															-            const c = o.reduce((u, x) => u + x.tokenCount, 0), g = h - c, m = t.text.length, b = t.tokenCount, k = Math.floor(g / b * m), $ = [];
														
 
															-            for (let u = 0; u < t.text.length; u += k)
														
 
															-              $.push(t.text.slice(u, u + k));
														
 
															-            $.forEach((u) => {
														
 
															-              const x = n ? `${n}
														
 
															-
														
 
															-${u}` : `${u}`;
														
 
															-              a.push(x);
														
 
															-            });
														
 
															-          } else {
														
 
															-            const c = `${n}
														
 
															-
														
 
															-${t.text}`;
														
 
															-            a.push(c);
														
 
															-          }
														
 
															-        }
														
 
															-    }
														
 
															-  }), a;
														
 
															-}
														
 
															-export {
														
 
															-  F as splitMarkdownIntoChunks
														
 
															-};
														
 
															-//# sourceMappingURL=markdown-token-splitter.js.map
														
--- a/packages/markdown-splitter/dist/services/markdown-token-splitter.js.map
+++ b/packages/markdown-splitter/dist/services/markdown-token-splitter.js.map
--- a/packages/markdown-splitter/dist/services/markdown-token-splitter.spec.d.ts
+++ b/packages/markdown-splitter/dist/services/markdown-token-splitter.spec.d.ts
@@ -1 +0,0 @@
 
															-export {};
	`@@ -1 +0,0 @@`
	`-{"version":3,"file":"index.cjs","sources":[],"sourcesContent":[],"names":[],"mappings":""}`
	`@@ -1,2 +0,0 @@`
	`-export * from './services/markdown-splitter';`
	`-export * from './services/markdown-token-splitter';`
	`@@ -1 +0,0 @@`
	`-{"version":3,"file":"index.js","sources":[],"sourcesContent":[],"names":[],"mappings":";;"}`