markdown-token-splitter.cjs 2.1 KB

12345678910
  "use strict";

  /**
   * Token-budgeted Markdown chunking.
   *
   * Exposes `splitMarkdownIntoChunks`, which returns the input unchanged when it
   * already fits the token budget and otherwise regroups the fragments returned
   * by `./markdown-splitter.cjs` into section-sized chunks.
   */

  // Tag the exports object so it stringifies as "[object Module]".
  Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });

  const jsTiktoken = require("js-tiktoken");
  const markdownSplitter = require("./markdown-splitter.cjs");

  /**
   * Shape of a fragment as it is used in this module (only these fields are read).
   *
   * @typedef {object} Fragment
   * @property {string} label       Either "frontmatter" or a dash-separated numeric
   *                                section id followed by a suffix, e.g. "1-2-heading".
   *                                Labels containing "content" are treated as body text.
   * @property {string} type        Compared against "heading" only.
   * @property {string} text        Markdown text of the fragment.
   * @property {number} tokenCount  Token count of `text`.
   */

  // Separator written after a heading that is directly followed by another
  // heading, and between the stacked headings that prefix a split chunk.
  const HEADING_SEPARATOR = "\n";

  // Separator written after any other fragment, and between the heading prefix
  // and the body of a split chunk.
  // NOTE(review): the listing this file was recovered from numbers ten lines but
  // shows only seven, so blank lines inside multi-line template literals may
  // have been lost. One or more of these separators may really be "\n\n" -
  // confirm against the published build before relying on exact whitespace.
  const FRAGMENT_SEPARATOR = "\n";

  /** Sums `tokenCount` over a list of fragments. */
  const sumTokens = (fragments) =>
    fragments.reduce((total, fragment) => total + fragment.tokenCount, 0);

  /**
   * Extracts the section id a fragment label starts with.
   *
   * @param {string} label
   * @returns {string} "frontmatter" or the leading numeric id such as "1-2".
   * @throws {TypeError} When the label has no leading numeric section id.
   */
  function sectionIdOf(label) {
    if (label === "frontmatter") {
      return label;
    }
    const match = label.match(/^\d+(?:-\d+)*/);
    if (match === null) {
      // Fail with a readable message instead of "cannot read properties of null".
      throw new TypeError(`Unrecognized fragment label: ${label}`);
    }
    return match[0];
  }

  /**
   * Tells whether `label` belongs to `sectionId` or to one of its subsections.
   * The "-" boundary matters: a plain prefix test would let section "1" swallow
   * the fragments of sections "10", "11-2", ...
   */
  const isInSection = (label, sectionId) =>
    label === sectionId || label.startsWith(`${sectionId}-`);

  /** Tells whether `label` belongs to `sectionId` itself (not to a subsection). */
  function isOwnFragment(label, sectionId) {
    const match = label.match(/^\d+(-\d+)*(?=-)/);
    return match !== null && match[0] === sectionId;
  }

  /**
   * Collects the heading fragments of every ancestor of `sectionId`,
   * outermost first so the heading trail reads in document order.
   *
   * @param {Fragment[]} fragments
   * @param {string} sectionId
   * @returns {Fragment[]}
   */
  function ancestorHeadings(fragments, sectionId) {
    const parts = sectionId.split("-");
    const headings = [];
    for (let depth = 1; depth < parts.length; depth++) {
      const ancestorId = parts.slice(0, depth).join("-");
      const heading = fragments.find(({ label }) => label === `${ancestorId}-heading`);
      if (heading) {
        headings.push(heading);
      }
    }
    return headings;
  }

  /**
   * Groups fragments by section, each group prefixed with its ancestor headings.
   *
   * A section that has subsections is emitted together with its whole subtree
   * when that fits in `maxTokens` (the subsections are then skipped). Otherwise
   * only the fragments that belong directly to the section are emitted and each
   * subsection is handled on its own turn. A section without subsections is
   * always emitted as-is, whatever its size.
   *
   * @param {Fragment[]} fragments
   * @param {number} maxTokens
   * @returns {Fragment[][]}
   */
  function groupFragmentsBySection(fragments, maxTokens) {
    const sectionIds = [...new Set(fragments.map(({ label }) => sectionIdOf(label)))];
    const groups = [];
    let pending = [...sectionIds];

    while (pending.length > 0) {
      const sectionId = pending[0];
      const context = ancestorHeadings(fragments, sectionId);
      const subtree = fragments.filter(({ label }) => isInSection(label, sectionId));
      const whole = [...context, ...subtree];
      const hasSubsections = sectionIds.some((id) => id.startsWith(`${sectionId}-`));

      if (!hasSubsections || sumTokens(whole) <= maxTokens) {
        groups.push(whole);
        // The subtree is covered by this group; do not emit its sections again.
        pending = pending.filter((id) => !id.startsWith(`${sectionId}-`));
      } else {
        const own = fragments.filter(({ label }) => isOwnFragment(label, sectionId));
        groups.push([...context, ...own]);
      }
      pending.shift();
    }
    return groups;
  }

  /** Concatenates the texts of a group that fits the budget into one chunk. */
  function joinFragments(group) {
    return group
      .map((fragment, index) => {
        const next = group[index + 1];
        if (!next) {
          return fragment.text;
        }
        const separator =
          fragment.type === "heading" && next.type === "heading"
            ? HEADING_SEPARATOR
            : FRAGMENT_SEPARATOR;
        return `${fragment.text}${separator}`;
      })
      .join("");
  }

  /**
   * Turns a group that exceeds the budget into several chunks: one per content
   * fragment, each prefixed with the group's headings. A content fragment that
   * is still too large is cut into slices whose length is estimated from its
   * characters-per-token ratio.
   *
   * @param {Fragment[]} group
   * @param {number} maxTokens
   * @returns {string[]}
   * @throws {Error} When the headings alone use more than half of `maxTokens`
   *                 (only checked once a content fragment is encountered).
   */
  function splitOversizedGroup(group, maxTokens) {
    const headings = group.filter(({ type }) => type === "heading");
    const headingText = headings.map(({ text }) => text).join(HEADING_SEPARATOR);
    const headingTokens = sumTokens(headings);
    // Without headings there is nothing to prefix, so avoid a leading separator.
    const withHeadings = (body) =>
      headingText ? `${headingText}${FRAGMENT_SEPARATOR}${body}` : body;

    const chunks = [];
    for (const fragment of group) {
      if (!fragment.label.includes("content")) {
        continue;
      }
      if (headingTokens > maxTokens / 2) {
        throw new Error(`Heading token count is too large. Heading token count: ${headingTokens}, allowed maximum: ${Math.ceil(maxTokens / 2)}`);
      }
      if (headingTokens + fragment.tokenCount <= maxTokens) {
        chunks.push(withHeadings(fragment.text));
        continue;
      }

      const { text, tokenCount } = fragment;
      const budget = maxTokens - headingTokens;
      const estimated = Math.floor((budget / tokenCount) * text.length);
      // A slice length below 1 would never advance the loop below.
      const sliceLength = estimated >= 1 ? estimated : 1;
      for (let start = 0; start < text.length; start += sliceLength) {
        chunks.push(withHeadings(text.slice(start, start + sliceLength)));
      }
    }
    return chunks;
  }

  /**
   * Splits Markdown into chunks that aim to stay within a token budget.
   *
   * @param {string} markdown       Markdown source.
   * @param {string} model          Model name handed to `encodingForModel` and to
   *                                `splitMarkdownIntoFragments`.
   * @param {number} [maxTokens=800] Token budget per chunk.
   * @returns {Promise<string[]>}   `[markdown]` when the input already fits,
   *                                otherwise the chunks in section order.
   * @throws {Error} When a section's headings exceed half of the budget.
   * @throws {TypeError} When a fragment label has no recognisable section id.
   */
  async function splitMarkdownIntoChunks(markdown, model, maxTokens = 800) {
    const encoder = jsTiktoken.encodingForModel(model);
    if (encoder.encode(markdown).length <= maxTokens) {
      return [markdown];
    }

    const fragments = await markdownSplitter.splitMarkdownIntoFragments(markdown, model);
    return groupFragmentsBySection(fragments, maxTokens).flatMap((group) =>
      sumTokens(group) <= maxTokens ? [joinFragments(group)] : splitOversizedGroup(group, maxTokens),
    );
  }

  exports.splitMarkdownIntoChunks = splitMarkdownIntoChunks;
  //# sourceMappingURL=markdown-token-splitter.cjs.map