wiki
/
weseek__growi
зеркало из https://github.com/weseek/growi


			
				
					
						
						
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
							import { encodingForModel as M } from "js-tiktoken";
import { splitMarkdownIntoFragments as P } from "./markdown-splitter.js";
/**
 * Groups markdown fragments by their section prefix so that each group can
 * later be rendered as one chunk.
 *
 * A fragment's prefix is either the literal "frontmatter" or the leading
 * dash-separated run of numbers in its label (e.g. "1-2" for
 * "1-2-content-1"). For every prefix, in first-seen order:
 *  - if no deeper prefix exists below it, its fragments form one group;
 *  - if deeper prefixes exist and the whole subtree fits into `p` tokens, the
 *    subtree forms one group and the deeper prefixes are not visited;
 *  - otherwise only the fragments that sit directly at that prefix form a
 *    group, and the deeper prefixes are visited on their own later.
 * Every group is preceded by the existing "<ancestor>-heading" fragments of
 * its prefix, ordered from the outermost ancestor to the innermost.
 *
 * @param {Array<{label: string, tokenCount: number}>} l - Fragments in document order.
 * @param {number} p - Maximum summed `tokenCount` for merging a whole subtree.
 * @returns {Array<Array<object>>} Fragment groups (the same fragment objects, not copies).
 * @throws {TypeError} If a label is neither "frontmatter" nor starts with a number.
 */
function j(l, p) {
  const fragments = l;
  const maxToken = p;

  const prefixOf = (label) => {
    if (label === "frontmatter") {
      return "frontmatter";
    }
    const match = label.match(/^\d+(?:-\d+)*/);
    if (match === null) {
      throw new TypeError(`Unexpected fragment label: ${label}`);
    }
    return match[0];
  };

  // Segment-aware containment: "1" covers "1" and "1-…", but not "10" or "10-…".
  const isAtOrBelow = (value, prefix) => value === prefix || value.startsWith(`${prefix}-`);

  const sumTokens = (items) => items.reduce((total, item) => total + item.tokenCount, 0);

  // Existing heading fragments of every ancestor prefix, outermost first.
  const ancestorHeadings = (prefix) => {
    const parts = prefix.split("-");
    const headings = [];
    for (let depth = 1; depth < parts.length; depth++) {
      const headingLabel = `${parts.slice(0, depth).join("-")}-heading`;
      const heading = fragments.find((fragment) => fragment.label === headingLabel);
      if (heading) {
        headings.push(heading);
      }
    }
    return headings;
  };

  const uniquePrefixes = [...new Set(fragments.map(({ label }) => prefixOf(label)))];
  const groups = [];
  let remaining = [...uniquePrefixes];

  while (remaining.length > 0) {
    const prefix = remaining[0];
    const childMarker = `${prefix}-`;
    const ancestors = ancestorHeadings(prefix);
    const subtree = [
      ...ancestors,
      ...fragments.filter((fragment) => isAtOrBelow(fragment.label, prefix)),
    ];
    const hasDeeperPrefix = uniquePrefixes.some((other) => other.startsWith(childMarker));

    if (!hasDeeperPrefix) {
      groups.push(subtree);
    } else if (sumTokens(subtree) <= maxToken) {
      groups.push(subtree);
      // The subtree is fully covered, so its deeper prefixes must not be visited again.
      remaining = remaining.filter((other) => !other.startsWith(childMarker));
    } else {
      // Too large as a whole: keep only fragments whose numeric prefix is exactly
      // this one; deeper prefixes get their own groups in later iterations.
      const ownFragments = fragments.filter((fragment) => {
        const match = fragment.label.match(/^\d+(-\d+)*(?=-)/);
        return match !== null && match[0] === prefix;
      });
      groups.push([...ancestors, ...ownFragments]);
    }
    remaining.shift();
  }

  return groups;
}
/**
 * Cuts `text` into consecutive slices of about `size` UTF-16 code units.
 * A cut that would land between the two halves of a surrogate pair is moved
 * by one code unit so that no slice starts or ends with a lone surrogate.
 *
 * @param {string} text - Text to slice.
 * @param {number} size - Target slice length in code units; must be >= 1.
 * @returns {string[]} Slices whose concatenation equals `text`.
 */
function sliceTextBySize(text, size) {
  const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;
  const slices = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + size, text.length);
    const cutsPair = end < text.length
      && isHighSurrogate(text.charCodeAt(end - 1))
      && isLowSurrogate(text.charCodeAt(end));
    if (cutsPair) {
      // Prefer a shorter slice; grow instead only if shrinking would leave it empty.
      end = end - 1 > start ? end - 1 : end + 1;
    }
    slices.push(text.slice(start, end));
    start = end;
  }
  return slices;
}

/**
 * Splits a markdown document into text chunks that aim to stay within a
 * token budget.
 *
 * If the whole document encodes to at most `h` tokens it is returned as a
 * single chunk. Otherwise the document is split into fragments, the fragments
 * are grouped by section, and each group is rendered:
 *  - a group within the budget becomes one chunk (consecutive headings are
 *    separated by one newline, everything else by a blank line);
 *  - in a group over the budget, every fragment whose label contains
 *    "content" becomes its own chunk prefixed with the group's headings. A
 *    fragment that still does not fit is cut into slices sized from its
 *    characters-per-token ratio, so slice token counts are estimates.
 *
 * @param {string} l - Markdown text to split.
 * @param {string} p - Model name handed to the tokenizer and the fragment splitter.
 * @param {number} [h=800] - Token budget per chunk.
 * @returns {Promise<string[]>} Chunks in document order.
 * @throws {Error} If the headings of an oversized group alone use more than
 *   half of the budget and the group contains a content fragment.
 */
async function F(l, p, h = 800) {
  const markdownText = l;
  const model = p;
  const maxToken = h;

  if (M(model).encode(markdownText).length <= maxToken) {
    return [markdownText];
  }

  // Fragments are read below through `text`, `type`, `label` and `tokenCount`.
  const fragments = await P(markdownText, model);
  const chunks = [];

  const sumTokens = (items) => items.reduce((total, item) => total + item.tokenCount, 0);
  // Avoids a leading blank line when the group has no headings.
  const prependHeadings = (headingText, body) => (headingText ? `${headingText}\n\n${body}` : `${body}`);

  for (const group of j(fragments, maxToken)) {
    if (sumTokens(group) <= maxToken) {
      const rendered = group.map((fragment, index) => {
        const next = group[index + 1];
        if (!next) {
          return fragment.text;
        }
        const separator = fragment.type === "heading" && next.type === "heading" ? "\n" : "\n\n";
        return `${fragment.text}${separator}`;
      }).join("");
      chunks.push(rendered);
      continue;
    }

    const headings = group.filter((fragment) => fragment.type === "heading");
    const headingText = headings.map((fragment) => fragment.text).join("\n");
    const headingTokens = sumTokens(headings);

    for (const fragment of group) {
      if (!fragment.label.includes("content")) {
        continue;
      }
      // Checked per content fragment so that a group without content never throws.
      if (headingTokens > maxToken / 2) {
        throw new Error(
          `Heading token count is too large. Heading token count: ${headingTokens}, allowed maximum: ${Math.ceil(maxToken / 2)}`
        );
      }
      if (headingTokens + fragment.tokenCount > maxToken) {
        const availableTokens = maxToken - headingTokens;
        const estimatedSize = Math.floor(availableTokens / fragment.tokenCount * fragment.text.length);
        // A size below 1 (or a non-finite one) would never advance through the text.
        const sliceSize = Number.isFinite(estimatedSize) && estimatedSize >= 1 ? estimatedSize : 1;
        for (const slice of sliceTextBySize(fragment.text, sliceSize)) {
          chunks.push(prependHeadings(headingText, slice));
        }
      } else {
        chunks.push(prependHeadings(headingText, fragment.text));
      }
    }
  }

  return chunks;
}
export {
  F as splitMarkdownIntoChunks
};
//# sourceMappingURL=markdown-token-splitter.js.map