markdown-token-splitter.js 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. import { encodingForModel as M } from "js-tiktoken";
  2. import { splitMarkdownIntoFragments as P } from "./markdown-splitter.js";
  /**
   * Groups markdown fragments into section-based chunks that try to stay
   * within a token budget.
   *
   * Every fragment label is either the literal "frontmatter" or starts with a
   * dash-separated numeric section key ("1", "1-2", "1-2-3", ...) followed by a
   * suffix such as "-heading". Sections are visited in first-appearance order:
   *
   * - A section with nested sections whose whole subtree (plus ancestor
   *   headings) fits within `p` tokens is emitted as one group, and its
   *   descendants are not emitted again.
   * - A section with nested sections that does not fit is emitted with only
   *   its own fragments (plus ancestor headings); descendants are handled on
   *   their own later.
   * - A section without nested sections is emitted as-is (plus ancestor
   *   headings), even if it exceeds the budget; the caller deals with that.
   *
   * Ancestor headings are looked up by the label `<ancestorKey>-heading` and
   * placed in outline order (outermost first) at the front of each group.
   *
   * @param {Array<{label: string, tokenCount: number}>} l - Fragments in document order.
   * @param {number} p - Maximum token count for a merged subtree.
   * @returns {Array<Array<object>>} Groups of fragments (same object references as in `l`).
   * @throws {TypeError} If a label is neither "frontmatter" nor starts with a numeric section key.
   */
  function j(l, p) {
    const sectionKeyOf = (label) => {
      if (label === "frontmatter") {
        return "frontmatter";
      }
      const match = label.match(/^\d+(?:-\d+)*/);
      if (!match) {
        // Fail loudly instead of dereferencing a null match result.
        throw new TypeError(`Unrecognized fragment label: ${label}`);
      }
      return match[0];
    };

    // Boundary-aware containment: "10" must NOT count as being inside "1".
    const isWithin = (key, root) => key === root || key.startsWith(`${root}-`);
    const sumTokens = (fragments) =>
      fragments.reduce((total, fragment) => total + fragment.tokenCount, 0);

    const keyed = l.map((fragment) => ({
      fragment,
      key: sectionKeyOf(fragment.label),
    }));
    const sectionKeys = [...new Set(keyed.map(({ key }) => key))];

    // First fragment per label, so heading lookups do not rescan the list.
    const firstByLabel = new Map();
    for (const fragment of l) {
      if (!firstByLabel.has(fragment.label)) {
        firstByLabel.set(fragment.label, fragment);
      }
    }

    // Headings of all proper ancestors of `key`, outermost first.
    const ancestorHeadings = (key) => {
      const parts = key.split("-");
      const headings = [];
      for (let depth = 1; depth < parts.length; depth++) {
        const heading = firstByLabel.get(
          `${parts.slice(0, depth).join("-")}-heading`
        );
        if (heading) {
          headings.push(heading);
        }
      }
      return headings;
    };

    const groups = [];
    const absorbed = new Set();

    for (const key of sectionKeys) {
      if (absorbed.has(key)) {
        continue;
      }

      const headings = ancestorHeadings(key);
      const descendants = sectionKeys.filter(
        (other) => other !== key && isWithin(other, key)
      );
      const subtree = [
        ...headings,
        ...keyed
          .filter((entry) => isWithin(entry.key, key))
          .map((entry) => entry.fragment),
      ];

      if (descendants.length === 0 || sumTokens(subtree) <= p) {
        groups.push(subtree);
        // Descendants were emitted as part of this group; skip them later.
        for (const descendant of descendants) {
          absorbed.add(descendant);
        }
        continue;
      }

      // Subtree is too large: emit only this section's own fragments and let
      // each nested section be grouped separately.
      const own = keyed
        .filter((entry) => entry.key === key)
        .map((entry) => entry.fragment);
      groups.push([...headings, ...own]);
    }

    return groups;
  }
  /**
   * Splits markdown text into chunks intended to stay within a token budget.
   *
   * If the whole text encodes to at most `h` tokens it is returned as a single
   * chunk. Otherwise the text is split into fragments (via the imported
   * markdown splitter), the fragments are grouped by section (via `j`), and
   * each group becomes one or more chunks:
   *
   * - A group within budget is emitted as its fragment texts joined by newlines.
   * - For a group over budget, every fragment whose label contains "content"
   *   is emitted separately, prefixed with the group's heading texts. A content
   *   fragment that is still too large is cut into slices whose length is
   *   estimated from its characters-per-token ratio.
   *
   * NOTE(review): slice sizes are a character-based estimate, so a slice is not
   * guaranteed to encode to <= `h` tokens, and a cut may land inside a
   * surrogate pair. In an over-budget group, fragments that are neither
   * headings nor labelled "content" are not emitted — confirm this is intended.
   *
   * @param {string} l - Markdown source text.
   * @param {string} p - Model identifier passed to js-tiktoken's `encodingForModel` and to the fragment splitter.
   * @param {number} [h=800] - Maximum tokens per chunk.
   * @returns {Promise<string[]>} The chunk texts in document order.
   * @throws {Error} If a group's headings alone use more than half of `h`.
   */
  async function F(l, p, h = 800) {
    if (M(p).encode(l).length <= h) {
      return [l];
    }

    const fragments = await P(l, p);
    const chunks = [];
    const sumTokens = (items) =>
      items.reduce((total, item) => total + item.tokenCount, 0);

    for (const group of j(fragments, h)) {
      if (sumTokens(group) <= h) {
        chunks.push(group.map((fragment) => fragment.text).join("\n"));
        continue;
      }

      const headings = group.filter((fragment) => fragment.type === "heading");
      const headingText = headings.map((fragment) => fragment.text).join("\n");
      const headingTokens = sumTokens(headings);
      // Only prepend the heading block (and its newline) when there is one.
      const withHeadings = (body) =>
        headingText ? `${headingText}\n${body}` : body;

      for (const fragment of group) {
        if (!fragment.label.includes("content")) {
          continue;
        }
        if (headingTokens > h / 2) {
          throw new Error(
            `Heading token count is too large. Heading token count: ${headingTokens}, allowed maximum: ${Math.ceil(h / 2)}`
          );
        }
        if (headingTokens + fragment.tokenCount <= h) {
          chunks.push(withHeadings(fragment.text));
          continue;
        }

        // Estimate how many characters fit in the remaining token budget.
        // Clamp to 1 so the slicing loop always advances.
        const budget = h - headingTokens;
        const sliceLength = Math.max(
          1,
          Math.floor((budget / fragment.tokenCount) * fragment.text.length)
        );
        for (let start = 0; start < fragment.text.length; start += sliceLength) {
          chunks.push(
            withHeadings(fragment.text.slice(start, start + sliceLength))
          );
        }
      }
    }

    return chunks;
  }
  82. export {
  83. F as splitMarkdownIntoChunks
  84. };
  85. //# sourceMappingURL=markdown-token-splitter.js.map