markdown-splitter.js 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. import { encodingForModel as C } from "js-tiktoken";
  2. import F from "js-yaml";
  3. import m from "remark-frontmatter";
  4. import p from "remark-gfm";
  5. import M from "remark-parse";
  6. import S from "remark-stringify";
  7. import { unified as u } from "unified";
  /**
   * Advance a hierarchical heading counter to a heading of depth `n` and
   * return the resulting dash-joined section label (for example "2-1").
   *
   * The counter array is mutated in place so that consecutive calls keep
   * numbering headings relative to the ones seen before.
   *
   * @param {number[]} t - Running counters, one entry per open heading level
   *   (index 0 holds the depth-1 counter). Mutated by this call.
   * @param {number} n - Depth of the heading being entered; expected to be >= 1.
   * @returns {string} The counters joined with "-".
   */
  function $(t, n) {
    if (n > t.length) {
      // Going deeper: every newly opened level (including skipped ones) starts at 1.
      while (t.length < n) {
        t.push(1);
      }
    } else {
      // Same level or shallower: discard counters of deeper levels first...
      if (n !== t.length) {
        t.splice(n);
      }
      // ...then move on to the next sibling at this level.
      t[n - 1]++;
    }
    return t.join("-");
  }
  /**
   * Split a Markdown document into labelled fragments, one per top-level
   * node, and record the token count of each fragment's text.
   *
   * Fragment labels:
   *   - "frontmatter" for a YAML frontmatter node (text is the parsed YAML
   *     re-serialised as pretty-printed JSON);
   *   - "<section>-heading" for headings, where <section> is the dash-joined
   *     heading counter produced by `$` (e.g. "1-2-heading");
   *   - "<section>-content-<k>" for the k-th non-empty node under a section,
   *     with section "0" used for content that precedes the first heading.
   *
   * @param {string} t - Markdown source. Non-string or blank input yields [].
   * @param {string} n - Model name handed to js-tiktoken's `encodingForModel`.
   * @returns {Promise<Array<{label: string, type: string, text: string, tokenCount: number}>>}
   *   Fragments in document order.
   */
  async function G(t, n) {
    const fragments = [];
    if (typeof t !== "string" || t.trim() === "") {
      return fragments;
    }

    const headingCounters = [];
    // Number of content fragments emitted so far, keyed by section label.
    const contentCounts = new Map();
    // Label of the most recent heading; "" until the first heading is seen.
    let section = "";

    const tokenizer = C(n);
    const countTokens = (text) => tokenizer.encode(text).length;

    const parser = u().use(M).use(m, ["yaml"]).use(p);
    const stringifyOptions = {
      // Set list bullet to hyphen
      bullet: "-",
      // Use hyphen for horizontal rules
      rule: "-",
    };
    const serializer = u().use(m, ["yaml"]).use(p).use(S, stringifyOptions);

    const tree = parser.parse(t);
    for (const node of tree.children) {
      if (node.type === "yaml") {
        // NOTE(review): js-yaml v4 `load` uses a safe schema by default; if the
        // project pins v3, untrusted input should go through `safeLoad` — confirm.
        const data = F.load(node.value);
        const text = JSON.stringify(data, null, 2);
        // Empty frontmatter parses to `undefined`, for which JSON.stringify
        // also returns `undefined`; there is no text to tokenize, so skip it
        // instead of passing a non-string to the tokenizer.
        if (text === undefined) {
          continue;
        }
        fragments.push({
          label: "frontmatter",
          type: "yaml",
          text,
          tokenCount: countTokens(text),
        });
      } else if (node.type === "heading") {
        section = $(headingCounters, node.depth);
        const text = serializer.stringify(node).trim();
        fragments.push({
          label: `${section}-heading`,
          type: node.type,
          text,
          tokenCount: countTokens(text),
        });
      } else {
        const text = serializer.stringify(node).trim();
        // Nodes that serialise to nothing produce no fragment.
        if (text !== "") {
          const sectionKey = section || "0";
          const ordinal = (contentCounts.get(sectionKey) ?? 0) + 1;
          contentCounts.set(sectionKey, ordinal);
          fragments.push({
            label: `${sectionKey}-content-${ordinal}`,
            type: node.type,
            text,
            tokenCount: countTokens(text),
          });
        }
      }
    }
    return fragments;
  }
  62. export {
  63. G as splitMarkdownIntoFragments
  64. };
  65. //# sourceMappingURL=markdown-splitter.js.map