// markdown-splitter.spec.ts
  1. import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
  2. import { splitMarkdownIntoFragments, type MarkdownFragment } from './markdown-splitter';
  /** Tokenizer model shared by every test; it is also the model passed to the splitter under test. */
  const MODEL: TiktokenModel = 'gpt-4';

  /** js-tiktoken encoder for MODEL, used to compute the expected `tokenCount` of each fragment. */
  const encoder = encodingForModel(MODEL);
  5. describe('splitMarkdownIntoFragments', () => {
  // An empty input string is expected to produce an empty fragment list.
  test('handles empty markdown string', async() => {
    const fragments = await splitMarkdownIntoFragments('', MODEL);

    expect(fragments).toEqual([]);
  });
  // Content that appears before any heading is expected under the "0" section,
  // one fragment per paragraph ("0-content-1", "0-content-2", ...).
  test('handles markdown with only content and no headers', async() => {
    // The blank line is what separates the two paragraphs: consecutive non-blank
    // lines belong to the same Markdown paragraph, so without it the expectation
    // of two fragments could not hold.
    const markdown = `This is some content without any headers.
It spans multiple lines.

Another paragraph.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['0-content-1', 'paragraph', 'This is some content without any headers.\nIt spans multiple lines.'],
      ['0-content-2', 'paragraph', 'Another paragraph.'],
    ] as const;

    // tokenCount is derived from the fragment text so the two can never drift apart.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // A document that opens with a heading: expected labels mirror the heading
  // nesting ("1", "1-1", "2") and each paragraph is labelled "<section>-content-<n>".
  test('handles markdown starting with a header', async() => {
    const markdown = `
# Header 1
Content under header 1.
## Header 1.1
Content under header 1.1.
# Header 2
Content under header 2.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Content under header 1.'],
      ['1-1-heading', 'heading', '## Header 1.1'],
      ['1-1-content-1', 'paragraph', 'Content under header 1.1.'],
      ['2-heading', 'heading', '# Header 2'],
      ['2-content-1', 'paragraph', 'Content under header 2.'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // Heading levels jump from 1 to 3 and back to 2. The level-3 heading is expected
  // to be labelled "1-1-1" and the following level-2 heading "1-2". The leading
  // paragraph before any heading is expected under section "0".
  test('handles markdown with non-consecutive heading levels', async() => {
    const markdown = `
Introduction without a header.
# Chapter 1
Content of chapter 1.
### Section 1.1.1
Content of section 1.1.1.
## Section 1.2
Content of section 1.2.
# Chapter 2
Content of chapter 2.
## Section 2.1
Content of section 2.1.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['0-content-1', 'paragraph', 'Introduction without a header.'],
      ['1-heading', 'heading', '# Chapter 1'],
      ['1-content-1', 'paragraph', 'Content of chapter 1.'],
      ['1-1-1-heading', 'heading', '### Section 1.1.1'],
      ['1-1-1-content-1', 'paragraph', 'Content of section 1.1.1.'],
      ['1-2-heading', 'heading', '## Section 1.2'],
      ['1-2-content-1', 'paragraph', 'Content of section 1.2.'],
      ['2-heading', 'heading', '# Chapter 2'],
      ['2-content-1', 'paragraph', 'Content of chapter 2.'],
      ['2-1-heading', 'heading', '## Section 2.1'],
      ['2-1-content-1', 'paragraph', 'Content of section 2.1.'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // A level-4 heading directly under a level-1 heading is expected to be labelled
  // "1-1-1-1"; the level-2 heading that follows is expected to be labelled "1-2".
  test('handles markdown with skipped heading levels', async() => {
    const markdown = `
# Header 1
Content under header 1.
#### Header 1.1.1.1
Content under header 1.1.1.1.
## Header 1.2
Content under header 1.2.
# Header 2
Content under header 2.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Content under header 1.'],
      ['1-1-1-1-heading', 'heading', '#### Header 1.1.1.1'],
      ['1-1-1-1-content-1', 'paragraph', 'Content under header 1.1.1.1.'],
      ['1-2-heading', 'heading', '## Header 1.2'],
      ['1-2-content-1', 'paragraph', 'Content under header 1.2.'],
      ['2-heading', 'heading', '# Header 2'],
      ['2-content-1', 'paragraph', 'Content under header 2.'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // A level-1 heading followed directly by a level-4 heading, with nothing after it.
  // NOTE(review): both headings in this input are syntactically well-formed, so this
  // case overlaps with the skipped-levels test. If "malformed" is meant literally,
  // an input with an actually malformed heading should be added.
  test('handles malformed headings', async() => {
    const markdown = `
# Header 1
Content under header 1.
#### Header 1.1.1.1
Content under header 1.1.1.1.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Content under header 1.'],
      ['1-1-1-1-heading', 'heading', '#### Header 1.1.1.1'],
      ['1-1-1-1-content-1', 'paragraph', 'Content under header 1.1.1.1.'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // Several paragraphs ahead of the first heading are each expected as their own
  // fragment under section "0", numbered in order of appearance.
  test('handles multiple content blocks before any headers', async() => {
    // The blank line between the two sentences is required: consecutive non-blank
    // lines form a single Markdown paragraph, and this test expects two fragments.
    const markdown = `
This is the first paragraph without a header.

This is the second paragraph without a header.
# Header 1
Content under header 1.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['0-content-1', 'paragraph', 'This is the first paragraph without a header.'],
      ['0-content-2', 'paragraph', 'This is the second paragraph without a header.'],
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Content under header 1.'],
    ] as const;

    // tokenCount is derived from the fragment text so the two can never drift apart.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // Headings with no body text: only heading fragments are expected, no content fragments.
  test('handles markdown with only headers and no content', async() => {
    const markdown = `
# Header 1
## Header 1.1
### Header 1.1.1
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['1-heading', 'heading', '# Header 1'],
      ['1-1-heading', 'heading', '## Header 1.1'],
      ['1-1-1-heading', 'heading', '### Header 1.1.1'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // Two adjacent content lines under "## Header 1.1" (no blank line between them)
  // are expected to come back as ONE paragraph fragment joined by "\n".
  test('handles markdown with mixed content and headers', async() => {
    const markdown = `
# Header 1
Content under header 1.
## Header 1.1
Content under header 1.1.
Another piece of content.
# Header 2
Content under header 2.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Content under header 1.'],
      ['1-1-heading', 'heading', '## Header 1.1'],
      ['1-1-content-1', 'paragraph', 'Content under header 1.1.\nAnother piece of content.'],
      ['2-heading', 'heading', '# Header 2'],
      ['2-content-1', 'paragraph', 'Content under header 2.'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // A nested bullet list under a heading is expected as a single "list" fragment
  // whose text keeps the nesting of the sub-item.
  test('preserves list indentation and reduces unnecessary line breaks', async() => {
    // The sub-item must be indented to the parent item's content column (two
    // spaces for "- ") to be a nested item; with less indentation it is a sibling.
    // NOTE(review): the two-space indent is used in both the input and the expected
    // text below — confirm it matches the splitter's actual list serialization.
    const markdown = `
# Header 1
Content under header 1.
- Item 1
  - Subitem 1
- Item 2
# Header 2
Content under header 2.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Content under header 1.'],
      ['1-content-2', 'list', '- Item 1\n  - Subitem 1\n- Item 2'],
      ['2-heading', 'heading', '# Header 2'],
      ['2-content-1', 'paragraph', 'Content under header 2.'],
    ] as const;

    // tokenCount is derived from the fragment text so the two can never drift apart.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // A "#" line inside a fenced code block must stay part of the "code" fragment:
  // the expected output contains exactly two headings ("1" and "2"), and the fence
  // is returned verbatim, including its backtick delimiters.
  test('code blocks containing # are not treated as headings', async() => {
    const markdown = `
# Header 1
Some introductory content.
\`\`\`
# This is a comment with a # symbol
Some code line
\`\`\`
Additional content.
# Header 2
Content under header 2.
`;

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Some introductory content.'],
      ['1-content-2', 'code', '```\n# This is a comment with a # symbol\nSome code line\n```'],
      ['1-content-3', 'paragraph', 'Additional content.'],
      ['2-heading', 'heading', '# Header 2'],
      ['2-content-1', 'paragraph', 'Content under header 2.'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  // Leading YAML frontmatter is expected as a single fragment labelled "frontmatter"
  // with type "yaml", whose text is the parsed data re-serialized as 2-space-indented JSON.
  test('frontmatter is processed and labeled correctly', async() => {
    const markdown = `---
title: Test Document
author: John Doe
---
# Header 1
Some introductory content.
`;

    // Expected text of the frontmatter fragment; computed once and reused for the token count.
    const frontmatterJson = JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2);

    // [label, type, text] of every expected fragment, in document order.
    const rows = [
      ['frontmatter', 'yaml', frontmatterJson],
      ['1-heading', 'heading', '# Header 1'],
      ['1-content-1', 'paragraph', 'Some introductory content.'],
    ] as const;

    // tokenCount is always the encoder's token count of the fragment text.
    const expected: MarkdownFragment[] = rows.map(([label, type, text]) => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    }));

    expect(await splitMarkdownIntoFragments(markdown, MODEL)).toEqual(expected);
  });
  514. });