// index.spec.ts
  import { encodingForModel, type TiktokenModel } from 'js-tiktoken';

  import type { MarkdownFragment } from '~/index';
  import { splitMarkdownIntoChunks, splitMarkdownIntoFragments } from '~/index';

  /** Model name passed to the splitters under test; it also selects the tokenizer used for expectations. */
  const MODEL: TiktokenModel = 'gpt-4';

  /** Tokenizer for `MODEL`; the specs use it to compute expected `tokenCount` values and to measure chunk sizes. */
  const encoder = encodingForModel(MODEL);
  /**
   * Specs for `splitMarkdownIntoFragments`.
   *
   * Each case feeds a markdown string to the splitter and compares the returned
   * fragments (`label`, `type`, `text`, `tokenCount`) with a hand-written list.
   */
  describe('splitMarkdownIntoFragments', () => {

    /**
     * Builds one expected fragment. `tokenCount` is always derived from `text`
     * with the shared encoder, so the two values cannot drift apart.
     */
    const fragment = (label: string, type: MarkdownFragment['type'], text: string): MarkdownFragment => ({
      label,
      type,
      text,
      tokenCount: encoder.encode(text).length,
    });

    /** Expected fragment of type `heading`. */
    const heading = (label: string, text: string): MarkdownFragment => fragment(label, 'heading', text);

    /** Expected fragment of type `paragraph`. */
    const paragraph = (label: string, text: string): MarkdownFragment => fragment(label, 'paragraph', text);

    test('handles empty markdown string', async() => {
      const expected: MarkdownFragment[] = [];

      const result = await splitMarkdownIntoFragments('', MODEL);

      expect(result).toEqual(expected);
    });

    test('handles markdown with only content and no headers', async() => {
      // The blank line separates the two paragraphs that `expected` asserts.
      const markdown = `This is some content without any headers.
It spans multiple lines.

Another paragraph.
`;

      const expected: MarkdownFragment[] = [
        paragraph('0-content-1', 'This is some content without any headers.\nIt spans multiple lines.'),
        paragraph('0-content-2', 'Another paragraph.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('handles markdown starting with a header', async() => {
      const markdown = `
# Header 1
Content under header 1.
## Header 1.1
Content under header 1.1.
# Header 2
Content under header 2.
`;

      const expected: MarkdownFragment[] = [
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Content under header 1.'),
        heading('1-1-heading', '## Header 1.1'),
        paragraph('1-1-content-1', 'Content under header 1.1.'),
        heading('2-heading', '# Header 2'),
        paragraph('2-content-1', 'Content under header 2.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('handles markdown with non-consecutive heading levels', async() => {
      const markdown = `
Introduction without a header.
# Chapter 1
Content of chapter 1.
### Section 1.1.1
Content of section 1.1.1.
## Section 1.2
Content of section 1.2.
# Chapter 2
Content of chapter 2.
## Section 2.1
Content of section 2.1.
`;

      const expected: MarkdownFragment[] = [
        paragraph('0-content-1', 'Introduction without a header.'),
        heading('1-heading', '# Chapter 1'),
        paragraph('1-content-1', 'Content of chapter 1.'),
        heading('1-1-1-heading', '### Section 1.1.1'),
        paragraph('1-1-1-content-1', 'Content of section 1.1.1.'),
        heading('1-2-heading', '## Section 1.2'),
        paragraph('1-2-content-1', 'Content of section 1.2.'),
        heading('2-heading', '# Chapter 2'),
        paragraph('2-content-1', 'Content of chapter 2.'),
        heading('2-1-heading', '## Section 2.1'),
        paragraph('2-1-content-1', 'Content of section 2.1.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('handles markdown with skipped heading levels', async() => {
      const markdown = `
# Header 1
Content under header 1.
#### Header 1.1.1.1
Content under header 1.1.1.1.
## Header 1.2
Content under header 1.2.
# Header 2
Content under header 2.
`;

      const expected: MarkdownFragment[] = [
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Content under header 1.'),
        heading('1-1-1-1-heading', '#### Header 1.1.1.1'),
        paragraph('1-1-1-1-content-1', 'Content under header 1.1.1.1.'),
        heading('1-2-heading', '## Header 1.2'),
        paragraph('1-2-content-1', 'Content under header 1.2.'),
        heading('2-heading', '# Header 2'),
        paragraph('2-content-1', 'Content under header 2.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('handles malformed headings', async() => {
      // A level-4 heading directly under a level-1 heading.
      const markdown = `
# Header 1
Content under header 1.
#### Header 1.1.1.1
Content under header 1.1.1.1.
`;

      const expected: MarkdownFragment[] = [
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Content under header 1.'),
        heading('1-1-1-1-heading', '#### Header 1.1.1.1'),
        paragraph('1-1-1-1-content-1', 'Content under header 1.1.1.1.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('handles multiple content blocks before any headers', async() => {
      // The blank line separates the two paragraphs that `expected` asserts.
      const markdown = `
This is the first paragraph without a header.

This is the second paragraph without a header.
# Header 1
Content under header 1.
`;

      const expected: MarkdownFragment[] = [
        paragraph('0-content-1', 'This is the first paragraph without a header.'),
        paragraph('0-content-2', 'This is the second paragraph without a header.'),
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Content under header 1.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('handles markdown with only headers and no content', async() => {
      const markdown = `
# Header 1
## Header 1.1
### Header 1.1.1
`;

      const expected: MarkdownFragment[] = [
        heading('1-heading', '# Header 1'),
        heading('1-1-heading', '## Header 1.1'),
        heading('1-1-1-heading', '### Header 1.1.1'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('handles markdown with mixed content and headers', async() => {
      // The two consecutive lines under "Header 1.1" are expected back as ONE paragraph.
      const markdown = `
# Header 1
Content under header 1.
## Header 1.1
Content under header 1.1.
Another piece of content.
# Header 2
Content under header 2.
`;

      const expected: MarkdownFragment[] = [
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Content under header 1.'),
        heading('1-1-heading', '## Header 1.1'),
        paragraph('1-1-content-1', 'Content under header 1.1.\nAnother piece of content.'),
        heading('2-heading', '# Header 2'),
        paragraph('2-content-1', 'Content under header 2.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('preserves list indentation and reduces unnecessary line breaks', async() => {
      // The sub-item is indented exactly as the expected list text below shows it.
      // NOTE(review): the indent width is taken from the `expected` string — confirm
      // it matches the width intended for nested items.
      const markdown = `
# Header 1
Content under header 1.
- Item 1
 - Subitem 1
- Item 2
# Header 2
Content under header 2.
`;

      const expected: MarkdownFragment[] = [
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Content under header 1.'),
        fragment('1-content-2', 'list', '- Item 1\n - Subitem 1\n- Item 2'),
        heading('2-heading', '# Header 2'),
        paragraph('2-content-1', 'Content under header 2.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('code blocks containing # are not treated as headings', async() => {
      const markdown = `
# Header 1
Some introductory content.
\`\`\`
# This is a comment with a # symbol
Some code line
\`\`\`
Additional content.
# Header 2
Content under header 2.
`;

      const expected: MarkdownFragment[] = [
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Some introductory content.'),
        // The fenced block comes back whole, fences included, as a single `code` fragment.
        fragment('1-content-2', 'code', '```\n# This is a comment with a # symbol\nSome code line\n```'),
        paragraph('1-content-3', 'Additional content.'),
        heading('2-heading', '# Header 2'),
        paragraph('2-content-1', 'Content under header 2.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

    test('frontmatter is processed and labeled correctly', async() => {
      const markdown = `---
title: Test Document
author: John Doe
---
# Header 1
Some introductory content.
`;

      // The frontmatter fragment is expected as pretty-printed JSON (2-space indent), not raw YAML.
      const frontmatterText = JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2);

      const expected: MarkdownFragment[] = [
        fragment('frontmatter', 'yaml', frontmatterText),
        heading('1-heading', '# Header 1'),
        paragraph('1-content-1', 'Some introductory content.'),
      ];

      const result = await splitMarkdownIntoFragments(markdown, MODEL);

      expect(result).toEqual(expected);
    });

  });
  /**
   * Specs for `splitMarkdownIntoChunks`.
   *
   * They check the size of the returned chunks, the presence of top-level
   * headings in them, the error raised for an oversized heading, and the
   * pass-through of a document that already fits in one chunk.
   */
  describe('splitMarkdownIntoChunks', () => {

    /** Token budget passed to the splitter in every spec. */
    const MAX_TOKEN = 800;

    /** The size specs accept chunks of up to 110% of the budget. */
    const TOLERANCE = 1.1;

    /** Filler sentence repeated 100 times, used to push sections over the budget. */
    const repeatedText = 'This is a repeated sentence for testing purposes. '.repeat(100);

    /** Large document with frontmatter and nested headings, shared by the first two specs. */
    const markdown = `---
title: Test Document
author: John Doe
---
${repeatedText}
# Header 1
This is the first paragraph under header 1. It contains some text to simulate a longer paragraph for testing.
This paragraph is extended with more content to ensure proper chunking behavior.${repeatedText}
## Header 1-1
This is the first paragraph under header 1-1. The text is a bit longer to ensure proper chunking. More text follows.
### Header 1-1-1
This is the first paragraph under header 1-1-1. The content is nested deeper,
making sure that the chunking algorithm works properly with multiple levels of headers.
This is another paragraph under header 1-1-1, continuing the content at this deeper level.
#### Header 1-1-1-1
Now we have reached the fourth level of headers. The text here should also be properly chunked and grouped with its parent headers.
This is another paragraph under header 1-1-1-1. It should be grouped with the correct higher-level headers.
# Header 2
Here is some content under header 2. This section should also be sufficiently long to ensure that the token count threshold is reached in the test.
## Header 2-1
${repeatedText}
${repeatedText}
Another sub-header under header 2 with text for testing chunking behavior. This is a fairly lengthy paragraph as well.
We now have a fourth-level sub-header under header 2-1. This ensures that the chunking logic can handle deeply nested content.
### Header 2-1-1
Here is another paragraph under header 2-1-1. This paragraph is part of a more deeply nested section.
# Header 3
Continuing with more headers and content to make sure the markdown document is sufficiently large. This is a new header with more paragraphs under it.
### Header 3-1
This is a sub-header under header 3. The content here continues to grow, ensuring that the markdown is long enough to trigger multiple chunks.
#### Header 3-1-1
Here is a fourth-level sub-header under header 3-1. This paragraph is designed to create a larger markdown file for testing purposes.
`;

    /** Asserts that every chunk, re-encoded with the shared tokenizer, stays within budget × tolerance. */
    const expectChunksWithinLimit = (chunks: readonly string[]): void => {
      for (const chunk of chunks) {
        expect(encoder.encode(chunk).length).toBeLessThanOrEqual(MAX_TOKEN * TOLERANCE);
      }
    };

    test('Each chunk should not exceed the specified token count', async() => {
      const result = await splitMarkdownIntoChunks(markdown, MODEL, MAX_TOKEN);

      expectChunksWithinLimit(result);
    });

    test('Each chunk should include the relevant top-level header', async() => {
      const topLevelHeaders = ['# Header 1', '# Header 2', '# Header 3'];

      const result = await splitMarkdownIntoChunks(markdown, MODEL, MAX_TOKEN);

      for (const chunk of result) {
        // Substring check: '# Header 1' also matches inside '## Header 1-1'.
        const containsTopLevelHeader = topLevelHeaders.some(header => chunk.includes(header));
        // Chunks without any '# ' sequence are accepted as heading-free.
        const doesNotContainHash = !chunk.includes('# ');

        expect(containsTopLevelHeader || doesNotContainHash).toBe(true);
      }
    });

    test('Should throw an error if a header exceeds half of maxToken size with correct error message', async() => {
      const markdownWithLongHeader = `
# Short Header 1
This is the first paragraph under short header 1. It contains some text for testing purposes.
## ${repeatedText}
This is the first paragraph under the long header. It contains text to ensure that the header length check is triggered if the header is too long.
# Short Header 2
Another section with a shorter header, but enough content to ensure proper chunking.
`;

      // `rejects` makes the spec fail when the call resolves; a bare try/catch
      // would pass silently if no error were thrown at all.
      await expect(splitMarkdownIntoChunks(markdownWithLongHeader, MODEL, MAX_TOKEN))
        .rejects.toThrow('Heading token count is too large');
    });

    test('Should return the entire markdown as a single chunk if token count is less than or equal to maxToken', async() => {
      const markdownText = `
# Header 1
This is a short paragraph under header 1. It contains only a few sentences to ensure that the total token count remains under the maxToken limit.
`;

      const result = await splitMarkdownIntoChunks(markdownText, MODEL, MAX_TOKEN);

      // The input is expected back untouched, including its leading and trailing newlines.
      expect(result).toHaveLength(1);
      expect(result[0]).toBe(markdownText);
    });

    test('Each chunk should not exceed the specified token count when content precedes the first heading', async() => {
      const markdownWithContentBeforeHeading = `
This is a short paragraph
# Header 1
${repeatedText}
`;

      const result = await splitMarkdownIntoChunks(markdownWithContentBeforeHeading, MODEL, MAX_TOKEN);

      expectChunksWithinLimit(result);
    });

  });