index.spec.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. import { encodingForModel, type TiktokenModel } from 'js-tiktoken';
  2. import type { Chunk } from '../src/services/markdown-splitter';
  3. import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
  4. import { splitMarkdownByTokens } from '../src/services/markdown-token-splitter';
  5. describe('splitMarkdownIntoChunks', () => {
  // An empty document must produce an empty chunk list.
  test('handles empty markdown string', async() => {
    const chunks = await splitMarkdownIntoChunks('');

    const noChunks: Chunk[] = [];
    expect(chunks).toEqual(noChunks);
  });
  // Content that appears without any heading is labeled under section "0"
  // ('0-content-N'), one chunk per paragraph.
  test('handles markdown with only content and no headers', async() => {
    // The blank line between the second and third lines is what separates the
    // two paragraphs. Without it, all three lines would be adjacent and could
    // not be split into the two chunks asserted below (adjacent lines are
    // expected to stay in one paragraph, joined by '\n').
    const markdown = [
      'This is some content without any headers.',
      'It spans multiple lines.',
      '',
      'Another paragraph.',
      '',
    ].join('\n');

    const expected: Chunk[] = [
      {
        label: '0-content-1',
        type: 'paragraph',
        text: 'This is some content without any headers.\nIt spans multiple lines.',
      },
      {
        label: '0-content-2',
        type: 'paragraph',
        text: 'Another paragraph.',
      },
    ];

    const result = await splitMarkdownIntoChunks(markdown);
    expect(result).toEqual(expected);
  });
  // Headings are labeled by their position in the outline ('1', '1-1', '2'),
  // and each paragraph is labeled '<section>-content-<n>'.
  test('handles markdown starting with a header', async() => {
    const input = [
      '',
      '# Header 1',
      'Content under header 1.',
      '## Header 1.1',
      'Content under header 1.1.',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const want: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.',
      },
      {
        label: '1-1-heading',
        type: 'heading',
        text: '## Header 1.1',
      },
      {
        label: '1-1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.1.',
      },
      {
        label: '2-heading',
        type: 'heading',
        text: '# Header 2',
      },
      {
        label: '2-content-1',
        type: 'paragraph',
        text: 'Content under header 2.',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // Covers a document that opens with heading-less content (section "0") and
  // then jumps from '#' straight to '###' before returning to '##'.
  test('handles markdown with non-consecutive heading levels', async() => {
    const input = [
      '',
      'Introduction without a header.',
      '# Chapter 1',
      'Content of chapter 1.',
      '### Section 1.1.1',
      'Content of section 1.1.1.',
      '## Section 1.2',
      'Content of section 1.2.',
      '# Chapter 2',
      'Content of chapter 2.',
      '## Section 2.1',
      'Content of section 2.1.',
      '',
    ].join('\n');

    const want: Chunk[] = [
      { label: '0-content-1', type: 'paragraph', text: 'Introduction without a header.' },
      { label: '1-heading', type: 'heading', text: '# Chapter 1' },
      { label: '1-content-1', type: 'paragraph', text: 'Content of chapter 1.' },
      // The '###' heading directly under '#' is labeled three levels deep.
      { label: '1-1-1-heading', type: 'heading', text: '### Section 1.1.1' },
      { label: '1-1-1-content-1', type: 'paragraph', text: 'Content of section 1.1.1.' },
      { label: '1-2-heading', type: 'heading', text: '## Section 1.2' },
      { label: '1-2-content-1', type: 'paragraph', text: 'Content of section 1.2.' },
      { label: '2-heading', type: 'heading', text: '# Chapter 2' },
      { label: '2-content-1', type: 'paragraph', text: 'Content of chapter 2.' },
      { label: '2-1-heading', type: 'heading', text: '## Section 2.1' },
      { label: '2-1-content-1', type: 'paragraph', text: 'Content of section 2.1.' },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // A '####' heading placed directly under '#' is labeled four levels deep
  // ('1-1-1-1'), and the following '##' heading becomes '1-2'.
  test('handles markdown with skipped heading levels', async() => {
    const input = [
      '',
      '# Header 1',
      'Content under header 1.',
      '#### Header 1.1.1.1',
      'Content under header 1.1.1.1.',
      '## Header 1.2',
      'Content under header 1.2.',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const want: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.',
      },
      {
        label: '1-1-1-1-heading',
        type: 'heading',
        text: '#### Header 1.1.1.1',
      },
      {
        label: '1-1-1-1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.1.1.1.',
      },
      {
        label: '1-2-heading',
        type: 'heading',
        text: '## Header 1.2',
      },
      {
        label: '1-2-content-1',
        type: 'paragraph',
        text: 'Content under header 1.2.',
      },
      {
        label: '2-heading',
        type: 'heading',
        text: '# Header 2',
      },
      {
        label: '2-content-1',
        type: 'paragraph',
        text: 'Content under header 2.',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // The "malformed" structure here is a '####' heading that directly follows
  // a '#' heading; the document ends right after its content.
  test('handles malformed headings', async() => {
    const input = [
      '',
      '# Header 1',
      'Content under header 1.',
      '#### Header 1.1.1.1',
      'Content under header 1.1.1.1.',
      '',
    ].join('\n');

    const want: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.',
      },
      {
        label: '1-1-1-1-heading',
        type: 'heading',
        text: '#### Header 1.1.1.1',
      },
      {
        label: '1-1-1-1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.1.1.1.',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // Several paragraphs that precede the first heading are numbered
  // consecutively under section "0" ('0-content-1', '0-content-2', ...).
  test('handles multiple content blocks before any headers', async() => {
    // The blank line between the two sentences makes them separate paragraphs.
    // Adjacent lines are expected to stay in a single paragraph (joined by
    // '\n'), so without the blank line the two chunks asserted below could not
    // be produced.
    const markdown = [
      '',
      'This is the first paragraph without a header.',
      '',
      'This is the second paragraph without a header.',
      '# Header 1',
      'Content under header 1.',
      '',
    ].join('\n');

    const expected: Chunk[] = [
      {
        label: '0-content-1',
        type: 'paragraph',
        text: 'This is the first paragraph without a header.',
      },
      {
        label: '0-content-2',
        type: 'paragraph',
        text: 'This is the second paragraph without a header.',
      },
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.',
      },
    ];

    const result = await splitMarkdownIntoChunks(markdown);
    expect(result).toEqual(expected);
  });
  // Headings with no body text still yield one 'heading' chunk each.
  test('handles markdown with only headers and no content', async() => {
    const input = [
      '',
      '# Header 1',
      '## Header 1.1',
      '### Header 1.1.1',
      '',
    ].join('\n');

    const want: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-1-heading',
        type: 'heading',
        text: '## Header 1.1',
      },
      {
        label: '1-1-1-heading',
        type: 'heading',
        text: '### Header 1.1.1',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // Two adjacent content lines under the same heading are expected to form a
  // single paragraph chunk whose lines are joined by '\n'.
  test('handles markdown with mixed content and headers', async() => {
    const input = [
      '',
      '# Header 1',
      'Content under header 1.',
      '## Header 1.1',
      'Content under header 1.1.',
      'Another piece of content.',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const want: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.',
      },
      {
        label: '1-1-heading',
        type: 'heading',
        text: '## Header 1.1',
      },
      {
        label: '1-1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.1.\nAnother piece of content.',
      },
      {
        label: '2-heading',
        type: 'heading',
        text: '# Header 2',
      },
      {
        label: '2-content-1',
        type: 'paragraph',
        text: 'Content under header 2.',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // A list that follows a paragraph becomes its own 'list' chunk, numbered in
  // the same '<section>-content-<n>' sequence as the paragraph before it.
  test('preserves list indentation and reduces unnecessary line breaks', async() => {
    // NOTE(review): the sub-item line below carries no leading indentation,
    // while the expected list text has one leading space before '- Subitem 1'.
    // Confirm the intended indentation against the original fixture.
    const input = [
      '',
      '# Header 1',
      'Content under header 1.',
      '- Item 1',
      '- Subitem 1',
      '- Item 2',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const want: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.',
      },
      {
        label: '1-content-2',
        type: 'list',
        text: '- Item 1\n - Subitem 1\n- Item 2',
      },
      {
        label: '2-heading',
        type: 'heading',
        text: '# Header 2',
      },
      {
        label: '2-content-1',
        type: 'paragraph',
        text: 'Content under header 2.',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // A line starting with '#' inside a fenced code block must stay part of the
  // 'code' chunk and must not open a new heading section.
  test('code blocks containing # are not treated as headings', async() => {
    const fence = '```';
    const input = [
      '',
      '# Header 1',
      'Some introductory content.',
      fence,
      '# This is a comment with a # symbol',
      'Some code line',
      fence,
      'Additional content.',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    const want: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Some introductory content.',
      },
      {
        label: '1-content-2',
        type: 'code',
        text: '```\n# This is a comment with a # symbol\nSome code line\n```',
      },
      {
        label: '1-content-3',
        type: 'paragraph',
        text: 'Additional content.',
      },
      {
        label: '2-heading',
        type: 'heading',
        text: '# Header 2',
      },
      {
        label: '2-content-1',
        type: 'paragraph',
        text: 'Content under header 2.',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  // Leading YAML frontmatter is expected as a single chunk labeled
  // 'frontmatter' (type 'yaml') whose text is the parsed data re-serialized
  // as 2-space-indented JSON.
  test('frontmatter is processed and labeled correctly', async() => {
    const input = [
      '---',
      'title: Test Document',
      'author: John Doe',
      '---',
      '# Header 1',
      'Some introductory content.',
      '',
    ].join('\n');

    const frontmatterJson = JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2);
    const want: Chunk[] = [
      {
        label: 'frontmatter',
        type: 'yaml',
        text: frontmatterJson,
      },
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Some introductory content.',
      },
    ];

    expect(await splitMarkdownIntoChunks(input)).toEqual(want);
  });
  292. });
  293. describe('splitMarkdownByTokens', () => {
  // With a token budget (200) larger than any chunk in this small document,
  // the result is expected to contain one chunk per heading/paragraph/list,
  // each carrying the token count of its text for the given model.
  test('preserves list indentation and reduces unnecessary line breaks', async() => {
    const model: TiktokenModel = 'gpt-4';
    // NOTE(review): the sub-item line below carries no leading indentation,
    // while the expected list text has one leading space before '- Subitem 1'.
    // Confirm the intended indentation against the original fixture.
    const markdown = [
      '',
      '# Header 1',
      'Content under header 1.',
      '- Item 1',
      '- Subitem 1',
      '- Item 2',
      '# Header 2',
      'Content under header 2.',
      '',
    ].join('\n');

    // Expected token counts are computed with the same model's encoder so the
    // test does not hard-code tokenizer-specific numbers.
    const encoder = encodingForModel(model);
    const expected: Chunk[] = [
      {
        label: '1-heading',
        type: 'heading',
        text: '# Header 1',
        tokenCount: encoder.encode('# Header 1').length,
      },
      {
        label: '1-content-1',
        type: 'paragraph',
        text: 'Content under header 1.',
        tokenCount: encoder.encode('Content under header 1.').length,
      },
      {
        label: '1-content-2',
        type: 'list',
        text: '- Item 1\n - Subitem 1\n- Item 2',
        tokenCount: encoder.encode('- Item 1\n - Subitem 1\n- Item 2').length,
      },
      {
        label: '2-heading',
        type: 'heading',
        text: '# Header 2',
        tokenCount: encoder.encode('# Header 2').length,
      },
      {
        label: '2-content-1',
        type: 'paragraph',
        text: 'Content under header 2.',
        tokenCount: encoder.encode('Content under header 2.').length,
      },
    ];

    const result = await splitMarkdownByTokens(markdown, model, 200);

    // Compare each chunk individually to check for correctness.
    // The length check comes first so a count mismatch is reported clearly;
    // the per-index comparison then pinpoints which chunk differs. Previously
    // only the length was asserted, leaving label/type/text/tokenCount unchecked.
    expect(result.length).toEqual(expected.length);
    expected.forEach((expectedChunk, index) => {
      expect(result[index]).toEqual(expectedChunk);
    });
  });
  // Feeds paragraphs that are long relative to the 200-token budget and checks
  // that every returned chunk respects the budget and reports a token count
  // matching what the model's encoder computes for its text.
  test('long text is split into chunks within maxTokens limit', async() => {
    const model: TiktokenModel = 'gpt-4';
    const maxTokens = 200;
    const encoder = encodingForModel(model);

    // One sentence repeated 50 times forms each long paragraph.
    const longParagraph = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. '.repeat(50);
    const input = [
      '',
      '# Header 1',
      longParagraph,
      '## Header 1.1',
      longParagraph,
      '### Header 1.1.1',
      longParagraph,
      '# Header 2',
      longParagraph,
      '',
    ].join('\n');

    const chunks = await splitMarkdownByTokens(input, model, maxTokens);

    // No chunk may exceed the token budget.
    for (const piece of chunks) {
      expect(piece.tokenCount).toBeLessThanOrEqual(maxTokens);
    }

    // Guard against the loops passing vacuously on an empty result.
    expect(chunks.length).toBeGreaterThan(0);

    // The reported count must agree with the encoder for the requested model.
    for (const piece of chunks) {
      const recomputed = encoder.encode(piece.text).length;
      expect(piece.tokenCount).toEqual(recomputed);
    }
  });
  371. });