index.spec.ts 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. import type { Chunk } from '../src/services/markdown-splitter';
  2. import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
  /**
   * Test suite for `splitMarkdownIntoChunks`, imported from
   * '../src/services/markdown-splitter'.
   *
   * Every case has the same shape: build a markdown string, await
   * `splitMarkdownIntoChunks(markdown)`, and compare the result with an
   * expected `Chunk[]` via `toEqual`. Each expected chunk carries `label`,
   * `type` and `text`.
   *
   * Label patterns used by the expectations below:
   *  - headings: hyphen-joined section numbers followed by `-heading`
   *  - body blocks: the enclosing heading's numbers followed by `-content-N`
   *  - body blocks before the first heading: `0-content-N`
   *  - frontmatter: the fixed label `frontmatter`
   *
   * NOTE(review): every line in this copy starts with an "N." counter and the
   * fixtures show no blank lines or nested indentation. This looks like an
   * artifact of how the file was copied - confirm against the original spec.
   */
  3. describe('splitMarkdownIntoChunks', () => {
  // An empty input string is expected to yield an empty chunk list.
  4. test('handles empty markdown string', async() => {
  5. const markdown = '';
  6. const expected: Chunk[] = [];
  7. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  8. expect(result).toEqual(expected);
  9. });
  // With no heading at all, body blocks are expected under the '0-content-N' labels.
  // NOTE(review): the two expected paragraphs presumably depend on a blank line
  // before "Another paragraph." that is not visible here - confirm.
  10. test('handles markdown with only content and no headers', async() => {
  11. const markdown = `This is some content without any headers.
  12. It spans multiple lines.
  13. Another paragraph.
  14. `;
  15. const expected: Chunk[] = [
  16. {
  17. label: '0-content-1',
  18. type: 'paragraph',
  19. text: 'This is some content without any headers.\nIt spans multiple lines.',
  20. },
  21. {
  22. label: '0-content-2',
  23. type: 'paragraph',
  24. text: 'Another paragraph.',
  25. },
  26. ];
  27. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  28. expect(result).toEqual(expected);
  29. });
  // Two top-level sections with one nested '##': the expected labels follow the
  // heading hierarchy (1, 1-1, 2), each with its own '-content-1' paragraph.
  30. test('handles markdown starting with a header', async() => {
  31. const markdown = `
  32. # Header 1
  33. Content under header 1.
  34. ## Header 1.1
  35. Content under header 1.1.
  36. # Header 2
  37. Content under header 2.
  38. `;
  39. const expected: Chunk[] = [
  40. { label: '1-heading', type: 'heading', text: '# Header 1' },
  41. { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
  42. { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
  43. { label: '1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.' },
  44. { label: '2-heading', type: 'heading', text: '# Header 2' },
  45. { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
  46. ];
  47. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  48. expect(result).toEqual(expected);
  49. });
  // Text ahead of the first heading is expected as '0-content-1'. A '###' placed
  // directly under '#' is expected as '1-1-1', and the '##' after it as '1-2'.
  50. test('handles markdown with non-consecutive heading levels', async() => {
  51. const markdown = `
  52. Introduction without a header.
  53. # Chapter 1
  54. Content of chapter 1.
  55. ### Section 1.1.1
  56. Content of section 1.1.1.
  57. ## Section 1.2
  58. Content of section 1.2.
  59. # Chapter 2
  60. Content of chapter 2.
  61. ## Section 2.1
  62. Content of section 2.1.
  63. `;
  64. const expected: Chunk[] = [
  65. {
  66. label: '0-content-1',
  67. type: 'paragraph',
  68. text: 'Introduction without a header.',
  69. },
  70. {
  71. label: '1-heading',
  72. type: 'heading',
  73. text: '# Chapter 1',
  74. },
  75. {
  76. label: '1-content-1',
  77. type: 'paragraph',
  78. text: 'Content of chapter 1.',
  79. },
  80. {
  81. label: '1-1-1-heading',
  82. type: 'heading',
  83. text: '### Section 1.1.1',
  84. },
  85. {
  86. label: '1-1-1-content-1',
  87. type: 'paragraph',
  88. text: 'Content of section 1.1.1.',
  89. },
  90. {
  91. label: '1-2-heading',
  92. type: 'heading',
  93. text: '## Section 1.2',
  94. },
  95. {
  96. label: '1-2-content-1',
  97. type: 'paragraph',
  98. text: 'Content of section 1.2.',
  99. },
  100. {
  101. label: '2-heading',
  102. type: 'heading',
  103. text: '# Chapter 2',
  104. },
  105. {
  106. label: '2-content-1',
  107. type: 'paragraph',
  108. text: 'Content of chapter 2.',
  109. },
  110. {
  111. label: '2-1-heading',
  112. type: 'heading',
  113. text: '## Section 2.1',
  114. },
  115. {
  116. label: '2-1-content-1',
  117. type: 'paragraph',
  118. text: 'Content of section 2.1.',
  119. },
  120. ];
  121. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  122. expect(result).toEqual(expected);
  123. });
  // A '####' placed directly under '#' is expected to be labelled '1-1-1-1';
  // the later '##' is expected as '1-2' and the next '#' as '2'.
  124. test('handles markdown with skipped heading levels', async() => {
  125. const markdown = `
  126. # Header 1
  127. Content under header 1.
  128. #### Header 1.1.1.1
  129. Content under header 1.1.1.1.
  130. ## Header 1.2
  131. Content under header 1.2.
  132. # Header 2
  133. Content under header 2.
  134. `;
  135. const expected: Chunk[] = [
  136. { label: '1-heading', type: 'heading', text: '# Header 1' },
  137. { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
  138. { label: '1-1-1-1-heading', type: 'heading', text: '#### Header 1.1.1.1' },
  139. { label: '1-1-1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.1.1.' },
  140. { label: '1-2-heading', type: 'heading', text: '## Header 1.2' },
  141. { label: '1-2-content-1', type: 'paragraph', text: 'Content under header 1.2.' },
  142. { label: '2-heading', type: 'heading', text: '# Header 2' },
  143. { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
  144. ];
  145. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  146. expect(result).toEqual(expected);
  147. });
  // NOTE(review): this fixture contains only well-formed headings and is a
  // shorter version of the skipped-levels case above, so the title may not
  // describe what is actually exercised - confirm the intended input.
  148. test('handles malformed headings', async() => {
  149. const markdown = `
  150. # Header 1
  151. Content under header 1.
  152. #### Header 1.1.1.1
  153. Content under header 1.1.1.1.
  154. `;
  155. const expected: Chunk[] = [
  156. { label: '1-heading', type: 'heading', text: '# Header 1' },
  157. { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
  158. { label: '1-1-1-1-heading', type: 'heading', text: '#### Header 1.1.1.1' },
  159. { label: '1-1-1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.1.1.' },
  160. ];
  161. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  162. expect(result).toEqual(expected);
  163. });
  // Two text blocks ahead of the first heading are expected as '0-content-1'
  // and '0-content-2'.
  // NOTE(review): the split into two paragraphs presumably depends on a blank
  // line between the two sentences that is not visible here - confirm.
  164. test('handles multiple content blocks before any headers', async() => {
  165. const markdown = `
  166. This is the first paragraph without a header.
  167. This is the second paragraph without a header.
  168. # Header 1
  169. Content under header 1.
  170. `;
  171. const expected: Chunk[] = [
  172. {
  173. label: '0-content-1',
  174. type: 'paragraph',
  175. text: 'This is the first paragraph without a header.',
  176. },
  177. {
  178. label: '0-content-2',
  179. type: 'paragraph',
  180. text: 'This is the second paragraph without a header.',
  181. },
  182. {
  183. label: '1-heading',
  184. type: 'heading',
  185. text: '# Header 1',
  186. },
  187. {
  188. label: '1-content-1',
  189. type: 'paragraph',
  190. text: 'Content under header 1.',
  191. },
  192. ];
  193. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  194. expect(result).toEqual(expected);
  195. });
  // Headings with no body text are expected to produce heading chunks only.
  196. test('handles markdown with only headers and no content', async() => {
  197. const markdown = `
  198. # Header 1
  199. ## Header 1.1
  200. ### Header 1.1.1
  201. `;
  202. const expected: Chunk[] = [
  203. { label: '1-heading', type: 'heading', text: '# Header 1' },
  204. { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
  205. { label: '1-1-1-heading', type: 'heading', text: '### Header 1.1.1' },
  206. ];
  207. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  208. expect(result).toEqual(expected);
  209. });
  // Two adjacent lines under '## Header 1.1' are expected to stay in a single
  // paragraph chunk, joined by a newline.
  210. test('handles markdown with mixed content and headers', async() => {
  211. const markdown = `
  212. # Header 1
  213. Content under header 1.
  214. ## Header 1.1
  215. Content under header 1.1.
  216. Another piece of content.
  217. # Header 2
  218. Content under header 2.
  219. `;
  220. const expected: Chunk[] = [
  221. { label: '1-heading', type: 'heading', text: '# Header 1' },
  222. { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
  223. { label: '1-1-heading', type: 'heading', text: '## Header 1.1' },
  224. { label: '1-1-content-1', type: 'paragraph', text: 'Content under header 1.1.\nAnother piece of content.' },
  225. { label: '2-heading', type: 'heading', text: '# Header 2' },
  226. { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
  227. ];
  228. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  229. expect(result).toEqual(expected);
  230. });
  // A list that follows a paragraph is expected as its own 'list' chunk
  // ('1-content-2') with the sub-item still indented.
  // NOTE(review): the expected text indents '- Subitem 1'; the fixture's own
  // indentation and blank lines are not visible here - confirm.
  231. test('preserves list indentation and reduces unnecessary line breaks', async() => {
  232. const markdown = `
  233. # Header 1
  234. Content under header 1.
  235. - Item 1
  236. - Subitem 1
  237. - Item 2
  238. # Header 2
  239. Content under header 2.
  240. `;
  241. const expected: Chunk[] = [
  242. { label: '1-heading', type: 'heading', text: '# Header 1' },
  243. { label: '1-content-1', type: 'paragraph', text: 'Content under header 1.' },
  244. { label: '1-content-2', type: 'list', text: '- Item 1\n - Subitem 1\n- Item 2' },
  245. { label: '2-heading', type: 'heading', text: '# Header 2' },
  246. { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
  247. ];
  248. const result = await splitMarkdownIntoChunks(markdown); // Await the result
  249. expect(result).toEqual(expected);
  250. });
  // A fenced block is expected as one 'code' chunk that includes its fences;
  // the '#' line inside the fence must not be reported as a heading.
  251. test('code blocks containing # are not treated as headings', async() => {
  252. const markdown = `
  253. # Header 1
  254. Some introductory content.
  255. \`\`\`
  256. # This is a comment with a # symbol
  257. Some code line
  258. \`\`\`
  259. Additional content.
  260. # Header 2
  261. Content under header 2.
  262. `;
  263. const expected: Chunk[] = [
  264. { label: '1-heading', type: 'heading', text: '# Header 1' },
  265. { label: '1-content-1', type: 'paragraph', text: 'Some introductory content.' },
  266. { label: '1-content-2', type: 'code', text: '```\n# This is a comment with a # symbol\nSome code line\n```' },
  267. { label: '1-content-3', type: 'paragraph', text: 'Additional content.' },
  268. { label: '2-heading', type: 'heading', text: '# Header 2' },
  269. { label: '2-content-1', type: 'paragraph', text: 'Content under header 2.' },
  270. ];
  271. const result = await splitMarkdownIntoChunks(markdown);
  272. expect(result).toEqual(expected);
  273. });
  // Leading '---' frontmatter is expected as a 'yaml' chunk labelled
  // 'frontmatter'; its text is the title/author pairs serialized with
  // JSON.stringify using a 2-space indent.
  274. test('frontmatter is processed and labeled correctly', async() => {
  275. const markdown = `---
  276. title: Test Document
  277. author: John Doe
  278. ---
  279. # Header 1
  280. Some introductory content.
  281. `;
  282. const expected: Chunk[] = [
  283. { label: 'frontmatter', type: 'yaml', text: JSON.stringify({ title: 'Test Document', author: 'John Doe' }, null, 2) },
  284. { label: '1-heading', type: 'heading', text: '# Header 1' },
  285. { label: '1-content-1', type: 'paragraph', text: 'Some introductory content.' },
  286. ];
  287. const result = await splitMarkdownIntoChunks(markdown);
  288. expect(result).toEqual(expected);
  289. });
  290. });