// index.spec.ts — specs for splitMarkdownIntoChunks
  1. import type { Chunk } from '../src/services/markdown-splitter';
  2. import { splitMarkdownIntoChunks } from '../src/services/markdown-splitter';
  3. describe('splitMarkdownIntoChunks', () => {
  4. test('handles empty markdown string', () => {
  5. const markdown = '';
  6. const expected: Chunk[] = [];
  7. const result = splitMarkdownIntoChunks(markdown);
  8. expect(result).toEqual(expected);
  9. });
  10. test('handles markdown with only content and no headers', () => {
  11. const markdown = `This is some content without any headers.
  12. It spans multiple lines.
  13. Another paragraph.
  14. `;
  15. const expected: Chunk[] = [
  16. {
  17. label: '0-content',
  18. text: 'This is some content without any headers.\nIt spans multiple lines.\n\nAnother paragraph.',
  19. },
  20. ];
  21. const result = splitMarkdownIntoChunks(markdown);
  22. expect(result).toEqual(expected);
  23. });
  24. test('handles markdown starting with a header', () => {
  25. const markdown = `
  26. # Header 1
  27. Content under header 1.
  28. ## Header 1.1
  29. Content under header 1.1.
  30. # Header 2
  31. Content under header 2.
  32. `;
  33. const expected: Chunk[] = [
  34. { label: '1-heading', text: '# Header 1' },
  35. { label: '1-content', text: 'Content under header 1.' },
  36. { label: '1-1-heading', text: '## Header 1.1' },
  37. { label: '1-1-content', text: 'Content under header 1.1.' },
  38. { label: '2-heading', text: '# Header 2' },
  39. { label: '2-content', text: 'Content under header 2.' },
  40. ];
  41. const result = splitMarkdownIntoChunks(markdown);
  42. expect(result).toEqual(expected);
  43. });
  44. test('handles markdown with non-consecutive heading levels', () => {
  45. const markdown = `
  46. Introduction without a header.
  47. # Chapter 1
  48. Content of chapter 1.
  49. ### Section 1.1.1
  50. Content of section 1.1.1.
  51. ## Section 1.2
  52. Content of section 1.2.
  53. # Chapter 2
  54. Content of chapter 2.
  55. ## Section 2.1
  56. Content of section 2.1.
  57. `;
  58. const expected: Chunk[] = [
  59. {
  60. label: '0-content',
  61. text: 'Introduction without a header.',
  62. },
  63. {
  64. label: '1-heading',
  65. text: '# Chapter 1',
  66. },
  67. {
  68. label: '1-content',
  69. text: 'Content of chapter 1.',
  70. },
  71. {
  72. label: '1-1-1-heading',
  73. text: '### Section 1.1.1',
  74. },
  75. {
  76. label: '1-1-1-content',
  77. text: 'Content of section 1.1.1.',
  78. },
  79. {
  80. label: '1-2-heading',
  81. text: '## Section 1.2',
  82. },
  83. {
  84. label: '1-2-content',
  85. text: 'Content of section 1.2.',
  86. },
  87. {
  88. label: '2-heading',
  89. text: '# Chapter 2',
  90. },
  91. {
  92. label: '2-content',
  93. text: 'Content of chapter 2.',
  94. },
  95. {
  96. label: '2-1-heading',
  97. text: '## Section 2.1',
  98. },
  99. {
  100. label: '2-1-content',
  101. text: 'Content of section 2.1.',
  102. },
  103. ];
  104. const result = splitMarkdownIntoChunks(markdown);
  105. expect(result).toEqual(expected);
  106. });
  107. test('handles markdown with skipped heading levels', () => {
  108. const markdown = `
  109. # Header 1
  110. Content under header 1.
  111. #### Header 1.1.1.1
  112. Content under header 1.1.1.1.
  113. ## Header 1.2
  114. Content under header 1.2.
  115. # Header 2
  116. Content under header 2.
  117. `;
  118. const expected: Chunk[] = [
  119. { label: '1-heading', text: '# Header 1' },
  120. { label: '1-content', text: 'Content under header 1.' },
  121. { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
  122. { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
  123. { label: '1-2-heading', text: '## Header 1.2' },
  124. { label: '1-2-content', text: 'Content under header 1.2.' },
  125. { label: '2-heading', text: '# Header 2' },
  126. { label: '2-content', text: 'Content under header 2.' },
  127. ];
  128. const result = splitMarkdownIntoChunks(markdown);
  129. expect(result).toEqual(expected);
  130. });
  131. test('handles malformed headings', () => {
  132. const markdown = `
  133. # Header 1
  134. Content under header 1.
  135. #### Header 1.1.1.1
  136. Content under header 1.1.1.1.
  137. `;
  138. const expected: Chunk[] = [
  139. { label: '1-heading', text: '# Header 1' },
  140. { label: '1-content', text: 'Content under header 1.' },
  141. { label: '1-1-1-1-heading', text: '#### Header 1.1.1.1' },
  142. { label: '1-1-1-1-content', text: 'Content under header 1.1.1.1.' },
  143. ];
  144. const result = splitMarkdownIntoChunks(markdown);
  145. expect(result).toEqual(expected);
  146. });
  147. test('handles multiple content blocks before any headers', () => {
  148. const markdown = `
  149. This is the first paragraph without a header.
  150. This is the second paragraph without a header.
  151. # Header 1
  152. Content under header 1.
  153. `;
  154. const expected: Chunk[] = [
  155. {
  156. label: '0-content',
  157. text: 'This is the first paragraph without a header.\n\nThis is the second paragraph without a header.',
  158. },
  159. { label: '1-heading', text: '# Header 1' },
  160. { label: '1-content', text: 'Content under header 1.' },
  161. ];
  162. const result = splitMarkdownIntoChunks(markdown);
  163. expect(result).toEqual(expected);
  164. });
  165. test('handles markdown with only headers and no content', () => {
  166. const markdown = `
  167. # Header 1
  168. ## Header 1.1
  169. ### Header 1.1.1
  170. `;
  171. const expected: Chunk[] = [
  172. { label: '1-heading', text: '# Header 1' },
  173. { label: '1-1-heading', text: '## Header 1.1' },
  174. { label: '1-1-1-heading', text: '### Header 1.1.1' },
  175. ];
  176. const result = splitMarkdownIntoChunks(markdown);
  177. expect(result).toEqual(expected);
  178. });
  179. test('handles markdown with mixed content and headers', () => {
  180. const markdown = `
  181. # Header 1
  182. Content under header 1.
  183. ## Header 1.1
  184. Content under header 1.1.
  185. Another piece of content.
  186. # Header 2
  187. Content under header 2.
  188. `;
  189. const expected: Chunk[] = [
  190. { label: '1-heading', text: '# Header 1' },
  191. { label: '1-content', text: 'Content under header 1.' },
  192. { label: '1-1-heading', text: '## Header 1.1' },
  193. { label: '1-1-content', text: 'Content under header 1.1.\nAnother piece of content.' },
  194. { label: '2-heading', text: '# Header 2' },
  195. { label: '2-content', text: 'Content under header 2.' },
  196. ];
  197. const result = splitMarkdownIntoChunks(markdown);
  198. expect(result).toEqual(expected);
  199. });
  200. test('preserves list indentation and reduces unnecessary line breaks', () => {
  201. const markdown = `
  202. # Header 1
  203. Content under header 1.
  204. - Item 1
  205. - Subitem 1
  206. - Item 2
  207. # Header 2
  208. Content under header 2.
  209. `;
  210. const expected: Chunk[] = [
  211. { label: '1-heading', text: '# Header 1' },
  212. { label: '1-content', text: 'Content under header 1.\n\n- Item 1\n - Subitem 1\n- Item 2' },
  213. { label: '2-heading', text: '# Header 2' },
  214. { label: '2-content', text: 'Content under header 2.' },
  215. ];
  216. const result = splitMarkdownIntoChunks(markdown);
  217. expect(result).toEqual(expected);
  218. });
  219. });