test_tests.py 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575
  1. import unittest
  2. from olmocr.bench.tests import (
  3. BaselineTest,
  4. BasePDFTest,
  5. MathTest,
  6. TableTest,
  7. TestChecked,
  8. TestType,
  9. TextOrderTest,
  10. TextPresenceTest,
  11. ValidationError,
  12. normalize_text,
  13. parse_html_tables,
  14. parse_markdown_tables,
  15. )
  16. class TestNormalizeText(unittest.TestCase):
  17. """Test the normalize_text function"""
  18. def test_whitespace_normalization(self):
  19. """Test that whitespace is properly normalized"""
  20. input_text = "This has\tmultiple spaces\nand\nnewlines"
  21. expected = "This has multiple spaces and newlines"
  22. self.assertEqual(normalize_text(input_text), expected)
  23. def test_character_replacement(self):
  24. """Test that fancy characters are replaced with ASCII equivalents"""
  25. input_text = "This has 'fancy' “quotes” and—dashes"
  26. expected = "This has 'fancy' \"quotes\" and-dashes"
  27. self.assertEqual(normalize_text(input_text), expected)
  28. def test_markdown1(self):
  29. """Test that fancy characters are replaced with ASCII equivalents"""
  30. input_text = "this is *bold*"
  31. expected = "this is bold"
  32. self.assertEqual(normalize_text(input_text), expected)
  33. def test_markdown2(self):
  34. """Test that fancy characters are replaced with ASCII equivalents"""
  35. input_text = "_italic__ is *bold*"
  36. expected = "italic_ is bold"
  37. self.assertEqual(normalize_text(input_text), expected)
  38. def test_empty_input(self):
  39. """Test that empty input returns empty output"""
  40. self.assertEqual(normalize_text(""), "")
  41. def test_brs(self):
  42. """Test that empty input returns empty output"""
  43. self.assertEqual(normalize_text("Hello<br>everyone"), "Hello everyone")
  44. self.assertEqual(normalize_text("Hello<br>everyone"), normalize_text("Hello\neveryone"))
  45. self.assertEqual(normalize_text("Hello<br/>everyone"), "Hello everyone")
  46. self.assertEqual(normalize_text("Hello<br/>everyone"), normalize_text("Hello\neveryone"))
  47. def test_two_stars(self):
  48. self.assertEqual(
  49. normalize_text(
  50. "**Georges V.** (2007) – *Le Forez du VIe au IVe millénaire av. J.-C. Territoires, identités et stratégies des sociétés humaines du Massif central dans le bassin amont de la Loire (France)*, thèse de doctorat, université de Bourgogne, Dijon, 2 vol., 435 p."
  51. ),
  52. "Georges V. (2007) - Le Forez du VIe au IVe millénaire av. J.-C. Territoires, identités et stratégies des sociétés humaines du Massif central dans le bassin amont de la Loire (France), thèse de doctorat, université de Bourgogne, Dijon, 2 vol., 435 p.",
  53. )
  54. class TestBasePDFTest(unittest.TestCase):
  55. """Test the BasePDFTest class"""
  56. def test_valid_initialization(self):
  57. """Test that a valid initialization works"""
  58. test = BasePDFTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
  59. self.assertEqual(test.pdf, "test.pdf")
  60. self.assertEqual(test.page, 1)
  61. self.assertEqual(test.id, "test_id")
  62. self.assertEqual(test.type, TestType.BASELINE.value)
  63. self.assertEqual(test.max_diffs, 0)
  64. self.assertIsNone(test.checked)
  65. self.assertIsNone(test.url)
  66. def test_empty_pdf(self):
  67. """Test that empty PDF raises ValidationError"""
  68. with self.assertRaises(ValidationError):
  69. BasePDFTest(pdf="", page=1, id="test_id", type=TestType.BASELINE.value)
  70. def test_empty_id(self):
  71. """Test that empty ID raises ValidationError"""
  72. with self.assertRaises(ValidationError):
  73. BasePDFTest(pdf="test.pdf", page=1, id="", type=TestType.BASELINE.value)
  74. def test_negative_max_diffs(self):
  75. """Test that negative max_diffs raises ValidationError"""
  76. with self.assertRaises(ValidationError):
  77. BasePDFTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value, max_diffs=-1)
  78. def test_invalid_test_type(self):
  79. """Test that invalid test type raises ValidationError"""
  80. with self.assertRaises(ValidationError):
  81. BasePDFTest(pdf="test.pdf", page=1, id="test_id", type="invalid_type")
  82. def test_run_method_not_implemented(self):
  83. """Test that run method raises NotImplementedError"""
  84. test = BasePDFTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
  85. with self.assertRaises(NotImplementedError):
  86. test.run("content")
  87. def test_checked_enum(self):
  88. """Test that checked accepts valid TestChecked enums"""
  89. test = BasePDFTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value, checked=TestChecked.VERIFIED)
  90. self.assertEqual(test.checked, TestChecked.VERIFIED)
  91. class TestTextPresenceTest(unittest.TestCase):
  92. """Test the TextPresenceTest class"""
  93. def test_valid_present_test(self):
  94. """Test that a valid PRESENT test initializes correctly"""
  95. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="test text")
  96. self.assertEqual(test.text, "test text")
  97. self.assertTrue(test.case_sensitive)
  98. self.assertIsNone(test.first_n)
  99. self.assertIsNone(test.last_n)
  100. def test_valid_absent_test(self):
  101. """Test that a valid ABSENT test initializes correctly"""
  102. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ABSENT.value, text="test text", case_sensitive=False)
  103. self.assertEqual(test.text, "test text")
  104. self.assertFalse(test.case_sensitive)
  105. def test_empty_text(self):
  106. """Test that empty text raises ValidationError"""
  107. with self.assertRaises(ValidationError):
  108. TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="")
  109. def test_present_text_exact_match(self):
  110. """Test that PRESENT test returns True for exact match"""
  111. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="target text")
  112. result, _ = test.run("This is some target text in a document")
  113. self.assertTrue(result)
  114. def test_present_text_not_found(self):
  115. """Test that PRESENT test returns False when text not found"""
  116. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="missing text")
  117. result, explanation = test.run("This document doesn't have the target")
  118. self.assertFalse(result)
  119. self.assertIn("missing text", explanation)
  120. def test_present_text_with_max_diffs(self):
  121. """Test that PRESENT test with max_diffs handles fuzzy matching"""
  122. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="target text", max_diffs=2)
  123. result, _ = test.run("This is some targett textt in a document")
  124. self.assertTrue(result)
  125. def test_absent_text_found(self):
  126. """Test that ABSENT test returns False when text is found"""
  127. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ABSENT.value, text="target text")
  128. result, explanation = test.run("This is some target text in a document")
  129. self.assertFalse(result)
  130. self.assertIn("target text", explanation)
  131. def test_absent_text_found_diffs(self):
  132. """Test that ABSENT test returns False when text is found"""
  133. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ABSENT.value, text="target text", max_diffs=2)
  134. result, explanation = test.run("This is some target text in a document")
  135. self.assertFalse(result)
  136. result, explanation = test.run("This is some targett text in a document")
  137. self.assertFalse(result)
  138. result, explanation = test.run("This is some targettt text in a document")
  139. self.assertFalse(result)
  140. result, explanation = test.run("This is some targetttt text in a document")
  141. self.assertTrue(result)
  142. def test_absent_text_not_found(self):
  143. """Test that ABSENT test returns True when text is not found"""
  144. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ABSENT.value, text="missing text")
  145. result, _ = test.run("This document doesn't have the target")
  146. self.assertTrue(result)
  147. def test_case_insensitive_present(self):
  148. """Test that case_sensitive=False works for PRESENT test"""
  149. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="TARGET TEXT", case_sensitive=False)
  150. result, _ = test.run("This is some target text in a document")
  151. self.assertTrue(result)
  152. def test_case_insensitive_absent(self):
  153. """Test that case_sensitive=False works for ABSENT test"""
  154. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ABSENT.value, text="TARGET TEXT", case_sensitive=False)
  155. result, explanation = test.run("This is some target text in a document")
  156. self.assertFalse(result)
  157. def test_first_n_limit(self):
  158. """Test that first_n parameter works correctly"""
  159. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="beginning", first_n=20)
  160. result, _ = test.run("beginning of text, but not the end")
  161. self.assertTrue(result)
  162. # Test that text beyond first_n isn't matched
  163. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="end", first_n=20)
  164. result, _ = test.run("beginning of text, but not the end")
  165. self.assertFalse(result)
  166. def test_last_n_limit(self):
  167. """Test that last_n parameter works correctly"""
  168. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="end", last_n=20)
  169. result, _ = test.run("beginning of text, but not the end")
  170. self.assertTrue(result)
  171. # Test that text beyond last_n isn't matched
  172. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="beginning", last_n=20)
  173. result, _ = test.run("beginning of text, but not the end")
  174. self.assertFalse(result)
  175. def test_both_first_and_last_n(self):
  176. """Test that combining first_n and last_n works correctly"""
  177. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="beginning", first_n=15, last_n=10)
  178. result, _ = test.run("beginning of text, middle part, but not the end")
  179. self.assertTrue(result)
  180. # Text only in middle shouldn't be found
  181. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="middle", first_n=15, last_n=10)
  182. result, _ = test.run("beginning of text, middle part, but not the end")
  183. self.assertFalse(result)
  184. def test_unicode_normalized_forms(self):
  185. """Test that e+accent == e_with_accent unicode chars"""
  186. test = TextPresenceTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, text="I like to eat at a caf\u00e9")
  187. result, _ = test.run("I like to eat at a caf\u00e9")
  188. self.assertTrue(result)
  189. result, _ = test.run("I like to eat at a cafe\u0301")
  190. self.assertTrue(result)
  191. class TestTextOrderTest(unittest.TestCase):
  192. """Test the TextOrderTest class"""
  193. def test_valid_initialization(self):
  194. """Test that valid initialization works"""
  195. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="first text", after="second text")
  196. self.assertEqual(test.before, "first text")
  197. self.assertEqual(test.after, "second text")
  198. def test_invalid_test_type(self):
  199. """Test that invalid test type raises ValidationError"""
  200. with self.assertRaises(ValidationError):
  201. TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, before="first text", after="second text")
  202. def test_empty_before(self):
  203. """Test that empty before text raises ValidationError"""
  204. with self.assertRaises(ValidationError):
  205. TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="", after="second text")
  206. def test_empty_after(self):
  207. """Test that empty after text raises ValidationError"""
  208. with self.assertRaises(ValidationError):
  209. TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="first text", after="")
  210. def test_correct_order(self):
  211. """Test that correct order returns True"""
  212. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="first", after="second")
  213. result, _ = test.run("This has first and then second in correct order")
  214. self.assertTrue(result)
  215. def test_incorrect_order(self):
  216. """Test that incorrect order returns False"""
  217. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="second", after="first")
  218. result, explanation = test.run("This has first and then second in correct order")
  219. self.assertFalse(result)
  220. def test_before_not_found(self):
  221. """Test that 'before' text not found returns False"""
  222. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="missing", after="present")
  223. result, explanation = test.run("This text has present but not the other word")
  224. self.assertFalse(result)
  225. def test_after_not_found(self):
  226. """Test that 'after' text not found returns False"""
  227. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="present", after="missing")
  228. result, explanation = test.run("This text has present but not the other word")
  229. self.assertFalse(result)
  230. def test_max_diffs(self):
  231. """Test that max_diffs parameter works correctly"""
  232. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="first", after="second", max_diffs=1)
  233. result, _ = test.run("This has firsst and then secand in correct order")
  234. self.assertTrue(result)
  235. def test_multiple_occurrences(self):
  236. """Test that multiple occurrences are handled correctly"""
  237. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="target", after="target")
  238. result, _ = test.run("This has target and then target again")
  239. self.assertTrue(result)
  240. # Test reverse direction fails
  241. test = TextOrderTest(pdf="test.pdf", page=1, id="test_id", type=TestType.ORDER.value, before="B", after="A")
  242. result, _ = test.run("A B A B") # A comes before B, but B also comes before second A
  243. self.assertTrue(result)
  244. class TestTableTest(unittest.TestCase):
  245. """Test the TableTest class"""
  246. def setUp(self):
  247. """Set up test fixtures"""
  248. self.markdown_table = """
  249. | Header 1 | Header 2 | Header 3 |
  250. | -------- | -------- | -------- |
  251. | Cell A1 | Cell A2 | Cell A3 |
  252. | Cell B1 | Cell B2 | Cell B3 |
  253. """
  254. self.html_table = """
  255. <table>
  256. <tr>
  257. <th>Header 1</th>
  258. <th>Header 2</th>
  259. <th>Header 3</th>
  260. </tr>
  261. <tr>
  262. <td>Cell A1</td>
  263. <td>Cell A2</td>
  264. <td>Cell A3</td>
  265. </tr>
  266. <tr>
  267. <td>Cell B1</td>
  268. <td>Cell B2</td>
  269. <td>Cell B3</td>
  270. </tr>
  271. </table>
  272. """
  273. def test_valid_initialization(self):
  274. """Test that valid initialization works"""
  275. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="target cell")
  276. self.assertEqual(test.cell, "target cell")
  277. self.assertEqual(test.up, "")
  278. self.assertEqual(test.down, "")
  279. self.assertEqual(test.left, "")
  280. self.assertEqual(test.right, "")
  281. self.assertEqual(test.top_heading, "")
  282. self.assertEqual(test.left_heading, "")
  283. def test_invalid_test_type(self):
  284. """Test that invalid test type raises ValidationError"""
  285. with self.assertRaises(ValidationError):
  286. TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, cell="target cell")
  287. def test_parse_markdown_tables(self):
  288. """Test markdown table parsing"""
  289. _test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  290. tables = parse_markdown_tables(self.markdown_table)
  291. self.assertEqual(len(tables), 1)
  292. self.assertEqual(tables[0].data.shape, (3, 3)) # 3 rows, 3 columns
  293. self.assertEqual(tables[0].data[0, 0], "Header 1")
  294. self.assertEqual(tables[0].data[1, 1], "Cell A2")
  295. self.assertEqual(tables[0].data[2, 2], "Cell B3")
  296. def test_parse_html_tables(self):
  297. """Test HTML table parsing"""
  298. _test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  299. tables = parse_html_tables(self.html_table)
  300. self.assertEqual(len(tables), 1)
  301. self.assertEqual(tables[0].data.shape, (3, 3)) # 3 rows, 3 columns
  302. self.assertEqual(tables[0].data[0, 0], "Header 1")
  303. self.assertEqual(tables[0].data[1, 1], "Cell A2")
  304. self.assertEqual(tables[0].data[2, 2], "Cell B3")
  305. def test_match_cell(self):
  306. """Test finding a cell in a table"""
  307. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  308. result, _ = test.run(self.markdown_table)
  309. self.assertTrue(result)
  310. def test_cell_not_found(self):
  311. """Test cell not found in table"""
  312. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Missing Cell")
  313. result, explanation = test.run(self.markdown_table)
  314. self.assertFalse(result)
  315. self.assertIn("No cell matching", explanation)
  316. def test_up_relationship(self):
  317. """Test up relationship in table"""
  318. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", up="Header 2")
  319. result, _ = test.run(self.markdown_table)
  320. self.assertTrue(result)
  321. # Test incorrect up relationship
  322. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", up="Wrong Header")
  323. result, explanation = test.run(self.markdown_table)
  324. self.assertFalse(result)
  325. self.assertIn("doesn't match expected", explanation)
  326. def test_down_relationship(self):
  327. """Test down relationship in table"""
  328. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", down="Cell B2")
  329. result, _ = test.run(self.markdown_table)
  330. self.assertTrue(result)
  331. # Test incorrect down relationship
  332. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", down="Wrong Cell")
  333. result, explanation = test.run(self.markdown_table)
  334. self.assertFalse(result)
  335. self.assertIn("doesn't match expected", explanation)
  336. def test_left_relationship(self):
  337. """Test left relationship in table"""
  338. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", left="Cell A1")
  339. result, _ = test.run(self.markdown_table)
  340. self.assertTrue(result)
  341. # Test incorrect left relationship
  342. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", left="Wrong Cell")
  343. result, explanation = test.run(self.markdown_table)
  344. self.assertFalse(result)
  345. self.assertIn("doesn't match expected", explanation)
  346. def test_right_relationship(self):
  347. """Test right relationship in table"""
  348. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", right="Cell A3")
  349. result, _ = test.run(self.markdown_table)
  350. self.assertTrue(result)
  351. # Test incorrect right relationship
  352. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", right="Wrong Cell")
  353. result, explanation = test.run(self.markdown_table)
  354. self.assertFalse(result)
  355. self.assertIn("doesn't match expected", explanation)
  356. def test_top_heading_relationship(self):
  357. """Test top_heading relationship in table"""
  358. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell B2", top_heading="Header 2")
  359. result, _ = test.run(self.markdown_table)
  360. self.assertTrue(result)
  361. # Test incorrect top_heading relationship
  362. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell B2", top_heading="Wrong Header")
  363. result, explanation = test.run(self.markdown_table)
  364. self.assertFalse(result)
  365. self.assertIn("doesn't match expected", explanation)
  366. def test_left_heading_relationship(self):
  367. """Test left_heading relationship in table"""
  368. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A3", left_heading="Cell A1")
  369. result, _ = test.run(self.markdown_table)
  370. self.assertTrue(result)
  371. # Test incorrect left_heading relationship
  372. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A3", left_heading="Wrong Cell")
  373. result, explanation = test.run(self.markdown_table)
  374. self.assertFalse(result)
  375. self.assertIn("doesn't match expected", explanation)
  376. def test_multiple_relationships(self):
  377. """Test multiple relationships in table"""
  378. test = TableTest(
  379. pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", up="Header 2", down="Cell B2", left="Cell A1", right="Cell A3"
  380. )
  381. result, _ = test.run(self.markdown_table)
  382. self.assertTrue(result)
  383. # Test one incorrect relationship
  384. test = TableTest(
  385. pdf="test.pdf",
  386. page=1,
  387. id="test_id",
  388. type=TestType.TABLE.value,
  389. cell="Cell A2",
  390. up="Header 2",
  391. down="Cell B2",
  392. left="Wrong Cell", # This is incorrect
  393. right="Cell A3",
  394. )
  395. result, explanation = test.run(self.markdown_table)
  396. self.assertFalse(result)
  397. self.assertIn("doesn't match expected", explanation)
  398. def test_no_tables_found(self):
  399. """Test behavior when no tables are found"""
  400. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  401. result, explanation = test.run("This is plain text with no tables")
  402. self.assertFalse(result)
  403. self.assertEqual(explanation, "No tables found in the content")
  404. def test_fuzzy_matching(self):
  405. """Test fuzzy matching with max_diffs"""
  406. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2", max_diffs=1)
  407. # Create table with slightly misspelled cell
  408. misspelled_table = self.markdown_table.replace("Cell A2", "Cel A2")
  409. result, _ = test.run(misspelled_table)
  410. self.assertTrue(result)
  411. def test_with_stripped_content(self):
  412. """Test table parsing with stripped content"""
  413. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  414. # Strip all leading/trailing whitespace from the markdown table
  415. stripped_table = self.markdown_table.strip()
  416. result, explanation = test.run(stripped_table)
  417. self.assertTrue(result, f"Table test failed with stripped content: {explanation}")
  418. def test_table_at_end_of_file(self):
  419. """Test that a table at the very end of the file is correctly detected"""
  420. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  421. # Create content with text followed by a table at the very end with no trailing newline
  422. content_with_table_at_end = "Some text before the table.\n" + self.markdown_table.strip()
  423. result, explanation = test.run(content_with_table_at_end)
  424. self.assertTrue(result, f"Table at end of file not detected: {explanation}")
  425. def test_table_at_end_with_no_trailing_newline(self):
  426. """Test that a table at the end with no trailing newline is detected"""
  427. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  428. # Remove the trailing newline from the markdown table
  429. content_without_newline = self.markdown_table.rstrip()
  430. result, explanation = test.run(content_without_newline)
  431. self.assertTrue(result, f"Table without trailing newline not detected: {explanation}")
  432. def test_table_at_end_with_extra_spaces(self):
  433. """Test that a table at the end with extra spaces is detected"""
  434. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  435. # Add extra spaces to the end of lines in the table
  436. lines = self.markdown_table.split("\n")
  437. content_with_extra_spaces = "\n".join([line + " " for line in lines])
  438. result, explanation = test.run(content_with_extra_spaces)
  439. self.assertTrue(result, f"Table with extra spaces not detected: {explanation}")
  440. def test_table_at_end_with_mixed_whitespace(self):
  441. """Test that a table at the end with mixed whitespace is detected"""
  442. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  443. # Add various whitespace characters to the table
  444. content_with_mixed_whitespace = "Some text before the table.\n" + self.markdown_table.strip() + " \t "
  445. result, explanation = test.run(content_with_mixed_whitespace)
  446. self.assertTrue(result, f"Table with mixed whitespace not detected: {explanation}")
  447. def test_malformed_table_at_end(self):
  448. """Test that a slightly malformed table at the end is still detected"""
  449. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  450. # Create a table with irregular pipe placement at the end
  451. malformed_table = """
  452. Some text before the table.
  453. | Header 1 | Header 2 | Header 3
  454. | -------- | -------- | --------
  455. | Cell A1 | Cell A2 | Cell A3 |
  456. | Cell B1 | Cell B2 | Cell B3"""
  457. result, explanation = test.run(malformed_table)
  458. self.assertTrue(result, f"Malformed table at end not detected: {explanation}")
  459. def test_incomplete_table_at_end(self):
  460. """Test that an incomplete table at the end still gets detected if it contains valid rows"""
  461. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  462. # Missing the separator row
  463. incomplete_table = """
  464. Some text before the table.
  465. | Header 1 | Header 2 | Header 3 |
  466. | Cell A1 | Cell A2 | Cell A3 |
  467. | Cell B1 | Cell B2 | Cell B3 |"""
  468. result, explanation = test.run(incomplete_table)
  469. self.assertTrue(result, f"Incomplete table at end not detected: {explanation}")
  def test_table_with_excessive_blank_lines_at_end(self):
      """Check that a table is still found when many blank lines follow it."""
      checker = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
      # Ten newline characters are appended after the fixture table.
      blank_tail = "\n" * 10
      document = self.markdown_table + blank_tail
      passed, reason = checker.run(document)
      self.assertTrue(passed, f"Table with blank lines at end not detected: {reason}")
  def test_table_at_end_after_long_text(self):
      """Check that a table is still found when a very long run of text precedes it."""
      checker = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
      # One hundred repetitions of the filler phrase form the leading text.
      filler = "Lorem ipsum dolor sit amet, " * 100
      document = "\n".join([filler, self.markdown_table.strip()])
      passed, reason = checker.run(document)
      self.assertTrue(passed, f"Table after long text not detected: {reason}")
  485. def test_valid_table_at_eof_without_newline(self):
  486. """Test that a valid table at EOF without a trailing newline is detected"""
  487. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2")
  488. # Valid table but without trailing newline at the very end of the file
  489. valid_table_eof = """
  490. | Header 1 | Header 2 | Header 3 |
  491. | -------- | -------- | -------- |
  492. | Cell A1 | Cell A2 | Cell A3 |
  493. | Cell B1 | Cell B2 | Cell B3 |""".strip()
  494. result, explanation = test.run(valid_table_eof)
  495. self.assertTrue(result, f"Valid table at EOF without newline not detected: {explanation}")
  496. def test_normalizing(self):
  497. table = """| Question - – Satisfaction on scale of 10 | Response | Resident Sample | Business Sample |
  498. |----------------------------------------|----------|----------------|-----------------|
  499. | Planning for and managing residential, commercial and industrial development | Rating of 8, 9 or 10 | 13% | 11% |
  500. | | Average rating | 6.4 | 5.7 |
  501. | | Don’t know responses | 11% | 6% |
  502. | Environmental protection, support for green projects (e.g. green grants, building retrofits programs, zero waste) | Rating of 8, 9 or 10 | 35% | 34% |
  503. | | Average rating | 8.0 | 7.5 |
  504. | | Don’t know responses | 8% | 6% |
  505. | Providing and maintaining parks and green spaces | Rating of 8, 9 or 10 | 42% | 41% |
  506. | | Average rating | 7.7 | 7.3 |
  507. | | Don’t know responses | 1% | 1% |"""
  508. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="6%", top_heading="Business\nSample")
  509. result, explanation = test.run(table)
  510. self.assertTrue(result, explanation)
  511. def test_mathematical_minus(self):
  512. table = """| Response | Chinese experimenter | White experimenter |
  513. |----------|----------------------|--------------------|
  514. | | Divided attention | Full attention | Divided attention | Full attention |
  515. | Nonverbal| −.34 (.22) | .54* (.17) | .12 (.27) | −.20 (.24) |
  516. | Verbal | −.25 (.23) | .36 (.20) | .12 (.27) | −.34 (.22) |
  517. """
  518. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="-.34 (.22)")
  519. result, explanation = test.run(table)
  520. self.assertTrue(result, explanation)
  521. def test_markdown_marker(self):
  522. table = """| CATEGORY | POINTS EARNED |
  523. |------------------------------|------------------|
  524. | Sustainable Sites | 9 |
  525. | Water Efficiency | 3 |
  526. | Energy & Atmosphere | 12 |
  527. | Materials & Resources | 6 |
  528. | Indoor Environmental Quality | 11 |
  529. | Innovation & Design Process | 5 |
  530. | TOTAL | 46 |"""
  531. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="9", up="POINTS EARNED")
  532. result, explanation = test.run(table)
  533. self.assertTrue(result, explanation)
  534. def test_diffs(self):
  535. table = """| CATEGORY | POINTS EARNED |
  536. |------------------------------|------------------|
  537. | Sustainable Sites | 9 |
  538. | Water Efficiency | 3 |
  539. | Energy & Atmosphere | 12 |
  540. | Materials & Resources | 6 |
  541. | Indoor Environmental Quality | 11 |
  542. | Innovation & Design Process | 5 |
  543. | TOTAL | 46 |"""
  544. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="9", left="Sustl Sie", max_diffs=2)
  545. result, explanation = test.run(table)
  546. self.assertFalse(result, explanation)
  547. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="9", left="Sustainable Site", max_diffs=2)
  548. result, explanation = test.run(table)
  549. self.assertTrue(result, explanation)
  550. def test_markdown_marker2(self):
  551. table = """| Concentration
  552. level | [CO] | [SO2] | [NOx] |
  553. |------------------------|-----------|-------|----------|
  554. | Control | 0 μM | 0 μM | 0 nM |
  555. | Low | 250
  556. μM | 8 μM | 0.002 nM |
  557. | Medium | 625 μM | 20 μM | 0.005 nM |
  558. | High | 1250 μM | 40 μM | 0.01 nM |"""
  559. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="20 μM", up=".002 nM")
  560. result, explanation = test.run(table)
  561. self.assertFalse(result, explanation)
  562. def test_marker3(self):
  563. table = """| | N | Minimum | Maximum | Gemiddelde | Sd |
  564. |-----------------------------------------------|-------|---------|---------|------------|-----|
  565. | Slaapkwaliteit tijdens
  566. gewone nachten | 2017 | 1,0 | 6,0 | 3,9 | 1,0 |
  567. | Slaapkwaliteit tijdens
  568. consignatiediensten | 19816 | 1,0 | 6,0 | 2,8 | 1,2 |
  569. """
  570. test = TableTest(
  571. pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="2,8", left_heading="Slaapkwaliteit tijdens\nconsignatiediensten"
  572. )
  573. result, explanation = test.run(table)
  574. self.assertFalse(result, explanation)
  575. def test_big_table(self):
  576. table = """ <table>
  577. <caption>Base: Resident respondents (n=1,315) and Business respondents (n=397)</caption>
  578. <thead>
  579. <tr>
  580. <th>Question – Satisfaction on scale of 10</th>
  581. <th>Response</th>
  582. <th>Resident Sample</th>
  583. <th>Business Sample</th>
  584. </tr>
  585. </thead>
  586. <tbody>
  587. <!-- First category -->
  588. <tr class="category-row">
  589. <td rowspan="3">Planning for and managing residential, commercial and industrial development</td>
  590. <td>Rating of 8, 9 or 10</td>
  591. <td>13%</td>
  592. <td>11%</td>
  593. </tr>
  594. <tr>
  595. <td class="subcategory">Average rating</td>
  596. <td>6.4</td>
  597. <td>5.7</td>
  598. </tr>
  599. <tr>
  600. <td class="subcategory">Don't know responses</td>
  601. <td>11%</td>
  602. <td>6%</td>
  603. </tr>
  604. <!-- Second category -->
  605. <tr class="category-row">
  606. <td rowspan="3">Environmental protection, support for green projects (e.g. green grants, building retrofits programs, zero waste)</td>
  607. <td>Rating of 8, 9 or 10</td>
  608. <td>35%</td>
  609. <td>34%</td>
  610. </tr>
  611. <tr>
  612. <td class="subcategory">Average rating</td>
  613. <td>8.0</td>
  614. <td>7.5</td>
  615. </tr>
  616. <tr>
  617. <td class="subcategory">Don't know responses</td>
  618. <td>8%</td>
  619. <td>6%</td>
  620. </tr>
  621. <!-- Third category -->
  622. <tr class="category-row">
  623. <td rowspan="3">Providing and maintaining parks and green spaces</td>
  624. <td>Rating of 8, 9 or 10</td>
  625. <td>42%</td>
  626. <td>41%</td>
  627. </tr>
  628. <tr>
  629. <td class="subcategory">Average rating</td>
  630. <td>7.7</td>
  631. <td>7.3</td>
  632. </tr>
  633. <tr>
  634. <td class="subcategory">Don't know responses</td>
  635. <td>1%</td>
  636. <td>1%</td>
  637. </tr>
  638. </tbody>
  639. </table>
  640. """
  641. test = TableTest(
  642. pdf="test.pdf",
  643. page=1,
  644. id="test_id",
  645. type=TestType.TABLE.value,
  646. max_diffs=5,
  647. cell="Planning for and managing residential, commercial and industrial development",
  648. down="Environmental protection,\nsupport for green projects\n(e.g. green grants,\nbuilding retrofits programs,\nzero waste)",
  649. )
  650. result, explanation = test.run(table)
  651. self.assertTrue(result, explanation)
  652. def test_html_rowspans_colspans(self):
  653. table = """ <table>
  654. <thead>
  655. <tr>
  656. <th rowspan="2">Product Category</th>
  657. <th rowspan="2">Product Subcategory</th>
  658. <th colspan="4">Quarterly Sales ($000s)</th>
  659. <th rowspan="2">Annual Total</th>
  660. </tr>
  661. <tr>
  662. <th>Q1</th>
  663. <th>Q2</th>
  664. <th>Q3</th>
  665. <th>Q4</th>
  666. </tr>
  667. </thead>
  668. <tbody>
  669. <tr class="category">
  670. <td rowspan="4">Electronics</td>
  671. <td>Smartphones</td>
  672. <td>245</td>
  673. <td>278</td>
  674. <td>312</td>
  675. <td>389</td>
  676. <td>1,224</td>
  677. </tr>
  678. <tr class="subcategory">
  679. <td>Laptops</td>
  680. <td>187</td>
  681. <td>192</td>
  682. <td>243</td>
  683. <td>297</td>
  684. <td>919</td>
  685. </tr>
  686. <tr class="subcategory">
  687. <td>Tablets</td>
  688. <td>95</td>
  689. <td>123</td>
  690. <td>135</td>
  691. <td>156</td>
  692. <td>509</td>
  693. </tr>
  694. <tr class="subcategory">
  695. <td>Accessories</td>
  696. <td>64</td>
  697. <td>72</td>
  698. <td>87</td>
  699. <td>105</td>
  700. <td>328</td>
  701. </tr>
  702. <tr class="category">
  703. <td rowspan="3">Home Appliances</td>
  704. <td>Refrigerators</td>
  705. <td>132</td>
  706. <td>145</td>
  707. <td>151</td>
  708. <td>162</td>
  709. <td>590</td>
  710. </tr>
  711. <tr class="subcategory">
  712. <td>Washing Machines</td>
  713. <td>98</td>
  714. <td>112</td>
  715. <td>127</td>
  716. <td>143</td>
  717. <td>480</td>
  718. </tr>
  719. <tr class="subcategory">
  720. <td>Microwaves</td>
  721. <td>54</td>
  722. <td>67</td>
  723. <td>72</td>
  724. <td>84</td>
  725. <td>277</td>
  726. </tr>
  727. <tr class="category">
  728. <td rowspan="3">Furniture</td>
  729. <td>Sofas</td>
  730. <td>112</td>
  731. <td>128</td>
  732. <td>134</td>
  733. <td>142</td>
  734. <td>516</td>
  735. </tr>
  736. <tr class="subcategory">
  737. <td>Tables</td>
  738. <td>87</td>
  739. <td>95</td>
  740. <td>103</td>
  741. <td>124</td>
  742. <td>409</td>
  743. </tr>
  744. <tr class="subcategory">
  745. <td>Chairs</td>
  746. <td>76</td>
  747. <td>84</td>
  748. <td>92</td>
  749. <td>110</td>
  750. <td>362</td>
  751. </tr>
  752. <tr class="total">
  753. <td colspan="2">Quarterly Totals</td>
  754. <td>1,150</td>
  755. <td>1,296</td>
  756. <td>1,456</td>
  757. <td>1,712</td>
  758. <td>5,614</td>
  759. </tr>
  760. </tbody>
  761. </table>"""
  762. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Refrigerators", left="Home Appliances")
  763. result, explanation = test.run(table)
  764. self.assertTrue(result, explanation)
  765. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Washing Machines", left="Home Appliances")
  766. result, explanation = test.run(table)
  767. self.assertTrue(result, explanation)
  768. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Microwaves", left="Home Appliances")
  769. result, explanation = test.run(table)
  770. self.assertTrue(result, explanation)
  771. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Sofas", top_heading="Product Subcategory")
  772. result, explanation = test.run(table)
  773. self.assertTrue(result, explanation)
  774. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="135", top_heading="Q3")
  775. result, explanation = test.run(table)
  776. self.assertTrue(result, explanation)
  777. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="135", top_heading="Quarterly Sales ($000s)")
  778. result, explanation = test.run(table)
  779. self.assertTrue(result, explanation)
  780. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="1,712", top_heading="Quarterly Sales ($000s)")
  781. result, explanation = test.run(table)
  782. self.assertTrue(result, explanation)
  783. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="135", top_heading="Q2")
  784. result, explanation = test.run(table)
  785. self.assertFalse(result, explanation)
  786. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="135", top_heading="Q1")
  787. result, explanation = test.run(table)
  788. self.assertFalse(result, explanation)
  789. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="135", top_heading="Q4")
  790. result, explanation = test.run(table)
  791. self.assertFalse(result, explanation)
  792. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Home Appliances", top_heading="Product Category")
  793. result, explanation = test.run(table)
  794. self.assertTrue(result, explanation)
  795. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Washing Machines", top_heading="Product Category")
  796. result, explanation = test.run(table)
  797. self.assertFalse(result, explanation)
  798. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Washing Machines", top_heading="Q3")
  799. result, explanation = test.run(table)
  800. self.assertFalse(result, explanation)
  801. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Washing Machines", top_heading="Quarterly Sales ($000s)")
  802. result, explanation = test.run(table)
  803. self.assertFalse(result, explanation)
  804. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Electronics", right="Laptops")
  805. result, explanation = test.run(table)
  806. self.assertTrue(result, explanation)
  807. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Electronics", right="Accessories")
  808. result, explanation = test.run(table)
  809. self.assertTrue(result, explanation)
  810. # TODO Skipping these for now
  811. # test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Quarterly Sales ($000s)", down="Q2")
  812. # result, explanation = test.run(table)
  813. # self.assertTrue(result, explanation)
  814. # test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Q2", up="Quarterly Sales ($000s)")
  815. # result, explanation = test.run(table)
  816. # self.assertTrue(result, explanation)
  817. def test_multiple_markdown_tables(self):
  818. """Test that we can find and verify cells in multiple markdown tables in one document"""
  819. content = """
  820. # First Table
  821. | Name | Age | Role |
  822. | ---- | --- | ---- |
  823. | John | 28 | Developer |
  824. | Jane | 32 | Designer |
  825. | Bob | 45 | Manager |
  826. Some text between tables...
  827. # Second Table
  828. | Department | Budget | Employees |
  829. | ---------- | ------ | --------- |
  830. | Engineering | 1.2M | 15 |
  831. | Design | 0.8M | 8 |
  832. | Marketing | 1.5M | 12 |
  833. | HR | 0.5M | 5 |
  834. """
  835. # Test cells in the first table
  836. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="John", right="28")
  837. result, explanation = test.run(content)
  838. self.assertTrue(result, explanation)
  839. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="32", left="Jane")
  840. result, explanation = test.run(content)
  841. self.assertTrue(result, explanation)
  842. # Test cells in the second table
  843. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Engineering", right="1.2M")
  844. result, explanation = test.run(content)
  845. self.assertTrue(result, explanation)
  846. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="12", left="1.5M")
  847. result, explanation = test.run(content)
  848. self.assertTrue(result, explanation)
  849. # Verify top headings work correctly across tables
  850. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Bob", top_heading="Name")
  851. result, explanation = test.run(content)
  852. self.assertTrue(result, explanation)
  853. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="HR", top_heading="Department")
  854. result, explanation = test.run(content)
  855. self.assertTrue(result, explanation)
  856. def test_multiple_html_tables(self):
  857. """Test that we can find and verify cells in multiple HTML tables in one document"""
  858. content = """
  859. <h1>First Table</h1>
  860. <table>
  861. <thead>
  862. <tr>
  863. <th>Country</th>
  864. <th>Capital</th>
  865. <th>Population</th>
  866. </tr>
  867. </thead>
  868. <tbody>
  869. <tr>
  870. <td>USA</td>
  871. <td>Washington DC</td>
  872. <td>331M</td>
  873. </tr>
  874. <tr>
  875. <td>France</td>
  876. <td>Paris</td>
  877. <td>67M</td>
  878. </tr>
  879. <tr>
  880. <td>Japan</td>
  881. <td>Tokyo</td>
  882. <td>126M</td>
  883. </tr>
  884. </tbody>
  885. </table>
  886. <p>Some text between tables...</p>
  887. <h1>Second Table</h1>
  888. <table>
  889. <thead>
  890. <tr>
  891. <th>Company</th>
  892. <th>Industry</th>
  893. <th>Revenue</th>
  894. <th>Employees</th>
  895. </tr>
  896. </thead>
  897. <tbody>
  898. <tr>
  899. <td>ABC Corp</td>
  900. <td>Technology</td>
  901. <td>$5B</td>
  902. <td>10,000</td>
  903. </tr>
  904. <tr>
  905. <td>XYZ Inc</td>
  906. <td>Healthcare</td>
  907. <td>$2.5B</td>
  908. <td>8,500</td>
  909. </tr>
  910. <tr>
  911. <td>Acme Co</td>
  912. <td>Manufacturing</td>
  913. <td>$1.8B</td>
  914. <td>15,000</td>
  915. </tr>
  916. <tr>
  917. <td>Global LLC</td>
  918. <td>Finance</td>
  919. <td>$3.2B</td>
  920. <td>6,200</td>
  921. </tr>
  922. </tbody>
  923. </table>
  924. """
  925. # Test cells in the first table
  926. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="USA", right="Washington DC")
  927. result, explanation = test.run(content)
  928. self.assertTrue(result, explanation)
  929. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="126M", left="Tokyo")
  930. result, explanation = test.run(content)
  931. self.assertTrue(result, explanation)
  932. # Test cells in the second table
  933. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="XYZ Inc", right="Healthcare")
  934. result, explanation = test.run(content)
  935. self.assertTrue(result, explanation)
  936. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="15,000", left="$1.8B")
  937. result, explanation = test.run(content)
  938. self.assertTrue(result, explanation)
  939. # Verify top headings work correctly across tables
  940. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Tokyo", top_heading="Capital")
  941. result, explanation = test.run(content)
  942. self.assertTrue(result, explanation)
  943. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Finance", top_heading="Industry")
  944. result, explanation = test.run(content)
  945. self.assertTrue(result, explanation)
  946. def test_mixed_markdown_and_html_tables(self):
  947. """Test that we can find and verify cells in mixed markdown and HTML tables in one document"""
  948. content = """
  949. # Markdown Table
  950. | Product | Price | Quantity |
  951. | ------- | ----- | -------- |
  952. | Apple | $1.20 | 100 |
  953. | Orange | $0.80 | 150 |
  954. | Banana | $0.60 | 200 |
  955. <h1>HTML Table</h1>
  956. <table>
  957. <tr>
  958. <th>Month</th>
  959. <th>Income</th>
  960. <th>Expenses</th>
  961. <th>Profit</th>
  962. </tr>
  963. <tr>
  964. <td>January</td>
  965. <td>$10,000</td>
  966. <td>$8,000</td>
  967. <td>$2,000</td>
  968. </tr>
  969. <tr>
  970. <td>February</td>
  971. <td>$12,000</td>
  972. <td>$9,500</td>
  973. <td>$2,500</td>
  974. </tr>
  975. <tr>
  976. <td>March</td>
  977. <td>$15,000</td>
  978. <td>$10,200</td>
  979. <td>$4,800</td>
  980. </tr>
  981. </table>
  982. """
  983. # Test cells in the markdown table
  984. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Orange", right="$0.80")
  985. result, explanation = test.run(content)
  986. self.assertTrue(result, explanation)
  987. # Test cells in the HTML table
  988. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="February", right="$12,000")
  989. result, explanation = test.run(content)
  990. self.assertTrue(result, explanation)
  991. # Verify we can find cells with specific top headings
  992. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="100", top_heading="Quantity")
  993. result, explanation = test.run(content)
  994. self.assertTrue(result, explanation)
  995. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="$4,800", top_heading="Profit")
  996. result, explanation = test.run(content)
  997. self.assertTrue(result, explanation)
  998. def test_br_tags_replacement(self):
  999. """Test that <br> and <br/> tags are correctly replaced with newlines"""
  1000. table = """<table>
  1001. <tr>
  1002. <th>Header 1</th>
  1003. <th>Header 2</th>
  1004. </tr>
  1005. <tr>
  1006. <td>Line 1<br/>Line 2<br/>Line 3</td>
  1007. <td>Single line</td>
  1008. </tr>
  1009. </table>"""
  1010. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Line 1 Line 2 Line 3")
  1011. result, explanation = test.run(table)
  1012. self.assertTrue(result, explanation)
  1013. def test_real_complicated_table(self):
  1014. table = """ <table>
  1015. <thead>
  1016. <tr>
  1017. <th colspan="7">Table 1 &nbsp;&nbsp; Differences in diagnoses, gender and family status for participants with a suicide attempt and those without a suicide attempt within the 12-month follow-up interval</th>
  1018. </tr>
  1019. <tr class="header-row">
  1020. <th rowspan="2"></th>
  1021. <th colspan="2">Participants with no<br>suicide attempt<br>(n = 132)<sup>a</sup></th>
  1022. <th colspan="2">Participants with a<br>suicide attempt<br>(n = 43)<sup>b</sup></th>
  1023. <th colspan="3"></th>
  1024. </tr>
  1025. <tr class="header-row">
  1026. <th>n</th>
  1027. <th>%</th>
  1028. <th>n</th>
  1029. <th>%</th>
  1030. <th>χ<sup>2</sup></th>
  1031. <th>d.f.</th>
  1032. <th>P</th>
  1033. </tr>
  1034. </thead>
  1035. <tbody>
  1036. <tr>
  1037. <td class="section-header">ICD-10 diagnoses</td>
  1038. <td></td>
  1039. <td></td>
  1040. <td></td>
  1041. <td></td>
  1042. <td></td>
  1043. <td></td>
  1044. <td></td>
  1045. </tr>
  1046. <tr>
  1047. <td>&nbsp;&nbsp;F0</td>
  1048. <td>1</td>
  1049. <td>0.76</td>
  1050. <td>0</td>
  1051. <td>0.00</td>
  1052. <td>0.00</td>
  1053. <td>1</td>
  1054. <td>1.00</td>
  1055. </tr>
  1056. <tr>
  1057. <td>&nbsp;&nbsp;F1</td>
  1058. <td>17</td>
  1059. <td>12.88</td>
  1060. <td>12</td>
  1061. <td>27.91</td>
  1062. <td>4.39</td>
  1063. <td>1</td>
  1064. <td>0.04</td>
  1065. </tr>
  1066. <tr>
  1067. <td>&nbsp;&nbsp;F2</td>
  1068. <td>1</td>
  1069. <td>0.76</td>
  1070. <td>0</td>
  1071. <td>0.00</td>
  1072. <td>0.00</td>
  1073. <td>1</td>
  1074. <td>1.00</td>
  1075. </tr>
  1076. <tr>
  1077. <td>&nbsp;&nbsp;F3</td>
  1078. <td>106</td>
  1079. <td>80.30</td>
  1080. <td>31</td>
  1081. <td>72.09</td>
  1082. <td>0.74</td>
  1083. <td>1</td>
  1084. <td>0.39</td>
  1085. </tr>
  1086. <tr>
  1087. <td>&nbsp;&nbsp;F4</td>
  1088. <td>42</td>
  1089. <td>31.82</td>
  1090. <td>17</td>
  1091. <td>39.53</td>
  1092. <td>0.61</td>
  1093. <td>1</td>
  1094. <td>0.43</td>
  1095. </tr>
  1096. <tr>
  1097. <td>&nbsp;&nbsp;F5</td>
  1098. <td>5</td>
  1099. <td>3.79</td>
  1100. <td>5</td>
  1101. <td>11.63</td>
  1102. <td>2.44</td>
  1103. <td>1</td>
  1104. <td>0.12</td>
  1105. </tr>
  1106. <tr>
  1107. <td>&nbsp;&nbsp;F6</td>
  1108. <td>20</td>
  1109. <td>15.15</td>
  1110. <td>19</td>
  1111. <td>44.19</td>
  1112. <td>14.48</td>
  1113. <td>1</td>
  1114. <td>0.00</td>
  1115. </tr>
  1116. <tr>
  1117. <td>&nbsp;&nbsp;F7</td>
  1118. <td>0</td>
  1119. <td>0.00</td>
  1120. <td>0</td>
  1121. <td>0.00</td>
  1122. <td>—</td>
  1123. <td>—</td>
  1124. <td>—</td>
  1125. </tr>
  1126. <tr>
  1127. <td>&nbsp;&nbsp;F8</td>
  1128. <td>1</td>
  1129. <td>0.76</td>
  1130. <td>0</td>
  1131. <td>0.00</td>
  1132. <td>0.00</td>
  1133. <td>1</td>
  1134. <td>1.00</td>
  1135. </tr>
  1136. <tr>
  1137. <td>&nbsp;&nbsp;F9</td>
  1138. <td>2</td>
  1139. <td>1.52</td>
  1140. <td>1</td>
  1141. <td>2.33</td>
  1142. <td>0.00</td>
  1143. <td>1</td>
  1144. <td>1.00</td>
  1145. </tr>
  1146. <tr>
  1147. <td class="section-header">Gender</td>
  1148. <td></td>
  1149. <td></td>
  1150. <td></td>
  1151. <td></td>
  1152. <td>3.09</td>
  1153. <td>2</td>
  1154. <td>0.21</td>
  1155. </tr>
  1156. <tr>
  1157. <td>&nbsp;&nbsp;Female</td>
  1158. <td>75</td>
  1159. <td>56.8</td>
  1160. <td>24</td>
  1161. <td>55.8</td>
  1162. <td></td>
  1163. <td></td>
  1164. <td></td>
  1165. </tr>
  1166. <tr>
  1167. <td>&nbsp;&nbsp;Male</td>
  1168. <td>57</td>
  1169. <td>43.2</td>
  1170. <td>18</td>
  1171. <td>41.9</td>
  1172. <td></td>
  1173. <td></td>
  1174. <td></td>
  1175. </tr>
  1176. <tr>
  1177. <td>&nbsp;&nbsp;Diverse</td>
  1178. <td>0</td>
  1179. <td>0</td>
  1180. <td>1</td>
  1181. <td>2.3</td>
  1182. <td></td>
  1183. <td></td>
  1184. <td></td>
  1185. </tr>
  1186. <tr>
  1187. <td class="section-header">Family status</td>
  1188. <td></td>
  1189. <td></td>
  1190. <td></td>
  1191. <td></td>
  1192. <td>4.87</td>
  1193. <td>4</td>
  1194. <td>0.30</td>
  1195. </tr>
  1196. <tr>
  1197. <td>&nbsp;&nbsp;Single</td>
  1198. <td>55</td>
  1199. <td>41.7</td>
  1200. <td>14</td>
  1201. <td>32.6</td>
  1202. <td></td>
  1203. <td></td>
  1204. <td></td>
  1205. </tr>
  1206. <tr>
  1207. <td>&nbsp;&nbsp;Partnership</td>
  1208. <td>25</td>
  1209. <td>18.9</td>
  1210. <td>9</td>
  1211. <td>20.9</td>
  1212. <td></td>
  1213. <td></td>
  1214. <td></td>
  1215. </tr>
  1216. <tr>
  1217. <td>&nbsp;&nbsp;Married</td>
  1218. <td>27</td>
  1219. <td>20.5</td>
  1220. <td>5</td>
  1221. <td>11.6</td>
  1222. <td></td>
  1223. <td></td>
  1224. <td></td>
  1225. </tr>
  1226. <tr>
  1227. <td>&nbsp;&nbsp;Divorced</td>
  1228. <td>20</td>
  1229. <td>15.2</td>
  1230. <td>11</td>
  1231. <td>25.6</td>
  1232. <td></td>
  1233. <td></td>
  1234. <td></td>
  1235. </tr>
  1236. <tr>
  1237. <td>&nbsp;&nbsp;Widowed</td>
  1238. <td>1</td>
  1239. <td>0.8</td>
  1240. <td>1</td>
  1241. <td>2.3</td>
  1242. <td></td>
  1243. <td></td>
  1244. <td></td>
  1245. </tr>
  1246. </tbody>
  1247. <tfoot>
  1248. <tr>
  1249. <td colspan="8" class="footnote">
  1250. F0: Organic, including symptomatic, mental disorders; F1: Mental and behavioural disorders due to psychoactive substance use; F2: Schizophrenia, schizotypal and delusional disorders; F3: affective disorders; F4: Neurotic, stress-related and somatoform disorders; F5: Behavioural syndromes associated with physiological disturbances and physical factors; F6: Disorders of adult personality and behaviour; F7: Mental retardation; F8: Disorders of psychological development; F9: Behavioural and emotional disorders with onset usually occurring in childhood and adolescence.<br>
  1251. a. 75.43% of the total sample with full information on suicide reattempts within the entire 12-month follow-up interval.<br>
  1252. b. 24.57% of the total sample with full information on suicide reattempts within the entire 12-month follow-up interval.
  1253. </td>
  1254. </tr>
  1255. </tfoot>
  1256. </table>"""
  1257. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="4.39", top_heading="χ2")
  1258. result, explanation = test.run(table)
  1259. self.assertTrue(result, explanation)
  1260. test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="12.88", top_heading="%")
  1261. result, explanation = test.run(table)
  1262. self.assertTrue(result, explanation)
  1263. # Account for the superscript in the header
  1264. test = TableTest(
  1265. pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="12.88", top_heading="Participants with no suicide attempt (n = 132)a"
  1266. )
  1267. result, explanation = test.run(table)
  1268. self.assertTrue(result, explanation)
  1269. test = TableTest(
  1270. pdf="test.pdf",
  1271. page=1,
  1272. id="test_id",
  1273. type=TestType.TABLE.value,
  1274. cell="12.88",
  1275. top_heading="Table 1 Differences in diagnoses, gender and family status for participants with a suicide attempt and those without a suicide attempt within the 12-month follow-up interval",
  1276. )
  1277. result, explanation = test.run(table)
  1278. self.assertTrue(result, explanation)
  class TestBaselineTest(unittest.TestCase):
      """Unit tests for BaselineTest construction and the verdicts returned by run()."""

      def test_valid_initialization(self):
          """An explicit max_repeats value is stored on the instance."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value, max_repeats=50)
          self.assertEqual(baseline.max_repeats, 50)

      def test_non_empty_content(self):
          """Ordinary text is accepted."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
          passed, _ = baseline.run("This is some normal content")
          self.assertTrue(passed)

      def test_empty_content(self):
          """Whitespace-only text is rejected with a 'no alpha numeric characters' explanation."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
          passed, reason = baseline.run(" \n\t ")
          self.assertFalse(passed)
          self.assertIn("no alpha numeric characters", reason)

      def test_repeating_content(self):
          """Heavily repeated text is rejected when max_repeats is small."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value, max_repeats=2)
          # "abc" repeated ten times, against a limit of max_repeats=2.
          looped_text = "abc" * 10
          passed, reason = baseline.run(looped_text)
          self.assertFalse(passed)
          self.assertIn("repeating", reason)

      def test_content_with_disallowed_characters(self):
          """Text containing Chinese characters is rejected as having disallowed characters."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
          passed, reason = baseline.run("This has Chinese characters: 你好")
          self.assertFalse(passed)
          self.assertIn("disallowed characters", reason)

      def test_content_with_emoji(self):
          """Text containing an emoji is rejected, and the explanation names the emoji."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
          passed, reason = baseline.run("This has emoji: 😊")
          self.assertFalse(passed)
          for fragment in ("disallowed characters", "😊"):
              self.assertIn(fragment, reason)

      def test_content_with_mandarin(self):
          """Chinese characters embedded in Latin text are still rejected."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
          passed, reason = baseline.run("asdfasdfas維基百科/中文asdfw")
          self.assertFalse(passed)
          self.assertIn("disallowed characters", reason)

      def test_valid_content(self):
          """A plain English sentence is accepted."""
          baseline = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
          sentence = "This is some normal content with proper English letters and no suspicious repetition."
          passed, _ = baseline.run(sentence)
          self.assertTrue(passed)
  1328. class TestMathTest(unittest.TestCase):
  1329. """Test the MathTest class"""
  1330. def test_valid_initialization(self):
  1331. """Test that valid initialization works"""
  1332. try:
  1333. test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
  1334. self.assertEqual(test.math, "a + b = c")
  1335. except Exception as e:
  1336. self.fail(f"Valid initialization failed with: {e}")
  1337. def test_invalid_test_type(self):
  1338. """Test that invalid test type raises ValidationError"""
  1339. with self.assertRaises(ValidationError):
  1340. MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, math="a + b = c")
  1341. def test_empty_math(self):
  1342. """Test that empty math raises ValidationError"""
  1343. with self.assertRaises(ValidationError):
  1344. MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="")
  1345. def test_exact_math_match(self):
  1346. """Test exact match of math equation"""
  1347. try:
  1348. test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
  1349. # Test content with exact math match
  1350. content = "Here is an equation: $$a + b = c$$"
  1351. result, _ = test.run(content)
  1352. self.assertTrue(result)
  1353. except Exception as e:
  1354. self.fail(f"Test failed with: {e}")
  1355. def test_rendered_math_match(self):
  1356. """Test rendered match of math equation"""
  1357. try:
  1358. test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
  1359. # Test content with different but equivalent math
  1360. content = "Here is an equation: $$a+b=c$$"
  1361. result, _ = test.run(content)
  1362. self.assertTrue(result)
  1363. except Exception as e:
  1364. self.fail(f"Test failed with: {e}")
  1365. def test_no_math_match(self):
  1366. """Test no match of math equation"""
  1367. try:
  1368. test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
  1369. # Test content with no matching math
  1370. content = "Here is an equation: $$x + y = z$$"
  1371. result, explanation = test.run(content)
  1372. self.assertFalse(result)
  1373. self.assertIn("No match found", explanation)
  1374. except Exception as e:
  1375. self.fail(f"Test failed with: {e}")
  1376. def test_different_math_delimiters(self):
  1377. """Test different math delimiters"""
  1378. try:
  1379. test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
  1380. # Test different delimiters
  1381. delimiters = [
  1382. "$$a + b = c$$", # $$...$$
  1383. "$a + b = c$", # $...$
  1384. "\\(a + b = c\\)", # \(...\)
  1385. "\\[a + b = c\\]", # \[...\]
  1386. ]
  1387. for delim in delimiters:
  1388. content = f"Here is an equation: {delim}"
  1389. result, _ = test.run(content)
  1390. self.assertTrue(result)
  1391. except Exception as e:
  1392. self.fail(f"Test failed with: {e}")
  1393. if __name__ == "__main__":
  1394. unittest.main()