scan_dolmadocs.py 93 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030
  1. import argparse
  2. import base64
  3. import csv
  4. import datetime
  5. import json
  6. import os
  7. import random
  8. import re
  9. import sqlite3
  10. import tempfile
  11. from concurrent.futures import ThreadPoolExecutor
  12. from pathlib import Path
  13. from typing import Any, Dict, List, Optional, Tuple
  14. import boto3
  15. import requests
  16. import tinyhost
  17. from tqdm import tqdm
  18. from olmocr.data.renderpdf import render_pdf_to_base64webp
  19. from olmocr.s3_utils import get_s3_bytes, parse_s3_path
  def parse_args():
      """Define the command-line interface and parse ``sys.argv``.

      Returns:
          argparse.Namespace with ``workspace``, ``pages_per_output``,
          ``repeats``, ``pdf_profile``, ``output_dir``, ``max_workers``,
          ``db_path``, ``prolific_code`` (required), ``prolific_csv`` and
          ``read_results``.
      """
      cli = argparse.ArgumentParser(description="Scan OLMO OCR workspace results and create visual samples")

      # (flag, add_argument keyword options), kept in the order they should
      # appear in --help output.
      option_table = [
          ("workspace", {"help": "OLMO OCR workspace path (s3://bucket/workspace)"}),
          ("--pages_per_output", {"type": int, "default": 30, "help": "Number of pages per output file"}),
          ("--repeats", {"type": int, "default": 1, "help": "Number of output files to generate"}),
          ("--pdf_profile", {"help": "AWS profile for accessing PDFs"}),
          ("--output_dir", {"default": "dolma_samples", "help": "Directory to save output HTML files"}),
          ("--max_workers", {"type": int, "default": 4, "help": "Maximum number of worker threads"}),
          (
              "--db_path",
              {
                  "default": "~/s2pdf_url_data/d65142df-6588-4b68-a12c-d468b3761189.csv.db",
                  "help": "Path to the SQLite database containing PDF hash to URL mapping",
              },
          ),
          ("--prolific_code", {"required": True, "help": "Fixed completion code to use for all outputs"}),
          (
              "--prolific_csv",
              {
                  "default": "prolific_codes.csv",
                  "help": "Path to save the file with tinyhost links (one URL per line)",
              },
          ),
          (
              "--read_results",
              {"help": "Path to a CSV file containing previously generated tinyhost links to extract annotations"},
          ),
      ]
      for flag, options in option_table:
          cli.add_argument(flag, **options)

      return cli.parse_args()
  48. # Fixed prolific code is now passed in as a command line argument
  def obfuscate_code(code):
      """Lightly disguise a Prolific code so it is not readable at a glance.

      The code is base64-encoded and the resulting text is reversed. This is
      obfuscation only, not encryption.

      Args:
          code: The plain-text completion code (a ``str``).

      Returns:
          The reversed base64 representation of ``code``.
      """
      b64_text = base64.b64encode(code.encode()).decode()
      # Emit the base64 characters back-to-front.
      return "".join(reversed(b64_text))
  def deobfuscate_code(obfuscated_code):
      """Invert ``obfuscate_code``: reverse the text, then base64-decode it.

      Per the original note on this function, the deobfuscation that matters
      at runtime is done in JavaScript; this is the Python equivalent.

      Args:
          obfuscated_code: Reversed base64 text.

      Returns:
          The decoded string, or the literal ``"ERROR_DECODING"`` when the
          input is not valid base64 or does not decode to valid text.
      """
      reversed_encoded = obfuscated_code[::-1]
      try:
          return base64.b64decode(reversed_encoded).decode()
      except (ValueError, TypeError):
          # binascii.Error (bad base64) and UnicodeDecodeError are both
          # ValueError subclasses; TypeError covers non-str/bytes input.
          # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
          return "ERROR_DECODING"
  def parse_pdf_hash(pretty_pdf_path: str) -> Optional[str]:
      """Extract the PDF hash from an ``s3://ai2-s2-pdfs/...`` path.

      The path is expected to look like
      ``s3://ai2-s2-pdfs/<4 hex chars>/<hex chars>.pdf``; the hash is the two
      hex segments concatenated.

      Args:
          pretty_pdf_path: S3 path of the PDF.

      Returns:
          The concatenated hash, or ``None`` when the path does not start with
          the expected pattern.
      """
      found = re.match(r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf", pretty_pdf_path)
      if found is None:
          return None
      shard, remainder = found.groups()
      return shard + remainder
  def get_original_url(pdf_hash: str, db_path: str) -> Optional[str]:
      """Look up the original URL for a PDF hash in the SQLite database.

      Args:
          pdf_hash: Hash to look up in the ``pdf_mapping`` table.
          db_path: Path to the SQLite file; ``~`` is expanded.

      Returns:
          The ``uri`` column of the first matching row, or ``None`` when the
          hash is empty, the database file is missing, no row matches, or any
          error occurs (errors are printed, not raised).
      """
      if not pdf_hash:
          return None

      try:
          sqlite_db_path = os.path.expanduser(db_path)
          if not os.path.exists(sqlite_db_path):
              print(f"SQLite database not found at {sqlite_db_path}")
              return None

          conn = sqlite3.connect(sqlite_db_path)
          try:
              row = conn.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,)).fetchone()
          finally:
              # Close even when the query raises, so the connection is never leaked.
              conn.close()

          return row[0] if row else None
      except Exception as e:
          # Best-effort lookup: report the problem and carry on without a URL.
          print(f"Error looking up URL for PDF hash {pdf_hash}: {e}")
          return None
  def list_result_files(s3_client, workspace_path):
      """List all JSON result files in the workspace results directory.

      Args:
          s3_client: Client object exposing ``get_paginator("list_objects_v2")``.
          workspace_path: Workspace location, split into bucket and prefix by
              ``parse_s3_path``.

      Returns:
          A list of ``s3://bucket/key`` strings for every object under
          ``<prefix>/results/`` whose key ends in ``.jsonl`` or ``.json``.
      """
      # S3 keys always use "/" separators; os.path.join would insert "\\" on Windows.
      import posixpath

      bucket, prefix = parse_s3_path(workspace_path)
      results_prefix = posixpath.join(prefix, "results").rstrip("/") + "/"

      all_files = []
      paginator = s3_client.get_paginator("list_objects_v2")
      for page in paginator.paginate(Bucket=bucket, Prefix=results_prefix):
          # Pages without a "Contents" entry contribute nothing.
          all_files.extend(
              f"s3://{bucket}/{obj['Key']}"
              for obj in page.get("Contents", [])
              if obj["Key"].endswith((".jsonl", ".json"))
          )
      return all_files
  def get_random_pages(s3_client, result_files, count=30):
      """Sample random pages from the Dolma documents in the result files.

      Each attempt picks a random result file, a random non-blank line in it
      (one JSON document per line), and a random page span from that document.

      Args:
          s3_client: Client passed through to ``get_s3_bytes``.
          result_files: Sequence of result-file paths to sample from.
          count: Number of pages to collect.

      Returns:
          A list of ``(pdf_path, page_num, page_text, result_file)`` tuples.
          It may hold fewer than ``count`` entries, because sampling stops
          after ``count * 3`` attempts.
      """
      random_pages = []

      if not result_files:
          # Nothing to sample from; no point entering the retry loop.
          print("No result files found!")
          print(f"Found {len(random_pages)} random pages from Dolma documents")
          return random_pages

      attempts = 0
      max_attempts = count * 3  # Allow extra attempts to handle potential failures

      while len(random_pages) < count and attempts < max_attempts:
          attempts += 1
          result_file = random.choice(result_files)

          try:
              content = get_s3_bytes(s3_client, result_file)
              # Split on "\n" only (raw U+2028 etc. may appear inside JSON strings)
              # and drop blank lines so empty files are skipped instead of
              # failing in json.loads.
              lines = [ln for ln in content.decode("utf-8").split("\n") if ln.strip()]
              if not lines:
                  continue

              # Each line contains one complete document.
              doc = json.loads(random.choice(lines))

              # A Dolma document has "text", "metadata", and "attributes" fields
              if "text" not in doc or "metadata" not in doc or "attributes" not in doc:
                  print(f"Document in {result_file} is not a valid Dolma document")
                  continue

              # Get the original PDF path from metadata
              pdf_path = doc["metadata"].get("Source-File")
              if not pdf_path:
                  continue

              # Get page spans from attributes
              page_spans = doc["attributes"].get("pdf_page_numbers", [])
              if not page_spans:
                  continue

              page_span = random.choice(page_spans)
              if len(page_span) >= 3:
                  # Page spans are [start_pos, end_pos, page_num]
                  start_pos, end_pos, page_num = page_span[0], page_span[1], page_span[2]
                  page_text = doc["text"][start_pos:end_pos].strip()
                  random_pages.append((pdf_path, page_num, page_text, result_file))
          except Exception as e:
              # Best-effort sampling: report the bad file/document and try again.
              print(f"Error processing {result_file}: {e}")
              continue

      print(f"Found {len(random_pages)} random pages from Dolma documents")
      return random_pages
  def create_presigned_url(s3_client, pdf_path, expiration=3600 * 24 * 7):
      """Generate a presigned ``get_object`` URL for an S3 path.

      Args:
          s3_client: Client object exposing ``generate_presigned_url``.
          pdf_path: S3 path, split into bucket and key by ``parse_s3_path``.
          expiration: Value passed as ``ExpiresIn``; defaults to
              ``3600 * 24 * 7``.

      Returns:
          The presigned URL, or ``None`` if anything fails (the error is
          printed rather than raised).
      """
      try:
          bucket_name, object_key = parse_s3_path(pdf_path)
          request_params = {"Bucket": bucket_name, "Key": object_key}
          return s3_client.generate_presigned_url("get_object", Params=request_params, ExpiresIn=expiration)
      except Exception as e:
          print(f"Error creating presigned URL for {pdf_path}: {e}")
          return None
  160. def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, db_path, prolific_code, resolution=2048):
  161. """Create an HTML file with rendered PDF pages."""
  162. # Obfuscate the provided Prolific code
  163. obfuscated_code = obfuscate_code(prolific_code)
  164. # Get current date and time for the report
  165. current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  166. html_content = f"""
  167. <!DOCTYPE html>
  168. <html lang="en">
  169. <head>
  170. <meta charset="UTF-8">
  171. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  172. <title>OLMO OCR Samples</title>
  173. <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
  174. <style>
  175. :root {{
  176. --primary-color: #2563eb;
  177. --secondary-color: #4b5563;
  178. --border-color: #e5e7eb;
  179. --bg-color: #f9fafb;
  180. --text-color: #111827;
  181. --text-light: #6b7280;
  182. --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
  183. --success-color: #10b981;
  184. --overlay-bg: rgba(0, 0, 0, 0.7);
  185. }}
  186. * {{
  187. box-sizing: border-box;
  188. margin: 0;
  189. padding: 0;
  190. }}
  191. body {{
  192. font-family: 'Inter', sans-serif;
  193. line-height: 1.6;
  194. color: var(--text-color);
  195. background-color: var(--bg-color);
  196. padding: 2rem;
  197. display: flex;
  198. flex-direction: row;
  199. gap: 2rem;
  200. }}
  201. ul {{
  202. margin-left: 2em;
  203. }}
  204. ol {{
  205. margin-left: 2em;
  206. }}
  207. .highlight {{
  208. background-color: #f8f9fa;
  209. border-left: 3px solid #3498db;
  210. padding: 10px 15px;
  211. margin: 15px 0;
  212. }}
  213. .container {{
  214. flex: 2;
  215. max-width: 750px;
  216. }}
  217. header {{
  218. position: sticky;
  219. top: 2rem;
  220. flex: 1;
  221. min-width: 380px;
  222. max-width: 420px;
  223. max-height: calc(100vh - 4rem);
  224. overflow-y: auto;
  225. padding: 1.5rem;
  226. background-color: white;
  227. border-radius: 0.5rem;
  228. box-shadow: var(--card-shadow);
  229. align-self: flex-start;
  230. font-size: small;
  231. }}
  232. header h2 {{
  233. margin-top: 1em;
  234. }}
  235. .important {{
  236. font-weight: bold;
  237. }}
  238. .info-bar {{
  239. background-color: white;
  240. padding: 1rem;
  241. border-radius: 0.5rem;
  242. margin-bottom: 2rem;
  243. box-shadow: var(--card-shadow);
  244. display: flex;
  245. justify-content: space-between;
  246. flex-wrap: wrap;
  247. gap: 1rem;
  248. }}
  249. .info-item {{
  250. flex: 1;
  251. min-width: 200px;
  252. }}
  253. .info-item h3 {{
  254. font-size: 0.6rem;
  255. color: var(--text-light);
  256. margin-bottom: 0.25rem;
  257. }}
  258. .info-item p {{
  259. font-size: 0.6rem;
  260. }}
  261. .page-grid {{
  262. display: grid;
  263. grid-template-columns: 1fr;
  264. gap: 2rem;
  265. }}
  266. .page-container {{
  267. background-color: white;
  268. border-radius: 0.5rem;
  269. overflow: hidden;
  270. box-shadow: var(--card-shadow);
  271. transition: all 0.3s ease;
  272. }}
  273. .page-container.editing {{
  274. box-shadow: 0 0 0 3px var(--primary-color), var(--card-shadow);
  275. }}
  276. .page-info {{
  277. padding: 1rem;
  278. border-bottom: 1px solid var(--border-color);
  279. }}
  280. .page-info h2 {{
  281. font-size: 1rem;
  282. margin-bottom: 0.5rem;
  283. white-space: nowrap;
  284. overflow: hidden;
  285. text-overflow: ellipsis;
  286. }}
  287. .page-info p {{
  288. font-size: 0.875rem;
  289. color: var(--text-light);
  290. }}
  291. .page-image-wrapper {{
  292. padding: 1rem;
  293. display: flex;
  294. justify-content: center;
  295. align-items: center;
  296. background-color: #f3f4f6;
  297. }}
  298. .page-image {{
  299. max-width: 100%;
  300. height: auto;
  301. border: 1px solid var(--border-color);
  302. }}
  303. .s3-link {{
  304. padding: 1rem;
  305. background-color: #f8fafc;
  306. border-top: 1px solid var(--border-color);
  307. font-size: 0.875rem;
  308. color: var(--secondary-color);
  309. word-break: break-all;
  310. }}
  311. .s3-link a {{
  312. color: var(--primary-color);
  313. text-decoration: none;
  314. font-weight: 500;
  315. }}
  316. .s3-link a:hover {{
  317. text-decoration: underline;
  318. }}
  319. /* Annotation elements */
  320. .annotation-interface {{
  321. display: none; /* Hide annotation interface by default */
  322. margin-top: 1rem;
  323. padding: 0.5rem;
  324. border-top: 1px solid var(--border-color);
  325. border-radius: 0.25rem;
  326. background-color: #f8fafc;
  327. }}
  328. .annotation-interface.active {{
  329. display: block; /* Show only the active annotation interface */
  330. }}
  331. .question-container {{
  332. margin-bottom: 1rem;
  333. }}
  334. .question-text {{
  335. font-weight: 500;
  336. margin-bottom: 0.5rem;
  337. }}
  338. /* Button group styling for connected buttons */
  339. .btn-group {{
  340. display: inline-flex;
  341. margin-bottom: 0.5rem;
  342. }}
  343. .btn-group .toggle-button {{
  344. padding: 0.5rem 1rem;
  345. border: 1px solid var(--border-color);
  346. background-color: #f8fafc;
  347. cursor: pointer;
  348. margin: 0;
  349. /* Remove individual border radius so we can set unified ones */
  350. border-radius: 0;
  351. }}
  352. .btn-group .toggle-button:first-child {{
  353. border-right: none;
  354. border-top-left-radius: 0.25rem;
  355. border-bottom-left-radius: 0.25rem;
  356. }}
  357. .btn-group .toggle-button:last-child {{
  358. border-top-right-radius: 0.25rem;
  359. border-bottom-right-radius: 0.25rem;
  360. }}
  361. .btn-group .toggle-button:not(:first-child):not(:last-child) {{
  362. border-right: none;
  363. }}
  364. .toggle-button.active {{
  365. background-color: var(--primary-color);
  366. color: white;
  367. }}
  368. .checkbox-group {{
  369. display: flex;
  370. flex-wrap: wrap;
  371. gap: 0.5rem;
  372. margin-bottom: 1rem;
  373. }}
  374. .checkbox-group label {{
  375. display: flex;
  376. align-items: center;
  377. padding: 0.25rem 0.5rem;
  378. background-color: #f1f5f9;
  379. border-radius: 0.25rem;
  380. cursor: pointer;
  381. font-size: 0.875rem;
  382. border-left: 3px solid transparent;
  383. }}
  384. .checkbox-group label:hover {{
  385. background-color: #e2e8f0;
  386. }}
  387. .checkbox-group input[type="checkbox"] {{
  388. margin-right: 0.5rem;
  389. }}
  390. /* Styling for checkbox groups with headings */
  391. .question-container h4 {{
  392. margin-bottom: 0.5rem;
  393. font-weight: 600;
  394. font-size: 0.9rem;
  395. border-bottom: 1px solid #e5e7eb;
  396. padding-bottom: 0.25rem;
  397. }}
  398. /* Slightly different styling for each group */
  399. .question-container h4:nth-of-type(1) + .checkbox-group label {{
  400. border-left-color: #3b82f6; /* Blue for identifiers */
  401. }}
  402. .question-container h4:nth-of-type(2) + .checkbox-group label {{
  403. border-left-color: #10b981; /* Green for PII with identifier */
  404. }}
  405. .question-container h4:nth-of-type(3) + .checkbox-group label {{
  406. border-left-color: #f59e0b; /* Amber for always-PII */
  407. }}
  408. .continue-button {{
  409. padding: 0.5rem 1rem;
  410. background-color: var(--primary-color);
  411. color: white;
  412. border: none;
  413. border-radius: 0.25rem;
  414. cursor: pointer;
  415. font-weight: 500;
  416. }}
  417. .continue-button:hover {{
  418. background-color: #1d4ed8;
  419. }}
  420. .annotation-interface textarea {{
  421. display: none; /* Hide textarea by default */
  422. width: 100%;
  423. margin-top: 0.5rem;
  424. margin-bottom: 1rem;
  425. padding: 0.5rem;
  426. font-size: 0.875rem;
  427. border: 1px solid var(--border-color);
  428. border-radius: 0.25rem;
  429. }}
  430. .annotation-status {{
  431. display: inline-block;
  432. margin-left: 1rem;
  433. padding: 0.25rem 0.5rem;
  434. border-radius: 0.25rem;
  435. font-size: 0.75rem;
  436. font-weight: 600;
  437. }}
  438. .status-complete {{
  439. background-color: #ecfdf5;
  440. color: var(--success-color);
  441. cursor: pointer;
  442. transition: all 0.2s ease;
  443. }}
  444. .status-complete:hover {{
  445. background-color: #d1fae5;
  446. box-shadow: 0 0 0 2px rgba(16, 185, 129, 0.3);
  447. }}
  448. .status-pending {{
  449. background-color: #fff7ed;
  450. color: #ea580c;
  451. }}
  452. .status-current {{
  453. background-color: #eff6ff;
  454. color: var(--primary-color);
  455. animation: pulse 2s infinite;
  456. }}
  457. @keyframes pulse {{
  458. 0% {{ opacity: 0.6; }}
  459. 50% {{ opacity: 1; }}
  460. 100% {{ opacity: 0.6; }}
  461. }}
  462. .error {{
  463. color: #dc2626;
  464. padding: 1rem;
  465. background-color: #fee2e2;
  466. border-radius: 0.25rem;
  467. }}
  468. .completion-message {{
  469. display: none;
  470. margin: 2rem auto;
  471. padding: 1.5rem;
  472. background-color: #ecfdf5;
  473. border: 1px solid #A7F3D0;
  474. border-radius: 0.5rem;
  475. text-align: center;
  476. color: var(--success-color);
  477. font-weight: 600;
  478. max-width: 500px;
  479. }}
  480. footer {{
  481. margin-top: 3rem;
  482. text-align: center;
  483. color: var(--text-light);
  484. font-size: 0.875rem;
  485. border-top: 1px solid var(--border-color);
  486. padding-top: 1rem;
  487. }}
  488. /* Instructions Modal */
  489. .instructions-modal-overlay {{
  490. position: fixed;
  491. top: 0;
  492. left: 0;
  493. right: 0;
  494. bottom: 0;
  495. background-color: var(--overlay-bg);
  496. display: flex;
  497. align-items: center;
  498. justify-content: center;
  499. z-index: 1000;
  500. opacity: 0;
  501. visibility: hidden;
  502. transition: opacity 0.3s ease, visibility 0.3s ease;
  503. backdrop-filter: blur(3px);
  504. }}
  505. .instructions-modal-overlay.visible {{
  506. opacity: 1;
  507. visibility: visible;
  508. }}
  509. .instructions-modal {{
  510. background-color: white;
  511. border-radius: 8px;
  512. width: 90%;
  513. max-width: 1000px;
  514. max-height: 90vh;
  515. overflow-y: auto;
  516. padding: 2rem;
  517. box-shadow: 0 10px 25px rgba(0, 0, 0, 0.2);
  518. position: relative;
  519. animation: modalAppear 0.3s ease;
  520. }}
  521. @keyframes modalAppear {{
  522. from {{
  523. opacity: 0;
  524. transform: translateY(-20px);
  525. }}
  526. to {{
  527. opacity: 1;
  528. transform: translateY(0);
  529. }}
  530. }}
  531. .instructions-modal-header {{
  532. margin-bottom: 1.5rem;
  533. text-align: center;
  534. }}
  535. .instructions-modal-header h2 {{
  536. font-size: 1.5rem;
  537. color: var(--primary-color);
  538. margin-bottom: 0.5rem;
  539. }}
  540. .instructions-modal-content {{
  541. margin-bottom: 2rem;
  542. overflow-y: auto;
  543. max-height: 60vh;
  544. padding-right: 10px;
  545. border-radius: 4px;
  546. scrollbar-width: thin;
  547. }}
  548. /* Scrollbar styling for webkit browsers */
  549. .instructions-modal-content::-webkit-scrollbar {{
  550. width: 8px;
  551. }}
  552. .instructions-modal-content::-webkit-scrollbar-track {{
  553. background: #f1f1f1;
  554. border-radius: 10px;
  555. }}
  556. .instructions-modal-content::-webkit-scrollbar-thumb {{
  557. background: #c0c0c0;
  558. border-radius: 10px;
  559. }}
  560. .instructions-modal-content::-webkit-scrollbar-thumb:hover {{
  561. background: #a0a0a0;
  562. }}
  563. /* Styling for the cloned sidebar content in the modal */
  564. .instructions-modal-content header {{
  565. position: static;
  566. min-width: unset;
  567. max-width: unset;
  568. max-height: unset;
  569. overflow-y: visible;
  570. padding: 0;
  571. background-color: transparent;
  572. border-radius: 0;
  573. box-shadow: none;
  574. align-self: auto;
  575. font-size: inherit;
  576. }}
  577. .instructions-modal-footer {{
  578. text-align: center;
  579. }}
  580. .instructions-modal-button {{
  581. padding: 0.75rem 2rem;
  582. background-color: var(--primary-color);
  583. color: white;
  584. border: none;
  585. border-radius: 4px;
  586. font-size: 1rem;
  587. font-weight: 600;
  588. cursor: pointer;
  589. transition: background-color 0.2s ease;
  590. }}
  591. .instructions-modal-button:hover {{
  592. background-color: #1d4ed8;
  593. }}
  594. .instructions-modal-button:disabled {{
  595. background-color: #9cb3f0;
  596. cursor: not-allowed;
  597. opacity: 0.7;
  598. }}
  599. @media (max-width: 768px) {{
  600. body {{
  601. padding: 1rem;
  602. flex-direction: column;
  603. }}
  604. header {{
  605. position: static;
  606. max-width: 100%;
  607. margin-left: 0;
  608. margin-bottom: 2rem;
  609. }}
  610. .container {{
  611. max-width: 100%;
  612. }}
  613. .instructions-modal {{
  614. padding: 1.5rem;
  615. width: 95%;
  616. }}
  617. }}
  618. </style>
  619. </head>
  620. <body>
  621. <header>
  622. <h2>Task Overview</h2>
  623. <p>In this task, you will review {len(random_pages)} document pages and determine whether they contain any <span class="important">Personally Identifiable Information (PII)</span>. For each page, please follow the decision flow outlined in the "How to Annotate" section below.</p>
  624. <p>Carefully but efficiently inspect each page and select the appropriate response. You do <span class="important">not</span> need to read every word. Instead, focus on ascertaining the document's intended use and spotting information that would qualify as PII.</p>
  625. <p>The entire task should take about <span class="important">20-25 minutes</span>.</p>
  626. <button id="view-instructions-button" style="background-color: var(--primary-color); color: white; border: none; border-radius: 4px; padding: 0.5rem 1rem; margin: 1rem 0; cursor: pointer;">View Instructions Popup</button>
  627. <h2>How to Annotate</h2>
  628. <p>The current annotation will be highlighted with a blue outline and a set of response buttons will be displayed directly below the page preview. If you are having trouble viewing the displayed page, click the “View Cached PDF” link for a better look. However, <span class="important">DO NOT</span> examine the entire document; <span class="important">ONLY</span> review the single page being previewed (also indicated in the parentheses after “Viewed Cached PDF”).</p>
  629. <p>For each page, complete the following steps:</p>
  630. <ol>
  631. <li>
  632. <p><span class="important">Determine if the document is intended for public release.</span></p>
  633. <p>Inspect the page and answer: "Is this document intended for public release or dissemination?"</p>
  634. <ul>
  635. <li><strong>Yes</strong> - If the document appears to be a publication, research paper, public information, etc.</li>
  636. <li><strong>No</strong> - If the document appears to be private, personal, or not intended for public release</li>
  637. <li><strong>Cannot Read</strong> - If you are unable to read the page (e.g., foreign language, no text, etc.)</li>
  638. <li><strong>Report Content</strong> - If the content is inappropriate or disturbing</li>
  639. </ul>
  640. <p>If you selected "Yes," "Cannot Read," or "Report Content," you will automatically move to the next document. If you selected "No," proceed to Step 2.</p>
  641. </li>
  642. <li>
  643. <p><span class="important">Identify the kind of PII found in the private document (if any).</span></p>
  644. <p>You will be shown a checklist with a set of PII options.</p>
  645. <ul>
  646. <li>Refer to the "How to Identify PII" section below and mark all options that apply.</li>
  647. <li>If you select "Other," describe the kind of other PII in the expanded text box.</li>
  648. </ul>
  649. </li>
  650. <li>
  651. <p><span class="important">Press the blue Continue button to complete your annotation.</span></p>
  652. <p>You will automatically be moved to the next annotation.</p>
  653. </li>
  654. </ol>
  655. <p><span class="important">Note</span>: If you cannot confidently tell that a page is private, treat it as public and do not mark any PII you are unsure about. We anticipate very few private pages or instances of PII in these documents, so erring towards public and no PII minimizes false positives and keeps the review process consistent.</p>
  656. <p>You may review and edit your previous annotations at any time. To do so, press the green Edit button directly above the page preview for the annotation you want to edit.</p>
  657. <p>After completing all {len(random_pages)} document pages, you will receive a Prolific completion code.</p>
  658. <h2>How to Identify PII</h2>
  659. <h3 style="color: #3b82f6;">Identifiers for PII</h3>
  660. <p>Some personal information needs to be accompanied by an <span class="important">identifier</span> to be considered PII. Identifiers that trigger PII include:</p>
  661. <ul>
  662. <li>Names (full names, first/last names, maiden names, nicknames, aliases)</li>
  663. <li>Email Addresses</li>
  664. <li>Phone Numbers</li>
  665. </ul>
  666. <p>Note that the reverse is also true - an identifier must be accompanied by additional personal information or another identifier (e.g., name + email address) to be considered PII.</p>
  667. <br/>
  668. <h3 style="color: #10b981;">PII that must co-occur with an Identifier</h3>
  669. <div class="highlight">
  670. <p>The following types of information should <span class="important">only</span> be marked as PII if they occur <span class="important">alongside an identifier</span> (commonly, a person's name):</p>
  671. <ul>
  672. <li>Addresses (street address, postal code, etc.)</li>
  673. <li>Biographical Information (date of birth, place of birth, gender, sexual orientation, race, ethnicity, citizenship/immigration status, religion)</li>
  674. <li>Location Information (geolocations, specific coordinates)</li>
  675. <li>Employment Information (job titles, workplace names, employment history)</li>
  676. <li>Education Information (school names, degrees, transcripts)</li>
  677. <li>Medical Information (health records, diagnoses, genetic or neural data)</li>
  678. </ul>
  679. </div>
  680. <p>For example, a street address might be personal information, but is not PII by itself; however, a street address associated with a name <span class="important">is</span> regulated PII.</p>
  681. <br/>
  682. <h3 style="color: #f59e0b;">PII that occurs even without an Identifier</h3>
  683. <div class="highlight">
  684. <p>Certain types of sensitive information should always be classified as PII because the information is inherently self-identifying. The following should <span class="important">always be marked as PII</span> even if they do not occur alongside an identifier:</p>
  685. <ul>
  686. <li>Government IDs (SSNs, passport numbers, driver's license numbers, tax IDs)</li>
  687. <li>Financial Information (credit card numbers, bank account/routing numbers)</li>
  688. <li>Biometric Data (fingerprints, retina scans, facial recognition data, voice signatures)</li>
  689. <li>Login information (<span class="important">only</span> mark as PII when a <span class="important">username, password, and login location</span> are present together)</li>
  690. </ul>
  691. </div>
  692. </header>
  693. <div class="container">
  694. <div class="info-bar">
  695. <div class="info-item">
  696. <h3>Generated On</h3>
  697. <p>{current_time}</p>
  698. </div>
  699. <div class="info-item">
  700. <h3>Workspace</h3>
  701. <p title="{workspace_path}">{workspace_path}</p>
  702. </div>
  703. <div class="info-item">
  704. <h3>Sample Size</h3>
  705. <p>{len(random_pages)} pages</p>
  706. </div>
  707. </div>
  708. <div class="page-grid">
  709. """
  710. for i, (pdf_path, page_num, page_text, result_file) in enumerate(tqdm(random_pages, desc="Rendering pages")):
  711. # Get original URL from PDF hash
  712. pdf_hash = parse_pdf_hash(pdf_path)
  713. _original_url = get_original_url(pdf_hash, db_path) if pdf_hash else None
  714. # Create a truncated path for display
  715. display_path = pdf_path
  716. if len(display_path) > 60:
  717. display_path = "..." + display_path[-57:]
  718. # Generate presigned URL
  719. presigned_url = create_presigned_url(pdf_s3_client, pdf_path)
  720. try:
  721. # Download PDF to temp file
  722. bucket, key = parse_s3_path(pdf_path)
  723. with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
  724. pdf_data = pdf_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
  725. temp_file.write(pdf_data)
  726. temp_file_path = temp_file.name
  727. # Render PDF to base64 webp
  728. base64_image = render_pdf_to_base64webp(temp_file_path, page_num, resolution)
  729. # Add CSS class for the first annotation interface to be active by default
  730. active_class = " active" if i == 0 else ""
  731. # Add to HTML with the annotation interface
  732. html_content += f"""
  733. <div class="page-container" data-index="{i}">
  734. <div class="page-info">
  735. <p>{f'<a href="{presigned_url}#page={page_num}" target="_blank">View Cached PDF (page {page_num})</a>' if presigned_url else pdf_path}</p>
  736. <p>
  737. Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
  738. </p>
  739. </div>
  740. <div class="page-image-wrapper">
  741. <img class="page-image" src="data:image/webp;base64,{base64_image}" alt="PDF Page {page_num}" loading="lazy" />
  742. </div>
  743. <div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}" data-pdf-page="{page_num}">
  744. <div class="question-container" id="question1-{i}">
  745. <p class="question-text">Is this document meant for public dissemination? (ex. news article, research paper, etc.)</p>
  746. <span class="btn-group">
  747. <button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
  748. <button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
  749. <button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">Cannot Read</button>
  750. <button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
  751. </span>
  752. </div>
  753. <div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
  754. <p class="question-text">Select any PII found in this public document:</p>
  755. <div class="checkbox-group">
  756. <label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
  757. <label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
  758. <label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
  759. <label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
  760. <label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
  761. </div>
  762. <textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
  763. <button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
  764. </div>
  765. <div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
  766. <p class="question-text">Select any PII found in this private document:</p>
  767. <h4 style="margin-top: 1rem; font-size: 0.9rem; color: #3b82f6;">Identifiers for PII (Select these if found)</h4>
  768. <div class="checkbox-group">
  769. <label><input type="checkbox" class="pii-checkbox" data-value="names" onchange="saveCheckboxes(this)"> Names (full, first, last, nicknames)</label>
  770. <label><input type="checkbox" class="pii-checkbox" data-value="email" onchange="saveCheckboxes(this)"> Email Addresses</label>
  771. <label><input type="checkbox" class="pii-checkbox" data-value="phone" onchange="saveCheckboxes(this)"> Phone Numbers</label>
  772. </div>
  773. <h4 style="margin-top: 1rem; font-size: 0.9rem; color: #10b981;">PII that must co-occur with an Identifier</h4>
  774. <div class="checkbox-group">
  775. <label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
  776. <label><input type="checkbox" class="pii-checkbox" data-value="biographical" onchange="saveCheckboxes(this)"> Biographical Info (DOB, gender, etc.)</label>
  777. <label><input type="checkbox" class="pii-checkbox" data-value="location" onchange="saveCheckboxes(this)"> Location Information</label>
  778. <label><input type="checkbox" class="pii-checkbox" data-value="employment" onchange="saveCheckboxes(this)"> Employment Information</label>
  779. <label><input type="checkbox" class="pii-checkbox" data-value="education" onchange="saveCheckboxes(this)"> Education Information</label>
  780. <label><input type="checkbox" class="pii-checkbox" data-value="medical" onchange="saveCheckboxes(this)"> Medical Information</label>
  781. </div>
  782. <h4 style="margin-top: 1rem; font-size: 0.9rem; color: #f59e0b;">PII that occurs even without an Identifier</h4>
  783. <div class="checkbox-group">
  784. <label><input type="checkbox" class="pii-checkbox" data-value="government-id" onchange="saveCheckboxes(this)"> Government IDs (SSN, passport, etc.)</label>
  785. <label><input type="checkbox" class="pii-checkbox" data-value="financial" onchange="saveCheckboxes(this)"> Financial Information (credit card, bank)</label>
  786. <label><input type="checkbox" class="pii-checkbox" data-value="biometric" onchange="saveCheckboxes(this)"> Biometric Data</label>
  787. <label><input type="checkbox" class="pii-checkbox" data-value="login-info" onchange="saveCheckboxes(this)"> Login Information (username + password)</label>
  788. <label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
  789. </div>
  790. <textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
  791. <button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
  792. </div>
  793. </div>
  794. </div>
  795. """
  796. # Clean up temp file
  797. os.unlink(temp_file_path)
  798. except Exception as e:
  799. # Add CSS class for the first annotation interface to be active by default
  800. active_class = " active" if i == 0 else ""
  801. html_content += f"""
  802. <div class="page-container" data-index="{i}">
  803. <div class="page-info">
  804. <p>{f'<a href="{presigned_url}#page={page_num}" target="_blank">View Cached PDF (page {page_num})</a>' if presigned_url else pdf_path}</p>
  805. <p>
  806. Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
  807. </p>
  808. </div>
  809. <div class="error">Error: {str(e)}</div>
  810. <div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}" data-pdf-page="{page_num}">
  811. <div class="question-container" id="question1-{i}">
  812. <p class="question-text">Is this document intended for public release or dissemination?</p>
  813. <span class="btn-group">
  814. <button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
  815. <button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
  816. <button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">Cannot Read</button>
  817. <button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
  818. </span>
  819. </div>
  820. <div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
  821. <p class="question-text">Select any PII found in this public document:</p>
  822. <div class="checkbox-group">
  823. <label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
  824. <label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
  825. <label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
  826. <label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
  827. <label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
  828. </div>
  829. <textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
  830. <button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
  831. </div>
  832. <div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
  833. <p class="question-text">Select any PII found in this private document:</p>
  834. <h4 style="margin-top: 1rem; font-size: 0.9rem; color: #3b82f6;">Identifiers (Select these if found)</h4>
  835. <div class="checkbox-group">
  836. <label><input type="checkbox" class="pii-checkbox" data-value="names" onchange="saveCheckboxes(this)"> Names (full, first, last, nicknames)</label>
  837. <label><input type="checkbox" class="pii-checkbox" data-value="email" onchange="saveCheckboxes(this)"> Email Addresses</label>
  838. <label><input type="checkbox" class="pii-checkbox" data-value="phone" onchange="saveCheckboxes(this)"> Phone Numbers</label>
  839. </div>
  840. <h4 style="margin-top: 1rem; font-size: 0.9rem; color: #10b981;">PII that requires an identifier above</h4>
  841. <div class="checkbox-group">
  842. <label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
  843. <label><input type="checkbox" class="pii-checkbox" data-value="biographical" onchange="saveCheckboxes(this)"> Biographical Info (DOB, gender, etc.)</label>
  844. <label><input type="checkbox" class="pii-checkbox" data-value="location" onchange="saveCheckboxes(this)"> Location Information</label>
  845. <label><input type="checkbox" class="pii-checkbox" data-value="employment" onchange="saveCheckboxes(this)"> Employment Information</label>
  846. <label><input type="checkbox" class="pii-checkbox" data-value="education" onchange="saveCheckboxes(this)"> Education Information</label>
  847. <label><input type="checkbox" class="pii-checkbox" data-value="medical" onchange="saveCheckboxes(this)"> Medical Information</label>
  848. </div>
  849. <h4 style="margin-top: 1rem; font-size: 0.9rem; color: #f59e0b;">PII that is always sensitive (even without an identifier)</h4>
  850. <div class="checkbox-group">
  851. <label><input type="checkbox" class="pii-checkbox" data-value="government-id" onchange="saveCheckboxes(this)"> Government IDs (SSN, passport, etc.)</label>
  852. <label><input type="checkbox" class="pii-checkbox" data-value="financial" onchange="saveCheckboxes(this)"> Financial Information (credit card, bank)</label>
  853. <label><input type="checkbox" class="pii-checkbox" data-value="biometric" onchange="saveCheckboxes(this)"> Biometric Data</label>
  854. <label><input type="checkbox" class="pii-checkbox" data-value="login-info" onchange="saveCheckboxes(this)"> Login Information (username + password)</label>
  855. <label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
  856. </div>
  857. <textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
  858. <button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
  859. </div>
  860. </div>
  861. </div>
  862. """
  863. html_content += (
  864. """
  865. </div>
  866. <div class="completion-message" id="completion-message">
  867. Thank you! All annotations are complete.<br>
  868. Your Prolific completion code is: <strong id="prolific-code">Loading...</strong>
  869. </div>
  870. <!-- Store the obfuscated code in a hidden element -->
  871. <div id="obfuscated-code" style="display:none;">"""
  872. + obfuscated_code
  873. + """</div>
  874. </div>
  875. <script>
  876. // Using externally injected async functions: fetchDatastore() and putDatastore()
  877. // Track annotation progress
  878. let currentIndex = 0;
  879. const totalPages = document.querySelectorAll('.page-container').length;
  880. // Update progress bar
  /**
   * Reveal the completion banner once every page has been annotated.
   * Reads the script-level `currentIndex` and `totalPages`; never hides the banner.
   */
  function updateProgressBar() {
      const allPagesDone = currentIndex >= totalPages;
      if (!allPagesDone) {
          return;
      }
      const banner = document.getElementById('completion-message');
      banner.style.display = 'block';
  }
  887. // Update status indicators
  /**
   * Refresh the status badge of every page.
   *
   * - The page at `currentIndex` is labelled "Current".
   * - A page is labelled "Edit ✎" (clickable; re-opens it via editAnnotation) when it
   *   precedes `currentIndex` OR already has a primary option selected.
   * - Every other page is labelled "Pending" and has no click handler.
   *
   * The "already has a selection" rule matters after editAnnotation() moves
   * `currentIndex` backwards: relying on the index alone would turn every answered
   * page after the edited one back into "Pending" and drop its edit link.
   */
  function updateStatusIndicators() {
      // Start from a clean slate: everything pending, no click handlers.
      document.querySelectorAll('.annotation-status').forEach(function(status) {
          status.className = 'annotation-status status-pending';
          status.textContent = 'Pending';
          status.onclick = null;
      });

      // Mark the page currently being annotated (absent when currentIndex === totalPages).
      const currentStatus = document.getElementById(`status-${currentIndex}`);
      if (currentStatus) {
          currentStatus.className = 'annotation-status status-current';
          currentStatus.textContent = 'Current';
      }

      // Mark every other page that counts as completed.
      for (let i = 0; i < totalPages; i++) {
          if (i === currentIndex) {
              continue;
          }
          const hasSelection = document.querySelector(
              `.annotation-interface[data-id="page-${i}"] button.primary-option.active`
          ) !== null;
          if (i > currentIndex && !hasSelection) {
              continue;
          }
          const status = document.getElementById(`status-${i}`);
          if (status) {
              status.className = 'annotation-status status-complete';
              status.textContent = 'Edit ✎';
              // `let i` gives each handler its own page index.
              status.onclick = function() { editAnnotation(i); };
          }
      }
  }
  913. // Function to enable editing a previously completed annotation
  /**
   * Re-open a previously completed annotation for editing.
   *
   * @param {number} index - Zero-based page index to edit.
   *
   * Makes the chosen page's annotation interface the only active one, flags its
   * container with the `editing` class, scrolls it into view, sets `currentIndex`
   * to `index`, and refreshes the progress/status displays. Does nothing when no
   * interface exists for `index`.
   */
  function editAnnotation(index) {
      const targetInterface = document.querySelector(`.annotation-interface[data-id="page-${index}"]`);
      if (!targetInterface) {
          return;
      }

      // Close whichever interface is open by its `active` class instead of by
      // `currentIndex`. After the final page is finished, currentIndex === totalPages
      // and there is no element with data-id="page-<totalPages>"; dereferencing that
      // null lookup threw a TypeError and made "Edit" unusable once the task was done.
      document.querySelectorAll('.annotation-interface.active').forEach(openInterface => {
          openInterface.classList.remove('active');
      });

      // Only one container may carry the editing highlight.
      document.querySelectorAll('.page-container').forEach(container => {
          container.classList.remove('editing');
      });

      targetInterface.classList.add('active');

      const activeContainer = document.querySelector(`.page-container[data-index="${index}"]`);
      if (activeContainer) {
          activeContainer.classList.add('editing');
          activeContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
      }

      currentIndex = index;
      updateProgressBar();
      updateStatusIndicators();
  }
  934. // Navigate to the next document
  /**
   * Advance from the page at `currentIndex` to the following page, or finish the
   * task when the current page is the last one.
   *
   * On the last page `currentIndex` is set to `totalPages` and the completion
   * banner is shown and scrolled into view.
   */
  function goToNextDocument() {
      const interfaceFor = position =>
          document.querySelector(`.annotation-interface[data-id="page-${position}"]`);

      // Close the interface being left and clear every editing highlight.
      interfaceFor(currentIndex).classList.remove('active');
      for (const container of document.querySelectorAll('.page-container')) {
          container.classList.remove('editing');
      }

      const hasNextPage = currentIndex < totalPages - 1;
      if (!hasNextPage) {
          // That was the final page: mark everything complete.
          currentIndex = totalPages;
          updateProgressBar();
          updateStatusIndicators();

          const banner = document.getElementById('completion-message');
          banner.style.display = 'block';
          document.getElementById('completion-message').scrollIntoView({ behavior: 'smooth', block: 'center' });
          return;
      }

      currentIndex++;
      interfaceFor(currentIndex).classList.add('active');

      const nextContainer = document.querySelector(`.page-container[data-index="${currentIndex}"]`);
      if (nextContainer) {
          nextContainer.classList.add('editing');
          nextContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
      }

      updateProgressBar();
      updateStatusIndicators();
  }
  965. // Handle text area keydown for Enter key
  /**
   * Keydown handler for the "Other" description textareas.
   * A plain Enter (without Shift) saves the annotation and moves to the next
   * document; every other key is left to the browser's default handling.
   *
   * @param {KeyboardEvent} event - The keydown event.
   * @param {HTMLTextAreaElement} textarea - The textarea that received it.
   */
  function handleTextareaKeydown(event, textarea) {
      const isPlainEnter = event.key === 'Enter' && !event.shiftKey;
      if (!isPlainEnter) {
          return;
      }
      // Suppress the newline that Enter would otherwise insert.
      event.preventDefault();
      const saved = saveFeedback(textarea);
      saved.then(() => {
          goToNextDocument();
      });
  }
  /**
   * Persist the annotation state of one page to the datastore.
   *
   * @param {Element} source - Any element inside (or equal to) the page's
   *     `.annotation-interface`; the interface is located with `closest()`.
   * @returns {Promise<void>} Settles when this page's entry has been written
   *     (rejects if fetchDatastore/putDatastore reject).
   *
   * The entry is stored under the interface's `data-id` ("page-<n>") and holds the
   * selected primary option, the checked public/private PII values, both "Other"
   * descriptions, and the PDF path and page taken from the data attributes.
   *
   * Writes are serialized. Each save is a fetch -> modify -> put round trip, and
   * callers such as togglePrimaryOption() start a save without awaiting it, so two
   * overlapping saves could both fetch the same snapshot and the later put would
   * silently discard the earlier page's entry. The form state is captured
   * synchronously here; only the datastore round trip is queued.
   */
  async function saveFeedback(source) {
      const interfaceDiv = source.closest('.annotation-interface');
      const id = interfaceDiv.getAttribute('data-id');
      const pageIndex = id.split('-')[1];

      // Collect the data-value of every checked box inside one options container.
      const checkedValues = containerId => Array.from(
          interfaceDiv.querySelectorAll('#' + containerId + ' input[type="checkbox"]:checked'),
          checkbox => checkbox.getAttribute('data-value')
      );

      const activePrimaryButton = interfaceDiv.querySelector('button.primary-option.active');

      // Snapshot the form now, before any await, so later UI changes cannot leak in.
      const entry = {
          primaryOption: activePrimaryButton ? activePrimaryButton.getAttribute('data-value') : null,
          publicPiiOptions: checkedValues('public-pii-options-' + pageIndex),
          privatePiiOptions: checkedValues('private-pii-options-' + pageIndex),
          otherPublicDesc: interfaceDiv.querySelector('#other-pii-public-' + pageIndex)?.value || '',
          otherPrivateDesc: interfaceDiv.querySelector('#other-pii-private-' + pageIndex)?.value || '',
          pdfPath: interfaceDiv.getAttribute('data-pdf-path'),
          pdfPage: interfaceDiv.getAttribute('data-pdf-page')
      };

      const persist = async () => {
          const datastore = await fetchDatastore() || {};
          datastore[id] = entry;
          await putDatastore(datastore);
      };

      // Chain onto the previous save; run even if that one failed so a single
      // error does not block every later save.
      saveFeedback.pending = (saveFeedback.pending || Promise.resolve()).then(persist, persist);
      return saveFeedback.pending;
  }
  /**
   * Click handler for the "Continue" buttons: save the enclosing annotation
   * interface, then advance to the next document once the save has resolved.
   *
   * @param {Element} btn - The clicked button.
   */
  function saveThenNext(btn) {
      const owningInterface = btn.closest('.annotation-interface');
      const saved = saveFeedback(owningInterface);
      saved.then(() => {
          goToNextDocument();
      });
  }
  /**
   * Click handler for the primary question buttons (Yes / No / Cannot Read /
   * Report Content).
   *
   * @param {Element} btn - The clicked primary-option button.
   * @param {number} index - Zero-based index of the page the button belongs to.
   *
   * Marks `btn` as the only active primary option, starts a save, and then either
   * reveals the private-PII panel ("no-public") or advances to the next document
   * (every other option). Any option other than "no-public" also clears all PII
   * checkboxes and empties/hides the description textareas first.
   */
  function togglePrimaryOption(btn, index) {
      const interfaceDiv = btn.closest('.annotation-interface');

      // Exactly one primary button is active at a time.
      for (const candidate of interfaceDiv.querySelectorAll('button.primary-option')) {
          candidate.classList.remove('active');
      }
      btn.classList.add('active');

      const option = btn.getAttribute('data-value');

      // These answers carry no PII details, so discard anything entered earlier.
      const discardsPiiDetails = ['yes-public', 'cannot-read', 'report-content'].includes(option);
      if (discardsPiiDetails) {
          for (const checkbox of interfaceDiv.querySelectorAll('.pii-checkbox')) {
              checkbox.checked = false;
          }
          for (const textarea of interfaceDiv.querySelectorAll('textarea')) {
              textarea.value = '';
              textarea.style.display = 'none';
          }
      }

      // Collapse both secondary panels before deciding what to show.
      const publicPanel = document.querySelector(`#public-pii-options-${index}`);
      const privatePanel = document.querySelector(`#private-pii-options-${index}`);
      publicPanel.style.display = 'none';
      privatePanel.style.display = 'none';

      // Record the primary choice right away (not awaited).
      saveFeedback(interfaceDiv);

      if (option === 'no-public') {
          // Private document: ask which PII it contains.
          privatePanel.style.display = 'block';
      } else {
          // Public, unreadable or reported: nothing more to ask.
          goToNextDocument();
      }
  }
  /**
   * Change handler for the "Other" checkbox: show and focus the description
   * textarea of the same question container when checked, hide it when
   * unchecked, then save the checkbox state.
   *
   * @param {HTMLInputElement} checkbox - The "Other" checkbox that changed.
   */
  function toggleOtherTextarea(checkbox) {
      const questionBlock = checkbox.closest('.question-container');
      const descriptionId = questionBlock.querySelector('textarea').id;
      const descriptionBox = document.getElementById(descriptionId);

      descriptionBox.style.display = checkbox.checked ? 'block' : 'none';
      if (checkbox.checked) {
          descriptionBox.focus();
      }

      saveCheckboxes(checkbox);
  }
  /**
   * Change handler for PII checkboxes: save the annotation interface that
   * contains `input`.
   *
   * @param {Element} input - The checkbox that changed.
   * @returns {Promise<void>} The promise returned by saveFeedback().
   */
  function saveCheckboxes(input) {
      return saveFeedback(input.closest('.annotation-interface'));
  }
  1069. // Function to deobfuscate the Prolific code
  /**
   * Recover the Prolific completion code from its obfuscated form.
   * The obfuscated text is a base64 string written backwards.
   *
   * @param {string} obfuscatedCode - Reversed base64 text.
   * @returns {string} The decoded code, or "ERROR_DECODING" when the reversed
   *     text is not valid base64.
   */
  function deobfuscateCode(obfuscatedCode) {
      // Undo the reversal one UTF-16 code unit at a time.
      let base64Text = '';
      for (let pos = obfuscatedCode.length - 1; pos >= 0; pos--) {
          base64Text += obfuscatedCode[pos];
      }

      try {
          return atob(base64Text);
      } catch (decodeError) {
          return "ERROR_DECODING";
      }
  }
  /**
   * Read one parameter from the page URL's query string.
   *
   * @param {string} name - Parameter name.
   * @returns {string|null} The first value for `name`, or null when absent.
   */
  function getQueryParam(name) {
      return new URLSearchParams(window.location.search).get(name);
  }
  // Page start-up: load saved annotations, rebuild the form state from them,
  // and resume at the page after the last one that has a primary answer.
  document.addEventListener("DOMContentLoaded", async function() {
      const datastore = await fetchDatastore() || {};

      // Remember the participant id when Prolific passed one in the URL.
      const prolificPid = getQueryParam('PROLIFIC_PID');
      if (prolificPid) {
          datastore.prolific_pid = prolificPid;
          await putDatastore(datastore);
      }

      // Make sure the "instructions seen" flag exists (defaults to false).
      if (!datastore.hasOwnProperty('instructions_seen')) {
          datastore.instructions_seen = false;
          await putDatastore(datastore);
      }

      // Until saved progress says otherwise, the first page is the one being edited.
      const firstContainer = document.querySelector(`.page-container[data-index="0"]`);
      if (firstContainer) {
          firstContainer.classList.add('editing');
      }
      updateProgressBar();
      updateStatusIndicators();

      // Fill in the (still hidden) completion code.
      const obfuscatedCode = document.getElementById('obfuscated-code').textContent;
      const prolificCode = deobfuscateCode(obfuscatedCode);
      document.getElementById('prolific-code').textContent = prolificCode;

      // Tick the saved checkboxes inside one options panel; reveal the
      // description textarea when "other" is among them.
      const restoreCheckboxes = (savedValues, panelSelector, textareaId) => {
          if (!(savedValues && savedValues.length > 0)) {
              return;
          }
          const panel = document.querySelector(panelSelector);
          savedValues.forEach(value => {
              const checkbox = panel.querySelector(`input[data-value="${value}"]`);
              if (!checkbox) {
                  return;
              }
              checkbox.checked = true;
              if (value === 'other') {
                  document.getElementById(textareaId).style.display = 'block';
              }
          });
      };

      // Rebuild each page's form from its saved entry, if it has one.
      for (const interfaceDiv of document.querySelectorAll('.annotation-interface')) {
          const id = interfaceDiv.getAttribute('data-id');
          const pageIndex = id.split('-')[1];
          const saved = datastore[id];
          if (!saved) {
              continue;
          }

          // Primary answer: activate the matching button, deactivate the rest.
          for (const btn of interfaceDiv.querySelectorAll('button.primary-option')) {
              const value = btn.getAttribute('data-value');
              if (value !== saved.primaryOption) {
                  btn.classList.remove('active');
                  continue;
              }
              btn.classList.add('active');
              // Only a "no-public" answer has a follow-up panel to reopen.
              if (value === 'no-public') {
                  document.querySelector(`#private-pii-options-${pageIndex}`).style.display = 'block';
              }
          }

          restoreCheckboxes(saved.publicPiiOptions, `#public-pii-options-${pageIndex}`, `other-pii-public-${pageIndex}`);
          restoreCheckboxes(saved.privatePiiOptions, `#private-pii-options-${pageIndex}`, `other-pii-private-${pageIndex}`);

          if (saved.otherPublicDesc) {
              document.getElementById(`other-pii-public-${pageIndex}`).value = saved.otherPublicDesc;
          }
          if (saved.otherPrivateDesc) {
              document.getElementById(`other-pii-private-${pageIndex}`).value = saved.otherPrivateDesc;
          }
      }

      // Highest page index that already has a primary answer (-1 when none).
      let lastAnnotatedIndex = -1;
      for (let i = totalPages - 1; i >= 0; i--) {
          const entry = datastore[`page-${i}`];
          if (entry && entry.primaryOption) {
              lastAnnotatedIndex = i;
              break;
          }
      }

      // Fresh session: keep the defaults set up above.
      if (lastAnnotatedIndex < 0) {
          return;
      }

      document.querySelector(`.annotation-interface.active`).classList.remove('active');

      if (lastAnnotatedIndex === totalPages - 1) {
          // The last page is answered: treat the task as complete.
          currentIndex = totalPages;
          const banner = document.getElementById('completion-message');
          banner.style.display = 'block';
          document.getElementById('completion-message').scrollIntoView({ behavior: 'smooth', block: 'center' });
      } else {
          // Resume at the page right after the last answered one.
          currentIndex = lastAnnotatedIndex + 1;
          document.querySelector(`.annotation-interface[data-id="page-${currentIndex}"]`).classList.add('active');

          const activeContainer = document.querySelector(`.page-container[data-index="${currentIndex}"]`);
          if (activeContainer) {
              for (const container of document.querySelectorAll('.page-container')) {
                  container.classList.remove('editing');
              }
              activeContainer.classList.add('editing');
              activeContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
          }
      }

      updateProgressBar();
      updateStatusIndicators();
  });
  1203. // Instructions modal functionality
  1204. // Create modal container
  // Full-screen overlay that hosts the instructions dialog.
  const instructionsModal = Object.assign(document.createElement('div'), {
      className: 'instructions-modal-overlay',
      id: 'instructions-modal'
  });

  // The dialog box itself.
  const modalContent = Object.assign(document.createElement('div'), {
      className: 'instructions-modal'
  });

  // Dialog header: title and a short prompt.
  const modalHeader = Object.assign(document.createElement('div'), {
      className: 'instructions-modal-header'
  });
  modalHeader.innerHTML = `
<h2>Welcome to the OLMO OCR Annotation Task</h2>
<p>Please read these instructions carefully before you begin.</p>
`;

  // Dialog body; the listener registered later tracks scrolling on this element.
  const modalContentSection = Object.assign(document.createElement('div'), {
      className: 'instructions-modal-content'
  });

  // The body reuses the page <header> (the instructions sidebar) as a deep copy,
  // minus the button that would reopen this very dialog.
  const sidebarContent = document.querySelector('header').cloneNode(true);
  const viewInstructionsButton = sidebarContent.querySelector('#view-instructions-button');
  if (viewInstructionsButton) {
      viewInstructionsButton.remove();
  }
  Object.assign(sidebarContent.style, {
      fontSize: '14px',
      lineHeight: '1.5'
  });
  modalContentSection.appendChild(sidebarContent);

  // Dialog footer: the start button is disabled until the scroll check enables it.
  const modalFooter = Object.assign(document.createElement('div'), {
      className: 'instructions-modal-footer'
  });
  modalFooter.innerHTML = `<button id="start-button" class="instructions-modal-button" disabled>I Understand, Begin Task</button>
<p id="scroll-notice" style="margin-top: 10px; font-size: 0.85rem; color: #6b7280;">Please scroll to the bottom to continue</p>`;

  // Put the pieces together: header, body, footer inside the dialog; dialog inside the overlay.
  [modalHeader, modalContentSection, modalFooter].forEach(part => {
      modalContent.appendChild(part);
  });
  instructionsModal.appendChild(modalContent);
  1243. // Track scroll position in instructions and enable button when scrolled to bottom
  1244. let hasReachedBottom = false;
  1245. // Function to check if user has scrolled to the bottom of instructions
  1246. function checkScrollPosition() {
  1247. const contentSection = modalContentSection;
  1248. const scrollableContent = contentSection;
  1249. // Calculate if the user is at the bottom (allowing for small differences)
  1250. // We consider "bottom" when user has scrolled to at least 90% of the content
  1251. const scrollPosition = scrollableContent.scrollTop + scrollableContent.clientHeight;
  1252. const scrollHeight = scrollableContent.scrollHeight;
  1253. const scrollPercentage = (scrollPosition / scrollHeight) * 100;
  1254. if (scrollPercentage >= 90 && !hasReachedBottom) {
  1255. // User has scrolled to the bottom, enable the button
  1256. hasReachedBottom = true;
  1257. const startButton = document.getElementById('start-button');
  1258. if (startButton) {
  1259. startButton.disabled = false;
  1260. // Change the notice text
  1261. const scrollNotice = document.getElementById('scroll-notice');
  1262. if (scrollNotice) {
  1263. scrollNotice.textContent = 'You may now proceed';
  1264. scrollNotice.style.color = '#10b981'; // Success color
  1265. }
  1266. }
  1267. }
  1268. }
  1269. // Add scroll event listener to the modal content
  1270. modalContentSection.addEventListener('scroll', checkScrollPosition);
  1271. document.body.appendChild(instructionsModal);
  1272. // Show the instructions modal when the page loads
  1273. async function showInstructionsModal() {
  1274. const datastore = await fetchDatastore() || {};
  1275. // Check if the task is already completed or instructions have been seen
  1276. const isTaskCompleted = currentIndex >= totalPages;
  1277. const instructionsSeen = datastore.instructions_seen === true;
  1278. // Only show instructions if task is not completed and instructions haven't been seen
  1279. if (!isTaskCompleted && !instructionsSeen) {
  1280. instructionsModal.classList.add('visible');
  1281. }
  1282. }
  1283. // Handle button clicks for instructions modal
  1284. document.addEventListener('click', async function(event) {
  1285. // Start button closes the modal and marks instructions as seen
  1286. if (event.target && event.target.id === 'start-button') {
  1287. // Hide the modal
  1288. instructionsModal.classList.remove('visible');
  1289. // Update datastore to remember that instructions have been seen
  1290. const datastore = await fetchDatastore() || {};
  1291. datastore.instructions_seen = true;
  1292. await putDatastore(datastore);
  1293. }
  1294. // View instructions button shows the modal
  1295. if (event.target && event.target.id === 'view-instructions-button') {
  1296. instructionsModal.classList.add('visible');
  1297. }
  1298. });
  1299. // Show the instructions modal when page loads (after a slight delay)
  1300. setTimeout(showInstructionsModal, 500);
  1301. </script>
  1302. </body>
  1303. </html>
  1304. """
  1305. )
  1306. with open(output_path, "w") as f:
  1307. f.write(html_content)
  1308. print(f"Created HTML output at {output_path}")
  1309. def generate_sample_set(args, i, s3_client, pdf_s3_client, result_files):
  1310. """Generate a single sample set."""
  1311. output_filename = Path(args.output_dir) / f"dolma_samples_{i+1}.html"
  1312. print(f"\nGenerating sample set {i+1} of {args.repeats}")
  1313. # Get random pages
  1314. random_pages = get_random_pages(s3_client, result_files, args.pages_per_output)
  1315. # Use the fixed prolific code from command line arguments
  1316. prolific_code = args.prolific_code
  1317. # Create HTML output with the Prolific code
  1318. create_html_output(random_pages, pdf_s3_client, output_filename, args.workspace, args.db_path, prolific_code)
  1319. return output_filename
  1320. def extract_datastore_url(html_content: str) -> Optional[str]:
  1321. """Extract the presigned datastore URL from HTML content."""
  1322. match = re.search(r'const\s+presignedGetUrl\s*=\s*"([^"]+)"', html_content)
  1323. if match:
  1324. return match.group(1)
  1325. return None
  1326. def extract_page_number_from_html(html_content: str, page_id: str) -> Optional[int]:
  1327. """Extract PDF page number from HTML content for a specific page_id.
  1328. This is a fallback mechanism for older versions of the annotation page
  1329. that didn't store the page number in a data attribute.
  1330. """
  1331. # Try to find the page number in the "View Cached PDF (page X)" text
  1332. # Look for section with this page_id
  1333. page_section_pattern = '<div class="page-container"[^>]*data-index="([^"]*)"[^>]*>.*?<div class="page-info">.*?<a href="[^"]*#page=([0-9]+)"[^>]*>View Cached PDF \\(page ([0-9]+)\\)</a>'
  1334. matches = re.finditer(page_section_pattern, html_content, re.DOTALL)
  1335. for match in matches:
  1336. container_index = match.group(1)
  1337. pdf_page_from_url = match.group(2)
  1338. pdf_page_from_text = match.group(3)
  1339. # Check if this container index matches our page_id (page-X)
  1340. if f"page-{container_index}" == page_id:
  1341. # Both numbers should be the same, but prefer the one from the URL fragment
  1342. try:
  1343. return int(pdf_page_from_url)
  1344. except (ValueError, TypeError):
  1345. try:
  1346. return int(pdf_page_from_text)
  1347. except (ValueError, TypeError):
  1348. pass
  1349. return None
  1350. def fetch_annotations(tinyhost_link: str) -> Tuple[Dict[str, Any], str, str]:
  1351. """Fetch and parse annotations from a tinyhost link."""
  1352. # Request the HTML content
  1353. print(f"Fetching annotations from {tinyhost_link}")
  1354. response = requests.get(tinyhost_link)
  1355. response.raise_for_status()
  1356. html_content = response.text
  1357. # Extract the datastore URL
  1358. datastore_url = extract_datastore_url(html_content)
  1359. if not datastore_url:
  1360. print(f"Could not find datastore URL in {tinyhost_link}")
  1361. return {}, tinyhost_link, html_content
  1362. # Fetch the datastore content
  1363. print(f"Found datastore URL: {datastore_url}")
  1364. try:
  1365. datastore_response = requests.get(datastore_url)
  1366. datastore_response.raise_for_status()
  1367. annotations = datastore_response.json()
  1368. return annotations, tinyhost_link, html_content
  1369. except Exception as e:
  1370. print(f"Error fetching datastore from {datastore_url}: {e}")
  1371. return {}, tinyhost_link, html_content
  1372. def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str, str]]) -> Dict[str, List[Dict[str, Any]]]:
  1373. """Process and categorize annotations by feedback type."""
  1374. results = {
  1375. "public_document": [],
  1376. "private_document": [],
  1377. "cannot_read": [],
  1378. "report_content": [],
  1379. "no_annotation": [],
  1380. }
  1381. # Process each annotation
  1382. for annotations, link, html_content in annotations_by_link:
  1383. # Extract Prolific PID from datastore if available
  1384. prolific_pid = annotations.get("prolific_pid", None)
  1385. for page_id, annotation in annotations.items():
  1386. # Skip non-page entries like prolific_pid
  1387. if page_id == "prolific_pid":
  1388. continue
  1389. # Handle case where annotation might be a boolean or non-dict value
  1390. if not isinstance(annotation, dict) or "primaryOption" not in annotation:
  1391. continue
  1392. primary_option = annotation["primaryOption"]
  1393. pdf_path = annotation.get("pdfPath", "Unknown")
  1394. # Get PDF page number from annotation data
  1395. # This is the actual page number in the PDF that was annotated
  1396. pdf_page = None
  1397. # First try to get it from the annotation data (for new format)
  1398. if annotation.get("pdfPage"):
  1399. try:
  1400. pdf_page = int(annotation.get("pdfPage"))
  1401. except (ValueError, TypeError):
  1402. pass
  1403. # Fallback: try to extract page number from HTML content (for older format)
  1404. if pdf_page is None:
  1405. pdf_page = extract_page_number_from_html(html_content, page_id)
  1406. # Build a result item based on the new annotation structure
  1407. if primary_option == "yes-public":
  1408. # Public document - no PII info collected with new flow
  1409. results["public_document"].append(
  1410. {
  1411. "page_id": page_id,
  1412. "link": link,
  1413. "pdf_path": pdf_path,
  1414. "pdf_page": pdf_page,
  1415. "pii_types": [],
  1416. "has_pii": False,
  1417. "description": "",
  1418. "prolific_pid": prolific_pid,
  1419. }
  1420. )
  1421. elif primary_option == "no-public":
  1422. # Private document with potential PII
  1423. private_pii_options = annotation.get("privatePiiOptions", [])
  1424. other_desc = annotation.get("otherPrivateDesc", "")
  1425. if not private_pii_options:
  1426. # No PII selected in a private document
  1427. results["private_document"].append(
  1428. {
  1429. "page_id": page_id,
  1430. "link": link,
  1431. "pdf_path": pdf_path,
  1432. "pdf_page": pdf_page,
  1433. "pii_types": [],
  1434. "has_pii": False,
  1435. "description": "",
  1436. "prolific_pid": prolific_pid,
  1437. }
  1438. )
  1439. else:
  1440. # PII found in a private document
  1441. results["private_document"].append(
  1442. {
  1443. "page_id": page_id,
  1444. "link": link,
  1445. "pdf_path": pdf_path,
  1446. "pdf_page": pdf_page,
  1447. "pii_types": private_pii_options,
  1448. "has_pii": True,
  1449. "description": other_desc if "other" in private_pii_options else "",
  1450. "prolific_pid": prolific_pid,
  1451. }
  1452. )
  1453. elif primary_option == "cannot-read":
  1454. results["cannot_read"].append({"page_id": page_id, "link": link, "pdf_path": pdf_path, "pdf_page": pdf_page, "prolific_pid": prolific_pid})
  1455. elif primary_option == "report-content":
  1456. results["report_content"].append({"page_id": page_id, "link": link, "pdf_path": pdf_path, "pdf_page": pdf_page, "prolific_pid": prolific_pid})
  1457. else:
  1458. results["no_annotation"].append({"page_id": page_id, "link": link, "pdf_path": pdf_path, "pdf_page": pdf_page, "prolific_pid": prolific_pid})
  1459. return results
  1460. def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]], pdf_s3_client=None):
  1461. """Print a summary report of annotations."""
  1462. total_pages = sum(len(items) for items in annotation_results.values())
  1463. print("\n" + "=" * 80)
  1464. print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
  1465. print("=" * 80)
  1466. # Count pages with PII in public documents
  1467. public_with_pii = [page for page in annotation_results["public_document"] if page.get("has_pii", False)]
  1468. public_without_pii = [page for page in annotation_results["public_document"] if not page.get("has_pii", False)]
  1469. # Count pages with PII in private documents
  1470. private_with_pii = [page for page in annotation_results["private_document"] if page.get("has_pii", False)]
  1471. private_without_pii = [page for page in annotation_results["private_document"] if not page.get("has_pii", False)]
  1472. # Print summary statistics
  1473. print("\nSummary:")
  1474. print(
  1475. f" Public documents (total): {len(annotation_results['public_document'])} ({len(annotation_results['public_document'])/total_pages*100:.1f}% of all pages)"
  1476. )
  1477. print(f" - With PII: {len(public_with_pii)} ({len(public_with_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
  1478. print(
  1479. f" - Without PII: {len(public_without_pii)} ({len(public_without_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)"
  1480. )
  1481. print(
  1482. f" Private documents (total): {len(annotation_results['private_document'])} ({len(annotation_results['private_document'])/total_pages*100:.1f}% of all pages)"
  1483. )
  1484. print(f" - With PII: {len(private_with_pii)} ({len(private_with_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
  1485. print(
  1486. f" - Without PII: {len(private_without_pii)} ({len(private_without_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)"
  1487. )
  1488. print(f" Unreadable pages: {len(annotation_results['cannot_read'])} ({len(annotation_results['cannot_read'])/total_pages*100:.1f}%)")
  1489. print(f" Pages with reported content: {len(annotation_results['report_content'])} ({len(annotation_results['report_content'])/total_pages*100:.1f}%)")
  1490. print(f" Pages without annotation: {len(annotation_results['no_annotation'])} ({len(annotation_results['no_annotation'])/total_pages*100:.1f}%)")
  1491. # With the updated flow, there should be no public documents with PII flags
  1492. # as we don't collect PII information for public documents anymore
  1493. if public_with_pii:
  1494. print("\nNote: With the current annotation flow, public documents should not have PII flags.")
  1495. print(f"Found {len(public_with_pii)} public documents incorrectly marked with PII.")
  1496. # Analyze PII types in private documents
  1497. if private_with_pii:
  1498. # Categorize the PII types for clearer reporting
  1499. pii_categories = {
  1500. "Identifiers": ["names", "email", "phone"],
  1501. "PII requiring identifiers": ["addresses", "biographical", "location", "employment", "education", "medical"],
  1502. "Always sensitive PII": ["government-id", "financial", "biometric", "login-info"],
  1503. }
  1504. # Dictionary to track all PII counts
  1505. pii_counts_private = {}
  1506. for page in private_with_pii:
  1507. for pii_type in page.get("pii_types", []):
  1508. pii_counts_private[pii_type] = pii_counts_private.get(pii_type, 0) + 1
  1509. # Print categorized PII counts
  1510. print("\nPII Types in Private Documents:")
  1511. # Print each category
  1512. for category, pii_types in pii_categories.items():
  1513. print(f"\n {category}:")
  1514. for pii_type in pii_types:
  1515. count = pii_counts_private.get(pii_type, 0)
  1516. if count > 0:
  1517. print(f" - {pii_type}: {count} ({count/len(private_with_pii)*100:.1f}%)")
  1518. # Print any other PII types not in our categories (like "other")
  1519. other_pii = [pii_type for pii_type in pii_counts_private.keys() if not any(pii_type in types for types in pii_categories.values())]
  1520. if other_pii:
  1521. print("\n Other PII types:")
  1522. for pii_type in other_pii:
  1523. count = pii_counts_private.get(pii_type, 0)
  1524. print(f" - {pii_type}: {count} ({count/len(private_with_pii)*100:.1f}%)")
  1525. # With the updated flow, there should be no public documents with PII flags
  1526. # so we can remove this section
  1527. if public_with_pii:
  1528. print("\nNote: Public documents with PII flags found in old annotation results.")
  1529. print("These are from annotation sessions before the workflow change and should be disregarded.")
  1530. # Print detailed report for private documents with PII
  1531. if private_with_pii:
  1532. print("\nDetailed Report - Private Documents with PII:")
  1533. print("-" * 80)
  1534. for i, item in enumerate(private_with_pii, 1):
  1535. pdf_path = item["pdf_path"]
  1536. page_id = item["page_id"]
  1537. # Get the actual PDF page number
  1538. pdf_page = item.get("pdf_page")
  1539. # Generate presigned URL with PDF page number if client is available
  1540. presigned_url = None
  1541. if pdf_s3_client and pdf_path.startswith("s3://"):
  1542. presigned_url = create_presigned_url(pdf_s3_client, pdf_path)
  1543. if presigned_url and pdf_page is not None:
  1544. presigned_url += f"#page={pdf_page}"
  1545. print(f"{i}. PDF: {pdf_path}")
  1546. print(f" Page ID: {page_id}")
  1547. print(f" Link: {item['link']}#{page_id}")
  1548. if presigned_url:
  1549. print(f" Presigned URL: {presigned_url}")
  1550. print(f" PII Types: {', '.join(item['pii_types'])}")
  1551. if item.get("description"):
  1552. print(f" Description: {item['description']}")
  1553. if item.get("prolific_pid"):
  1554. print(f" Prolific PID: {item['prolific_pid']}")
  1555. print("-" * 80)
  1556. print("\nReport complete.")
  1557. def read_and_process_results(args):
  1558. """Read and process results from a previously generated CSV file."""
  1559. try:
  1560. # Read the CSV file
  1561. links = []
  1562. with open(args.read_results, "r") as f:
  1563. for line in f:
  1564. if line.strip():
  1565. links.append(line.strip())
  1566. if not links:
  1567. print(f"No tinyhost links found in {args.read_results}")
  1568. return
  1569. print(f"Found {len(links)} tinyhost links in {args.read_results}")
  1570. # Set up PDF S3 client with profile if specified
  1571. if args.pdf_profile:
  1572. pdf_session = boto3.Session(profile_name=args.pdf_profile)
  1573. pdf_s3_client = pdf_session.client("s3")
  1574. else:
  1575. pdf_s3_client = boto3.client("s3")
  1576. # Fetch and process annotations
  1577. annotations_by_link = []
  1578. for link in tqdm(links, desc="Fetching annotations"):
  1579. try:
  1580. annotations, link_url, html_content = fetch_annotations(link)
  1581. annotations_by_link.append((annotations, link_url, html_content))
  1582. except Exception as e:
  1583. print(f"Error processing {link}: {e}")
  1584. # Process and categorize annotations
  1585. annotation_results = process_annotations(annotations_by_link)
  1586. # Print report with presigned URLs
  1587. print_annotation_report(annotation_results, pdf_s3_client)
  1588. # Save detailed report to file
  1589. output_file = Path(args.output_dir) / "annotation_report.csv"
  1590. print(f"\nSaving detailed report to {output_file}")
  1591. with open(output_file, "w", newline="") as f:
  1592. writer = csv.writer(f)
  1593. writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Presigned URL", "Document Type", "PII Types", "Description", "Prolific PID"])
  1594. for category, items in annotation_results.items():
  1595. for item in items:
  1596. pdf_path = item["pdf_path"]
  1597. # Get the actual PDF page number
  1598. pdf_page = item.get("pdf_page")
  1599. # Generate presigned URL with the PDF page number
  1600. presigned_url = ""
  1601. if pdf_path.startswith("s3://"):
  1602. url = create_presigned_url(pdf_s3_client, pdf_path)
  1603. if url and pdf_page is not None:
  1604. presigned_url = f"{url}#page={pdf_page}"
  1605. elif url:
  1606. presigned_url = url
  1607. if category == "public_document":
  1608. doc_type = "Public"
  1609. pii_types = ", ".join(item.get("pii_types", []))
  1610. description = item.get("description", "")
  1611. elif category == "private_document":
  1612. doc_type = "Private"
  1613. pii_types = ", ".join(item.get("pii_types", []))
  1614. description = item.get("description", "")
  1615. else:
  1616. doc_type = ""
  1617. pii_types = ""
  1618. description = ""
  1619. # Extract Prolific PID from the item if available
  1620. prolific_pid = item.get("prolific_pid", "")
  1621. writer.writerow(
  1622. [
  1623. category,
  1624. item["pdf_path"],
  1625. item["page_id"],
  1626. f"{item['link']}#{item['page_id']}",
  1627. presigned_url,
  1628. doc_type,
  1629. pii_types,
  1630. description,
  1631. prolific_pid,
  1632. ]
  1633. )
  1634. print(f"Report saved to {output_file}")
  1635. except Exception as e:
  1636. print(f"Error processing results: {e}")
  1637. raise
  1638. def main():
  1639. args = parse_args()
  1640. # Check if we're reading results from a previous run
  1641. if args.read_results:
  1642. read_and_process_results(args)
  1643. return
  1644. # Set up S3 clients
  1645. s3_client = boto3.client("s3")
  1646. # Set up PDF S3 client with profile if specified
  1647. if args.pdf_profile:
  1648. pdf_session = boto3.Session(profile_name=args.pdf_profile)
  1649. pdf_s3_client = pdf_session.client("s3")
  1650. else:
  1651. pdf_s3_client = s3_client
  1652. # Create output directory
  1653. output_dir = Path(args.output_dir)
  1654. output_dir.mkdir(exist_ok=True, parents=True)
  1655. # List all result files
  1656. print(f"Listing result files in {args.workspace}/results...")
  1657. result_files = list_result_files(s3_client, args.workspace)
  1658. print(f"Found {len(result_files)} result files")
  1659. # Use ThreadPoolExecutor to parallelize the generation of sample sets
  1660. output_files = []
  1661. if args.repeats > 1:
  1662. print(f"Using ThreadPoolExecutor with {min(args.max_workers, args.repeats)} workers")
  1663. with ThreadPoolExecutor(max_workers=min(args.max_workers, args.repeats)) as executor:
  1664. futures = []
  1665. for i in range(args.repeats):
  1666. future = executor.submit(generate_sample_set, args, i, s3_client, pdf_s3_client, result_files)
  1667. futures.append(future)
  1668. # Wait for all futures to complete and collect results
  1669. for future in futures:
  1670. try:
  1671. output_filename = future.result()
  1672. output_files.append(output_filename)
  1673. print(f"Completed generation of {output_filename}")
  1674. except Exception as e:
  1675. print(f"Error generating sample set: {e}")
  1676. else:
  1677. # If only one repeat, just run it directly
  1678. output_filename = generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files)
  1679. output_files.append(output_filename)
  1680. # Now upload each resulting file into tinyhost
  1681. print("Generated all files, uploading tinyhost links now")
  1682. links = []
  1683. for output_filename in output_files:
  1684. link = tinyhost.tinyhost([str(output_filename)])[0]
  1685. links.append(link)
  1686. print(link)
  1687. # Create CSV file with just the tinyhost links, one per line
  1688. csv_path = args.prolific_csv
  1689. print(f"Writing tinyhost links to {csv_path}")
  1690. with open(csv_path, "w", newline="") as csvfile:
  1691. for link in links:
  1692. csvfile.write(f"{link}\n")
  1693. print(f"Tinyhost links written to {csv_path}")
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()