  1. #!/usr/bin/env python3
  2. import argparse
  3. import json
  4. import os
  5. from urllib.parse import urlparse
  6. import boto3
  7. def parse_args():
  8. parser = argparse.ArgumentParser(description="Read JSONL files from an S3 prefix, extract text, and write to local .md files.")
  9. parser.add_argument(
  10. "--s3-prefix",
  11. default="s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/",
  12. help="S3 prefix containing the JSONL files (default: s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/)",
  13. )
  14. parser.add_argument("--output-dir", default="output_md", help="Local directory to store output .md files (default: output_md)")
  15. return parser.parse_args()
  16. def main():
  17. args = parse_args()
  18. # Parse the s3-prefix into bucket and prefix
  19. parsed_s3 = urlparse(args.s3_prefix)
  20. # e.g. netloc = 'ai2-oe-data', path = '/jakep/pdfworkspaces/pdelfin_testset/results/'
  21. bucket_name = parsed_s3.netloc
  22. # Remove leading '/' from parsed_s3.path
  23. prefix = parsed_s3.path.lstrip("/")
  24. # Ensure local output directory exists
  25. os.makedirs(args.output_dir, exist_ok=True)
  26. # Initialize S3 client
  27. s3 = boto3.client("s3")
  28. # List all objects under the prefix
  29. paginator = s3.get_paginator("list_objects_v2")
  30. pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
  31. for page in pages:
  32. if "Contents" not in page:
  33. continue
  34. for obj in page["Contents"]:
  35. key = obj["Key"]
  36. # Skip non-jsonl files
  37. if not key.endswith(".jsonl"):
  38. continue
  39. print(f"Processing S3 object: s3://{bucket_name}/{key}")
  40. # Read the S3 object
  41. s3_object = s3.get_object(Bucket=bucket_name, Key=key)
  42. # s3_object['Body'] is a StreamingBody, so we can read it line-by-line
  43. body_stream = s3_object["Body"].iter_lines()
  44. for line in body_stream:
  45. if not line.strip():
  46. continue
  47. try:
  48. record = json.loads(line)
  49. except json.JSONDecodeError:
  50. print("Warning: Failed to decode JSON line.")
  51. continue
  52. # Extract text
  53. text_content = record.get("text", "")
  54. if not text_content.strip():
  55. # If there's no text, skip
  56. continue
  57. # Derive the output filename based on the "Source-File" metadata
  58. metadata = record.get("metadata", {})
  59. source_file = metadata.get("Source-File", "")
  60. # Example: source_file = 's3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
  61. # We want to end up with: 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8_pdelf.md'
  62. # 1) Extract just the filename from the path
  63. # 2) Remove '.pdf'
  64. # 3) Append '_pdelf.md'
  65. source_filename = os.path.basename(source_file) # e.g. 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
  66. if source_filename.lower().endswith(".pdf"):
  67. source_filename = source_filename[:-4] # remove .pdf
  68. output_filename = f"{source_filename}_pdelf.md"
  69. output_path = os.path.join(args.output_dir, output_filename)
  70. # Append the text to the corresponding file
  71. # If you want to overwrite instead, change mode to 'w'
  72. with open(output_path, "a", encoding="utf-8") as f:
  73. f.write(text_content + "\n")
  74. # Optional: Print or log what you've written
  75. # print(f"Appended text to {output_path}")
  76. print("Done processing all JSONL files.")
  77. if __name__ == "__main__":
  78. main()