s2orc_extractor.sh 1.1 KB

123456789101112131415161718192021222324252627282930313233343536
  1. #!/bin/bash
  2. # Define the output file for the metadata.sha1 fields
  3. OUTPUT_FILE="s2orc_pdfs_v2.txt"
  4. # Clear the output file if it already exists
  5. > "$OUTPUT_FILE"
  6. # Create a temporary directory for partial outputs
  7. temp_output_dir=$(mktemp -d)
  8. # Ensure the temporary directory is cleaned up on exit or error
  9. trap 'rm -rf "$temp_output_dir"' EXIT
  10. # Export the temporary output directory variable for use in xargs
  11. export temp_output_dir
  12. echo "temp dir $temp_output_dir"
  13. # Find all .gz files recursively from the current directory
  14. find 'split=train' -type f -name "*.gz" | \
  15. xargs -P 30 -I{} bash -c '
  16. gz_file="$1"
  17. partial_output="$temp_output_dir/$(basename "$gz_file").txt"
  18. # Stream uncompressed data directly into jq and format the output
  19. gunzip -c "$gz_file" | jq -r '"'"'
  20. select(.metadata.sha1 != null) |
  21. "s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf"
  22. '"'"' >> "$partial_output"
  23. ' _ {}
  24. # Concatenate all partial outputs into the final output file
  25. cat "$temp_output_dir"/*.txt >> "$OUTPUT_FILE"
  26. echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."