jsonl_to_markdown.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. import json
  2. import os
  3. import sys
# Convert a JSONL file into a set of Markdown files.
# Each line of the JSONL file is parsed as JSON, its 'text' field is extracted,
# and the text is saved as `line_<N>.md`, where N is the 1-based line number.
# Lines that cannot be processed are reported on stdout and skipped.
  8. def jsonl_to_markdown(input_file, output_dir):
  9. """
  10. Reads a JSONL file, extracts the 'text' field from each line, and saves it as a Markdown file.
  11. Args:
  12. input_file (str): Path to the input JSONL file.
  13. output_dir (str): Directory to save the Markdown files.
  14. """
  15. if not os.path.exists(output_dir):
  16. os.makedirs(output_dir)
  17. with open(input_file, "r", encoding="utf-8") as file:
  18. for i, line in enumerate(file):
  19. try:
  20. # Parse the JSON line
  21. data = json.loads(line)
  22. text_content = data.get("text", "")
  23. # Save to a Markdown file
  24. output_file = os.path.join(output_dir, f"line_{i + 1}.md")
  25. with open(output_file, "w", encoding="utf-8") as md_file:
  26. md_file.write(text_content)
  27. print(f"Extracted and saved line {i + 1} to {output_file}")
  28. except json.JSONDecodeError as e:
  29. print(f"Error decoding JSON on line {i + 1}: {e}")
  30. except Exception as e:
  31. print(f"Unexpected error on line {i + 1}: {e}")
  32. # Example usage
  33. # input_jsonl_file = "/path/to/test.jsonl" # Replace with the actual path to your JSONL file
  34. # output_directory = "/path/to/output_markdown" # Replace with the desired output directory
  35. # jsonl_to_markdown(input_jsonl_file, output_directory)
  36. # This is the main entrypoint to use the script from the command line.
  37. # It takes two arguments: the input JSONL file and the output directory.
  38. # The script will create the output directory if it does not exist.
  39. if __name__ == "__main__":
  40. if len(sys.argv) != 3:
  41. print("Usage: python jsonl_to_markdown.py <input_file> <output_dir>")
  42. sys.exit(1)
  43. input_file = sys.argv[1]
  44. output_dir = sys.argv[2]
  45. jsonl_to_markdown(input_file, output_dir)