api.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. from fastapi import FastAPI, File, UploadFile
  2. from fastapi.responses import JSONResponse
  3. import os
  4. import subprocess
  5. import uvicorn
  6. import json
  7. import shutil
  8. from pathlib import Path
  9. app = FastAPI()
  10. # 确保工作目录存在
  11. WORKSPACE = "./workspace"
  12. os.makedirs(WORKSPACE, exist_ok=True)
  13. @app.post("/olmocr/")
  14. async def olmocr(file: UploadFile = File(...)):
  15. """使用olmocr将pdf处理成具有格式化的文本"""
  16. if file.content_type != "application/pdf":
  17. print(file.content_type)
  18. return JSONResponse(
  19. status_code=400,
  20. content={"message": "只支持 PDF 文件"}
  21. )
  22. # 生成保存路径
  23. file_path = os.path.join(WORKSPACE, str(file.filename))
  24. # 保存文件
  25. try:
  26. contents = await file.read()
  27. with open(file_path, "wb") as f:
  28. f.write(contents)
  29. pdf_path = os.path.join(WORKSPACE, str(file.filename))
  30. # 构建命令并执行
  31. cmd = ["python", "-m", "olmocr.pipeline", WORKSPACE, "--pdfs", pdf_path]
  32. # 执行命令,等待完成
  33. process = subprocess.run(
  34. cmd,
  35. stdout=subprocess.PIPE,
  36. stderr=subprocess.PIPE,
  37. text=True,
  38. check=True
  39. )
  40. # 命令输出
  41. log_text = process.stdout
  42. # 检查结果目录
  43. results_dir = os.path.join(WORKSPACE, "results")
  44. # 查找输出文件
  45. output_files = list(Path(results_dir).glob("output_*.jsonl"))
  46. # 读取JSONL文件
  47. output_file = output_files[0]
  48. with open(output_file, "r") as f:
  49. content = f.read().strip()
  50. if not content:
  51. return f"输出文件为空\n\n日志输出:\n{log_text}", "", None, None
  52. # 解析JSON
  53. result = json.loads(content)
  54. extracted_text = result.get("text", "未找到文本内容")
  55. return {
  56. "message": extracted_text,
  57. }
  58. except Exception as e:
  59. return JSONResponse(
  60. status_code=500,
  61. content={"message": f"文件上传失败: {str(e)}"}
  62. )
  63. finally:
  64. # 清空 WORKSPACE 文件夹
  65. try:
  66. # 先关闭文件(如果已打开)
  67. await file.close()
  68. # 删除 WORKSPACE 下的所有文件和文件夹
  69. for filename in os.listdir(WORKSPACE):
  70. file_path = os.path.join(WORKSPACE, filename)
  71. try:
  72. if os.path.isfile(file_path) or os.path.islink(file_path):
  73. os.unlink(file_path)
  74. elif os.path.isdir(file_path):
  75. shutil.rmtree(file_path)
  76. except Exception as e:
  77. print(f"删除 {file_path} 时出错: {e}")
  78. except Exception as e:
  79. print(f"清理工作空间时出错: {e}")
  80. if __name__ == "__main__":
  81. uvicorn.run(app, host="0.0.0.0", port=8000)