| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- from fastapi import FastAPI, File, UploadFile
- from fastapi.responses import JSONResponse
- import os
- import subprocess
- import uvicorn
- import json
- import shutil
- from pathlib import Path
app = FastAPI()

# Staging directory for uploaded PDFs and the olmocr pipeline's output;
# it is created at import time if it does not exist yet.
WORKSPACE = "./workspace"
Path(WORKSPACE).mkdir(parents=True, exist_ok=True)
@app.post("/olmocr/")
async def olmocr(file: UploadFile = File(...)):
    """Convert an uploaded PDF to formatted text with the olmocr pipeline.

    The upload is saved into ``WORKSPACE``, ``python -m olmocr.pipeline`` is
    run on it, and the ``text`` field of the record(s) found in the first
    ``results/output_*.jsonl`` file is returned. ``WORKSPACE`` is emptied
    again before the function returns, whether or not the conversion worked.

    Args:
        file: The uploaded file; its content type must be ``application/pdf``.

    Returns:
        ``{"message": <extracted text>}`` on success. Otherwise a
        ``JSONResponse`` whose body is ``{"message": <error description>}``
        with status 400 (upload is not a PDF) or 500 (saving the upload,
        running the pipeline, or reading its output failed).
    """
    if file.content_type != "application/pdf":
        print(file.content_type)
        return JSONResponse(
            status_code=400,
            content={"message": "只支持 PDF 文件"},
        )

    # The filename is supplied by the client. Keep only its last path
    # component (treating "\\" as a separator too) so the upload can never be
    # written outside WORKSPACE, and fall back to a fixed name when nothing
    # usable is left.
    safe_name = os.path.basename(str(file.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "upload.pdf"
    pdf_path = os.path.join(WORKSPACE, safe_name)

    try:
        # Persist the upload so the pipeline can read it from disk.
        contents = await file.read()
        with open(pdf_path, "wb") as f:
            f.write(contents)

        cmd = ["python", "-m", "olmocr.pipeline", WORKSPACE, "--pdfs", pdf_path]

        # NOTE: subprocess.run is synchronous, so this coroutine does not
        # yield to the event loop until the pipeline process has exited.
        process = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        log_text = process.stdout

        # Sorted so the choice of file is deterministic if several exist.
        results_dir = os.path.join(WORKSPACE, "results")
        output_files = sorted(Path(results_dir).glob("output_*.jsonl"))
        if not output_files:
            return JSONResponse(
                status_code=500,
                content={"message": f"未找到输出文件\n\n日志输出:\n{log_text}"},
            )

        # Explicit encoding: the extracted text must not depend on the
        # server's locale.
        with open(output_files[0], "r", encoding="utf-8") as f:
            content = f.read().strip()
        if not content:
            return JSONResponse(
                status_code=500,
                content={"message": f"输出文件为空\n\n日志输出:\n{log_text}"},
            )

        # JSONL holds one JSON document per line. Split on "\n" only:
        # str.splitlines() would also break on characters such as U+2028 that
        # may appear unescaped inside a JSON string.
        records = [
            json.loads(line) for line in content.split("\n") if line.strip()
        ]
        texts = [record.get("text", "未找到文本内容") for record in records]
        if len(texts) == 1:
            extracted_text = texts[0]
        else:
            extracted_text = "\n".join(str(text) for text in texts)
        return {
            "message": extracted_text,
        }
    except subprocess.CalledProcessError as e:
        # str(e) only carries the command and exit status; log the captured
        # stderr so the actual cause of the failure is visible server-side.
        print(e.stderr)
        return JSONResponse(
            status_code=500,
            content={"message": f"PDF 处理失败: {str(e)}"},
        )
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"message": f"文件上传失败: {str(e)}"},
        )
    finally:
        # Empty WORKSPACE so nothing from this request is left behind.
        try:
            await file.close()

            for entry in os.listdir(WORKSPACE):
                entry_path = os.path.join(WORKSPACE, entry)
                try:
                    if os.path.isfile(entry_path) or os.path.islink(entry_path):
                        os.unlink(entry_path)
                    elif os.path.isdir(entry_path):
                        shutil.rmtree(entry_path)
                except Exception as e:
                    # Best effort: keep deleting the remaining entries.
                    print(f"删除 {entry_path} 时出错: {e}")
        except Exception as e:
            print(f"清理工作空间时出错: {e}")
-
-
-
if __name__ == "__main__":
    # Run as a script: serve the API with uvicorn on every network
    # interface, port 8000.
    uvicorn.run(app, port=8000, host="0.0.0.0")
-
|