import json
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import uvicorn
from fastapi import FastAPI, File, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse

app = FastAPI()

# Make sure the shared parent working directory exists at import time.
WORKSPACE = "./workspace"
os.makedirs(WORKSPACE, exist_ok=True)


@app.post("/olmocr/")
async def olmocr(file: UploadFile = File(...)):
    """Run the olmocr pipeline on an uploaded PDF and return the extracted text.

    Each request works in its own temporary directory created under
    ``WORKSPACE``, so concurrent requests cannot read or delete each
    other's files. The directory is removed when the request finishes.

    Args:
        file: The uploaded file; its content type must be ``application/pdf``.

    Returns:
        ``{"message": <text>}`` on success, where ``<text>`` is the ``text``
        field of the first record in the pipeline's JSONL output. Otherwise a
        ``JSONResponse`` with status 400 (not a PDF) or 500 (pipeline failure,
        missing or empty output, or any other error) whose body is
        ``{"message": <description>}``.
    """
    if file.content_type != "application/pdf":
        print(file.content_type)
        return JSONResponse(
            status_code=400,
            content={"message": "只支持 PDF 文件"},
        )

    job_dir = None
    try:
        # A private directory per request: isolates concurrent uploads and
        # lets cleanup remove exactly what this request created.
        job_dir = tempfile.mkdtemp(prefix="job_", dir=WORKSPACE)

        # Never build a path from the client-supplied filename: a value such
        # as "../../x" would escape the workspace. A fixed name is sufficient
        # because the directory is private to this request.
        pdf_path = os.path.join(job_dir, "input.pdf")
        contents = await file.read()
        with open(pdf_path, "wb") as f:
            f.write(contents)

        cmd = ["python", "-m", "olmocr.pipeline", job_dir, "--pdfs", pdf_path]

        # subprocess.run blocks until the pipeline exits; run it in a worker
        # thread so the event loop keeps serving other requests meanwhile.
        process = await run_in_threadpool(
            subprocess.run,
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        log_text = process.stdout

        # The pipeline is expected to write its results below
        # <workspace>/results as output_*.jsonl files.
        results_dir = os.path.join(job_dir, "results")
        output_files = sorted(Path(results_dir).glob("output_*.jsonl"))
        if not output_files:
            return JSONResponse(
                status_code=500,
                content={"message": f"未找到输出文件\n\n日志输出:\n{log_text}"},
            )

        with open(output_files[0], "r", encoding="utf-8") as f:
            content = f.read().strip()
        if not content:
            return JSONResponse(
                status_code=500,
                content={"message": f"输出文件为空\n\n日志输出:\n{log_text}"},
            )

        # JSONL holds one JSON document per line. Only one PDF was submitted,
        # so the first record is the one for this upload.
        result = json.loads(content.splitlines()[0])
        extracted_text = result.get("text", "未找到文本内容")
        return {
            "message": extracted_text,
        }
    except subprocess.CalledProcessError as e:
        # Keep the pipeline's stderr in the server log; str(e) only carries
        # the command and its exit status.
        print(f"olmocr pipeline failed: {e}\n{e.stderr}")
        return JSONResponse(
            status_code=500,
            content={"message": f"文件上传失败: {str(e)}"},
        )
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"message": f"文件上传失败: {str(e)}"},
        )
    finally:
        # Best-effort cleanup: failures here are logged, never raised, so they
        # cannot mask the response produced above.
        try:
            await file.close()
        except Exception as e:
            print(f"清理工作空间时出错: {e}")
        if job_dir is not None:
            try:
                shutil.rmtree(job_dir)
            except Exception as e:
                print(f"删除 {job_dir} 时出错: {e}")


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)