# app.py - Gradio web front end for olmOCR-based PDF text extraction.
  1. import os
  2. import json
  3. import gradio as gr
  4. import subprocess
  5. import pandas as pd
  6. from pathlib import Path
  7. import shutil
  8. import time
  9. import re
  10. # 创建工作目录
  11. WORKSPACE_DIR = "olmocr_workspace"
  12. os.makedirs(WORKSPACE_DIR, exist_ok=True)
  13. def modify_html_for_better_display(html_content):
  14. """修改HTML以便在Gradio中更好地显示"""
  15. if not html_content:
  16. return html_content
  17. # 增加容器宽度
  18. html_content = html_content.replace('<div class="container">',
  19. '<div class="container" style="max-width: 100%; width: 100%;">')
  20. # 增加文本大小
  21. html_content = html_content.replace('<style>',
  22. '<style>\nbody {font-size: 16px;}\n.text-content {font-size: 16px; line-height: 1.5;}\n')
  23. # 调整图像和文本部分的大小比例
  24. html_content = html_content.replace('<div class="row">',
  25. '<div class="row" style="display: flex; flex-wrap: wrap;">')
  26. html_content = html_content.replace('<div class="col-md-6">',
  27. '<div class="col-md-6" style="flex: 0 0 50%; max-width: 50%; padding: 15px;">')
  28. # 增加页面之间的间距
  29. html_content = html_content.replace('<div class="page">',
  30. '<div class="page" style="margin-bottom: 30px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">')
  31. # 增加图像大小
  32. html_content = re.sub(r'<img([^>]*)style="([^"]*)"',
  33. r'<img\1style="max-width: 100%; height: auto; \2"',
  34. html_content)
  35. # 添加缩放控制
  36. zoom_controls = """
  37. <div style="position: fixed; bottom: 20px; right: 20px; background: #fff; padding: 10px; border-radius: 5px; box-shadow: 0 0 10px rgba(0,0,0,0.2); z-index: 1000;">
  38. <button onclick="document.body.style.zoom = parseFloat(document.body.style.zoom || 1) + 0.1;" style="margin-right: 5px;">放大</button>
  39. <button onclick="document.body.style.zoom = parseFloat(document.body.style.zoom || 1) - 0.1;">缩小</button>
  40. </div>
  41. """
  42. html_content = html_content.replace('</body>', f'{zoom_controls}</body>')
  43. return html_content
  44. def process_pdf(pdf_file):
  45. """处理PDF文件并返回结果"""
  46. if pdf_file is None:
  47. return "请上传PDF文件", "", None, None
  48. # 创建一个唯一的工作目录
  49. timestamp = int(time.time())
  50. work_dir = os.path.join(WORKSPACE_DIR, f"job_{timestamp}")
  51. os.makedirs(work_dir, exist_ok=True)
  52. # 复制PDF文件
  53. pdf_path = os.path.join(work_dir, "input.pdf")
  54. shutil.copy(pdf_file, pdf_path)
  55. # 构建命令并执行
  56. cmd = ["python", "-m", "olmocr.pipeline", work_dir, "--pdfs", pdf_path]
  57. try:
  58. # 执行命令,等待完成
  59. process = subprocess.run(
  60. cmd,
  61. stdout=subprocess.PIPE,
  62. stderr=subprocess.PIPE,
  63. text=True,
  64. check=True
  65. )
  66. # 命令输出
  67. log_text = process.stdout
  68. # 检查结果目录
  69. results_dir = os.path.join(work_dir, "results")
  70. if not os.path.exists(results_dir):
  71. return f"处理完成,但未生成结果目录\n\n日志输出:\n{log_text}", "", None, None
  72. # 查找输出文件
  73. output_files = list(Path(results_dir).glob("output_*.jsonl"))
  74. if not output_files:
  75. return f"处理完成,但未找到输出文件\n\n日志输出:\n{log_text}", "", None, None
  76. # 读取JSONL文件
  77. output_file = output_files[0]
  78. with open(output_file, "r") as f:
  79. content = f.read().strip()
  80. if not content:
  81. return f"输出文件为空\n\n日志输出:\n{log_text}", "", None, None
  82. # 解析JSON
  83. result = json.loads(content)
  84. extracted_text = result.get("text", "未找到文本内容")
  85. # 生成HTML预览
  86. try:
  87. preview_cmd = ["python", "-m", "olmocr.viewer.dolmaviewer", str(output_file)]
  88. subprocess.run(preview_cmd, check=True)
  89. except Exception as e:
  90. log_text += f"\n生成HTML预览失败: {str(e)}"
  91. # 查找HTML文件
  92. html_files = list(Path("dolma_previews").glob("*.html"))
  93. html_content = ""
  94. if html_files:
  95. try:
  96. with open(html_files[0], "r", encoding="utf-8") as hf:
  97. html_content = hf.read()
  98. # 修改HTML以更好地显示
  99. html_content = modify_html_for_better_display(html_content)
  100. except Exception as e:
  101. log_text += f"\n读取HTML预览失败: {str(e)}"
  102. # 创建元数据表格
  103. metadata = result.get("metadata", {})
  104. meta_rows = []
  105. for key, value in metadata.items():
  106. meta_rows.append([key, value])
  107. df = pd.DataFrame(meta_rows, columns=["属性", "值"])
  108. # return log_text, extracted_text, html_content, df
  109. return extracted_text
  110. except subprocess.CalledProcessError as e:
  111. return f"命令执行失败: {e.stderr}", "", None, None
  112. except Exception as e:
  113. return f"处理过程中发生错误: {str(e)}", "", None, None
# Build the Gradio interface.
# NOTE(review): the nesting below was reconstructed from statement order
# because the original indentation was lost -- confirm against the real file.
with gr.Blocks(title="鼎盛方圆 PDF提取工具") as app:
    gr.Markdown("# 鼎盛方圆 PDF文本提取工具")
    with gr.Row():
        # Left column: the PDF upload widget and the button that starts a job.
        with gr.Column(scale=1):
            pdf_input = gr.File(label="上传PDF文件", file_types=[".pdf"])
            process_btn = gr.Button("处理PDF", variant="primary")
        # Right column (scale=2): tabbed result area.
        with gr.Column(scale=2):
            tabs = gr.Tabs()
            with tabs:
                # Only the extracted-text tab is active.
                with gr.TabItem("提取文本"):
                    text_output = gr.Textbox(label="提取的文本", lines=20, interactive=True)
                # Disabled tabs (HTML preview, metadata, log), kept for reference:
                # with gr.TabItem("HTML预览", id="html_preview_tab"):
                #     # Use a larger HTML component
                #     html_output = gr.HTML(label="HTML预览", elem_id="html_preview_container")
                # with gr.TabItem("元数据"):
                #     meta_output = gr.DataFrame(label="文档元数据")
                # with gr.TabItem("日志"):
                #     log_output = gr.Textbox(label="处理日志", lines=15, interactive=False)
    # CSS sizing for the HTML preview container. It targets
    # #html_preview_container, the elem_id of the HTML component that is
    # commented out above, so it matches nothing while that tab is disabled.
    gr.HTML("""
<style>
#html_preview_container {
height: 800px;
width: 100%;
overflow: auto;
border: 1px solid #ddd;
border-radius: 4px;
}
#html_preview_container iframe {
width: 100%;
height: 100%;
border: none;
}
</style>
""")
    # Usage instructions shown to the user.
    gr.Markdown("""
## 使用说明
1. 上传PDF文件
2. 点击"处理PDF"按钮
3. 等待处理完成
4. 查看提取的文本
## 注意
- 处理过程可能需要几分钟,请耐心等待
- 首次运行会下载模型(约7GB)
""")
    # Wire the button to process_pdf (the original author's note calls this
    # "blocking mode"). Only the text box is bound as an output; the
    # four-output binding is kept below, commented out.
    process_btn.click(
        fn=process_pdf,
        inputs=pdf_input,
        # outputs=[log_output, text_output, html_output, meta_output],
        outputs=[text_output],
        api_name="process"
    )
  169. # 启动应用
  170. if __name__ == "__main__":
  171. app.launch(
  172. server_name='0.0.0.0',
  173. server_port=5000,
  174. share=False
  175. )