Merge pull request #879 from opendatalab/release-0.9.1

Release 0.9.1

Merge pull request #879 from opendatalab/release-0.9.1
Release 0.9.1
069bcfe6 · Xiaomeng Zhao · GitHub · 8ee1da82 · bff7bd93 · 069bcfe6
Unverified Commit 069bcfe6 authored Nov 06, 2024 by Xiaomeng Zhao Committed by GitHub Nov 06, 2024
75 changed files
--- a/.gitattributes
+++ b/.gitattributes
+*.js linguist-vendored
+*.mjs linguist-vendored
+*.html linguist-documentation
+*.css linguist-vendored
+*.scss linguist-vendored
\ No newline at end of file
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -78,9 +78,9 @@ body:
      #multiple: false
      options:
        -
-        - "0.6.x"
        - "0.7.x"
        - "0.8.x"
+        - "0.9.x"
    validations:
      required: true


--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -10,7 +10,7 @@ formats:

 python:
  install:
-    - requirements: docs/zh_cn/requirements.txt
+    - requirements: next_docs/zh_cn/requirements.txt

 sphinx:
-  configuration: docs/zh_cn/conf.py
+  configuration: next_docs/zh_cn/conf.py
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
 [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMzAiIGhlaWdodD0iMzAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgZmlsbD0ibm9uZSI+CiA8ZGVmcz4KICA8bGluZWFyR3JhZGllbnQgeTI9IjAuNTMzNjciIHgyPSIxLjAwMDQiIHkxPSIwLjI5MjE5IiB4MT0iLTAuMTEyNjgiIGlkPSJhIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogIDxsaW5lYXJHcmFkaWVudCB5Mj0iMC41OTc1NyIgeDI9IjEuMDExMzciIHkxPSIwLjExMDIzIiB4MT0iLTAuMDg0NzQiIGlkPSJiIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogPC9kZWZzPgogPGc+CiAgPHRpdGxlPkxheWVyIDE8L3RpdGxlPgogIDxwYXRoIGlkPSJzdmdfMSIgZmlsbD0idXJsKCNhKSIgZD0ibTEuNjIzLDEyLjA2N2EwLjQ4NCwwLjQ4NCAwIDAgMSAwLjA3LC0wLjM4NGw1LjMxLC03Ljg5NWMwLjA2OCwtMC4xIDAuMTcsLTAuMTcyIDAuMjg4LC0wLjJsMTQuMzc3LC0zLjQ3NGEwLjQ4NCwwLjQ4NCAwIDAgMSAwLjU4NCwwLjM1N2wzLjY2MiwxNS4xNTJjMS40NzcsNi4xMTQgLTIuMjgxLDEyLjI2NyAtOC4zOTQsMTMuNzQ1Yy02LjExNCwxLjQ3NyAtMTIuMjY3LC0yLjI4MSAtMTMuNzQ1LC04LjM5NWwtMi4xNTIsLTguOTA2eiIgb3BhY2l0eT0iMC40Ii8+CiAgPHBhdGggaWQ9InN2Z18yIiBmaWxsPSJ1cmwoI2IpIiBkPSJtNS44MjYsOC42NzNjMCwtMC4xMzYgMC4wNTcsLTAuMjY2IDAuMTU3LC0wLjM1OGw3LjAxNywtNi40MjVhMC40ODQsMC40ODQgMCAwIDEgMC4zMjcsLTAuMTI3bDE0Ljc5LDBjMC4yNjgsMCAwLjQ4NSwwLjIxNiAwLjQ4NSwwLjQ4NGwwLDE1LjU4OWMwLDYuMjkgLTUuMDk5LDExLjM4OCAtMTEuMzg4LDExLjM4OGMtNi4yOSwwIC0xMS4zODgsLTUuMDk5IC0xMS4zODgsLTExLjM4OGwwLC05LjE2M3oiLz4KICA8cGF0aCBpZD0ic3ZnXzMiIGZpbGw9IiM1RDc2RkYiIGQ9Im0xMi4zMzEsOC43NTNsLTYuMzgzLC0wLjM5OGw3LjEyMiwtNi41MmwwLjI5OSw1Ljg5MmEwLjk3OCwwLjk3OCAwIDAgMSAtMS4wMzgsMS4wMjZ6Ii8+CiAgPHBhdGggaWQ9InN2Z180IiBmaWxsPSIjMDAyOEZEIiBkPSJtMjAuNDE2LDE1LjAyMmwwLDEuNzExYTIuNDA0LDIuNDA0IDAgMCAxIC00LjgwOCwwbDAsLTQuMjc4bC0yLjgxLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDEgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEyLDB6IiBjbGlwLXJ1bGU9ImV2ZW5vZGQiIGZpbGwtcnVsZT0iZXZlbm9kZCIvPgogIDxwYXRoIGlkPSJzdmdfNSIgZmlsbD0iIzAwMjhGRCIgZD0ibTIzLjIyOCwxMy44ODFsMS4xNC
wwbDAsMS4xNDFsLTEuMTQsMGwwLC0xLjE0bDAsLTAuMDAxem0tMi44MTIsLTAuNjkybDEuODM0LDBsMCwxLjgzM2wtMS44MzQsMGwwLC0xLjgzMmwwLC0wLjAwMXptMS44MzQsLTAuOTc5bDAuOTc4LDBsMCwwLjk3OWwtMC45NzgsMGwwLC0wLjk3OGwwLC0wLjAwMXptMS41NDgsLTEuNjI5bDAuNjExLDBsMCwwLjYxMWwtMC42MTEsMGwwLC0wLjYxMXoiLz4KICA8cGF0aCBpZD0ic3ZnXzYiIGZpbGw9IiNmZmYiIGQ9Im0yMC4wODYsMTQuOTEybDAsMS43MTFhMi40MDQsMi40MDQgMCAxIDEgLTQuODA3LDBsMCwtNC4yNzhsLTIuODEyLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDAgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEsMGwtMC4wMDEsMHoiIGNsaXAtcnVsZT0iZXZlbm9kZCIgZmlsbC1ydWxlPSJldmVub2RkIi8+CiAgPHBhdGggaWQ9InN2Z183IiBmaWxsPSIjZmZmIiBkPSJtMjIuODk4LDEzLjc3MWwxLjE0LDBsMCwxLjE0MWwtMS4xNCwwbDAsLTEuMTRsMCwtMC4wMDF6bS0yLjgxMiwtMC42OTJsMS44MzQsMGwwLDEuODMzbC0xLjgzNCwwbDAsLTEuODMybDAsLTAuMDAxem0xLjgzNCwtMC45NzlsMC45NzgsMGwwLDAuOTc5bC0wLjk3OCwwbDAsLTAuOTc5em0xLjU0OCwtMS42MjlsMC42MTEsMGwwLDAuNjExbC0wLjYxLDBsMCwtMC42MWwtMC4wMDEsLTAuMDAxeiIvPgogPC9nPgo8L3N2Zz4=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
 [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValz
jI7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
 [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
-[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
 [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](https://arxiv.org/abs/2409.18839)


@@ -42,6 +42,7 @@
 </div>

 # Changelog
+- 2024/11/06 0.9.1 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
 - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
  - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
  - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.

--- a/README_ja-JP.md
+++ b/README_ja-JP.md
@@ -18,6 +18,9 @@
 <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 200px; height: 55px;"/></a>


+<div align="center" style="color: red; background-color: #ffdddd; padding: 10px; border: 1px solid red; border-radius: 5px;">
+  <strong>NOTE：</strong> このドキュメントはすでに古くなっています。最新版のドキュメントを参照してください。
+</div>


 [English](README.md) | [简体中文](README_zh-CN.md) | [日本語](README_ja-JP.md)

--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -17,7 +17,7 @@
 [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMzAiIGhlaWdodD0iMzAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgZmlsbD0ibm9uZSI+CiA8ZGVmcz4KICA8bGluZWFyR3JhZGllbnQgeTI9IjAuNTMzNjciIHgyPSIxLjAwMDQiIHkxPSIwLjI5MjE5IiB4MT0iLTAuMTEyNjgiIGlkPSJhIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogIDxsaW5lYXJHcmFkaWVudCB5Mj0iMC41OTc1NyIgeDI9IjEuMDExMzciIHkxPSIwLjExMDIzIiB4MT0iLTAuMDg0NzQiIGlkPSJiIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogPC9kZWZzPgogPGc+CiAgPHRpdGxlPkxheWVyIDE8L3RpdGxlPgogIDxwYXRoIGlkPSJzdmdfMSIgZmlsbD0idXJsKCNhKSIgZD0ibTEuNjIzLDEyLjA2N2EwLjQ4NCwwLjQ4NCAwIDAgMSAwLjA3LC0wLjM4NGw1LjMxLC03Ljg5NWMwLjA2OCwtMC4xIDAuMTcsLTAuMTcyIDAuMjg4LC0wLjJsMTQuMzc3LC0zLjQ3NGEwLjQ4NCwwLjQ4NCAwIDAgMSAwLjU4NCwwLjM1N2wzLjY2MiwxNS4xNTJjMS40NzcsNi4xMTQgLTIuMjgxLDEyLjI2NyAtOC4zOTQsMTMuNzQ1Yy02LjExNCwxLjQ3NyAtMTIuMjY3LC0yLjI4MSAtMTMuNzQ1LC04LjM5NWwtMi4xNTIsLTguOTA2eiIgb3BhY2l0eT0iMC40Ii8+CiAgPHBhdGggaWQ9InN2Z18yIiBmaWxsPSJ1cmwoI2IpIiBkPSJtNS44MjYsOC42NzNjMCwtMC4xMzYgMC4wNTcsLTAuMjY2IDAuMTU3LC0wLjM1OGw3LjAxNywtNi40MjVhMC40ODQsMC40ODQgMCAwIDEgMC4zMjcsLTAuMTI3bDE0Ljc5LDBjMC4yNjgsMCAwLjQ4NSwwLjIxNiAwLjQ4NSwwLjQ4NGwwLDE1LjU4OWMwLDYuMjkgLTUuMDk5LDExLjM4OCAtMTEuMzg4LDExLjM4OGMtNi4yOSwwIC0xMS4zODgsLTUuMDk5IC0xMS4zODgsLTExLjM4OGwwLC05LjE2M3oiLz4KICA8cGF0aCBpZD0ic3ZnXzMiIGZpbGw9IiM1RDc2RkYiIGQ9Im0xMi4zMzEsOC43NTNsLTYuMzgzLC0wLjM5OGw3LjEyMiwtNi41MmwwLjI5OSw1Ljg5MmEwLjk3OCwwLjk3OCAwIDAgMSAtMS4wMzgsMS4wMjZ6Ii8+CiAgPHBhdGggaWQ9InN2Z180IiBmaWxsPSIjMDAyOEZEIiBkPSJtMjAuNDE2LDE1LjAyMmwwLDEuNzExYTIuNDA0LDIuNDA0IDAgMCAxIC00LjgwOCwwbDAsLTQuMjc4bC0yLjgxLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDEgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEyLDB6IiBjbGlwLXJ1bGU9ImV2ZW5vZGQiIGZpbGwtcnVsZT0iZXZlbm9kZCIvPgogIDxwYXRoIGlkPSJzdmdfNSIgZmlsbD0iIzAwMjhGRCIgZD0ibTIzLjIyOCwxMy44ODFsMS4xNC
wwbDAsMS4xNDFsLTEuMTQsMGwwLC0xLjE0bDAsLTAuMDAxem0tMi44MTIsLTAuNjkybDEuODM0LDBsMCwxLjgzM2wtMS44MzQsMGwwLC0xLjgzMmwwLC0wLjAwMXptMS44MzQsLTAuOTc5bDAuOTc4LDBsMCwwLjk3OWwtMC45NzgsMGwwLC0wLjk3OGwwLC0wLjAwMXptMS41NDgsLTEuNjI5bDAuNjExLDBsMCwwLjYxMWwtMC42MTEsMGwwLC0wLjYxMXoiLz4KICA8cGF0aCBpZD0ic3ZnXzYiIGZpbGw9IiNmZmYiIGQ9Im0yMC4wODYsMTQuOTEybDAsMS43MTFhMi40MDQsMi40MDQgMCAxIDEgLTQuODA3LDBsMCwtNC4yNzhsLTIuODEyLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDAgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEsMGwtMC4wMDEsMHoiIGNsaXAtcnVsZT0iZXZlbm9kZCIgZmlsbC1ydWxlPSJldmVub2RkIi8+CiAgPHBhdGggaWQ9InN2Z183IiBmaWxsPSIjZmZmIiBkPSJtMjIuODk4LDEzLjc3MWwxLjE0LDBsMCwxLjE0MWwtMS4xNCwwbDAsLTEuMTRsMCwtMC4wMDF6bS0yLjgxMiwtMC42OTJsMS44MzQsMGwwLDEuODMzbC0xLjgzNCwwbDAsLTEuODMybDAsLTAuMDAxem0xLjgzNCwtMC45NzlsMC45NzgsMGwwLDAuOTc5bC0wLjk3OCwwbDAsLTAuOTc5em0xLjU0OCwtMS42MjlsMC42MTEsMGwwLDAuNjExbC0wLjYxLDBsMCwtMC42MWwtMC4wMDEsLTAuMDAxeiIvPgogPC9nPgo8L3N2Zz4=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
 [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
 [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValz
jI7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
-[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
 [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](https://arxiv.org/abs/2409.18839)


@@ -43,6 +43,7 @@

 # 更新记录

+- 2024/11/06 0.9.1发布，为表格识别功能接入了[StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B)模型
 - 2024/10/31 0.9.0发布，这是我们进行了大量代码重构的全新版本，解决了众多问题，提升了性能，降低了硬件需求，并提供了更丰富的易用性：
  - 重构排序模块代码，使用 [layoutreader](https://github.com/ppaanngggg/layoutreader) 进行阅读顺序排序，确保在各种排版下都能实现极高准确率
  - 重构段落拼接模块，在跨栏、跨页、跨图、跨表情况下均能实现良好的段落拼接效果

--- a/docs/FAQ_en_us.md
+++ b/docs/FAQ_en_us.md
@@ -57,3 +57,10 @@ pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/package
 ```

 Reference: https://github.com/opendatalab/MinerU/issues/558
+
+### 7. On some Linux servers, the program immediately reports an error `Illegal instruction (core dumped)`
+
+This might be because the server's CPU does not support the AVX/AVX2 instruction set, or because the CPU supports it but the instructions have been disabled by the system administrator. You can try contacting the system administrator to remove the restriction, or switch to a different server.
+
+References: https://github.com/opendatalab/MinerU/issues/591 , https://github.com/opendatalab/MinerU/issues/736
+
--- a/docs/FAQ_zh_cn.md
+++ b/docs/FAQ_zh_cn.md
@@ -59,3 +59,9 @@ pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/package
 ```

 参考：https://github.com/opendatalab/MinerU/issues/558
+
+### 7.在部分Linux服务器上，程序一运行就报错 `非法指令 (核心已转储)` 或 `Illegal instruction (core dumped)`
+
+可能是因为服务器CPU不支持AVX/AVX2指令集，或cpu本身支持但被运维禁用了，可以尝试联系运维解除限制或更换服务器。
+
+参考：https://github.com/opendatalab/MinerU/issues/591 ， https://github.com/opendatalab/MinerU/issues/736
\ No newline at end of file
--- a/docs/README_Ubuntu_CUDA_Acceleration_en_US.md
+++ b/docs/README_Ubuntu_CUDA_Acceleration_en_US.md
@@ -93,7 +93,7 @@ Download a sample file from the repository and test it.

 ```sh
 wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
-magic-pdf -p small_ocr.pdf
+magic-pdf -p small_ocr.pdf -o ./output
 ```

 ### 9. Test CUDA Acceleration
@@ -108,7 +108,7 @@ If your graphics card has at least **8GB** of VRAM, follow these steps to test C
   ```
 2. Test CUDA acceleration with the following command:
   ```sh
-   magic-pdf -p small_ocr.pdf
+   magic-pdf -p small_ocr.pdf -o ./output
   ```

 ### 10. Enable CUDA Acceleration for OCR
@@ -119,5 +119,5 @@ If your graphics card has at least **8GB** of VRAM, follow these steps to test C
   ```
 2. Test OCR acceleration with the following command:
   ```sh
-   magic-pdf -p small_ocr.pdf
+   magic-pdf -p small_ocr.pdf -o ./output
   ```
--- a/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md
+++ b/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md
@@ -11,7 +11,6 @@ nvidia-smi
 注意:`CUDA Version` 显示的版本号应 >= 12.1，如显示的版本号小于12.1，请升级驱动

 ```plaintext
-```
 +---------------------------------------------------------------------------------------+
 | NVIDIA-SMI 537.34                 Driver Version: 537.34       CUDA Version: 12.2     |
 |-----------------------------------------+----------------------+----------------------+
@@ -93,7 +92,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h

 ```bash
 wget https://gitee.com/myhloli/MinerU/raw/master/demo/small_ocr.pdf
-magic-pdf -p small_ocr.pdf
+magic-pdf -p small_ocr.pdf -o ./output
 ```

 ## 9. 测试CUDA加速
@@ -111,7 +110,7 @@ magic-pdf -p small_ocr.pdf
 **2.运行以下命令测试cuda加速效果**

 ```bash
-magic-pdf -p small_ocr.pdf
+magic-pdf -p small_ocr.pdf -o ./output
 ```

 > 提示：CUDA加速是否生效可以根据log中输出的各个阶段cost耗时来简单判断，通常情况下，`layout detection cost` 和 `mfr time` 应提速10倍以上。
@@ -127,7 +126,7 @@ python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.
 **2.运行以下命令测试ocr加速效果**

 ```bash
-magic-pdf -p small_ocr.pdf
+magic-pdf -p small_ocr.pdf -o ./output
 ```

 > 提示：CUDA加速是否生效可以根据log中输出的各个阶段cost耗时来简单判断，通常情况下，`ocr cost`应提速10倍以上。
--- a/docs/README_Windows_CUDA_Acceleration_en_US.md
+++ b/docs/README_Windows_CUDA_Acceleration_en_US.md
@@ -53,7 +53,7 @@ Download a sample file from the repository and test it.

 ```powershell
  wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
-  magic-pdf -p small_ocr.pdf
+  magic-pdf -p small_ocr.pdf -o ./output
 ```

 ### 8. Test CUDA Acceleration
@@ -86,7 +86,7 @@ If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA-
 3. **Run the following command to test CUDA acceleration**:

   ```
-   magic-pdf -p small_ocr.pdf
+   magic-pdf -p small_ocr.pdf -o ./output
   ```

 ### 9. Enable CUDA Acceleration for OCR
@@ -97,5 +97,5 @@ If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA-
   ```
 2. **Run the following command to test OCR acceleration**:
   ```
-   magic-pdf -p small_ocr.pdf
+   magic-pdf -p small_ocr.pdf -o ./output
   ```
--- a/docs/README_Windows_CUDA_Acceleration_zh_CN.md
+++ b/docs/README_Windows_CUDA_Acceleration_zh_CN.md
@@ -55,7 +55,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h

 ```powershell
 wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
- magic-pdf -p small_ocr.pdf
+ magic-pdf -p small_ocr.pdf -o ./output
 ```

 ## 8. 测试CUDA加速
@@ -87,7 +87,7 @@ pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https
 **3.运行以下命令测试cuda加速效果**

 ```bash
-magic-pdf -p small_ocr.pdf
+magic-pdf -p small_ocr.pdf -o ./output
 ```

 > 提示：CUDA加速是否生效可以根据log中输出的各个阶段的耗时来简单判断，通常情况下，`layout detection time` 和 `mfr time` 应提速10倍以上。
@@ -103,7 +103,7 @@ pip install paddlepaddle-gpu==2.6.1
 **2.运行以下命令测试ocr加速效果**

 ```bash
-magic-pdf -p small_ocr.pdf
+magic-pdf -p small_ocr.pdf -o ./output
 ```

 > 提示：CUDA加速是否生效可以根据log中输出的各个阶段cost耗时来简单判断，通常情况下，`ocr time`应提速10倍以上。
--- a/magic_pdf/data/data_reader_writer/multi_bucket_s3.py
+++ b/magic_pdf/data/data_reader_writer/multi_bucket_s3.py
+import os
 from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
 from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
 from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,


 class MultiS3Mixin:
-    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
+    def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
        """Initialized with multiple s3 configs.

        Args:
-            default_bucket (str): the default bucket name of the relative path
+            default_prefix (str): the default prefix of the relative path, for example {some_bucket}/{some_prefix} or {some_bucket}.
            s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.

        Raises:
-            InvalidConfig: default bucket config not in s3_configs
-            InvalidConfig: bucket name not unique in s3_configs
-            InvalidConfig: default bucket must be provided
+            InvalidConfig: default bucket config not in s3_configs.
+            InvalidConfig: bucket name not unique in s3_configs.
+            InvalidConfig: default prefix must be provided.
        """
-        if len(default_bucket) == 0:
-            raise InvalidConfig('default_bucket must be provided')
+        if len(default_prefix) == 0:
+            raise InvalidConfig('default_prefix must be provided')
+    
+        arr = default_prefix.strip("/").split("/")
+        self.default_bucket = arr[0]
+        self.default_prefix = "/".join(arr[1:])

        found_default_bucket_config = False
        for conf in s3_configs:
-            if conf.bucket_name == default_bucket:
+            if conf.bucket_name == self.default_bucket:
                found_default_bucket_config = True
                break

        if not found_default_bucket_config:
            raise InvalidConfig(
-                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
+                f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
            )

        uniq_bucket = set([conf.bucket_name for conf in s3_configs])
@@ -39,7 +44,6 @@ class MultiS3Mixin:
                f'the bucket_name in s3_configs: {s3_configs} must be unique'
            )

-        self.default_bucket = default_bucket
        self.s3_configs = s3_configs
        self._s3_clients_h: dict = {}

@@ -47,14 +51,14 @@ class MultiS3Mixin:
 class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
    def read(self, path: str) -> bytes:
        """Read the path from s3, select diffect bucket client for each request
-        based on the path, also support range read.
+        based on the bucket, also support range read.

        Args:
-            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
-            for example: s3://bucket_name/path?0,100
+            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
+            for example: s3://bucket_name/path?0,100.

        Returns:
-            bytes: the content of s3 file
+            bytes: the content of s3 file.
        """
        may_range_params = parse_s3_range_params(path)
        if may_range_params is None or 2 != len(may_range_params):
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read the file with offset and limit, select diffect bucket client
-        for each request based on the path.
+        for each request based on the bucket.

        Args:
-            path (str): the file path
+            path (str): the file path.
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.

        Returns:
-            bytes: the file content
+            bytes: the file content.
        """
        if path.startswith('s3://'):
            bucket_name, path = parse_s3path(path)
            s3_reader = self.__get_s3_client(bucket_name)
        else:
            s3_reader = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
        return s3_reader.read_at(path, offset, limit)


@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):

    def write(self, path: str, data: bytes) -> None:
        """Write file with data, also select diffect bucket client for each
-        request based on the path.
+        request based on the bucket.

        Args:
            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
-            data (bytes): the data want to write
+            data (bytes): the data want to write.
        """
        if path.startswith('s3://'):
            bucket_name, path = parse_s3path(path)
            s3_writer = self.__get_s3_client(bucket_name)
        else:
            s3_writer = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
        return s3_writer.write(path, data)
--- a/magic_pdf/data/data_reader_writer/s3.py
+++ b/magic_pdf/data/data_reader_writer/s3.py
@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
 class S3DataReader(MultiBucketS3DataReader):
    def __init__(
        self,
+        default_prefix_without_bucket: str,
        bucket: str,
        ak: str,
        sk: str,
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
        """s3 reader client.

        Args:
+            default_prefix_without_bucket (str): the prefix, not including the bucket name
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
            [
                S3Config(
                    bucket_name=bucket,
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
 class S3DataWriter(MultiBucketS3DataWriter):
    def __init__(
        self,
+        default_prefix_without_bucket: str,
        bucket: str,
        ak: str,
        sk: str,
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
        """s3 writer client.

        Args:
+            default_prefix_without_bucket (str): the prefix, not including the bucket name
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
            [
                S3Config(
                    bucket_name=bucket,

--- a/magic_pdf/data/io/__init__.py
+++ b/magic_pdf/data/io/__init__.py
+
+from magic_pdf.data.io.base import IOReader, IOWriter  # noqa: F401
+from magic_pdf.data.io.http import HttpReader, HttpWriter  # noqa: F401
+from magic_pdf.data.io.s3 import S3Reader, S3Writer  # noqa: F401
+
+__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
--- a/magic_pdf/data/io/base.py
+++ b/magic_pdf/data/io/base.py
@@ -29,7 +29,7 @@ class IOReader(ABC):
        pass


-class IOWriter:
+class IOWriter(ABC):

    @abstractmethod
    def write(self, path: str, data: bytes) -> None:

--- a/magic_pdf/data/schemas.py
+++ b/magic_pdf/data/schemas.py
@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field


 class S3Config(BaseModel):
+    """S3 config
+    """
    bucket_name: str = Field(description='s3 bucket name', min_length=1)
    access_key: str = Field(description='s3 access key', min_length=1)
    secret_key: str = Field(description='s3 secret key', min_length=1)
@@ -11,5 +13,7 @@ class S3Config(BaseModel):


 class PageInfo(BaseModel):
+    """The width and height of page
+    """
    w: float = Field(description='the width of page')
    h: float = Field(description='the height of page')
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -119,6 +119,16 @@ def detect_language(text):
        return 'empty'


+# Expand typographic ligatures into their component letters
+def __replace_ligatures(text: str):
+    """Return ``text`` with Latin f-ligatures expanded to plain letters.
+
+    Handles U+FB00 (ff), U+FB01 (fi), U+FB02 (fl), U+FB03 (ffi) and
+    U+FB04 (ffl); every other character is returned unchanged.
+
+    Args:
+        text: the string to normalise.
+
+    Returns:
+        str: ``text`` with each listed ligature replaced by its letters.
+    """
+    # The replacements are literal single characters, so one str.translate
+    # pass does the work of five regex substitutions. The code points are
+    # written as escapes so editors cannot silently normalise the glyphs.
+    ligatures = {
+        '\ufb01': 'fi',
+        '\ufb02': 'fl',
+        '\ufb00': 'ff',
+        '\ufb03': 'ffi',
+        '\ufb04': 'ffl',
+    }
+    return text.translate(str.maketrans(ligatures))
+
+
 def merge_para_with_text(para_block):
    para_text = ''
    for i, line in enumerate(para_block['lines']):
@@ -141,22 +151,34 @@ def merge_para_with_text(para_block):
            if span_type == ContentType.Text:
                content = ocr_escape_special_markdown_char(span['content'])
            elif span_type == ContentType.InlineEquation:
-                content = f" ${span['content']}$ "
+                content = f"${span['content']}$"
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

+            content = content.strip()
            if content != '':
                langs = ['zh', 'ja', 'ko']
                if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
+                    if span_type in [ContentType.Text, ContentType.InterlineEquation]:
                        para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
-                elif line_lang == 'en':
+                    elif span_type == ContentType.InlineEquation:
+                        para_text += f" {content} "
+                else:
+                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
                        # 如果是前一行带有-连字符，那么末尾不应该加空格
                        if __is_hyphen_at_line_end(content):
                            para_text += content[:-1]
+                        elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
+                            para_text += content
+                        else:  # 西方文本语境下 content间需要空格分隔
+                            para_text += f"{content} "
+                    elif span_type == ContentType.InterlineEquation:
+                        para_text += content
            else:
-                        para_text += content + ' '
-                else:
-                    para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
+                continue
+    # 连写字符拆分
+    para_text = __replace_ligatures(para_text)
+
    return para_text



--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -38,15 +38,13 @@ except ImportError as e:
 from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
 from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
-# from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
+from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
 from magic_pdf.model.ppTableModel import ppTableModel


 def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
    if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
-        # table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
-        logger.error("StructEqTable is under upgrade, the current version does not support it.")
-        exit(1)
+        table_model = StructTableModel(model_path, max_time=max_time)
    elif table_model_type == MODEL_NAME.TABLE_MASTER:
        config = {
            "model_dir": model_path,
@@ -463,7 +461,9 @@ class CustomPEKModel:
                html_code = None
                if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
                    with torch.no_grad():
-                        latex_code = self.table_model.image2latex(new_image)[0]
+                        table_result = self.table_model.predict(new_image, "html")
+                        if len(table_result) > 0:
+                            html_code = table_result[0]
                else:
                    html_code = self.table_model.img2html(new_image)

@@ -474,14 +474,17 @@ class CustomPEKModel:
                # 判断是否返回正常

                if latex_code:
-                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
-                        'end{table}')
+                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
                    if expected_ending:
                        res["latex"] = latex_code
                    else:
                        logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
                elif html_code:
+                    expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
+                    if expected_ending:
                        res["html"] = html_code
+                    else:
+                        logger.warning(f"table recognition processing fails, not found expected HTML table end")
                else:
                    logger.warning(f"table recognition processing fails, not get latex or html return")
            logger.info(f"table time: {round(time.time() - table_start, 2)}")

--- a/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py
+++ b/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py
-from loguru import logger
+import re

-try:
-    from struct_eqtable.model import StructTable
-except ImportError:
-    logger.error("StructEqTable is under upgrade, the current version does not support it.")
-from pypandoc import convert_text
+import torch
+from struct_eqtable import build_model


 class StructTableModel:
+    # Thin wrapper around the table-recognition model returned by
+    # struct_eqtable.build_model: __init__ loads the checkpoint onto CUDA and
+    # predict() runs it, minifying the output when HTML is requested.
-    def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
+    def __init__(self, model_path, max_new_tokens=1024, max_time=60):
+        # model_path is handed to build_model as model_ckpt; max_new_tokens and
+        # max_time are forwarded unchanged (presumably the generation token cap
+        # and a per-call timeout in seconds -- TODO confirm against struct_eqtable).
        # init
-        self.model_path = model_path
-        self.max_new_tokens = max_new_tokens # maximum output tokens length
-        self.max_time = max_time # timeout for processing in seconds
-        if device == 'cuda':
-            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
+        # Fail fast without a GPU: the model is moved to CUDA unconditionally below.
+        # NOTE(review): assert statements are skipped under ``python -O``.
+        assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
+        self.model = build_model(
+            model_ckpt=model_path,
+            max_new_tokens=max_new_tokens,
+            max_time=max_time,
+            lmdeploy=False,
+            flash_attn=False,
+            batch_size=1,
+        ).cuda()
+        # Format used by predict() when the caller does not pass output_format
+        self.default_format = "html"
+
+    def predict(self, images, output_format=None, **kwargs):
+        # Run the model on ``images`` and return its results.
+        # output_format: None (falls back to self.default_format) or one of
+        # 'latex', 'markdown', 'html'; any other value raises ValueError.
+        # **kwargs is accepted but not used anywhere in this method.
+
+        if output_format is None:
+            output_format = self.default_format
        else:
-            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
+            if output_format not in ['latex', 'markdown', 'html']:
+                raise ValueError(f"Output format {output_format} is not supported.")
+
+        results = self.model(
+            images, output_format=output_format
+        )
+
+        # HTML results are iterated and minified one by one; other formats are
+        # returned exactly as the model produced them.
+        if output_format == "html":
+            results = [self.minify_html(html) for html in results]

-    def image2latex(self, image) -> str:
-        table_latex = self.model.forward(image)
-        return table_latex
+        return results

-    def image2html(self, image) -> str:
-        table_latex = self.image2latex(image)
-        table_html = convert_text(table_latex, 'html', format='latex')
-        return table_html
+    def minify_html(self, html):
+        # Return ``html`` with insignificant whitespace removed.
+        # Collapse every run of whitespace into a single space
+        html = re.sub(r'\s+', ' ', html)
+        # Drop whitespace on either side of '>'
+        html = re.sub(r'\s*>\s*', '>', html)
+        # Drop whitespace on either side of '<'
+        html = re.sub(r'\s*<\s*', '<', html)
+        return html.strip()
\ No newline at end of file
--- a/magic_pdf/model/ppTableModel.py
+++ b/magic_pdf/model/ppTableModel.py
+import cv2
 from paddleocr.ppstructure.table.predict_table import TableSystem
 from paddleocr.ppstructure.utility import init_args
 from magic_pdf.libs.Constants import *
@@ -36,12 +37,13 @@ class ppTableModel(object):
        - HTML (str): A string representing the HTML structure with content of the table.
        """
        if isinstance(image, Image.Image):
-            image = np.array(image)
+            image = np.asarray(image)
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        pred_res, _ = self.table_sys(image)
        pred_html = pred_res["html"]
-        res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
-                                                                                               "") + "</table></td>\n"
-        return res
+        # res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace(
+        # "</table></body></html>","") + "</table></td>\n"
+        return pred_html

    def parse_args(self, **kwargs):
        parser = init_args()

--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
@@ -63,15 +63,18 @@ def __is_list_or_index_block(block):
        first_line = block['lines'][0]
        line_height = first_line['bbox'][3] - first_line['bbox'][1]
        block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
+        block_height = block['bbox_fs'][3] - block['bbox_fs'][1]

        left_close_num = 0
        left_not_close_num = 0
        right_not_close_num = 0
        right_close_num = 0
        lines_text_list = []
-
+        center_close_num = 0
+        external_sides_not_close_num = 0
        multiple_para_flag = False
        last_line = block['lines'][-1]
+
        # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 （第一行可能可以右边不顶格）
        if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
                # block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
@@ -82,6 +85,16 @@ def __is_list_or_index_block(block):

        for line in block['lines']:

+            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
+            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
+            if (
+                    line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
+                    block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
+            ):
+                external_sides_not_close_num += 1
+            if abs(line_mid_x - block_mid_x) < line_height / 2:
+                center_close_num += 1
+
            line_text = ""

            for span in line['spans']:
@@ -103,7 +116,7 @@ def __is_list_or_index_block(block):
                right_close_num += 1
            else:
                # 右侧不顶格情况下是否有一段距离，拍脑袋用0.3block宽度做阈值
-                closed_area = 0.3 * block_weight
+                closed_area = 0.26 * block_weight
                # closed_area = 5 * line_height
                if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
                    right_not_close_num += 1
@@ -132,17 +145,29 @@ def __is_list_or_index_block(block):
                line_num_flag = True

        # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边，且符合数字规则极为index
-        if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8)
+        if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
                and line_num_flag
        ):
            for line in block['lines']:
                line[ListLineTag.IS_LIST_START_LINE] = True
            return BlockType.Index

+        # 全部line都居中的特殊list识别，每行都需要换行，特征是多行，且大多数行都前后not_close,每line中点x坐标接近
+        # 补充条件block的长宽比有要求
+        elif (
+                external_sides_not_close_num >= 2 and
+                center_close_num == len(block['lines']) and
+                external_sides_not_close_num / len(block['lines']) >= 0.5 and
+                block_height / block_weight > 0.4
+        ):
+            for line in block['lines']:
+                line[ListLineTag.IS_LIST_START_LINE] = True
+            return BlockType.List
+
        elif left_close_num >= 2 and (
                right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
            # 处理一种特殊的没有缩进的list，所有行都贴左边，通过右边的空隙判断是否是item尾
-            if left_close_num / len(block['lines']) > 0.9:
+            if left_close_num / len(block['lines']) > 0.8:
                # 这种是每个item只有一行，且左边都贴边的短item list
                if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
                    for line in block['lines']:
@@ -154,7 +179,7 @@ def __is_list_or_index_block(block):
                        if lines_text_list[i][-1] in LIST_END_FLAG:
                            line[ListLineTag.IS_LIST_END_LINE] = True
                            if i + 1 < len(block['lines']):
-                                block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True
+                                block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
                # line item基本没有结束标识符，而且也没有缩进，按右侧空隙判断哪些是item end
                else:
                    line_start_flag = False
@@ -162,7 +187,8 @@ def __is_list_or_index_block(block):
                        if line_start_flag:
                            line[ListLineTag.IS_LIST_START_LINE] = True
                            line_start_flag = False
-                        elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
+                        # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
+                        if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
                            line[ListLineTag.IS_LIST_END_LINE] = True
                            line_start_flag = True
            # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头，end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致

--- a/next_docs/en/.readthedocs.yaml
+++ b/next_docs/en/.readthedocs.yaml
@@ -10,7 +10,7 @@ formats:

 python:
  install:
-    - requirements: docs/requirements.txt
+    - requirements: next_docs/requirements.txt

 sphinx:
-  configuration: docs/en/conf.py
+  configuration: next_docs/en/conf.py
--- a/next_docs/en/_static/image/MinerU-logo-hq.png
+++ b/next_docs/en/_static/image/MinerU-logo-hq.png
--- a/next_docs/en/_static/image/MinerU-logo.png
+++ b/next_docs/en/_static/image/MinerU-logo.png
--- a/next_docs/en/_static/image/datalab_logo.png
+++ b/next_docs/en/_static/image/datalab_logo.png
--- a/next_docs/en/_static/image/flowchart_en.png
+++ b/next_docs/en/_static/image/flowchart_en.png
--- a/next_docs/en/_static/image/flowchart_zh_cn.png
+++ b/next_docs/en/_static/image/flowchart_zh_cn.png
--- a/next_docs/en/_static/image/layout_example.png
+++ b/next_docs/en/_static/image/layout_example.png
--- a/next_docs/en/_static/image/poly.png
+++ b/next_docs/en/_static/image/poly.png
--- a/next_docs/en/_static/image/project_panorama_en.png
+++ b/next_docs/en/_static/image/project_panorama_en.png
--- a/next_docs/en/_static/image/project_panorama_zh_cn.png
+++ b/next_docs/en/_static/image/project_panorama_zh_cn.png
--- a/next_docs/en/_static/image/spans_example.png
+++ b/next_docs/en/_static/image/spans_example.png
--- a/next_docs/en/_static/image/web_demo_1.png
+++ b/next_docs/en/_static/image/web_demo_1.png
--- a/next_docs/en/additional_notes/changelog.rst
+++ b/next_docs/en/additional_notes/changelog.rst
+
+
+Changelog
+=========
+
+-  2024/09/27: Version 0.8.1 released, fixing some bugs and providing a
+   `localized deployment version <projects/web_demo/README.md>`__ of the
+   `online
+   demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
+   the `front-end interface <projects/web/README.md>`__.
+-  2024/09/09: Version 0.8.0 released, supporting fast deployment with
+   Dockerfile, and launching demos on Huggingface and Modelscope.
+-  2024/08/30: Version 0.7.1 released, adding the Paddle TableMaster
+   table recognition option
+-  2024/08/09: Version 0.7.0b1 released, simplified installation
+   process, added table recognition functionality
+-  2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
+   issues and installation documentation
+-  2024/07/05: Initial open-source release
+
+
+.. warning::
+
+   fix ``localized deployment version`` and ``front-end interface``
+
+
--- a/next_docs/en/additional_notes/faq.rst
+++ b/next_docs/en/additional_notes/faq.rst
+FAQ
+==========================
+
+1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+On macOS, the default shell has switched from Bash to Z shell, which has
+special handling logic for certain types of string matching. This can
+lead to the “no matches found” error. You can turn off zsh's ``nomatch``
+check in the command line and then run the installation command again.
+
+.. code:: bash
+
+   setopt no_nomatch
+   pip install magic-pdf[full]
+
+2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This might be due to an incomplete download of the model file. You can
+try re-downloading the model file and then try again. Reference:
+https://github.com/opendatalab/MinerU/issues/143
+
+3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The path for the model files is configured in “magic-pdf.json”, for
+example:
+
+.. code:: json
+
+   {
+     "models-dir": "/tmp/models"
+   }
+
+This path must be an absolute path, not a relative path. You can obtain
+it by running the “pwd” command inside the models directory.
+Reference:
+https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
+
+4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
+install the ``libgl`` library with the following command to resolve the
+issue:
+
+.. code:: bash
+
+   sudo apt-get install libgl1-mesa-glx
+
+Reference: https://github.com/opendatalab/MinerU/issues/388
+
+5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You need to uninstall the module and reinstall it:
+
+.. code:: bash
+
+   pip uninstall fairscale
+   pip install fairscale
+
+Reference: https://github.com/opendatalab/MinerU/issues/411
+
+6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+CUDA 11 has poor compatibility with newer graphics cards, so the CUDA
+version used by Paddle needs to be upgraded.
+
+.. code:: bash
+
+   pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
+
+Reference: https://github.com/opendatalab/MinerU/issues/558
--- a/next_docs/en/additional_notes/glossary.rst
+++ b/next_docs/en/additional_notes/glossary.rst
+
+
+Glossary 
+===========
+
+1. jsonl 
+    TODO: add description
+
+2. magic-pdf.json
+    TODO: add description
+
--- a/next_docs/en/additional_notes/known_issues.rst
+++ b/next_docs/en/additional_notes/known_issues.rst
+Known Issues
+============
+
+-  Reading order is based on the model’s sorting of text distribution in
+   space, which may become disordered under extremely complex layouts.
+-  Vertical text is not supported.
+-  Tables of contents and lists are recognized through rules; a few
+   uncommon list formats may not be identified.
+-  Only one level of headings is supported; hierarchical heading levels
+   are currently not supported.
+-  Code blocks are not yet supported in the layout model.
+-  Comic books, art books, elementary school textbooks, and exercise
+   books are not well-parsed yet.
+-  Enabling OCR may produce better results in PDFs with a high density
+   of formulas.
+-  If you are processing PDFs with a large number of formulas, it is
+   strongly recommended to enable the OCR function. When using PyMuPDF
+   to extract text, overlapping text lines can occur, leading to
+   inaccurate formula insertion positions.
--- a/next_docs/en/api.rst
+++ b/next_docs/en/api.rst
-Data Api
------------------

 .. toctree::
   :maxdepth: 2

-   api/dataset.rst
-   api/data_reader_writer.rst
-   api/read_api.rst
+   api/dataset
+   api/data_reader_writer
+   api/read_api
+   api/schemas
+   api/io
+   api/classes
\ No newline at end of file
--- a/next_docs/en/api/classes.rst
+++ b/next_docs/en/api/classes.rst
+Class Hierarchy
+===============
+
+.. inheritance-diagram:: magic_pdf.data.io.base magic_pdf.data.io.http magic_pdf.data.io.s3
+   :parts: 2
+
+
+.. inheritance-diagram:: magic_pdf.data.dataset
+   :parts: 2
+
+
+.. inheritance-diagram:: magic_pdf.data.data_reader_writer.base magic_pdf.data.data_reader_writer.filebase magic_pdf.data.data_reader_writer.multi_bucket_s3
+   :parts: 2
+
--- a/next_docs/en/api/data_reader_writer.rst
+++ b/next_docs/en/api/data_reader_writer.rst

 Data Reader Writer
--------------------
+===================

 .. autoclass:: magic_pdf.data.data_reader_writer.DataReader
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
   :members:
   :inherited-members:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
-   :members:
-   :inherited-members:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
-   :members:
-   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
   :members:
   :inherited-members:
+   :show-inheritance:

--- a/next_docs/en/api/dataset.rst
+++ b/next_docs/en/api/dataset.rst
-Dataset Api
------------------
+Dataset
+========

 .. autoclass:: magic_pdf.data.dataset.PageableData
   :members:
   :inherited-members:
+   :show-inheritance:
+

 .. autoclass:: magic_pdf.data.dataset.Dataset
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.dataset.ImageDataset
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.dataset.PymuDocDataset
   :members:
   :inherited-members:
+   :show-inheritance:

 .. autoclass:: magic_pdf.data.dataset.Doc
   :members:
   :inherited-members:
+   :show-inheritance:
--- a/next_docs/en/api/io.rst
+++ b/next_docs/en/api/io.rst
+IO
+==
+
+.. autoclass:: magic_pdf.data.io.base.IOReader
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.base.IOWriter
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.s3.S3Reader
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.s3.S3Writer
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.http.HttpReader
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.http.HttpWriter
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
--- a/next_docs/en/api/read_api.rst
+++ b/next_docs/en/api/read_api.rst
-read_api Api
------------------
+read_api
+=========

 .. automodule:: magic_pdf.data.read_api
   :members:

--- a/next_docs/en/api/schemas.rst
+++ b/next_docs/en/api/schemas.rst
+
+schemas 
+===========
+
+.. autopydantic_model:: magic_pdf.data.schemas.S3Config
+   :members:
+
+.. autopydantic_model:: magic_pdf.data.schemas.PageInfo
+   :members:
+
--- a/next_docs/en/conf.py
+++ b/next_docs/en/conf.py
@@ -15,7 +15,8 @@ import subprocess
 import sys

 from sphinx.ext import autodoc
-
+from docutils import nodes
+from docutils.parsers.rst import Directive

 def install(package):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
@@ -58,10 +59,20 @@ extensions = [
    'sphinx_copybutton',
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
+    'sphinx.ext.inheritance_diagram',
    'myst_parser',
    'sphinxarg.ext',
+    'sphinxcontrib.autodoc_pydantic',
 ]

+# class hierarchy diagram: Graphviz attributes for sphinx.ext.inheritance_diagram
+inheritance_graph_attrs = dict(rankdir="LR", size='"8.0, 12.0"', fontsize=14, ratio='compress')
+inheritance_node_attrs = dict(shape='ellipse', fontsize=14, height=0.75)
+# Graphviz names the arrow-shape edge attribute ``arrowhead``; ``arrow`` is not
+# an attribute it defines, so the 'vee' shape is requested under the right key.
+inheritance_edge_attrs = dict(arrowhead='vee')
+
+# autodoc_pydantic rendering options for the pydantic models on the schemas page
+autodoc_pydantic_model_show_json = True
+autodoc_pydantic_model_show_config_summary = False
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

@@ -120,3 +131,21 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 autodoc.ClassDocumenter = MockedClassDocumenter

 navigation_with_keys = False
+
+
+# add custom directive 
+
+
+class VideoDirective(Directive):
+    """Embed a video player in HTML output: ``.. video:: <url>``.
+
+    Takes exactly one argument, the URL to load, and emits a raw HTML
+    ``<iframe>`` node (560x315) pointing at it. The node is created with
+    ``format='html'``, so it only targets HTML output.
+    """
+    required_arguments = 1
+    optional_arguments = 0
+    final_argument_whitespace = True
+    option_spec = {}
+
+    def run(self):
+        """Build the iframe node for the URL given as the directive argument.
+
+        Returns:
+            list: a single ``docutils.nodes.raw`` node holding the iframe markup.
+        """
+        # Imported here so the directive does not depend on conf.py's import block
+        from html import escape
+
+        # The URL is placed inside a double-quoted attribute, so &, <, > and
+        # quotes must be escaped or they would break the generated markup.
+        url = escape(self.arguments[0], quote=True)
+        iframe = (
+            f'<iframe width="560" height="315" src="{url}" frameborder="0" '
+            'allow="accelerometer; autoplay; clipboard-write; encrypted-media; '
+            'gyroscope; picture-in-picture" allowfullscreen></iframe>'
+        )
+        video_node = nodes.raw('', iframe, format='html')
+        return [video_node]
+
+def setup(app):
+    # Register VideoDirective under the name ``video`` so documents can write
+    # ``.. video:: <url>``. ``app`` is presumably the Sphinx application object
+    # passed in when conf.py is loaded -- confirm against the Sphinx docs.
+    app.add_directive('video', VideoDirective)
\ No newline at end of file
--- a/next_docs/en/index.rst
+++ b/next_docs/en/index.rst
@@ -26,6 +26,50 @@ Welcome to the MinerU Documentation
   </p>


+Project Introduction
+--------------------
+
+MinerU is a tool that converts PDFs into machine-readable formats (e.g.,
+markdown, JSON), allowing for easy extraction into any format. MinerU
+was born during the pre-training process of
+`InternLM <https://github.com/InternLM/InternLM>`__. We focus on solving
+symbol conversion issues in scientific literature and hope to contribute
+to technological development in the era of large models. Compared to
+well-known commercial products, MinerU is still young. If you encounter
+any issues or if the results are not as expected, please submit an issue
+on the `issue tracker <https://github.com/opendatalab/MinerU/issues>`__ and **attach
+the relevant PDF**.
+
+.. video:: https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
+
+
+Key Features
+------------
+
+-  Removes elements such as headers, footers, footnotes, and page
+   numbers while maintaining semantic continuity
+-  Outputs text in a human-readable order from multi-column documents
+-  Retains the original structure of the document, including titles,
+   paragraphs, and lists
+-  Extracts images, image captions, tables, and table captions
+-  Automatically recognizes formulas in the document and converts them
+   to LaTeX
+-  Automatically recognizes tables in the document and converts them to
+   LaTeX
+-  Automatically detects and enables OCR for corrupted PDFs
+-  Supports both CPU and GPU environments
+-  Supports Windows, Linux, and Mac platforms
+
+
+User Guide
+-------------
+.. toctree::
+   :maxdepth: 2
+   :caption: User Guide
+
+   user_guide
+
+
 API Reference
 -------------

@@ -34,5 +78,27 @@ method, this part of the documentation is for you.

 .. toctree::
   :maxdepth: 2
+   :caption: API

   api
+
+
+Additional Notes
+------------------
+.. toctree::
+   :maxdepth: 1
+   :caption: Additional Notes
+
+   additional_notes/known_issues
+   additional_notes/faq
+   additional_notes/changelog
+   additional_notes/glossary
+
+
+Projects 
+---------
+.. toctree::
+   :maxdepth: 1
+   :caption: Projects
+
+   projects
\ No newline at end of file
--- a/next_docs/en/projects.rst
+++ b/next_docs/en/projects.rst
+
+
+
+llama_index_rag 
+===============
+
+
+gradio_app
+============
+
+
+other projects
+===============
\ No newline at end of file
--- a/next_docs/en/user_guide.rst
+++ b/next_docs/en/user_guide.rst
+
+
+.. toctree::
+    :maxdepth: 2
+
+    user_guide/install
+    user_guide/quick_start
+    user_guide/tutorial
+    user_guide/data
+    
--- a/next_docs/en/user_guide/data.rst
+++ b/next_docs/en/user_guide/data.rst
+
+
+Data
+=========
+
+.. toctree::
+   :maxdepth: 2
+
+   data/dataset
+
+   data/read_api
+
+   data/data_reader_writer 
+
+   data/io
+
+
+
+
--- a/next_docs/en/user_guide/data/data_reader_writer.rst
+++ b/next_docs/en/user_guide/data/data_reader_writer.rst
+
+Data Reader Writer 
+====================
+
+Aims to read or write bytes from different media. You can implement new classes to meet the needs of your own scenarios
+if MinerU does not provide a suitable class. Implementing a new class is easy: the only requirement is to inherit from
+``DataReader`` or ``DataWriter``.
+
+.. code:: python
+
+    class SomeReader(DataReader):
+        def read(self, path: str) -> bytes:
+            pass
+
+        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+            pass
+
+
+    class SomeWriter(DataWriter):
+        def write(self, path: str, data: bytes) -> None:
+            pass
+
+        def write_string(self, path: str, data: str) -> None:
+            pass
+
+
+Readers may be curious about the difference between :doc:`io` and this section, since the two look very similar at first glance.
+:doc:`io` provides the fundamental functions, while this section works at the application level. Users can build their own classes to meet
+the needs of their applications, and those classes may share the same IO functions. That is why we have :doc:`io`.
+
+
+Important Classes
+-----------------
+
+.. code:: python
+
+    class FileBasedDataReader(DataReader):
+        def __init__(self, parent_dir: str = ''):
+            pass
+
+
+    class FileBasedDataWriter(DataWriter):
+        def __init__(self, parent_dir: str = '') -> None:
+            pass
+
+Class ``FileBasedDataReader`` is initialized with a single parameter, ``parent_dir``. This means that every method ``FileBasedDataReader`` provides has the following features.
+
+Features:
+    #. read content from the absolute path file, ``parent_dir`` will be ignored.
+    #. read the relative path, file will first join with ``parent_dir``, then read content from the merged path
+
+
+.. note::
+
+    ``FileBasedDataWriter`` shares the same behavior with ``FileBasedDataReader``
+
+
+.. code:: python 
+
+    class MultiS3Mixin:
+        def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
+            pass
+
+    class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
+        pass
+
+All read-related methods that class ``MultiBucketS3DataReader`` provides have the following features.
+
+Features:
+    #. read object with full s3-format path, for example ``s3://test_bucket/test_object``, ``default_prefix`` will be ignored.
+    #. read an object with a relative path: the path is first joined with ``default_prefix`` and the ``bucket_name`` is trimmed off, then the content is read. ``bucket_name`` is the first element obtained by splitting ``default_prefix`` on the delimiter ``/``.
+
+.. note::
+    ``MultiBucketS3DataWriter`` shares the same behavior with ``MultiBucketS3DataReader``
+
+
+.. code:: python
+
+    class S3DataReader(MultiBucketS3DataReader):
+        pass
+
+``S3DataReader`` is built on top of ``MultiBucketS3DataReader`` and supports only a single bucket. The same applies to ``S3DataWriter``.
+
+
+Read Examples
+-------------
+
+.. code:: python
+
+    # file based related 
+    file_based_reader1 = FileBasedDataReader('')
+
+    ## will read file abc 
+    file_based_reader1.read('abc') 
+
+    file_based_reader2 = FileBasedDataReader('/tmp')
+
+    ## will read /tmp/abc
+    file_based_reader2.read('abc')
+
+    ## will read /var/logs/message.txt
+    file_based_reader2.read('/var/logs/message.txt')
+
+    # multi bucket s3 related
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", [S3Config(
+            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=test_bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        )])
+    
+    ## will read s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_reader1.read('abc')
+
+    ## will read s3://test_bucket1/efg
+    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+
+    ## will read s3://test_bucket2/abc
+    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+
+    # s3 related
+    s3_reader1 = S3DataReader(
+        default_prefix_without_bucket="test_prefix",
+        bucket="test_bucket",
+        ak="ak",
+        sk="sk",
+        endpoint_url="localhost",
+    )
+
+    ## will read s3://test_bucket/test_prefix/abc 
+    s3_reader1.read('abc')
+   
+    ## will read s3://test_bucket/efg
+    s3_reader1.read('s3://test_bucket/efg')
+
+
+Write Examples
+---------------
+
+.. code:: python
+
+    # file based related 
+    file_based_writer1 = FileBasedDataWriter('')
+
+    ## will write 123 to abc
+    file_based_writer1.write('abc', '123'.encode()) 
+
+    ## will write 123 to abc
+    file_based_writer1.write_string('abc', '123') 
+
+    file_based_writer2 = FileBasedDataWriter('/tmp')
+
+    ## will write 123 to /tmp/abc
+    file_based_writer2.write_string('abc', '123')
+
+    ## will write 123 to /var/logs/message.txt
+    file_based_writer2.write_string('/var/logs/message.txt', '123')
+
+    # multi bucket s3 related
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", [S3Config(
+            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=test_bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        )])
+    
+    ## will write 123 to s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_writer1.write_string('abc', '123')
+
+    ## will write 123 to s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_writer1.write('abc', '123'.encode())
+
+    ## will write 123 to s3://test_bucket1/efg
+    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+
+    ## will write 123 to s3://test_bucket2/abc
+    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+
+    # s3 related
+    s3_writer1 = S3DataWriter(
+        default_prefix_without_bucket="test_prefix",
+        bucket="test_bucket",
+        ak="ak",
+        sk="sk",
+        endpoint_url="localhost",
+    )
+
+    ## will write 123 to s3://test_bucket/test_prefix/abc 
+    s3_writer1.write('abc', '123'.encode())
+
+    ## will write 123 to s3://test_bucket/test_prefix/abc 
+    s3_writer1.write_string('abc', '123')
+
+    ## will write 123 to s3://test_bucket/efg
+    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
+
+
+Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/data_reader_writer` for more details
--- a/next_docs/en/user_guide/data/dataset.rst
+++ b/next_docs/en/user_guide/data/dataset.rst
+
+
+Dataset 
+===========
+
+
+Import Classes 
+-----------------
+
+Dataset 
+^^^^^^^^
+
+Each pdf or image forms one ``Dataset``. As we all know, pdfs fall into two categories: :ref:`digital_method_section` or :ref:`ocr_method_section`.
+Images yield an ``ImageDataset``, which is a subclass of ``Dataset``, and pdf files yield a ``PymuDocDataset``.
+The difference between ``ImageDataset`` and ``PymuDocDataset`` is that ``ImageDataset`` only supports the ``OCR`` parse method,
+while ``PymuDocDataset`` supports both ``OCR`` and ``TXT``.
+
+.. note::
+
+    In fact, some pdfs are generated from images, which means they cannot support the ``TXT`` method. Currently the user is responsible for making sure the ``TXT`` method is not used on such files.
+
+
+
+Pdf Parse Methods
+------------------
+
+.. _ocr_method_section:
+
+OCR 
+^^^^
+Extract chars via ``Optical Character Recognition`` technology.
+
+.. _digital_method_section:
+
+TXT
+^^^^^^^^
+Extract chars via third-party library, currently we use ``pymupdf``. 
+
+
+
+Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/dataset` for more details
+
--- a/next_docs/en/user_guide/data/io.rst
+++ b/next_docs/en/user_guide/data/io.rst
+
+IO
+===
+
+Aims to read or write bytes from different media. Currently we provide ``S3Reader`` and ``S3Writer`` for AWS S3 compatible media
+and ``HttpReader`` and ``HttpWriter`` for remote HTTP files. You can implement new classes to meet the needs of your own scenarios
+if MinerU does not provide a suitable class. Implementing a new class is easy: the only requirement is to inherit from
+``IOReader`` or ``IOWriter``.
+
+.. code:: python
+
+    class SomeReader(IOReader):
+        def read(self, path: str) -> bytes:
+            pass
+
+        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+            pass
+
+
+    class SomeWriter(IOWriter):
+        def write(self, path: str, data: bytes) -> None:
+            pass
+
+Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/io` for more details
+
--- a/next_docs/en/user_guide/data/read_api.rst
+++ b/next_docs/en/user_guide/data/read_api.rst
+
+read_api 
+==========
+
+Read the content from a file or directory to create a ``Dataset``. Currently we provide several functions that cover some common scenarios.
+If you have a new scenario that is common to most users, you can post it on the official GitHub issues with a detailed description.
+It is also easy to implement your own read-related functions.
+
+
+Important Functions
+-------------------
+
+
+read_jsonl
+^^^^^^^^^^^^^^^^
+
+Read the content from a jsonl file, which may be located on the local machine or on remote s3. If you want to know more about jsonl, please go to :doc:`../../additional_notes/glossary`
+
+.. code:: python
+
+    # read jsonl from local machine 
+    datasets = read_jsonl("tt.jsonl", None)
+
+    # read jsonl from remote s3
+    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
+
+
+read_local_pdfs
+^^^^^^^^^^^^^^^^
+
+Read pdf from path or directory.
+
+
+.. code:: python
+
+    # read pdf path
+    datasets = read_local_pdfs("tt.pdf")
+
+    # read pdfs under directory
+    datasets = read_local_pdfs("pdfs/")
+
+
+read_local_images
+^^^^^^^^^^^^^^^^^^^
+
+Read images from path or directory
+
+.. code:: python 
+
+    # read from image path 
+    datasets = read_local_images("tt.png")
+
+
+    # read files from directory that endswith suffix in suffixes array 
+    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+
+
+Check :doc:`../../api/read_api` for more details
\ No newline at end of file
--- a/next_docs/en/user_guide/install.rst
+++ b/next_docs/en/user_guide/install.rst
+
+Installation
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   install/install
+   install/boost_with_cuda
+   install/download_model_weight_files
+
+
--- a/next_docs/en/user_guide/install/boost_with_cuda.rst
+++ b/next_docs/en/user_guide/install/boost_with_cuda.rst
+
+Boost With Cuda 
+================
+
+
+If your device supports CUDA and meets the GPU requirements of the
+mainline environment, you can use GPU acceleration. Please select the
+appropriate guide based on your system:
+
+-  :ref:`ubuntu_22_04_lts_section`
+-  :ref:`windows_10_or_11_section`
+
+-  Quick Deployment with Docker: Docker requires a GPU with at least
+   16GB of VRAM, and all acceleration features are enabled by default.
+
+.. note:: 
+
+   Before running this Docker, you can use the following command to
+   check if your device supports CUDA acceleration on Docker. 
+
+   .. code:: bash
+
+      docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+
+.. code:: sh
+
+   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+   docker build -t mineru:latest .
+   docker run --rm -it --gpus=all mineru:latest /bin/bash
+   magic-pdf --help
+
+.. _ubuntu_22_04_lts_section:
+
+Ubuntu 22.04 LTS
+-----------------
+
+1. Check if NVIDIA Drivers Are Installed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: sh
+
+   nvidia-smi
+
+If you see information similar to the following, it means that the
+NVIDIA drivers are already installed, and you can skip Step 2.
+
+Notice: ``CUDA Version`` should be >= 12.1. If the displayed version
+number is less than 12.1, please upgrade the driver.
+
+.. code:: text
+
+   +---------------------------------------------------------------------------------------+
+   | NVIDIA-SMI 537.34                 Driver Version: 537.34       CUDA Version: 12.2     |
+   |-----------------------------------------+----------------------+----------------------+
+   | GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+   | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
+   |                                         |                      |               MIG M. |
+   |=========================================+======================+======================|
+   |   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
+   |  0%   51C    P8              12W / 200W |   1489MiB /  8192MiB |      5%      Default |
+   |                                         |                      |                  N/A |
+   +-----------------------------------------+----------------------+----------------------+
+
+2. Install the Driver
+~~~~~~~~~~~~~~~~~~~~~
+
+If no driver is installed, use the following command:
+
+.. code:: sh
+
+   sudo apt-get update
+   sudo apt-get install nvidia-driver-545
+
+Install the proprietary driver and restart your computer after
+installation.
+
+.. code:: sh
+
+   reboot
+
+3. Install Anaconda
+~~~~~~~~~~~~~~~~~~~
+
+If Anaconda is already installed, skip this step.
+
+.. code:: sh
+
+   wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
+   bash Anaconda3-2024.06-1-Linux-x86_64.sh
+
+In the final step, enter ``yes``, close the terminal, and reopen it.
+
+4. Create an Environment Using Conda
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Specify Python version 3.10.
+
+.. code:: sh
+
+   conda create -n MinerU python=3.10
+   conda activate MinerU
+
+5. Install Applications
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: sh
+
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
+
+❗ After installation, make sure to check the version of ``magic-pdf``
+using the following command:
+
+.. code:: sh
+
+   magic-pdf --version
+
+If the version number is less than 0.7.0, please report the issue.
+
+6. Download Models
+~~~~~~~~~~~~~~~~~~
+
+Refer to detailed instructions on :doc:`download_model_weight_files`
+
+7. Understand the Location of the Configuration File
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+After completing the `6. Download Models <#6-download-models>`__ step,
+the script will automatically generate a ``magic-pdf.json`` file in the
+user directory and configure the default model path. You can find the
+``magic-pdf.json`` file in your user directory.
+
+   The user directory for Linux is “/home/username”.
+
+8. First Run
+~~~~~~~~~~~~
+
+Download a sample file from the repository and test it.
+
+.. code:: sh
+
+   wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
+   magic-pdf -p small_ocr.pdf
+
+9. Test CUDA Acceleration
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If your graphics card has at least **8GB** of VRAM, follow these steps
+to test CUDA acceleration:
+
+   ❗ Due to the extremely limited nature of 8GB VRAM for running this
+   application, you need to close all other programs using VRAM to
+   ensure that 8GB of VRAM is available when running this application.
+
+1. Modify the value of ``"device-mode"`` in the ``magic-pdf.json``
+   configuration file located in your home directory.
+
+   .. code:: json
+
+      {
+        "device-mode": "cuda"
+      }
+
+2. Test CUDA acceleration with the following command:
+
+   .. code:: sh
+
+      magic-pdf -p small_ocr.pdf
+
+10. Enable CUDA Acceleration for OCR
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. Download ``paddlepaddle-gpu``. Installation will automatically enable
+   OCR acceleration.
+
+   .. code:: sh
+
+      python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+
+2. Test OCR acceleration with the following command:
+
+   .. code:: sh
+
+      magic-pdf -p small_ocr.pdf
+
+.. _windows_10_or_11_section:
+
+Windows 10/11
+--------------
+
+1. Install CUDA and cuDNN
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Required versions: CUDA 11.8 + cuDNN 8.7.0
+
+-  CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
+-  cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x:
+   https://developer.nvidia.com/rdp/cudnn-archive
+
+2. Install Anaconda
+~~~~~~~~~~~~~~~~~~~
+
+If Anaconda is already installed, you can skip this step.
+
+Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
+
+3. Create an Environment Using Conda
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Python version must be 3.10.
+
+::
+
+   conda create -n MinerU python=3.10
+   conda activate MinerU
+
+4. Install Applications
+~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
+
+..
+
+   ❗️After installation, verify the version of ``magic-pdf``:
+
+   .. code:: bash
+
+      magic-pdf --version
+
+   If the version number is less than 0.7.0, please report it in the
+   issues section.
+
+5. Download Models
+~~~~~~~~~~~~~~~~~~
+
+Refer to detailed instructions on :doc:`download_model_weight_files`
+
+6. Understand the Location of the Configuration File
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+After completing the `5. Download Models <#5-download-models>`__ step,
+the script will automatically generate a ``magic-pdf.json`` file in the
+user directory and configure the default model path. You can find the
+``magic-pdf.json`` file in your user directory.
+
+   The user directory for Windows is “C:/Users/username”.
+
+7. First Run
+~~~~~~~~~~~~
+
+Download a sample file from the repository and test it.
+
+.. code:: powershell
+
+     wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
+     magic-pdf -p small_ocr.pdf
+
+8. Test CUDA Acceleration
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If your graphics card has at least 8GB of VRAM, follow these steps to
+test CUDA-accelerated parsing performance.
+
+   ❗ Due to the extremely limited nature of 8GB VRAM for running this
+   application, you need to close all other programs using VRAM to
+   ensure that 8GB of VRAM is available when running this application.
+
+1. **Overwrite the installation of torch and torchvision** supporting
+   CUDA.
+
+   ::
+
+      pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
+
+   ..
+
+      ❗️Ensure the following versions are specified in the command:
+
+      ::
+
+         torch==2.3.1 torchvision==0.18.1
+
+      These are the highest versions we support. Installing higher
+      versions without specifying them will cause the program to fail.
+
+2. **Modify the value of ``"device-mode"``** in the ``magic-pdf.json``
+   configuration file located in your user directory.
+
+   .. code:: json
+
+      {
+        "device-mode": "cuda"
+      }
+
+3. **Run the following command to test CUDA acceleration**:
+
+   ::
+
+      magic-pdf -p small_ocr.pdf
+
+9. Enable CUDA Acceleration for OCR
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. **Download paddlepaddle-gpu**, which will automatically enable OCR
+   acceleration upon installation.
+
+   ::
+
+      pip install paddlepaddle-gpu==2.6.1
+
+2. **Run the following command to test OCR acceleration**:
+
+   ::
+
+      magic-pdf -p small_ocr.pdf
+
--- a/next_docs/en/user_guide/install/download_model_weight_files.rst
+++ b/next_docs/en/user_guide/install/download_model_weight_files.rst
+
+Download Model Weight Files
+==============================
+
+Model downloads are divided into initial downloads and updates to the
+model directory. Please refer to the corresponding documentation for
+instructions on how to proceed.
+
+Initial download of model files
+-------------------------------
+
+1. Download the Model from Hugging Face
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use a Python Script to Download Model Files from Hugging Face
+
+.. code:: bash
+
+   pip install huggingface_hub
+   wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
+   python download_models_hf.py
+
+The Python script will automatically download the model files and
+configure the model directory in the configuration file.
+
+The configuration file can be found in the user directory, with the
+filename ``magic-pdf.json``.
+
+How to update models previously downloaded
+------------------------------------------
+
+1. Models downloaded via Git LFS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Due to feedback from some users that downloading model files using
+   git lfs was incomplete or resulted in corrupted model files, this
+   method is no longer recommended.
+
+If you previously downloaded model files via git lfs, you can navigate
+to the previous download directory and use the ``git pull`` command to
+update the model.
+
+2. Models downloaded via Hugging Face or Model Scope
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you previously downloaded models via Hugging Face or Model Scope, you
+can rerun the Python script used for the initial download. This will
+automatically update the model directory to the latest version.
--- a/next_docs/en/user_guide/install/install.rst
+++ b/next_docs/en/user_guide/install/install.rst
+
+Install 
+===============================================================
+If you encounter any installation issues, please first consult the FAQ.
+If the parsing results are not as expected, refer to the Known Issues.
+There are three different ways to experience MinerU
+
+Pre-installation Notice—Hardware and Software Environment Support
+------------------------------------------------------------------
+
+To ensure the stability and reliability of the project, we only optimize
+and test for specific hardware and software environments during
+development. This ensures that users deploying and running the project
+on recommended system configurations will get the best performance with
+the fewest compatibility issues.
+
+By focusing resources on the mainline environment, our team can more
+efficiently resolve potential bugs and develop new features.
+
+In non-mainline environments, due to the diversity of hardware and
+software configurations, as well as third-party dependency compatibility
+issues, we cannot guarantee 100% project availability. Therefore, for
+users who wish to use this project in non-recommended environments, we
+suggest carefully reading the documentation and FAQ first. Most issues
+already have corresponding solutions in the FAQ. We also encourage
+community feedback to help us gradually expand support.
+
+.. raw:: html
+
+   <style>
+      table, th, td {
+      border: 1px solid black;
+      border-collapse: collapse;
+      }
+   </style>
+   <table>
+    <tr>
+        <td colspan="3" rowspan="2">Operating System</td>
+    </tr>
+    <tr>
+        <td>Ubuntu 22.04 LTS</td>
+        <td>Windows 10 / 11</td>
+        <td>macOS 11+</td>
+    </tr>
+    <tr>
+        <td colspan="3">CPU</td>
+        <td>x86_64</td>
+        <td>x86_64</td>
+        <td>x86_64 / arm64</td>
+    </tr>
+    <tr>
+        <td colspan="3">Memory</td>
+        <td colspan="3">16GB or more, recommended 32GB+</td>
+    </tr>
+    <tr>
+        <td colspan="3">Python Version</td>
+        <td colspan="3">3.10</td>
+    </tr>
+    <tr>
+        <td colspan="3">Nvidia Driver Version</td>
+        <td>latest (Proprietary Driver)</td>
+        <td>latest</td>
+        <td>None</td>
+    </tr>
+    <tr>
+        <td colspan="3">CUDA Environment</td>
+        <td>Automatic installation [12.1 (pytorch) + 11.8 (paddle)]</td>
+        <td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
+        <td>None</td>
+    </tr>
+    <tr>
+        <td rowspan="2">GPU Hardware Support List</td>
+        <td colspan="2">Minimum Requirement 8G+ VRAM</td>
+        <td colspan="2">3060ti/3070/3080/3080ti/4060/4070/4070ti<br>
+        8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
+        <td rowspan="2">None</td>
+    </tr>
+    <tr>
+        <td colspan="2">Recommended Configuration 16G+ VRAM</td>
+        <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
+        16G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
+        </td>
+    </tr>
+   </table>
+
+
+Create an environment
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: shell
+
+    conda create -n MinerU python=3.10
+    conda activate MinerU
+    pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
+
+
+Download model weight files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: shell
+
+    pip install huggingface_hub
+    wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
+    python download_models_hf.py    
+
+
+MinerU is now installed. Check out :doc:`../quick_start`, or read :doc:`boost_with_cuda` to accelerate inference.
\ No newline at end of file
--- a/next_docs/en/user_guide/quick_start.rst
+++ b/next_docs/en/user_guide/quick_start.rst
+
+Quick Start 
+==============
+
+Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
+
+
+.. toctree::
+    :maxdepth: 1
+
+    quick_start/command_line
+    quick_start/to_markdown
+
--- a/next_docs/en/user_guide/quick_start/command_line.rst
+++ b/next_docs/en/user_guide/quick_start/command_line.rst
+
+
+Command Line
+===================
+
+.. code:: bash
+
+   magic-pdf --help
+   Usage: magic-pdf [OPTIONS]
+
+   Options:
+     -v, --version                display the version and exit
+     -p, --path PATH              local pdf filepath or directory  [required]
+     -o, --output-dir PATH        output local directory  [required]
+     -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
+                                  technique to extract information from pdf. txt:
+                                  suitable for the text-based pdf only and
+                                  outperform ocr. auto: automatically choose the
+                                  best method for parsing pdf from ocr and txt.
+                                  without method specified, auto will be used by
+                                  default.
+     -l, --lang TEXT              Input the languages in the pdf (if known) to
+                                  improve OCR accuracy.  Optional. You should
+                                  input "Abbreviation" with language form url: ht
+                                  tps://paddlepaddle.github.io/PaddleOCR/en/ppocr
+                                  /blog/multi_languages.html#5-support-languages-
+                                  and-abbreviations
+     -d, --debug BOOLEAN          Enables detailed debugging information during
+                                  the execution of the CLI commands.
+     -s, --start INTEGER          The starting page for PDF parsing, beginning
+                                  from 0.
+     -e, --end INTEGER            The ending page for PDF parsing, beginning from
+                                  0.
+     --help                       Show this message and exit.
+
+
+   ## show version
+   magic-pdf -v
+
+   ## command line example
+   magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
+
+``{some_pdf}`` can be a single PDF file or a directory containing
+multiple PDFs. The results will be saved in the ``{some_output_dir}``
+directory. The output file list is as follows:
+
+.. code:: text
+
+   ├── some_pdf.md                          # markdown file
+   ├── images                               # directory for storing images
+   ├── some_pdf_layout.pdf                  # layout diagram
+   ├── some_pdf_middle.json                 # MinerU intermediate processing result
+   ├── some_pdf_model.json                  # model inference result
+   ├── some_pdf_origin.pdf                  # original PDF file
+   ├── some_pdf_spans.pdf                   # smallest granularity bbox position information diagram
+   └── some_pdf_content_list.json           # Rich text JSON arranged in reading order
+
+For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`
+
--- a/next_docs/en/user_guide/quick_start/extract_text.rst
+++ b/next_docs/en/user_guide/quick_start/extract_text.rst
+
+
+Extract Content from Pdf
+========================
+
+.. code:: python
+
+    from magic_pdf.data.read_api import read_local_pdfs
+    from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
--- a/next_docs/en/user_guide/quick_start/to_markdown.rst
+++ b/next_docs/en/user_guide/quick_start/to_markdown.rst
+
+
+Convert To Markdown
+========================
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
+    from magic_pdf.pipe.OCRPipe import OCRPipe
+
+
+    ## args
+    model_list = []
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+
+
+    ## prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    ) # create 00
+    image_dir = str(os.path.basename(local_image_dir))
+
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)   # read the pdf content
+
+
+    pipe = OCRPipe(pdf_bytes, model_list, image_writer)
+
+    pipe.pipe_classify()
+    pipe.pipe_analyze()
+    pipe.pipe_parse()
+
+    pdf_info = pipe.pdf_mid_data["pdf_info"]
+
+
+    md_content = pipe.pipe_mk_markdown(
+        image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
+    )
+
+    if isinstance(md_content, list):
+        md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
+    else:
+        md_writer.write_string(f"{pdf_file_name}.md", md_content)
+
+
+Check :doc:`../data/data_reader_writer` for more [reader | writer] examples 
--- a/next_docs/en/user_guide/tutorial.rst
+++ b/next_docs/en/user_guide/tutorial.rst
+
+Tutorial
+===========
+
+This tutorial shows, from beginning to end, how to use MinerU via a minimal project.
+
+.. toctree::
+    :maxdepth: 1
+
+    tutorial/output_file_description
\ No newline at end of file
--- a/next_docs/en/user_guide/tutorial/output_file_description.rst
+++ b/next_docs/en/user_guide/tutorial/output_file_description.rst
+
+Output File Description
+=========================
+
+After executing the ``magic-pdf`` command, in addition to outputting
+files related to markdown, several other files unrelated to markdown
+will also be generated. These files will be introduced one by one.
+
+some_pdf_layout.pdf
+~~~~~~~~~~~~~~~~~~~
+
+Each page layout consists of one or more boxes. The number at the top
+left of each box indicates its sequence number. Additionally, in
+``layout.pdf``, different content blocks are highlighted with different
+background colors.
+
+.. figure:: ../../_static/image/layout_example.png
+   :alt: layout example
+
+   layout example
+
+some_pdf_spans.pdf
+~~~~~~~~~~~~~~~~~~
+
+All spans on the page are drawn with different colored line frames
+according to the span type. This file can be used for quality control,
+allowing for quick identification of issues such as missing text or
+unrecognized inline formulas.
+
+.. figure:: ../../_static/image/spans_example.png
+   :alt: spans example
+
+   spans example
+
+some_pdf_model.json
+~~~~~~~~~~~~~~~~~~~
+
+Structure Definition
+^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+   from pydantic import BaseModel, Field
+   from enum import IntEnum
+
+   class CategoryType(IntEnum):
+        title = 0               # Title
+        plain_text = 1          # Text
+        abandon = 2             # Includes headers, footers, page numbers, and page annotations
+        figure = 3              # Image
+        figure_caption = 4      # Image description
+        table = 5               # Table
+        table_caption = 6       # Table description
+        table_footnote = 7      # Table footnote
+        isolate_formula = 8     # Block formula
+        formula_caption = 9     # Formula label
+
+        embedding = 13          # Inline formula
+        isolated = 14           # Block formula
+        text = 15               # OCR recognition result
+
+
+   class PageInfo(BaseModel):
+       page_no: int = Field(description="Page number, the first page is 0", ge=0)
+       height: int = Field(description="Page height", gt=0)
+       width: int = Field(description="Page width", ge=0)
+
+   class ObjectInferenceResult(BaseModel):
+       category_id: CategoryType = Field(description="Category", ge=0)
+       poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
+       score: float = Field(description="Confidence of the inference result")
+       latex: str | None = Field(description="LaTeX parsing result", default=None)
+       html: str | None = Field(description="HTML parsing result", default=None)
+
+   class PageInferenceResults(BaseModel):
+        layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
+        page_info: PageInfo = Field(description="Page metadata")
+
+
+   # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
+   inference_result: list[PageInferenceResults] = []
+
+The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
+representing the coordinates of the top-left, top-right, bottom-right,
+and bottom-left points respectively. |Poly Coordinate Diagram|
+
+example
+^^^^^^^
+
+.. code:: json
+
+   [
+       {
+           "layout_dets": [
+               {
+                   "category_id": 2,
+                   "poly": [
+                       99.1906967163086,
+                       100.3119125366211,
+                       730.3707885742188,
+                       100.3119125366211,
+                       730.3707885742188,
+                       245.81326293945312,
+                       99.1906967163086,
+                       245.81326293945312
+                   ],
+                   "score": 0.9999997615814209
+               }
+           ],
+           "page_info": {
+               "page_no": 0,
+               "height": 2339,
+               "width": 1654
+           }
+       },
+       {
+           "layout_dets": [
+               {
+                   "category_id": 5,
+                   "poly": [
+                       99.13092803955078,
+                       2210.680419921875,
+                       497.3183898925781,
+                       2210.680419921875,
+                       497.3183898925781,
+                       2264.78076171875,
+                       99.13092803955078,
+                       2264.78076171875
+                   ],
+                   "score": 0.9999997019767761
+               }
+           ],
+           "page_info": {
+               "page_no": 1,
+               "height": 2339,
+               "width": 1654
+           }
+       }
+   ]
+
+some_pdf_middle.json
+~~~~~~~~~~~~~~~~~~~~
+
++-------+--------------------------------------------------------------+
+| Field | Description                                                  |
+| Name  |                                                              |
++=======+==============================================================+
+| pdf   | list, each element is a dict representing the parsing result |
+| _info | of each PDF page, see the table below for details            |
++-------+--------------------------------------------------------------+
+| \_    | ocr \| txt, used to indicate the mode used in this           |
+| parse | intermediate parsing state                                   |
+| _type |                                                              |
++-------+--------------------------------------------------------------+
+| \_ve  | string, indicates the version of magic-pdf used in this      |
+| rsion | parsing                                                      |
+| _name |                                                              |
++-------+--------------------------------------------------------------+
+
+**pdf_info**
+
+Field structure description
+
++---------+------------------------------------------------------------+
+| Field   | Description                                                |
+| Name    |                                                            |
++=========+============================================================+
+| preproc | Intermediate result after PDF preprocessing, not yet       |
+| _blocks | segmented                                                  |
++---------+------------------------------------------------------------+
+| layout  | Layout segmentation results, containing layout direction   |
+| _bboxes | (vertical, horizontal), and bbox, sorted by reading order  |
++---------+------------------------------------------------------------+
+| p       | Page number, starting from 0                               |
+| age_idx |                                                            |
++---------+------------------------------------------------------------+
+| pa      | Page width and height                                      |
+| ge_size |                                                            |
++---------+------------------------------------------------------------+
+| \_layo  | Layout tree structure                                      |
+| ut_tree |                                                            |
++---------+------------------------------------------------------------+
+| images  | list, each element is a dict representing an img_block     |
++---------+------------------------------------------------------------+
+| tables  | list, each element is a dict representing a table_block    |
++---------+------------------------------------------------------------+
+| inter   | list, each element is a dict representing an               |
+| line_eq | interline_equation_block                                   |
+| uations |                                                            |
++---------+------------------------------------------------------------+
+| di      | List, block information returned by the model that needs   |
+| scarded | to be dropped                                              |
+| _blocks |                                                            |
++---------+------------------------------------------------------------+
+| para    | Result after segmenting preproc_blocks                     |
+| _blocks |                                                            |
++---------+------------------------------------------------------------+
+
+In the above table, ``para_blocks`` is an array of dicts, each dict
+representing a block structure. A block can support up to one level of
+nesting.
+
+**block**
+
+The outer block is referred to as a first-level block, and the fields in
+the first-level block include:
+
++---------+-------------------------------------------------------------+
+| Field   | Description                                                 |
+| Name    |                                                             |
++=========+=============================================================+
+| type    | Block type (table|image)                                    |
++---------+-------------------------------------------------------------+
+| bbox    | Block bounding box coordinates                              |
++---------+-------------------------------------------------------------+
+| blocks  | list, each element is a dict representing a second-level    |
+|         | block                                                       |
++---------+-------------------------------------------------------------+
+
+There are only two types of first-level blocks: “table” and “image”. All
+other blocks are second-level blocks.
+
+The fields in a second-level block include:
+
++-----+----------------------------------------------------------------+
+| Fi  | Description                                                    |
+| eld |                                                                |
+| N   |                                                                |
+| ame |                                                                |
++=====+================================================================+
+| t   | Block type                                                     |
+| ype |                                                                |
++-----+----------------------------------------------------------------+
+| b   | Block bounding box coordinates                                 |
+| box |                                                                |
++-----+----------------------------------------------------------------+
+| li  | list, each element is a dict representing a line, used to      |
+| nes | describe the composition of a line of information              |
++-----+----------------------------------------------------------------+
+
+Detailed explanation of second-level block types
+
+================== ======================
+type               Description
+================== ======================
+image_body         Main body of the image
+image_caption      Image description text
+table_body         Main body of the table
+table_caption      Table description text
+table_footnote     Table footnote
+text               Text block
+title              Title block
+interline_equation Block formula
+================== ======================
+
+**line**
+
+The field format of a line is as follows:
+
++-----+----------------------------------------------------------------+
+| Fi  | Description                                                    |
+| eld |                                                                |
+| N   |                                                                |
+| ame |                                                                |
++=====+================================================================+
+| b   | Bounding box coordinates of the line                           |
+| box |                                                                |
++-----+----------------------------------------------------------------+
+| sp  | list, each element is a dict representing a span, used to      |
+| ans | describe the composition of the smallest unit                  |
++-----+----------------------------------------------------------------+
+
+**span**
+
++----------+-----------------------------------------------------------+
+| Field    | Description                                               |
+| Name     |                                                           |
++==========+===========================================================+
+| bbox     | Bounding box coordinates of the span                      |
++----------+-----------------------------------------------------------+
+| type     | Type of the span                                          |
++----------+-----------------------------------------------------------+
+| content  | Text spans use content, chart spans use img_path to store |
+| \|       | the actual text or screenshot path information            |
+| img_path |                                                           |
++----------+-----------------------------------------------------------+
+
+The types of spans are as follows:
+
+================== ==============
+type               Description
+================== ==============
+image              Image
+table              Table
+text               Text
+inline_equation    Inline formula
+interline_equation Block formula
+================== ==============
+
+**Summary**
+
+A span is the smallest storage unit for all elements.
+
+The elements stored within para_blocks are block information.
+
+The block structure is as follows:
+
+First-level block (if any) -> Second-level block -> Line -> Span
+
+.. _example-1:
+
+example
+^^^^^^^
+
+.. code:: json
+
+   {
+       "pdf_info": [
+           {
+               "preproc_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ],
+               "layout_bboxes": [
+                   {
+                       "layout_bbox": [
+                           52,
+                           61,
+                           294,
+                           731
+                       ],
+                       "layout_label": "V",
+                       "sub_layout": []
+                   }
+               ],
+               "page_idx": 0,
+               "page_size": [
+                   612.0,
+                   792.0
+               ],
+               "_layout_tree": [],
+               "images": [],
+               "tables": [],
+               "interline_equations": [],
+               "discarded_blocks": [],
+               "para_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ]
+           }
+       ],
+       "_parse_type": "txt",
+       "_version_name": "0.6.1"
+   }
+
+.. |Poly Coordinate Diagram| image:: ../../_static/image/poly.png
--- a/next_docs/requirements.txt
+++ b/next_docs/requirements.txt
@@ -5,7 +5,8 @@ Pillow==8.4.0
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
 sphinx
-sphinx-argparse
-sphinx-book-theme
-sphinx-copybutton
-sphinx_rtd_theme
+sphinx-argparse>=0.5.2
+sphinx-book-theme>=1.1.3
+sphinx-copybutton>=0.5.2
+sphinx_rtd_theme>=3.0.1
+autodoc_pydantic>=2.2.0
\ No newline at end of file
--- a/next_docs/zh_cn/.readthedocs.yaml
+++ b/next_docs/zh_cn/.readthedocs.yaml
@@ -10,7 +10,7 @@ formats:

 python:
  install:
-    - requirements: docs/requirements.txt
+    - requirements: next_docs/requirements.txt

 sphinx:
-  configuration: docs/zh_cn/conf.py
+  configuration: next_docs/zh_cn/conf.py
--- a/projects/web_demo/README.md
+++ b/projects/web_demo/README.md
@@ -56,5 +56,5 @@ python3 app.py or python app.py
 ps：API documentation

 ```
-Open the mineru-web API mineru-web接口文档.html in the browser
+https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
 ```
--- a/projects/web_demo/README_zh-CN.md
+++ b/projects/web_demo/README_zh-CN.md
@@ -55,5 +55,5 @@ python3 app.py 或者 python app.py
 ps：接口文档

 ```
-在浏览器打开 mineru-web接口文档.html
+https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
 ```
--- a/projects/web_demo/mineru-web接口文档.html
+++ b/projects/web_demo/mineru-web接口文档.html
--- a/scripts/download_models.py
+++ b/scripts/download_models.py
+import json
+import os
+
+import requests
+from modelscope import snapshot_download
+
+
+def download_json(url):
+    # 下载JSON文件
+    response = requests.get(url)
+    response.raise_for_status()  # 检查请求是否成功
+    return response.json()
+
+
+def download_and_modify_json(url, local_filename, modifications):
+    if os.path.exists(local_filename):
+        data = json.load(open(local_filename))
+        config_version = data.get('config_version', '0.0.0')
+        if config_version < '1.0.0':
+            data = download_json(url)
+    else:
+        data = download_json(url)
+
+    # 修改内容
+    for key, value in modifications.items():
+        data[key] = value
+
+    # 保存修改后的内容
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    mineru_patterns = [
+        "models/Layout/LayoutLMv3/*",
+        "models/Layout/YOLO/*",
+        "models/MFD/YOLO/*",
+        "models/MFR/unimernet_small/*",
+        "models/TabRec/TableMaster/*",
+        "models/TabRec/StructEqTable/*",
+    ]
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
+    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
+    model_dir = model_dir + '/models'
+    print(f'model_dir is: {model_dir}')
+    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
+
+    json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
+    config_file_name = 'magic-pdf.json'
+    home_dir = os.path.expanduser('~')
+    config_file = os.path.join(home_dir, config_file_name)
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)
+    print(f'The configuration file has been configured successfully, the path is: {config_file}')
--- a/scripts/download_models_hf.py
+++ b/scripts/download_models_hf.py
+import json
+import os
+
+import requests
+from huggingface_hub import snapshot_download
+
+
+def download_json(url):
+    # 下载JSON文件
+    response = requests.get(url)
+    response.raise_for_status()  # 检查请求是否成功
+    return response.json()
+
+
+def download_and_modify_json(url, local_filename, modifications):
+    if os.path.exists(local_filename):
+        data = json.load(open(local_filename))
+        config_version = data.get('config_version', '0.0.0')
+        if config_version < '1.0.0':
+            data = download_json(url)
+    else:
+        data = download_json(url)
+
+    # 修改内容
+    for key, value in modifications.items():
+        data[key] = value
+
+    # 保存修改后的内容
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+
+    mineru_patterns = [
+        "models/Layout/LayoutLMv3/*",
+        "models/Layout/YOLO/*",
+        "models/MFD/YOLO/*",
+        "models/MFR/unimernet_small/*",
+        "models/TabRec/TableMaster/*",
+        "models/TabRec/StructEqTable/*",
+    ]
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
+
+    layoutreader_pattern = [
+        "*.json",
+        "*.safetensors",
+    ]
+    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
+
+    model_dir = model_dir + '/models'
+    print(f'model_dir is: {model_dir}')
+    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
+
+    json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
+    config_file_name = 'magic-pdf.json'
+    home_dir = os.path.expanduser('~')
+    config_file = os.path.join(home_dir, config_file_name)
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)
+    print(f'The configuration file has been configured successfully, the path is: {config_file}')
--- a/setup.py
+++ b/setup.py
@@ -43,8 +43,9 @@ if __name__ == '__main__':
                     "paddleocr==2.7.3",  # 2.8.0及2.8.1版本与detectron2有冲突，需锁定2.7.3
                     "paddlepaddle==3.0.0b1;platform_system=='Linux'",  # 解决linux的段异常问题
                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # windows版本3.0.0b1效率下降，需锁定2.6.1
-                     "pypandoc",  # 表格解析latex转html
-                     "struct-eqtable==0.1.0",  # 表格解析
+                     "struct-eqtable==0.3.2",  # 表格解析
+                     "einops",  # struct-eqtable依赖
+                     "accelerate",  # struct-eqtable依赖
                     "doclayout_yolo==0.0.2",  # doclayout_yolo
                     "detectron2"
                     ],

--- a/tests/test_data/data_reader_writer/test_multi_bucket_s3.py
+++ b/tests/test_data/data_reader_writer/test_multi_bucket_s3.py
@@ -41,8 +41,8 @@ def test_multi_bucket_s3_reader_writer():
        ),
    ]

-    reader = MultiBucketS3DataReader(default_bucket=bucket, s3_configs=s3configs)
-    writer = MultiBucketS3DataWriter(default_bucket=bucket, s3_configs=s3configs)
+    reader = MultiBucketS3DataReader(bucket, s3configs)
+    writer = MultiBucketS3DataWriter(bucket, s3configs)

    bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')

@@ -80,3 +80,81 @@ def test_multi_bucket_s3_reader_writer():
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
+
+
+@pytest.mark.skipif(
+    os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
+)
+def test_multi_bucket_s3_reader_writer_with_prefix():
+    """test multi bucket s3 reader writer must config s3 config in the
+    environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
+    S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
+
+    export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
+    """
+    # Credentials for the first (default) bucket.
+    bucket = os.getenv('S3_BUCKET', '')
+    ak = os.getenv('S3_ACCESS_KEY', '')
+    sk = os.getenv('S3_SECRET_KEY', '')
+    endpoint_url = os.getenv('S3_ENDPOINT', '')
+
+    # Credentials for the second bucket.
+    bucket_2 = os.getenv('S3_BUCKET_2', '')
+    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
+    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
+    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    # The first constructor argument is given as "<bucket>/<prefix>".
+    prefix = 'meta-index'
+    reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', s3configs)
+    writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', s3configs)
+
+    # A relative path and the matching full s3:// URL must return the same bytes.
+    bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
+
+    assert bits == reader.read(
+        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
+    )
+
+    # Read from the second bucket through a full s3:// URL.
+    bits = reader.read(
+        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
+    )
+    docs = fitz.open('pdf', bits)
+    assert len(docs) == 10
+
+    # The "?bytes=566,713" suffix must give the same result as read_at(path, 566, 713).
+    bits = reader.read(
+        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
+    )
+    assert bits == reader.read_at(
+        'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
+    )
+    assert len(json.loads(bits)) > 0
+
+    # Write a string, then read it back via the relative path and the full URL.
+    writer.write_string(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
+    )
+
+    assert 'abc'.encode() == reader.read(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
+    )
+
+    assert 'abc'.encode() == reader.read(
+        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
+    )
+
+    # Write raw bytes and read them back via the relative path.
+    writer.write(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
+        '123'.encode(),
+    )
+
+    assert '123'.encode() == reader.read(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
+    )
--- a/tests/test_data/data_reader_writer/test_s3.py
+++ b/tests/test_data/data_reader_writer/test_s3.py
@@ -9,7 +9,7 @@ from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
 @pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
 )
-def test_multi_bucket_s3_reader_writer():
+def test_s3_reader_writer():
    """test multi bucket s3 reader writer must config s3 config in the
    environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
    S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
@@ -18,8 +18,8 @@ def test_multi_bucket_s3_reader_writer():
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

-    reader = S3DataReader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
-    writer = S3DataWriter(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
+    reader = S3DataReader('', bucket, ak, sk, endpoint_url)
+    writer = S3DataWriter('', bucket, ak, sk, endpoint_url)

    bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')

@@ -51,3 +51,56 @@ def test_multi_bucket_s3_reader_writer():
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
+
+
+@pytest.mark.skipif(
+    os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
+)
+def test_s3_reader_writer_with_prefix():
+    """test multi bucket s3 reader writer must config s3 config in the
+    environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
+    S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
+    bucket = os.getenv('S3_BUCKET', '')
+    ak = os.getenv('S3_ACCESS_KEY', '')
+    sk = os.getenv('S3_SECRET_KEY', '')
+    endpoint_url = os.getenv('S3_ENDPOINT', '')
+
+    prefix = 'meta-index'
+
+    reader = S3DataReader(prefix, bucket, ak, sk, endpoint_url)
+    writer = S3DataWriter(prefix, bucket, ak, sk, endpoint_url)
+
+    bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
+
+    assert bits == reader.read(
+        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
+    )
+
+    bits = reader.read(
+        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
+    )
+    assert bits == reader.read_at(
+        'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
+    )
+    assert len(json.loads(bits)) > 0
+
+    writer.write_string(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
+    )
+
+    assert 'abc'.encode() == reader.read(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
+    )
+
+    assert 'abc'.encode() == reader.read(
+        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
+    )
+
+    writer.write(
+        f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
+        '123'.encode(),
+    )
+
+    assert '123'.encode() == reader.read(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
+    )
--- a/tests/test_table/test_tablemaster.py
+++ b/tests/test_table/test_tablemaster.py
 import unittest
 from PIL import Image
+from lxml import etree
+
 from magic_pdf.model.ppTableModel import ppTableModel

 class TestppTableModel(unittest.TestCase):
@@ -10,8 +12,44 @@ class TestppTableModel(unittest.TestCase):
                  "model_dir": "D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"}
        table_model = ppTableModel(config)
        res = table_model.img2html(img)
-        true_value = """<td><table  border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""
-        self.assertEqual(true_value, res)
+        # Parse the generated HTML so its structure and cell text can be checked.
+        parser = etree.HTMLParser()
+        tree = etree.fromstring(res, parser)
+
+        # Check the overall HTML structure.
+        assert tree.find('.//table') is not None, "HTML should contain a <table> element"
+        assert tree.find('.//thead') is not None, "HTML should contain a <thead> element"
+        assert tree.find('.//tbody') is not None, "HTML should contain a <tbody> element"
+        assert tree.find('.//tr') is not None, "HTML should contain a <tr> element"
+        assert tree.find('.//td') is not None, "HTML should contain a <td> element"
+
+        # Check the header cells (the bold text inside each <thead> cell).
+        headers = tree.xpath('//thead/tr/td/b')
+        print(headers)  # Print headers for debugging
+        assert len(headers) == 5, "Thead should have 5 columns"
+        assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'"
+        assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'"
+        assert headers[2].text and headers[2].text.strip() == "P", "Third header should be 'P'"
+        assert headers[3].text and headers[3].text.strip() == "F", "Fourth header should be 'F'"
+        assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'"
+
+        # Check the first data row. NOTE(review): the removed expected HTML had 'SegLink [26]' (with a space) - confirm.
+        first_row = tree.xpath('//tbody/tr[1]/td')
+        assert len(first_row) == 5, "First row should have 5 cells"
+        assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
+        assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
+        assert first_row[2].text and first_row[2].text.strip() == "86.0", "Third cell should be '86.0'"
+        assert first_row[3].text and first_row[3].text.strip() == "77.0", "Fourth cell should be '77.0'"
+        assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'"
+
+        # Check the second-to-last data row.
+        second_last_row = tree.xpath('//tbody/tr[position()=last()-1]/td')
+        assert len(second_last_row) == 5, "second_last_row should have 5 cells"
+        assert second_last_row[0].text and second_last_row[0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
+        assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'"
+        assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'"
+        assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
+        assert second_last_row[4].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"


 if __name__ == "__main__":