Unverified Commit dd19f59e authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub

fix(ocr_mkcontent): revise table caption output (#397)

* fix(ocr_mkcontent): revise table caption output

- Ensure that
  table captions are properly included in the output.
- Remove the redundant `table_caption` variable.

* Update cla.yml

* Update bug_report.yml

* feat(cli): add debug option for detailed error handling

Enable users to invoke the CLI command with a new debug flag to get detailed debugging information.

* fix(pdf-extract-kit): adjust crop_paste parameters for better accuracy

The crop_paste_x and crop_paste_y values in pdf_extract_kit.py have been modified
to improve the accuracy and consistency of OCR processing. The new values are set to 25
to ensure more precise image cropping and pasting, which leads to better OCR recognition
results.

* Update README_zh-CN.md (#404)

correct FAQ url

* Update README_zh-CN.md (#404) (#409) (#410)

correct FAQ url
Co-authored-by: sfk <18810651050@163.com>

* Update FAQ_zh_cn.md

add new issue

* Update FAQ_en_us.md

* Update README_Windows_CUDA_Acceleration_zh_CN.md

* Update README_zh-CN.md

* @Thepathakarpit has signed the CLA in opendatalab/MinerU#418

* fix(pdf-extract-kit): increase crop_paste margin for OCR processing

Double the crop_paste margin from 25 to 50 to ensure better OCR accuracy and
handling of border cases. This change will help improve the overall quality of
OCR'ed text by providing more context around the detected text areas.

* fix(common): deep copy model list before drawing model bbox

Use a deep copy of the original model list in `drow_model_bbox` to avoid potential
modifications to the source data. This ensures the integrity of the original models
is maintained while generating the model bounding boxes visualization.

---------
Co-authored-by: sfk <18810651050@163.com>
Co-authored-by: drunkpig <60862764+drunkpig@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
parent c9a51491
......@@ -78,8 +78,8 @@ body:
#multiple: false
options:
-
- "0.5.x"
- "0.6.x"
- "0.7.x"
validations:
required: true
......@@ -92,6 +92,5 @@ body:
-
- cpu
- cuda
- mps
validations:
required: true
......@@ -208,8 +208,9 @@ cp magic-pdf.template.json ~/magic-pdf.json
> ❗️务必正确配置模型权重文件所在目录的【绝对路径】,否则会因为找不到模型文件而导致程序无法运行
>
> windows系统中此路径应包含盘符,且需把路径中所有的""替换为"/",否则会因为转义原因导致json文件语法错误。
> windows系统中此路径应包含盘符,且需把路径中所有的`"\"`替换为`"/"`,否则会因为转义原因导致json文件语法错误。
>
> 例如:模型放在D盘根目录的models目录,则model-dir的值应为"D:/models"
```json
......@@ -340,8 +341,10 @@ TODO
# FAQ
[常见问题](docs/FAQ_zh_cn.md)
[FAQ](docs/FAQ_en_us.md)
# All Thanks To Our Contributors
......
......@@ -36,3 +36,11 @@ sudo apt-get install libgl1-mesa-glx
```
Reference: https://github.com/opendatalab/MinerU/issues/388
### 5. Encountered error `ModuleNotFoundError: No module named 'fairscale'`
You need to uninstall the module and reinstall it:
```bash
pip uninstall fairscale
pip install fairscale
```
Reference: https://github.com/opendatalab/MinerU/issues/411
......@@ -33,3 +33,11 @@ WSL2的Ubuntu22.04中缺少`libgl`库,可通过以下命令安装`libgl`库解
sudo apt-get install libgl1-mesa-glx
```
参考:https://github.com/opendatalab/MinerU/issues/388
### 5.遇到报错 `ModuleNotFoundError: No module named 'fairscale'`
需要卸载该模块并重新安装
```bash
pip uninstall fairscale
pip install fairscale
```
参考:https://github.com/opendatalab/MinerU/issues/411
......@@ -27,7 +27,7 @@ pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.co
> ```bash
> magic-pdf --version
>```
> 如果版本号小于0.6.2,请到issue中向我们反馈
> 如果版本号小于0.7.0,请到issue中向我们反馈
## 5. 下载模型
详细参考 [如何下载模型文件](how_to_download_models_zh_cn.md)
......
......@@ -123,7 +123,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
table_caption = ''
for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TableCaption:
table_caption = merge_para_with_text(block)
para_text += merge_para_with_text(block)
for block in para_block['blocks']: # 2nd.拼table_body
if block['type'] == BlockType.TableBody:
for line in block['lines']:
......@@ -133,7 +133,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if span.get('latex', ''):
para_text += f"\n\n$\n {span['latex']}\n$\n\n"
else:
para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])}) \n"
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
......
......@@ -45,7 +45,15 @@ auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
default='auto',
)
def cli(path, output_dir, method):
@click.option(
"-d",
"--debug",
"debug_able",
type=bool,
help="Enables detailed debugging information during the execution of the CLI commands.",
default=False,
)
def cli(path, output_dir, method, debug_able):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
......@@ -64,6 +72,7 @@ def cli(path, output_dir, method):
pdf_data,
[],
method,
debug_able,
)
except Exception as e:
......
......@@ -32,6 +32,7 @@ def do_parse(
pdf_bytes,
model_list,
parse_method,
debug_able,
f_draw_span_bbox=True,
f_draw_layout_bbox=True,
f_dump_md=True,
......@@ -42,6 +43,11 @@ def do_parse(
f_make_md_mode=MakeMode.MM_MD,
f_draw_model_bbox=False,
):
if debug_able:
logger.warning("debug mode is on")
f_dump_content_list = True
f_draw_model_bbox = True
orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
parse_method)
......@@ -78,8 +84,7 @@ def do_parse(
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
if f_draw_model_bbox:
drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir,
pdf_file_name)
drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
md_content = pipe.pipe_mk_markdown(image_dir,
drop_mode=DropMode.NONE,
......
......@@ -23,6 +23,14 @@
"created_at": "2024-08-07T09:11:10Z",
"repoId": 765083837,
"pullRequestNo": 355
},
{
"name": "Thepathakarpit",
"id": 119810812,
"comment_id": 2286123353,
"created_at": "2024-08-13T12:23:16Z",
"repoId": 765083837,
"pullRequestNo": 418
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment