Commit d6c58ecc authored by liukaiwen

# Add table recognition using struct-eqtable

## Changelog
31/07/2024
- Support table recognition. Table images will be converted into LaTeX.

### How to use the new feature:
Set the attribute 'table-mode' to 'true' in magic-pdf.json.

### Caution:
It takes 200s to 500s to convert a single table image when running on CPU.
parent 7b712c40
...@@ -130,7 +130,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -130,7 +130,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if span['type'] == ContentType.Table: if span['type'] == ContentType.Table:
# if processed by table model # if processed by table model
if span.get('content', ''): if span.get('content', ''):
para_text += f"\n {span['content']} \n" para_text += f"\n\n$\n {span['content']}\n$\n\n"
else: else:
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 3rd.拼table_footnote for block in para_block['blocks']: # 3rd.拼table_footnote
......
...@@ -561,13 +561,9 @@ class MagicModel: ...@@ -561,13 +561,9 @@ class MagicModel:
span["type"] = ContentType.Image span["type"] = ContentType.Image
elif category_id == 5: elif category_id == 5:
# 获取table模型结果 # 获取table模型结果
html = layout_det.get("html", None)
latex = layout_det.get("latex", None) latex = layout_det.get("latex", None)
if html: if latex:
span["content"] = html
elif latex:
span["content"] = latex span["content"] = latex
span["type"] = ContentType.Table span["type"] = ContentType.Table
elif category_id == 13: elif category_id == 13:
span["content"] = layout_det["latex"] span["content"] = layout_det["latex"]
......
...@@ -287,13 +287,7 @@ class CustomPEKModel: ...@@ -287,13 +287,7 @@ class CustomPEKModel:
end_time = time.time() end_time = time.time()
run_time = end_time - start_time run_time = end_time - start_time
print(f"------------table recognition processing ends within {run_time}s-----") print(f"------------table recognition processing ends within {run_time}s-----")
layout["latex"] = latex_code
# try to convert latex to html
try:
html_code = convert_text(latex_code, 'html', format='latex')
layout["html"] = html_code
except Exception as e:
layout["latex"] = latex_code
logger.error(f"[pdf_extract_kit][CustomPEKModel]: converting latex to html failed: {e}")
return layout_res return layout_res
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment