Commit d6c58ecc authored by liukaiwen

# Add table recognition using struct-eqtable

## Changelog
31/07/2024
- Support table recognition. Table images will be converted into LaTeX.

### How to use the new feature:
Set the attribute 'table-mode' to 'true' in magic-pdf.json.

### Caution:
It takes 200s to 500s to convert a single table image when running on the CPU.
parent 7b712c40
......@@ -130,7 +130,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if span['type'] == ContentType.Table:
# if processed by table model
if span.get('content', ''):
para_text += f"\n {span['content']} \n"
para_text += f"\n\n$\n {span['content']}\n$\n\n"
else:
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 3rd.拼table_footnote
......
......@@ -561,13 +561,9 @@ class MagicModel:
span["type"] = ContentType.Image
elif category_id == 5:
# 获取table模型结果
html = layout_det.get("html", None)
latex = layout_det.get("latex", None)
if html:
span["content"] = html
elif latex:
if latex:
span["content"] = latex
span["type"] = ContentType.Table
elif category_id == 13:
span["content"] = layout_det["latex"]
......
......@@ -287,13 +287,7 @@ class CustomPEKModel:
end_time = time.time()
run_time = end_time - start_time
print(f"------------table recognition processing ends within {run_time}s-----")
# try to convert latex to html
try:
html_code = convert_text(latex_code, 'html', format='latex')
layout["html"] = html_code
except Exception as e:
layout["latex"] = latex_code
logger.error(f"[pdf_extract_kit][CustomPEKModel]: converting latex to html failed: {e}")
return layout_res
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment