Unverified commit 460ea6b4, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #791 from myhloli/fix-imgs-block

feat(draw_bbox): update bounding box drawing for tables and images
parents 7469697b 0e8d5893
...@@ -71,7 +71,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -71,7 +71,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block) + ' \n' para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 3rd.拼image_footnote
if block['type'] == BlockType.ImageFootnote: if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block) + ' \n' para_text += merge_para_with_text(block) + ' \n'
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
......
...@@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = [] layout_bbox_list = []
table_type_order = {
'table_caption': 1,
'table_body': 2,
'table_footnote': 3
}
for page in pdf_info: for page in pdf_info:
page_block_list = [] page_block_list = []
for block in page['para_blocks']: for block in page['para_blocks']:
bbox = block['bbox'] if block['type'] in [
page_block_list.append(bbox) BlockType.Text,
BlockType.Title,
BlockType.InterlineEquation,
BlockType.List,
BlockType.Index,
]:
bbox = block['bbox']
page_block_list.append(bbox)
elif block['type'] in [BlockType.Image]:
for sub_block in block['blocks']:
bbox = sub_block['bbox']
page_block_list.append(bbox)
elif block['type'] in [BlockType.Table]:
sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
for sub_block in sorted_blocks:
bbox = sub_block['bbox']
page_block_list.append(bbox)
layout_bbox_list.append(page_block_list) layout_bbox_list.append(page_block_list)
pdf_docs = fitz.open('pdf', pdf_bytes) pdf_docs = fitz.open('pdf', pdf_bytes)
...@@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
for i, page in enumerate(pdf_docs): for i, page in enumerate(pdf_docs):
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True) draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color ! # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True) draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True) draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True) draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True) # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True) draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True) draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True), draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
...@@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
for page in pdf_info: for page in pdf_info:
page_line_list = [] page_line_list = []
for block in page['preproc_blocks']: for block in page['preproc_blocks']:
if block['type'] in ['text', 'title', 'interline_equation']: if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
for line in block['lines']: for line in block['lines']:
bbox = line['bbox'] bbox = line['bbox']
index = line['index'] index = line['index']
page_line_list.append({'index': index, 'bbox': bbox}) page_line_list.append({'index': index, 'bbox': bbox})
if block['type'] in ['table', 'image']: if block['type'] in [BlockType.Image, BlockType.Table]:
bbox = block['bbox'] for sub_block in block['blocks']:
index = block['index'] if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
page_line_list.append({'index': index, 'bbox': bbox}) for line in sub_block['virtual_lines']:
# for line in block['lines']: bbox = line['bbox']
# bbox = line['bbox'] index = line['index']
# index = line['index'] page_line_list.append({'index': index, 'bbox': bbox})
# page_line_list.append({'index': index, 'bbox': bbox}) elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
for line in sub_block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index']) sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes) layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
pdf_docs = fitz.open('pdf', pdf_bytes) pdf_docs = fitz.open('pdf', pdf_bytes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment