Unverified commit 099f19f2, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #834 from myhloli/dev

feat(pdf_parse): improve span filtering and add new block types
parents 73afb7d6 149132d6
...@@ -175,11 +175,14 @@ Detailed explanation of second-level block types ...@@ -175,11 +175,14 @@ Detailed explanation of second-level block types
| :----------------- | :--------------------- | | :----------------- | :--------------------- |
| image_body | Main body of the image | | image_body | Main body of the image |
| image_caption | Image description text | | image_caption | Image description text |
| image_footnote | Image footnote |
| table_body | Main body of the table | | table_body | Main body of the table |
| table_caption | Table description text | | table_caption | Table description text |
| table_footnote | Table footnote | | table_footnote | Table footnote |
| text | Text block | | text | Text block |
| title | Title block | | title | Title block |
| index | Index block |
| list | List block |
| interline_equation | Block formula | | interline_equation | Block formula |
<br> <br>
......
...@@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、 ...@@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
| :----------------- | :------------- | | :----------------- | :------------- |
| image_body | 图像的本体 | | image_body | 图像的本体 |
| image_caption | 图像的描述文本 | | image_caption | 图像的描述文本 |
| table_body | 表格本体 | | image_footnote | 图像的脚注 |
| table_body | 表格本体 |
| table_caption | 表格的描述文本 | | table_caption | 表格的描述文本 |
| table_footnote | 表格的脚注 | | table_footnote | 表格的脚注 |
| text | 文本块 | | text | 文本块 |
| title | 标题块 | | title | 标题块 |
| interline_equation | 行间公式块 | | index | 目录块 |
| list | 列表块 |
| interline_equation | 行间公式块 |
<br> <br>
......
...@@ -249,7 +249,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -249,7 +249,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
page_dropped_list.append(span['bbox']) page_dropped_list.append(span['bbox'])
dropped_list.append(page_dropped_list) dropped_list.append(page_dropped_list)
# 构造其余useful_list # 构造其余useful_list
for block in page['para_blocks']: # for block in page['para_blocks']: # span直接用分段合并前的结果就可以
for block in page['preproc_blocks']:
if block['type'] in [ if block['type'] in [
BlockType.Text, BlockType.Text,
BlockType.Title, BlockType.Title,
......
__version__ = "0.8.0" __version__ = "0.9.0"
...@@ -382,39 +382,44 @@ def revert_group_blocks(blocks): ...@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
return new_blocks return new_blocks
def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    """Keep only the spans that lie inside a layout block of a matching kind.

    A span is kept when either of the following holds:

    * it overlaps any discarded block by more than 0.4 -- such spans are
      preserved regardless of their own type; or
    * it overlaps, by more than 0.5, a block whose type is compatible with
      the span: image spans against image-body blocks, table spans against
      table-body blocks, every other span against all remaining block types.

    Spans that satisfy neither rule are dropped (per the caller's note this
    is also what removes large watermark spans -- TODO confirm).

    Args:
        spans: span dicts; each is read through its 'bbox' and 'type' keys.
        all_bboxes: layout blocks, indexable records whose first four items
            are the bbox and whose item 7 is the block type.
        all_discarded_blocks: discarded layout blocks in the same record
            layout as ``all_bboxes``.

    Returns:
        A new list with the surviving spans, in their original order.
    """

    def get_block_bboxes(blocks, block_type_list):
        # Project each block record down to its bbox, keeping only the wanted types.
        return [block[0:4] for block in blocks if block[7] in block_type_list]

    def overlaps_any(span_bbox, block_bboxes, threshold):
        # The ratio helper is presumably overlap_area / area(span_bbox) -- verify
        # against its definition; only the "> threshold" comparison matters here.
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > threshold
            for block_bbox in block_bboxes
        )

    body_types = (BlockType.ImageBody, BlockType.TableBody)

    # Collect every declared block type other than the two "body" types.
    # Filtering on the attribute name keeps dunder metadata such as
    # ``__module__`` (also a str) out of the list of block types.
    other_block_types = [
        value
        for name, value in vars(BlockType).items()
        if not name.startswith('_') and isinstance(value, str) and value not in body_types
    ]

    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_types)
    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded])

    new_spans = []
    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']

        # Spans sitting in a discarded block are kept unconditionally (lower threshold).
        if overlaps_any(span_bbox, discarded_block_bboxes, 0.4):
            new_spans.append(span)
            continue

        # Otherwise the span must fall inside a block compatible with its own type.
        if span_type == ContentType.Image:
            candidate_bboxes = image_bboxes
        elif span_type == ContentType.Table:
            candidate_bboxes = table_bboxes
        else:
            candidate_bboxes = other_block_bboxes

        if overlaps_any(span_bbox, candidate_bboxes, 0.5):
            new_spans.append(span)

    return new_spans
...@@ -488,7 +493,8 @@ def parse_page_core( ...@@ -488,7 +493,8 @@ def parse_page_core(
raise Exception('parse_mode must be txt or ocr') raise Exception('parse_mode must be txt or ocr')
"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span""" """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
spans = remove_outside_spans(spans, all_bboxes) """顺便删除大水印并保留abandon的span"""
spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
"""删除重叠spans中置信度较低的那些""" """删除重叠spans中置信度较低的那些"""
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans) spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment