Unverified commit c4a52ee6, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #630 from myhloli/content-list-not-drop

feat(ocr_mkcontent): support drop reason in none_with_reason mode
parents 734f9c4c 98313d4a
...@@ -272,30 +272,28 @@ def para_to_standard_format(para, img_buket_path): ...@@ -272,30 +272,28 @@ def para_to_standard_format(para, img_buket_path):
return para_content return para_content
def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None): def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None):
para_type = para_block['type'] para_type = para_block['type']
para_content = {}
if para_type == BlockType.Text: if para_type == BlockType.Text:
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang), 'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
'page_idx': page_idx,
} }
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang), 'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
'text_level': 1, 'text_level': 1,
'page_idx': page_idx,
} }
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_content = { para_content = {
'type': 'equation', 'type': 'equation',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang), 'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
'text_format': 'latex', 'text_format': 'latex',
'page_idx': page_idx,
} }
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
para_content = {'type': 'image', 'page_idx': page_idx} para_content = {'type': 'image'}
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody: if block['type'] == BlockType.ImageBody:
para_content['img_path'] = join_path( para_content['img_path'] = join_path(
...@@ -306,7 +304,7 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type= ...@@ -306,7 +304,7 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
if block['type'] == BlockType.ImageFootnote: if block['type'] == BlockType.ImageFootnote:
para_content['img_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang) para_content['img_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
para_content = {'type': 'table', 'page_idx': page_idx} para_content = {'type': 'table'}
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.TableBody: if block['type'] == BlockType.TableBody:
if block["lines"][0]["spans"][0].get('latex', ''): if block["lines"][0]["spans"][0].get('latex', ''):
...@@ -319,6 +317,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type= ...@@ -319,6 +317,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
if block['type'] == BlockType.TableFootnote: if block['type'] == BlockType.TableFootnote:
para_content['table_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang) para_content['table_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_content['page_idx'] = page_idx
if drop_reason is not None:
para_content['drop_reason'] = drop_reason
return para_content return para_content
...@@ -406,10 +409,14 @@ def union_make(pdf_info_dict: list, ...@@ -406,10 +409,14 @@ def union_make(pdf_info_dict: list,
lang=None): lang=None):
output_content = [] output_content = []
for page_info in pdf_info_dict: for page_info in pdf_info_dict:
drop_reason_flag = False
drop_reason = None
if page_info.get('need_drop', False): if page_info.get('need_drop', False):
drop_reason = page_info.get('drop_reason') drop_reason = page_info.get('drop_reason')
if drop_mode == DropMode.NONE: if drop_mode == DropMode.NONE:
pass pass
elif drop_mode == DropMode.NONE_WITH_REASON:
drop_reason_flag = True
elif drop_mode == DropMode.WHOLE_PDF: elif drop_mode == DropMode.WHOLE_PDF:
raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,' raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
f'drop_reason is {drop_reason}')) f'drop_reason is {drop_reason}'))
...@@ -434,6 +441,10 @@ def union_make(pdf_info_dict: list, ...@@ -434,6 +441,10 @@ def union_make(pdf_info_dict: list,
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout: for para_block in paras_of_layout:
if drop_reason_flag:
para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang, drop_reason=drop_reason)
else:
para_content = para_to_standard_format_v2( para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang) para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang)
output_content.append(para_content) output_content.append(para_content)
......
...@@ -8,3 +8,4 @@ class DropMode: ...@@ -8,3 +8,4 @@ class DropMode:
WHOLE_PDF = "whole_pdf" WHOLE_PDF = "whole_pdf"
SINGLE_PAGE = "single_page" SINGLE_PAGE = "single_page"
NONE = "none" NONE = "none"
NONE_WITH_REASON = "none_with_reason"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment