Unverified commit b39f3a8e, authored by myhloli, committed by GitHub

Merge pull request #35 from myhloli/master

将ocr_parse逻辑切换到v2,并解决几个parse过程中的error
parents e8544335 dcf6e712
...@@ -252,7 +252,7 @@ def fix_image_block(block, img_blocks): ...@@ -252,7 +252,7 @@ def fix_image_block(block, img_blocks):
break break
# 根据list长度,判断img_block中是否有img_caption # 根据list长度,判断img_block中是否有img_caption
if len(img_block['img_caption_bbox']) > 0: if img_block['img_caption_bbox'] is not None:
img_caption_block, img_caption_spans = merge_spans_to_block( img_caption_block, img_caption_spans = merge_spans_to_block(
block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
) )
...@@ -280,7 +280,7 @@ def fix_table_block(block, table_blocks): ...@@ -280,7 +280,7 @@ def fix_table_block(block, table_blocks):
break break
# 根据list长度,判断table_block中是否有caption # 根据list长度,判断table_block中是否有caption
if len(table_block['table_caption_bbox']) > 0: if table_block['table_caption_bbox'] is not None:
table_caption_block, table_caption_spans = merge_spans_to_block( table_caption_block, table_caption_spans = merge_spans_to_block(
block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
) )
...@@ -293,7 +293,7 @@ def fix_table_block(block, table_blocks): ...@@ -293,7 +293,7 @@ def fix_table_block(block, table_blocks):
block['spans'].remove(span) block['spans'].remove(span)
# 根据list长度,判断table_block中是否有table_note # 根据list长度,判断table_block中是否有table_note
if len(table_block['table_footnote_bbox']) > 0: if table_block['table_footnote_bbox'] is not None:
table_footnote_block, table_footnote_spans = merge_spans_to_block( table_footnote_block, table_footnote_spans = merge_spans_to_block(
block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
) )
......
...@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks): ...@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
interline_equations = [] interline_equations = []
for block in blocks: for block in blocks:
if block["type"] == BlockType.Image: if block["block_type"] == BlockType.Image:
images.append(block) images.append(block)
elif block["type"] == BlockType.Table: elif block["block_type"] == BlockType.Table:
tables.append(block) tables.append(block)
elif block["type"] == BlockType.InterlineEquation: elif block["block_type"] == BlockType.InterlineEquation:
interline_equations.append(block) interline_equations.append(block)
return images, tables, interline_equations return images, tables, interline_equations
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from loguru import logger from loguru import logger
from magic_pdf.rw import AbsReaderWriter from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment