Commit fc107725 authored by 赵小蒙

ocr_construct_page_component 位置移动

parent 433684c6
......@@ -53,7 +53,7 @@ from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
from magic_pdf.pre_proc.construct_paras import construct_page_component
from magic_pdf.pre_proc.construct_page_dict import construct_page_component
from magic_pdf.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
......
......@@ -18,6 +18,7 @@ from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.para.para_split import para_split
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
from magic_pdf.pre_proc.detect_header import parse_headers
......@@ -33,28 +34,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                             images, tables, interline_equations, inline_equations,
                             dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
                             need_remove_spans_bboxes_dict):
    """Bundle the per-page parsing results into a single page-info dict.

    Every argument is stored under its key unchanged (no copy, no
    validation); the only derived value is ``'page_size'``, which is the
    two-element list ``[page_w, page_h]``.

    Returns:
        dict: the page-info mapping built from the arguments.
    """
    # NOTE: the key spellings are intentionally left exactly as they were
    # ('droped_*' for most dropped blocks, 'dropped_equation_block' for
    # equations). They are runtime keys, so "fixing" the spelling here
    # would change the dict this function hands back to its callers.
    return {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'inline_equations': inline_equations,
        'droped_text_block': dropped_text_block,
        'droped_image_block': dropped_image_block,
        'droped_table_block': dropped_table_block,
        'dropped_equation_block': dropped_equation_block,
        'droped_bboxes': need_remove_spans_bboxes_dict,
    }
def parse_pdf_by_ocr(
pdf_path,
......@@ -254,7 +233,7 @@ def parse_pdf_by_ocr(
dropped_equation_block.append(span)
'''构造pdf_info_dict'''
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block,
dropped_equation_block,
......
......@@ -75,7 +75,7 @@ from magic_pdf.pre_proc.equations_replace import (
)
from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
from magic_pdf.pre_proc.construct_paras import construct_page_component
from magic_pdf.pre_proc.construct_page_dict import construct_page_component
from magic_pdf.pre_proc.fix_image import (
combine_images,
fix_image_vertical,
......
......@@ -28,3 +28,26 @@ def construct_page_component(page_id, image_info, table_info, text_blocks_prepr
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
return return_dict
def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                 images, tables, interline_equations, inline_equations,
                                 dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
                                 need_remove_spans_bboxes_dict):
    """Bundle the per-page OCR parsing results into a single page-info dict.

    Every argument is stored under its key unchanged (no copy, no
    validation); the only derived value is ``'page_size'``, which is the
    two-element list ``[page_w, page_h]``.

    Returns:
        dict: the page-info mapping built from the arguments.
    """
    # NOTE: the key spellings are intentionally left exactly as they were
    # ('droped_*' for most dropped blocks, 'dropped_equation_block' for
    # equations). They are runtime keys, so "fixing" the spelling here
    # would change the dict this function hands back to its callers.
    return {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'inline_equations': inline_equations,
        'droped_text_block': dropped_text_block,
        'droped_image_block': dropped_image_block,
        'droped_table_block': dropped_table_block,
        'dropped_equation_block': dropped_equation_block,
        'droped_bboxes': need_remove_spans_bboxes_dict,
    }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment