Unverified Commit f5268688 authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #38 from papayalove/master

更新了para_split_by_model
parents 07012dca f519f63d
This diff is collapsed.
......@@ -11,6 +11,7 @@ from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_by_model import para_split_by_model
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
......@@ -208,6 +209,9 @@ def parse_pdf_by_ocr(
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
if debug_mode:
para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
else:
para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list"""
......
......@@ -12,6 +12,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_by_model import para_split_by_model
def parse_pdf_by_ocr(pdf_bytes,
......@@ -90,7 +92,10 @@ def parse_pdf_by_ocr(pdf_bytes,
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
pass
if debug_mode:
para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
else:
para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment