Commit a0be4652 authored by 赵小蒙

parse_pdf_by_ocr 逻辑更新

parent 701f3849
import json
import os
from loguru import logger
......@@ -20,9 +21,16 @@ def save_markdown(markdown_text, input_filepath):
file.write(markdown_text)
def read_json_file(file_path):
    """Load a JSON file and return its parsed contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which is not UTF-8 on many Windows setups and would
    # mis-decode or reject non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
if __name__ == '__main__':
    # Demo entry point: OCR JSON -> parsed page info -> markdown saved to disk.
    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
    # parse_pdf_by_ocr is given the already-loaded OCR data rather than a
    # file path, so read the JSON first and call the parser exactly once.
    ocr_pdf_info = read_json_file(ocr_json_file_path)
    pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
    markdown_text = mk_nlp_markdown(pdf_info_dict)
    logger.info(markdown_text)
    save_markdown(markdown_text, ocr_json_file_path)
......
import json
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.ocr_dict_merge import merge_spans
def read_json_file(file_path):
    """Load a JSON file and return its parsed contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which is not UTF-8 on many Windows setups and would
    # mis-decode or reject non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def construct_page_component(page_id, text_blocks_preproc):
return_dict = {
'preproc_blocks': text_blocks_preproc,
......@@ -19,11 +11,11 @@ def construct_page_component(page_id, text_blocks_preproc):
def parse_pdf_by_ocr(
ocr_json_file_path,
ocr_pdf_info,
start_page_id=0,
end_page_id=None,
):
ocr_pdf_info = read_json_file(ocr_json_file_path)
pdf_info_dict = {}
end_page_id = end_page_id if end_page_id else len(ocr_pdf_info) - 1
for page_id in range(start_page_id, end_page_id + 1):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment