Commit f0c463ed authored by 许瑞

Merge branch 'master' of https://github.com/myhloli/Magic-PDF

parents efed5faa 3d2fcc9d
...@@ -90,9 +90,10 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, ...@@ -90,9 +90,10 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
if __name__ == '__main__': if __name__ == '__main__':
# pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
# json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path) # ocr_local_parse(pdf_path, json_file_path)
# book_name = "数学新星网/edu_00001236" book_name = "科数网/edu_00011318"
# ocr_online_parse(book_name) ocr_online_parse(book_name)
pass pass
...@@ -72,18 +72,26 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict): ...@@ -72,18 +72,26 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for _, page_info in pdf_info_dict.items():
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout) page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm")
markdown.extend(page_markdown) markdown.extend(page_markdown)
return '\n\n'.join(markdown) return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
    """Build the "nlp"-mode markdown text for a whole document.

    Every page entry of ``pdf_info_dict`` is visited in iteration order; its
    ``"para_blocks"`` value is rendered by
    ``ocr_mk_mm_markdown_with_para_core`` called with mode ``"nlp"``.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Only the
            ``"para_blocks"`` key of each page is read here.

    Returns:
        The rendered paragraphs of all pages joined by a blank line, or an
        empty string when no page yields a paragraph.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # Skip pages with no paragraph data (key missing, None or empty).
        # The core renderer iterates its argument directly, so handing it
        # None would raise TypeError; this is the same guard used by
        # ocr_mk_mm_markdown_with_para_and_pagination.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict): def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
markdown_with_para_and_pagination = [] markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items(): for page_no, page_info in pdf_info_dict.items():
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout: if not paras_of_layout:
continue continue
page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout) page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm")
markdown_with_para_and_pagination.append({ markdown_with_para_and_pagination.append({
'page_no': page_no, 'page_no': page_no,
'md_content': '\n\n'.join(page_markdown) 'md_content': '\n\n'.join(page_markdown)
...@@ -91,7 +99,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict): ...@@ -91,7 +99,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
return markdown_with_para_and_pagination return markdown_with_para_and_pagination
def ocr_mk_mm_markdown_with_para_core(paras_of_layout): def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
page_markdown = [] page_markdown = []
for paras in paras_of_layout: for paras in paras_of_layout:
for para in paras: for para in paras:
...@@ -99,6 +107,7 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout): ...@@ -99,6 +107,7 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
for line in para: for line in para:
for span in line['spans']: for span in line['spans']:
span_type = span.get('type') span_type = span.get('type')
content = ''
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = split_long_words(span['content']) content = split_long_words(span['content'])
# content = span['content'] # content = span['content']
...@@ -107,9 +116,16 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout): ...@@ -107,9 +116,16 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]: elif span_type in [ContentType.Image, ContentType.Table]:
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n" if mode == 'mm':
para_text += content + ' ' content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
page_markdown.append(para_text.strip() + ' ') elif mode == 'nlp':
pass
if content != '':
para_text += content + ' '
if para_text.strip() == '':
continue
else:
page_markdown.append(para_text.strip() + ' ')
return page_markdown return page_markdown
......
This diff is collapsed.
...@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay ...@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
def parse_pdf_by_ocr( def parse_pdf_by_ocr(
pdf_path, pdf_path,
s3_pdf_profile, s3_pdf_profile,
pdf_model_output, pdf_model_output,
save_path, save_path,
book_name, book_name,
pdf_model_profile=None, pdf_model_profile=None,
image_s3_config=None, image_s3_config=None,
start_page_id=0, start_page_id=0,
end_page_id=None, end_page_id=None,
debug_mode=False, debug_mode=False,
): ):
pdf_bytes = read_file(pdf_path, s3_pdf_profile) pdf_bytes = read_file(pdf_path, s3_pdf_profile)
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest") save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
...@@ -95,7 +95,6 @@ def parse_pdf_by_ocr( ...@@ -95,7 +95,6 @@ def parse_pdf_by_ocr(
start_time = time.time() start_time = time.time()
end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
for page_id in range(start_page_id, end_page_id + 1): for page_id in range(start_page_id, end_page_id + 1):
...@@ -125,13 +124,6 @@ def parse_pdf_by_ocr( ...@@ -125,13 +124,6 @@ def parse_pdf_by_ocr(
page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
) )
# 构建需要remove的bbox列表
# need_remove_spans_bboxes = []
# need_remove_spans_bboxes.extend(page_no_bboxes)
# need_remove_spans_bboxes.extend(header_bboxes)
# need_remove_spans_bboxes.extend(footer_bboxes)
# need_remove_spans_bboxes.extend(footnote_bboxes)
# 构建需要remove的bbox字典 # 构建需要remove的bbox字典
need_remove_spans_bboxes_dict = { need_remove_spans_bboxes_dict = {
DropTag.PAGE_NUMBER: page_no_bboxes, DropTag.PAGE_NUMBER: page_no_bboxes,
...@@ -199,50 +191,48 @@ def parse_pdf_by_ocr( ...@@ -199,50 +191,48 @@ def parse_pdf_by_ocr(
else: else:
continue continue
'''删除重叠spans中较小的那些'''
# 删除重叠spans中较小的那些
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
# 删除remove_span_block_bboxes中的bbox '''
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes) 删除remove_span_block_bboxes中的bbox
# 按qa要求,增加drop相关数据 并增加drop相关数据
'''
spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict) spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
# 对image和table截图 '''对image和table截图'''
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client) spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧) '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list = [] displayed_list = []
text_inline_lines = [] text_inline_lines = []
modify_y_axis(spans, displayed_list, text_inline_lines) modify_y_axis(spans, displayed_list, text_inline_lines)
# 模型识别错误的行间公式, type类型转换成行内公式
'''模型识别错误的行间公式, type类型转换成行内公式'''
spans = modify_inline_equation(spans, displayed_list, text_inline_lines) spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
# bbox去除粘连 '''bbox去除粘连'''
spans = remove_overlap_between_bbox(spans) spans = remove_overlap_between_bbox(spans)
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 '''
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
'''
spans = adjust_bbox_for_standalone_block(spans) spans = adjust_bbox_for_standalone_block(spans)
'''从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)'''
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info) layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
# 将spans合并成line(在layout内,从上到下,从左到右) '''将spans合并成line(在layout内,从上到下,从左到右)'''
lines, dropped_spans_by_layout = merge_spans_to_line_by_layout(spans, layout_bboxes) lines, dropped_spans_by_layout = merge_spans_to_line_by_layout(spans, layout_bboxes)
# 将lines合并成block '''将lines合并成block'''
blocks = merge_lines_to_block(lines) blocks = merge_lines_to_block(lines)
# 根据block合并段落 '''获取QA需要外置的list'''
#para_blocks = para_split(blocks, layout_bboxes)
# 获取QA需要外置的list
images, tables, interline_equations, inline_equations = get_qa_need_list(blocks) images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
# drop的span_list合并 '''drop的span_list合并'''
dropped_spans = [] dropped_spans = []
dropped_spans.extend(dropped_spans_by_span_overlap) dropped_spans.extend(dropped_spans_by_span_overlap)
dropped_spans.extend(dropped_spans_by_removed_bboxes) dropped_spans.extend(dropped_spans_by_removed_bboxes)
...@@ -263,19 +253,18 @@ def parse_pdf_by_ocr( ...@@ -263,19 +253,18 @@ def parse_pdf_by_ocr(
elif span['type'] in [ContentType.InlineEquation, ContentType.InterlineEquation]: elif span['type'] in [ContentType.InlineEquation, ContentType.InterlineEquation]:
dropped_equation_block.append(span) dropped_equation_block.append(span)
'''构造pdf_info_dict'''
# 构造pdf_info_dict
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations, images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block, dropped_text_block, dropped_image_block, dropped_table_block,
dropped_equation_block,
need_remove_spans_bboxes_dict) need_remove_spans_bboxes_dict)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
"""分段""" """分段"""
para_split(pdf_info_dict) para_split(pdf_info_dict, debug_mode=debug_mode)
# 在测试时,保存调试信息 '''在测试时,保存调试信息'''
if debug_mode: if debug_mode:
params_file_save_path = join_path( params_file_save_path = join_path(
save_tmp_path, "md", book_name, "preproc_out.json" save_tmp_path, "md", book_name, "preproc_out.json"
......
...@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import ( ...@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_nlp_markdown, ocr_mk_nlp_markdown,
ocr_mk_mm_markdown, ocr_mk_mm_markdown,
ocr_mk_mm_standard_format, ocr_mk_mm_standard_format,
ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_nlp_markdown_with_para,
) )
from magic_pdf.libs.commons import ( from magic_pdf.libs.commons import (
read_file, read_file,
...@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) ...@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
pdf_intermediate_dict = jso["pdf_intermediate_dict"] pdf_intermediate_dict = jso["pdf_intermediate_dict"]
# 将 pdf_intermediate_dict 解压 # 将 pdf_intermediate_dict 解压
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict) # markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
jso["content"] = markdown_content jso["content"] = markdown_content
logger.info( logger.info(
f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment