Commit e9aa103c authored by 赵小蒙

ocr增加分页markdown输出格式

parent 27c080a9
......@@ -94,6 +94,36 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
    """Render multimodal markdown page by page from paragraph blocks.

    Args:
        pdf_info_dict: Mapping of page key to page info. For each page the
            paragraphs are read with ``page_info.get("para_blocks")``; each
            paragraph is iterated as a sequence of lines, and each line must
            carry a ``'spans'`` list.

    Returns:
        A list with one ``{'page_no': ..., 'md': ...}`` dict per page that has
        paragraph blocks, in the iteration order of ``pdf_info_dict``. Pages
        whose ``para_blocks`` is missing or empty are omitted. Paragraphs of a
        page are joined with a blank line.
    """
    markdown_with_para_and_pagination = []
    for page_no, page_info in pdf_info_dict.items():
        paras = page_info.get("para_blocks")
        if not paras:
            continue

        page_markdown = []
        for para in paras:
            pieces = []
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    if span_type == ContentType.Text:
                        content = split_long_words(span['content'])
                    elif span_type == ContentType.InlineEquation:
                        content = f"${span['content']}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{span['content']}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
                    else:
                        # Unknown or missing span type: skip it. Previously
                        # `content` was left unassigned here, which raised
                        # UnboundLocalError on the first span or repeated the
                        # previous span's text on later ones.
                        continue
                    pieces.append(content)
            # Spans are separated by a single space; the paragraph is stripped
            # and then terminated with one trailing space.
            para_text = ' '.join(pieces)
            page_markdown.append(para_text.strip() + ' ')

        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md': '\n\n'.join(page_markdown)
        })
    return markdown_with_para_and_pagination
def make_standard_format_with_para(pdf_info_dict: dict):
content_list = []
for _, page_info in pdf_info_dict.items():
......
......@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_nlp_markdown,
ocr_mk_mm_markdown,
ocr_mk_mm_standard_format,
ocr_mk_mm_markdown_with_para,
ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination,
)
from magic_pdf.libs.commons import (
read_file,
......@@ -525,6 +525,35 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
return jso
def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, debug_mode=False) -> dict:
    """Fill ``jso["content"]`` with per-page markdown built from the intermediate dict.

    Args:
        jso: Record dict. Reads ``"need_drop"``, ``"file_id"`` and
            ``"pdf_intermediate_dict"`` (decompressed with
            ``JsonCompressor.decompress_json``); the record is mutated in place.
        debug_mode: When true, the ``need_drop`` check is skipped and the
            record is always processed.

    Returns:
        The same ``jso``. On success ``"content"`` holds the result of
        ``ocr_mk_mm_markdown_with_para_and_pagination`` and
        ``"pdf_intermediate_dict"`` is reset to ``""``. If the record is
        flagged ``need_drop`` (and debug is off) it is returned with
        ``"dropped"`` set to ``True``. Any exception raised during conversion
        is routed through ``exception_handler`` and its result is returned.
    """
    # When debug is off, honour the need_drop flag and bail out early.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        # No `file=` keyword here: that is a print() argument, not a logging one.
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
        # Decompress pdf_intermediate_dict before rendering.
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        markdown_content = ocr_mk_mm_markdown_with_para_and_pagination(pdf_intermediate_dict)
        jso["content"] = markdown_content
        # len(markdown_content) is whatever len() gives for the renderer's
        # return value.
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
        )
        # Clear data that is no longer needed.
        # jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        # jso["pdf_meta"] = ""
    except Exception as e:
        # Pipeline boundary: delegate every failure to the shared handler
        # instead of letting it propagate.
        jso = exception_handler(jso, e)
    return jso
def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
jso: dict, debug_mode=False
) -> dict:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment