Commit 83753cbd authored by xuchao

元素类型引用统一定义

parent d5ea44f9
...@@ -30,13 +30,13 @@ def read_json_file(file_path): ...@@ -30,13 +30,13 @@ def read_json_file(file_path):
if __name__ == '__main__': if __name__ == '__main__':
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf" #ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json" #ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf" # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json" # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
try: try:
ocr_pdf_model_info = read_json_file(ocr_json_file_path) ocr_pdf_model_info = read_json_file(ocr_json_file_path)
pth = Path(ocr_json_file_path) pth = Path(ocr_json_file_path)
......
...@@ -326,7 +326,11 @@ def mk_mm_markdown(content_list): ...@@ -326,7 +326,11 @@ def mk_mm_markdown(content_list):
if content_type == "text": if content_type == "text":
content_md.append(c.get("text")) content_md.append(c.get("text"))
elif content_type == "equation": elif content_type == "equation":
content_md.append(f"$$\n{c.get('latex')}\n$$") content = c.get("latex")
if content.startswith("$$") and content.endswith("$$"):
content_md.append(content)
else:
content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
elif content_type in UNI_FORMAT_TEXT_TYPE: elif content_type in UNI_FORMAT_TEXT_TYPE:
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}") content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
elif content_type == "image": elif content_type == "image":
......
...@@ -3,11 +3,12 @@ import numpy as np ...@@ -3,11 +3,12 @@ import numpy as np
from loguru import logger from loguru import logger
from magic_pdf.libs.boxbase import _is_in from magic_pdf.libs.boxbase import _is_in
from magic_pdf.libs.ocr_content_type import ContentType
LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?',":", ":", ")", ")", ";"] LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?',":", ":", ")", ")", ";"]
INLINE_EQUATION = 'inline_equation' INLINE_EQUATION = ContentType.InlineEquation
INTER_EQUATION = "displayed_equation" INTERLINE_EQUATION = ContentType.InterlineEquation
TEXT = "text" TEXT = "text"
def __add_line_period(blocks, layout_bboxes): def __add_line_period(blocks, layout_bboxes):
...@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes): ...@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
for line in block['lines']: for line in block['lines']:
last_span = line['spans'][-1] last_span = line['spans'][-1]
span_type = last_span['type'] span_type = last_span['type']
if span_type in [TEXT, INLINE_EQUATION]: if span_type in [INLINE_EQUATION]:
span_content = last_span['content'].strip() span_content = last_span['content'].strip()
if span_type==INLINE_EQUATION and span_content[-1] not in LINE_STOP_FLAG: if span_type==INLINE_EQUATION and span_content[-1] not in LINE_STOP_FLAG:
if span_type in [INLINE_EQUATION, INTER_EQUATION]: if span_type in [INLINE_EQUATION, INTERLINE_EQUATION]:
last_span['content'] = span_content + '.' last_span['content'] = span_content + '.'
def __valign_lines(blocks, layout_bboxes): def __valign_lines(blocks, layout_bboxes):
""" """
对齐行的左侧和右侧。 在一个layoutbox内对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过3就强行对齐到所处layout的左右两侧(和layout有一段距离)。 扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来 3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
""" """
min_distance = 3 min_distance = 3
...@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le ...@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
else: else:
para.append(line) para.append(line)
else: # 其他,图片、表格、行间公式,各自占一段 else: # 其他,图片、表格、行间公式,各自占一段
para.append(line) if len(para)>0:
paras.append(para) paras.append(para)
para = []
else:
paras.append([line])
para = []
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']]) # para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text) # logger.info(para_text)
para = []
if len(para)>0: if len(para)>0:
paras.append(para) paras.append(para)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']]) # para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment