Commit f36c2656 authored by kernel.h@qq.com's avatar kernel.h@qq.com

使用面积占比方式判断一行文本是不是在一个layoutbox里

parent a36ef4f8
...@@ -92,5 +92,5 @@ if __name__ == '__main__': ...@@ -92,5 +92,5 @@ if __name__ == '__main__':
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf" # ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json" # ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
#ocr_local_parse(ocr_pdf_path, ocr_json_file_path) ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
ocr_online_parse(book_name="美国加州中学教材/edu_00000060") #ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
...@@ -71,25 +71,26 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict): ...@@ -71,25 +71,26 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict): def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for _, page_info in pdf_info_dict.items():
paras = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras: if not paras_of_layout:
continue continue
for para in paras: for paras in paras_of_layout:
para_text = '' for para in paras:
for line in para: para_text = ''
for span in line['spans']: for line in para:
span_type = span.get('type') for span in line['spans']:
if span_type == ContentType.Text: span_type = span.get('type')
content = split_long_words(span['content']) if span_type == ContentType.Text:
# content = span['content'] content = split_long_words(span['content'])
elif span_type == ContentType.InlineEquation: pass
content = f"${span['content']}$" elif span_type == ContentType.InlineEquation:
elif span_type == ContentType.InterlineEquation: content = f" ${span['content']}$ "
content = f"\n$$\n{span['content']}\n$$\n" elif span_type == ContentType.InterlineEquation:
elif span_type in [ContentType.Image, ContentType.Table]: content = f"\n$$\n{span['content']}\n$$\n"
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n" elif span_type in [ ContentType.Image, ContentType.Table ]:
para_text += content + ' ' content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
markdown.append(para_text.strip() + ' ') para_text += content + ' '
markdown.append(para_text.strip() + ' ')
return '\n\n'.join(markdown) return '\n\n'.join(markdown)
......
...@@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool: ...@@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
y1_1 < y0_2 or # box1在box2的上边 y1_1 < y0_2 or # box1在box2的上边
y0_1 > y1_2) # box1在box2的下边 y0_1 > y1_2) # box1在box2的下边
def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
"""
判断box1是否在box2里面,或者box1和box2有部分重叠,且重叠面积占box1的比例超过area_ratio_threshold
"""
if box1 is None or box2 is None:
return False
x0_1, y0_1, x1_1, y1_1 = box1
x0_2, y0_2, x1_2, y1_2 = box2
if not _is_in_or_part_overlap(box1, box2):
return False
# 计算重叠面积
x_left = max(x0_1, x0_2)
y_top = max(y0_1, y0_2)
x_right = min(x1_1, x1_2)
y_bottom = min(y1_1, y1_2)
overlap_area = (x_right - x_left) * (y_bottom - y_top)
# 计算box1的面积
box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
return overlap_area / box1_area > area_ratio_threshold
def _is_in(box1, box2) -> bool: def _is_in(box1, box2) -> bool:
""" """
box1是否完全在box2里面 box1是否完全在box2里面
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment