Commit f62d1aa7 authored by 赵小蒙

将模型和pymu坐标的转换逻辑抽象成方法

parent 388223f2
def get_scale_ratio(ocr_page_info: dict, page) -> tuple:
    """Return the model-to-pixmap scale ratios for one page.

    The page is rendered with ``page.get_pixmap(dpi=72)`` and the integer
    pixel size of that pixmap is compared with the page size recorded in
    ``ocr_page_info['page_info']``.

    Args:
        ocr_page_info: Per-page model output; must contain
            ``['page_info']['width']`` and ``['page_info']['height']``.
        page: Object exposing ``get_pixmap(dpi=...)`` whose result has
            ``w`` and ``h`` attributes (presumably a PyMuPDF page --
            confirm against callers).

    Returns:
        ``(horizontal_scale_ratio, vertical_scale_ratio)`` where each value
        is the model dimension divided by the pixmap dimension. Dividing a
        model coordinate by the matching ratio maps it into pixmap space.

    Raises:
        KeyError: If ``page_info``, ``width`` or ``height`` is missing.
        ZeroDivisionError: If the rendered pixmap has zero width or height.
    """
    # NOTE(review): 72 dpi presumably makes one pixel equal one PDF point,
    # i.e. the pixmap size matches the unscaled page size -- confirm.
    pix = page.get_pixmap(dpi=72)
    pymu_width = int(pix.w)
    pymu_height = int(pix.h)

    page_info = ocr_page_info['page_info']
    width_from_json = page_info['width']
    height_from_json = page_info['height']

    horizontal_scale_ratio = width_from_json / pymu_width
    vertical_scale_ratio = height_from_json / pymu_height
    return horizontal_scale_ratio, vertical_scale_ratio
...@@ -4,6 +4,7 @@ import time ...@@ -4,6 +4,7 @@ import time
from loguru import logger from loguru import logger
from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
...@@ -82,7 +83,7 @@ def parse_pdf_by_ocr( ...@@ -82,7 +83,7 @@ def parse_pdf_by_ocr(
page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info) page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
header_bboxes = parse_headers(page_id, page, ocr_page_info) header_bboxes = parse_headers(page_id, page, ocr_page_info)
footer_bboxes = parse_footers(page_id, page, ocr_page_info) footer_bboxes = parse_footers(page_id, page, ocr_page_info)
footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode) footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode)
# 构建需要remove的bbox列表 # 构建需要remove的bbox列表
need_remove_spans_bboxes = [] need_remove_spans_bboxes = []
...@@ -90,35 +91,19 @@ def parse_pdf_by_ocr( ...@@ -90,35 +91,19 @@ def parse_pdf_by_ocr(
need_remove_spans_bboxes.extend(header_bboxes) need_remove_spans_bboxes.extend(header_bboxes)
need_remove_spans_bboxes.extend(footer_bboxes) need_remove_spans_bboxes.extend(footer_bboxes)
need_remove_spans_bboxes.extend(footnote_bboxes) need_remove_spans_bboxes.extend(footnote_bboxes)
remove_bboxes.append(need_remove_spans_bboxes)
layout_dets = ocr_page_info['layout_dets'] layout_dets = ocr_page_info['layout_dets']
spans = [] spans = []
# 将模型坐标转换成pymu格式下的未缩放坐标 # 计算模型坐标和pymu坐标的缩放比例
DPI = 72 # use this resolution horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
pix = page.get_pixmap(dpi=DPI)
pageL = 0
pageR = int(pix.w)
pageU = 0
pageD = int(pix.h)
width_from_json = ocr_page_info['page_info']['width']
height_from_json = ocr_page_info['page_info']['height']
LR_scaleRatio = width_from_json / (pageR - pageL)
UD_scaleRatio = height_from_json / (pageD - pageU)
for layout_det in layout_dets: for layout_det in layout_dets:
category_id = layout_det['category_id'] category_id = layout_det['category_id']
allow_category_id_list = [1, 7, 13, 14, 15] allow_category_id_list = [1, 7, 13, 14, 15]
if category_id in allow_category_id_list: if category_id in allow_category_id_list:
x0, y0, _, _, x1, y1, _, _ = layout_det['poly'] x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
x0 = x0 / LR_scaleRatio bbox = [int(x0/horizontal_scale_ratio), int(y0/vertical_scale_ratio), int(x1/horizontal_scale_ratio), int(y1/vertical_scale_ratio)]
y0 = y0 / UD_scaleRatio
x1 = x1 / LR_scaleRatio
y1 = y1 / UD_scaleRatio
bbox = [int(x0), int(y0), int(x1), int(y1)]
'''要删除的''' '''要删除的'''
# 3: 'header', # 页眉 # 3: 'header', # 页眉
# 4: 'page number', # 页码 # 4: 'page number', # 页码
...@@ -184,6 +169,5 @@ def parse_pdf_by_ocr( ...@@ -184,6 +169,5 @@ def parse_pdf_by_ocr(
page_info = construct_page_component(page_id, blocks, layout_bboxes) page_info = construct_page_component(page_id, blocks, layout_bboxes)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
# logger.info(remove_bboxes)
return pdf_info_dict return pdf_info_dict
from magic_pdf.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
...@@ -8,23 +9,12 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): ...@@ -8,23 +9,12 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
""" """
DPI = 72 # use this resolution
pix = page.get_pixmap(dpi=DPI)
pageL = 0
pageR = int(pix.w)
pageU = 0
pageD = int(pix.h)
#--------- 通过json_from_DocXchain来获取 footer ---------# #--------- 通过json_from_DocXchain来获取 footer ---------#
footer_bbox_from_DocXChain = [] footer_bbox_from_DocXChain = []
xf_json = json_from_DocXchain_obj xf_json = json_from_DocXchain_obj
width_from_json = xf_json['page_info']['width'] horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
height_from_json = xf_json['page_info']['height']
LR_scaleRatio = width_from_json / (pageR - pageL)
UD_scaleRatio = height_from_json / (pageD - pageU)
# {0: 'title', # 标题 # {0: 'title', # 标题
# 1: 'figure', # 图片 # 1: 'figure', # 图片
...@@ -42,10 +32,10 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): ...@@ -42,10 +32,10 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
# 13: 'embedding', # 嵌入公式 # 13: 'embedding', # 嵌入公式
# 14: 'isolated'} # 单行公式 # 14: 'isolated'} # 单行公式
for xf in xf_json['layout_dets']: for xf in xf_json['layout_dets']:
L = xf['poly'][0] / LR_scaleRatio L = xf['poly'][0] / horizontal_scale_ratio
U = xf['poly'][1] / UD_scaleRatio U = xf['poly'][1] / vertical_scale_ratio
R = xf['poly'][2] / LR_scaleRatio R = xf['poly'][2] / horizontal_scale_ratio
D = xf['poly'][5] / UD_scaleRatio D = xf['poly'][5] / vertical_scale_ratio
# L += pageL # 有的页面,artBox偏移了。不在(0,0) # L += pageL # 有的页面,artBox偏移了。不在(0,0)
# R += pageL # R += pageL
# U += pageU # U += pageU
......
from collections import Counter from collections import Counter
from magic_pdf.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False): def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
...@@ -9,22 +10,12 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_ ...@@ -9,22 +10,12 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
""" """
DPI = 72 # use this resolution
pix = page.get_pixmap(dpi=DPI)
pageL = 0
pageR = int(pix.w)
pageU = 0
pageD = int(pix.h)
#--------- 通过json_from_DocXchain来获取 footnote ---------# #--------- 通过json_from_DocXchain来获取 footnote ---------#
footnote_bbox_from_DocXChain = [] footnote_bbox_from_DocXChain = []
xf_json = json_from_DocXchain_obj xf_json = json_from_DocXchain_obj
width_from_json = xf_json['page_info']['width'] horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
height_from_json = xf_json['page_info']['height']
LR_scaleRatio = width_from_json / (pageR - pageL)
UD_scaleRatio = height_from_json / (pageD - pageU)
# {0: 'title', # 标题 # {0: 'title', # 标题
# 1: 'figure', # 图片 # 1: 'figure', # 图片
...@@ -42,10 +33,10 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_ ...@@ -42,10 +33,10 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
# 13: 'embedding', # 嵌入公式 # 13: 'embedding', # 嵌入公式
# 14: 'isolated'} # 单行公式 # 14: 'isolated'} # 单行公式
for xf in xf_json['layout_dets']: for xf in xf_json['layout_dets']:
L = xf['poly'][0] / LR_scaleRatio L = xf['poly'][0] / horizontal_scale_ratio
U = xf['poly'][1] / UD_scaleRatio U = xf['poly'][1] / vertical_scale_ratio
R = xf['poly'][2] / LR_scaleRatio R = xf['poly'][2] / horizontal_scale_ratio
D = xf['poly'][5] / UD_scaleRatio D = xf['poly'][5] / vertical_scale_ratio
# L += pageL # 有的页面,artBox偏移了。不在(0,0) # L += pageL # 有的页面,artBox偏移了。不在(0,0)
# R += pageL # R += pageL
# U += pageU # U += pageU
......
from magic_pdf.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
...@@ -8,22 +9,12 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): ...@@ -8,22 +9,12 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
""" """
DPI = 72 # use this resolution
pix = page.get_pixmap(dpi=DPI)
pageL = 0
pageR = int(pix.w)
pageU = 0
pageD = int(pix.h)
#--------- 通过json_from_DocXchain来获取 header ---------# #--------- 通过json_from_DocXchain来获取 header ---------#
header_bbox_from_DocXChain = [] header_bbox_from_DocXChain = []
xf_json = json_from_DocXchain_obj xf_json = json_from_DocXchain_obj
width_from_json = xf_json['page_info']['width'] horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
height_from_json = xf_json['page_info']['height']
LR_scaleRatio = width_from_json / (pageR - pageL)
UD_scaleRatio = height_from_json / (pageD - pageU)
# {0: 'title', # 标题 # {0: 'title', # 标题
# 1: 'figure', # 图片 # 1: 'figure', # 图片
...@@ -41,10 +32,10 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): ...@@ -41,10 +32,10 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
# 13: 'embedding', # 嵌入公式 # 13: 'embedding', # 嵌入公式
# 14: 'isolated'} # 单行公式 # 14: 'isolated'} # 单行公式
for xf in xf_json['layout_dets']: for xf in xf_json['layout_dets']:
L = xf['poly'][0] / LR_scaleRatio L = xf['poly'][0] / horizontal_scale_ratio
U = xf['poly'][1] / UD_scaleRatio U = xf['poly'][1] / vertical_scale_ratio
R = xf['poly'][2] / LR_scaleRatio R = xf['poly'][2] / horizontal_scale_ratio
D = xf['poly'][5] / UD_scaleRatio D = xf['poly'][5] / vertical_scale_ratio
# L += pageL # 有的页面,artBox偏移了。不在(0,0) # L += pageL # 有的页面,artBox偏移了。不在(0,0)
# R += pageL # R += pageL
# U += pageU # U += pageU
......
from magic_pdf.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
...@@ -8,22 +9,12 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): ...@@ -8,22 +9,12 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
""" """
DPI = 72 # use this resolution
pix = page.get_pixmap(dpi=DPI)
pageL = 0
pageR = int(pix.w)
pageU = 0
pageD = int(pix.h)
#--------- 通过json_from_DocXchain来获取 pageNo ---------# #--------- 通过json_from_DocXchain来获取 pageNo ---------#
pageNo_bbox_from_DocXChain = [] pageNo_bbox_from_DocXChain = []
xf_json = json_from_DocXchain_obj xf_json = json_from_DocXchain_obj
width_from_json = xf_json['page_info']['width'] horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
height_from_json = xf_json['page_info']['height']
LR_scaleRatio = width_from_json / (pageR - pageL)
UD_scaleRatio = height_from_json / (pageD - pageU)
# {0: 'title', # 标题 # {0: 'title', # 标题
# 1: 'figure', # 图片 # 1: 'figure', # 图片
...@@ -41,10 +32,10 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): ...@@ -41,10 +32,10 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
# 13: 'embedding', # 嵌入公式 # 13: 'embedding', # 嵌入公式
# 14: 'isolated'} # 单行公式 # 14: 'isolated'} # 单行公式
for xf in xf_json['layout_dets']: for xf in xf_json['layout_dets']:
L = xf['poly'][0] / LR_scaleRatio L = xf['poly'][0] / horizontal_scale_ratio
U = xf['poly'][1] / UD_scaleRatio U = xf['poly'][1] / vertical_scale_ratio
R = xf['poly'][2] / LR_scaleRatio R = xf['poly'][2] / horizontal_scale_ratio
D = xf['poly'][5] / UD_scaleRatio D = xf['poly'][5] / vertical_scale_ratio
# L += pageL # 有的页面,artBox偏移了。不在(0,0) # L += pageL # 有的页面,artBox偏移了。不在(0,0)
# R += pageL # R += pageL
# U += pageU # U += pageU
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment