Unverified Commit 0746daf9 authored by drunkpig's avatar drunkpig Committed by GitHub

Merge pull request #23 from myhloli/master

fix 参数名称
parents d867304f d438b97a
...@@ -17,10 +17,10 @@ def split_long_words(text): ...@@ -17,10 +17,10 @@ def split_long_words(text):
return ' '.join(segments) return ' '.join(segments)
def ocr_mk_nlp_markdown(pdf_info_dict: dict): def ocr_mk_nlp_markdown(pdf_info_dict: list):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks") blocks = page_info.get("preproc_blocks")
if not blocks: if not blocks:
continue continue
...@@ -41,10 +41,10 @@ def ocr_mk_nlp_markdown(pdf_info_dict: dict): ...@@ -41,10 +41,10 @@ def ocr_mk_nlp_markdown(pdf_info_dict: dict):
return '\n'.join(markdown) return '\n'.join(markdown)
def ocr_mk_mm_markdown(pdf_info_dict: dict): def ocr_mk_mm_markdown(pdf_info_dict: list):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks") blocks = page_info.get("preproc_blocks")
if not blocks: if not blocks:
continue continue
...@@ -78,17 +78,18 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path): ...@@ -78,17 +78,18 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
return '\n\n'.join(markdown) return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict): def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "nlp") page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "nlp")
markdown.extend(page_markdown) markdown.extend(page_markdown)
return '\n\n'.join(markdown) return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict): def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list):
markdown_with_para_and_pagination = [] markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items(): page_no = 0
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout: if not paras_of_layout:
continue continue
...@@ -97,6 +98,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict): ...@@ -97,6 +98,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
'page_no': page_no, 'page_no': page_no,
'md_content': '\n\n'.join(page_markdown) 'md_content': '\n\n'.join(page_markdown)
}) })
page_no += 1
return markdown_with_para_and_pagination return markdown_with_para_and_pagination
...@@ -171,9 +173,9 @@ def para_to_standard_format(para, img_buket_path): ...@@ -171,9 +173,9 @@ def para_to_standard_format(para, img_buket_path):
} }
return para_content return para_content
def make_standard_format_with_para(pdf_info_dict: dict, img_buket_path: str): def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
content_list = [] content_list = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout: if not paras_of_layout:
continue continue
...@@ -227,7 +229,7 @@ def line_to_standard_format(line, img_buket_path): ...@@ -227,7 +229,7 @@ def line_to_standard_format(line, img_buket_path):
return content return content
def ocr_mk_mm_standard_format(pdf_info_dict: dict): def ocr_mk_mm_standard_format(pdf_info_dict: list):
""" """
content_list content_list
type string image/text/table/equation(行间的单独拿出来,行内的和text合并) type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
...@@ -237,7 +239,7 @@ def ocr_mk_mm_standard_format(pdf_info_dict: dict): ...@@ -237,7 +239,7 @@ def ocr_mk_mm_standard_format(pdf_info_dict: dict):
img_path string s3://full/path/to/img.jpg img_path string s3://full/path/to/img.jpg
""" """
content_list = [] content_list = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks") blocks = page_info.get("preproc_blocks")
if not blocks: if not blocks:
continue continue
......
from loguru import logger
from magic_pdf.io import AbsReaderWriter from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import fitz from magic_pdf.libs.commons import fitz
from loguru import logger
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256 from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter:AbsReaderWriter): def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
""" """
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。 save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
...@@ -20,6 +20,10 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri ...@@ -20,6 +20,10 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
# 新版本生成平铺路径 # 新版本生成平铺路径
img_hash256_path = f"{compute_sha256(img_path)}.jpg" img_hash256_path = f"{compute_sha256(img_path)}.jpg"
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"image_bboxes: 错误的box, {bbox}")
return img_hash256_path
# 将坐标转换为fitz.Rect对象 # 将坐标转换为fitz.Rect对象
rect = fitz.Rect(*bbox) rect = fitz.Rect(*bbox)
# 配置缩放倍数为3倍 # 配置缩放倍数为3倍
...@@ -29,50 +33,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri ...@@ -29,50 +33,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data = pix.tobytes(output='jpeg', jpg_quality=95) byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(byte_data, path=img_hash256_path, mode="binary") imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
imageWriter.write(content=byte_data, path=img_hash256_path, mode="binary")
return img_hash256_path return img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
                          image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                          equation_inline_bboxes: list,
                          equation_interline_bboxes: list, imageWriter) -> tuple:
    """
    Crop the image, backup-image and table regions of one page via ``cut_image``.

    Boxes whose left edge is not strictly left of the right edge, or whose top
    edge is not strictly above the bottom edge, are logged and skipped.

    Args:
        page_num: page index, forwarded to ``cut_image``.
        page: page object, forwarded to ``cut_image``.
        pdf_bytes_md5: joined with the sub-directory name ("images" or
            "tables") through ``join_path`` to form the ``return_path``
            argument of ``cut_image``.
        image_bboxes: boxes cropped under "images".
        images_overlap_backup: boxes cropped under "images" as well.
        table_bboxes: boxes cropped under "tables".
        equation_inline_bboxes: accepted but not used by this function.
        equation_interline_bboxes: accepted but not used by this function.
        imageWriter: writer object, forwarded to ``cut_image``.

    Returns:
        A 5-tuple ``(image_info, image_backup_info, table_info,
        inline_eq_info, interline_eq_info)``. The first three are lists of
        ``{"bbox": bbox, "image_path": path}`` dicts; the last two are always
        empty lists because equations are not cropped here.
    """

    def _crop_valid(bboxes, label, subdir):
        # Crop every well-formed box of one category; `label` only prefixes the warning.
        cropped = []
        for bbox in bboxes:
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                logger.warning(f"{label}: 错误的box, {bbox}")
                continue
            image_path = cut_image(bbox, page_num, page, join_path(pdf_bytes_md5, subdir), imageWriter)
            cropped.append({"bbox": bbox, "image_path": image_path})
        return cropped

    # Categories are processed in the same order as before: images, backups, tables.
    image_info = _crop_valid(image_bboxes, "image_bboxes", "images")
    image_backup_info = _crop_valid(images_overlap_backup, "images_overlap_backup", "images")
    table_info = _crop_valid(table_bboxes, "table_bboxes", "tables")

    # Equation crops are not produced; the slots are kept so callers can still unpack five values.
    inline_eq_info = []
    interline_eq_info = []

    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
...@@ -16,7 +16,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers ...@@ -16,7 +16,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
from magic_pdf.pre_proc.detect_header import parse_headers from magic_pdf.pre_proc.detect_header import parse_headers
from magic_pdf.pre_proc.detect_page_number import parse_pageNos from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import ( from magic_pdf.pre_proc.ocr_dict_merge import (
merge_spans_to_line_by_layout, merge_lines_to_block, merge_spans_to_line_by_layout, merge_lines_to_block,
...@@ -27,7 +27,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo ...@@ -27,7 +27,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def parse_pdf_by_ocr( def parse_pdf_by_ocr(
pdf_bytes, pdf_bytes,
pdf_model_output, pdf_model_output,
...@@ -148,7 +147,7 @@ def parse_pdf_by_ocr( ...@@ -148,7 +147,7 @@ def parse_pdf_by_ocr(
spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict) spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
'''对image和table截图''' '''对image和table截图'''
spans = cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter) spans = ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)''' '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list = [] displayed_list = []
...@@ -202,10 +201,10 @@ def parse_pdf_by_ocr( ...@@ -202,10 +201,10 @@ def parse_pdf_by_ocr(
'''构造pdf_info_dict''' '''构造pdf_info_dict'''
page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations, images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block, dropped_text_block, dropped_image_block, dropped_table_block,
dropped_equation_block, dropped_equation_block,
need_remove_spans_bboxes_dict) need_remove_spans_bboxes_dict)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
"""分段""" """分段"""
......
...@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5 ...@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.markdown_utils import escape_special_markdown_char from magic_pdf.libs.markdown_utils import escape_special_markdown_char
from magic_pdf.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes
from magic_pdf.pre_proc.detect_images import parse_images from magic_pdf.pre_proc.detect_images import parse_images
from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox
...@@ -48,8 +49,6 @@ from para.exceptions import ( ...@@ -48,8 +49,6 @@ from para.exceptions import (
) )
''' '''
from magic_pdf.libs.commons import read_file, join_path
from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
from magic_pdf.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks from magic_pdf.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
...@@ -194,7 +193,7 @@ def parse_pdf_by_txt( ...@@ -194,7 +193,7 @@ def parse_pdf_by_txt(
""" """
# 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容 # 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = save_images_by_bboxes( image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = txt_save_images_by_bboxes(
page_id, page_id,
page, page,
pdf_bytes_md5, pdf_bytes_md5,
......
...@@ -26,6 +26,7 @@ from magic_pdf.libs.drop_reason import DropReason ...@@ -26,6 +26,7 @@ from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.markdown_utils import escape_special_markdown_char from magic_pdf.libs.markdown_utils import escape_special_markdown_char
from magic_pdf.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes
from magic_pdf.pre_proc.detect_images import parse_images from magic_pdf.pre_proc.detect_images import parse_images
from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox
...@@ -62,7 +63,6 @@ from para.exceptions import ( ...@@ -62,7 +63,6 @@ from para.exceptions import (
""" """
from magic_pdf.libs.commons import read_file, join_path from magic_pdf.libs.commons import read_file, join_path
from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
from magic_pdf.post_proc.remove_footnote import ( from magic_pdf.post_proc.remove_footnote import (
merge_footnote_blocks, merge_footnote_blocks,
remove_footnote_blocks, remove_footnote_blocks,
...@@ -323,7 +323,7 @@ def parse_pdf_for_train( ...@@ -323,7 +323,7 @@ def parse_pdf_for_train(
# 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容 # 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = ( image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = (
save_images_by_bboxes( txt_save_images_by_bboxes(
book_name, book_name,
page_id, page_id,
page, page,
......
...@@ -105,8 +105,8 @@ if __name__ == '__main__': ...@@ -105,8 +105,8 @@ if __name__ == '__main__':
pdf_file_path = r"linshixuqiu\25536-00.pdf" pdf_file_path = r"linshixuqiu\25536-00.pdf"
model_file_path = r"linshixuqiu\25536-00.json" model_file_path = r"linshixuqiu\25536-00.json"
pdf_bytes = drw.read(path=pdf_file_path, mode=AbsReaderWriter.MODE_BIN) pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
model_json_txt = drw.read(path=model_file_path, mode=AbsReaderWriter.MODE_TXT) model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
pdf_type = UNIPipe.classify(pdf_bytes) pdf_type = UNIPipe.classify(pdf_bytes)
logger.info(f"pdf_type is {pdf_type}") logger.info(f"pdf_type is {pdf_type}")
...@@ -122,5 +122,5 @@ if __name__ == '__main__': ...@@ -122,5 +122,5 @@ if __name__ == '__main__':
md_content = pipe.mk_markdown(pdf_mid_data, "imgs") md_content = pipe.mk_markdown(pdf_mid_data, "imgs")
md_writer = DiskReaderWriter(write_path) md_writer = DiskReaderWriter(write_path)
md_writer.write(content=md_content, path="25536-00.md", mode=AbsReaderWriter.MODE_TXT) md_writer.write(md_content, "25536-00.md", AbsReaderWriter.MODE_TXT)
md_writer.write(content=json.dumps(JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4), path="25536-00.json", mode=AbsReaderWriter.MODE_TXT) md_writer.write(json.dumps(JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4), "25536-00.json", AbsReaderWriter.MODE_TXT)
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """
    Attach an ``image_path`` to every image span and table span in ``spans``.

    Spans are modified in place: for a span whose ``'type'`` equals
    ``ContentType.Image`` or ``ContentType.Table``, ``cut_image`` is called
    with the span's ``'bbox'`` and its result is stored under
    ``'image_path'``. Spans of any other type are left untouched.

    Returns the same ``spans`` object that was passed in.
    """
    for item in spans:
        kind = item['type']
        if kind == ContentType.Image:
            subdir = 'images'
        elif kind == ContentType.Table:
            subdir = 'tables'
        else:
            # Neither an image nor a table: nothing to crop for this span.
            continue
        item['image_path'] = cut_image(
            item['bbox'],
            page_id,
            page,
            return_path=join_path(pdf_bytes_md5, subdir),
            imageWriter=imageWriter,
        )
    return spans
def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
                              image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                              equation_inline_bboxes: list,
                              equation_interline_bboxes: list, imageWriter) -> tuple:
    """
    Crop the image, backup-image and table regions of one page via ``cut_image``.

    Args:
        page_num: page index, forwarded to ``cut_image``.
        page: page object, forwarded to ``cut_image``.
        pdf_bytes_md5: joined with the sub-directory name ("images" or
            "tables") through ``join_path`` to form the ``return_path``
            argument of ``cut_image``.
        image_bboxes: boxes cropped under "images".
        images_overlap_backup: boxes cropped under "images" as well.
        table_bboxes: boxes cropped under "tables".
        equation_inline_bboxes: accepted but not used by this function.
        equation_interline_bboxes: accepted but not used by this function.
        imageWriter: writer object, forwarded to ``cut_image``.

    Returns:
        A 5-tuple ``(image_info, image_backup_info, table_info,
        inline_eq_info, interline_eq_info)``. The first three are lists of
        ``{"bbox": bbox, "image_path": path}`` dicts, one per input box; the
        last two are always empty lists because equations are not cropped here.
    """

    def _crop_all(bboxes, subdir):
        # One entry per box, in input order; no box validation is done in this function.
        return [
            {
                "bbox": bbox,
                "image_path": cut_image(bbox, page_num, page, join_path(pdf_bytes_md5, subdir), imageWriter),
            }
            for bbox in bboxes
        ]

    # Categories are processed in the same order as before: images, backups, tables.
    image_info = _crop_all(image_bboxes, "images")
    image_backup_info = _crop_all(images_overlap_backup, "images")
    table_info = _crop_all(table_bboxes, "tables")

    # Equation crops are not produced; the slots are kept so callers can still unpack five values.
    inline_eq_info = []
    interline_eq_info = []

    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """
    Crop every image span and table span and record the resulting path on it.

    Each span whose ``'type'`` equals ``ContentType.Image`` or
    ``ContentType.Table`` gets an ``'image_path'`` key holding the value
    returned by ``cut_image`` for the span's ``'bbox'``. Other spans are not
    modified. The input list is mutated in place and returned.
    """
    for entry in spans:
        entry_type = entry['type']
        if entry_type == ContentType.Image:
            folder = 'images'
        elif entry_type == ContentType.Table:
            folder = 'tables'
        else:
            # Only image and table spans are cropped.
            continue
        target_dir = None  # placeholder so the evaluation order below stays bbox first, path second
        box = entry['bbox']
        target_dir = join_path(pdf_bytes_md5, folder)
        entry['image_path'] = cut_image(box, page_id, page, return_path=target_dir, imageWriter=imageWriter)
    return spans
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment