Commit 8a2736a5 authored by 赵小蒙

截图增加s3上传逻辑,移除宽或高为0的spans

parent 0b35b73c
...@@ -4,7 +4,7 @@ import os ...@@ -4,7 +4,7 @@ import os
from loguru import logger from loguru import logger
from pathlib import Path from pathlib import Path
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
...@@ -30,12 +30,12 @@ def read_json_file(file_path): ...@@ -30,12 +30,12 @@ def read_json_file(file_path):
if __name__ == '__main__': if __name__ == '__main__':
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf" ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json" ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf" # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json" # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf" # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json" # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json"
try: try:
ocr_pdf_model_info = read_json_file(ocr_json_file_path) ocr_pdf_model_info = read_json_file(ocr_json_file_path)
pth = Path(ocr_json_file_path) pth = Path(ocr_json_file_path)
...@@ -56,8 +56,8 @@ if __name__ == '__main__': ...@@ -56,8 +56,8 @@ if __name__ == '__main__':
if not os.path.exists(parent_dir): if not os.path.exists(parent_dir):
os.makedirs(parent_dir) os.makedirs(parent_dir)
# markdown_content = mk_nlp_markdown(pdf_info_dict) # markdown_content = ocr_mk_nlp_markdown(pdf_info_dict)
markdown_content = mk_mm_markdown(pdf_info_dict) markdown_content = ocr_mk_mm_markdown(pdf_info_dict)
with open(text_content_save_path, "w", encoding="utf-8") as f: with open(text_content_save_path, "w", encoding="utf-8") as f:
f.write(markdown_content) f.write(markdown_content)
......
...@@ -208,7 +208,7 @@ def parse_pdf_by_ocr( ...@@ -208,7 +208,7 @@ def parse_pdf_by_ocr(
spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict) spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
# 对image和table截图 # 对image和table截图
spans = cut_image_and_table(spans, page, page_id, book_name, save_path) spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧) # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
displayed_list = [] displayed_list = []
......
...@@ -3,7 +3,7 @@ from magic_pdf.libs.ocr_content_type import ContentType ...@@ -3,7 +3,7 @@ from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image from magic_pdf.libs.pdf_image_tools import cut_image
def cut_image_and_table(spans, page, page_id, book_name, save_path): def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
def s3_return_path(type): def s3_return_path(type):
return join_path(book_name, type) return join_path(book_name, type)
...@@ -13,8 +13,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path): ...@@ -13,8 +13,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for span in spans: for span in spans:
span_type = span['type'] span_type = span['type']
if span_type == ContentType.Image: if span_type == ContentType.Image:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images')) span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
elif span_type == ContentType.Table: elif span_type == ContentType.Table:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables')) span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
return spans return spans
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment