Commit d5dbed73 authored by 赵小蒙

目录重构

parent 7c7910e4
...@@ -2,7 +2,7 @@ import json ...@@ -2,7 +2,7 @@ import json
import os import os
from tqdm import tqdm from tqdm import tqdm
from pdf_tools.libs import join_path from magic_pdf.libs import join_path
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f: with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
samples = json.load(f) samples = json.load(f)
......
from pdf_tools.libs import fitz # PyMuPDF from magic_pdf.libs import fitz # PyMuPDF
# PDF文件路径 # PDF文件路径
pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf" pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
......
...@@ -5,9 +5,9 @@ from pathlib import Path ...@@ -5,9 +5,9 @@ from pathlib import Path
import click import click
from loguru import logger from loguru import logger
from pdf_tools.libs import join_path from magic_pdf.libs import join_path
from pdf_tools.dict2md.mkcontent import mk_mm_markdown from magic_pdf.dict2md.mkcontent import mk_mm_markdown
from pdf_tools.pipeline import parse_pdf_by_model from magic_pdf.pipeline import parse_pdf_by_model
......
import math import math
from loguru import logger from loguru import logger
from pdf_tools.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
def mk_nlp_markdown(para_dict: dict): def mk_nlp_markdown(para_dict: dict):
......
...@@ -16,8 +16,8 @@ from collections import Counter ...@@ -16,8 +16,8 @@ from collections import Counter
import click import click
import numpy as np import numpy as np
from pdf_tools.libs.commons import mymax, get_top_percent_list from magic_pdf.libs.commons import mymax, get_top_percent_list
from pdf_tools.filter.pdf_meta_scan import scan_max_page, junk_limit_min from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
TEXT_LEN_THRESHOLD = 100 TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 200 AVG_TEXT_LEN_THRESHOLD = 200
......
...@@ -5,13 +5,13 @@ ...@@ -5,13 +5,13 @@
import sys import sys
import click import click
from pdf_tools.libs.commons import read_file, mymax, get_top_percent_list from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from loguru import logger from loguru import logger
from collections import Counter from collections import Counter
from pdf_tools.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
from pdf_tools.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
scan_max_page = 50 scan_max_page = 50
junk_limit_min = 10 junk_limit_min = 10
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
from pdf_tools.layout.layout_spiler_recog import get_spilter_of_page from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
from pdf_tools.libs.commons import mymax from magic_pdf.libs.commons import mymax
X0_IDX = 0 X0_IDX = 0
Y0_IDX = 1 Y0_IDX = 1
......
from pdf_tools.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
from pdf_tools.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list: def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
""" """
from loguru import logger from loguru import logger
from pdf_tools.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort from magic_pdf.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort
from pdf_tools.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes from magic_pdf.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes
from pdf_tools.libs.boxbase import get_bbox_in_boundry from magic_pdf.libs.boxbase import get_bbox_in_boundry
LAYOUT_V = "V" LAYOUT_V = "V"
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
""" """
import os import os
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from pdf_tools.libs.boxbase import _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def __rect_filter_by_width(rect, page_w, page_h): def __rect_filter_by_width(rect, page_w, page_h):
......
...@@ -50,7 +50,7 @@ Usage ...@@ -50,7 +50,7 @@ Usage
---------------------------------------------------------------------------------- ----------------------------------------------------------------------------------
""" """
import sys import sys
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True): def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
......
...@@ -10,7 +10,7 @@ import spacy ...@@ -10,7 +10,7 @@ import spacy
import en_core_web_sm import en_core_web_sm
import zh_core_web_sm import zh_core_web_sm
from pdf_tools.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
class NLPModels: class NLPModels:
......
...@@ -4,9 +4,9 @@ from typing import Tuple ...@@ -4,9 +4,9 @@ from typing import Tuple
import io import io
# from app.common.s3 import get_s3_client # from app.common.s3 import get_s3_client
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from loguru import logger from loguru import logger
from pdf_tools.libs.commons import parse_bucket_key, join_path from magic_pdf.libs.commons import parse_bucket_key, join_path
def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True): def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):
......
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
import os import os
......
import os import os
import unicodedata import unicodedata
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
import sys import sys
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from termcolor import cprint from termcolor import cprint
......
import math import math
from collections import defaultdict from collections import defaultdict
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore sys.stdout.reconfigure(encoding="utf-8") # type: ignore
......
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
import math import math
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
import os import os
import json import json
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
from pdf_tools.para.raw_processor import RawBlockProcessor from magic_pdf.para.raw_processor import RawBlockProcessor
from pdf_tools.para.layout_match_processor import LayoutFilterProcessor from magic_pdf.para.layout_match_processor import LayoutFilterProcessor
from pdf_tools.para.stats import BlockStatisticsCalculator from magic_pdf.para.stats import BlockStatisticsCalculator
from pdf_tools.para.stats import DocStatisticsCalculator from magic_pdf.para.stats import DocStatisticsCalculator
from pdf_tools.para.title_processor import TitleProcessor from magic_pdf.para.title_processor import TitleProcessor
from pdf_tools.para.block_termination_processor import BlockTerminationProcessor from magic_pdf.para.block_termination_processor import BlockTerminationProcessor
from pdf_tools.para.block_continuation_processor import BlockContinuationProcessor from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor
from pdf_tools.para.draw import DrawAnnos from magic_pdf.para.draw import DrawAnnos
from pdf_tools.para.exceptions import ( from magic_pdf.para.exceptions import (
DenseSingleLineBlockException, DenseSingleLineBlockException,
TitleDetectionException, TitleDetectionException,
TitleLevelException, TitleLevelException,
......
from collections import Counter from collections import Counter
import numpy as np import numpy as np
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
...@@ -2,9 +2,9 @@ import os ...@@ -2,9 +2,9 @@ import os
import re import re
import numpy as np import numpy as np
from pdf_tools.libs.nlp_utils import NLPModels from magic_pdf.libs.nlp_utils import NLPModels
from pdf_tools.para.commons import * from magic_pdf.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore sys.stdout.reconfigure(encoding="utf-8") # type: ignore
......
...@@ -2,28 +2,28 @@ import time ...@@ -2,28 +2,28 @@ import time
# from anyio import Path # from anyio import Path
from pdf_tools.libs.commons import fitz, get_delta_time, get_img_s3_client from magic_pdf.libs.commons import fitz, get_delta_time, get_img_s3_client
import json import json
import os import os
import math import math
from loguru import logger from loguru import logger
from pdf_tools.layout.bbox_sort import ( from magic_pdf.layout.bbox_sort import (
prepare_bboxes_for_layout_split, prepare_bboxes_for_layout_split,
) )
from pdf_tools.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block from magic_pdf.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
from pdf_tools.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
from pdf_tools.libs.markdown_utils import escape_special_markdown_char from magic_pdf.libs.markdown_utils import escape_special_markdown_char
from pdf_tools.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
from pdf_tools.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from pdf_tools.pre_proc.detect_images import parse_images from magic_pdf.pre_proc.detect_images import parse_images
from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pdf_tools.pre_proc.detect_equation import parse_equations # 获取equations的bbox from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox
from pdf_tools.pre_proc.detect_header import parse_headers # 获取headers的bbox from magic_pdf.pre_proc.detect_header import parse_headers # 获取headers的bbox
from pdf_tools.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox from magic_pdf.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox
from pdf_tools.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
from pdf_tools.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox from magic_pdf.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
from pdf_tools.post_proc.detect_para import ( from magic_pdf.post_proc.detect_para import (
ParaProcessPipeline, ParaProcessPipeline,
TitleDetectionException, TitleDetectionException,
TitleLevelException, TitleLevelException,
...@@ -31,9 +31,9 @@ from pdf_tools.post_proc.detect_para import ( ...@@ -31,9 +31,9 @@ from pdf_tools.post_proc.detect_para import (
ParaMergeException, ParaMergeException,
DenseSingleLineBlockException, DenseSingleLineBlockException,
) )
from pdf_tools.pre_proc.main_text_font import get_main_text_font from magic_pdf.pre_proc.main_text_font import get_main_text_font
from pdf_tools.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock from magic_pdf.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
from pdf_tools.pre_proc.remove_footer_header import remove_headder_footer_one_page from magic_pdf.pre_proc.remove_footer_header import remove_headder_footer_one_page
''' '''
from para.para_pipeline import ParaProcessPipeline from para.para_pipeline import ParaProcessPipeline
...@@ -46,19 +46,19 @@ from para.exceptions import ( ...@@ -46,19 +46,19 @@ from para.exceptions import (
) )
''' '''
from pdf_tools.libs.commons import read_file, join_path from magic_pdf.libs.commons import read_file, join_path
from pdf_tools.libs.pdf_image_tools import save_images_by_bboxes from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
from pdf_tools.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks from magic_pdf.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from pdf_tools.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from pdf_tools.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
from pdf_tools.pre_proc.pdf_pre_filter import pdf_filter from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
from pdf_tools.pre_proc.detect_footer_header_by_statistics import drop_footer_header from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
from pdf_tools.pre_proc.construct_paras import construct_page_component from magic_pdf.pre_proc.construct_paras import construct_page_component
from pdf_tools.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title from magic_pdf.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from pdf_tools.post_proc.pdf_post_filter import pdf_post_filter from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
from pdf_tools.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from pdf_tools.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from pdf_tools.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
titleDetectionException_msg = TitleDetectionException().message titleDetectionException_msg = TitleDetectionException().message
...@@ -108,7 +108,7 @@ def parse_pdf_by_model( ...@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode=False, debug_mode=False,
): ):
pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile) pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
save_tmp_path = os.path.join(os.path.dirname(__file__), "../../..", "tmp", "unittest") save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
md_bookname_save_path = "" md_bookname_save_path = ""
book_name = sanitize_filename(book_name) book_name = sanitize_filename(book_name)
if debug_mode: if debug_mode:
......
...@@ -11,8 +11,8 @@ import numpy as np ...@@ -11,8 +11,8 @@ import numpy as np
from termcolor import cprint from termcolor import cprint
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from pdf_tools.libs.nlp_utils import NLPModels from magic_pdf.libs.nlp_utils import NLPModels
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
from loguru import logger from loguru import logger
from pdf_tools.layout.layout_sort import get_columns_cnt_of_layout from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
from pdf_tools.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
def __is_pseudo_single_column(page_info) -> bool: def __is_pseudo_single_column(page_info) -> bool:
......
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
import collections # 统计库 import collections # 统计库
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
""" """
import re import re
from pdf_tools.libs.nlp_utils import NLPModels from magic_pdf.libs.nlp_utils import NLPModels
__NLP_MODEL = NLPModels() __NLP_MODEL = NLPModels()
......
from pdf_tools.libs.boxbase import _is_in # 正则 from magic_pdf.libs.boxbase import _is_in # 正则
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def __solve_contain_bboxs(all_bbox_list: list): def __solve_contain_bboxs(all_bbox_list: list):
......
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
from collections import defaultdict from collections import defaultdict
from pdf_tools.libs.boxbase import calculate_iou from magic_pdf.libs.boxbase import calculate_iou
def compare_bbox_with_list(bbox, bbox_list, tolerance=1): def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
......
from collections import Counter from collections import Counter
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False): def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
......
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
import collections # 统计库 import collections # 统计库
import re import re
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
#--------------------------------------- Tool Functions --------------------------------------# #--------------------------------------- Tool Functions --------------------------------------#
......
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
""" """
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果 对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
""" """
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
import json import json
import os import os
from pathlib import Path from pathlib import Path
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
import re import re
from pdf_tools.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
from pdf_tools.libs.textbase import get_text_block_base_info from magic_pdf.libs.textbase import get_text_block_base_info
def fix_image_vertical(image_bboxes:list, text_blocks:list): def fix_image_vertical(image_bboxes:list, text_blocks:list):
""" """
......
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
import re import re
from pdf_tools.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json
## version 2 ## version 2
......
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
from pdf_tools.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
def __area(box): def __area(box):
......
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger from loguru import logger
from pdf_tools.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box): def __area(box):
......
import re import re
from pdf_tools.libs.boxbase import _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
......
import math import math
from pdf_tools.libs.boxbase import is_vbox_on_side from magic_pdf.libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict): def detect_non_horizontal_texts(result_dict):
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
2. 然后去掉出现在文字blcok上的图片bbox 2. 然后去掉出现在文字blcok上的图片bbox
""" """
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list): def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):
......
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check # 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
def check_inline_formula(page, inline_formula_boxes): def check_inline_formula(page, inline_formula_boxes):
......
...@@ -3,7 +3,7 @@ from typing import Tuple ...@@ -3,7 +3,7 @@ from typing import Tuple
import os import os
import boto3, json import boto3, json
from botocore.config import Config from botocore.config import Config
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from loguru import logger from loguru import logger
from pathlib import Path from pathlib import Path
from tqdm import tqdm from tqdm import tqdm
...@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va ...@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page # from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX # from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf_tools.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX from magic_pdf.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf_tools.pre_proc import parse_images # 获取figures的bbox from magic_pdf.pre_proc import parse_images # 获取figures的bbox
from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pdf_tools.pre_proc import parse_equations # 获取equations的bbox from magic_pdf.pre_proc import parse_equations # 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox # from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from pdf_tools.post_proc.detect_para import process_blocks_per_page from magic_pdf.post_proc.detect_para import process_blocks_per_page
from pdf_tools.libs import parse_aws_param, parse_bucket_key, read_file, join_path from magic_pdf.libs import parse_aws_param, parse_bucket_key, read_file, join_path
def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str): def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str):
......
from pdf_tools.libs import fitz # pyMuPDF库 from magic_pdf.libs import fitz # pyMuPDF库
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float): def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
......
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from typing import List from typing import List
......
import re import re
from pdf_tools.libs import _is_in_or_part_overlap from magic_pdf.libs import _is_in_or_part_overlap
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
import collections import collections
......
...@@ -11,8 +11,8 @@ import numpy as np ...@@ -11,8 +11,8 @@ import numpy as np
from termcolor import cprint from termcolor import cprint
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from pdf_tools.libs import NLPModels from magic_pdf.libs import NLPModels
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes): def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes):
......
...@@ -2,7 +2,7 @@ import numpy as np ...@@ -2,7 +2,7 @@ import numpy as np
import tqdm import tqdm
import json import json
from validation import cal_edit_distance, format_gt_bbox from validation import cal_edit_distance, format_gt_bbox
from pdf_tools.layout.layout_sort import sort_with_layout from magic_pdf.layout.layout_sort import sort_with_layout
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f: with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
samples = json.load(f) samples = json.load(f)
......
import io import io
import json import json
import os import os
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from app.common.s3 import get_s3_config, get_s3_client from app.common.s3 import get_s3_config, get_s3_client
from pdf_tools.libs import join_path, json_dump_path, read_file, parse_bucket_key from magic_pdf.libs import join_path, json_dump_path, read_file, parse_bucket_key
from loguru import logger from loguru import logger
test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/" test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
......
...@@ -2,9 +2,9 @@ import os ...@@ -2,9 +2,9 @@ import os
import pytest import pytest
from pdf_tools.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \ from magic_pdf.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \
classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips
from pdf_tools.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
from test.test_commons import get_docs_from_test_pdf, get_test_json_data from test.test_commons import get_docs_from_test_pdf, get_test_json_data
# 获取当前目录 # 获取当前目录
......
import os import os
import pytest import pytest
from pdf_tools.filter.pdf_meta_scan import get_pdf_page_size_pts, get_image_info, get_pdf_text_layout_per_page, get_language from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_image_info, get_pdf_text_layout_per_page, get_language
from test.test_commons import get_docs_from_test_pdf, get_test_json_data from test.test_commons import get_docs_from_test_pdf, get_test_json_data
# 获取当前目录 # 获取当前目录
......
import unittest import unittest
from pdf_tools.post_proc.detect_para import BlockContinuationProcessor from magic_pdf.post_proc.detect_para import BlockContinuationProcessor
# from ... pdf2text_recogPara import BlockContinuationProcessor # another way to import # from ... pdf2text_recogPara import BlockContinuationProcessor # another way to import
......
import unittest import unittest
from pdf_tools.post_proc.detect_para import BlockTerminationProcessor from magic_pdf.post_proc.detect_para import BlockTerminationProcessor
# from ... pdf2text_recogPara import BlockInnerParasProcessor # another way to import # from ... pdf2text_recogPara import BlockInnerParasProcessor # another way to import
......
import unittest import unittest
from pdf_tools.post_proc.detect_para import ( from magic_pdf.post_proc.detect_para import (
is_bbox_overlap, is_bbox_overlap,
is_in_bbox, is_in_bbox,
is_line_right_aligned_from_neighbors, is_line_right_aligned_from_neighbors,
......
...@@ -2,7 +2,7 @@ import json ...@@ -2,7 +2,7 @@ import json
import unittest import unittest
from utils_for_test_para import UtilsForTestPara from utils_for_test_para import UtilsForTestPara
from pdf_tools.post_proc.detect_para import TitleProcessor from magic_pdf.post_proc.detect_para import TitleProcessor
# from ... pdf2text_recogPara import * # another way to import # from ... pdf2text_recogPara import * # another way to import
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment