Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d5dbed73
Commit
d5dbed73
authored
Mar 01, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
目录重构
parent
7c7910e4
Changes
85
Hide whitespace changes
Inline
Side-by-side
Showing
85 changed files
with
136 additions
and
136 deletions
+136
-136
download.py
demo/download.py
+1
-1
draw_bbox.py
demo/draw_bbox.py
+1
-1
pdf2md.py
demo/pdf2md.py
+3
-3
__init__.py
magic_pdf/__init__.py
+0
-0
__init__.py
magic_pdf/dict2md/__init__.py
+0
-0
mkcontent.py
magic_pdf/dict2md/mkcontent.py
+1
-1
__init__.py
magic_pdf/filter/__init__.py
+0
-0
pdf_classify_by_type.py
magic_pdf/filter/pdf_classify_by_type.py
+2
-2
pdf_meta_scan.py
magic_pdf/filter/pdf_meta_scan.py
+4
-4
__init__.py
magic_pdf/layout/__init__.py
+0
-0
bbox_sort.py
magic_pdf/layout/bbox_sort.py
+3
-3
layout_det_utils.py
magic_pdf/layout/layout_det_utils.py
+2
-2
layout_sort.py
magic_pdf/layout/layout_sort.py
+3
-3
layout_spiler_recog.py
magic_pdf/layout/layout_spiler_recog.py
+2
-2
mcol_sort.py
magic_pdf/layout/mcol_sort.py
+1
-1
__init__.py
magic_pdf/libs/__init__.py
+0
-0
boxbase.py
magic_pdf/libs/boxbase.py
+0
-0
calc_span_stats.py
magic_pdf/libs/calc_span_stats.py
+0
-0
commons.py
magic_pdf/libs/commons.py
+0
-0
drop_reason.py
magic_pdf/libs/drop_reason.py
+0
-0
drop_tag.py
magic_pdf/libs/drop_tag.py
+0
-0
json_compressor.py
magic_pdf/libs/json_compressor.py
+0
-0
language.py
magic_pdf/libs/language.py
+0
-0
markdown_utils.py
magic_pdf/libs/markdown_utils.py
+0
-0
nlp_utils.py
magic_pdf/libs/nlp_utils.py
+1
-1
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+2
-2
safe_filename.py
magic_pdf/libs/safe_filename.py
+0
-0
textbase.py
magic_pdf/libs/textbase.py
+0
-0
vis_utils.py
magic_pdf/libs/vis_utils.py
+1
-1
__init__.py
magic_pdf/para/__init__.py
+0
-0
block_continuation_processor.py
magic_pdf/para/block_continuation_processor.py
+1
-1
block_termination_processor.py
magic_pdf/para/block_termination_processor.py
+1
-1
commons.py
magic_pdf/para/commons.py
+1
-1
denoise.py
magic_pdf/para/denoise.py
+1
-1
draw.py
magic_pdf/para/draw.py
+2
-2
exceptions.py
magic_pdf/para/exceptions.py
+0
-0
layout_match_processor.py
magic_pdf/para/layout_match_processor.py
+1
-1
para_pipeline.py
magic_pdf/para/para_pipeline.py
+11
-11
raw_processor.py
magic_pdf/para/raw_processor.py
+0
-0
stats.py
magic_pdf/para/stats.py
+1
-1
title_processor.py
magic_pdf/para/title_processor.py
+2
-2
pdf_parse_by_model.py
magic_pdf/pdf_parse_by_model.py
+33
-33
__init__.py
magic_pdf/post_proc/__init__.py
+0
-0
detect_para.py
magic_pdf/post_proc/detect_para.py
+2
-2
pdf_post_filter.py
magic_pdf/post_proc/pdf_post_filter.py
+2
-2
remove_footnote.py
magic_pdf/post_proc/remove_footnote.py
+1
-1
__init__.py
magic_pdf/pre_proc/__init__.py
+0
-0
citationmarker_remove.py
magic_pdf/pre_proc/citationmarker_remove.py
+1
-1
construct_paras.py
magic_pdf/pre_proc/construct_paras.py
+0
-0
detect_equation.py
magic_pdf/pre_proc/detect_equation.py
+2
-2
detect_footer_by_model.py
magic_pdf/pre_proc/detect_footer_by_model.py
+1
-1
detect_footer_header_by_statistics.py
magic_pdf/pre_proc/detect_footer_header_by_statistics.py
+1
-1
detect_footnote.py
magic_pdf/pre_proc/detect_footnote.py
+1
-1
detect_header.py
magic_pdf/pre_proc/detect_header.py
+1
-1
detect_images.py
magic_pdf/pre_proc/detect_images.py
+1
-1
detect_page_number.py
magic_pdf/pre_proc/detect_page_number.py
+1
-1
detect_tables.py
magic_pdf/pre_proc/detect_tables.py
+1
-1
equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+1
-1
fix_image.py
magic_pdf/pre_proc/fix_image.py
+2
-2
fix_table.py
magic_pdf/pre_proc/fix_table.py
+2
-2
main_text_font.py
magic_pdf/pre_proc/main_text_font.py
+0
-0
pdf_pre_filter.py
magic_pdf/pre_proc/pdf_pre_filter.py
+3
-3
post_layout_split.py
magic_pdf/pre_proc/post_layout_split.py
+0
-0
remove_colored_strip_bbox.py
magic_pdf/pre_proc/remove_colored_strip_bbox.py
+2
-2
remove_footer_header.py
magic_pdf/pre_proc/remove_footer_header.py
+1
-1
remove_rotate_bbox.py
magic_pdf/pre_proc/remove_rotate_bbox.py
+1
-1
resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+1
-1
statistics.py
magic_pdf/pre_proc/statistics.py
+0
-0
check_inline_formula.py
othoers/check_inline_formula.py
+1
-1
pdf2json_infer.py
othoers/pdf2json_infer.py
+7
-7
pdf2text_evaluatePdfLayout.py
othoers/pdf2text_evaluatePdfLayout.py
+1
-1
pdf2text_getNumberOfColumn.py
othoers/pdf2text_getNumberOfColumn.py
+1
-1
pdf2text_recogFootnoteLine.py
othoers/pdf2text_recogFootnoteLine.py
+2
-2
pdf2text_recogPara_v2.py
othoers/pdf2text_recogPara_v2.py
+2
-2
pdf2text_recogTitle.py
othoers/pdf2text_recogTitle.py
+1
-1
vali_bbox_sort.py
othoers/vali_bbox_sort.py
+1
-1
__init__.py
pdf_tools/pre_proc/__init__.py
+0
-0
setup.py
setup.py
+2
-2
test_commons.py
tests/test_commons.py
+2
-2
test_classify.py
tests/test_metascan_classify/test_classify.py
+2
-2
test_meta_scan.py
tests/test_metascan_classify/test_meta_scan.py
+1
-1
test_pdf2text_recogPara_BlockContinuationProcessor.py
...ara/test_pdf2text_recogPara_BlockContinuationProcessor.py
+1
-1
test_pdf2text_recogPara_BlockInnerParasProcessor.py
..._para/test_pdf2text_recogPara_BlockInnerParasProcessor.py
+1
-1
test_pdf2text_recogPara_Common.py
tests/test_para/test_pdf2text_recogPara_Common.py
+1
-1
test_pdf2text_recogPara_TitleProcessor.py
tests/test_para/test_pdf2text_recogPara_TitleProcessor.py
+1
-1
No files found.
demo/download.py
View file @
d5dbed73
...
...
@@ -2,7 +2,7 @@ import json
import
os
from
tqdm
import
tqdm
from
pdf_tools
.libs
import
join_path
from
magic_pdf
.libs
import
join_path
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
...
...
demo/draw_bbox.py
View file @
d5dbed73
from
pdf_tools
.libs
import
fitz
# PyMuPDF
from
magic_pdf
.libs
import
fitz
# PyMuPDF
# PDF文件路径
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018.pdf"
...
...
demo/pdf2md.py
View file @
d5dbed73
...
...
@@ -5,9 +5,9 @@ from pathlib import Path
import
click
from
loguru
import
logger
from
pdf_tools
.libs
import
join_path
from
pdf_tools
.dict2md.mkcontent
import
mk_mm_markdown
from
pdf_tools
.pipeline
import
parse_pdf_by_model
from
magic_pdf
.libs
import
join_path
from
magic_pdf
.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf
.pipeline
import
parse_pdf_by_model
...
...
pdf_tools
/__init__.py
→
magic_pdf
/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/dict2md/__init__.py
→
magic_pdf
/dict2md/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/dict2md/mkcontent.py
→
magic_pdf
/dict2md/mkcontent.py
View file @
d5dbed73
import
math
from
loguru
import
logger
from
pdf_tools
.libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
from
magic_pdf
.libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
def
mk_nlp_markdown
(
para_dict
:
dict
):
...
...
pdf_tools
/filter/__init__.py
→
magic_pdf
/filter/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/filter/pdf_classify_by_type.py
→
magic_pdf
/filter/pdf_classify_by_type.py
View file @
d5dbed73
...
...
@@ -16,8 +16,8 @@ from collections import Counter
import
click
import
numpy
as
np
from
pdf_tools
.libs.commons
import
mymax
,
get_top_percent_list
from
pdf_tools
.filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
from
magic_pdf
.libs.commons
import
mymax
,
get_top_percent_list
from
magic_pdf
.filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
TEXT_LEN_THRESHOLD
=
100
AVG_TEXT_LEN_THRESHOLD
=
200
...
...
pdf_tools
/filter/pdf_meta_scan.py
→
magic_pdf
/filter/pdf_meta_scan.py
View file @
d5dbed73
...
...
@@ -5,13 +5,13 @@
import
sys
import
click
from
pdf_tools
.libs.commons
import
read_file
,
mymax
,
get_top_percent_list
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
read_file
,
mymax
,
get_top_percent_list
from
magic_pdf
.libs.commons
import
fitz
from
loguru
import
logger
from
collections
import
Counter
from
pdf_tools
.libs.drop_reason
import
DropReason
from
pdf_tools
.libs.language
import
detect_lang
from
magic_pdf
.libs.drop_reason
import
DropReason
from
magic_pdf
.libs.language
import
detect_lang
scan_max_page
=
50
junk_limit_min
=
10
...
...
pdf_tools
/layout/__init__.py
→
magic_pdf
/layout/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/layout/bbox_sort.py
→
magic_pdf
/layout/bbox_sort.py
View file @
d5dbed73
...
...
@@ -3,9 +3,9 @@
from
pdf_tools
.layout.layout_spiler_recog
import
get_spilter_of_page
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
pdf_tools
.libs.commons
import
mymax
from
magic_pdf
.layout.layout_spiler_recog
import
get_spilter_of_page
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
magic_pdf
.libs.commons
import
mymax
X0_IDX
=
0
Y0_IDX
=
1
...
...
pdf_tools
/layout/layout_det_utils.py
→
magic_pdf
/layout/layout_det_utils.py
View file @
d5dbed73
from
pdf_tools
.layout.bbox_sort
import
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
from
pdf_tools
.libs.boxbase
import
_is_bottom_full_overlap
,
_left_intersect
,
_right_intersect
from
magic_pdf
.layout.bbox_sort
import
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
from
magic_pdf
.libs.boxbase
import
_is_bottom_full_overlap
,
_left_intersect
,
_right_intersect
def
find_all_left_bbox_direct
(
this_bbox
,
all_bboxes
)
->
list
:
...
...
pdf_tools
/layout/layout_sort.py
→
magic_pdf
/layout/layout_sort.py
View file @
d5dbed73
...
...
@@ -3,9 +3,9 @@
"""
from
loguru
import
logger
from
pdf_tools
.layout.bbox_sort
import
CONTENT_IDX
,
CONTENT_TYPE_IDX
,
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_EXT_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
,
paper_bbox_sort
from
pdf_tools
.layout.layout_det_utils
import
find_all_left_bbox_direct
,
find_all_right_bbox_direct
,
find_bottom_bbox_direct_from_left_edge
,
find_bottom_bbox_direct_from_right_edge
,
find_top_bbox_direct_from_left_edge
,
find_top_bbox_direct_from_right_edge
,
find_all_top_bbox_direct
,
find_all_bottom_bbox_direct
,
get_left_edge_bboxes
,
get_right_edge_bboxes
from
pdf_tools
.libs.boxbase
import
get_bbox_in_boundry
from
magic_pdf
.layout.bbox_sort
import
CONTENT_IDX
,
CONTENT_TYPE_IDX
,
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_EXT_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
,
paper_bbox_sort
from
magic_pdf
.layout.layout_det_utils
import
find_all_left_bbox_direct
,
find_all_right_bbox_direct
,
find_bottom_bbox_direct_from_left_edge
,
find_bottom_bbox_direct_from_right_edge
,
find_top_bbox_direct_from_left_edge
,
find_top_bbox_direct_from_right_edge
,
find_all_top_bbox_direct
,
find_all_bottom_bbox_direct
,
get_left_edge_bboxes
,
get_right_edge_bboxes
from
magic_pdf
.libs.boxbase
import
get_bbox_in_boundry
LAYOUT_V
=
"V"
...
...
pdf_tools
/layout/layout_spiler_recog.py
→
magic_pdf
/layout/layout_spiler_recog.py
View file @
d5dbed73
...
...
@@ -3,8 +3,8 @@
"""
import
os
from
pdf_tools
.libs.commons
import
fitz
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf
.libs.commons
import
fitz
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
def
__rect_filter_by_width
(
rect
,
page_w
,
page_h
):
...
...
pdf_tools
/layout/mcol_sort.py
→
magic_pdf
/layout/mcol_sort.py
View file @
d5dbed73
...
...
@@ -50,7 +50,7 @@ Usage
----------------------------------------------------------------------------------
"""
import
sys
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
def
column_boxes
(
page
,
footer_margin
=
50
,
header_margin
=
50
,
no_image_text
=
True
):
...
...
pdf_tools
/libs/__init__.py
→
magic_pdf
/libs/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/libs/boxbase.py
→
magic_pdf
/libs/boxbase.py
View file @
d5dbed73
File moved
pdf_tools
/libs/calc_span_stats.py
→
magic_pdf
/libs/calc_span_stats.py
View file @
d5dbed73
File moved
pdf_tools
/libs/commons.py
→
magic_pdf
/libs/commons.py
View file @
d5dbed73
File moved
pdf_tools
/libs/drop_reason.py
→
magic_pdf
/libs/drop_reason.py
View file @
d5dbed73
File moved
pdf_tools
/libs/drop_tag.py
→
magic_pdf
/libs/drop_tag.py
View file @
d5dbed73
File moved
pdf_tools
/libs/json_compressor.py
→
magic_pdf
/libs/json_compressor.py
View file @
d5dbed73
File moved
pdf_tools
/libs/language.py
→
magic_pdf
/libs/language.py
View file @
d5dbed73
File moved
pdf_tools
/libs/markdown_utils.py
→
magic_pdf
/libs/markdown_utils.py
View file @
d5dbed73
File moved
pdf_tools
/libs/nlp_utils.py
→
magic_pdf
/libs/nlp_utils.py
View file @
d5dbed73
...
...
@@ -10,7 +10,7 @@ import spacy
import
en_core_web_sm
import
zh_core_web_sm
from
pdf_tools
.libs.language
import
detect_lang
from
magic_pdf
.libs.language
import
detect_lang
class
NLPModels
:
...
...
pdf_tools
/libs/pdf_image_tools.py
→
magic_pdf
/libs/pdf_image_tools.py
View file @
d5dbed73
...
...
@@ -4,9 +4,9 @@ from typing import Tuple
import
io
# from app.common.s3 import get_s3_client
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
loguru
import
logger
from
pdf_tools
.libs.commons
import
parse_bucket_key
,
join_path
from
magic_pdf
.libs.commons
import
parse_bucket_key
,
join_path
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_return_path
=
None
,
img_s3_client
=
None
,
upload_switch
=
True
):
...
...
pdf_tools
/libs/safe_filename.py
→
magic_pdf
/libs/safe_filename.py
View file @
d5dbed73
File moved
pdf_tools
/libs/textbase.py
→
magic_pdf
/libs/textbase.py
View file @
d5dbed73
File moved
pdf_tools
/libs/vis_utils.py
→
magic_pdf
/libs/vis_utils.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
import
os
...
...
pdf_tools
/para/__init__.py
→
magic_pdf
/para/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/para/block_continuation_processor.py
→
magic_pdf
/para/block_continuation_processor.py
View file @
d5dbed73
import
os
import
unicodedata
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/block_termination_processor.py
→
magic_pdf
/para/block_termination_processor.py
View file @
d5dbed73
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/commons.py
→
magic_pdf
/para/commons.py
View file @
d5dbed73
import
sys
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
termcolor
import
cprint
...
...
pdf_tools
/para/denoise.py
→
magic_pdf
/para/denoise.py
View file @
d5dbed73
import
math
from
collections
import
defaultdict
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
sys
.
stdout
.
reconfigure
(
encoding
=
"utf-8"
)
# type: ignore
...
...
pdf_tools
/para/draw.py
→
magic_pdf
/para/draw.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/exceptions.py
→
magic_pdf
/para/exceptions.py
View file @
d5dbed73
File moved
pdf_tools
/para/layout_match_processor.py
→
magic_pdf
/para/layout_match_processor.py
View file @
d5dbed73
import
math
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/para_pipeline.py
→
magic_pdf
/para/para_pipeline.py
View file @
d5dbed73
import
os
import
json
from
pdf_tools
.para.commons
import
*
from
pdf_tools
.para.raw_processor
import
RawBlockProcessor
from
pdf_tools
.para.layout_match_processor
import
LayoutFilterProcessor
from
pdf_tools
.para.stats
import
BlockStatisticsCalculator
from
pdf_tools
.para.stats
import
DocStatisticsCalculator
from
pdf_tools
.para.title_processor
import
TitleProcessor
from
pdf_tools
.para.block_termination_processor
import
BlockTerminationProcessor
from
pdf_tools
.para.block_continuation_processor
import
BlockContinuationProcessor
from
pdf_tools
.para.draw
import
DrawAnnos
from
pdf_tools
.para.exceptions
import
(
from
magic_pdf
.para.commons
import
*
from
magic_pdf
.para.raw_processor
import
RawBlockProcessor
from
magic_pdf
.para.layout_match_processor
import
LayoutFilterProcessor
from
magic_pdf
.para.stats
import
BlockStatisticsCalculator
from
magic_pdf
.para.stats
import
DocStatisticsCalculator
from
magic_pdf
.para.title_processor
import
TitleProcessor
from
magic_pdf
.para.block_termination_processor
import
BlockTerminationProcessor
from
magic_pdf
.para.block_continuation_processor
import
BlockContinuationProcessor
from
magic_pdf
.para.draw
import
DrawAnnos
from
magic_pdf
.para.exceptions
import
(
DenseSingleLineBlockException
,
TitleDetectionException
,
TitleLevelException
,
...
...
pdf_tools
/para/raw_processor.py
→
magic_pdf
/para/raw_processor.py
View file @
d5dbed73
File moved
pdf_tools
/para/stats.py
→
magic_pdf
/para/stats.py
View file @
d5dbed73
from
collections
import
Counter
import
numpy
as
np
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/title_processor.py
→
magic_pdf
/para/title_processor.py
View file @
d5dbed73
...
...
@@ -2,9 +2,9 @@ import os
import
re
import
numpy
as
np
from
pdf_tools
.libs.nlp_utils
import
NLPModels
from
magic_pdf
.libs.nlp_utils
import
NLPModels
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
sys
.
stdout
.
reconfigure
(
encoding
=
"utf-8"
)
# type: ignore
...
...
pdf_tools/pipeline
/pdf_parse_by_model.py
→
magic_pdf
/pdf_parse_by_model.py
View file @
d5dbed73
...
...
@@ -2,28 +2,28 @@ import time
# from anyio import Path
from
pdf_tools
.libs.commons
import
fitz
,
get_delta_time
,
get_img_s3_client
from
magic_pdf
.libs.commons
import
fitz
,
get_delta_time
,
get_img_s3_client
import
json
import
os
import
math
from
loguru
import
logger
from
pdf_tools
.layout.bbox_sort
import
(
from
magic_pdf
.layout.bbox_sort
import
(
prepare_bboxes_for_layout_split
,
)
from
pdf_tools
.layout.layout_sort
import
LAYOUT_UNPROC
,
get_bboxes_layout
,
get_columns_cnt_of_layout
,
sort_text_block
from
pdf_tools
.libs.drop_reason
import
DropReason
from
pdf_tools
.libs.markdown_utils
import
escape_special_markdown_char
from
pdf_tools
.libs.safe_filename
import
sanitize_filename
from
pdf_tools
.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
pdf_tools
.pre_proc.detect_images
import
parse_images
from
pdf_tools
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
pdf_tools
.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
from
pdf_tools
.pre_proc.detect_header
import
parse_headers
# 获取headers的bbox
from
pdf_tools
.pre_proc.detect_page_number
import
parse_pageNos
# 获取pageNos的bbox
from
pdf_tools
.pre_proc.detect_footnote
import
parse_footnotes_by_model
,
parse_footnotes_by_rule
# 获取footnotes的bbox
from
pdf_tools
.pre_proc.detect_footer_by_model
import
parse_footers
# 获取footers的bbox
from
pdf_tools
.post_proc.detect_para
import
(
from
magic_pdf
.layout.layout_sort
import
LAYOUT_UNPROC
,
get_bboxes_layout
,
get_columns_cnt_of_layout
,
sort_text_block
from
magic_pdf
.libs.drop_reason
import
DropReason
from
magic_pdf
.libs.markdown_utils
import
escape_special_markdown_char
from
magic_pdf
.libs.safe_filename
import
sanitize_filename
from
magic_pdf
.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
magic_pdf
.pre_proc.detect_images
import
parse_images
from
magic_pdf
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf
.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
from
magic_pdf
.pre_proc.detect_header
import
parse_headers
# 获取headers的bbox
from
magic_pdf
.pre_proc.detect_page_number
import
parse_pageNos
# 获取pageNos的bbox
from
magic_pdf
.pre_proc.detect_footnote
import
parse_footnotes_by_model
,
parse_footnotes_by_rule
# 获取footnotes的bbox
from
magic_pdf
.pre_proc.detect_footer_by_model
import
parse_footers
# 获取footers的bbox
from
magic_pdf
.post_proc.detect_para
import
(
ParaProcessPipeline
,
TitleDetectionException
,
TitleLevelException
,
...
...
@@ -31,9 +31,9 @@ from pdf_tools.post_proc.detect_para import (
ParaMergeException
,
DenseSingleLineBlockException
,
)
from
pdf_tools
.pre_proc.main_text_font
import
get_main_text_font
from
pdf_tools
.pre_proc.remove_colored_strip_bbox
import
remove_colored_strip_textblock
from
pdf_tools
.pre_proc.remove_footer_header
import
remove_headder_footer_one_page
from
magic_pdf
.pre_proc.main_text_font
import
get_main_text_font
from
magic_pdf
.pre_proc.remove_colored_strip_bbox
import
remove_colored_strip_textblock
from
magic_pdf
.pre_proc.remove_footer_header
import
remove_headder_footer_one_page
'''
from para.para_pipeline import ParaProcessPipeline
...
...
@@ -46,19 +46,19 @@ from para.exceptions import (
)
'''
from
pdf_tools
.libs.commons
import
read_file
,
join_path
from
pdf_tools
.libs.pdf_image_tools
import
save_images_by_bboxes
from
pdf_tools
.post_proc.remove_footnote
import
merge_footnote_blocks
,
remove_footnote_blocks
from
pdf_tools
.pre_proc.citationmarker_remove
import
remove_citation_marker
from
pdf_tools
.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
pdf_tools
.pre_proc.pdf_pre_filter
import
pdf_filter
from
pdf_tools
.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
pdf_tools
.pre_proc.construct_paras
import
construct_page_component
from
pdf_tools
.pre_proc.fix_image
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
pdf_tools
.post_proc.pdf_post_filter
import
pdf_post_filter
from
pdf_tools
.pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
pdf_tools
.pre_proc.resolve_bbox_conflict
import
check_text_block_horizontal_overlap
,
resolve_bbox_overlap_conflict
from
pdf_tools
.pre_proc.fix_table
import
fix_table_text_block
,
fix_tables
,
include_table_title
from
magic_pdf
.libs.commons
import
read_file
,
join_path
from
magic_pdf
.libs.pdf_image_tools
import
save_images_by_bboxes
from
magic_pdf
.post_proc.remove_footnote
import
merge_footnote_blocks
,
remove_footnote_blocks
from
magic_pdf
.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf
.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
magic_pdf
.pre_proc.pdf_pre_filter
import
pdf_filter
from
magic_pdf
.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
magic_pdf
.pre_proc.construct_paras
import
construct_page_component
from
magic_pdf
.pre_proc.fix_image
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
magic_pdf
.post_proc.pdf_post_filter
import
pdf_post_filter
from
magic_pdf
.pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
magic_pdf
.pre_proc.resolve_bbox_conflict
import
check_text_block_horizontal_overlap
,
resolve_bbox_overlap_conflict
from
magic_pdf
.pre_proc.fix_table
import
fix_table_text_block
,
fix_tables
,
include_table_title
denseSingleLineBlockException_msg
=
DenseSingleLineBlockException
()
.
message
titleDetectionException_msg
=
TitleDetectionException
()
.
message
...
...
@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode
=
False
,
):
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../..
/..
"
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
md_bookname_save_path
=
""
book_name
=
sanitize_filename
(
book_name
)
if
debug_mode
:
...
...
pdf_tools/pipeline
/__init__.py
→
magic_pdf/post_proc
/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/post_proc/detect_para.py
→
magic_pdf
/post_proc/detect_para.py
View file @
d5dbed73
...
...
@@ -11,8 +11,8 @@ import numpy as np
from
termcolor
import
cprint
from
pdf_tools
.libs.commons
import
fitz
from
pdf_tools
.libs.nlp_utils
import
NLPModels
from
magic_pdf
.libs.commons
import
fitz
from
magic_pdf
.libs.nlp_utils
import
NLPModels
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/post_proc/pdf_post_filter.py
→
magic_pdf
/post_proc/pdf_post_filter.py
View file @
d5dbed73
from
loguru
import
logger
from
pdf_tools
.layout.layout_sort
import
get_columns_cnt_of_layout
from
pdf_tools
.libs.drop_reason
import
DropReason
from
magic_pdf
.layout.layout_sort
import
get_columns_cnt_of_layout
from
magic_pdf
.libs.drop_reason
import
DropReason
def
__is_pseudo_single_column
(
page_info
)
->
bool
:
...
...
pdf_tools
/post_proc/remove_footnote.py
→
magic_pdf
/post_proc/remove_footnote.py
View file @
d5dbed73
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
import
collections
# 统计库
...
...
pdf_tools/post
_proc/__init__.py
→
magic_pdf/pre
_proc/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/citationmarker_remove.py
→
magic_pdf
/pre_proc/citationmarker_remove.py
View file @
d5dbed73
...
...
@@ -3,7 +3,7 @@
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import
re
from
pdf_tools
.libs.nlp_utils
import
NLPModels
from
magic_pdf
.libs.nlp_utils
import
NLPModels
__NLP_MODEL
=
NLPModels
()
...
...
pdf_tools
/pre_proc/construct_paras.py
→
magic_pdf
/pre_proc/construct_paras.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/detect_equation.py
→
magic_pdf
/pre_proc/detect_equation.py
View file @
d5dbed73
from
pdf_tools
.libs.boxbase
import
_is_in
# 正则
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.boxbase
import
_is_in
# 正则
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
__solve_contain_bboxs
(
all_bbox_list
:
list
):
...
...
pdf_tools
/pre_proc/detect_footer_by_model.py
→
magic_pdf
/pre_proc/detect_footer_by_model.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_footers
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/detect_footer_header_by_statistics.py
→
magic_pdf
/pre_proc/detect_footer_header_by_statistics.py
View file @
d5dbed73
from
collections
import
defaultdict
from
pdf_tools
.libs.boxbase
import
calculate_iou
from
magic_pdf
.libs.boxbase
import
calculate_iou
def
compare_bbox_with_list
(
bbox
,
bbox_list
,
tolerance
=
1
):
...
...
pdf_tools
/pre_proc/detect_footnote.py
→
magic_pdf
/pre_proc/detect_footnote.py
View file @
d5dbed73
from
collections
import
Counter
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_footnotes_by_model
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
md_bookname_save_path
,
debug_mode
=
False
):
...
...
pdf_tools
/pre_proc/detect_header.py
→
magic_pdf
/pre_proc/detect_header.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_headers
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/detect_images.py
→
magic_pdf
/pre_proc/detect_images.py
View file @
d5dbed73
import
collections
# 统计库
import
re
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
#--------------------------------------- Tool Functions --------------------------------------#
...
...
pdf_tools
/pre_proc/detect_page_number.py
→
magic_pdf
/pre_proc/detect_page_number.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_pageNos
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/detect_tables.py
→
magic_pdf
/pre_proc/detect_tables.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_tables
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/equations_replace.py
→
magic_pdf
/pre_proc/equations_replace.py
View file @
d5dbed73
"""
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
"""
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
import
json
import
os
from
pathlib
import
Path
...
...
pdf_tools
/pre_proc/fix_image.py
→
magic_pdf
/pre_proc/fix_image.py
View file @
d5dbed73
...
...
@@ -2,9 +2,9 @@
import
re
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
from
pdf_tools
.libs.textbase
import
get_text_block_base_info
from
magic_pdf
.libs.textbase
import
get_text_block_base_info
def
fix_image_vertical
(
image_bboxes
:
list
,
text_blocks
:
list
):
"""
...
...
pdf_tools
/pre_proc/fix_table.py
→
magic_pdf
/pre_proc/fix_table.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
import
re
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
# json
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
# json
## version 2
...
...
pdf_tools
/pre_proc/main_text_font.py
→
magic_pdf
/pre_proc/main_text_font.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/pdf_pre_filter.py
→
magic_pdf
/pre_proc/pdf_pre_filter.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
pdf_tools
.libs.drop_reason
import
DropReason
from
magic_pdf
.libs.commons
import
fitz
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
magic_pdf
.libs.drop_reason
import
DropReason
def
__area
(
box
):
...
...
pdf_tools
/pre_proc/post_layout_split.py
→
magic_pdf
/pre_proc/post_layout_split.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/remove_colored_strip_bbox.py
→
magic_pdf
/pre_proc/remove_colored_strip_bbox.py
View file @
d5dbed73
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
loguru
import
logger
from
pdf_tools
.libs.drop_tag
import
COLOR_BG_HEADER_TXT_BLOCK
from
magic_pdf
.libs.drop_tag
import
COLOR_BG_HEADER_TXT_BLOCK
def
__area
(
box
):
...
...
pdf_tools
/pre_proc/remove_footer_header.py
→
magic_pdf
/pre_proc/remove_footer_header.py
View file @
d5dbed73
import
re
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
def
remove_headder_footer_one_page
(
text_raw_blocks
,
image_bboxes
,
table_bboxes
,
header_bboxs
,
footer_bboxs
,
...
...
pdf_tools
/pre_proc/remove_rotate_bbox.py
→
magic_pdf
/pre_proc/remove_rotate_bbox.py
View file @
d5dbed73
import
math
from
pdf_tools
.libs.boxbase
import
is_vbox_on_side
from
magic_pdf
.libs.boxbase
import
is_vbox_on_side
def
detect_non_horizontal_texts
(
result_dict
):
...
...
pdf_tools
/pre_proc/resolve_bbox_conflict.py
→
magic_pdf
/pre_proc/resolve_bbox_conflict.py
View file @
d5dbed73
...
...
@@ -5,7 +5,7 @@
2. 然后去掉出现在文字blcok上的图片bbox
"""
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
...
...
pdf_tools
/pre_proc/statistics.py
→
magic_pdf
/pre_proc/statistics.py
View file @
d5dbed73
File moved
othoers/check_inline_formula.py
View file @
d5dbed73
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
def
check_inline_formula
(
page
,
inline_formula_boxes
):
...
...
othoers/pdf2json_infer.py
View file @
d5dbed73
...
...
@@ -3,7 +3,7 @@ from typing import Tuple
import
os
import
boto3
,
json
from
botocore.config
import
Config
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
from
loguru
import
logger
from
pathlib
import
Path
from
tqdm
import
tqdm
...
...
@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from
pdf_tools
.layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
pdf_tools
.pre_proc
import
parse_images
# 获取figures的bbox
from
pdf_tools
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
pdf_tools
.pre_proc
import
parse_equations
# 获取equations的bbox
from
magic_pdf
.layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
magic_pdf
.pre_proc
import
parse_images
# 获取figures的bbox
from
magic_pdf
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf
.pre_proc
import
parse_equations
# 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from
pdf_tools
.post_proc.detect_para
import
process_blocks_per_page
from
pdf_tools
.libs
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
from
magic_pdf
.post_proc.detect_para
import
process_blocks_per_page
from
magic_pdf
.libs
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_profile
:
str
):
...
...
othoers/pdf2text_evaluatePdfLayout.py
View file @
d5dbed73
from
pdf_tools
.libs
import
fitz
# pyMuPDF库
from
magic_pdf
.libs
import
fitz
# pyMuPDF库
def
calculate_overlapRatio_between_rect1_and_rect2
(
L1
:
float
,
U1
:
float
,
R1
:
float
,
D1
:
float
,
L2
:
float
,
U2
:
float
,
R2
:
float
,
D2
:
float
)
->
(
float
,
float
):
...
...
othoers/pdf2text_getNumberOfColumn.py
View file @
d5dbed73
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
from
typing
import
List
...
...
othoers/pdf2text_recogFootnoteLine.py
View file @
d5dbed73
import
re
from
pdf_tools
.libs
import
_is_in_or_part_overlap
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
_is_in_or_part_overlap
from
magic_pdf
.libs
import
fitz
import
collections
...
...
othoers/pdf2text_recogPara_v2.py
View file @
d5dbed73
...
...
@@ -11,8 +11,8 @@ import numpy as np
from
termcolor
import
cprint
from
pdf_tools
.libs
import
fitz
from
pdf_tools
.libs
import
NLPModels
from
magic_pdf
.libs
import
fitz
from
magic_pdf
.libs
import
NLPModels
if
sys
.
version_info
[
0
]
>=
3
:
...
...
othoers/pdf2text_recogTitle.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_titles
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
exclude_bboxes
):
...
...
othoers/vali_bbox_sort.py
View file @
d5dbed73
...
...
@@ -2,7 +2,7 @@ import numpy as np
import
tqdm
import
json
from
validation
import
cal_edit_distance
,
format_gt_bbox
from
pdf_tools
.layout.layout_sort
import
sort_with_layout
from
magic_pdf
.layout.layout_sort
import
sort_with_layout
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
...
...
pdf_tools/pre_proc/__init__.py
deleted
100644 → 0
View file @
7c7910e4
setup.py
View file @
d5dbed73
from
setuptools
import
setup
,
find_packages
setup
(
name
=
"Magic-PDF"
,
# 项目名
version
=
"0.1.0"
,
# 版本号
name
=
"Magic-PDF"
,
# 项目名
version
=
"0.1.0"
,
# 版本号
packages
=
find_packages
(),
# 包含所有的包
install_requires
=
[
'PyMuPDF>=1.23.25'
,
'boto3>=1.34.52'
,
...
...
tests/test_commons.py
View file @
d5dbed73
import
io
import
json
import
os
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
from
app.common.s3
import
get_s3_config
,
get_s3_client
from
pdf_tools
.libs
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
magic_pdf
.libs
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
loguru
import
logger
test_pdf_dir_path
=
"s3://llm-pdf-text/unittest/pdf/"
...
...
tests/test_metascan_classify/test_classify.py
View file @
d5dbed73
...
...
@@ -2,9 +2,9 @@ import os
import
pytest
from
pdf_tools
.filter
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
from
magic_pdf
.filter
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
classify_by_img_num
,
classify_by_text_layout
,
classify_by_img_narrow_strips
from
pdf_tools
.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
magic_pdf
.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
test.test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# 获取当前目录
...
...
tests/test_metascan_classify/test_meta_scan.py
View file @
d5dbed73
import
os
import
pytest
from
pdf_tools
.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_image_info
,
get_pdf_text_layout_per_page
,
get_language
from
magic_pdf
.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_image_info
,
get_pdf_text_layout_per_page
,
get_language
from
test.test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# 获取当前目录
...
...
tests/test_para/test_pdf2text_recogPara_BlockContinuationProcessor.py
View file @
d5dbed73
import
unittest
from
pdf_tools
.post_proc.detect_para
import
BlockContinuationProcessor
from
magic_pdf
.post_proc.detect_para
import
BlockContinuationProcessor
# from ... pdf2text_recogPara import BlockContinuationProcessor # another way to import
...
...
tests/test_para/test_pdf2text_recogPara_BlockInnerParasProcessor.py
View file @
d5dbed73
import
unittest
from
pdf_tools
.post_proc.detect_para
import
BlockTerminationProcessor
from
magic_pdf
.post_proc.detect_para
import
BlockTerminationProcessor
# from ... pdf2text_recogPara import BlockInnerParasProcessor # another way to import
...
...
tests/test_para/test_pdf2text_recogPara_Common.py
View file @
d5dbed73
import
unittest
from
pdf_tools
.post_proc.detect_para
import
(
from
magic_pdf
.post_proc.detect_para
import
(
is_bbox_overlap
,
is_in_bbox
,
is_line_right_aligned_from_neighbors
,
...
...
tests/test_para/test_pdf2text_recogPara_TitleProcessor.py
View file @
d5dbed73
...
...
@@ -2,7 +2,7 @@ import json
import
unittest
from
utils_for_test_para
import
UtilsForTestPara
from
pdf_tools
.post_proc.detect_para
import
TitleProcessor
from
magic_pdf
.post_proc.detect_para
import
TitleProcessor
# from ... pdf2text_recogPara import * # another way to import
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment