Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
2e487cac
Commit
2e487cac
authored
Mar 06, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复目录重构导致的引用异常
parent
846dbecf
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
13 additions
and
13 deletions
+13
-13
download.py
demo/download.py
+1
-1
draw_bbox.py
demo/draw_bbox.py
+1
-1
pdf2md.py
demo/pdf2md.py
+1
-1
test_commons.py
tests/test_commons.py
+2
-2
test_classify.py
tests/test_metascan_classify/test_classify.py
+2
-2
test_meta_scan.py
tests/test_metascan_classify/test_meta_scan.py
+1
-1
test_para_pipeline.py
tests/test_para/test_para_pipeline.py
+5
-5
No files found.
demo/download.py
View file @
2e487cac
...
@@ -2,7 +2,7 @@ import json
...
@@ -2,7 +2,7 @@ import json
import
os
import
os
from
tqdm
import
tqdm
from
tqdm
import
tqdm
from
magic_pdf.libs
import
join_path
from
magic_pdf.libs
.commons
import
join_path
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json'
,
'r'
)
as
f
:
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
samples
=
json
.
load
(
f
)
...
...
demo/draw_bbox.py
View file @
2e487cac
from
magic_pdf.libs
import
fitz
# PyMuPDF
from
magic_pdf.libs
.commons
import
fitz
# PyMuPDF
# PDF文件路径
# PDF文件路径
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018.pdf"
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018.pdf"
...
...
demo/pdf2md.py
View file @
2e487cac
...
@@ -5,7 +5,7 @@ from pathlib import Path
...
@@ -5,7 +5,7 @@ from pathlib import Path
import
click
import
click
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs
import
join_path
from
magic_pdf.libs
.commons
import
join_path
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf.pipeline
import
parse_pdf_by_model
from
magic_pdf.pipeline
import
parse_pdf_by_model
...
...
tests/test_commons.py
View file @
2e487cac
import
io
import
io
import
json
import
json
import
os
import
os
from
magic_pdf.libs
import
fitz
from
magic_pdf.libs
.commons
import
fitz
from
app.common.s3
import
get_s3_config
,
get_s3_client
from
app.common.s3
import
get_s3_config
,
get_s3_client
from
magic_pdf.libs
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
magic_pdf.libs
.commons
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
loguru
import
logger
from
loguru
import
logger
test_pdf_dir_path
=
"s3://llm-pdf-text/unittest/pdf/"
test_pdf_dir_path
=
"s3://llm-pdf-text/unittest/pdf/"
...
...
tests/test_metascan_classify/test_classify.py
View file @
2e487cac
...
@@ -2,10 +2,10 @@ import os
...
@@ -2,10 +2,10 @@ import os
import
pytest
import
pytest
from
magic_pdf.filter
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
from
magic_pdf.filter
.pdf_classify_by_type
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
classify_by_img_num
,
classify_by_text_layout
,
classify_by_img_narrow_strips
classify_by_img_num
,
classify_by_text_layout
,
classify_by_img_narrow_strips
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
test.test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
from
test
s
.test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# 获取当前目录
# 获取当前目录
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
...
...
tests/test_metascan_classify/test_meta_scan.py
View file @
2e487cac
...
@@ -2,7 +2,7 @@ import os
...
@@ -2,7 +2,7 @@ import os
import
pytest
import
pytest
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_image_info
,
get_pdf_text_layout_per_page
,
get_language
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_image_info
,
get_pdf_text_layout_per_page
,
get_language
from
test.test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
from
test
s
.test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# 获取当前目录
# 获取当前目录
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
...
...
tests/test_para/test_para_pipeline.py
View file @
2e487cac
...
@@ -11,21 +11,21 @@ Execute the following command to run the tests under directory code-clean:
...
@@ -11,21 +11,21 @@ Execute the following command to run the tests under directory code-clean:
"""
"""
from
test.test_para.test_pdf2text_recogPara_Common
import
(
from
test
s
.test_para.test_pdf2text_recogPara_Common
import
(
TestIsBboxOverlap
,
TestIsBboxOverlap
,
TestIsInBbox
,
TestIsInBbox
,
TestIsBboxOverlap
,
TestIsBboxOverlap
,
TestIsLineLeftAlignedFromNeighbors
,
TestIsLineLeftAlignedFromNeighbors
,
TestIsLineRightAlignedFromNeighbors
,
TestIsLineRightAlignedFromNeighbors
,
)
)
from
test.test_para.test_pdf2text_recogPara_EquationsProcessor
import
TestCalcOverlapPct
from
test
s
.test_para.test_pdf2text_recogPara_EquationsProcessor
import
TestCalcOverlapPct
from
test.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor
import
TestIsConsistentLines
from
test
s
.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor
import
TestIsConsistentLines
from
test.test_para.test_pdf2text_recogPara_BlockContinuationProcessor
import
(
from
test
s
.test_para.test_pdf2text_recogPara_BlockContinuationProcessor
import
(
TestIsAlphabetChar
,
TestIsAlphabetChar
,
TestIsChineseChar
,
TestIsChineseChar
,
TestIsOtherLetterChar
,
TestIsOtherLetterChar
,
)
)
from
test.test_para.test_pdf2text_recogPara_TitleProcessor
import
TestTitleProcessor
from
test
s
.test_para.test_pdf2text_recogPara_TitleProcessor
import
TestTitleProcessor
# Test suite
# Test suite
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment