Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
435ab922
Unverified
Commit
435ab922
authored
Apr 16, 2024
by
drunkpig
Committed by
GitHub
Apr 16, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' into master
parents
18b02ae3
ef03c906
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
119 additions
and
33 deletions
+119
-33
magicpdf.py
magic_pdf/cli/magicpdf.py
+55
-24
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+1
-1
magic_model.py
magic_pdf/model/magic_model.py
+49
-0
para_split.py
magic_pdf/para/para_split.py
+4
-2
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+6
-2
AbsReaderWriter.py
magic_pdf/rw/AbsReaderWriter.py
+0
-0
DiskReaderWriter.py
magic_pdf/rw/DiskReaderWriter.py
+1
-1
S3ReaderWriter.py
magic_pdf/rw/S3ReaderWriter.py
+2
-2
__init__.py
magic_pdf/rw/__init__.py
+0
-0
user_api.py
magic_pdf/user_api.py
+1
-1
No files found.
magic_pdf/cli/magicpdf.py
View file @
435ab922
...
...
@@ -21,7 +21,11 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
"""
import
os
import
json
as
json_parse
from
datetime
import
datetime
import
click
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.libs.config_reader
import
get_s3_config
from
magic_pdf.libs.path_utils
import
(
parse_s3path
,
...
...
@@ -29,25 +33,14 @@ from magic_pdf.libs.path_utils import (
remove_non_official_s3_args
,
)
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.io.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
from
magic_pdf.io.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.spark.spark_api
import
parse_union_pdf
,
parse_txt_pdf
,
parse_ocr_pdf
import
os
import
json
as
json_parse
from
datetime
import
datetime
from
magic_pdf.rw.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
,
MODE_TXT
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
def get_pdf_parse_method(method):
    """Return the parse function selected by ``method``.

    ``"ocr"`` selects ``parse_ocr_pdf`` and ``"txt"`` selects
    ``parse_txt_pdf``; every other value falls back to ``parse_union_pdf``.
    """
    return (
        parse_ocr_pdf
        if method == "ocr"
        else parse_txt_pdf
        if method == "txt"
        else parse_union_pdf
    )
def
prepare_env
():
local_parent_dir
=
os
.
path
.
join
(
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d"
)
...
...
@@ -60,6 +53,28 @@ def prepare_env():
return
local_image_dir
,
local_md_dir
def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
    """Parse one PDF and write its markdown and intermediate JSON via ``md_writer``.

    Args:
        pdf_bytes: PDF content handed to ``UNIPipe.parse``.
        model_list: Model output stored under the ``"model_list"`` key of the
            options dict given to ``UNIPipe.parse``.
        parse_method: ``"ocr"`` selects the ``"ocr"`` pdf type; any other
            value is treated as ``"txt"``.
        image_writer: Writer passed through to ``UNIPipe.parse``.
        md_writer: Writer that receives the ``.md`` and ``.json`` outputs.
        image_dir: Image directory passed to ``UNIPipe.mk_markdown``.
    """
    pipe = UNIPipe()
    pdf_type = "ocr" if parse_method == "ocr" else "txt"
    parse_options = {"_pdf_type": pdf_type, "model_list": model_list}

    pdf_mid_data = pipe.parse(pdf_bytes, image_writer, parse_options)
    markdown_text = UNIPipe.mk_markdown(pdf_mid_data, image_dir)

    # Both output files share one hour-minute-second stem.
    part_file_name = datetime.now().strftime("%H-%M-%S")
    md_writer.write(content=markdown_text, path=f"{part_file_name}.md", mode=MODE_TXT)

    # The intermediate data is decompressed before being dumped as readable JSON.
    mid_json_text = json_parse.dumps(
        JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4
    )
    md_writer.write(content=mid_json_text, path=f"{part_file_name}.json", mode=MODE_TXT)
# Root click command group; subcommands attach to it via @cli.command().
# Documented with a comment rather than a docstring because click would
# surface a docstring as the group's --help text.
@click.group()
def cli():
    pass
...
...
@@ -96,11 +111,20 @@ def json_command(json, method):
jso
=
json_parse
.
loads
(
read_s3_path
(
json
)
.
decode
(
"utf-8"
))
pdf_data
=
read_s3_path
(
jso
[
"file_location"
])
local_image_dir
,
_
=
prepare_env
()
local_image_dir
,
local_md_dir
=
prepare_env
()
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
parse
=
get_pdf_parse_method
(
method
)
parse
(
pdf_data
,
jso
[
"doc_layout_result"
],
local_image_rw
,
is_debug
=
True
)
_do_parse
(
pdf_data
,
jso
[
'doc_layout_result'
],
method
,
local_image_rw
,
local_md_rw
,
local_image_dir
,
)
@
cli
.
command
()
...
...
@@ -128,15 +152,22 @@ def pdf_command(pdf, model, method):
pdf_data
=
read_fn
(
pdf
)
jso
=
json_parse
.
loads
(
read_fn
(
model
)
.
decode
(
"utf-8"
))
local_image_dir
,
_
=
prepare_env
()
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
parse
=
get_pdf_parse_method
(
method
)
parse
(
pdf_data
,
jso
,
local_image_rw
,
is_debug
=
True
)
local_image_dir
,
local_md_dir
=
prepare_env
()
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
_do_parse
(
pdf_data
,
jso
,
method
,
local_image_rw
,
local_md_rw
,
local_image_dir
,
)
if
__name__
==
"__main__"
:
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/
format/v070/part-66028dd46437-000076.jsonl?bytes=0,308393
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/
manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
"""
cli
()
magic_pdf/libs/pdf_image_tools.py
View file @
435ab922
from
magic_pdf.
io
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.
rw
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
...
...
magic_pdf/model/magic_model.py
0 → 100644
View file @
435ab922
class MagicModel():
    """Accessor over per-page model output.

    Every getter returns an empty list when it has no elements to report.
    The getters are still unimplemented stubs, so each one currently returns
    the empty result.
    """

    def __fix_axis(self):
        """Placeholder for the coordinate fix-up of ``self.__model_list``.

        TODO: compute the corrected coordinates. Until that exists the model
        list is left exactly as it was passed in.
        """

    def __init__(self, model_list: list, page: "Page"):
        # "Page" is a string annotation because no Page type is imported in
        # this module; a bare name would raise NameError at definition time.
        self.__model_list = model_list
        # Stored before the fix-up so the fix-up can read it once implemented
        # (NOTE(review): assumes the axis fix needs the page; confirm).
        self.__page = page
        self.__fix_axis()

    def get_imgs(self, page_no: int):
        """Return the images on ``page_no`` together with their captions.

        Intended item shape::

            {
                "img": {"bbox": [x0, y0, x1, y1]},
                "caption": {"bbox": [x0, y0, x1, y1], "text": ""},
            }

        TODO(@Xu Rui): not implemented yet.
        """
        return []

    def get_tables(self, page_no: int) -> list:
        """Return the tables on ``page_no``.

        Each table is meant to carry three coordinates: caption, table body
        and table note.

        TODO(@Xu Rui): not implemented yet.
        """
        return []

    def get_equations(self, page_no: int) -> tuple:
        """Return ``(inline_equations, interline_equations)`` for ``page_no``.

        Equations are meant to carry both coordinates and text.

        TODO(@Kaiwen): not implemented yet; both lists are empty.
        """
        return [], []

    def get_discarded(self, page_no: int) -> list:
        """Return discarded regions (in-house model, coordinates only).

        TODO(@Kaiwen): not implemented yet.
        """
        return []

    def get_text_blocks(self, page_no: int) -> list:
        """Return text blocks (in-house model, coordinates only, no text).

        TODO(@Kaiwen): not implemented yet.
        """
        return []

    def get_title_blocks(self, page_no: int) -> list:
        """Return title blocks (in-house model, coordinates only, no text).

        TODO(@Kaiwen): not implemented yet.
        """
        return []

    def get_ocr_text(self, page_no: int) -> list:
        """Return OCR text (produced by paddle, with text and coordinates).

        TODO(@Xiaomeng): not implemented yet.
        """
        return []

    def get_ocr_spans(self, page_no: int) -> list:
        """Return OCR spans for ``page_no``.

        TODO(@Xiaomeng): not implemented yet.
        """
        return []
\ No newline at end of file
magic_pdf/para/para_split.py
View file @
435ab922
...
...
@@ -299,9 +299,9 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_list_info
[
0
]
=
True
if
end
==
total_lines
-
1
:
layout_list_info
[
1
]
=
True
else
:
else
:
# 是普通文本
for
i
,
line
in
enumerate
(
lines
[
start
:
end
+
1
]):
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断
一下行
结尾特征。
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断
i行自己的
结尾特征。
cur_line_type
=
line
[
'spans'
][
-
1
][
'type'
]
next_line
=
lines
[
i
+
1
]
if
i
<
total_lines
-
1
else
None
...
...
@@ -341,6 +341,8 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
"""
if
len
(
layout_paras
)
==
0
or
len
(
layout_list_info
)
==
0
:
# 0的时候最后的return 会出错
return
layout_paras
,
[
False
,
False
]
# if page_num==343:
# pass
for
i
in
range
(
1
,
len
(
layout_paras
)):
pre_layout_list_info
=
layout_list_info
[
i
-
1
]
...
...
magic_pdf/pipe/UNIPipe.py
View file @
435ab922
import
json
from
loguru
import
logger
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.io.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
,
mk_mm_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
,
ocr_mk_mm_markdown_with_para
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
...
...
magic_pdf/
io
/AbsReaderWriter.py
→
magic_pdf/
rw
/AbsReaderWriter.py
View file @
435ab922
File moved
magic_pdf/
io
/DiskReaderWriter.py
→
magic_pdf/
rw
/DiskReaderWriter.py
View file @
435ab922
import
os
from
magic_pdf.
io
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.
rw
.AbsReaderWriter
import
AbsReaderWriter
from
loguru
import
logger
...
...
magic_pdf/
io
/S3ReaderWriter.py
→
magic_pdf/
rw
/S3ReaderWriter.py
View file @
435ab922
from
magic_pdf.
io
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.
rw
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
parse_aws_param
,
parse_bucket_key
import
boto3
from
loguru
import
logger
...
...
@@ -11,7 +11,7 @@ MODE_BIN = "binary"
class
S3ReaderWriter
(
AbsReaderWriter
):
def
__init__
(
self
,
ak
:
str
,
sk
:
str
,
endpoint_url
:
str
,
addressing_style
:
str
,
parent_path
:
str
):
def
__init__
(
self
,
ak
:
str
,
sk
:
str
,
endpoint_url
:
str
,
addressing_style
:
str
=
'auto'
,
parent_path
:
str
=
''
):
self
.
client
=
self
.
_get_client
(
ak
,
sk
,
endpoint_url
,
addressing_style
)
self
.
path
=
parent_path
...
...
magic_pdf/
io
/__init__.py
→
magic_pdf/
rw
/__init__.py
View file @
435ab922
File moved
magic_pdf/user_api.py
View file @
435ab922
...
...
@@ -14,7 +14,7 @@
"""
from
loguru
import
logger
from
magic_pdf.
io
import
AbsReaderWriter
from
magic_pdf.
rw
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment