Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
b16599cd
Commit
b16599cd
authored
Apr 22, 2024
by
许瑞
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: update cli
parent
be52bbe9
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
534 additions
and
240 deletions
+534
-240
magicpdf.py
magic_pdf/cli/magicpdf.py
+35
-20
pdf_parse_by_txt_v2.py
magic_pdf/pdf_parse_by_txt_v2.py
+214
-0
equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+284
-219
user_api.py
magic_pdf/user_api.py
+1
-1
No files found.
magic_pdf/cli/magicpdf.py
View file @
b16599cd
...
@@ -26,6 +26,8 @@ import json as json_parse
...
@@ -26,6 +26,8 @@ import json as json_parse
from
datetime
import
datetime
from
datetime
import
datetime
import
click
import
click
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
from
magic_pdf.libs.config_reader
import
get_s3_config
from
magic_pdf.libs.config_reader
import
get_s3_config
from
magic_pdf.libs.path_utils
import
(
from
magic_pdf.libs.path_utils
import
(
parse_s3path
,
parse_s3path
,
...
@@ -33,9 +35,9 @@ from magic_pdf.libs.path_utils import (
...
@@ -33,9 +35,9 @@ from magic_pdf.libs.path_utils import (
remove_non_official_s3_args
,
remove_non_official_s3_args
,
)
)
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.rw.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
,
MODE_TXT
from
magic_pdf.rw.S3ReaderWriter
import
S3ReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
...
@@ -53,24 +55,34 @@ def prepare_env():
...
@@ -53,24 +55,34 @@ def prepare_env():
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
uni_pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
if
parse_method
==
"auto"
:
jso_useful_key
=
{
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
"_pdf_type"
:
"txt"
,
elif
parse_method
==
"txt"
:
"model_list"
:
model_list
,
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
}
elif
parse_method
==
"ocr"
:
if
parse_method
==
"ocr"
:
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
jso_useful_key
[
"_pdf_type"
]
=
"ocr"
else
:
print
(
"unknow parse method"
)
uni_pipe
.
pipe_parse
()
os
.
exit
(
1
)
md_content
=
uni_pipe
.
pipe_mk_markdown
()
pipe
.
pipe_classify
()
pipe
.
pipe_parse
()
md_content
=
pipe
.
pipe_mk_markdown
()
part_file_name
=
datetime
.
now
()
.
strftime
(
"
%
H-
%
M-
%
S"
)
part_file_name
=
datetime
.
now
()
.
strftime
(
"
%
H-
%
M-
%
S"
)
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{part_file_name}.md"
,
mode
=
MODE_TXT
)
md_writer
.
write
(
md_writer
.
write
(
content
=
json_parse
.
dumps
(
content
=
md_content
,
path
=
f
"{part_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
uni_pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
)
),
md_writer
.
write
(
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"{part_file_name}.json"
,
path
=
f
"{part_file_name}.json"
,
mode
=
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
try
:
content_list
=
pipe
.
pipe_mk_uni_format
()
except
Exception
as
e
:
print
(
e
)
md_writer
.
write
(
str
(
content_list
),
f
"{part_file_name}.txt"
,
AbsReaderWriter
.
MODE_TXT
)
)
...
@@ -106,7 +118,10 @@ def json_command(json, method):
...
@@ -106,7 +118,10 @@ def json_command(json, method):
byte_start
,
byte_end
=
int
(
may_range_params
[
0
]),
int
(
may_range_params
[
1
])
byte_start
,
byte_end
=
int
(
may_range_params
[
0
]),
int
(
may_range_params
[
1
])
byte_end
+=
byte_start
-
1
byte_end
+=
byte_start
-
1
return
s3_rw
.
read_jsonl
(
return
s3_rw
.
read_jsonl
(
remove_non_official_s3_args
(
s3path
),
byte_start
,
byte_end
,
MODE_BIN
remove_non_official_s3_args
(
s3path
),
byte_start
,
byte_end
,
AbsReaderWriter
.
MODE_BIN
,
)
)
jso
=
json_parse
.
loads
(
read_s3_path
(
json
)
.
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_s3_path
(
json
)
.
decode
(
"utf-8"
))
...
@@ -119,7 +134,7 @@ def json_command(json, method):
...
@@ -119,7 +134,7 @@ def json_command(json, method):
_do_parse
(
_do_parse
(
pdf_data
,
pdf_data
,
jso
[
'doc_layout_result'
],
jso
[
"doc_layout_result"
],
method
,
method
,
local_image_rw
,
local_image_rw
,
local_md_rw
,
local_md_rw
,
...
@@ -148,7 +163,7 @@ def pdf_command(pdf, model, method):
...
@@ -148,7 +163,7 @@ def pdf_command(pdf, model, method):
def
read_fn
(
path
):
def
read_fn
(
path
):
disk_rw
=
DiskReaderWriter
(
os
.
path
.
dirname
(
path
))
disk_rw
=
DiskReaderWriter
(
os
.
path
.
dirname
(
path
))
return
disk_rw
.
read
(
os
.
path
.
basename
(
path
),
MODE_BIN
)
return
disk_rw
.
read
(
os
.
path
.
basename
(
path
),
AbsReaderWriter
.
MODE_BIN
)
pdf_data
=
read_fn
(
pdf
)
pdf_data
=
read_fn
(
pdf
)
jso
=
json_parse
.
loads
(
read_fn
(
model
)
.
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_fn
(
model
)
.
decode
(
"utf-8"
))
...
...
magic_pdf/pdf_parse_by_txt_v2.py
0 → 100644
View file @
b16599cd
import
time
from
loguru
import
logger
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component_v2
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
)
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.pre_proc.ocr_span_list_modify
import
(
remove_overlaps_min_spans
,
get_qa_need_list_v2
,
)
from
magic_pdf.pre_proc.equations_replace
import
(
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
)
from
magic_pdf.pre_proc.equations_replace
import
(
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
)
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
def txt_spans_extract(pdf_page, inline_equations, interline_equations):
    """Collect the text spans of one page as a flat list of span dicts.

    The page text is read twice through ``pdf_page.get_text`` (once as
    ``"dict"``, once as ``"rawdict"``), merged, and then passed through the
    equation-replacement, citation-marker-removal and char-removal helpers
    before being flattened.

    Args:
        pdf_page: Page object exposing ``get_text(kind, flags=...)``
            (presumably a PyMuPDF page -- confirm against the caller).
        inline_equations: Forwarded unchanged to
            ``replace_equations_in_textblock``.
        interline_equations: Forwarded unchanged to
            ``replace_equations_in_textblock``.

    Returns:
        A list of dicts with the keys ``"bbox"`` (a list copy of the span's
        bbox), ``"content"`` (the span text) and ``"type"`` (always
        ``ContentType.Text``), in block -> line -> span order.
    """
    extraction_flags = fitz.TEXTFLAGS_TEXT
    span_level_blocks = pdf_page.get_text("dict", flags=extraction_flags)["blocks"]
    char_level_blocks = pdf_page.get_text("rawdict", flags=extraction_flags)["blocks"]

    # Each helper takes the block list and returns the block list to use next.
    blocks = combine_chars_to_pymudict(span_level_blocks, char_level_blocks)
    blocks = replace_equations_in_textblock(
        blocks, inline_equations, interline_equations
    )
    blocks = remove_citation_marker(blocks)
    blocks = remove_chars_in_text_blocks(blocks)

    return [
        {
            "bbox": list(span["bbox"]),
            "content": span["text"],
            "type": ContentType.Text,
        }
        for block in blocks
        for line in block["lines"]
        for span in line["spans"]
    ]
def replace_text_span(pymu_spans, ocr_spans):
    """Swap the text spans of ``ocr_spans`` for ``pymu_spans``.

    Every span in ``ocr_spans`` whose ``"type"`` is not ``ContentType.Text``
    is kept, in its original order, and ``pymu_spans`` is appended after
    them.

    Args:
        pymu_spans: List of replacement text spans; concatenated as-is.
        ocr_spans: Iterable of span dicts, each carrying a ``"type"`` key.

    Returns:
        A new list: the non-text OCR spans followed by ``pymu_spans``.
    """
    non_text_spans = [
        span for span in ocr_spans if span["type"] != ContentType.Text
    ]
    return non_text_spans + pymu_spans
def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Parse a PDF page by page and return its per-page layout information.

    For every page in ``[start_page_id, end_page_id]`` the function gathers
    the block information exposed by ``MagicModel``, computes and sorts the
    layout, replaces the model's text spans with spans extracted directly
    from the PDF, crops images/tables, and assembles a page-info entry.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        model_list: Model output handed to ``MagicModel`` together with the
            opened document (schema not visible here).
        imageWriter: Writer object forwarded to ``ocr_cut_image_and_table``.
        start_page_id: Index of the first page to parse (inclusive).
        end_page_id: Index of the last page to parse (inclusive). ``None``
            means "up to the last page of the document". ``0`` is honoured
            and restricts parsing to the first page.
        debug_mode: When true, log a timing line for every page.

    Returns:
        ``{"pdf_info": [...]}`` where the list is produced by
        ``dict_to_list`` from the per-page entries keyed ``"page_<id>"``.
    """
    pdf_bytes_md5 = compute_md5(pdf_bytes)
    pdf_docs = fitz.open("pdf", pdf_bytes)

    # Start with an empty pdf_info_dict.
    pdf_info_dict = {}

    # Initialise magic_model from model_list and the opened document.
    magic_model = MagicModel(model_list, pdf_docs)

    # Resolve the page range to parse. Compare against None explicitly:
    # a plain truthiness test would treat end_page_id=0 as "not given" and
    # parse the whole document instead of only the first page.
    if end_page_id is None:
        end_page_id = len(pdf_docs) - 1

    # Reference timestamp for the per-page timing log.
    start_time = time.time()

    for page_id in range(start_page_id, end_page_id + 1):
        # In debug mode, log the time spent since the previous page started.
        if debug_mode:
            time_now = time.time()
            logger.info(
                f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
            )
            start_time = time_now

        # Fetch the block information needed further down from magic_model.
        img_blocks = magic_model.get_imgs(page_id)
        table_blocks = magic_model.get_tables(page_id)
        discarded_blocks = magic_model.get_discarded(page_id)
        text_blocks = magic_model.get_text_blocks(page_id)
        title_blocks = magic_model.get_title_blocks(page_id)
        inline_equations, interline_equations, interline_equation_blocks = (
            magic_model.get_equations(page_id)
        )
        page_w, page_h = magic_model.get_page_size(page_id)

        # Gather the bboxes of all blocks in one place.
        all_bboxes = ocr_prepare_bboxes_for_layout_split(
            img_blocks,
            table_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            interline_equation_blocks,
            page_w,
            page_h,
        )

        # Compute the layout from the block information.
        page_boundry = [0, 0, page_w, page_h]
        layout_bboxes, layout_tree = get_bboxes_layout(
            all_bboxes, page_boundry, page_id
        )

        # Sort every block kept on this page according to the layout order.
        sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)

        # Replace the text-type spans from the model with spans extracted
        # directly from the PDF page.
        ocr_spans = magic_model.get_all_spans(page_id)
        pymu_spans = txt_spans_extract(
            pdf_docs[page_id], inline_equations, interline_equations
        )
        spans = replace_text_span(pymu_spans, ocr_spans)

        # Among overlapping spans, drop the smaller ones. The list of dropped
        # spans is not used here.
        spans, _ = remove_overlaps_min_spans(spans)

        # Crop images and tables.
        spans = ocr_cut_image_and_table(
            spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter
        )

        # Fill the spans into the sorted blocks.
        block_with_spans = fill_spans_in_blocks(sorted_blocks, spans)

        # Apply the fix pass to the blocks.
        fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)

        # Collect the lists that QA needs exposed separately. This rebinds
        # interline_equations, which was already consumed above.
        images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)

        # Build this page's entry of pdf_info_dict.
        page_info = ocr_construct_page_component_v2(
            fix_blocks,
            layout_bboxes,
            page_id,
            page_w,
            page_h,
            layout_tree,
            images,
            tables,
            interline_equations,
            discarded_blocks,
        )
        pdf_info_dict[f"page_{page_id}"] = page_info

    # Paragraph splitting: the original has only a placeholder for this step.

    # Convert the dict into a list.
    pdf_info_list = dict_to_list(pdf_info_dict)
    new_pdf_info_dict = {
        "pdf_info": pdf_info_list,
    }
    return new_pdf_info_dict
if __name__ == "__main__":
    # Ad-hoc debugging driver: loads one hard-coded PDF and its model output,
    # prints the image blocks of the first seven pages, then runs the text
    # block pre-processing chain on every page (results are not stored).
    import fitz
    import json

    with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    pdf_docs = fitz.open("pdf", pdf_bytes)

    # Only the first line of the JSON file is parsed.
    with open("/opt/data/pdf/20240418/25536-00.json") as model_file:
        model_list = json.loads(model_file.readline())

    magic_model = MagicModel(model_list, pdf_docs)

    for i in range(7):
        print(magic_model.get_imgs(i))

    for page_no, page in enumerate(pdf_docs):
        inline_equations, interline_equations, interline_equation_blocks = (
            magic_model.get_equations(page_no)
        )

        text_flags = fitz.TEXTFLAGS_TEXT
        text_raw_blocks = page.get_text("dict", flags=text_flags)["blocks"]
        char_level_text_blocks = page.get_text("rawdict", flags=text_flags)["blocks"]

        text_blocks = combine_chars_to_pymudict(
            text_raw_blocks, char_level_text_blocks
        )
        text_blocks = replace_equations_in_textblock(
            text_blocks, inline_equations, interline_equations
        )
        text_blocks = remove_citation_marker(text_blocks)
        text_blocks = remove_chars_in_text_blocks(text_blocks)
magic_pdf/pre_proc/equations_replace.py
View file @
b16599cd
This diff is collapsed.
Click to expand it.
magic_pdf/user_api.py
View file @
b16599cd
...
@@ -16,7 +16,7 @@ from loguru import logger
...
@@ -16,7 +16,7 @@ from loguru import logger
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
from
magic_pdf.pdf_parse_by_txt
_v2
import
parse_pdf_by_txt
PARSE_TYPE_TXT
=
"txt"
PARSE_TYPE_TXT
=
"txt"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment