Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
0746daf9
Unverified
Commit
0746daf9
authored
Apr 16, 2024
by
drunkpig
Committed by
GitHub
Apr 16, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #23 from myhloli/master
fix 参数名称
parents
d867304f
d438b97a
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
88 additions
and
94 deletions
+88
-94
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+14
-12
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+8
-48
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+6
-7
pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+2
-3
pdf_parse_for_train.py
magic_pdf/pdf_parse_for_train.py
+2
-2
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+4
-4
cut_image.py
magic_pdf/pre_proc/cut_image.py
+52
-0
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+0
-18
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
0746daf9
...
...
@@ -17,10 +17,10 @@ def split_long_words(text):
return
' '
.
join
(
segments
)
def
ocr_mk_nlp_markdown
(
pdf_info_dict
:
dic
t
):
def
ocr_mk_nlp_markdown
(
pdf_info_dict
:
lis
t
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
()
:
for
page_info
in
pdf_info_dict
:
blocks
=
page_info
.
get
(
"preproc_blocks"
)
if
not
blocks
:
continue
...
...
@@ -41,10 +41,10 @@ def ocr_mk_nlp_markdown(pdf_info_dict: dict):
return
'
\n
'
.
join
(
markdown
)
def
ocr_mk_mm_markdown
(
pdf_info_dict
:
dic
t
):
def
ocr_mk_mm_markdown
(
pdf_info_dict
:
lis
t
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
()
:
for
page_info
in
pdf_info_dict
:
blocks
=
page_info
.
get
(
"preproc_blocks"
)
if
not
blocks
:
continue
...
...
@@ -78,17 +78,18 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
return
'
\n\n
'
.
join
(
markdown
)
def
ocr_mk_nlp_markdown_with_para
(
pdf_info_dict
:
dic
t
):
def
ocr_mk_nlp_markdown_with_para
(
pdf_info_dict
:
lis
t
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
()
:
for
page_info
in
pdf_info_dict
:
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
"nlp"
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
def
ocr_mk_mm_markdown_with_para_and_pagination
(
pdf_info_dict
:
dic
t
):
def
ocr_mk_mm_markdown_with_para_and_pagination
(
pdf_info_dict
:
lis
t
):
markdown_with_para_and_pagination
=
[]
for
page_no
,
page_info
in
pdf_info_dict
.
items
():
page_no
=
0
for
page_info
in
pdf_info_dict
:
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
if
not
paras_of_layout
:
continue
...
...
@@ -97,6 +98,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
'page_no'
:
page_no
,
'md_content'
:
'
\n\n
'
.
join
(
page_markdown
)
})
page_no
+=
1
return
markdown_with_para_and_pagination
...
...
@@ -171,9 +173,9 @@ def para_to_standard_format(para, img_buket_path):
}
return
para_content
def
make_standard_format_with_para
(
pdf_info_dict
:
dic
t
,
img_buket_path
:
str
):
def
make_standard_format_with_para
(
pdf_info_dict
:
lis
t
,
img_buket_path
:
str
):
content_list
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
()
:
for
page_info
in
pdf_info_dict
:
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
if
not
paras_of_layout
:
continue
...
...
@@ -227,7 +229,7 @@ def line_to_standard_format(line, img_buket_path):
return
content
def
ocr_mk_mm_standard_format
(
pdf_info_dict
:
dic
t
):
def
ocr_mk_mm_standard_format
(
pdf_info_dict
:
lis
t
):
"""
content_list
type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
...
...
@@ -237,7 +239,7 @@ def ocr_mk_mm_standard_format(pdf_info_dict: dict):
img_path string s3://full/path/to/img.jpg
"""
content_list
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
()
:
for
page_info
in
pdf_info_dict
:
blocks
=
page_info
.
get
(
"preproc_blocks"
)
if
not
blocks
:
continue
...
...
magic_pdf/libs/pdf_image_tools.py
View file @
0746daf9
from
loguru
import
logger
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.io
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
fitz
from
loguru
import
logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
:
AbsReaderWriter
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
:
AbsReaderWriter
):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
...
...
@@ -20,6 +20,10 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
# 新版本生成平铺路径
img_hash256_path
=
f
"{compute_sha256(img_path)}.jpg"
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"image_bboxes: 错误的box, {bbox}"
)
return
img_hash256_path
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
# 配置缩放倍数为3倍
...
...
@@ -29,50 +33,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
imageWriter
.
write
(
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
imageWriter
.
write
(
content
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
imageWriter
.
write
(
byte_data
,
img_hash256_path
,
AbsReaderWriter
.
MODE_BIN
)
return
img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
                          image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                          equation_inline_bboxes: list,
                          equation_interline_bboxes: list, imageWriter) -> tuple:
    """
    Crop the image, backup-image and table regions of one page with ``cut_image``.

    Args:
        page_num: page index, forwarded unchanged to ``cut_image``.
        page: the page object, forwarded unchanged to ``cut_image``.
        pdf_bytes_md5: first component of the category path built with
            ``join_path(pdf_bytes_md5, category)``.
        image_bboxes: bboxes cropped under the ``"images"`` category.
        images_overlap_backup: bboxes cropped under the ``"images"`` category
            and reported separately from ``image_bboxes``.
        table_bboxes: bboxes cropped under the ``"tables"`` category.
        equation_inline_bboxes: accepted for interface compatibility; not read.
        equation_interline_bboxes: accepted for interface compatibility; not read.
        imageWriter: writer object, forwarded unchanged to ``cut_image``.

    Returns:
        A 5-tuple ``(image_info, image_backup_info, table_info,
        inline_eq_info, interline_eq_info)``. The first three are lists of
        ``{"bbox": bbox, "image_path": path}`` dicts in input order, with
        degenerate boxes left out. The last two are always empty lists,
        because equations are not cropped here.
    """

    def crop_valid(bboxes, category, label):
        # One shared implementation for the three bbox groups; `label` only
        # selects the prefix of the warning message.
        infos = []
        for bbox in bboxes:
            # Skip boxes whose left/top edge is not strictly before the
            # right/bottom edge: they enclose no area.
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                logger.warning(f"{label}: 错误的box, {bbox}")
                continue
            image_path = cut_image(bbox, page_num, page, join_path(pdf_bytes_md5, category), imageWriter)
            infos.append({"bbox": bbox, "image_path": image_path})
        return infos

    image_info = crop_valid(image_bboxes, "images", "image_bboxes")
    image_backup_info = crop_valid(images_overlap_backup, "images", "images_overlap_backup")
    table_info = crop_valid(table_bboxes, "tables", "table_bboxes")

    # Equation crops are not produced by this function; the slots are kept so
    # callers can keep unpacking five values.
    inline_eq_info = []
    interline_eq_info = []

    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
magic_pdf/pdf_parse_by_ocr.py
View file @
0746daf9
...
...
@@ -16,7 +16,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_header
import
parse_headers
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.
ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.
cut_image
import
ocr_
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
merge_spans_to_line_by_layout
,
merge_lines_to_block
,
...
...
@@ -27,7 +27,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
def
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_model_output
,
...
...
@@ -148,7 +147,7 @@ def parse_pdf_by_ocr(
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
'''对image和table截图'''
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
)
spans
=
ocr_
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
...
...
magic_pdf/pdf_parse_by_txt.py
View file @
0746daf9
...
...
@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5
from
magic_pdf.libs.markdown_utils
import
escape_special_markdown_char
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
magic_pdf.pre_proc.cut_image
import
txt_save_images_by_bboxes
from
magic_pdf.pre_proc.detect_images
import
parse_images
from
magic_pdf.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
...
...
@@ -48,8 +49,6 @@ from para.exceptions import (
)
'''
from
magic_pdf.libs.commons
import
read_file
,
join_path
from
magic_pdf.libs.pdf_image_tools
import
save_images_by_bboxes
from
magic_pdf.post_proc.remove_footnote
import
merge_footnote_blocks
,
remove_footnote_blocks
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
...
...
@@ -194,7 +193,7 @@ def parse_pdf_by_txt(
"""
# 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
=
save_images_by_bboxes
(
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
=
txt_
save_images_by_bboxes
(
page_id
,
page
,
pdf_bytes_md5
,
...
...
magic_pdf/pdf_parse_for_train.py
View file @
0746daf9
...
...
@@ -26,6 +26,7 @@ from magic_pdf.libs.drop_reason import DropReason
from
magic_pdf.libs.markdown_utils
import
escape_special_markdown_char
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
magic_pdf.pre_proc.cut_image
import
txt_save_images_by_bboxes
from
magic_pdf.pre_proc.detect_images
import
parse_images
from
magic_pdf.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
...
...
@@ -62,7 +63,6 @@ from para.exceptions import (
"""
from
magic_pdf.libs.commons
import
read_file
,
join_path
from
magic_pdf.libs.pdf_image_tools
import
save_images_by_bboxes
from
magic_pdf.post_proc.remove_footnote
import
(
merge_footnote_blocks
,
remove_footnote_blocks
,
...
...
@@ -323,7 +323,7 @@ def parse_pdf_for_train(
# 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
=
(
save_images_by_bboxes
(
txt_
save_images_by_bboxes
(
book_name
,
page_id
,
page
,
...
...
magic_pdf/pipe/UNIPipe.py
View file @
0746daf9
...
...
@@ -105,8 +105,8 @@ if __name__ == '__main__':
pdf_file_path
=
r"linshixuqiu\25536-00.pdf"
model_file_path
=
r"linshixuqiu\25536-00.json"
pdf_bytes
=
drw
.
read
(
p
ath
=
pdf_file_path
,
mode
=
AbsReaderWriter
.
MODE_BIN
)
model_json_txt
=
drw
.
read
(
path
=
model_file_path
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
pdf_bytes
=
drw
.
read
(
p
df_file_path
,
AbsReaderWriter
.
MODE_BIN
)
model_json_txt
=
drw
.
read
(
model_file_path
,
AbsReaderWriter
.
MODE_TXT
)
pdf_type
=
UNIPipe
.
classify
(
pdf_bytes
)
logger
.
info
(
f
"pdf_type is {pdf_type}"
)
...
...
@@ -122,5 +122,5 @@ if __name__ == '__main__':
md_content
=
pipe
.
mk_markdown
(
pdf_mid_data
,
"imgs"
)
md_writer
=
DiskReaderWriter
(
write_path
)
md_writer
.
write
(
content
=
md_content
,
path
=
"25536-00.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
content
=
json
.
dumps
(
JsonCompressor
.
decompress_json
(
pdf_mid_data
),
ensure_ascii
=
False
,
indent
=
4
),
path
=
"25536-00.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
md_content
,
"25536-00.md"
,
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
json
.
dumps
(
JsonCompressor
.
decompress_json
(
pdf_mid_data
),
ensure_ascii
=
False
,
indent
=
4
),
"25536-00.json"
,
AbsReaderWriter
.
MODE_TXT
)
magic_pdf/pre_proc/cut_image.py
0 → 100644
View file @
0746daf9
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """
    Attach an ``image_path`` to every image and table span of a page.

    Each span whose ``'type'`` equals ``ContentType.Image`` or
    ``ContentType.Table`` is cropped through ``cut_image`` and the value
    returned by ``cut_image`` is stored under ``span['image_path']``.
    Spans of any other type are left untouched.

    Args:
        spans: iterable of span dicts; each must have a ``'type'`` key, and
            image/table spans must also have a ``'bbox'`` key.
        page: page object, forwarded unchanged to ``cut_image``.
        page_id: page index, forwarded unchanged to ``cut_image``.
        pdf_bytes_md5: first component of the category path built with
            ``join_path(pdf_bytes_md5, category)``.
        imageWriter: writer object, forwarded unchanged to ``cut_image``.

    Returns:
        The same ``spans`` object that was passed in (spans are modified in place).
    """
    for span in spans:
        span_type = span['type']

        # Pick the storage category; anything that is neither an image nor a
        # table needs no crop.
        if span_type == ContentType.Image:
            category = 'images'
        elif span_type == ContentType.Table:
            category = 'tables'
        else:
            continue

        span['image_path'] = cut_image(
            span['bbox'],
            page_id,
            page,
            return_path=join_path(pdf_bytes_md5, category),
            imageWriter=imageWriter,
        )
    return spans
def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
                              image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                              equation_inline_bboxes: list,
                              equation_interline_bboxes: list, imageWriter) -> tuple:
    """
    Crop the image, backup-image and table regions of one page with ``cut_image``.

    Args:
        page_num: page index, forwarded unchanged to ``cut_image``.
        page: page object, forwarded unchanged to ``cut_image``.
        pdf_bytes_md5: first component of the category path built with
            ``join_path(pdf_bytes_md5, category)``.
        image_bboxes: bboxes cropped under the ``"images"`` category.
        images_overlap_backup: bboxes cropped under the ``"images"`` category
            and reported separately from ``image_bboxes``.
        table_bboxes: bboxes cropped under the ``"tables"`` category.
        equation_inline_bboxes: accepted for interface compatibility; not read.
        equation_interline_bboxes: accepted for interface compatibility; not read.
        imageWriter: writer object, forwarded unchanged to ``cut_image``.

    Returns:
        A 5-tuple ``(image_info, image_backup_info, table_info,
        inline_eq_info, interline_eq_info)``. The first three are lists of
        ``{"bbox": bbox, "image_path": path}`` dicts, one per input bbox and
        in input order. The last two are always empty lists, because
        equations are not cropped here.
    """

    def crop_all(bboxes, category):
        # No bbox validation happens at this level; every bbox is handed to
        # cut_image and whatever it returns is recorded as the image path.
        infos = []
        for bbox in bboxes:
            image_path = cut_image(bbox, page_num, page, join_path(pdf_bytes_md5, category), imageWriter)
            infos.append({"bbox": bbox, "image_path": image_path})
        return infos

    image_info = crop_all(image_bboxes, "images")
    image_backup_info = crop_all(images_overlap_backup, "images")
    table_info = crop_all(table_bboxes, "tables")

    # Equation crops are not produced by this function; the slots are kept so
    # callers can keep unpacking five values.
    inline_eq_info = []
    interline_eq_info = []

    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
magic_pdf/pre_proc/ocr_cut_image.py
deleted
100644 → 0
View file @
d867304f
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """
    Attach an ``image_path`` to every image and table span of a page.

    Each span whose ``'type'`` equals ``ContentType.Image`` or
    ``ContentType.Table`` is cropped through ``cut_image`` and the value
    returned by ``cut_image`` is stored under ``span['image_path']``.
    Spans of any other type are left untouched.

    Args:
        spans: iterable of span dicts; each must have a ``'type'`` key, and
            image/table spans must also have a ``'bbox'`` key.
        page: page object, forwarded unchanged to ``cut_image``.
        page_id: page index, forwarded unchanged to ``cut_image``.
        pdf_bytes_md5: first component of the category path built with
            ``join_path(pdf_bytes_md5, category)``.
        imageWriter: writer object, forwarded unchanged to ``cut_image``.

    Returns:
        The same ``spans`` object that was passed in (spans are modified in place).
    """
    for span in spans:
        span_type = span['type']

        # Pick the storage category; anything that is neither an image nor a
        # table needs no crop.
        if span_type == ContentType.Image:
            category = 'images'
        elif span_type == ContentType.Table:
            category = 'tables'
        else:
            continue

        span['image_path'] = cut_image(
            span['bbox'],
            page_id,
            page,
            return_path=join_path(pdf_bytes_md5, category),
            imageWriter=imageWriter,
        )
    return spans
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment