Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d47e7b82
Unverified
Commit
d47e7b82
authored
May 06, 2024
by
Kaiwen Liu
Committed by
GitHub
May 06, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'magicpdf:master' into master
parents
82489929
7f51d099
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
39 additions
and
18 deletions
+39
-18
demo_commons.bak
demo/demo_commons.bak
+0
-0
download.bak
demo/download.bak
+0
-0
ocr_demo.bak
demo/ocr_demo.bak
+0
-0
pdf2md.bak
demo/pdf2md.bak
+0
-0
s3pdf2md.bak
demo/s3pdf2md.bak
+0
-0
text_demo.bak
demo/text_demo.bak
+0
-0
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+12
-8
ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+1
-0
pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+11
-6
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+4
-1
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+11
-3
No files found.
demo/demo_commons.
py
→
demo/demo_commons.
bak
View file @
d47e7b82
File moved
demo/download.
py
→
demo/download.
bak
View file @
d47e7b82
File moved
demo/ocr_demo.
py
→
demo/ocr_demo.
bak
View file @
d47e7b82
File moved
demo/pdf2md.
py
→
demo/pdf2md.
bak
View file @
d47e7b82
File moved
demo/s3pdf2md.
py
→
demo/s3pdf2md.
bak
View file @
d47e7b82
File moved
demo/text_demo.
py
→
demo/text_demo.
bak
View file @
d47e7b82
File moved
magic_pdf/libs/draw_bbox.py
View file @
d47e7b82
...
@@ -65,14 +65,8 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
...
@@ -65,14 +65,8 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
def
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
def
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
layout_bbox_list
=
[]
layout_bbox_list
=
[]
blocks_bbox_list
=
[]
dropped_bbox_list
=
[]
dropped_bbox_list
=
[]
tables_list
,
tables_body_list
,
tables_caption_list
,
tables_footnote_list
=
(
tables_list
,
tables_body_list
,
tables_caption_list
,
tables_footnote_list
=
[],
[],
[],
[]
[],
[],
[],
[],
)
imgs_list
,
imgs_body_list
,
imgs_caption_list
=
[],
[],
[]
imgs_list
,
imgs_body_list
,
imgs_caption_list
=
[],
[],
[]
titles_list
=
[]
titles_list
=
[]
texts_list
=
[]
texts_list
=
[]
...
@@ -80,7 +74,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -80,7 +74,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_layout_list
=
[]
page_layout_list
=
[]
page_dropped_list
=
[]
page_dropped_list
=
[]
page_blocks_bbox_list
=
[]
tables
,
tables_body
,
tables_caption
,
tables_footnote
=
[],
[],
[],
[]
tables
,
tables_body
,
tables_caption
,
tables_footnote
=
[],
[],
[],
[]
imgs
,
imgs_body
,
imgs_caption
=
[],
[],
[]
imgs
,
imgs_body
,
imgs_caption
=
[],
[],
[]
titles
=
[]
titles
=
[]
...
@@ -154,12 +147,22 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -154,12 +147,22 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
interline_equation_list
=
[]
interline_equation_list
=
[]
image_list
=
[]
image_list
=
[]
table_list
=
[]
table_list
=
[]
dropped_list
=
[]
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_text_list
=
[]
page_text_list
=
[]
page_inline_equation_list
=
[]
page_inline_equation_list
=
[]
page_interline_equation_list
=
[]
page_interline_equation_list
=
[]
page_image_list
=
[]
page_image_list
=
[]
page_table_list
=
[]
page_table_list
=
[]
page_dropped_list
=
[]
# 构造dropped_list
for
block
in
page
[
"discarded_blocks"
]:
if
block
[
"type"
]
==
BlockType
.
Discarded
:
for
line
in
block
[
"lines"
]:
for
span
in
line
[
"spans"
]:
page_dropped_list
.
append
(
span
[
"bbox"
])
dropped_list
.
append
(
page_dropped_list
)
# 构造其余useful_list
for
block
in
page
[
"para_blocks"
]:
for
block
in
page
[
"para_blocks"
]:
if
block
[
"type"
]
in
[
if
block
[
"type"
]
in
[
BlockType
.
Text
,
BlockType
.
Text
,
...
@@ -205,6 +208,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -205,6 +208,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
draw_bbox_without_number
(
i
,
interline_equation_list
,
page
,
[
0
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
interline_equation_list
,
page
,
[
0
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
image_list
,
page
,
[
255
,
204
,
0
],
False
)
draw_bbox_without_number
(
i
,
image_list
,
page
,
[
255
,
204
,
0
],
False
)
draw_bbox_without_number
(
i
,
table_list
,
page
,
[
204
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
table_list
,
page
,
[
204
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
dropped_list
,
page
,
[
158
,
158
,
158
],
False
)
# Save the PDF
# Save the PDF
pdf_docs
.
save
(
f
"{out_path}/spans.pdf"
)
pdf_docs
.
save
(
f
"{out_path}/spans.pdf"
)
magic_pdf/libs/ocr_content_type.py
View file @
d47e7b82
...
@@ -17,4 +17,5 @@ class BlockType:
...
@@ -17,4 +17,5 @@ class BlockType:
Title
=
"title"
Title
=
"title"
InterlineEquation
=
"interline_equation"
InterlineEquation
=
"interline_equation"
Footnote
=
"footnote"
Footnote
=
"footnote"
Discarded
=
"discarded"
magic_pdf/pdf_parse_union_core.py
View file @
d47e7b82
...
@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
...
@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from
magic_pdf.pre_proc.equations_replace
import
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
\
from
magic_pdf.pre_proc.equations_replace
import
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
\
combine_chars_to_pymudict
combine_chars_to_pymudict
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
\
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
...
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''将所有区块的bbox整理到一起'''
'''将所有区块的bbox整理到一起'''
all_bboxes
=
ocr_prepare_bboxes_for_layout_split
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
interline_equations
,
page_w
,
page_h
)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
fix_discarded_blocks
=
fix_discarded_block
(
discarded_block_with_spans
)
'''如果当前页面没有bbox则跳过'''
'''如果当前页面没有bbox则跳过'''
if
len
(
all_bboxes
)
==
0
:
if
len
(
all_bboxes
)
==
0
:
logger
.
warning
(
f
"skip this page, not found bbox, page_id: {page_id}"
)
logger
.
warning
(
f
"skip this page, not found
useful
bbox, page_id: {page_id}"
)
return
ocr_construct_page_component_v2
([],
[],
page_id
,
page_w
,
page_h
,
[],
return
ocr_construct_page_component_v2
([],
[],
page_id
,
page_w
,
page_h
,
[],
[],
[],
interline_equations
,
discarded_blocks
,
[],
[],
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
need_drop
,
drop_reason
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
...
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
'''将span填入排好序的blocks中'''
'''将span填入排好序的blocks中'''
block_with_spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
)
block_with_spans
,
spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
,
0.6
)
'''对block进行fix操作'''
'''对block进行fix操作'''
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
...
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''构造pdf_info_dict'''
'''构造pdf_info_dict'''
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
discarded_blocks
,
images
,
tables
,
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
need_drop
,
drop_reason
)
return
page_info
return
page_info
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
d47e7b82
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
all_bboxes
=
[]
all_bboxes
=
[]
all_discarded_blocks
=
[]
for
image
in
img_blocks
:
for
image
in
img_blocks
:
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
])
...
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的(限定footnote)'''
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的(限定footnote)'''
for
discarded
in
discarded_blocks
:
for
discarded
in
discarded_blocks
:
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
all_discarded_blocks
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Discarded
,
None
,
None
,
None
,
None
])
# 将footnote加入到all_bboxes中,用来计算layout
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
return
all_bboxes
return
all_bboxes
,
all_discarded_blocks
def
fix_text_overlap_title_blocks
(
all_bboxes
):
def
fix_text_overlap_title_blocks
(
all_bboxes
):
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
d47e7b82
...
@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
...
@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
return
sort_blocks
return
sort_blocks
def
fill_spans_in_blocks
(
blocks
,
spans
):
def
fill_spans_in_blocks
(
blocks
,
spans
,
radio
):
'''
'''
将allspans中的span按位置关系,放入blocks中
将allspans中的span按位置关系,放入blocks中
'''
'''
...
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
block_spans
=
[]
block_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
span_bbox
=
span
[
'bbox'
]
span_bbox
=
span
[
'bbox'
]
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.6
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
radio
:
block_spans
.
append
(
span
)
block_spans
.
append
(
span
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
...
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
for
span
in
block_spans
:
for
span
in
block_spans
:
spans
.
remove
(
span
)
spans
.
remove
(
span
)
return
block_with_spans
return
block_with_spans
,
spans
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
...
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return
fix_blocks
return
fix_blocks
def fix_discarded_block(discarded_block_with_spans):
    """Run ``fix_text_block`` over every discarded block.

    Args:
        discarded_block_with_spans: iterable of discarded blocks to process.

    Returns:
        A new list holding the result of ``fix_text_block`` for each input
        block, in the same order as the input.
    """
    return [fix_text_block(raw_block) for raw_block in discarded_block_with_spans]
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
block_spans
=
[]
block_spans
=
[]
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment