Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
26c23782
Commit
26c23782
authored
Mar 14, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr模式下content type 抽象
parent
b6f051d8
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
64 additions
and
48 deletions
+64
-48
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+7
-4
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+10
-8
ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+7
-0
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+12
-11
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+3
-2
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+4
-3
ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+21
-20
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
26c23782
from
magic_pdf.libs.ocr_content_type
import
ContentType
def
mk_nlp_markdown
(
pdf_info_dict
:
dict
):
markdown
=
[]
...
...
@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict):
if
not
span
.
get
(
'content'
):
continue
content
=
span
[
'content'
]
.
replace
(
'$'
,
'
\
$'
)
# 转义$
if
span
[
'type'
]
==
'inline_equation'
:
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
content
=
f
"${content}$"
elif
span
[
'type'
]
==
'displayed_equation'
:
elif
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
content
=
f
"$$
\n
{content}
\n
$$"
line_text
+=
content
+
' '
# 在行末添加两个空格以强制换行
...
...
@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict):
content
=
f
""
else
:
content
=
span
[
'content'
]
.
replace
(
'$'
,
'
\
$'
)
# 转义$
if
span
[
'type'
]
==
'inline_equation'
:
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
content
=
f
"${content}$"
elif
span
[
'type'
]
==
'displayed_equation'
:
elif
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
content
=
f
"$$
\n
{content}
\n
$$"
line_text
+=
content
+
' '
# 在行末添加两个空格以强制换行
...
...
magic_pdf/libs/draw_bbox.py
View file @
26c23782
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.ocr_content_type
import
ContentType
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
):
new_rgb
=
[]
...
...
@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
def draw_text_bbox(pdf_info_dict, input_path, out_path):
    """Outline text and equation spans on a PDF and save the annotated copy.

    Span bounding boxes are gathered page by page from ``pdf_info_dict``,
    drawn onto the pages of the PDF at ``input_path`` and the result is
    written to ``{out_path}/text.pdf``.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each page is
            read as ``page['preproc_blocks'] -> block['lines'] ->
            line['spans']``; every span must carry ``'type'`` and ``'bbox'``.
        input_path: Path of the PDF to open with PyMuPDF.
        out_path: Location prefix under which ``text.pdf`` is saved.
    """
    # One entry per page, in the iteration order of pdf_info_dict.
    # NOTE(review): this order is assumed to match the page order of the
    # opened document, since the lists are later indexed by page number.
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    for page in pdf_info_dict.values():
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        for block in page['preproc_blocks']:
            for line in block['lines']:
                for span in line['spans']:
                    if span['type'] == ContentType.Text:
                        page_text_list.append(span['bbox'])
                    elif span['type'] == ContentType.InlineEquation:
                        page_inline_equation_list.append(span['bbox'])
                    elif span['type'] == ContentType.InterlineEquation:
                        page_interline_equation_list.append(span['bbox'])
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)

    doc = fitz.open(input_path)
    try:
        for i, page in enumerate(doc):
            # Draw the boxes collected for the current page, one colour per
            # span type (passed as the rgb_config argument).
            draw_bbox_without_number(i, text_list, page, [255, 0, 0])
            draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
            draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])

        # Save the PDF
        doc.save(f"{out_path}/text.pdf")
    finally:
        # Release the document handle even if drawing or saving fails.
        doc.close()
magic_pdf/libs/ocr_content_type.py
0 → 100644
View file @
26c23782
class ContentType:
    """String tags for the kind of content an OCR span holds.

    The attributes are plain ``str`` constants, so they compare equal to the
    raw strings stored under a span's ``'type'`` key.
    """

    # Picture region.
    Image = "image"
    # Table region.
    Table = "table"
    # Ordinary recognised text.
    Text = "text"
    # Formula embedded within a line of text.
    InlineEquation = "inline_equation"
    # Formula set on its own between lines.
    InterlineEquation = "interline_equation"
magic_pdf/pdf_parse_by_ocr.py
View file @
26c23782
...
...
@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
get_docx_model_output
,
)
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
...
...
@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
'tables'
:
tables
,
'interline_equations'
:
interline_equations
,
'inline_equations'
:
inline_equations
,
'drop
p
ed_text_block'
:
dropped_text_block
,
'drop
p
ed_image_block'
:
dropped_image_block
,
'drop
p
ed_table_block'
:
dropped_table_block
,
'drop
p
ed_bboxes'
:
need_remove_spans_bboxes_dict
,
'droped_text_block'
:
dropped_text_block
,
'droped_image_block'
:
dropped_image_block
,
'droped_table_block'
:
dropped_table_block
,
'droped_bboxes'
:
need_remove_spans_bboxes_dict
,
}
return
return_dict
...
...
@@ -164,7 +165,7 @@ def parse_pdf_by_ocr(
# 1: 'image', # 图片
# 7: 'table', # 表格
# 13: 'inline_equation', # 行内公式
# 14: '
displayed
_equation', # 行间公式
# 14: '
interline
_equation', # 行间公式
# 15: 'text', # ocr识别文本
"""layout信息"""
# 11: 'full column', # 单栏
...
...
@@ -173,20 +174,20 @@ def parse_pdf_by_ocr(
"bbox"
:
bbox
,
}
if
category_id
==
1
:
span
[
"type"
]
=
"image"
span
[
"type"
]
=
ContentType
.
Image
elif
category_id
==
7
:
span
[
"type"
]
=
"table"
span
[
"type"
]
=
ContentType
.
Table
elif
category_id
==
13
:
span
[
"content"
]
=
layout_det
[
"latex"
]
span
[
"type"
]
=
"inline_equation"
span
[
"type"
]
=
ContentType
.
InlineEquation
elif
category_id
==
14
:
span
[
"content"
]
=
layout_det
[
"latex"
]
span
[
"type"
]
=
"displayed_equation"
span
[
"type"
]
=
ContentType
.
InterlineEquation
elif
category_id
==
15
:
span
[
"content"
]
=
layout_det
[
"text"
]
span
[
"type"
]
=
"text"
span
[
"type"
]
=
ContentType
.
Text
# print(span)
spans
.
append
(
span
)
else
:
...
...
@@ -213,7 +214,7 @@ def parse_pdf_by_ocr(
# bbox去除粘连
spans
=
remove_overlap_between_bbox
(
spans
)
# 对tpye=["
displayed
_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
# 对type=["
interline
_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
spans
=
adjust_bbox_for_standalone_block
(
spans
)
...
...
magic_pdf/pre_proc/ocr_cut_image.py
View file @
26c23782
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
...
...
@@ -11,9 +12,9 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for
span
in
spans
:
span_type
=
span
[
'type'
]
if
span_type
==
'image'
:
if
span_type
==
ContentType
.
Image
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'images'
))
elif
span_type
==
'table'
:
elif
span_type
==
ContentType
.
Table
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'tables'
))
return
spans
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
26c23782
...
...
@@ -2,6 +2,7 @@ from loguru import logger
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
get_minbox_if_overlap_by_ratio
,
\
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.ocr_content_type
import
ContentType
# 将每一个line中的span从左到右排序
...
...
@@ -29,10 +30,10 @@ def merge_spans_to_line(spans):
lines
=
[]
current_line
=
[
spans
[
0
]]
for
span
in
spans
[
1
:]:
# 如果当前的span类型为"
displayed_equation" 或者 当前行中已经有"displayed
_equation"
# 如果当前的span类型为"
interline_equation" 或者 当前行中已经有"interline
_equation"
# image和table类型,同上
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
# 则开始新行
lines
.
append
(
current_line
)
current_line
=
[
span
]
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
26c23782
...
...
@@ -2,6 +2,7 @@ from loguru import logger
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
,
\
__is_overlaps_y_exceeds_threshold
from
magic_pdf.libs.ocr_content_type
import
ContentType
def
remove_overlaps_min_spans
(
spans
):
...
...
@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
for
span
in
need_remove_spans
:
spans
.
remove
(
span
)
span
[
'tag'
]
=
drop_tag
if
span
[
'type'
]
in
[
'text'
,
'inline_equation'
,
'displayed_equation'
]:
if
span
[
'type'
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
dropped_text_block
.
append
(
span
)
elif
span
[
'type'
]
==
'image'
:
elif
span
[
'type'
]
==
ContentType
.
Image
:
dropped_image_block
.
append
(
span
)
elif
span
[
'type'
]
==
'table'
:
elif
span
[
'type'
]
==
ContentType
.
Table
:
dropped_table_block
.
append
(
span
)
return
spans
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
def
adjust_bbox_for_standalone_block
(
spans
):
# 对tpye=["
displayed
_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
# 对type=["
interline
_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
for
sb_span
in
spans
:
if
sb_span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
if
sb_span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
for
text_span
in
spans
:
if
text_span
[
'type'
]
in
[
'text'
,
'inline_equation'
]:
if
text_span
[
'type'
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
# 判断span2的纵向高度是否被span所覆盖
if
sb_span
[
'bbox'
][
1
]
<
text_span
[
'bbox'
][
1
]
and
sb_span
[
'bbox'
][
3
]
>
text_span
[
'bbox'
][
3
]:
# 判断span2是否在span左边
...
...
@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
lines
=
[]
current_line
=
[
spans
[
0
]]
if
spans
[
0
][
"type"
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
if
spans
[
0
][
"type"
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
displayed_list
.
append
(
spans
[
0
])
line_first_y0
=
spans
[
0
][
"bbox"
][
1
]
...
...
@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
for
span
in
spans
[
1
:]:
# if span.get("content","") == "78.":
# print("debug")
# 如果当前的span类型为"
displayed_equation" 或者 当前行中已经有"displayed
_equation"
# 如果当前的span类型为"
interline_equation" 或者 当前行中已经有"interline
_equation"
# image和table类型,同上
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
# 传入
if
span
[
"type"
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
if
span
[
"type"
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
displayed_list
.
append
(
span
)
# 则开始新行
lines
.
append
(
current_line
)
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
"text"
,
"inline_equation"
]:
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
current_line
=
[
span
]
line_first_y0
=
span
[
"bbox"
][
1
]
...
...
@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# 添加最后一行
if
current_line
:
lines
.
append
(
current_line
)
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
"text"
,
"inline_equation"
]:
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
for
line
in
text_inline_lines
:
# 按照x0坐标排序
...
...
@@ -159,10 +160,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
# 调整公式类型
if
span
[
"type"
]
==
"displayed_equation"
:
if
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
# 最后一行是行间公式
if
j
+
1
>=
len
(
text_inline_lines
):
span
[
"type"
]
=
"inline_equation"
span
[
"type"
]
=
ContentType
.
InlineEquation
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
else
:
...
...
@@ -170,7 +171,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
y0_next
,
y1_next
=
text_inline_lines
[
j
+
1
][
1
]
if
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0_next
,
0
,
y1_next
))
and
3
*
(
y1
-
y0
)
>
span_y
-
span_y0
:
span
[
"type"
]
=
"inline_equation"
span
[
"type"
]
=
ContentType
.
InlineEquation
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
break
...
...
@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
for
block
in
blocks
:
for
line
in
block
[
"lines"
]:
for
span
in
line
[
"spans"
]:
if
span
[
"type"
]
==
"image"
:
if
span
[
"type"
]
==
ContentType
.
Image
:
images
.
append
(
span
)
elif
span
[
"type"
]
==
"table"
:
elif
span
[
"type"
]
==
ContentType
.
Table
:
tables
.
append
(
span
)
elif
span
[
"type"
]
==
"inline_equation"
:
elif
span
[
"type"
]
==
ContentType
.
InlineEquation
:
inline_equations
.
append
(
span
)
elif
span
[
"type"
]
==
"displayed_equation"
:
elif
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
interline_equations
.
append
(
span
)
else
:
continue
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment