Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d5ea44f9
Commit
d5ea44f9
authored
Mar 16, 2024
by
xuchao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
按照统一格式组合文本型pdf的解析结果
parent
f5bfaaf6
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
223 additions
and
105 deletions
+223
-105
demo_test.py
demo/demo_test.py
+11
-5
mkcontent.py
magic_pdf/dict2md/mkcontent.py
+185
-85
drop_tag.py
magic_pdf/libs/drop_tag.py
+8
-1
pipeline.py
magic_pdf/pipeline.py
+5
-4
equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+3
-2
remove_footer_header.py
magic_pdf/pre_proc/remove_footer_header.py
+4
-3
remove_rotate_bbox.py
magic_pdf/pre_proc/remove_rotate_bbox.py
+4
-3
resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+3
-2
No files found.
demo/demo_test.py
View file @
d5ea44f9
...
@@ -5,6 +5,7 @@ from pathlib import Path
...
@@ -5,6 +5,7 @@ from pathlib import Path
import
click
import
click
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf.pipeline
import
(
from
magic_pdf.pipeline
import
(
meta_scan
,
meta_scan
,
classify_by_type
,
classify_by_type
,
...
@@ -55,14 +56,19 @@ def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
...
@@ -55,14 +56,19 @@ def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
write_json_to_local
(
jso
,
book_name
)
write_json_to_local
(
jso
,
book_name
)
jso_md
=
pdf_intermediate_dict_to_markdown
(
jso
,
debug_mode
=
debug_mode
)
jso_md
=
pdf_intermediate_dict_to_markdown
(
jso
,
debug_mode
=
debug_mode
)
md_content
=
jso_md
.
get
(
"content"
)
content
=
jso_md
.
get
(
"content_list"
)
markdown_content
=
mk_mm_markdown
(
content
)
if
book_name
is
not
None
:
if
book_name
is
not
None
:
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
,
"md"
,
book_name
)
markdown_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
+
".md"
)
uni_format_save_path
=
join_path
(
save_tmp_path
,
"book"
+
".json"
)
markdown_save_path
=
join_path
(
save_tmp_path
,
"book"
+
".md"
)
with
open
(
uni_format_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
json
.
dumps
(
content
,
ensure_ascii
=
False
,
indent
=
4
))
with
open
(
markdown_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
markdown_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
md_content
)
f
.
write
(
markdown_content
)
else
:
else
:
logger
.
info
(
md_content
)
logger
.
info
(
json
.
dumps
(
content
,
ensure_ascii
=
False
)
)
def
demo_save_tables
(
book_name
=
None
,
start_page_id
=
0
,
debug_mode
=
True
):
def
demo_save_tables
(
book_name
=
None
,
start_page_id
=
0
,
debug_mode
=
True
):
...
...
magic_pdf/dict2md/mkcontent.py
View file @
d5ea44f9
...
@@ -2,9 +2,15 @@ import math
...
@@ -2,9 +2,15 @@ import math
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
from
magic_pdf.libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
from
magic_pdf.libs.ocr_content_type
import
ContentType
TYPE_INLINE_EQUATION
=
ContentType
.
InlineEquation
TYPE_INTERLINE_EQUATION
=
ContentType
.
InterlineEquation
UNI_FORMAT_TEXT_TYPE
=
[
'text'
,
'h1'
,
'h2'
,
'h3'
,
'h4'
,
'h5'
,
'h6'
]
def
mk_nlp_markdown
(
para_dict
:
dict
):
@
DeprecationWarning
def
mk_nlp_markdown_1
(
para_dict
:
dict
):
"""
"""
对排序后的bboxes拼接内容
对排序后的bboxes拼接内容
"""
"""
...
@@ -69,8 +75,8 @@ def __insert_before(content, image_content, target):
...
@@ -69,8 +75,8 @@ def __insert_before(content, image_content, target):
return
content
return
content
@
DeprecationWarning
def
mk_mm_markdown
(
para_dict
:
dict
):
def
mk_mm_markdown
_1
(
para_dict
:
dict
):
"""拼装多模态markdown"""
"""拼装多模态markdown"""
content_lst
=
[]
content_lst
=
[]
for
_
,
page_info
in
para_dict
.
items
():
for
_
,
page_info
in
para_dict
.
items
():
...
@@ -137,7 +143,7 @@ def mk_mm_markdown(para_dict: dict):
...
@@ -137,7 +143,7 @@ def mk_mm_markdown(para_dict: dict):
else
:
else
:
page_md
=
__insert_before
(
page_md
,
img_content
,
line_txt
)
page_md
=
__insert_before
(
page_md
,
img_content
,
line_txt
)
else
:
else
:
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file"
)
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file
#1
"
)
else
:
# 应当在两个block之间
else
:
# 应当在两个block之间
# 找到上方最近的block,如果上方没有就找大下方最近的block
# 找到上方最近的block,如果上方没有就找大下方最近的block
top_txt_block
=
find_top_nearest_text_bbox
(
pymu_raw_blocks
,
imgbox
)
top_txt_block
=
find_top_nearest_text_bbox
(
pymu_raw_blocks
,
imgbox
)
...
@@ -150,7 +156,7 @@ def mk_mm_markdown(para_dict: dict):
...
@@ -150,7 +156,7 @@ def mk_mm_markdown(para_dict: dict):
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
bottom_txt_block
[
'lines'
][
0
][
'spans'
]])
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
bottom_txt_block
[
'lines'
][
0
][
'spans'
]])
page_md
=
__insert_before
(
page_md
,
img_content
,
line_txt
)
page_md
=
__insert_before
(
page_md
,
img_content
,
line_txt
)
else
:
else
:
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file"
)
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file
#2
"
)
content_lst
.
append
(
page_md
)
content_lst
.
append
(
page_md
)
...
@@ -160,90 +166,184 @@ def mk_mm_markdown(para_dict: dict):
...
@@ -160,90 +166,184 @@ def mk_mm_markdown(para_dict: dict):
return
content_text
return
content_text
@
DeprecationWarning
def
__insert_after_para
(
text
,
image_path
,
content_list
):
def
mk_mm_markdown_1
(
para_dict
:
dict
):
"""
在content_list中找到text,将image_path作为一个新的node插入到text后面
"""
for
i
,
c
in
enumerate
(
content_list
):
content_type
=
c
.
get
(
"type"
)
if
content_type
in
UNI_FORMAT_TEXT_TYPE
and
text
in
c
.
get
(
"text"
,
''
):
img_node
=
{
"type"
:
"image"
,
"img_path"
:
image_path
,
"img_alt"
:
""
,
"img_title"
:
""
,
"img_caption"
:
""
}
content_list
.
insert
(
i
+
1
,
img_node
)
break
else
:
logger
.
error
(
f
"Can't find the location of image {image_path} in the markdown file, search target is {text}"
)
def
__insert_before_para
(
text
,
image_path
,
content_list
):
"""
"""
得到images和tables变量
在content_list中找到text,将image_path作为一个新的node插入到text前面
"""
"""
image_all_list
=
[]
for
i
,
c
in
enumerate
(
content_list
):
content_type
=
c
.
get
(
"type"
)
if
content_type
in
UNI_FORMAT_TEXT_TYPE
and
text
in
c
.
get
(
"text"
,
''
):
img_node
=
{
"type"
:
"image"
,
"img_path"
:
image_path
,
"img_alt"
:
""
,
"img_title"
:
""
,
"img_caption"
:
""
}
content_list
.
insert
(
i
,
img_node
)
break
else
:
logger
.
error
(
f
"Can't find the location of image {image_path} in the markdown file, search target is {text}"
)
def
mk_universal_format
(
para_dict
:
dict
):
"""
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
content_lst
=
[]
for
_
,
page_info
in
para_dict
.
items
():
for
_
,
page_info
in
para_dict
.
items
():
images
=
page_info
.
get
(
"images"
,[])
page_lst
=
[]
# 一个page内的段落列表
tables
=
page_info
.
get
(
"tables"
,[]
)
para_blocks
=
page_info
.
get
(
"para_blocks"
)
image_backup
=
page_info
.
get
(
"image_backup"
,
[])
pymu_raw_blocks
=
page_info
.
get
(
"preproc_blocks"
)
table_backup
=
page_info
.
get
(
"table_backup"
,[])
all_page_images
=
[]
all_page_images
=
[]
all_page_images
.
extend
(
images
)
all_page_images
.
extend
(
page_info
.
get
(
"images"
,[]))
all_page_images
.
extend
(
image_backup
)
all_page_images
.
extend
(
page_info
.
get
(
"image_backup"
,
[])
)
all_page_images
.
extend
(
tables
)
all_page_images
.
extend
(
page_info
.
get
(
"tables"
,[]))
all_page_images
.
extend
(
table_backup
)
all_page_images
.
extend
(
page_info
.
get
(
"table_backup"
,[])
)
pymu_raw_blocks
=
page_info
.
get
(
"pymu_raw_blocks"
)
if
not
para_blocks
or
not
pymu_raw_blocks
:
# 只有图片的拼接的场景
for
img
in
all_page_images
:
# 提取每个图片所在位置
content_node
=
{
for
image_info
in
all_page_images
:
"type"
:
"image"
,
x0_image
,
y0_image
,
x1_image
,
y1_image
=
image_info
[
'bbox'
][:
4
]
"img_path"
:
img
[
'image_path'
],
image_path
=
image_info
[
'image_path'
]
"img_alt"
:
""
,
"img_title"
:
""
,
# 判断图片处于原始PDF中哪个模块之间
"img_caption"
:
""
image_internal_dict
=
{}
}
image_external_dict
=
{}
page_lst
.
append
(
content_node
)
# TODO 图片顺序
between_dict
=
{}
else
:
for
block
in
para_blocks
:
item
=
block
[
"paras"
]
for
_
,
p
in
item
.
items
():
font_type
=
p
[
'para_font_type'
]
# 对于文本来说,要么是普通文本,要么是个行间公式
if
font_type
==
TYPE_INTERLINE_EQUATION
:
content_node
=
{
"type"
:
"equation"
,
"latex"
:
p
[
"para_text"
]
}
page_lst
.
append
(
content_node
)
else
:
para_text
=
p
[
"para_text"
]
is_title
=
p
[
"is_para_title"
]
title_level
=
p
[
'para_title_level'
]
if
is_title
:
content_node
=
{
"type"
:
f
"h{title_level}"
,
"text"
:
para_text
}
page_lst
.
append
(
content_node
)
else
:
content_node
=
{
"type"
:
"text"
,
"text"
:
para_text
}
page_lst
.
append
(
content_node
)
content_lst
.
extend
(
page_lst
)
"""插入图片"""
for
img
in
all_page_images
:
imgbox
=
img
[
'bbox'
]
img_content
=
f
"{img['image_path']}"
# 先看在哪个block内
for
block
in
pymu_raw_blocks
:
for
block
in
pymu_raw_blocks
:
x0
,
y0
,
x1
,
y1
=
block
[
'bbox'
][:
4
]
bbox
=
block
[
'bbox'
]
if
bbox
[
0
]
-
1
<=
imgbox
[
0
]
<
bbox
[
2
]
+
1
and
bbox
[
1
]
-
1
<=
imgbox
[
1
]
<
bbox
[
3
]
+
1
:
# 确定在这个大的block内,然后进入逐行比较距离
# 在某个模块内部
for
l
in
block
[
'lines'
]:
if
x0
<=
x0_image
<
x1
and
y0
<=
y0_image
<
y1
:
line_box
=
l
[
'bbox'
]
image_internal_dict
[
'bbox'
]
=
[
x0_image
,
y0_image
,
x1_image
,
y1_image
]
if
line_box
[
0
]
-
1
<=
imgbox
[
0
]
<
line_box
[
2
]
+
1
and
line_box
[
1
]
-
1
<=
imgbox
[
1
]
<
line_box
[
3
]
+
1
:
# 在line内的,插入line前面
image_internal_dict
[
'path'
]
=
image_path
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
l
[
'spans'
]])
__insert_before_para
(
line_txt
,
img_content
,
content_lst
)
# 确定图片在哪句文本之前
break
y_pre
=
0
break
for
line
in
block
[
'lines'
]:
else
:
# 在行与行之间
x0
,
y0
,
x1
,
y1
=
line
[
'spans'
][
0
][
'bbox'
]
# 找到图片x0,y0与line的x0,y0最近的line
if
x0
<=
x0_image
<
x1
and
y_pre
<=
y0_image
<
y0
:
min_distance
=
100000
text
=
line
[
'spans'
][
'text'
]
min_line
=
None
image_internal_dict
[
'text'
]
=
text
for
l
in
block
[
'lines'
]:
image_internal_dict
[
'markdown_image'
]
=
f
''
line_box
=
l
[
'bbox'
]
distance
=
math
.
sqrt
((
line_box
[
0
]
-
imgbox
[
0
])
**
2
+
(
line_box
[
1
]
-
imgbox
[
1
])
**
2
)
if
distance
<
min_distance
:
min_distance
=
distance
min_line
=
l
if
min_line
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
min_line
[
'spans'
]])
img_h
=
imgbox
[
3
]
-
imgbox
[
1
]
if
min_distance
<
img_h
:
# 文字在图片前面
__insert_after_para
(
line_txt
,
img_content
,
content_lst
)
else
:
__insert_before_para
(
line_txt
,
img_content
,
content_lst
)
break
break
else
:
else
:
y_pre
=
y0
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file #1"
)
# 在某两个模块之间
else
:
# 应当在两个block之间
elif
x0
<=
x0_image
<
x1
:
# 找到上方最近的block,如果上方没有就找大下方最近的block
distance
=
math
.
sqrt
((
x1_image
-
x0
)
**
2
+
(
y1_image
-
y0
)
**
2
)
top_txt_block
=
find_top_nearest_text_bbox
(
pymu_raw_blocks
,
imgbox
)
between_dict
[
block
[
'number'
]]
=
distance
if
top_txt_block
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
top_txt_block
[
'lines'
][
-
1
][
'spans'
]])
# 找到与定位点距离最小的文本block
__insert_after_para
(
line_txt
,
img_content
,
content_lst
)
if
between_dict
:
else
:
min_key
=
min
(
between_dict
,
key
=
between_dict
.
get
)
bottom_txt_block
=
find_bottom_nearest_text_bbox
(
pymu_raw_blocks
,
imgbox
)
spans_list
=
[]
if
bottom_txt_block
:
for
span
in
pymu_raw_blocks
[
min_key
][
'lines'
]:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
bottom_txt_block
[
'lines'
][
0
][
'spans'
]])
for
text_piece
in
span
[
'spans'
]:
__insert_before_para
(
line_txt
,
img_content
,
content_lst
)
# 防止索引定位文本内容过多
else
:
# TODO ,图片可能独占一列,这种情况上下是没有图片的
if
len
(
spans_list
)
<
60
:
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file #2"
)
spans_list
.
append
(
text_piece
[
'text'
])
# end for
text1
=
''
.
join
(
spans_list
)
return
content_lst
image_external_dict
[
'bbox'
]
=
[
x0_image
,
y0_image
,
x1_image
,
y1_image
]
image_external_dict
[
'path'
]
=
image_path
image_external_dict
[
'text'
]
=
text1
image_external_dict
[
'markdown_image'
]
=
f
''
# 将内部图片或外部图片存入当页所有图片的列表
if
len
(
image_internal_dict
)
!=
0
:
image_all_list
.
append
(
image_internal_dict
)
elif
len
(
image_external_dict
)
!=
0
:
image_all_list
.
append
(
image_external_dict
)
else
:
logger
.
error
(
f
"Can't find the location of image {image_path} in the markdown file"
)
content_text
=
mk_nlp_markdown
(
para_dict
)
for
image_info_extract
in
image_all_list
:
loc
=
__find_index
(
content_text
,
image_info_extract
[
'text'
])
if
loc
is
not
None
:
content_text
=
__insert_string
(
content_text
,
image_info_extract
[
'markdown_image'
],
loc
)
else
:
logger
.
error
(
f
"Can't find the location of image {image_info_extract['path']} in the markdown file"
)
return
content_text
\ No newline at end of file
def
mk_mm_markdown
(
content_list
):
"""
基于同一格式的内容列表,构造markdown,含图片
"""
content_md
=
[]
for
c
in
content_list
:
content_type
=
c
.
get
(
"type"
)
if
content_type
==
"text"
:
content_md
.
append
(
c
.
get
(
"text"
))
elif
content_type
==
"equation"
:
content_md
.
append
(
f
"$$
\n
{c.get('latex')}
\n
$$"
)
elif
content_type
in
UNI_FORMAT_TEXT_TYPE
:
content_md
.
append
(
f
"{'#'*int(content_type[1])} {c.get('text')}"
)
elif
content_type
==
"image"
:
content_md
.
append
(
f
"})"
)
return
"
\n\n
"
.
join
(
content_md
)
def
mk_nlp_markdown
(
content_list
):
"""
基于同一格式的内容列表,构造markdown,不含图片
"""
content_md
=
[]
for
c
in
content_list
:
content_type
=
c
.
get
(
"type"
)
if
content_type
==
"text"
:
content_md
.
append
(
c
.
get
(
"text"
))
elif
content_type
==
"equation"
:
content_md
.
append
(
f
"$$
\n
{c.get('latex')}
\n
$$"
)
elif
content_type
in
UNI_FORMAT_TEXT_TYPE
:
content_md
.
append
(
f
"{'#'*int(content_type[1])} {c.get('text')}"
)
return
"
\n\n
"
.
join
(
content_md
)
\ No newline at end of file
magic_pdf/libs/drop_tag.py
View file @
d5ea44f9
COLOR_BG_HEADER_TXT_BLOCK
=
"color_background_header_txt_block"
COLOR_BG_HEADER_TXT_BLOCK
=
"color_background_header_txt_block"
PAGE_NO
=
"page-no"
# 页码
CONTENT_IN_FOOT_OR_HEADER
=
'in-foot-header-area'
# 页眉页脚内的文本
VERTICAL_TEXT
=
'vertical-text'
# 垂直文本
ROTATE_TEXT
=
'rotate-text'
# 旋转文本
EMPTY_SIDE_BLOCK
=
'empty-side-block'
# 边缘上的空白没有任何内容的block
ON_IMAGE_TEXT
=
'on-image-text'
# 文本在图片上
ON_TABLE_TEXT
=
'on-table-text'
# 文本在表格上
\ No newline at end of file
magic_pdf/pipeline.py
View file @
d5ea44f9
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markd
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markd
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
,
s3_image_save_path
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
,
s3_image_save_path
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.dict2md.mkcontent
import
mk_nlp_markdown
from
magic_pdf.dict2md.mkcontent
import
mk_nlp_markdown
,
mk_universal_format
from
magic_pdf.pdf_parse_by_model
import
parse_pdf_by_model
from
magic_pdf.pdf_parse_by_model
import
parse_pdf_by_model
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
...
@@ -237,9 +237,10 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
...
@@ -237,9 +237,10 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
pdf_intermediate_dict
=
jso
[
'pdf_intermediate_dict'
]
pdf_intermediate_dict
=
jso
[
'pdf_intermediate_dict'
]
# 将 pdf_intermediate_dict 解压
# 将 pdf_intermediate_dict 解压
pdf_intermediate_dict
=
JsonCompressor
.
decompress_json
(
pdf_intermediate_dict
)
pdf_intermediate_dict
=
JsonCompressor
.
decompress_json
(
pdf_intermediate_dict
)
markdown_content
=
mk_nlp_markdown
(
pdf_intermediate_dict
)
#markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
jso
[
"content"
]
=
markdown_content
jso
[
'content_list'
]
=
mk_universal_format
(
pdf_intermediate_dict
)
logger
.
info
(
f
"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
,
file
=
sys
.
stderr
)
#jso["content"] = markdown_content
logger
.
info
(
f
"book_name is:{get_data_source(jso)}/{jso['file_id']}"
)
# 把无用的信息清空
# 把无用的信息清空
jso
[
"doc_layout_result"
]
=
""
jso
[
"doc_layout_result"
]
=
""
jso
[
"pdf_intermediate_dict"
]
=
""
jso
[
"pdf_intermediate_dict"
]
=
""
...
...
magic_pdf/pre_proc/equations_replace.py
View file @
d5ea44f9
...
@@ -6,9 +6,10 @@ import json
...
@@ -6,9 +6,10 @@ import json
import
os
import
os
from
pathlib
import
Path
from
pathlib
import
Path
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.ocr_content_type
import
ContentType
TYPE_INLINE_EQUATION
=
"inline-equation"
TYPE_INLINE_EQUATION
=
ContentType
.
InlineEquation
TYPE_INTERLINE_EQUATION
=
"interline-equation"
TYPE_INTERLINE_EQUATION
=
ContentType
.
InterlineEquation
def
combine_chars_to_pymudict
(
block_dict
,
char_dict
):
def
combine_chars_to_pymudict
(
block_dict
,
char_dict
):
...
...
magic_pdf/pre_proc/remove_footer_header.py
View file @
d5ea44f9
import
re
import
re
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf.libs.drop_tag
import
CONTENT_IN_FOOT_OR_HEADER
,
PAGE_NO
def
remove_headder_footer_one_page
(
text_raw_blocks
,
image_bboxes
,
table_bboxes
,
header_bboxs
,
footer_bboxs
,
def
remove_headder_footer_one_page
(
text_raw_blocks
,
image_bboxes
,
table_bboxes
,
header_bboxs
,
footer_bboxs
,
...
@@ -67,7 +68,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
...
@@ -67,7 +68,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
blk
[
'lines'
]
.
remove
(
line
)
blk
[
'lines'
]
.
remove
(
line
)
else
:
else
:
# if not blk['lines']:
# if not blk['lines']:
blk
[
'tag'
]
=
'in-foot-header-area'
blk
[
'tag'
]
=
CONTENT_IN_FOOT_OR_HEADER
text_block_to_remove
.
append
(
blk
)
text_block_to_remove
.
append
(
blk
)
"""有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
"""有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
...
@@ -80,7 +81,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
...
@@ -80,7 +81,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
if
_is_in_or_part_overlap
(
pagenobox
,
span
[
'bbox'
]):
if
_is_in_or_part_overlap
(
pagenobox
,
span
[
'bbox'
]):
# span['text'] = ''
# span['text'] = ''
span
[
'tag'
]
=
"page-no"
span
[
'tag'
]
=
PAGE_NO
# 检查这个block是否只有这一个span,如果是,那么就把这个block也删除
# 检查这个block是否只有这一个span,如果是,那么就把这个block也删除
if
len
(
line
[
'spans'
])
==
1
and
len
(
block
[
'lines'
])
==
1
:
if
len
(
line
[
'spans'
])
==
1
and
len
(
block
[
'lines'
])
==
1
:
page_no_block_2_remove
.
append
(
block
)
page_no_block_2_remove
.
append
(
block
)
...
@@ -96,7 +97,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
...
@@ -96,7 +97,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
if
last_span
[
'text'
]
.
strip
()
and
not
re
.
search
(
'[a-zA-Z]'
,
last_span
[
'text'
])
and
re
.
search
(
'[0-9]'
,
if
last_span
[
'text'
]
.
strip
()
and
not
re
.
search
(
'[a-zA-Z]'
,
last_span
[
'text'
])
and
re
.
search
(
'[0-9]'
,
last_span
[
last_span
[
'text'
]):
'text'
]):
last_span
[
'tag'
]
=
"page-no"
last_span
[
'tag'
]
=
PAGE_NO
page_no_block_2_remove
.
append
(
last_block
)
page_no_block_2_remove
.
append
(
last_block
)
for
b
in
page_no_block_2_remove
:
for
b
in
page_no_block_2_remove
:
...
...
magic_pdf/pre_proc/remove_rotate_bbox.py
View file @
d5ea44f9
import
math
import
math
from
magic_pdf.libs.boxbase
import
is_vbox_on_side
from
magic_pdf.libs.boxbase
import
is_vbox_on_side
from
magic_pdf.libs.drop_tag
import
EMPTY_SIDE_BLOCK
,
ROTATE_TEXT
,
VERTICAL_TEXT
def
detect_non_horizontal_texts
(
result_dict
):
def
detect_non_horizontal_texts
(
result_dict
):
...
@@ -134,13 +135,13 @@ def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
...
@@ -134,13 +135,13 @@ def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
is_box_valign
=
(
len
(
set
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
]))
==
1
)
and
(
len
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
])
>
1
)
# 测试bbox在垂直方向是不是x0都相等,也就是在垂直方向排列.同时必须大于等于2个字
is_box_valign
=
(
len
(
set
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
]))
==
1
)
and
(
len
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
])
>
1
)
# 测试bbox在垂直方向是不是x0都相等,也就是在垂直方向排列.同时必须大于等于2个字
if
is_box_valign
:
if
is_box_valign
:
block
[
'tag'
]
=
"vertical-text"
block
[
'tag'
]
=
VERTICAL_TEXT
removed_text_block
.
append
(
block
)
removed_text_block
.
append
(
block
)
continue
continue
for
line
in
lines
:
for
line
in
lines
:
if
line
[
'dir'
]
!=
(
1
,
0
):
if
line
[
'dir'
]
!=
(
1
,
0
):
block
[
'tag'
]
=
"rotate"
block
[
'tag'
]
=
ROTATE_TEXT
removed_text_block
.
append
(
block
)
# 只要有一个line不是dir=(1,0),就把整个block都删掉
removed_text_block
.
append
(
block
)
# 只要有一个line不是dir=(1,0),就把整个block都删掉
break
break
...
@@ -177,7 +178,7 @@ def remove_side_blank_block(pymu_text_block, page_width, page_height):
...
@@ -177,7 +178,7 @@ def remove_side_blank_block(pymu_text_block, page_width, page_height):
continue
continue
if
__is_empty_side_box
(
block
):
if
__is_empty_side_box
(
block
):
block
[
'tag'
]
=
"empty-side-block"
block
[
'tag'
]
=
EMPTY_SIDE_BLOCK
removed_text_block
.
append
(
block
)
removed_text_block
.
append
(
block
)
continue
continue
...
...
magic_pdf/pre_proc/resolve_bbox_conflict.py
View file @
d5ea44f9
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
"""
"""
from
magic_pdf.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
from
magic_pdf.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
from
magic_pdf.libs.drop_tag
import
ON_IMAGE_TEXT
,
ON_TABLE_TEXT
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
...
@@ -27,14 +28,14 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
...
@@ -27,14 +28,14 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
for
text_block
in
text_raw_blocks
:
for
text_block
in
text_raw_blocks
:
text_bbox
=
text_block
[
"bbox"
]
text_bbox
=
text_block
[
"bbox"
]
if
_is_in
(
text_bbox
,
image_box
):
if
_is_in
(
text_bbox
,
image_box
):
text_block
[
'tag'
]
=
"on-image"
text_block
[
'tag'
]
=
ON_IMAGE_TEXT
text_block_removed
.
append
(
text_block
)
text_block_removed
.
append
(
text_block
)
# 去掉table上的文字block
# 去掉table上的文字block
for
table_box
in
tables
:
for
table_box
in
tables
:
for
text_block
in
text_raw_blocks
:
for
text_block
in
text_raw_blocks
:
text_bbox
=
text_block
[
"bbox"
]
text_bbox
=
text_block
[
"bbox"
]
if
_is_in
(
text_bbox
,
table_box
):
if
_is_in
(
text_bbox
,
table_box
):
text_block
[
'tag'
]
=
"on-table"
text_block
[
'tag'
]
=
ON_TABLE_TEXT
text_block_removed
.
append
(
text_block
)
text_block_removed
.
append
(
text_block
)
for
text_block
in
text_block_removed
:
for
text_block
in
text_block_removed
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment