Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d6a5724b
Commit
d6a5724b
authored
Mar 29, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
table_latex支持
parent
50a543ce
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
112 additions
and
69 deletions
+112
-69
mkcontent.py
magic_pdf/dict2md/mkcontent.py
+112
-69
No files found.
magic_pdf/dict2md/mkcontent.py
View file @
d6a5724b
...
...
@@ -166,45 +166,65 @@ def mk_mm_markdown_1(para_dict: dict):
return
content_text
def
__insert_after_para
(
text
,
image_path
,
content_list
):
def
__insert_after_para
(
text
,
type
,
element
,
content_list
):
"""
在content_list中找到text,将image_path作为一个新的node插入到text后面
"""
for
i
,
c
in
enumerate
(
content_list
):
content_type
=
c
.
get
(
"type"
)
if
content_type
in
UNI_FORMAT_TEXT_TYPE
and
text
in
c
.
get
(
"text"
,
''
):
img_node
=
{
"type"
:
"image"
,
"img_path"
:
image_path
,
"img_alt"
:
""
,
"img_title"
:
""
,
"img_caption"
:
""
}
content_list
.
insert
(
i
+
1
,
img_node
)
if
type
==
"image"
:
content_node
=
{
"type"
:
"image"
,
"img_path"
:
element
.
get
(
"image_path"
),
"img_alt"
:
""
,
"img_title"
:
""
,
"img_caption"
:
""
,
}
elif
type
==
"table"
:
content_node
=
{
"type"
:
"table"
,
"img_path"
:
element
.
get
(
"image_path"
),
"table_latex"
:
element
.
get
(
"text"
),
"table_title"
:
""
,
"table_caption"
:
""
,
"table_quality"
:
element
.
get
(
"quality"
),
}
content_list
.
insert
(
i
+
1
,
content_node
)
break
else
:
logger
.
error
(
f
"Can't find the location of image {
image_path
} in the markdown file, search target is {text}"
)
logger
.
error
(
f
"Can't find the location of image {
element.get('image_path')
} in the markdown file, search target is {text}"
)
def
__insert_before_para
(
text
,
image_path
,
content_list
):
def
__insert_before_para
(
text
,
type
,
element
,
content_list
):
"""
在content_list中找到text,将image_path作为一个新的node插入到text前面
"""
for
i
,
c
in
enumerate
(
content_list
):
content_type
=
c
.
get
(
"type"
)
if
content_type
in
UNI_FORMAT_TEXT_TYPE
and
text
in
c
.
get
(
"text"
,
''
):
img_node
=
{
"type"
:
"image"
,
"img_path"
:
image_path
,
"img_alt"
:
""
,
"img_title"
:
""
,
"img_caption"
:
""
}
content_list
.
insert
(
i
,
img_node
)
if
type
==
"image"
:
content_node
=
{
"type"
:
"image"
,
"img_path"
:
element
.
get
(
"image_path"
),
"img_alt"
:
""
,
"img_title"
:
""
,
"img_caption"
:
""
,
}
elif
type
==
"table"
:
content_node
=
{
"type"
:
"table"
,
"img_path"
:
element
.
get
(
"image_path"
),
"table_latex"
:
element
.
get
(
"text"
),
"table_title"
:
""
,
"table_caption"
:
""
,
"table_quality"
:
element
.
get
(
"quality"
),
}
content_list
.
insert
(
i
,
content_node
)
break
else
:
logger
.
error
(
f
"Can't find the location of image {
image_path
} in the markdown file, search target is {text}"
)
logger
.
error
(
f
"Can't find the location of image {
element.get('image_path')
} in the markdown file, search target is {text}"
)
def
mk_universal_format
(
para_dict
:
dict
):
...
...
@@ -220,9 +240,11 @@ def mk_universal_format(para_dict: dict):
all_page_images
=
[]
all_page_images
.
extend
(
page_info
.
get
(
"images"
,[]))
all_page_images
.
extend
(
page_info
.
get
(
"image_backup"
,
[])
)
all_page_images
.
extend
(
page_info
.
get
(
"tables"
,[]))
all_page_images
.
extend
(
page_info
.
get
(
"table_backup"
,[])
)
# all_page_images.extend(page_info.get("tables",[]))
# all_page_images.extend(page_info.get("table_backup",[]) )
all_page_tables
=
[]
all_page_tables
.
extend
(
page_info
.
get
(
"tables"
,
[]))
if
not
para_blocks
or
not
pymu_raw_blocks
:
# 只有图片的拼接的场景
for
img
in
all_page_images
:
content_node
=
{
...
...
@@ -233,6 +255,16 @@ def mk_universal_format(para_dict: dict):
"img_caption"
:
""
}
page_lst
.
append
(
content_node
)
# TODO 图片顺序
for
table
in
all_page_tables
:
content_node
=
{
"type"
:
"table"
,
"img_path"
:
table
[
'image_path'
],
"table_latex"
:
table
.
get
(
"text"
),
"table_title"
:
""
,
"table_caption"
:
""
,
"table_quality"
:
table
.
get
(
"quality"
),
}
page_lst
.
append
(
content_node
)
# TODO 图片顺序
else
:
for
block
in
para_blocks
:
item
=
block
[
"paras"
]
...
...
@@ -266,56 +298,65 @@ def mk_universal_format(para_dict: dict):
"""插入图片"""
for
img
in
all_page_images
:
imgbox
=
img
[
'bbox'
]
img_content
=
f
"{img['image_path']}"
# 先看在哪个block内
for
block
in
pymu_raw_blocks
:
bbox
=
block
[
'bbox'
]
if
bbox
[
0
]
-
1
<=
imgbox
[
0
]
<
bbox
[
2
]
+
1
and
bbox
[
1
]
-
1
<=
imgbox
[
1
]
<
bbox
[
3
]
+
1
:
# 确定在这个大的block内,然后进入逐行比较距离
for
l
in
block
[
'lines'
]:
line_box
=
l
[
'bbox'
]
if
line_box
[
0
]
-
1
<=
imgbox
[
0
]
<
line_box
[
2
]
+
1
and
line_box
[
1
]
-
1
<=
imgbox
[
1
]
<
line_box
[
3
]
+
1
:
# 在line内的,插入line前面
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
l
[
'spans'
]])
__insert_before_para
(
line_txt
,
img_content
,
content_lst
)
break
break
else
:
# 在行与行之间
# 找到图片x0,y0与line的x0,y0最近的line
min_distance
=
100000
min_line
=
None
for
l
in
block
[
'lines'
]:
line_box
=
l
[
'bbox'
]
distance
=
math
.
sqrt
((
line_box
[
0
]
-
imgbox
[
0
])
**
2
+
(
line_box
[
1
]
-
imgbox
[
1
])
**
2
)
if
distance
<
min_distance
:
min_distance
=
distance
min_line
=
l
if
min_line
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
min_line
[
'spans'
]])
img_h
=
imgbox
[
3
]
-
imgbox
[
1
]
if
min_distance
<
img_h
:
# 文字在图片前面
__insert_after_para
(
line_txt
,
img_content
,
content_lst
)
else
:
__insert_before_para
(
line_txt
,
img_content
,
content_lst
)
break
else
:
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file #1"
)
else
:
# 应当在两个block之间
# 找到上方最近的block,如果上方没有就找大下方最近的block
top_txt_block
=
find_top_nearest_text_bbox
(
pymu_raw_blocks
,
imgbox
)
if
top_txt_block
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
top_txt_block
[
'lines'
][
-
1
][
'spans'
]])
__insert_after_para
(
line_txt
,
img_content
,
content_lst
)
else
:
bottom_txt_block
=
find_bottom_nearest_text_bbox
(
pymu_raw_blocks
,
imgbox
)
if
bottom_txt_block
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
bottom_txt_block
[
'lines'
][
0
][
'spans'
]])
__insert_before_para
(
line_txt
,
img_content
,
content_lst
)
else
:
# TODO ,图片可能独占一列,这种情况上下是没有图片的
logger
.
error
(
f
"Can't find the location of image {img['image_path']} in the markdown file #2"
)
insert_img_or_table
(
"image"
,
img
,
pymu_raw_blocks
,
content_lst
)
"""插入表格"""
for
table
in
all_page_tables
:
insert_img_or_table
(
"table"
,
table
,
pymu_raw_blocks
,
content_lst
)
# end for
return
content_lst
def
insert_img_or_table
(
type
,
element
,
pymu_raw_blocks
,
content_lst
):
element_bbox
=
element
[
'bbox'
]
# 先看在哪个block内
for
block
in
pymu_raw_blocks
:
bbox
=
block
[
'bbox'
]
if
bbox
[
0
]
-
1
<=
element_bbox
[
0
]
<
bbox
[
2
]
+
1
and
bbox
[
1
]
-
1
<=
element_bbox
[
1
]
<
bbox
[
3
]
+
1
:
# 确定在这个大的block内,然后进入逐行比较距离
for
l
in
block
[
'lines'
]:
line_box
=
l
[
'bbox'
]
if
line_box
[
0
]
-
1
<=
element_bbox
[
0
]
<
line_box
[
2
]
+
1
and
line_box
[
1
]
-
1
<=
element_bbox
[
1
]
<
line_box
[
3
]
+
1
:
# 在line内的,插入line前面
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
l
[
'spans'
]])
__insert_before_para
(
line_txt
,
type
,
element
,
content_lst
)
break
break
else
:
# 在行与行之间
# 找到图片x0,y0与line的x0,y0最近的line
min_distance
=
100000
min_line
=
None
for
l
in
block
[
'lines'
]:
line_box
=
l
[
'bbox'
]
distance
=
math
.
sqrt
((
line_box
[
0
]
-
element_bbox
[
0
])
**
2
+
(
line_box
[
1
]
-
element_bbox
[
1
])
**
2
)
if
distance
<
min_distance
:
min_distance
=
distance
min_line
=
l
if
min_line
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
min_line
[
'spans'
]])
img_h
=
element_bbox
[
3
]
-
element_bbox
[
1
]
if
min_distance
<
img_h
:
# 文字在图片前面
__insert_after_para
(
line_txt
,
type
,
element
,
content_lst
)
else
:
__insert_before_para
(
line_txt
,
type
,
element
,
content_lst
)
break
else
:
logger
.
error
(
f
"Can't find the location of image {element.get('image_path')} in the markdown file #1"
)
else
:
# 应当在两个block之间
# 找到上方最近的block,如果上方没有就找大下方最近的block
top_txt_block
=
find_top_nearest_text_bbox
(
pymu_raw_blocks
,
element_bbox
)
if
top_txt_block
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
top_txt_block
[
'lines'
][
-
1
][
'spans'
]])
__insert_after_para
(
line_txt
,
type
,
element
,
content_lst
)
else
:
bottom_txt_block
=
find_bottom_nearest_text_bbox
(
pymu_raw_blocks
,
element_bbox
)
if
bottom_txt_block
:
line_txt
=
""
.
join
([
s
[
'text'
]
for
s
in
bottom_txt_block
[
'lines'
][
0
][
'spans'
]])
__insert_before_para
(
line_txt
,
type
,
element
,
content_lst
)
else
:
# TODO ,图片可能独占一列,这种情况上下是没有图片的
logger
.
error
(
f
"Can't find the location of image {element.get('image_path')} in the markdown file #2"
)
def
mk_mm_markdown
(
content_list
):
"""
基于同一格式的内容列表,构造markdown,含图片
...
...
@@ -348,6 +389,8 @@ def mk_nlp_markdown(content_list):
content_md
.
append
(
c
.
get
(
"text"
))
elif
content_type
==
"equation"
:
content_md
.
append
(
f
"$$
\n
{c.get('latex')}
\n
$$"
)
elif
content_type
==
"table"
:
content_md
.
append
(
f
"$$
\n
{c.get('table_latex')}
\n
$$"
)
elif
content_type
in
UNI_FORMAT_TEXT_TYPE
:
content_md
.
append
(
f
"{'#'*int(content_type[1])} {c.get('text')}"
)
return
"
\n\n
"
.
join
(
content_md
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment