Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a5c35165
Commit
a5c35165
authored
Jul 30, 2024
by
myhloli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat(dict2md): add page index to para content for standard format v2
parent
0625595c
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
4 deletions
+10
-4
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+10
-4
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
a5c35165
...
@@ -210,28 +210,32 @@ def para_to_standard_format(para, img_buket_path):
...
@@ -210,28 +210,32 @@ def para_to_standard_format(para, img_buket_path):
return
para_content
return
para_content
def
para_to_standard_format_v2
(
para_block
,
img_buket_path
):
def
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
):
para_type
=
para_block
[
'type'
]
para_type
=
para_block
[
'type'
]
if
para_type
==
BlockType
.
Text
:
if
para_type
==
BlockType
.
Text
:
para_content
=
{
para_content
=
{
'type'
:
'text'
,
'type'
:
'text'
,
'text'
:
merge_para_with_text
(
para_block
),
'text'
:
merge_para_with_text
(
para_block
),
'page_idx'
:
page_idx
}
}
elif
para_type
==
BlockType
.
Title
:
elif
para_type
==
BlockType
.
Title
:
para_content
=
{
para_content
=
{
'type'
:
'text'
,
'type'
:
'text'
,
'text'
:
merge_para_with_text
(
para_block
),
'text'
:
merge_para_with_text
(
para_block
),
'text_level'
:
1
'text_level'
:
1
,
'page_idx'
:
page_idx
}
}
elif
para_type
==
BlockType
.
InterlineEquation
:
elif
para_type
==
BlockType
.
InterlineEquation
:
para_content
=
{
para_content
=
{
'type'
:
'equation'
,
'type'
:
'equation'
,
'text'
:
merge_para_with_text
(
para_block
),
'text'
:
merge_para_with_text
(
para_block
),
'text_format'
:
"latex"
'text_format'
:
"latex"
,
'page_idx'
:
page_idx
}
}
elif
para_type
==
BlockType
.
Image
:
elif
para_type
==
BlockType
.
Image
:
para_content
=
{
para_content
=
{
'type'
:
'image'
,
'type'
:
'image'
,
'page_idx'
:
page_idx
}
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
...
@@ -241,6 +245,7 @@ def para_to_standard_format_v2(para_block, img_buket_path):
...
@@ -241,6 +245,7 @@ def para_to_standard_format_v2(para_block, img_buket_path):
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
para_content
=
{
para_content
=
{
'type'
:
'table'
,
'type'
:
'table'
,
'page_idx'
:
page_idx
}
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
...
@@ -345,6 +350,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
...
@@ -345,6 +350,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
raise
Exception
(
f
"drop_mode can not be null"
)
raise
Exception
(
f
"drop_mode can not be null"
)
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_idx
=
page_info
.
get
(
"page_idx"
)
if
not
paras_of_layout
:
if
not
paras_of_layout
:
continue
continue
if
make_mode
==
MakeMode
.
MM_MD
:
if
make_mode
==
MakeMode
.
MM_MD
:
...
@@ -355,7 +361,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
...
@@ -355,7 +361,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
output_content
.
extend
(
page_markdown
)
output_content
.
extend
(
page_markdown
)
elif
make_mode
==
MakeMode
.
STANDARD_FORMAT
:
elif
make_mode
==
MakeMode
.
STANDARD_FORMAT
:
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
para_content
=
para_to_standard_format_v2
(
para_block
,
img_buket_path
)
para_content
=
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
)
output_content
.
append
(
para_content
)
output_content
.
append
(
para_content
)
if
make_mode
in
[
MakeMode
.
MM_MD
,
MakeMode
.
NLP_MD
]:
if
make_mode
in
[
MakeMode
.
MM_MD
,
MakeMode
.
NLP_MD
]:
return
'
\n\n
'
.
join
(
output_content
)
return
'
\n\n
'
.
join
(
output_content
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment