Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
e9aa103c
Commit
e9aa103c
authored
Mar 22, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr增加分页markdown输出格式
parent
27c080a9
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
60 additions
and
1 deletion
+60
-1
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+30
-0
pipeline.py
magic_pdf/pipeline.py
+30
-1
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
e9aa103c
...
...
@@ -94,6 +94,36 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
return
'
\n\n
'
.
join
(
markdown
)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict) -> list:
    """Render OCR paragraph blocks as multimodal markdown, grouped per page.

    Args:
        pdf_info_dict: Mapping of page key -> page info dict. Each page info
            is read through its "para_blocks" entry, which is iterated as
            paragraphs -> lines -> ``line['spans']``.

    Returns:
        A list with one ``{'page_no': <page key>, 'md': <markdown str>}``
        dict per page, in the iteration order of ``pdf_info_dict``. Pages
        whose "para_blocks" entry is missing or empty are omitted.
    """
    markdown_with_para_and_pagination = []
    for page_no, page_info in pdf_info_dict.items():
        paras = page_info.get("para_blocks")
        if not paras:
            continue

        page_markdown = []
        for para in paras:
            para_text = ''
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    if span_type == ContentType.Text:
                        content = split_long_words(span['content'])
                    elif span_type == ContentType.InlineEquation:
                        content = f"${span['content']}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{span['content']}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        # NOTE(review): image link target assumed to mirror
                        # ocr_mk_mm_markdown_with_para -- confirm before merge.
                        content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
                    else:
                        # Unknown span type: skip it. Without this branch
                        # `content` would be unbound on the first span or
                        # silently repeat the previous span's text.
                        continue
                    para_text += content + ' '
            page_markdown.append(para_text.strip() + ' ')

        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md': '\n\n'.join(page_markdown),
        })
    return markdown_with_para_and_pagination
def
make_standard_format_with_para
(
pdf_info_dict
:
dict
):
content_list
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
...
...
magic_pdf/pipeline.py
View file @
e9aa103c
...
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
,
ocr_mk_mm_markdown_with_para
,
ocr_mk_mm_markdown_with_para
,
ocr_mk_mm_markdown_with_para_and_pagination
,
)
from
magic_pdf.libs.commons
import
(
read_file
,
...
...
@@ -525,6 +525,35 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
return
jso
def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, debug_mode=False) -> dict:
    """Convert a record's compressed intermediate dict into per-page markdown.

    Args:
        jso: Record dict. Reads "need_drop", "file_id" and
            "pdf_intermediate_dict"; writes "dropped" or "content".
        debug_mode: When true, the "need_drop" check is bypassed.

    Returns:
        The record. If it was flagged "need_drop" (and debug is off) it is
        returned with ``jso["dropped"] = True`` and no conversion. Otherwise
        ``jso["content"]`` holds the result of
        ``ocr_mk_mm_markdown_with_para_and_pagination`` and
        ``jso["pdf_intermediate_dict"]`` is reset to "". Any exception during
        conversion is passed to ``exception_handler`` and its return value is
        returned instead.
    """
    # Only honour the need_drop flag when debug mode is off.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        # `file=` is a print() keyword, not a logging one, so it is not passed.
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # Decompress the stored intermediate dict before rendering.
        pdf_intermediate_dict = JsonCompressor.decompress_json(jso["pdf_intermediate_dict"])
        markdown_content = ocr_mk_mm_markdown_with_para_and_pagination(pdf_intermediate_dict)
        jso["content"] = markdown_content
        # len() here is taken on the value returned by the paginated renderer.
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
        )
        # Clear data that is no longer needed once the markdown exists.
        jso["pdf_intermediate_dict"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
def
ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa
(
jso
:
dict
,
debug_mode
=
False
)
->
dict
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment