Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
83753cbd
Commit
83753cbd
authored
Mar 16, 2024
by
xuchao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
元素类型引用统一定义
parent
d5ea44f9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
23 additions
and
16 deletions
+23
-16
ocr_demo.py
demo/ocr_demo.py
+4
-4
mkcontent.py
magic_pdf/dict2md/mkcontent.py
+5
-1
para_split.py
magic_pdf/para/para_split.py
+14
-11
No files found.
demo/ocr_demo.py
View file @
83753cbd
...
@@ -30,13 +30,13 @@ def read_json_file(file_path):
...
@@ -30,13 +30,13 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
#
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
#
ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
ocr_pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
#
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
try
:
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
...
...
magic_pdf/dict2md/mkcontent.py
View file @
83753cbd
...
@@ -326,7 +326,11 @@ def mk_mm_markdown(content_list):
...
@@ -326,7 +326,11 @@ def mk_mm_markdown(content_list):
if
content_type
==
"text"
:
if
content_type
==
"text"
:
content_md
.
append
(
c
.
get
(
"text"
))
content_md
.
append
(
c
.
get
(
"text"
))
elif
content_type
==
"equation"
:
elif
content_type
==
"equation"
:
content_md
.
append
(
f
"$$
\n
{c.get('latex')}
\n
$$"
)
content
=
c
.
get
(
"latex"
)
if
content
.
startswith
(
"$$"
)
and
content
.
endswith
(
"$$"
):
content_md
.
append
(
content
)
else
:
content_md
.
append
(
f
"
\n
$$
\n
{c.get('latex')}
\n
$$
\n
"
)
elif
content_type
in
UNI_FORMAT_TEXT_TYPE
:
elif
content_type
in
UNI_FORMAT_TEXT_TYPE
:
content_md
.
append
(
f
"{'#'*int(content_type[1])} {c.get('text')}"
)
content_md
.
append
(
f
"{'#'*int(content_type[1])} {c.get('text')}"
)
elif
content_type
==
"image"
:
elif
content_type
==
"image"
:
...
...
magic_pdf/para/para_split.py
View file @
83753cbd
...
@@ -3,11 +3,12 @@ import numpy as np
...
@@ -3,11 +3,12 @@ import numpy as np
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
_is_in
from
magic_pdf.libs.boxbase
import
_is_in
from
magic_pdf.libs.ocr_content_type
import
ContentType
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
INLINE_EQUATION
=
'inline_equation'
INLINE_EQUATION
=
ContentType
.
InlineEquation
INTER
_EQUATION
=
"displayed_equation"
INTER
LINE_EQUATION
=
ContentType
.
InterlineEquation
TEXT
=
"text"
TEXT
=
"text"
def
__add_line_period
(
blocks
,
layout_bboxes
):
def
__add_line_period
(
blocks
,
layout_bboxes
):
...
@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
...
@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
last_span
=
line
[
'spans'
][
-
1
]
last_span
=
line
[
'spans'
][
-
1
]
span_type
=
last_span
[
'type'
]
span_type
=
last_span
[
'type'
]
if
span_type
in
[
TEXT
,
INLINE_EQUATION
]:
if
span_type
in
[
INLINE_EQUATION
]:
span_content
=
last_span
[
'content'
]
.
strip
()
span_content
=
last_span
[
'content'
]
.
strip
()
if
span_type
==
INLINE_EQUATION
and
span_content
[
-
1
]
not
in
LINE_STOP_FLAG
:
if
span_type
==
INLINE_EQUATION
and
span_content
[
-
1
]
not
in
LINE_STOP_FLAG
:
if
span_type
in
[
INLINE_EQUATION
,
INTER_EQUATION
]:
if
span_type
in
[
INLINE_EQUATION
,
INTER
LINE
_EQUATION
]:
last_span
[
'content'
]
=
span_content
+
'.'
last_span
[
'content'
]
=
span_content
+
'.'
def
__valign_lines
(
blocks
,
layout_bboxes
):
def
__valign_lines
(
blocks
,
layout_bboxes
):
"""
"""
对齐行的左侧和右侧。
在一个layoutbox内对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过3就强行对齐到所处layout的左右两侧(和layout有一段距离)。
扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来
3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
"""
"""
min_distance
=
3
min_distance
=
3
...
@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
...
@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
else
:
else
:
para
.
append
(
line
)
para
.
append
(
line
)
else
:
# 其他,图片、表格、行间公式,各自占一段
else
:
# 其他,图片、表格、行间公式,各自占一段
para
.
append
(
line
)
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
para
=
[]
else
:
paras
.
append
([
line
])
para
=
[]
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
# logger.info(para_text)
para
=
[]
if
len
(
para
)
>
0
:
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment