Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f36c2656
Commit
f36c2656
authored
Mar 22, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
使用面积占比方式判断一行文本是不是在一个layoutbox里
parent
a36ef4f8
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
99 additions
and
89 deletions
+99
-89
ocr_demo.py
demo/ocr_demo.py
+2
-2
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+19
-18
boxbase.py
magic_pdf/libs/boxbase.py
+27
-0
para_split.py
magic_pdf/para/para_split.py
+51
-69
No files found.
demo/ocr_demo.py
View file @
f36c2656
...
...
@@ -92,5 +92,5 @@ if __name__ == '__main__':
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
#
ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
ocr_online_parse
(
book_name
=
"美国加州中学教材/edu_00000060"
)
ocr_local_parse
(
ocr_pdf_path
,
ocr_json_file_path
)
#
ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
magic_pdf/dict2md/ocr_mkcontent.py
View file @
f36c2656
...
...
@@ -71,9 +71,10 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
def
ocr_mk_mm_markdown_with_para
(
pdf_info_dict
:
dict
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
paras
=
page_info
.
get
(
"para_blocks"
)
if
not
paras
:
paras
_of_layout
=
page_info
.
get
(
"para_blocks"
)
if
not
paras
_of_layout
:
continue
for
paras
in
paras_of_layout
:
for
para
in
paras
:
para_text
=
''
for
line
in
para
:
...
...
@@ -81,12 +82,12 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
span_type
=
span
.
get
(
'type'
)
if
span_type
==
ContentType
.
Text
:
content
=
split_long_words
(
span
[
'content'
])
# content = span['content']
pass
elif
span_type
==
ContentType
.
InlineEquation
:
content
=
f
"${span['content']}$
"
content
=
f
" ${span['content']}$
"
elif
span_type
==
ContentType
.
InterlineEquation
:
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
content
=
f
"
\n
})
\n
"
para_text
+=
content
+
' '
markdown
.
append
(
para_text
.
strip
()
+
' '
)
...
...
magic_pdf/libs/boxbase.py
View file @
f36c2656
...
...
@@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
y1_1
<
y0_2
or
# box1在box2的上边
y0_1
>
y1_2
)
# box1在box2的下边
def
_is_in_or_part_overlap_with_area_ratio
(
box1
,
box2
,
area_ratio_threshold
=
0.6
):
"""
判断box1是否在box2里面,或者box1和box2有部分重叠,且重叠面积占box1的比例超过area_ratio_threshold
"""
if
box1
is
None
or
box2
is
None
:
return
False
x0_1
,
y0_1
,
x1_1
,
y1_1
=
box1
x0_2
,
y0_2
,
x1_2
,
y1_2
=
box2
if
not
_is_in_or_part_overlap
(
box1
,
box2
):
return
False
# 计算重叠面积
x_left
=
max
(
x0_1
,
x0_2
)
y_top
=
max
(
y0_1
,
y0_2
)
x_right
=
min
(
x1_1
,
x1_2
)
y_bottom
=
min
(
y1_1
,
y1_2
)
overlap_area
=
(
x_right
-
x_left
)
*
(
y_bottom
-
y_top
)
# 计算box1的面积
box1_area
=
(
x1_1
-
x0_1
)
*
(
y1_1
-
y0_1
)
return
overlap_area
/
box1_area
>
area_ratio_threshold
def
_is_in
(
box1
,
box2
)
->
bool
:
"""
box1是否完全在box2里面
...
...
magic_pdf/para/para_split.py
View file @
f36c2656
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment