Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
59bc15e0
Commit
59bc15e0
authored
Mar 01, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/master'
parents
7f0371da
b725e72c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
74 additions
and
5 deletions
+74
-5
.gitignore
.gitignore
+36
-0
pdf_parse_by_model.py
pipeline/pdf_parse_by_model.py
+1
-2
citationmarker_remove.py
pre_proc/citationmarker_remove.py
+8
-1
detect_equation.py
pre_proc/detect_equation.py
+29
-2
No files found.
.gitignore
0 → 100644
View file @
59bc15e0
*.tar
*.tar.gz
venv*/
envs/
slurm_logs/
sync1.sh
data_preprocess_pj1
data-preparation1
__pycache__
*.log
*.pyc
.vscode
debug/
*.ipynb
.idea
spark/__init__.py
# vscode history
.history
.DS_Store
.env
bad_words/
bak/
app/tests/*
temp/
tmp/
tmp
.vscode
.vscode/
/test/
/app/pdf_toolbox/test/test_bookname.txt
pipeline/pdf_parse_by_model.py
View file @
59bc15e0
...
...
@@ -271,9 +271,8 @@ def parse_pdf_by_model(
""""以下进入到公式替换环节 """
char_level_text_blocks
=
page
.
get_text
(
"rawdict"
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
'blocks'
]
remain_text_blocks
=
combine_chars_to_pymudict
(
remain_text_blocks
,
char_level_text_blocks
)
# 合并chars
remain_text_blocks
=
remove_citation_marker
(
remain_text_blocks
)
# 先把角标去掉
remain_text_blocks
=
replace_equations_in_textblock
(
remain_text_blocks
,
inline_eq_info
,
interline_eq_info
)
remain_text_blocks
=
remove_citation_marker
(
remain_text_blocks
)
# 公式替换之后去角标,防止公式无法替换成功。但是这样也会带来个问题就是把角标当公式。各有优劣。
remain_text_blocks
=
remove_chars_in_text_blocks
(
remain_text_blocks
)
# 减少中间态数据体积
#debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3)
...
...
pre_proc/citationmarker_remove.py
View file @
59bc15e0
...
...
@@ -114,12 +114,16 @@ def remove_citation_marker(with_char_text_blcoks):
# 找到高度最高的span作为位置比较的基准
max_hi_span
=
line
[
'spans'
][
0
][
'bbox'
]
min_font_sz
=
10000
min_font_sz
=
10000
# line里最小的字体
max_font_sz
=
0
# line里最大的字体
for
s
in
line
[
'spans'
]:
if
max_hi_span
[
3
]
-
max_hi_span
[
1
]
<
s
[
'bbox'
][
3
]
-
s
[
'bbox'
][
1
]:
max_hi_span
=
s
[
'bbox'
]
if
min_font_sz
>
s
[
'size'
]:
min_font_sz
=
s
[
'size'
]
if
max_font_sz
<
s
[
'size'
]:
max_font_sz
=
s
[
'size'
]
base_span_mid_y
=
(
max_hi_span
[
3
]
+
max_hi_span
[
1
])
/
2
...
...
@@ -130,6 +134,9 @@ def remove_citation_marker(with_char_text_blcoks):
span_mid_y
=
(
span
[
'bbox'
][
3
]
+
span
[
'bbox'
][
1
])
/
2
span_font_sz
=
span
[
'size'
]
if
max_font_sz
-
span_font_sz
<
1
:
# 先以字体过滤正文,如果是正文就不再继续判断了
continue
if
(
base_span_mid_y
-
span_mid_y
)
/
span_hi
>
0.2
or
(
base_span_mid_y
-
span_mid_y
>
0
and
abs
(
span_font_sz
-
min_font_sz
)
/
min_font_sz
<
0.1
):
"""
1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式
...
...
pre_proc/detect_equation.py
View file @
59bc15e0
import
os
import
collections
# 统计库
import
re
# 正则
import
re
from
libs.boxbase
import
_is_in
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pathlib
import
Path
def __solve_contain_bboxs(all_bbox_list: list):
    """Remove every equation bbox that is contained in another bbox of the list.

    Each entry is a sequence whose first four items are the bbox that is
    handed to ``_is_in``; any further items travel along with the entry
    untouched.  For every pair ``(i, j)`` with ``i < j`` the entry whose bbox
    lies inside the other one is flagged, and all flagged entries are dropped.

    Entries are flagged by *index*, not by value.  Removing by value (as
    ``list.remove`` does) would delete every entry that compares equal to a
    flagged one, so two identical entries could both disappear and the
    equation would be lost entirely.  With index-based flagging one copy of an
    exact duplicate always survives.
    NOTE(review): this only matters if ``_is_in`` reports identical boxes as
    contained in each other -- confirm against ``libs.boxbase``.

    :param all_bbox_list: list of bbox entries; it is modified in place.
    :return: the same list object, with contained entries removed and the
        relative order of the remaining entries preserved.
    """
    contained_indexes = set()
    total = len(all_bbox_list)

    for i in range(total):
        bbox1 = all_bbox_list[i][:4]
        for j in range(i + 1, total):
            bbox2 = all_bbox_list[j][:4]
            # Flag the inner (smaller) box of the pair.
            if _is_in(bbox1, bbox2):
                contained_indexes.add(i)
            elif _is_in(bbox2, bbox1):
                contained_indexes.add(j)

    # Slice-assign so callers holding a reference to the list still observe
    # the filtered content, exactly as with the previous in-place removal.
    all_bbox_list[:] = [
        entry
        for index, entry in enumerate(all_bbox_list)
        if index not in contained_indexes
    ]
    return all_bbox_list
def
parse_equations
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
"""
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
...
...
@@ -101,4 +127,5 @@ def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict
for
eq_box
in
equationIsolated_from_DocXChain_bboxs
:
eq_box
=
[
eq_box
[
0
]
+
cropbox
[
0
],
eq_box
[
1
]
+
cropbox
[
1
],
eq_box
[
2
]
+
cropbox
[
0
],
eq_box
[
3
]
+
cropbox
[
1
],
eq_box
[
4
]]
return
equationEmbedding_from_DocXChain_bboxs
,
equationIsolated_from_DocXChain_bboxs
deduped_embedding_eq_bboxes
=
__solve_contain_bboxs
(
equationEmbedding_from_DocXChain_bboxs
)
return
deduped_embedding_eq_bboxes
,
equationIsolated_from_DocXChain_bboxs
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment