Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
dcf6e712
Commit
dcf6e712
authored
Apr 22, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
将ocr_parse逻辑切换到v2,并解决几个parse过程中的error
parent
6b6bad4c
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
7 additions
and
7 deletions
+7
-7
pdf_parse_by_ocr_v2.py
magic_pdf/pdf_parse_by_ocr_v2.py
+0
-0
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+3
-3
ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+3
-3
user_api.py
magic_pdf/user_api.py
+1
-1
No files found.
magic_pdf/parse_by_ocr_v2.py
→
magic_pdf/pdf_parse_by_ocr_v2.py
View file @
dcf6e712
File moved
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
dcf6e712
...
@@ -252,7 +252,7 @@ def fix_image_block(block, img_blocks):
...
@@ -252,7 +252,7 @@ def fix_image_block(block, img_blocks):
break
break
# 根据list长度,判断img_block中是否有img_caption
# 根据list长度,判断img_block中是否有img_caption
if
len
(
img_block
[
'img_caption_bbox'
])
>
0
:
if
img_block
[
'img_caption_bbox'
]
is
not
None
:
img_caption_block
,
img_caption_spans
=
merge_spans_to_block
(
img_caption_block
,
img_caption_spans
=
merge_spans_to_block
(
block
[
'spans'
],
img_block
[
'img_caption_bbox'
],
BlockType
.
ImageCaption
block
[
'spans'
],
img_block
[
'img_caption_bbox'
],
BlockType
.
ImageCaption
)
)
...
@@ -280,7 +280,7 @@ def fix_table_block(block, table_blocks):
...
@@ -280,7 +280,7 @@ def fix_table_block(block, table_blocks):
break
break
# 根据list长度,判断table_block中是否有caption
# 根据list长度,判断table_block中是否有caption
if
len
(
table_block
[
'table_caption_bbox'
])
>
0
:
if
table_block
[
'table_caption_bbox'
]
is
not
None
:
table_caption_block
,
table_caption_spans
=
merge_spans_to_block
(
table_caption_block
,
table_caption_spans
=
merge_spans_to_block
(
block
[
'spans'
],
table_block
[
'table_caption_bbox'
],
BlockType
.
TableCaption
block
[
'spans'
],
table_block
[
'table_caption_bbox'
],
BlockType
.
TableCaption
)
)
...
@@ -293,7 +293,7 @@ def fix_table_block(block, table_blocks):
...
@@ -293,7 +293,7 @@ def fix_table_block(block, table_blocks):
block
[
'spans'
]
.
remove
(
span
)
block
[
'spans'
]
.
remove
(
span
)
# 根据list长度,判断table_block中是否有table_note
# 根据list长度,判断table_block中是否有table_note
if
len
(
table_block
[
'table_footnote_bbox'
])
>
0
:
if
table_block
[
'table_footnote_bbox'
]
is
not
None
:
table_footnote_block
,
table_footnote_spans
=
merge_spans_to_block
(
table_footnote_block
,
table_footnote_spans
=
merge_spans_to_block
(
block
[
'spans'
],
table_block
[
'table_footnote_bbox'
],
BlockType
.
TableFootnote
block
[
'spans'
],
table_block
[
'table_footnote_bbox'
],
BlockType
.
TableFootnote
)
)
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
dcf6e712
...
@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
...
@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
interline_equations
=
[]
interline_equations
=
[]
for
block
in
blocks
:
for
block
in
blocks
:
if
block
[
"type"
]
==
BlockType
.
Image
:
if
block
[
"block_type"
]
==
BlockType
.
Image
:
images
.
append
(
block
)
images
.
append
(
block
)
elif
block
[
"type"
]
==
BlockType
.
Table
:
elif
block
[
"block_type"
]
==
BlockType
.
Table
:
tables
.
append
(
block
)
tables
.
append
(
block
)
elif
block
[
"type"
]
==
BlockType
.
InterlineEquation
:
elif
block
[
"block_type"
]
==
BlockType
.
InterlineEquation
:
interline_equations
.
append
(
block
)
interline_equations
.
append
(
block
)
return
images
,
tables
,
interline_equations
return
images
,
tables
,
interline_equations
magic_pdf/user_api.py
View file @
dcf6e712
...
@@ -15,7 +15,7 @@
...
@@ -15,7 +15,7 @@
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment