Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
c9059987
Commit
c9059987
authored
Jul 22, 2024
by
myhloli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix(magic_pdf): correct color channel conversion for OCR in PDF extract
parent
e7ce3051
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
58 deletions
+3
-58
pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+2
-1
self_modify.py
magic_pdf/model/pek_sub_modules/self_modify.py
+1
-57
No files found.
magic_pdf/model/pdf_extract_kit.py
View file @
c9059987
...
...
@@ -215,7 +215,8 @@ class CustomPEKModel:
})
# OCR识别
ocr_res
=
self
.
ocr_model
.
ocr
(
np
.
array
(
new_image
),
mfd_res
=
adjusted_mfdetrec_res
)[
0
]
new_image
=
cv2
.
cvtColor
(
np
.
asarray
(
new_image
),
cv2
.
COLOR_RGB2BGR
)
ocr_res
=
self
.
ocr_model
.
ocr
(
new_image
,
mfd_res
=
adjusted_mfdetrec_res
)[
0
]
# 整合结果
if
ocr_res
:
...
...
magic_pdf/model/pek_sub_modules/self_modify.py
View file @
c9059987
...
...
@@ -301,60 +301,4 @@ class ModifiedPaddleOCR(PaddleOCR):
filter_rec_res
.
append
(
rec_result
)
end
=
time
.
time
()
time_dict
[
'all'
]
=
end
-
start
return
filter_boxes
,
filter_rec_res
,
time_dict
if
__name__
==
'__main__'
:
def merge_intervals(intervals):
    """Merge overlapping ``[start, end]`` intervals.

    Intervals are ordered by their start value and any interval whose start
    is less than or equal to the end of the previously kept interval is
    folded into it.

    Args:
        intervals: Iterable of two-item sequences ``(start, end)``.

    Returns:
        A new list of ``[start, end]`` lists, sorted by start, with no two
        entries overlapping. The input and its items are left untouched.
    """
    merged = []
    # sorted() instead of list.sort(): the caller's list must not be reordered.
    for interval in sorted(intervals, key=lambda item: item[0]):
        if merged and interval[0] <= merged[-1][1]:
            # Overlap with the last kept interval: extend its end if needed.
            merged[-1][1] = max(merged[-1][1], interval[1])
        else:
            # Store a copy so that extending it later never writes through
            # to the caller's data (and so tuple intervals are accepted).
            merged.append(list(interval))
    return merged
def remove_intervals(original, masks):
    """Subtract mask intervals from a range and return what is left.

    Endpoints are treated as inclusive and stepped by one: the piece kept in
    front of a mask ends at ``mask_start - 1`` and scanning resumes at
    ``mask_end + 1``.

    Args:
        original: Two-item sequence ``(start, end)`` describing the range.
        masks: Sequence of ``(start, end)`` intervals to cut out. They may
            overlap and may be given in any order.

    Returns:
        A list of ``[start, end]`` lists covering the parts of ``original``
        that no mask touches, in ascending order.
    """
    # Hand merge_intervals private copies so the caller's masks are neither
    # reordered nor overwritten while they are being merged.
    merged_masks = merge_intervals([list(mask) for mask in masks])

    remaining = []
    cursor, range_end = original
    for mask_start, mask_end in merged_masks:
        if mask_start > range_end:
            # Merged masks are sorted by start, so every later mask also
            # lies beyond the range.
            break
        if mask_end < cursor:
            # Mask ends before the part of the range still to be emitted.
            continue
        if cursor < mask_start:
            # Keep the uncovered stretch in front of this mask.
            remaining.append([cursor, mask_start - 1])
        # Resume right after the mask; never move the cursor backwards.
        cursor = max(mask_end + 1, cursor)

    # Whatever is left after the last relevant mask.
    if cursor <= range_end:
        remaining.append([cursor, range_end])
    return remaining
# Demo: cut three mask intervals out of the inclusive range [1, 100].
original_range
=
[
1
,
100
]
masks
=
[[
0
,
15
],
[
25
,
40
],
[
55
,
80
]]
result
=
remove_intervals
(
original_range
,
masks
)
print
(
result
)
# Expected output: [[16, 24], [41, 54], [81, 100]]
return
filter_boxes
,
filter_rec_res
,
time_dict
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment