Commit c9059987 authored by myhloli's avatar myhloli

fix(magic_pdf): correct color channel conversion for OCR in PDF extract

parent e7ce3051
......@@ -215,7 +215,8 @@ class CustomPEKModel:
})
# OCR识别
ocr_res = self.ocr_model.ocr(np.array(new_image), mfd_res=adjusted_mfdetrec_res)[0]
new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
# 整合结果
if ocr_res:
......
......@@ -301,60 +301,4 @@ class ModifiedPaddleOCR(PaddleOCR):
filter_rec_res.append(rec_result)
end = time.time()
time_dict['all'] = end - start
return filter_boxes, filter_rec_res, time_dict
if __name__ == '__main__':
def merge_intervals(intervals):
# Sort the intervals based on the start value
intervals.sort(key=lambda x: x[0])
merged = []
for interval in intervals:
# If the list of merged intervals is empty or if the current
# interval does not overlap with the previous, simply append it.
if not merged or merged[-1][1] < interval[0]:
merged.append(interval)
else:
# Otherwise, there is overlap, so we merge the current and previous intervals.
merged[-1][1] = max(merged[-1][1], interval[1])
return merged
def remove_intervals(original, masks):
# Merge all mask intervals
merged_masks = merge_intervals(masks)
result = []
original_start, original_end = original
for mask in merged_masks:
mask_start, mask_end = mask
# If the mask starts after the original range, ignore it
if mask_start > original_end:
continue
# If the mask ends before the original range starts, ignore it
if mask_end < original_start:
continue
# Remove the masked part from the original range
if original_start < mask_start:
result.append([original_start, mask_start - 1])
original_start = max(mask_end + 1, original_start)
# Add the remaining part of the original range, if any
if original_start <= original_end:
result.append([original_start, original_end])
return result
# Test the function
original_range = [1, 100]
masks = [[0, 15], [25, 40], [55, 80]]
result = remove_intervals(original_range, masks)
print(result) # Expected output: [[1, 4], [21, 59], [81, 100]]
return filter_boxes, filter_rec_res, time_dict
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment