Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
fc107725
Commit
fc107725
authored
Mar 28, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr_construct_page_component 位置移动
parent
433684c6
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
27 additions
and
25 deletions
+27
-25
pdf_parse_by_model.py
magic_pdf/pdf_parse_by_model.py
+1
-1
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+2
-23
pdf_parse_for_train.py
magic_pdf/pdf_parse_for_train.py
+1
-1
construct_page_dict.py
magic_pdf/pre_proc/construct_page_dict.py
+23
-0
No files found.
magic_pdf/pdf_parse_by_model.py
View file @
fc107725
...
@@ -53,7 +53,7 @@ from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
...
@@ -53,7 +53,7 @@ from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from
magic_pdf.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
magic_pdf.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
magic_pdf.pre_proc.pdf_pre_filter
import
pdf_filter
from
magic_pdf.pre_proc.pdf_pre_filter
import
pdf_filter
from
magic_pdf.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
magic_pdf.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
magic_pdf.pre_proc.construct_pa
ras
import
construct_page_component
from
magic_pdf.pre_proc.construct_pa
ge_dict
import
construct_page_component
from
magic_pdf.pre_proc.fix_image
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
magic_pdf.pre_proc.fix_image
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
magic_pdf.post_proc.pdf_post_filter
import
pdf_post_filter
from
magic_pdf.post_proc.pdf_post_filter
import
pdf_post_filter
from
magic_pdf.pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
magic_pdf.pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
fc107725
...
@@ -18,6 +18,7 @@ from magic_pdf.libs.drop_tag import DropTag
...
@@ -18,6 +18,7 @@ from magic_pdf.libs.drop_tag import DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.para.para_split
import
para_split
from
magic_pdf.para.para_split
import
para_split
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_header
import
parse_headers
from
magic_pdf.pre_proc.detect_header
import
parse_headers
...
@@ -33,28 +34,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
...
@@ -33,28 +34,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                             images, tables, interline_equations, inline_equations,
                             dropped_text_block, dropped_image_block, dropped_table_block,
                             dropped_equation_block,
                             need_remove_spans_bboxes_dict):
    """Bundle the per-page OCR parsing results into a single page-info dict.

    Every argument is stored unchanged (no copy, no validation) under a
    fixed key; ``page_w`` and ``page_h`` are combined into the two-element
    list stored under ``'page_size'``.

    Args:
        blocks: stored under ``'preproc_blocks'``.
        layout_bboxes: stored under ``'layout_bboxes'``.
        page_id: stored under ``'page_idx'``.
        page_w: first element of ``'page_size'``.
        page_h: second element of ``'page_size'``.
        layout_tree: stored under ``'_layout_tree'``.
        images: stored under ``'images'``.
        tables: stored under ``'tables'``.
        interline_equations: stored under ``'interline_equations'``.
        inline_equations: stored under ``'inline_equations'``.
        dropped_text_block: stored under ``'droped_text_block'``.
        dropped_image_block: stored under ``'droped_image_block'``.
        dropped_table_block: stored under ``'droped_table_block'``.
        dropped_equation_block: stored under ``'dropped_equation_block'``.
        need_remove_spans_bboxes_dict: stored under ``'droped_bboxes'``.

    Returns:
        dict: a new dict holding the values above, in the order listed.
    """
    # NOTE(review): the key spellings are inconsistent ('droped_*' vs.
    # 'dropped_equation_block'). They are kept exactly as-is because code
    # outside this block presumably looks them up by these literal names;
    # confirm against every consumer before renaming.
    return_dict = {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'inline_equations': inline_equations,
        'droped_text_block': dropped_text_block,
        'droped_image_block': dropped_image_block,
        'droped_table_block': dropped_table_block,
        'dropped_equation_block': dropped_equation_block,
        'droped_bboxes': need_remove_spans_bboxes_dict,
    }
    return return_dict
def
parse_pdf_by_ocr
(
def
parse_pdf_by_ocr
(
pdf_path
,
pdf_path
,
...
@@ -254,7 +233,7 @@ def parse_pdf_by_ocr(
...
@@ -254,7 +233,7 @@ def parse_pdf_by_ocr(
dropped_equation_block
.
append
(
span
)
dropped_equation_block
.
append
(
span
)
'''构造pdf_info_dict'''
'''构造pdf_info_dict'''
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
ocr_
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
dropped_equation_block
,
...
...
magic_pdf/pdf_parse_for_train.py
View file @
fc107725
...
@@ -75,7 +75,7 @@ from magic_pdf.pre_proc.equations_replace import (
...
@@ -75,7 +75,7 @@ from magic_pdf.pre_proc.equations_replace import (
)
)
from
magic_pdf.pre_proc.pdf_pre_filter
import
pdf_filter
from
magic_pdf.pre_proc.pdf_pre_filter
import
pdf_filter
from
magic_pdf.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
magic_pdf.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
magic_pdf.pre_proc.construct_pa
ras
import
construct_page_component
from
magic_pdf.pre_proc.construct_pa
ge_dict
import
construct_page_component
from
magic_pdf.pre_proc.fix_image
import
(
from
magic_pdf.pre_proc.fix_image
import
(
combine_images
,
combine_images
,
fix_image_vertical
,
fix_image_vertical
,
...
...
magic_pdf/pre_proc/construct_pa
ras
.py
→
magic_pdf/pre_proc/construct_pa
ge_dict
.py
View file @
fc107725
...
@@ -28,3 +28,26 @@ def construct_page_component(page_id, image_info, table_info, text_blocks_prepr
...
@@ -28,3 +28,26 @@ def construct_page_component(page_id, image_info, table_info, text_blocks_prepr
return_dict
[
'footnote_bboxes_tmp'
]
=
footnote_bboxes_tmp
return_dict
[
'footnote_bboxes_tmp'
]
=
footnote_bboxes_tmp
return
return_dict
return
return_dict
def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                 images, tables, interline_equations, inline_equations,
                                 dropped_text_block, dropped_image_block, dropped_table_block,
                                 dropped_equation_block,
                                 need_remove_spans_bboxes_dict):
    """Bundle the per-page OCR parsing results into a single page-info dict.

    Every argument is stored unchanged (no copy, no validation) under a
    fixed key; ``page_w`` and ``page_h`` are combined into the two-element
    list stored under ``'page_size'``.

    Args:
        blocks: stored under ``'preproc_blocks'``.
        layout_bboxes: stored under ``'layout_bboxes'``.
        page_id: stored under ``'page_idx'``.
        page_w: first element of ``'page_size'``.
        page_h: second element of ``'page_size'``.
        layout_tree: stored under ``'_layout_tree'``.
        images: stored under ``'images'``.
        tables: stored under ``'tables'``.
        interline_equations: stored under ``'interline_equations'``.
        inline_equations: stored under ``'inline_equations'``.
        dropped_text_block: stored under ``'droped_text_block'``.
        dropped_image_block: stored under ``'droped_image_block'``.
        dropped_table_block: stored under ``'droped_table_block'``.
        dropped_equation_block: stored under ``'dropped_equation_block'``.
        need_remove_spans_bboxes_dict: stored under ``'droped_bboxes'``.

    Returns:
        dict: a new dict holding the values above, in the order listed.
    """
    # NOTE(review): the key spellings are inconsistent ('droped_*' vs.
    # 'dropped_equation_block'). They are kept exactly as-is because code
    # outside this block presumably looks them up by these literal names;
    # confirm against every consumer before renaming.
    return_dict = {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'inline_equations': inline_equations,
        'droped_text_block': dropped_text_block,
        'droped_image_block': dropped_image_block,
        'droped_table_block': dropped_table_block,
        'dropped_equation_block': dropped_equation_block,
        'droped_bboxes': need_remove_spans_bboxes_dict,
    }
    return return_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment