Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
143f8114
Commit
143f8114
authored
Mar 18, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/master'
# Conflicts: # magic_pdf/libs/drop_tag.py
parents
5eab010b
83753cbd
Changes
10
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
243 additions
and
117 deletions
+243
-117
demo_test.py
demo/demo_test.py
+11
-5
ocr_demo.py
demo/ocr_demo.py
+4
-4
mkcontent.py
magic_pdf/dict2md/mkcontent.py
+187
-83
drop_tag.py
magic_pdf/libs/drop_tag.py
+8
-0
para_split.py
magic_pdf/para/para_split.py
+14
-11
pipeline.py
magic_pdf/pipeline.py
+5
-4
equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+3
-2
remove_footer_header.py
magic_pdf/pre_proc/remove_footer_header.py
+4
-3
remove_rotate_bbox.py
magic_pdf/pre_proc/remove_rotate_bbox.py
+4
-3
resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+3
-2
No files found.
demo/demo_test.py
View file @
143f8114
...
@@ -5,6 +5,7 @@ from pathlib import Path
...
@@ -5,6 +5,7 @@ from pathlib import Path
import
click
import
click
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf.pipeline
import
(
from
magic_pdf.pipeline
import
(
meta_scan
,
meta_scan
,
classify_by_type
,
classify_by_type
,
...
@@ -55,14 +56,19 @@ def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
...
@@ -55,14 +56,19 @@ def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
write_json_to_local
(
jso
,
book_name
)
write_json_to_local
(
jso
,
book_name
)
jso_md
=
pdf_intermediate_dict_to_markdown
(
jso
,
debug_mode
=
debug_mode
)
jso_md
=
pdf_intermediate_dict_to_markdown
(
jso
,
debug_mode
=
debug_mode
)
md_content
=
jso_md
.
get
(
"content"
)
content
=
jso_md
.
get
(
"content_list"
)
markdown_content
=
mk_mm_markdown
(
content
)
if
book_name
is
not
None
:
if
book_name
is
not
None
:
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
,
"md"
,
book_name
)
markdown_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
+
".md"
)
uni_format_save_path
=
join_path
(
save_tmp_path
,
"book"
+
".json"
)
markdown_save_path
=
join_path
(
save_tmp_path
,
"book"
+
".md"
)
with
open
(
uni_format_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
json
.
dumps
(
content
,
ensure_ascii
=
False
,
indent
=
4
))
with
open
(
markdown_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
markdown_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
md_content
)
f
.
write
(
markdown_content
)
else
:
else
:
logger
.
info
(
md_content
)
logger
.
info
(
json
.
dumps
(
content
,
ensure_ascii
=
False
)
)
def
demo_save_tables
(
book_name
=
None
,
start_page_id
=
0
,
debug_mode
=
True
):
def
demo_save_tables
(
book_name
=
None
,
start_page_id
=
0
,
debug_mode
=
True
):
...
...
demo/ocr_demo.py
View file @
143f8114
...
@@ -30,13 +30,13 @@ def read_json_file(file_path):
...
@@ -30,13 +30,13 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
#
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
#
ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
ocr_pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
#
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
try
:
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
...
...
magic_pdf/dict2md/mkcontent.py
View file @
143f8114
This diff is collapsed.
Click to expand it.
magic_pdf/libs/drop_tag.py
View file @
143f8114
COLOR_BG_HEADER_TXT_BLOCK
=
"color_background_header_txt_block"
COLOR_BG_HEADER_TXT_BLOCK
=
"color_background_header_txt_block"
PAGE_NO
=
"page-no"
# 页码
CONTENT_IN_FOOT_OR_HEADER
=
'in-foot-header-area'
# 页眉页脚内的文本
VERTICAL_TEXT
=
'vertical-text'
# 垂直文本
ROTATE_TEXT
=
'rotate-text'
# 旋转文本
EMPTY_SIDE_BLOCK
=
'empty-side-block'
# 边缘上的空白没有任何内容的block
ON_IMAGE_TEXT
=
'on-image-text'
# 文本在图片上
ON_TABLE_TEXT
=
'on-table-text'
# 文本在表格上
class
DropTag
:
class
DropTag
:
...
...
magic_pdf/para/para_split.py
View file @
143f8114
...
@@ -3,11 +3,12 @@ import numpy as np
...
@@ -3,11 +3,12 @@ import numpy as np
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
_is_in
from
magic_pdf.libs.boxbase
import
_is_in
from
magic_pdf.libs.ocr_content_type
import
ContentType
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
INLINE_EQUATION
=
'inline_equation'
INLINE_EQUATION
=
ContentType
.
InlineEquation
INTER
_EQUATION
=
"displayed_equation"
INTER
LINE_EQUATION
=
ContentType
.
InterlineEquation
TEXT
=
"text"
TEXT
=
"text"
def
__add_line_period
(
blocks
,
layout_bboxes
):
def
__add_line_period
(
blocks
,
layout_bboxes
):
...
@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
...
@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
last_span
=
line
[
'spans'
][
-
1
]
last_span
=
line
[
'spans'
][
-
1
]
span_type
=
last_span
[
'type'
]
span_type
=
last_span
[
'type'
]
if
span_type
in
[
TEXT
,
INLINE_EQUATION
]:
if
span_type
in
[
INLINE_EQUATION
]:
span_content
=
last_span
[
'content'
]
.
strip
()
span_content
=
last_span
[
'content'
]
.
strip
()
if
span_type
==
INLINE_EQUATION
and
span_content
[
-
1
]
not
in
LINE_STOP_FLAG
:
if
span_type
==
INLINE_EQUATION
and
span_content
[
-
1
]
not
in
LINE_STOP_FLAG
:
if
span_type
in
[
INLINE_EQUATION
,
INTER_EQUATION
]:
if
span_type
in
[
INLINE_EQUATION
,
INTER
LINE
_EQUATION
]:
last_span
[
'content'
]
=
span_content
+
'.'
last_span
[
'content'
]
=
span_content
+
'.'
def
__valign_lines
(
blocks
,
layout_bboxes
):
def
__valign_lines
(
blocks
,
layout_bboxes
):
"""
"""
对齐行的左侧和右侧。
在一个layoutbox内对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过3就强行对齐到所处layout的左右两侧(和layout有一段距离)。
扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来
3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
"""
"""
min_distance
=
3
min_distance
=
3
...
@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
...
@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
else
:
else
:
para
.
append
(
line
)
para
.
append
(
line
)
else
:
# 其他,图片、表格、行间公式,各自占一段
else
:
# 其他,图片、表格、行间公式,各自占一段
para
.
append
(
line
)
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
para
=
[]
else
:
paras
.
append
([
line
])
para
=
[]
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
# logger.info(para_text)
para
=
[]
if
len
(
para
)
>
0
:
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
...
...
magic_pdf/pipeline.py
View file @
143f8114
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markd
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markd
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
,
s3_image_save_path
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
,
s3_image_save_path
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.dict2md.mkcontent
import
mk_nlp_markdown
from
magic_pdf.dict2md.mkcontent
import
mk_nlp_markdown
,
mk_universal_format
from
magic_pdf.pdf_parse_by_model
import
parse_pdf_by_model
from
magic_pdf.pdf_parse_by_model
import
parse_pdf_by_model
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
...
@@ -237,9 +237,10 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
...
@@ -237,9 +237,10 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
pdf_intermediate_dict
=
jso
[
'pdf_intermediate_dict'
]
pdf_intermediate_dict
=
jso
[
'pdf_intermediate_dict'
]
# 将 pdf_intermediate_dict 解压
# 将 pdf_intermediate_dict 解压
pdf_intermediate_dict
=
JsonCompressor
.
decompress_json
(
pdf_intermediate_dict
)
pdf_intermediate_dict
=
JsonCompressor
.
decompress_json
(
pdf_intermediate_dict
)
markdown_content
=
mk_nlp_markdown
(
pdf_intermediate_dict
)
#markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
jso
[
"content"
]
=
markdown_content
jso
[
'content_list'
]
=
mk_universal_format
(
pdf_intermediate_dict
)
logger
.
info
(
f
"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
,
file
=
sys
.
stderr
)
#jso["content"] = markdown_content
logger
.
info
(
f
"book_name is:{get_data_source(jso)}/{jso['file_id']}"
)
# 把无用的信息清空
# 把无用的信息清空
jso
[
"doc_layout_result"
]
=
""
jso
[
"doc_layout_result"
]
=
""
jso
[
"pdf_intermediate_dict"
]
=
""
jso
[
"pdf_intermediate_dict"
]
=
""
...
...
magic_pdf/pre_proc/equations_replace.py
View file @
143f8114
...
@@ -6,9 +6,10 @@ import json
...
@@ -6,9 +6,10 @@ import json
import
os
import
os
from
pathlib
import
Path
from
pathlib
import
Path
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.ocr_content_type
import
ContentType
TYPE_INLINE_EQUATION
=
"inline-equation"
TYPE_INLINE_EQUATION
=
ContentType
.
InlineEquation
TYPE_INTERLINE_EQUATION
=
"interline-equation"
TYPE_INTERLINE_EQUATION
=
ContentType
.
InterlineEquation
def
combine_chars_to_pymudict
(
block_dict
,
char_dict
):
def
combine_chars_to_pymudict
(
block_dict
,
char_dict
):
...
...
magic_pdf/pre_proc/remove_footer_header.py
View file @
143f8114
import
re
import
re
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf.libs.drop_tag
import
CONTENT_IN_FOOT_OR_HEADER
,
PAGE_NO
def
remove_headder_footer_one_page
(
text_raw_blocks
,
image_bboxes
,
table_bboxes
,
header_bboxs
,
footer_bboxs
,
def
remove_headder_footer_one_page
(
text_raw_blocks
,
image_bboxes
,
table_bboxes
,
header_bboxs
,
footer_bboxs
,
...
@@ -67,7 +68,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
...
@@ -67,7 +68,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
blk
[
'lines'
]
.
remove
(
line
)
blk
[
'lines'
]
.
remove
(
line
)
else
:
else
:
# if not blk['lines']:
# if not blk['lines']:
blk
[
'tag'
]
=
'in-foot-header-area'
blk
[
'tag'
]
=
CONTENT_IN_FOOT_OR_HEADER
text_block_to_remove
.
append
(
blk
)
text_block_to_remove
.
append
(
blk
)
"""有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
"""有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
...
@@ -80,7 +81,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
...
@@ -80,7 +81,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
if
_is_in_or_part_overlap
(
pagenobox
,
span
[
'bbox'
]):
if
_is_in_or_part_overlap
(
pagenobox
,
span
[
'bbox'
]):
# span['text'] = ''
# span['text'] = ''
span
[
'tag'
]
=
"page-no"
span
[
'tag'
]
=
PAGE_NO
# 检查这个block是否只有这一个span,如果是,那么就把这个block也删除
# 检查这个block是否只有这一个span,如果是,那么就把这个block也删除
if
len
(
line
[
'spans'
])
==
1
and
len
(
block
[
'lines'
])
==
1
:
if
len
(
line
[
'spans'
])
==
1
and
len
(
block
[
'lines'
])
==
1
:
page_no_block_2_remove
.
append
(
block
)
page_no_block_2_remove
.
append
(
block
)
...
@@ -96,7 +97,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
...
@@ -96,7 +97,7 @@ def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes,
if
last_span
[
'text'
]
.
strip
()
and
not
re
.
search
(
'[a-zA-Z]'
,
last_span
[
'text'
])
and
re
.
search
(
'[0-9]'
,
if
last_span
[
'text'
]
.
strip
()
and
not
re
.
search
(
'[a-zA-Z]'
,
last_span
[
'text'
])
and
re
.
search
(
'[0-9]'
,
last_span
[
last_span
[
'text'
]):
'text'
]):
last_span
[
'tag'
]
=
"page-no"
last_span
[
'tag'
]
=
PAGE_NO
page_no_block_2_remove
.
append
(
last_block
)
page_no_block_2_remove
.
append
(
last_block
)
for
b
in
page_no_block_2_remove
:
for
b
in
page_no_block_2_remove
:
...
...
magic_pdf/pre_proc/remove_rotate_bbox.py
View file @
143f8114
import
math
import
math
from
magic_pdf.libs.boxbase
import
is_vbox_on_side
from
magic_pdf.libs.boxbase
import
is_vbox_on_side
from
magic_pdf.libs.drop_tag
import
EMPTY_SIDE_BLOCK
,
ROTATE_TEXT
,
VERTICAL_TEXT
def
detect_non_horizontal_texts
(
result_dict
):
def
detect_non_horizontal_texts
(
result_dict
):
...
@@ -134,13 +135,13 @@ def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
...
@@ -134,13 +135,13 @@ def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
is_box_valign
=
(
len
(
set
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
]))
==
1
)
and
(
len
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
])
>
1
)
# 测试bbox在垂直方向是不是x0都相等,也就是在垂直方向排列.同时必须大于等于2个字
is_box_valign
=
(
len
(
set
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
]))
==
1
)
and
(
len
([
int
(
line
[
'spans'
][
0
][
'bbox'
][
0
]
)
for
line
in
lines
if
len
(
line
[
'spans'
])
>
0
])
>
1
)
# 测试bbox在垂直方向是不是x0都相等,也就是在垂直方向排列.同时必须大于等于2个字
if
is_box_valign
:
if
is_box_valign
:
block
[
'tag'
]
=
"vertical-text"
block
[
'tag'
]
=
VERTICAL_TEXT
removed_text_block
.
append
(
block
)
removed_text_block
.
append
(
block
)
continue
continue
for
line
in
lines
:
for
line
in
lines
:
if
line
[
'dir'
]
!=
(
1
,
0
):
if
line
[
'dir'
]
!=
(
1
,
0
):
block
[
'tag'
]
=
"rotate"
block
[
'tag'
]
=
ROTATE_TEXT
removed_text_block
.
append
(
block
)
# 只要有一个line不是dir=(1,0),就把整个block都删掉
removed_text_block
.
append
(
block
)
# 只要有一个line不是dir=(1,0),就把整个block都删掉
break
break
...
@@ -177,7 +178,7 @@ def remove_side_blank_block(pymu_text_block, page_width, page_height):
...
@@ -177,7 +178,7 @@ def remove_side_blank_block(pymu_text_block, page_width, page_height):
continue
continue
if
__is_empty_side_box
(
block
):
if
__is_empty_side_box
(
block
):
block
[
'tag'
]
=
"empty-side-block"
block
[
'tag'
]
=
EMPTY_SIDE_BLOCK
removed_text_block
.
append
(
block
)
removed_text_block
.
append
(
block
)
continue
continue
...
...
magic_pdf/pre_proc/resolve_bbox_conflict.py
View file @
143f8114
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
"""
"""
from
magic_pdf.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
from
magic_pdf.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
from
magic_pdf.libs.drop_tag
import
ON_IMAGE_TEXT
,
ON_TABLE_TEXT
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
...
@@ -27,14 +28,14 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
...
@@ -27,14 +28,14 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
for
text_block
in
text_raw_blocks
:
for
text_block
in
text_raw_blocks
:
text_bbox
=
text_block
[
"bbox"
]
text_bbox
=
text_block
[
"bbox"
]
if
_is_in
(
text_bbox
,
image_box
):
if
_is_in
(
text_bbox
,
image_box
):
text_block
[
'tag'
]
=
"on-image"
text_block
[
'tag'
]
=
ON_IMAGE_TEXT
text_block_removed
.
append
(
text_block
)
text_block_removed
.
append
(
text_block
)
# 去掉table上的文字block
# 去掉table上的文字block
for
table_box
in
tables
:
for
table_box
in
tables
:
for
text_block
in
text_raw_blocks
:
for
text_block
in
text_raw_blocks
:
text_bbox
=
text_block
[
"bbox"
]
text_bbox
=
text_block
[
"bbox"
]
if
_is_in
(
text_bbox
,
table_box
):
if
_is_in
(
text_bbox
,
table_box
):
text_block
[
'tag'
]
=
"on-table"
text_block
[
'tag'
]
=
ON_TABLE_TEXT
text_block_removed
.
append
(
text_block
)
text_block_removed
.
append
(
text_block
)
for
text_block
in
text_block_removed
:
for
text_block
in
text_block_removed
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment