Commit 71a042d9 authored by 赵小蒙

footnote检测逻辑更新

parent 779d2e8a
...@@ -75,7 +75,8 @@ def merge_footnote_blocks(page_info, main_text_font): ...@@ -75,7 +75,8 @@ def merge_footnote_blocks(page_info, main_text_font):
is_below(block['bbox'], footnote_bbox) and is_below(block['bbox'], footnote_bbox) and
sum([size >= main_text_size, sum([size >= main_text_size,
len(block['lines']) >= 5, len(block['lines']) >= 5,
block_font == main_text_font]) >= 2] block_font == main_text_font])
>= 2]
# 如果main_text_bboxes_below不为空,说明footnote下面有正文block,这个footnote不成立,跳过 # 如果main_text_bboxes_below不为空,说明footnote下面有正文block,这个footnote不成立,跳过
if len(main_text_bboxes_below) > 0: if len(main_text_bboxes_below) > 0:
continue continue
......
...@@ -104,7 +104,8 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_ ...@@ -104,7 +104,8 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
list: 符合规则的脚注文本块的边界框列表。 list: 符合规则的脚注文本块的边界框列表。
""" """
if page_id > 20: # if page_id > 20:
if page_id > 2: # 为保证精确度,先只筛选前3页
return [] return []
else: else:
# 存储每一行的文本块大小的列表 # 存储每一行的文本块大小的列表
...@@ -128,7 +129,7 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_ ...@@ -128,7 +129,7 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
block_line_sizes.append(line_size) block_line_sizes.append(line_size)
span_font = [(span['font'], len(span['text'])) for span in line['spans'] if 'font' in span and len(span['text']) > 0] span_font = [(span['font'], len(span['text'])) for span in line['spans'] if 'font' in span and len(span['text']) > 0]
if span_font: if span_font:
# # todo main_text_font应该用基于字数最多的字体而不是span级别的统计 # main_text_font应该用基于字数最多的字体而不是span级别的统计
# font_names.append(font_name for font_name in span_font) # font_names.append(font_name for font_name in span_font)
# block_fonts.append(font_name for font_name in span_font) # block_fonts.append(font_name for font_name in span_font)
for font, count in span_font: for font, count in span_font:
...@@ -158,9 +159,17 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_ ...@@ -158,9 +159,17 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
# and len(block['lines']) < 5] # and len(block['lines']) < 5]
footnote_bboxes = [block['bbox'] for block, block_size, block_font in block_sizes if footnote_bboxes = [block['bbox'] for block, block_size, block_font in block_sizes if
block['bbox'][1] > page_height * 0.6 and block['bbox'][1] > page_height * 0.6 and
sum([block_size < main_text_size, # 较为严格的规则
len(block['lines']) < 5, block_size < main_text_size and
block_font != main_text_font]) >= 2] (len(block['lines']) < 5 or
block_font != main_text_font)]
# 较为宽松的规则
# sum([block_size < main_text_size,
# len(block['lines']) < 5,
# block_font != main_text_font])
# >= 2]
return footnote_bboxes return footnote_bboxes
else: else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment