Commit 11e4e8cc authored by 赵小蒙

重构目录结构

parent 56213908
......@@ -3,12 +3,11 @@ import sys
from pathlib import Path
import click
import json
from loguru import logger
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
from mkcontent import mk_mm_markdown, mk_nlp_markdown
from pdf_parse_by_model import parse_pdf_by_model
from libs.commons import join_path
from dict2md.mkcontent import mk_mm_markdown
from pipeline.pdf_parse_by_model import parse_pdf_by_model
......@@ -17,7 +16,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
pth = Path(s3_pdf_path)
book_name = pth.name
# book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "..","tmp", "unittest")
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
save_path = join_path(save_tmp_path, "md")
text_content_save_path = f"{save_path}/{book_name}/book.md"
# metadata_save_path = f"{save_path}/{book_name}/metadata.json"
......
import os
import sys
from pathlib import Path
import click
import json
from loguru import logger
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
from mkcontent import mk_nlp_markdown
from pdf2md import main
from pdf_parse_by_model import parse_pdf_by_model
from demo.pdf2md import main
@click.command()
......
import sys
from typing import Tuple
import os
import click
import boto3, json
from botocore.config import Config
from libs.commons import fitz
......@@ -24,16 +23,11 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf2text_recogFigure import parse_images # 获取figures的bbox
from pre_proc.detect_images import parse_images # 获取figures的bbox
from pdf2text_recogTable import parse_tables # 获取tables的bbox
from pdf2text_recogEquation import parse_equations # 获取equations的bbox
from pdf2text_recogTitle import parse_titles # 获取titles的bbox
from pdf2text_recogHeader import parse_headers # 获取headers的bbox
from pdf2text_recogPageNo import parse_pageNos # 获取pageNos的bbox
from pre_proc.detect_equation import parse_equations # 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from pdf2text_recogFooter import parse_footers # 获取footers的bbox
from pdf2text_evaluatePdfLayout import evaluate_pdf_layout # 评估页面的Layout是否是规整的。
from pdf2text_recogPara import process_blocks_per_page, postprocess_paras_pipeline
from pdf2text_recogPara import process_blocks_per_page
from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path
......
......@@ -15,14 +15,13 @@ from libs.drop_reason import DropReason
from libs.markdown_utils import escape_special_markdown_char
from libs.safe_filename import sanitize_filename
from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from pdf2text_recogFigure import parse_images
from pdf2text_recogFootnoteLine import remove_headder_footer_one_page # 获取figures的bbox
from pre_proc.detect_images import parse_images
from pdf2text_recogTable import parse_tables # 获取tables的bbox
from pdf2text_recogEquation import parse_equations # 获取equations的bbox
from pre_proc.detect_equation import parse_equations # 获取equations的bbox
from pdf2text_recogHeader import parse_headers # 获取headers的bbox
from pdf2text_recogPageNo import parse_pageNos # 获取pageNos的bbox
from pdf2text_recogFootnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
from pdf2text_recogFooter import parse_footers # 获取footers的bbox
from pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
from pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
from pdf2text_recogPara import (
ParaProcessPipeline,
......@@ -34,6 +33,7 @@ from pdf2text_recogPara import (
)
from pre_proc.main_text_font import get_main_text_font
from pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
from pre_proc.remove_footer_header import remove_headder_footer_one_page
'''
from para.para_pipeline import ParaProcessPipeline
......@@ -48,17 +48,17 @@ from para.exceptions import (
from libs.commons import read_file, join_path
from libs.pdf_image_tools import save_images_by_bboxes
from post_proc.footnote_remove import merge_footnote_blocks, remove_footnote_blocks
from post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from pre_proc.citationmarker_remove import remove_citation_marker
from pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
from pre_proc.pdf_filter import pdf_filter
from pre_proc.detect_footer_header import drop_footer_header
from pre_proc.pdf_pre_filter import pdf_filter
from pre_proc.detect_footer_header_by_statistics import drop_footer_header
from pre_proc.construct_paras import construct_page_component
from pre_proc.image_fix import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from post_proc.pdf_post_filter import pdf_post_filter
from pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from pre_proc.table_fix import fix_table_text_block, fix_tables, include_table_title
from pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
titleDetectionException_msg = TitleDetectionException().message
......@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode=False,
):
pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "tmp", "unittest")
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
md_bookname_save_path = ""
book_name = sanitize_filename(book_name)
if debug_mode:
......
from libs.boxbase import _is_in
from pdf2text_recogFootnoteLine import remove_footnote_text, remove_footnote_image
from libs.boxbase import _is_in, _is_in_or_part_overlap
import collections # 统计库
......@@ -113,3 +112,41 @@ def remove_footnote_blocks(page_info):
del page_info['merged_bboxes']
del page_info['footnote_bboxes_tmp']
return page_info
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """Split the text blocks of a page into body blocks and footnote blocks.

    A block counts as a footnote when ``_is_in_or_part_overlap(block['bbox'],
    footnote_bbox)`` is true for at least one entry of ``footnote_bboxes``.
    Matching blocks get ``block['tag'] = 'footnote'`` and are taken out of
    ``raw_text_block`` (the list is modified in place).

    :param raw_text_block: list of text-block dicts of the current page; each
        dict must have a ``'bbox'`` key.
    :param footnote_bboxes: list of footnote bboxes of the current page.
    :return: ``(raw_text_block, footnote_text_blocks)`` -- the same list object
        that was passed in, now without the footnote blocks, and the list of
        removed (tagged) blocks in their original order.
    """
    footnote_text_blocks = []
    body_blocks = []
    for block in raw_text_block:
        text_bbox = block['bbox']
        # TODO: stricter would be to decide this at line level.
        if any(_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes):
            block['tag'] = 'footnote'
            footnote_text_blocks.append(block)
        else:
            body_blocks.append(block)

    # Removing while iterating would skip elements, so the list is rewritten
    # once afterwards; slice assignment keeps the caller's list object.
    raw_text_block[:] = body_blocks
    return raw_text_block, footnote_text_blocks
def remove_footnote_image(image_blocks, footnote_bboxes):
    """Split the image blocks of a page into regular images and footnote images.

    An image block counts as a footnote image when
    ``_is_in(image_block['bbox'], footnote_bbox)`` is true for at least one
    entry of ``footnote_bboxes``. Matching blocks are taken out of
    ``image_blocks`` (the list is modified in place).

    :param image_blocks: list of image-block dicts of the current page; each
        dict must have a ``'bbox'`` key.
    :param footnote_bboxes: list of footnote bboxes of the current page.
    :return: ``(image_blocks, footnote_imgs_blocks)`` -- the same list object
        that was passed in, now without the footnote images, and the list of
        removed image blocks in their original order.
    """
    footnote_imgs_blocks = []
    kept_blocks = []
    for image_block in image_blocks:
        if any(_is_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes):
            footnote_imgs_blocks.append(image_block)
        else:
            kept_blocks.append(image_block)

    # Rewrite the list once instead of calling list.remove() per element;
    # slice assignment keeps the caller's list object.
    image_blocks[:] = kept_blocks
    return image_blocks, footnote_imgs_blocks
\ No newline at end of file
import re
from libs.boxbase import _is_in_or_part_overlap
def _union_bbox(bboxes):
    """Return the smallest [x0, y0, x1, y1] enclosing all ``bboxes``, or [] when there are none."""
    if not bboxes:
        return []
    return [
        min(x0 for x0, _, _, _ in bboxes),
        min(y0 for _, y0, _, _ in bboxes),
        max(x1 for _, _, x1, _ in bboxes),
        max(y1 for _, _, _, y1 in bboxes),
    ]


def _strip_header_footer_spans(block, header, footer, header_y0):
    """Drop spans lying in the header/footer bands, then drop lines left without spans (in place)."""
    for line in block['lines']:
        line['spans'][:] = [
            span for span in line['spans']
            if not (span['bbox'][3] < header_y0
                    or _is_in_or_part_overlap(span['bbox'], header)
                    or _is_in_or_part_overlap(span['bbox'], footer))
        ]
    block['lines'][:] = [line for line in block['lines'] if line['spans']]


def _split_by_content_overlap(bboxes, content_boundry):
    """Partition ``bboxes`` into (overlapping the content area, not overlapping it)."""
    kept, dropped = [], []
    for bbox in bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            kept.append(bbox)
        else:
            dropped.append(bbox)
    return kept, dropped


def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Remove header, footer and page-number content from one page.

    Removal works bottom-up: spans inside the header/footer bands are removed
    from their lines, lines left without spans are removed from their block,
    and blocks left without lines are tagged ``'in-foot-header-area'`` and
    moved to the removal list. Blocks that consist of a single page-number
    span are moved to the removal list as well (their span is tagged
    ``'page-no'``).

    :param text_raw_blocks: list of text-block dicts (``'bbox'``, ``'lines'``
        -> ``'spans'`` -> ``'bbox'``/``'text'``). Modified in place; when
        ``page_no_bboxs`` is empty the list is additionally sorted by
        descending ``bbox[1]``.
    :param image_bboxes: image bboxes of the page.
    :param table_bboxes: table bboxes of the page.
    :param header_bboxs: header bboxes ``[x0, y0, x1, y1]`` (may be empty).
    :param footer_bboxs: footer bboxes ``[x0, y0, x1, y1]`` (may be empty).
    :param page_no_bboxs: page-number bboxes (may be empty).
    :param page_w: page width.
    :param page_h: page height.
    :return: ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``;
        ``text_block_remain`` is the ``text_raw_blocks`` list object itself.
    """
    header = _union_bbox(header_bboxs)
    footer = _union_bbox(footer_bboxs)

    # Without a detected header/footer the bands collapse to the page edges.
    header_y0 = header[3] if header else 0
    footer_y0 = footer[1] if footer else page_h
    if page_no_bboxs:
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]

        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    # The header/footer boundaries are known now; start removing content.
    text_block_to_remove = []
    for blk in text_raw_blocks:
        # Collect deletions over ALL spans/lines of the block. Resetting the
        # accumulators inside the loops only ever dropped the last element
        # and failed on a line without spans.
        _strip_header_footer_spans(blk, header, footer, header_y0)
        if not blk['lines']:
            blk['tag'] = 'in-foot-header-area'
            text_block_to_remove.append(blk)

    # A page number is often so small that it still overlaps the content area
    # a little and ends up in the body text, so page numbers are handled at
    # span granularity.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = "page-no"
                            # If this span is the only content of the block,
                            # drop the whole block as well.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                page_no_block_2_remove.append(block)
    elif text_raw_blocks:
        # No page-number boxes given: test whether the lowest block is a page
        # number. Rule: one line, one span, text is non-blank, contains a
        # digit and no ASCII letter.
        # NOTE: this sorts the caller's list in place (descending bbox[1]).
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                text = last_span['text']
                if text.strip() and not re.search(r'[a-zA-Z]', text) and re.search(r'[0-9]', text):
                    last_span['tag'] = "page-no"
                    page_no_block_2_remove.append(last_block)

    # Merge the page-number blocks into the removal list without duplicates
    # (a block can be hit by several page-number boxes).
    removed_ids = {id(blk) for blk in text_block_to_remove}
    for blk in page_no_block_2_remove:
        if id(blk) not in removed_ids:
            removed_ids.add(id(blk))
            text_block_to_remove.append(blk)

    # Slice assignment keeps the caller's list object.
    text_raw_blocks[:] = [blk for blk in text_raw_blocks if id(blk) not in removed_ids]
    text_block_remain = text_raw_blocks

    image_bbox_remain, image_bbox_to_remove = _split_by_content_overlap(image_bboxes, content_boundry)
    table_bbox_remain, table_bbox_to_remove = _split_by_content_overlap(table_bboxes, content_boundry)

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment