Commit 11e4e8cc authored by 赵小蒙

重构目录结构

parent 56213908
...@@ -3,12 +3,11 @@ import sys ...@@ -3,12 +3,11 @@ import sys
from pathlib import Path from pathlib import Path
import click import click
import json
from loguru import logger from loguru import logger
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file from libs.commons import join_path
from mkcontent import mk_mm_markdown, mk_nlp_markdown from dict2md.mkcontent import mk_mm_markdown
from pdf_parse_by_model import parse_pdf_by_model from pipeline.pdf_parse_by_model import parse_pdf_by_model
...@@ -17,7 +16,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p ...@@ -17,7 +16,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
pth = Path(s3_pdf_path) pth = Path(s3_pdf_path)
book_name = pth.name book_name = pth.name
# book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1]) # book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "..","tmp", "unittest") save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
save_path = join_path(save_tmp_path, "md") save_path = join_path(save_tmp_path, "md")
text_content_save_path = f"{save_path}/{book_name}/book.md" text_content_save_path = f"{save_path}/{book_name}/book.md"
# metadata_save_path = f"{save_path}/{book_name}/metadata.json" # metadata_save_path = f"{save_path}/{book_name}/metadata.json"
......
import os
import sys
from pathlib import Path from pathlib import Path
import click import click
import json import json
from loguru import logger
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
from mkcontent import mk_nlp_markdown
from pdf2md import main
from pdf_parse_by_model import parse_pdf_by_model
from demo.pdf2md import main
@click.command() @click.command()
......
import sys import sys
from typing import Tuple from typing import Tuple
import os import os
import click
import boto3, json import boto3, json
from botocore.config import Config from botocore.config import Config
from libs.commons import fitz from libs.commons import fitz
...@@ -24,16 +23,11 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va ...@@ -24,16 +23,11 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX # from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf2text_recogFigure import parse_images # 获取figures的bbox from pre_proc.detect_images import parse_images # 获取figures的bbox
from pdf2text_recogTable import parse_tables # 获取tables的bbox from pdf2text_recogTable import parse_tables # 获取tables的bbox
from pdf2text_recogEquation import parse_equations # 获取equations的bbox from pre_proc.detect_equation import parse_equations # 获取equations的bbox
from pdf2text_recogTitle import parse_titles # 获取titles的bbox
from pdf2text_recogHeader import parse_headers # 获取headers的bbox
from pdf2text_recogPageNo import parse_pageNos # 获取pageNos的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox # from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from pdf2text_recogFooter import parse_footers # 获取footers的bbox from pdf2text_recogPara import process_blocks_per_page
from pdf2text_evaluatePdfLayout import evaluate_pdf_layout # 评估页面的Layout是否是规整的。
from pdf2text_recogPara import process_blocks_per_page, postprocess_paras_pipeline
from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path
......
...@@ -15,14 +15,13 @@ from libs.drop_reason import DropReason ...@@ -15,14 +15,13 @@ from libs.drop_reason import DropReason
from libs.markdown_utils import escape_special_markdown_char from libs.markdown_utils import escape_special_markdown_char
from libs.safe_filename import sanitize_filename from libs.safe_filename import sanitize_filename
from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from pdf2text_recogFigure import parse_images from pre_proc.detect_images import parse_images
from pdf2text_recogFootnoteLine import remove_headder_footer_one_page # 获取figures的bbox
from pdf2text_recogTable import parse_tables # 获取tables的bbox from pdf2text_recogTable import parse_tables # 获取tables的bbox
from pdf2text_recogEquation import parse_equations # 获取equations的bbox from pre_proc.detect_equation import parse_equations # 获取equations的bbox
from pdf2text_recogHeader import parse_headers # 获取headers的bbox from pdf2text_recogHeader import parse_headers # 获取headers的bbox
from pdf2text_recogPageNo import parse_pageNos # 获取pageNos的bbox from pdf2text_recogPageNo import parse_pageNos # 获取pageNos的bbox
from pdf2text_recogFootnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox from pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
from pdf2text_recogFooter import parse_footers # 获取footers的bbox from pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
from pdf2text_recogPara import ( from pdf2text_recogPara import (
ParaProcessPipeline, ParaProcessPipeline,
...@@ -34,6 +33,7 @@ from pdf2text_recogPara import ( ...@@ -34,6 +33,7 @@ from pdf2text_recogPara import (
) )
from pre_proc.main_text_font import get_main_text_font from pre_proc.main_text_font import get_main_text_font
from pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock from pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
from pre_proc.remove_footer_header import remove_headder_footer_one_page
''' '''
from para.para_pipeline import ParaProcessPipeline from para.para_pipeline import ParaProcessPipeline
...@@ -48,17 +48,17 @@ from para.exceptions import ( ...@@ -48,17 +48,17 @@ from para.exceptions import (
from libs.commons import read_file, join_path from libs.commons import read_file, join_path
from libs.pdf_image_tools import save_images_by_bboxes from libs.pdf_image_tools import save_images_by_bboxes
from post_proc.footnote_remove import merge_footnote_blocks, remove_footnote_blocks from post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from pre_proc.citationmarker_remove import remove_citation_marker from pre_proc.citationmarker_remove import remove_citation_marker
from pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock from pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
from pre_proc.pdf_filter import pdf_filter from pre_proc.pdf_pre_filter import pdf_filter
from pre_proc.detect_footer_header import drop_footer_header from pre_proc.detect_footer_header_by_statistics import drop_footer_header
from pre_proc.construct_paras import construct_page_component from pre_proc.construct_paras import construct_page_component
from pre_proc.image_fix import combine_images, fix_image_vertical, fix_seperated_image, include_img_title from pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from post_proc.pdf_post_filter import pdf_post_filter from post_proc.pdf_post_filter import pdf_post_filter
from pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block from pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict from pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from pre_proc.table_fix import fix_table_text_block, fix_tables, include_table_title from pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
titleDetectionException_msg = TitleDetectionException().message titleDetectionException_msg = TitleDetectionException().message
...@@ -108,7 +108,7 @@ def parse_pdf_by_model( ...@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode=False, debug_mode=False,
): ):
pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile) pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "tmp", "unittest") save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
md_bookname_save_path = "" md_bookname_save_path = ""
book_name = sanitize_filename(book_name) book_name = sanitize_filename(book_name)
if debug_mode: if debug_mode:
......
from libs.boxbase import _is_in from libs.boxbase import _is_in, _is_in_or_part_overlap
from pdf2text_recogFootnoteLine import remove_footnote_text, remove_footnote_image
import collections # 统计库 import collections # 统计库
...@@ -113,3 +112,41 @@ def remove_footnote_blocks(page_info): ...@@ -113,3 +112,41 @@ def remove_footnote_blocks(page_info):
del page_info['merged_bboxes'] del page_info['merged_bboxes']
del page_info['footnote_bboxes_tmp'] del page_info['footnote_bboxes_tmp']
return page_info return page_info
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """
    Split the text blocks of one page into regular blocks and footnote blocks.

    A block counts as a footnote when its ``bbox`` is inside, or partially
    overlaps, any footnote bbox (as judged by ``_is_in_or_part_overlap``).
    Such blocks get ``block['tag'] = 'footnote'`` and are taken out of
    ``raw_text_block``.

    :param raw_text_block: list of text-block dicts of the current page; each
        dict must carry a ``'bbox'`` entry. The list is modified in place.
    :param footnote_bboxes: list of footnote bboxes of the current page.
    :return: tuple ``(raw_text_block, footnote_text_blocks)`` -- the same list
        object that was passed in (now without the footnote blocks) and the
        list of removed blocks, both in their original order.
    """
    footnote_text_blocks = []
    kept_blocks = []
    for block in raw_text_block:
        text_bbox = block['bbox']
        # TODO: doing this at line level would be more rigorous.
        # A generator lets any() stop at the first matching footnote bbox.
        if any(_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes):
            block['tag'] = 'footnote'
            footnote_text_blocks.append(block)
        else:
            kept_blocks.append(block)

    # Removing while iterating would skip elements, so the list is partitioned
    # first and rewritten in one step; slice assignment keeps the caller's
    # list object alive.
    raw_text_block[:] = kept_blocks
    return raw_text_block, footnote_text_blocks
def remove_footnote_image(image_blocks, footnote_bboxes):
    """
    Split the image blocks of one page into regular images and footnote images.

    An image counts as a footnote image when its ``bbox`` lies inside any
    footnote bbox (as judged by ``_is_in``). Such images are taken out of
    ``image_blocks``.

    :param image_blocks: list of image-block dicts of the current page; each
        dict must carry a ``'bbox'`` entry. The list is modified in place.
    :param footnote_bboxes: list of footnote bboxes of the current page.
    :return: tuple ``(image_blocks, footnote_imgs_blocks)`` -- the same list
        object that was passed in (now without the footnote images) and the
        list of removed image blocks, both in their original order.
    """
    footnote_imgs_blocks = []
    kept_blocks = []
    for image_block in image_blocks:
        image_bbox = image_block['bbox']
        # A generator lets any() stop at the first enclosing footnote bbox.
        if any(_is_in(image_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes):
            footnote_imgs_blocks.append(image_block)
        else:
            kept_blocks.append(image_block)

    # Rewrite the caller's list in one step instead of calling remove() per
    # element (which is quadratic and matches by value, not identity).
    image_blocks[:] = kept_blocks
    return image_blocks, footnote_imgs_blocks
\ No newline at end of file
import re
from libs.boxbase import _is_in_or_part_overlap
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """
    Remove headers, footers and page numbers from one page.

    Deletion works at span/line level: spans lying in the header or footer
    area are dropped from their line, lines left without spans are dropped
    from their block, and a block left without lines is tagged
    ``'in-foot-header-area'`` and moved to the removal list.

    :param text_raw_blocks: list of text-block dicts with ``'bbox'`` and
        ``'lines'``; every line has ``'spans'``, every span has ``'bbox'``
        (and ``'text'`` for the page-number heuristic). The list and the
        nested ``'lines'`` / ``'spans'`` lists are modified in place.
    :param image_bboxes: list of image bboxes.
    :param table_bboxes: list of table bboxes.
    :param header_bboxs: list of ``(x0, y0, x1, y1)`` header bboxes; may be empty.
    :param footer_bboxs: list of ``(x0, y0, x1, y1)`` footer bboxes; may be empty.
    :param page_no_bboxs: list of ``(x0, y0, x1, y1)`` page-number bboxes; may be empty.
    :param page_w: page width.
    :param page_h: page height.
    :return: tuple ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
        ``text_block_remain`` is the same list object as ``text_raw_blocks``.
    """
    # Only the lower edge of the merged header boxes and the upper edge of the
    # merged footer boxes are needed to delimit the content area.
    header_y0 = max(b[3] for b in header_bboxs) if header_bboxs else 0
    footer_y0 = min(b[1] for b in footer_bboxs) if footer_bboxs else page_h

    if page_no_bboxs:
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
        top_max_y0 = max(b[1] for b in top_part) if top_part else 0
        btn_min_y1 = min(b[3] for b in btn_part) if btn_part else page_h
        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    def _in_header_footer(bbox):
        # The cheap coordinate test comes first; the overlap helper is only
        # consulted for spans that are not entirely above the header edge.
        return (bbox[3] < header_y0
                or _is_in_or_part_overlap(bbox, header)
                or _is_in_or_part_overlap(bbox, footer))

    text_block_to_remove = []
    marked_ids = set()  # identity of blocks already queued for removal

    def _mark_for_removal(block):
        # Queue each block at most once, even if several rules select it.
        if id(block) not in marked_ids:
            marked_ids.add(id(block))
            text_block_to_remove.append(block)

    # The boundaries are known; now delete. First check every text block.
    for blk in text_raw_blocks:
        for line in blk['lines']:
            # Filter in place (slice assignment) so every span of the line is
            # examined and the caller's list objects stay the same.
            line['spans'][:] = [span for span in line['spans'] if not _in_header_footer(span['bbox'])]
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
        if not blk['lines']:
            # Covers blocks that arrived without lines and blocks emptied above.
            blk['tag'] = 'in-foot-header-area'
            _mark_for_removal(blk)

    # A page number is often so small that it slightly overlaps the content
    # boundary and ends up in the body text, so page numbers are additionally
    # handled at span granularity.
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = "page-no"
                            # If this span is all the block contains, drop the whole block.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                _mark_for_removal(block)
    elif text_raw_blocks:
        # No page-number boxes were given: test whether the lowest block is a
        # page number. Rule: exactly one line with one span whose text is
        # non-blank, contains a digit and contains no ASCII letter.
        # NOTE: this sorts the caller's list (descending by top y) as the
        # original code did, so the order of the returned remaining blocks is
        # unchanged for existing callers.
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                text = last_span['text']
                if text.strip() and not re.search('[a-zA-Z]', text) and re.search('[0-9]', text):
                    last_span['tag'] = "page-no"
                    _mark_for_removal(last_block)

    # Drop the queued blocks by identity, keeping the caller's list object.
    text_raw_blocks[:] = [blk for blk in text_raw_blocks if id(blk) not in marked_ids]
    text_block_remain = text_raw_blocks

    # Images and tables are kept only if they touch the content area.
    image_bbox_remain, image_bbox_to_remove = [], []
    for bbox in image_bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            image_bbox_remain.append(bbox)
        else:
            image_bbox_to_remove.append(bbox)

    table_bbox_remain, table_bbox_to_remove = [], []
    for bbox in table_bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            table_bbox_remain.append(bbox)
        else:
            table_bbox_to_remove.append(bbox)

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment