Commit 23bacc60 authored by Shuimo's avatar Shuimo

add an option to freely output badcase.json

parents d1457937 4191fa96
......@@ -40,15 +40,20 @@ jobs:
pip install -r requirements.txt
fi
- name: benchmark
- name: config-net-reset
run: |
export http_proxy=""
export https_proxy=""
- name: get-benchmark-result
run: |
echo "start test"
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: [pdf-test]
runs-on: [pdf]
runs-on: pdf
steps:
- name: notify
run: |
......
......@@ -22,15 +22,15 @@ git clone https://github.com/magicpdf/Magic-PDF.git
2. Install the requirements
```sh
cd Magic-PDF
pip install -r requirements.txt
```
3. Run the main script
3. Run the command line
```sh
use demo/text_demo.py
or
use demo/ocr_demo.py
export PYTHONPATH=.
python magic_pdf/cli/magicpdf.py --help
```
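For example, to parse a local PDF (subcommand names follow the click commands defined in `magic_pdf/cli/magicpdf.py`; the paths here are placeholders):
```sh
python magic_pdf/cli/magicpdf.py pdf-command --pdf /path/to/some.pdf --method auto
```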
### Copyright Notice
......
......@@ -15,7 +15,7 @@ from loguru import logger
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.spark.base import get_data_source
from magic_pdf.spark.spark_api import get_data_source
def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
......@@ -67,9 +67,7 @@ def demo_classify_by_type(book_name=None, debug_mode=True):
img_num_list = pdf_meta["imgs_per_page"]
text_len_list = pdf_meta["text_len_per_page"]
text_layout_list = pdf_meta["text_layout_per_page"]
pdf_path = json_object.get("file_location")
is_text_pdf, results = classify(
pdf_path,
total_page,
page_width,
page_height,
......@@ -89,7 +87,7 @@ def demo_meta_scan(book_name=None, debug_mode=True):
s3_pdf_path = json_object.get("file_location")
s3_config = get_s3_config_dict(s3_pdf_path)
pdf_bytes = read_file(s3_pdf_path, s3_config)
res = pdf_meta_scan(s3_pdf_path, pdf_bytes)
res = pdf_meta_scan(pdf_bytes)
logger.info(json.dumps(res, ensure_ascii=False))
write_json_to_local(res, book_name)
......
......@@ -21,28 +21,175 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json or python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
"""
import os
import sys
import json as json_parse
import click
from loguru import logger
from pathlib import Path
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
parse_s3path,
parse_s3_range_params,
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def prepare_env(pdf_file_name):
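# lays out {temp-output-dir}/magic-pdf/{pdf_file_name}/ with an images/ subdirectory; temp-output-dir comes from ~/magic-pdf.json and defaults to /tmp (see get_local_dir in config_reader)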
local_parent_dir = os.path.join(
get_local_dir(), "magic-pdf", pdf_file_name
)
local_image_dir = os.path.join(local_parent_dir, "images")
local_md_dir = local_parent_dir
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)
return local_image_dir, local_md_dir
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
if parse_method == "auto":
pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
elif parse_method == "txt":
pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
elif parse_method == "ocr":
pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
else:
print("unknow parse method")
os.exit(1)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown()
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write(
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
)
md_writer.write(
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
path=f"{pdf_file_name}.json",
mode=AbsReaderWriter.MODE_TXT,
)
# try:
# content_list = pipe.pipe_mk_uni_format()
# except Exception as e:
# logger.exception(e)
# md_writer.write(
# str(content_list), f"{part_file_name}.txt", AbsReaderWriter.MODE_TXT
# )
import click
@click.group()
def cli():
pass
@cli.command()
@click.option('--json', type=str, help='Input an S3 path')
def json_command(json):
# handle the json-related logic here
print(f'Processing JSON: {json}')
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def json_command(json, method):
if not json.startswith("s3://"):
print("usage: python magipdf.py --json s3://some_bucket/some_path")
os.exit(1)
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
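# assumption: the second range value is a byte length, so the next line folds it into an inclusive end offset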
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
)
jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
s3_file_path = jso["file_location"]
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
@cli.command()
@click.option('--pdf', type=click.Path(exists=True), required=True, help='Path to the PDF file')
@click.option('--model', type=click.Path(exists=True), help='Path to the model file')
def pdf_command(pdf, model):
@click.option(
"--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def pdf_command(pdf, model, method):
# handle the pdf- and model-related logic here
print(f'Processing PDF: {pdf}')
print(f'Loading model: {model}')
if model is None:
model = pdf.replace(".pdf", ".json")
if not os.path.exists(model):
print(f"make sure json file existed and place under {os.dirname(pdf)}")
os.exit(1)
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
pdf_data = read_fn(pdf)
jso = json_parse.loads(read_fn(model).decode("utf-8"))
pdf_file_name = Path(pdf).stem
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso,
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
if __name__ == '__main__':
if __name__ == "__main__":
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
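python magic_pdf/cli/magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --method auto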
"""
cli()
......@@ -2,6 +2,7 @@ import math
from loguru import logger
from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
TYPE_INLINE_EQUATION = ContentType.InlineEquation
......@@ -227,12 +228,12 @@ def __insert_before_para(text, type, element, content_list):
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
def mk_universal_format(para_dict: dict):
def mk_universal_format(pdf_info_list: list, img_buket_path):
"""
Build the unified format. https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
content_lst = []
for _, page_info in para_dict.items():
for page_info in pdf_info_list:
page_lst = []  # list of paragraphs within one page
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
......@@ -249,7 +250,7 @@ def mk_universal_format(para_dict: dict):
for img in all_page_images:
content_node = {
"type": "image",
"img_path": img['image_path'],
"img_path": join_path(img_buket_path, img['image_path']),
"img_alt":"",
"img_title":"",
"img_caption":""
......@@ -258,7 +259,7 @@ def mk_universal_format(para_dict: dict):
for table in all_page_tables:
content_node = {
"type": "table",
"img_path": table['image_path'],
"img_path": join_path(img_buket_path, table['image_path']),
"table_latex": table.get("text"),
"table_title": "",
"table_caption": "",
......
This diff is collapsed.
......@@ -15,6 +15,7 @@ from collections import Counter
import click
import numpy as np
from loguru import logger
from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
......@@ -298,7 +299,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
return narrow_strip_pages_ratio < 0.5
def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
"""
Image and page dimensions here are in pts
:param total_page:
......@@ -323,7 +324,7 @@ def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: li
elif not any(results.values()):
return False, results
else:
print(f"WARNING: {pdf_path} is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return False, results
......@@ -350,7 +351,7 @@ def main(json_file):
is_needs_password = o['is_needs_password']
if is_encrypted or total_page == 0 or is_needs_password:  # encrypted pdfs, password-protected pdfs, and zero-page pdfs are all skipped
continue
tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, img_num_list, text_layout_list)
o['is_text_pdf'] = tag
print(json.dumps(o, ensure_ascii=False))
except Exception as e:
......
......@@ -287,7 +287,7 @@ def get_language(doc: fitz.Document):
return language
def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
def pdf_meta_scan(pdf_bytes: bytes):
"""
:param pdf_bytes: the pdf file's binary data
......@@ -298,8 +298,8 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
is_encrypted = doc.is_encrypted
total_page = len(doc)
if total_page == 0:
logger.warning(f"drop this pdf: {s3_pdf_path}, drop_reason: {DropReason.EMPTY_PDF}")
result = {"need_drop": True, "drop_reason": DropReason.EMPTY_PDF}
logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
return result
else:
page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
......@@ -322,7 +322,6 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
# finally, output one json record
res = {
"pdf_path": s3_pdf_path,
"is_needs_password": is_needs_password,
"is_encrypted": is_encrypted,
"total_page": total_page,
......@@ -350,7 +349,7 @@ def main(s3_pdf_path: str, s3_profile: str):
"""
try:
file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(s3_pdf_path, file_content)
pdf_meta_scan(file_content)
except Exception as e:
print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
logger.exception(e)
......
from enum import Enum
class ModelBlockTypeEnum(Enum):
TITLE = 0
PLAIN_TEXT = 1
ABANDON = 2
ISOLATE_FORMULA = 8
EMBEDDING = 13
ISOLATED = 14
\ No newline at end of file
from loguru import logger
import math
def _is_in_or_part_overlap(box1, box2) -> bool:
"""
......@@ -332,3 +332,42 @@ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
return right_boxes[0]
else:
return None
def bbox_relative_pos(bbox1, bbox2):
x1, y1, x1b, y1b = bbox1
x2, y2, x2b, y2b = bbox2
left = x2b < x1
right = x1b < x2
bottom = y2b < y1
top = y1b < y2
return left, right, bottom, top
def bbox_distance(bbox1, bbox2):
def dist(point1, point2):
return math.sqrt((point1[0]-point2[0])**2 + (point1[1]-point2[1])**2)
x1, y1, x1b, y1b = bbox1
x2, y2, x2b, y2b = bbox2
left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
if top and left:
return dist((x1, y1b), (x2b, y2))
elif left and bottom:
return dist((x1, y1), (x2b, y2b))
elif bottom and right:
return dist((x1b, y1), (x2, y2b))
elif right and top:
return dist((x1b, y1b), (x2, y2))
elif left:
return x1 - x2b
elif right:
return x2 - x1b
elif bottom:
return y1 - y2b
elif top:
return y2 - y1b
else: # rectangles intersect
return 0
\ No newline at end of file
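A quick worked sanity check for the two helpers above (the boxes are invented for illustration):
```python
# boxes are (x0, y0, x1, y1); b sits strictly to the right of a
a = (0, 0, 10, 10)
b = (20, 0, 30, 10)
# bbox_relative_pos(a, b) -> (False, True, False, False), i.e. only "right"
# so bbox_distance returns the horizontal gap x2 - x1b
assert bbox_distance(a, b) == 10
# intersecting rectangles have distance 0
assert bbox_distance((0, 0, 10, 10), (5, 5, 15, 15)) == 0
```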
......@@ -2,6 +2,7 @@
Return the matching s3 AK, SK, endpoint triple for a given bucket name
"""
import json
import os
......@@ -10,20 +11,24 @@ from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key
def get_s3_config(bucket_name: str):
"""
Read from ~/magic-pdf.json
"""
def read_config():
home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, "magic-pdf.json")
if not os.path.exists(config_file):
raise Exception("magic-pdf.json not found")
raise Exception(f"{config_file} not found")
with open(config_file, "r") as f:
config = json.load(f)
return config
def get_s3_config(bucket_name: str):
"""
Read from ~/magic-pdf.json
"""
config = read_config()
bucket_info = config.get("bucket_info")
if bucket_name not in bucket_info:
......@@ -49,5 +54,10 @@ def get_bucket_name(path):
return bucket
if __name__ == '__main__':
def get_local_dir():
config = read_config()
return config.get("temp-output-dir", "/tmp")
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
items_list = []
for _, item in input_dict.items():
items_list.append(item)
return items_list
def get_scale_ratio(ocr_page_info, page):
def get_scale_ratio(model_page_info, page):
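# at dpi=72, one rendered pixel equals one PDF point, so pix.w and pix.h give the page size in points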
pix = page.get_pixmap(dpi=72)
pymu_width = int(pix.w)
pymu_height = int(pix.h)
width_from_json = ocr_page_info['page_info']['width']
height_from_json = ocr_page_info['page_info']['height']
width_from_json = model_page_info['page_info']['width']
height_from_json = model_page_info['page_info']['height']
horizontal_scale_ratio = width_from_json / pymu_width
vertical_scale_ratio = height_from_json / pymu_height
return horizontal_scale_ratio, vertical_scale_ratio
from collections import Counter
from magic_pdf.libs.language import detect_lang
def get_language_from_model(model_list: list):
language_lst = []
for ocr_page_info in model_list:
page_text = ""
layout_dets = ocr_page_info["layout_dets"]
for layout_det in layout_dets:
category_id = layout_det["category_id"]
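# assumption: category_id 15 denotes OCR-recognized text lines in the model output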
allow_category_id_list = [15]
if category_id in allow_category_id_list:
page_text += layout_det["text"]
page_language = detect_lang(page_text)
language_lst.append(page_language)
# count occurrences of each language in language_lst
count_dict = Counter(language_lst)
# take the language that appears most often
language = max(count_dict, key=count_dict.get)
return language
......@@ -8,7 +8,7 @@ class DropReason:
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs"  # special SVG pages are too computationally expensive, so drop
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages"  # computational load exceeds capacity under the current method
MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result"  # layout analysis failed
Exception = "exception"  # an exception occurred during parsing
Exception = "_exception"  # an exception occurred during parsing
ENCRYPTED = "encrypted"  # the PDF is encrypted
EMPTY_PDF = "total_page=0"  # the PDF has zero pages
NOT_IS_TEXT_PDF = "not_is_text_pdf"  # not a text-based PDF, cannot be parsed directly
......
......@@ -16,3 +16,4 @@ class DropTag:
FOOTNOTE = "footnote"
NOT_IN_LAYOUT = "not_in_layout"
SPAN_OVERLAP = "span_overlap"
BLOCK_OVERLAP = "block_overlap"
def float_gt(a, b):
if 0.0001 >= abs(a - b):
return False
return a > b
def float_equal(a, b):
if 0.0001 >= abs(a - b):
return True
return False
\ No newline at end of file
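A small sanity check of the 1e-4 tolerance used above (illustrative only):
```python
assert float_equal(1.00005, 1.0)      # diff 5e-5 is within the 1e-4 tolerance
assert not float_gt(1.00005, 1.0)     # within tolerance, so not treated as greater
assert float_gt(1.2, 1.0)
```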
......@@ -4,4 +4,17 @@ class ContentType:
Text = "text"
InlineEquation = "inline_equation"
InterlineEquation = "interline_equation"
class BlockType:
Image = "image"
ImageBody = "image_body"
ImageCaption = "image_caption"
Table = "table"
TableBody = "table_body"
TableCaption = "table_caption"
TableFootnote = "table_footnote"
Text = "text"
Title = "title"
InterlineEquation = "interline_equation"
Footnote = "footnote"
from s3pathlib import S3Path
def remove_non_official_s3_args(s3path):
"""
example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
"""
arr = s3path.split("?")
return arr[0]
def parse_s3path(s3path: str):
p = S3Path(remove_non_official_s3_args(s3path))
return p.bucket, p.key
def parse_s3_range_params(s3path: str):
"""
example: s3://abc/xxxx.json?bytes=0,81350 ==> [0, 81350]
"""
arr = s3path.split("?bytes=")
if len(arr) == 1:
return None
return arr[1].split(",")
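A short usage sketch of these helpers, reusing the values from the docstrings:
```python
s3path = "s3://abc/xxxx.json?bytes=0,81350"
assert remove_non_official_s3_args(s3path) == "s3://abc/xxxx.json"
assert parse_s3_range_params(s3path) == ["0", "81350"]
assert parse_s3_range_params("s3://abc/xxxx.json") is None
bucket, key = parse_s3path(s3path)  # ("abc", "xxxx.json")
```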
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import fitz
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
"""
Crop a jpg image from page page_num according to bbox and return the image path.
save_path: must support both s3 and local paths; images are stored under save_path with filename {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg, with the bbox values truncated to integers.
......@@ -28,49 +28,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
return img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
equation_inline_bboxes: list,
equation_interline_bboxes: list, imageWriter) -> dict:
"""
Return a dict keyed by bbox, whose values are the image paths
"""
image_info = []
image_backup_info = []
table_info = []
inline_eq_info = []
interline_eq_info = []
# image save paths are composed as: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
def return_path(type):
return join_path(pdf_bytes_md5, type)
for bbox in image_bboxes:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"image_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_info.append({"bbox": bbox, "image_path": image_path})
for bbox in images_overlap_backup:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"images_overlap_backup: 错误的box, {bbox}")
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_backup_info.append({"bbox": bbox, "image_path": image_path})
for bbox in table_bboxes:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"table_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
table_info.append({"bbox": bbox, "image_path": image_path})
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
This diff is collapsed.
......@@ -299,9 +299,9 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_list_info[0] = True
if end==total_lines-1:
layout_list_info[1] = True
else:
else:  # plain text
for i, line in enumerate(lines[start:end+1]):
# If line i has a next line, use the next line's position as well to decide whether to split. If there is no line after i, only the line-ending features need checking.
# If line i has a next line, use the next line's position as well to decide whether to split. If there is no line after i, only line i's own line-ending features need checking.
cur_line_type = line['spans'][-1]['type']
next_line = lines[i+1] if i<total_lines-1 else None
......@@ -547,7 +547,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
if "Table" in first_line_text or "Figure" in first_line_text:
pass
if debug_mode:
logger.info(line_hi.std())
logger.debug(line_hi.std())
if line_hi.std()<2:
"""行高度相同,那么判断是否居中"""
......@@ -560,7 +560,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
merge_para = [l[0] for l in layout_para[start:end+1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
if debug_mode:
logger.info(para_text)
logger.debug(para_text)
layout_para[start:end+1] = [merge_para]
index_offset -= end-start
......@@ -587,6 +587,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
3. Split paragraphs using the line-ending features above.
4. Figures and tables currently occupy a line of their own and are not considered for splitting.
"""
if page_num==343:
pass
lines_group = __group_line_by_layout(blocks, layout_bboxes, lang)  # split within blocks
layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang)  # split within layout boxes
layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang)  # connect list paragraphs across layouts
......
This diff is collapsed.
......@@ -5,6 +5,7 @@ from magic_pdf.libs.commons import (
get_delta_time,
get_docx_model_output,
)
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.hash_utils import compute_md5
......@@ -15,7 +16,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
from magic_pdf.pre_proc.detect_header import parse_headers
from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import (
merge_spans_to_line_by_layout, merge_lines_to_block,
......@@ -26,7 +27,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def parse_pdf_by_ocr(
pdf_bytes,
pdf_model_output,
......@@ -147,7 +147,7 @@ def parse_pdf_by_ocr(
spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
'''Crop images and tables'''
spans = cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)
spans = ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)
'''Adjust inline equations: align their height with the text on the same line (prefer the left side, then the right)'''
displayed_list = []
......@@ -201,13 +201,20 @@ def parse_pdf_by_ocr(
'''Build pdf_info_dict'''
page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block,
dropped_equation_block,
need_remove_spans_bboxes_dict)
images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block,
dropped_equation_block,
need_remove_spans_bboxes_dict)
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
para_split(pdf_info_dict, debug_mode=debug_mode)
return pdf_info_dict
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
}
return new_pdf_info_dict
import time
from loguru import logger
from magic_pdf.layout.layout_sort import get_bboxes_layout
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
# from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_v2 import para_split
def parse_pdf_by_ocr(pdf_bytes,
model_list,
imageWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes)
'''Initialize an empty pdf_info_dict'''
pdf_info_dict = {}
'''Initialize magic_model with model_list and the docs object'''
magic_model = MagicModel(model_list, pdf_docs)
'''Parse the pdf over the requested page range'''
end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
'''Initialize the start time'''
start_time = time.time()
for page_id in range(start_page_id, end_page_id + 1):
'''In debug mode, log how long each page took to parse'''
if debug_mode:
time_now = time.time()
logger.info(
f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
)
start_time = time_now
'''Fetch the block info used later from the magic_model object'''
img_blocks = magic_model.get_imgs(page_id)
table_blocks = magic_model.get_tables(page_id)
discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id)
inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
page_w, page_h = magic_model.get_page_size(page_id)
'''Gather the bboxes of all blocks together'''
all_bboxes = ocr_prepare_bboxes_for_layout_split(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equation_blocks, page_w, page_h)
'''Compute the layout from the block info'''
page_boundry = [0, 0, page_w, page_h]
layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)
'''Sort all blocks that should be kept on this page by layout order'''
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
'''Fetch all span resources that need stitching in'''
spans = magic_model.get_all_spans(page_id)
'''Remove the smaller of any overlapping spans'''
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
'''Crop images and tables'''
spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
'''Fill the spans into the sorted blocks'''
block_with_spans = fill_spans_in_blocks(sorted_blocks, spans)
'''Fix up the blocks'''
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
'''Fetch the lists that QA needs exposed externally'''
images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)
'''Build pdf_info_dict'''
page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks)
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
try:
para_split(pdf_info_dict, debug_mode=debug_mode)
except Exception as e:
logger.exception(e)
raise e
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
}
return new_pdf_info_dict
......@@ -11,11 +11,13 @@ from magic_pdf.layout.bbox_sort import (
prepare_bboxes_for_layout_split,
)
from magic_pdf.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.markdown_utils import escape_special_markdown_char
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes
from magic_pdf.pre_proc.detect_images import parse_images
from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox
......@@ -47,8 +49,6 @@ from para.exceptions import (
)
'''
from magic_pdf.libs.commons import read_file, join_path
from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
from magic_pdf.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
......@@ -107,7 +107,7 @@ def parse_pdf_by_txt(
# remove the dependency on junkimg to simplify the logic
if len(page_imgs) > 1500:  # if the current page has more than 1500 images, skip it outright
logger.warning(f"page_id: {page_id}, img_counts: {len(page_imgs)}, drop this pdf")
result = {"need_drop": True, "drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}
result = {"_need_drop": True, "_drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}
if not debug_mode:
return result
......@@ -193,7 +193,7 @@ def parse_pdf_by_txt(
"""
# crop figures, tables, and equations, save them to storage, and return the image paths as the content
image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = save_images_by_bboxes(
image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = txt_save_images_by_bboxes(
page_id,
page,
pdf_bytes_md5,
......@@ -236,7 +236,7 @@ def parse_pdf_by_txt(
if is_text_block_horz_overlap:
# debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in remain_text_blocks], [], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 0)
logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
result = {"need_drop": True, "drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP}
result = {"_need_drop": True, "_drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP}
if not debug_mode:
return result
......@@ -255,14 +255,14 @@ def parse_pdf_by_txt(
if len(remain_text_blocks)>0 and len(all_bboxes)>0 and len(layout_bboxes)==0:
logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
result = {"need_drop": True, "drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}
result = {"_need_drop": True, "_drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}
if not debug_mode:
return result
"""以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.COMPLICATED_LAYOUT}")
result = {"need_drop": True, "drop_reason": DropReason.COMPLICATED_LAYOUT}
result = {"_need_drop": True, "_drop_reason": DropReason.COMPLICATED_LAYOUT}
if not debug_mode:
return result
......@@ -270,8 +270,8 @@ def parse_pdf_by_txt(
if layout_column_width > 2:  # drop pdfs whose layout has more than 2 columns
logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
result = {
"need_drop": True,
"drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS,
"_need_drop": True,
"_drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS,
"extra_info": {"column_cnt": layout_column_width},
}
if not debug_mode:
......@@ -377,27 +377,34 @@ def parse_pdf_by_txt(
logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {error_info}")
if error_info == denseSingleLineBlockException_msg:
logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}")
result = {"need_drop": True, "drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK}
result = {"_need_drop": True, "_drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK}
return result
if error_info == titleDetectionException_msg:
logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TITLE_DETECTION_FAILED}")
result = {"need_drop": True, "drop_reason": DropReason.TITLE_DETECTION_FAILED}
result = {"_need_drop": True, "_drop_reason": DropReason.TITLE_DETECTION_FAILED}
return result
elif error_info == titleLevelException_msg:
logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TITLE_LEVEL_FAILED}")
result = {"need_drop": True, "drop_reason": DropReason.TITLE_LEVEL_FAILED}
result = {"_need_drop": True, "_drop_reason": DropReason.TITLE_LEVEL_FAILED}
return result
elif error_info == paraSplitException_msg:
logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.PARA_SPLIT_FAILED}")
result = {"need_drop": True, "drop_reason": DropReason.PARA_SPLIT_FAILED}
result = {"_need_drop": True, "_drop_reason": DropReason.PARA_SPLIT_FAILED}
return result
elif error_info == paraMergeException_msg:
logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.PARA_MERGE_FAILED}")
result = {"need_drop": True, "drop_reason": DropReason.PARA_MERGE_FAILED}
result = {"_need_drop": True, "_drop_reason": DropReason.PARA_MERGE_FAILED}
return result
pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline(pdf_info_dict)
if error_info is not None:
return _deal_with_text_exception(error_info)
return pdf_info_dict
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
}
return new_pdf_info_dict
import time
from loguru import logger
from magic_pdf.layout.layout_sort import get_bboxes_layout
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import (
sort_blocks_by_layout,
fill_spans_in_blocks,
fix_block_spans,
)
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.pre_proc.ocr_span_list_modify import (
remove_overlaps_min_spans,
get_qa_need_list_v2,
)
from magic_pdf.pre_proc.equations_replace import (
combine_chars_to_pymudict,
remove_chars_in_text_blocks,
replace_equations_in_textblock,
)
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.libs.math import float_equal
from magic_pdf.para.para_split_v2 import para_split
def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
"blocks"
]
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
text_blocks = replace_equations_in_textblock(
text_blocks, inline_equations, interline_equations
)
text_blocks = remove_citation_marker(text_blocks)
text_blocks = remove_chars_in_text_blocks(text_blocks)
spans = []
for v in text_blocks:
for line in v["lines"]:
for span in line["spans"]:
bbox = span["bbox"]
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue
spans.append(
{
"bbox": list(span["bbox"]),
"content": span["text"],
"type": ContentType.Text,
}
)
return spans
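# replace_text_span keeps the non-text spans (images, tables, equations) from the OCR output and swaps in the text spans extracted with PyMuPDF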
def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans
def parse_pdf_by_txt(
pdf_bytes,
model_list,
imageWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes)
"""初始化空的pdf_info_dict"""
pdf_info_dict = {}
"""用model_list和docs对象初始化magic_model"""
magic_model = MagicModel(model_list, pdf_docs)
"""根据输入的起始范围解析pdf"""
end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
"""初始化启动时间"""
start_time = time.time()
for page_id in range(start_page_id, end_page_id + 1):
"""debug时输出每页解析的耗时"""
if debug_mode:
time_now = time.time()
logger.info(
f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
)
start_time = time_now
"""从magic_model对象中获取后面会用到的区块信息"""
img_blocks = magic_model.get_imgs(page_id)
table_blocks = magic_model.get_tables(page_id)
discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id)
inline_equations, interline_equations, interline_equation_blocks = (
magic_model.get_equations(page_id)
)
page_w, page_h = magic_model.get_page_size(page_id)
"""将所有区块的bbox整理到一起"""
all_bboxes = ocr_prepare_bboxes_for_layout_split(
img_blocks,
table_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equation_blocks,
page_w,
page_h,
)
"""根据区块信息计算layout"""
page_boundry = [0, 0, page_w, page_h]
layout_bboxes, layout_tree = get_bboxes_layout(
all_bboxes, page_boundry, page_id
)
"""根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
"""ocr 中文本类的 span 用 pymu spans 替换!"""
ocr_spans = magic_model.get_all_spans(page_id)
pymu_spans = txt_spans_extract(
pdf_docs[page_id], inline_equations, interline_equations
)
spans = replace_text_span(pymu_spans, ocr_spans)
"""删除重叠spans中较小的那些"""
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
"""对image和table截图"""
spans = ocr_cut_image_and_table(
spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter
)
"""将span填入排好序的blocks中"""
block_with_spans = fill_spans_in_blocks(sorted_blocks, spans)
"""对block进行fix操作"""
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
"""获取QA需要外置的list"""
images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)
"""构造pdf_info_dict"""
page_info = ocr_construct_page_component_v2(
fix_blocks,
layout_bboxes,
page_id,
page_w,
page_h,
layout_tree,
images,
tables,
interline_equations,
discarded_blocks,
)
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
try:
para_split(pdf_info_dict, debug_mode=debug_mode)
except Exception as e:
logger.exception(e)
raise e
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
}
return new_pdf_info_dict
if __name__ == "__main__":
if 1:
import fitz
import json
with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
pdf_bytes = f.read()
pdf_docs = fitz.open("pdf", pdf_bytes)
with open("/opt/data/pdf/20240418/25536-00.json") as f:
model_list = json.loads(f.readline())
magic_model = MagicModel(model_list, pdf_docs)
for i in range(7):
print(magic_model.get_imgs(i))
for page_no, page in enumerate(pdf_docs):
inline_equations, interline_equations, interline_equation_blocks = (
magic_model.get_equations(page_no)
)
text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
char_level_text_blocks = page.get_text(
"rawdict", flags=fitz.TEXTFLAGS_TEXT
)["blocks"]
text_blocks = combine_chars_to_pymudict(
text_raw_blocks, char_level_text_blocks
)
text_blocks = replace_equations_in_textblock(
text_blocks, inline_equations, interline_equations
)
text_blocks = remove_citation_marker(text_blocks)
text_blocks = remove_chars_in_text_blocks(text_blocks)
......@@ -26,6 +26,7 @@ from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.markdown_utils import escape_special_markdown_char
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes
from magic_pdf.pre_proc.detect_images import parse_images
from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox
......@@ -62,7 +63,6 @@ from para.exceptions import (
"""
from magic_pdf.libs.commons import read_file, join_path
from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
from magic_pdf.post_proc.remove_footnote import (
merge_footnote_blocks,
remove_footnote_blocks,
......@@ -183,8 +183,8 @@ def parse_pdf_for_train(
f"page_id: {page_id}, img_counts: {img_counts}, drop this pdf: {book_name}, drop_reason: {DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}"
)
result = {
"need_drop": True,
"drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS,
"_need_drop": True,
"_drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS,
}
if not debug_mode:
return result
......@@ -323,7 +323,7 @@ def parse_pdf_for_train(
# 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = (
save_images_by_bboxes(
txt_save_images_by_bboxes(
book_name,
page_id,
page,
......@@ -396,8 +396,8 @@ def parse_pdf_for_train(
f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}"
)
result = {
"need_drop": True,
"drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP,
"_need_drop": True,
"_drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP,
}
if not debug_mode:
return result
......@@ -443,8 +443,8 @@ def parse_pdf_for_train(
f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
result = {
"need_drop": True,
"drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT,
"_need_drop": True,
"_drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT,
}
if not debug_mode:
return result
......@@ -456,7 +456,7 @@ def parse_pdf_for_train(
logger.warning(
f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
result = {"need_drop": True, "drop_reason": DropReason.COMPLICATED_LAYOUT}
result = {"_need_drop": True, "_drop_reason": DropReason.COMPLICATED_LAYOUT}
if not debug_mode:
return result
......@@ -466,8 +466,8 @@ def parse_pdf_for_train(
f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
result = {
"need_drop": True,
"drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS,
"_need_drop": True,
"_drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS,
"extra_info": {"column_cnt": layout_column_width},
}
if not debug_mode:
......@@ -616,8 +616,8 @@ def parse_pdf_for_train(
f"Drop this pdf: {book_name}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}"
)
result = {
"need_drop": True,
"drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK,
"_need_drop": True,
"_drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK,
}
return result
if error_info == titleDetectionException_msg:
......@@ -625,27 +625,27 @@ def parse_pdf_for_train(
f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_DETECTION_FAILED}"
)
result = {
"need_drop": True,
"drop_reason": DropReason.TITLE_DETECTION_FAILED,
"_need_drop": True,
"_drop_reason": DropReason.TITLE_DETECTION_FAILED,
}
return result
elif error_info == titleLevelException_msg:
logger.warning(
f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_LEVEL_FAILED}"
)
result = {"need_drop": True, "drop_reason": DropReason.TITLE_LEVEL_FAILED}
result = {"_need_drop": True, "_drop_reason": DropReason.TITLE_LEVEL_FAILED}
return result
elif error_info == paraSplitException_msg:
logger.warning(
f"Drop this pdf: {book_name}, reason: {DropReason.PARA_SPLIT_FAILED}"
)
result = {"need_drop": True, "drop_reason": DropReason.PARA_SPLIT_FAILED}
result = {"_need_drop": True, "_drop_reason": DropReason.PARA_SPLIT_FAILED}
return result
elif error_info == paraMergeException_msg:
logger.warning(
f"Drop this pdf: {book_name}, reason: {DropReason.PARA_MERGE_FAILED}"
)
result = {"need_drop": True, "drop_reason": DropReason.PARA_MERGE_FAILED}
result = {"_need_drop": True, "_drop_reason": DropReason.PARA_MERGE_FAILED}
return result
if debug_mode:
......
from abc import ABC, abstractmethod
from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
class AbsPipe(ABC):
"""
Abstract base class for txt and ocr processing
"""
PIP_OCR = "ocr"
PIP_TXT = "txt"
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False):
self.pdf_bytes = pdf_bytes
self.model_list = model_list
self.image_writer = image_writer
self.img_parent_path = img_parent_path
self.pdf_mid_data = None  # uncompressed
self.is_debug = is_debug
def get_compress_pdf_mid_data(self):
return JsonCompressor.compress_json(self.pdf_mid_data)
@abstractmethod
def pipe_classify(self):
"""
Stateful classification
"""
raise NotImplementedError
@abstractmethod
def pipe_parse(self):
"""
Stateful parsing
"""
raise NotImplementedError
@abstractmethod
def pipe_mk_uni_format(self):
"""
Stateful assembly of the unified format
"""
raise NotImplementedError
@abstractmethod
def pipe_mk_markdown(self):
"""
Stateful assembly of markdown
"""
raise NotImplementedError
@staticmethod
def classify(pdf_bytes: bytes) -> str:
"""
Decide from the pdf metadata whether this is a text pdf or an ocr pdf
"""
pdf_meta = pdf_meta_scan(pdf_bytes)
if pdf_meta.get("_need_drop", False): # 如果返回了需要丢弃的标志,则抛出异常
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
else:
is_encrypted = pdf_meta["is_encrypted"]
is_needs_password = pdf_meta["is_needs_password"]
if is_encrypted or is_needs_password:  # encrypted pdfs, password-protected pdfs, and zero-page pdfs are all skipped
raise Exception(f"pdf meta_scan need_drop, reason is {DropReason.ENCRYPTED}")
else:
is_text_pdf, results = classify(
pdf_meta["total_page"],
pdf_meta["page_width_pts"],
pdf_meta["page_height_pts"],
pdf_meta["image_info_per_page"],
pdf_meta["text_len_per_page"],
pdf_meta["imgs_per_page"],
pdf_meta["text_layout_per_page"],
)
if is_text_pdf:
return AbsPipe.PIP_TXT
else:
return AbsPipe.PIP_OCR
@staticmethod
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
"""
Generate the unified-format content_list according to the pdf type
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == AbsPipe.PIP_TXT:
content_list = mk_universal_format(pdf_info_list, img_buket_path)
elif parse_type == AbsPipe.PIP_OCR:
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
return content_list
@staticmethod
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
"""
Generate markdown according to the pdf type
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == AbsPipe.PIP_TXT:
# content_list = mk_universal_format(pdf_info_list, img_buket_path)
# md_content = mk_mm_markdown(content_list)
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
elif parse_type == AbsPipe.PIP_OCR:
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
return md_content
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
def pipe_classify(self):
pass
def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
return content_list
def pipe_mk_markdown(self):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
return md_content
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
def pipe_classify(self):
pass
def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
return content_list
def pipe_mk_markdown(self):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
return md_content
import json
from loguru import logger
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str,
is_debug: bool = False):
self.pdf_type = self.PIP_OCR
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
def pipe_classify(self):
self.pdf_type = UNIPipe.classify(self.pdf_bytes)
def pipe_parse(self):
if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug)
elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
return content_list
def pipe_mk_markdown(self):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
return markdown_content
if __name__ == '__main__':
# test
drw = DiskReaderWriter(r"D:/project/20231108code-clean")
pdf_file_path = r"linshixuqiu\19983-00.pdf"
model_file_path = r"linshixuqiu\19983-00.json"
pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
model_list = json.loads(model_json_txt)
write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
img_bucket_path = "imgs"
img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
pipe = UNIPipe(pdf_bytes, model_list, img_writer, img_bucket_path)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown()
try:
content_list = pipe.pipe_mk_uni_format()
except Exception as e:
logger.exception(e)
md_writer = DiskReaderWriter(write_path)
md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)
md_writer.write(json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), "19983-00.json",
AbsReaderWriter.MODE_TXT)
md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)
......@@ -32,8 +32,8 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
if (
"doc_layout_result" not in jso
):  # check whether the json contains model data; if not, skip this pdf
jso["need_drop"] = True
jso["drop_reason"] = DropReason.MISS_DOC_LAYOUT_RESULT
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.MISS_DOC_LAYOUT_RESULT
return jso
try:
data_source = get_data_source(jso)
......@@ -58,10 +58,10 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
start_time = time.time()  # record the start time
res = pdf_meta_scan(s3_pdf_path, file_content)
if res.get(
"need_drop", False
"_need_drop", False
):  # if the returned dict has need_drop, extract drop_reason and skip this parse
jso["need_drop"] = True
jso["drop_reason"] = res["drop_reason"]
jso["_need_drop"] = True
jso["_drop_reason"] = res["_drop_reason"]
else:  # normal return
jso["pdf_meta"] = res
jso["content"] = ""
......@@ -85,7 +85,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug is off, check for the need_drop flag
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
return jso
# main logic starts here
try:
......@@ -113,8 +113,8 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
if (
is_encrypted or is_needs_password
):  # encrypted pdfs, password-protected pdfs, and zero-page pdfs are all skipped
jso["need_drop"] = True
jso["drop_reason"] = DropReason.ENCRYPTED
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.ENCRYPTED
else:
start_time = time.time()  # record the start time
is_text_pdf, results = classify(
......@@ -139,8 +139,8 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
if (
text_language not in allow_language
):  # drop if the language is not among the allowed languages
jso["need_drop"] = True
jso["drop_reason"] = DropReason.NOT_ALLOW_LANGUAGE
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.NOT_ALLOW_LANGUAGE
return jso
else:
# don't drop for now
......@@ -148,8 +148,8 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
jso["_pdf_type"] = "OCR"
jso["pdf_meta"] = pdf_meta
jso["classify_time"] = classify_time
# jso["need_drop"] = True
# jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF
# jso["_need_drop"] = True
# jso["_drop_reason"] = DropReason.NOT_IS_TEXT_PDF
extra_info = {"classify_rules": []}
for condition, result in results.items():
if not result:
......@@ -162,7 +162,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
def drop_needdrop_pdf(jso: dict) -> dict:
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
logger.info(
f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop",
file=sys.stderr,
......@@ -176,7 +176,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else: # 如果debug没开,则检测是否有needdrop字段
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
......@@ -203,7 +203,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug is off, check for the need_drop flag
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
return jso
# main logic starts here
s3_pdf_path = jso.get("file_location")
......@@ -220,8 +220,8 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
max_svgs = max(svgs_per_page_list)
if max_svgs > 3000:
jso["need_drop"] = True
jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
else:
try:
save_path = s3_image_save_path
......@@ -244,10 +244,10 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
debug_mode=debug_mode,
)
if pdf_info_dict.get(
"need_drop", False
"_need_drop", False
):  # if the returned dict has need_drop, extract drop_reason and skip this parse
jso["need_drop"] = True
jso["drop_reason"] = pdf_info_dict["drop_reason"]
jso["_need_drop"] = True
jso["_drop_reason"] = pdf_info_dict["_drop_reason"]
else:  # normal return; compress and store pdf_info_dict
pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
jso["pdf_intermediate_dict"] = pdf_info_dict
......@@ -269,7 +269,7 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
if debug_mode:
pass
else:  # if debug is off, check for the need_drop flag
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
return jso
# main logic starts here
s3_pdf_path = jso.get("file_location")
......@@ -295,8 +295,8 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
max_svgs = max(svgs_per_page_list)
if max_svgs > 3000:
jso["need_drop"] = True
jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
# elif total_page > 1000:
# jso['need_drop'] = True
# jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
......@@ -323,10 +323,10 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
debug_mode=debug_mode,
)
if pdf_info_dict.get(
"need_drop", False
"_need_drop", False
):  # if the returned dict has need_drop, extract drop_reason and skip this parse
jso["need_drop"] = True
jso["drop_reason"] = pdf_info_dict["drop_reason"]
jso["_need_drop"] = True
jso["_drop_reason"] = pdf_info_dict["_drop_reason"]
else:  # normal return: compress pdf_info_dict and store it
jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
......
......@@ -17,7 +17,7 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
......@@ -45,7 +45,7 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, mode, debug_mode=
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
......@@ -78,7 +78,7 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, de
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
......@@ -108,7 +108,7 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
......@@ -137,7 +137,7 @@ def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) ->
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
......@@ -165,7 +165,7 @@ def ocr_pdf_intermediate_dict_to_standard_format_with_para(jso: dict, debug_mode
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
......@@ -221,7 +221,7 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
# dedicated to re-running dropped pdfs; once finished, the need_drop flag must be reset to false
def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
if not jso.get("need_drop", False):
if not jso.get("_need_drop", False):
return jso
else:
try:
......@@ -233,7 +233,7 @@ def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
)
jso["pdf_intermediate_dict"] = JsonCompressor.compress_json(pdf_info_dict)
jso["parse_time"] = parse_time
jso["need_drop"] = False
jso["_need_drop"] = False
except Exception as e:
jso = exception_handler(jso, e)
return jso
......@@ -244,7 +244,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
if debug_mode:
pass
else: # 如果debug没开,则检测是否有needdrop字段
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
return jso
try:
pdf_bytes = get_pdf_bytes(jso)
......
......@@ -18,7 +18,7 @@ def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop")
jso["dropped"] = True
......@@ -46,7 +46,7 @@ def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop")
jso["dropped"] = True
......
......@@ -62,6 +62,6 @@ def pdf_post_filter(page_info) -> tuple:
"""
bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
if bool_is_pseudo_single_column:
return False, {"need_drop": True, "drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}
return False, {"_need_drop": True, "_drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}
return True, None
\ No newline at end of file
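Both this `pdf_post_filter` hunk and the `pdf_filter` change further down share one return convention: `(True, None)` keeps the page, while `(False, {...})` carries the drop flags. A minimal sketch of how a caller might consume that pair, with a stand-in filter rather than the repo's implementation:

```python
# Stand-in for pdf_post_filter / pdf_filter: same (keep, info) contract,
# trivial predicate.
def fake_post_filter(page_info: dict):
    if page_info.get("pseudo_single_column"):
        return False, {"_need_drop": True, "_drop_reason": "PSEUDO_SINGLE_COLUMN"}
    return True, None


def apply_filter(jso: dict, page_info: dict) -> dict:
    keep, info = fake_post_filter(page_info)
    if not keep:
        jso.update(info)  # propagate _need_drop / _drop_reason onto the job object
    return jso


print(apply_filter({}, {"pseudo_single_column": True}))
# {'_need_drop': True, '_drop_reason': 'PSEUDO_SINGLE_COLUMN'}
```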
def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info, interline_eq_info, raw_pymu_blocks,
removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,layout_tree,
def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
interline_eq_info, raw_pymu_blocks,
removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
layout_tree,
page_w, page_h, footnote_bboxes_tmp):
"""
"""
return_dict = {}
return_dict['para_blocks'] = {}
return_dict['preproc_blocks'] = text_blocks_preproc
return_dict['images'] = image_info
......@@ -16,24 +17,24 @@ def construct_page_component(page_id, image_info, table_info, text_blocks_prepr
return_dict['layout_bboxes'] = layout_bboxes
return_dict['pymu_raw_blocks'] = raw_pymu_blocks
return_dict['global_statistic'] = {}
return_dict['droped_text_block'] = removed_text_blocks
return_dict['droped_image_block'] = removed_image_blocks
return_dict['droped_table_block'] = []
return_dict['image_backup'] = images_backup
return_dict['table_backup'] = []
return_dict['page_idx'] = page_id
return_dict['page_size'] = [page_w, page_h]
return_dict['_layout_tree'] = layout_tree  # kept to aid layout analysis
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
return return_dict
def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
need_remove_spans_bboxes_dict):
images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
need_remove_spans_bboxes_dict):
return_dict = {
'preproc_blocks': blocks,
'layout_bboxes': layout_bboxes,
......@@ -51,3 +52,19 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
'droped_bboxes': need_remove_spans_bboxes_dict,
}
return return_dict
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks):
return_dict = {
'preproc_blocks': blocks,
'layout_bboxes': layout_bboxes,
'page_idx': page_id,
'page_size': [page_w, page_h],
'_layout_tree': layout_tree,
'images': images,
'tables': tables,
'interline_equations': interline_equations,
'discarded_blocks': discarded_blocks,
}
return return_dict
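`ocr_construct_page_component_v2` collapses the four `dropped_*` fields of the v1 component into a single `discarded_blocks` list. A quick usage sketch with empty inputs, just to show the assembled keys:

```python
# Smoke test: only exercises dict assembly, so empty inputs suffice.
page = ocr_construct_page_component_v2(
    blocks=[], layout_bboxes=[], page_id=0, page_w=612, page_h=792,
    layout_tree=None, images=[], tables=[], interline_equations=[],
    discarded_blocks=[],
)
assert page["page_size"] == [612, 792]
assert "discarded_blocks" in page  # single field replacing the v1 dropped_* quartet
```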
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
def return_path(type):
return join_path(pdf_bytes_md5, type)
for span in spans:
span_type = span['type']
if span_type == ContentType.Image:
if not check_img_bbox(span['bbox']):
continue
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
imageWriter=imageWriter)
elif span_type == ContentType.Table:
if not check_img_bbox(span['bbox']):
continue
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
imageWriter=imageWriter)
return spans
def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
equation_inline_bboxes: list,
equation_interline_bboxes: list, imageWriter) -> dict:
"""
Returns the image/table/equation info lists; each entry pairs a bbox with its saved image path.
"""
image_info = []
image_backup_info = []
table_info = []
inline_eq_info = []
interline_eq_info = []
# Image save paths are composed as: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
def return_path(type):
return join_path(pdf_bytes_md5, type)
for bbox in image_bboxes:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_info.append({"bbox": bbox, "image_path": image_path})
for bbox in images_overlap_backup:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_backup_info.append({"bbox": bbox, "image_path": image_path})
for bbox in table_bboxes:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
table_info.append({"bbox": bbox, "image_path": image_path})
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
def check_img_bbox(bbox) -> bool:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"image_bboxes: 错误的box, {bbox}")
return False
return True
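`check_img_bbox` rejects degenerate boxes, i.e. any `[x0, y0, x1, y1]` with zero or negative width or height, and logs a warning for them:

```python
assert check_img_bbox([0, 0, 10, 10]) is True    # valid 10x10 box
assert check_img_bbox([10, 0, 10, 10]) is False  # zero width  (x0 >= x1)
assert check_img_bbox([0, 20, 10, 10]) is False  # negative height (y0 >= y1)
```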
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
def return_path(type):
return join_path(pdf_bytes_md5, type)
for span in spans:
span_type = span['type']
if span_type == ContentType.Image:
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), imageWriter=imageWriter)
elif span_type == ContentType.Table:
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), imageWriter=imageWriter)
return spans
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
calculate_iou
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
title_blocks, interline_equation_blocks, page_w, page_h):
all_bboxes = []
for image in img_blocks:
x0, y0, x1, y1 = image['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
for table in table_blocks:
x0, y0, x1, y1 = table['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None])
for text in text_blocks:
x0, y0, x1, y1 = text['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None])
for title in title_blocks:
x0, y0, x1, y1 = title['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None])
for interline_equation in interline_equation_blocks:
x0, y0, x1, y1 = interline_equation['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None])
'''Resolve nested-block issues'''
'''When a text box overlaps a title box, trust the text box'''
all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
'''When any box overlaps a discarded box, trust the discarded box'''
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
'''If large boxes still nest smaller ones after the steps above, delete the smaller box'''
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
'''From discarded_blocks keep only boxes wider than 1/3 of the page, taller than 10, and in the bottom half of the page (footnotes only)'''
for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox']
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
return all_bboxes
def fix_text_overlap_title_blocks(all_bboxes):
# first collect all text and title blocks
text_blocks = []
for block in all_bboxes:
if block[7] == BlockType.Text:
text_blocks.append(block)
title_blocks = []
for block in all_bboxes:
if block[7] == BlockType.Title:
title_blocks.append(block)
for text_block in text_blocks:
for title_block in title_blocks:
text_block_bbox = text_block[0], text_block[1], text_block[2], text_block[3]
title_block_bbox = title_block[0], title_block[1], title_block[2], title_block[3]
if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
all_bboxes.remove(title_block)
return all_bboxes
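`fix_text_overlap_title_blocks` trusts the text block whenever a title block covers essentially the same region (IoU above 0.8). The repo's `calculate_iou` lives in `magic_pdf.libs.boxbase`; the sketch below uses a local IoU, an assumption that may differ from the library version in edge cases, to show the threshold in action:

```python
# Local IoU sketch, not the repo's calculate_iou.
def iou(a, b):
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, x1 - x0) * max(0, y1 - y0)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union else 0.0


text_box = [100, 100, 300, 130]
title_box = [101, 101, 300, 131]   # almost the same region
print(iou(text_box, title_box) > 0.8)  # True -> the title block would be removed
```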
def remove_need_drop_blocks(all_bboxes, discarded_blocks):
for block in all_bboxes.copy():
for discarded_block in discarded_blocks:
block_bbox = block[0], block[1], block[2], block[3]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
all_bboxes.remove(block)
return all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
# remove the smaller of any pair of overlapping blocks
for block1 in all_bboxes.copy():
for block2 in all_bboxes.copy():
if block1 != block2:
block1_bbox = [block1[0], block1[1], block1[2], block1[3]]
block2_bbox = [block2[0], block2[1], block2[2], block2[3]]
overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
if overlap_box is not None:
bbox_to_remove = next(
(block for block in all_bboxes if [block[0], block[1], block[2], block[3]] == overlap_box),
None)
if bbox_to_remove is not None:
all_bboxes.remove(bbox_to_remove)
return all_bboxes
......@@ -3,7 +3,9 @@ from loguru import logger
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
# sort the spans in each line from left to right
......@@ -24,6 +26,7 @@ def line_sort_spans_by_left_to_right(lines):
})
return line_objects
def merge_spans_to_line(spans):
if len(spans) == 0:
return []
......@@ -37,7 +40,8 @@ def merge_spans_to_line(spans):
# if the current span is "interline_equation", or the current line already contains one
# the same applies to image and table spans
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
current_line):
# start a new line
lines.append(current_line)
current_line = [span]
......@@ -57,6 +61,7 @@ def merge_spans_to_line(spans):
return lines
def merge_spans_to_line_by_layout(spans, layout_bboxes):
lines = []
new_spans = []
......@@ -103,7 +108,205 @@ def merge_lines_to_block(lines):
return blocks
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
new_blocks = []
sort_blocks = []
for item in layout_bboxes:
layout_bbox = item['layout_bbox']
# iterate over the blocks and assign each one to its layout
layout_blocks = []
for block in all_bboxes:
# skip footnotes
if block[7] == BlockType.Footnote:
continue
block_bbox = [block[0], block[1], block[2], block[3]]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
layout_blocks.append(block)
# if layout_blocks is non-empty, append it to new_blocks
if len(layout_blocks) > 0:
new_blocks.append(layout_blocks)
# remove blocks already assigned to a layout from all_bboxes
for layout_block in layout_blocks:
all_bboxes.remove(layout_block)
# if new_blocks is non-empty, sort the blocks within each layout
if len(new_blocks) > 0:
for bboxes_in_layout_block in new_blocks:
bboxes_in_layout_block.sort(key=lambda x: x[1])  # within a layout, sort boxes top-to-bottom by y0
sort_blocks.extend(bboxes_in_layout_block)
# sort_blocks now contains every surviving block on the page, already in final order
return sort_blocks
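A usage sketch of the layout-ordered sort with made-up bboxes: blocks whose area falls more than 80% inside a layout column are grouped per column, then ordered top-to-bottom by `y0`. The string `"text"` stands in for `BlockType.Text`; since no footnotes are present, the `BlockType.Footnote` comparison never fires.

```python
all_bboxes = [
    [60, 500, 280, 560, None, None, None, "text", None, None, None, None],
    [60, 100, 280, 160, None, None, None, "text", None, None, None, None],
    [320, 100, 540, 160, None, None, None, "text", None, None, None, None],
]
layout_bboxes = [
    {"layout_bbox": [50, 90, 290, 700]},    # left column
    {"layout_bbox": [310, 90, 550, 700]},   # right column
]
ordered = sort_blocks_by_layout(all_bboxes, layout_bboxes)
print([b[:4] for b in ordered])
# [[60, 100, 280, 160], [60, 500, 280, 560], [320, 100, 540, 160]]
```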
def fill_spans_in_blocks(blocks, spans):
'''
Place the spans from allspans into blocks according to their positions.
'''
block_with_spans = []
for block in blocks:
block_type = block[7]
block_bbox = block[0:4]
block_dict = {
'type': block_type,
'bbox': block_bbox,
}
block_spans = []
for span in spans:
span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.7:
block_spans.append(span)
'''Inline-equation adjustment: match equation height to the text on the same line (prefer the left side, then the right)'''
displayed_list = []
text_inline_lines = []
modify_y_axis(block_spans, displayed_list, text_inline_lines)
'''Interline equations the model misclassified: convert their type to inline equation'''
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''Remove adhesion between bboxes'''
block_spans = remove_overlap_between_bbox(block_spans)
block_dict['spans'] = block_spans
block_with_spans.append(block_dict)
# remove the spans already placed into block_spans from spans
if len(block_spans) > 0:
for span in block_spans:
spans.remove(span)
return block_with_spans
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
'''
1. Because img_blocks and table_blocks contain captions and footnotes, blocks can nest:
   the caption and footnote text spans must be moved into the caption_block and
   footnote_block inside the corresponding img_block or table_block.
2. The spans field must then be deleted from each block.
'''
fix_blocks = []
for block in block_with_spans:
block_type = block['type']
if block_type == BlockType.Image:
block = fix_image_block(block, img_blocks)
elif block_type == BlockType.Table:
block = fix_table_block(block, table_blocks)
elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
block = fix_text_block(block)
else:
continue
fix_blocks.append(block)
return fix_blocks
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
block_spans = []
# if there is an img_caption, the img_block's text spans go into the img_caption_block
for span in spans:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.8:
block_spans.append(span)
block_lines = merge_spans_to_line(block_spans)
# sort the spans within each line
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block = {
'bbox': block_bbox,
'type': block_type,
'lines': sort_block_lines
}
return block, block_spans
def make_body_block(span: dict, block_bbox: list, block_type: str):
# build the body block
body_line = {
'bbox': block_bbox,
'spans': [span],
}
body_block = {
'bbox': block_bbox,
'type': block_type,
'lines': [body_line]
}
return body_block
def fix_image_block(block, img_blocks):
block['blocks'] = []
# walk img_blocks to find the one matching the current block
for img_block in img_blocks:
if img_block['bbox'] == block['bbox']:
# build the img_body_block
for span in block['spans']:
if span['type'] == ContentType.Image and span['bbox'] == img_block['img_body_bbox']:
# build the img_body_block
img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
block['blocks'].append(img_body_block)
# remove the span placed into the img_body_block from spans
block['spans'].remove(span)
break
# check whether the img_block has an img_caption
if img_block['img_caption_bbox'] is not None:
img_caption_block, img_caption_spans = merge_spans_to_block(
block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
)
block['blocks'].append(img_caption_block)
break
del block['spans']
return block
def fix_table_block(block, table_blocks):
block['blocks'] = []
# walk table_blocks to find the one matching the current block
for table_block in table_blocks:
if table_block['bbox'] == block['bbox']:
# build the table_body_block
for span in block['spans']:
if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
# build the table_body_block
table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
block['blocks'].append(table_body_block)
# remove the span placed into the table_body_block from spans
block['spans'].remove(span)
break
# check whether the table_block has a caption
if table_block['table_caption_bbox'] is not None:
table_caption_block, table_caption_spans = merge_spans_to_block(
block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
)
block['blocks'].append(table_caption_block)
# if table_caption_spans is non-empty
if len(table_caption_spans) > 0:
# some spans went into the caption_block and must be removed from block['spans']
for span in table_caption_spans:
block['spans'].remove(span)
# check whether the table_block has a table footnote
if table_block['table_footnote_bbox'] is not None:
table_footnote_block, table_footnote_spans = merge_spans_to_block(
block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
)
block['blocks'].append(table_footnote_block)
break
del block['spans']
return block
def fix_text_block(block):
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
del block['spans']
return block
......@@ -3,7 +3,7 @@ from loguru import logger
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
__is_overlaps_y_exceeds_threshold
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def remove_overlaps_min_spans(spans):
......@@ -50,7 +50,8 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
need_remove_spans.append(span)
break
# when drop_tag is DropTag.FOOTNOTE, check whether the span sits below any removed_bbox; if so, delete the span
elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1]+span['bbox'][3])/2 > removed_bbox[3] and removed_bbox[0] < (span['bbox'][0]+span['bbox'][2])/2 < removed_bbox[2]:
elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \
removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]:
need_remove_spans.append(span)
break
......@@ -162,9 +163,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
text_line = text_inline_lines[j]
y0, y1 = text_line[1]
if (
span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
span['bbox'], (0, y0, 0, y1)):
span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
) and __is_overlaps_y_exceeds_threshold(
span['bbox'], (0, y0, 0, y1)
):
# adjust the equation type
if span["type"] == ContentType.InterlineEquation:
# the last line is an interline equation
......@@ -181,8 +183,8 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
span["bbox"][1] = y0
span["bbox"][3] = y1
break
elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
(0, y0, 0, y1)):
elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
(0, y0, 0, y1)):
break
else:
j += 1
......@@ -211,3 +213,19 @@ def get_qa_need_list(blocks):
else:
continue
return images, tables, interline_equations, inline_equations
def get_qa_need_list_v2(blocks):
# collect images, tables and interline_equations from the blocks
images = []
tables = []
interline_equations = []
for block in blocks:
if block["type"] == BlockType.Image:
images.append(block)
elif block["type"] == BlockType.Table:
tables.append(block)
elif block["type"] == BlockType.InterlineEquation:
interline_equations.append(block)
return images, tables, interline_equations
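A usage sketch for `get_qa_need_list_v2`; the string tags assume `BlockType.Image` and `BlockType.Table` resolve to `"image"` and `"table"`, which may differ in the actual enum:

```python
blocks = [
    {"type": "image", "bbox": [0, 0, 100, 100]},
    {"type": "table", "bbox": [0, 120, 100, 220]},
    {"type": "text", "bbox": [0, 240, 100, 260]},
]
images, tables, interline_equations = get_qa_need_list_v2(blocks)
print(len(images), len(tables), len(interline_equations))  # 1 1 0
```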
......@@ -68,7 +68,7 @@ def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple
"""
if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
return False, {"need_drop": True, "drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
return True, None
\ No newline at end of file
......@@ -5,7 +5,7 @@ def _remove_overlap_between_bbox(spans):
res = []
for v in spans:
for i in range(len(res)):
if _is_in(res[i]["bbox"], v["bbox"]):
if _is_in(res[i]["bbox"], v["bbox"]) or _is_in(v["bbox"], res[i]["bbox"]):
continue
if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
ix0, iy0, ix1, iy1 = res[i]["bbox"]
......@@ -17,21 +17,21 @@ def _remove_overlap_between_bbox(spans):
if diff_y > diff_x:
if x1 >= ix1:
mid = (x0 + ix1) // 2
ix1 = min(mid, ix1)
x0 = max(mid + 1, x0)
ix1 = min(mid - 0.25, ix1)
x0 = max(mid + 0.25, x0)
else:
mid = (ix0 + x1) // 2
ix0 = max(mid + 1, ix0)
x1 = min(mid, x1)
ix0 = max(mid + 0.25, ix0)
x1 = min(mid - 0.25, x1)
else:
if y1 >= iy1:
mid = (y0 + iy1) // 2
y0 = max(mid + 1, y0)
iy1 = min(iy1, mid)
y0 = max(mid + 0.25, y0)
iy1 = min(iy1, mid - 0.25)
else:
mid = (iy0 + y1) // 2
y1 = min(y1, mid)
iy0 = max(mid + 1, iy0)
y1 = min(y1, mid - 0.25)
iy0 = max(mid + 0.25, iy0)
res[i]["bbox"] = [ix0, iy0, ix1, iy1]
v["bbox"] = [x0, y0, x1, y1]
......
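The hunk above replaces the old integer split, which forced a full one-unit gap between the two shrunken boxes, with a fractional 0.25 margin on each side of the midpoint, so both boxes keep more of their extent. A quick numeric check:

```python
ix0, ix1 = 0, 12     # kept span res[i]
x0, x1 = 10, 20      # incoming span v; x-overlap is [10, 12]
mid = (x0 + ix1) // 2  # 11

old = (min(mid, ix1), max(mid + 1, x0))            # (11, 12): 1.0-unit gap
new = (min(mid - 0.25, ix1), max(mid + 0.25, x0))  # (10.75, 11.25): 0.5-unit gap
print(old, new)
```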
import os
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
MODE_TXT = "text"
MODE_BIN = "binary"
class DiskReaderWriter(AbsReaderWriter):
def __init__(self, parent_path, encoding='utf-8'):
def __init__(self, parent_path, encoding="utf-8"):
self.path = parent_path
self.encoding = encoding
......@@ -20,10 +22,10 @@ class DiskReaderWriter(AbsReaderWriter):
logger.error(f"文件 {abspath} 不存在")
raise Exception(f"文件 {abspath} 不存在")
if mode == MODE_TXT:
with open(abspath, 'r', encoding = self.encoding) as f:
with open(abspath, "r", encoding=self.encoding) as f:
return f.read()
elif mode == MODE_BIN:
with open(abspath, 'rb') as f:
with open(abspath, "rb") as f:
return f.read()
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
......@@ -33,32 +35,32 @@ class DiskReaderWriter(AbsReaderWriter):
abspath = path
else:
abspath = os.path.join(self.path, path)
directory_path = os.path.dirname(abspath)
if not os.path.exists(directory_path):
os.makedirs(directory_path)
if mode == MODE_TXT:
with open(abspath, 'w', encoding=self.encoding) as f:
with open(abspath, "w", encoding=self.encoding) as f:
f.write(content)
logger.info(f"内容已成功写入 {abspath}")
elif mode == MODE_BIN:
with open(abspath, 'wb') as f:
with open(abspath, "wb") as f:
f.write(content)
logger.info(f"内容已成功写入 {abspath}")
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
return self.read(path)
# usage example
if __name__ == "__main__":
file_path = "io/example.txt"
file_path = "io/test/example.txt"
drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
# write content to a file
drw.write(b"Hello, World!", path="io/example.txt", mode="binary")
drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
# read content back from the file
content = drw.read(path=file_path)
if content:
logger.info(f"从 {file_path} 读取的内容: {content}")
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key
import boto3
from loguru import logger
......@@ -11,7 +11,7 @@ MODE_BIN = "binary"
class S3ReaderWriter(AbsReaderWriter):
def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str, parent_path: str):
def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
self.path = parent_path
......
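The new signature defaults `addressing_style` to `'auto'` and `parent_path` to `''`. An addressing style is typically handed to boto3 through `botocore`'s `Config`; a sketch of what `_get_client` plausibly does (the actual implementation may differ):

```python
import boto3
from botocore.client import Config


def make_s3_client(ak: str, sk: str, endpoint_url: str, addressing_style: str = "auto"):
    # 'auto' lets botocore choose path-style or virtual-hosted-style addressing.
    return boto3.client(
        "s3",
        aws_access_key_id=ak,
        aws_secret_access_key=sk,
        endpoint_url=endpoint_url,
        config=Config(s3={"addressing_style": addressing_style}),
    )
```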
from loguru import logger

from magic_pdf.libs.drop_reason import DropReason


def get_data_source(jso: dict):
    data_source = jso.get("data_source")
    if data_source is None:
        data_source = jso.get("file_source")
    return data_source


def get_data_type(jso: dict):
    data_type = jso.get("data_type")
    if data_type is None:
        data_type = jso.get("file_type")
    return data_type


def get_bookid(jso: dict):
    book_id = jso.get("bookid")
    if book_id is None:
        book_id = jso.get("original_file_id")
    return book_id


def exception_handler(jso: dict, e):
    logger.exception(e)
    jso["need_drop"] = True
    jso["drop_reason"] = DropReason.Exception
    jso["exception"] = f"ERROR: {e}"
    return jso
def exception_handler(jso: dict, e):
    logger.exception(e)
    jso["_need_drop"] = True
    jso["_drop_reason"] = DropReason.Exception
    jso["_exception"] = f"ERROR: {e}"
    return jso


def get_bookname(jso: dict):
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"
    return book_name
"""
User input:
    the model array, one element per page
    the pdf's s3 path
    the s3 location where page screenshots are saved
Then:
    1) From the s3 path, call the spark cluster api to obtain ak/sk/endpoint and build an s3PDFReader
    2) From the user-supplied s3 address, call the spark cluster api to obtain ak/sk/endpoint and build an s3ImageWriter
Everything else (building the s3 client, fetching ak/sk) is implemented over in code-clean. No reverse dependencies!!!
"""
from loguru import logger

from magic_pdf.io import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt


def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
    """
    Parse a text-type pdf.
    """
    pdf_info_dict = parse_pdf_by_txt(
        pdf_bytes,
        pdf_models,
        imageWriter,
        start_page_id=start_page,
        debug_mode=is_debug,
    )
    pdf_info_dict["parse_type"] = "txt"
    return pdf_info_dict


def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
    """
    Parse an ocr-type pdf.
    """
    pdf_info_dict = parse_pdf_by_ocr(
        pdf_bytes,
        pdf_models,
        imageWriter,
        start_page_id=start_page,
        debug_mode=is_debug,
    )
    pdf_info_dict["parse_type"] = "ocr"
    return pdf_info_dict


def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
    """
    A pdf mixing ocr and text content: parse all of it.
    """
    def parse_pdf(method):
        try:
            return method(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page,
                debug_mode=is_debug,
            )
        except Exception as e:
            logger.error(f"{method.__name__} error: {e}")
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
    if pdf_info_dict is None or pdf_info_dict.get("need_drop", False):
        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
        else:
            pdf_info_dict["parse_type"] = "ocr"
    else:
        pdf_info_dict["parse_type"] = "txt"

    return pdf_info_dict


def spark_json_extractor(jso:dict):
    pass
def spark_json_extractor(jso: dict) -> dict:
    """
    Extract the needed fields from the json and return them as a dict.
    """
    return {
        "_pdf_type": jso["_pdf_type"],
        "model_list": jso["doc_layout_result"],
    }
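The module docstring above pins down the intended wiring: the caller resolves credentials through the spark cluster api and hands ready-made reader/writer objects to magic_pdf, never the other way around. A hedged sketch of that caller-side wiring; every name is illustrative, and `get_s3_credentials` stands in for the cluster api:

```python
def run_one_pdf(get_s3_credentials, pdf_s3_path, image_save_path, pdf_models):
    # caller side (code-clean) resolves credentials; magic_pdf never does
    ak, sk, endpoint = get_s3_credentials(pdf_s3_path)
    pdf_reader = S3ReaderWriter(ak, sk, endpoint, parent_path=pdf_s3_path)

    ak, sk, endpoint = get_s3_credentials(image_save_path)
    image_writer = S3ReaderWriter(ak, sk, endpoint, parent_path=image_save_path)

    pdf_bytes = pdf_reader.read(pdf_s3_path, mode="binary")
    return parse_union_pdf(pdf_bytes, pdf_models, image_writer)
```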
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
**kwargs):
"""
解析文本类pdf
"""
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
**kwargs):
"""
解析ocr类pdf
"""
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
return pdf_info_dict
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
*args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
"""
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
except Exception as e:
logger.error(f"{method.__name__} error: {e}")
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
else:
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
return pdf_info_dict
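Callers of `parse_union_pdf` can branch on `_parse_type` afterwards to see which pipeline produced the result; a short usage sketch, with `pdf_bytes`, `pdf_models` and `imageWriter` assumed to exist:

```python
pdf_info_dict = parse_union_pdf(pdf_bytes, pdf_models, imageWriter, is_debug=True)
if pdf_info_dict["_parse_type"] == PARSE_TYPE_OCR:
    print("txt parse dropped or failed; this result came from the OCR pipeline")
```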
......@@ -14,5 +14,7 @@ termcolor>=2.4.0
wordninja>=2.0.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl
scikit-learn==1.4.1.post1
nltk==3.8.1
\ No newline at end of file
scikit-learn>=1.0.2
nltk==3.8.1
s3pathlib>=2.1.1
# Tool script usage guide