Unverified Commit 435ab922 authored by drunkpig's avatar drunkpig Committed by GitHub

Merge branch 'master' into master

parents 18b02ae3 ef03c906
......@@ -21,7 +21,11 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
"""
import os
import json as json_parse
from datetime import datetime
import click
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
parse_s3path,
......@@ -29,25 +33,14 @@ from magic_pdf.libs.path_utils import (
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
from magic_pdf.spark.spark_api import parse_union_pdf, parse_txt_pdf, parse_ocr_pdf
import os
import json as json_parse
from datetime import datetime
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter, MODE_BIN, MODE_TXT
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def get_pdf_parse_method(method):
    """Select the parser callable that matches the requested parse method.

    ``"ocr"`` selects ``parse_ocr_pdf`` and ``"txt"`` selects
    ``parse_txt_pdf``; every other value falls back to ``parse_union_pdf``.
    """
    if method == "ocr":
        selected = parse_ocr_pdf
    elif method == "txt":
        selected = parse_txt_pdf
    else:
        selected = parse_union_pdf
    return selected
def prepare_env():
local_parent_dir = os.path.join(
get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d")
......@@ -60,6 +53,28 @@ def prepare_env():
return local_image_dir, local_md_dir
def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
    """Parse one PDF and write its markdown plus intermediate JSON.

    The PDF is treated as ``"ocr"`` only when ``parse_method`` is exactly
    ``"ocr"``; any other value is handed to the pipe as ``"txt"``. Both output
    files share a name derived from the current wall-clock time (``HH-MM-SS``)
    and are written through ``md_writer`` in text mode.
    """
    pipe = UNIPipe()
    pdf_type = "ocr" if parse_method == "ocr" else "txt"
    pipe_input = {"_pdf_type": pdf_type, "model_list": model_list}

    mid_data = pipe.parse(pdf_bytes, image_writer, pipe_input)
    markdown = UNIPipe.mk_markdown(mid_data, image_dir)

    # One timestamp for both files so the .md and .json always pair up.
    stem = datetime.now().strftime("%H-%M-%S")
    md_writer.write(content=markdown, path=f"{stem}.md", mode=MODE_TXT)

    mid_json = json_parse.dumps(
        JsonCompressor.decompress_json(mid_data), ensure_ascii=False, indent=4
    )
    md_writer.write(content=mid_json, path=f"{stem}.json", mode=MODE_TXT)
# Root click command group; the sub-commands in this file attach to it via
# ``@cli.command()``. It is documented with comments rather than a docstring
# because click would surface a docstring as the group's help text.
@click.group()
def cli():
    # Nothing to do at group level; the work happens in the sub-commands.
    pass
......@@ -96,11 +111,20 @@ def json_command(json, method):
jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
pdf_data = read_s3_path(jso["file_location"])
local_image_dir, _ = prepare_env()
local_image_dir, local_md_dir = prepare_env()
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
local_image_rw = DiskReaderWriter(local_image_dir)
parse = get_pdf_parse_method(method)
parse(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
_do_parse(
pdf_data,
jso['doc_layout_result'],
method,
local_image_rw,
local_md_rw,
local_image_dir,
)
@cli.command()
......@@ -128,15 +152,22 @@ def pdf_command(pdf, model, method):
pdf_data = read_fn(pdf)
jso = json_parse.loads(read_fn(model).decode("utf-8"))
local_image_dir, _ = prepare_env()
local_image_rw = DiskReaderWriter(local_image_dir)
parse = get_pdf_parse_method(method)
parse(pdf_data, jso, local_image_rw, is_debug=True)
local_image_dir, local_md_dir = prepare_env()
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_data,
jso,
method,
local_image_rw,
local_md_rw,
local_image_dir,
)
# Script entry point: hand control to the click command group.
if __name__ == "__main__":
    # The string below is a no-op expression holding example invocations.
    """
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/format/v070/part-66028dd46437-000076.jsonl?bytes=0,308393
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
"""
    cli()
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
......
class MagicModel:
    """Accessor over layout-model output for a page (work-in-progress skeleton).

    Contract: every getter returns an empty list when it has no elements to
    report. The getters below are placeholders awaiting their owners'
    implementations, so for now they all return empty results.
    """

    def __init__(self, model_list: list, page: "Page"):
        """Store the model output and the page, then normalise coordinates.

        Args:
            model_list: Raw model output. Its schema is not visible in this
                part of the file.
            page: The page the model output refers to. The annotation is a
                string so the class can be defined even where ``Page`` is not
                imported (presumably a PyMuPDF page -- TODO confirm).
        """
        self.__model_list = model_list
        # Set the page before fixing axes so the future coordinate fix can
        # read it if it needs the page geometry.
        self.__page = page
        self.__fix_axis()

    def __fix_axis(self):
        """Normalise the coordinates stored in the model output.

        TODO: the computation is not implemented yet. The model list is left
        untouched instead of being overwritten with an undefined placeholder.
        """

    def get_imgs(self, page_no: int) -> list:
        """Return the images on a page together with their captions.

        Intended element shape once implemented::

            {
                "img": {"bbox": [x0, y0, x1, y1]},
                "caption": {"bbox": [x0, y0, x1, y1], "text": ""},
            }
        """
        # TODO(Xu Rui): not implemented yet.
        return []

    def get_tables(self, page_no: int) -> list:
        """Return the tables on a page.

        Each table is meant to carry three bounding boxes: caption, table
        body and table note.
        """
        # TODO(Xu Rui): not implemented yet.
        return []

    def get_equations(self, page_no: int) -> tuple:
        """Return ``(inline_equations, interline_equations)`` for a page.

        Equations are meant to carry both coordinates and text.
        """
        # TODO(Kevin): not implemented yet.
        inline_equations = []
        interline_equations = []
        return inline_equations, interline_equations

    def get_discarded(self, page_no: int) -> list:
        """Return discarded regions (in-house model; coordinates only)."""
        # TODO(Kevin): not implemented yet.
        return []

    def get_text_blocks(self, page_no: int) -> list:
        """Return text blocks (in-house model; coordinates only, no text)."""
        # TODO(Kevin): not implemented yet.
        return []

    def get_title_blocks(self, page_no: int) -> list:
        """Return title blocks (in-house model; coordinates only, no text)."""
        # TODO(Kevin): not implemented yet.
        return []

    def get_ocr_text(self, page_no: int) -> list:
        """Return OCR text produced by Paddle (text and coordinates)."""
        # TODO(Xiao Meng): not implemented yet.
        return []

    def get_ocr_spans(self, page_no: int) -> list:
        """Return OCR spans for a page."""
        # TODO(Xiao Meng): not implemented yet.
        return []
\ No newline at end of file
......@@ -299,9 +299,9 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_list_info[0] = True
if end==total_lines-1:
layout_list_info[1] = True
else:
else: # 是普通文本
for i, line in enumerate(lines[start:end+1]):
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断i行自己的结尾特征。
cur_line_type = line['spans'][-1]['type']
next_line = lines[i+1] if i<total_lines-1 else None
......@@ -341,6 +341,8 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
"""
if len(layout_paras)==0 or len(layout_list_info)==0: # 0的时候最后的return 会出错
return layout_paras, [False, False]
# if page_num==343:
# pass
for i in range(1, len(layout_paras)):
pre_layout_list_info = layout_list_info[i-1]
......
import json
from loguru import logger
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
......
import os
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
......
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key
import boto3
from loguru import logger
......@@ -11,7 +11,7 @@ MODE_BIN = "binary"
class S3ReaderWriter(AbsReaderWriter):
def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str, parent_path: str):
def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
self.path = parent_path
......
......@@ -14,7 +14,7 @@
"""
from loguru import logger
from magic_pdf.io import AbsReaderWriter
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment