Unverified Commit 1d81631b authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #74 from myhloli/master

add para_to_standard_format logic
parents a4b687e8 d3542f6a
......@@ -84,13 +84,13 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
path=f"{pdf_file_name}.json",
mode=AbsReaderWriter.MODE_TXT,
)
# try:
# content_list = pipe.pipe_mk_uni_format()
# except Exception as e:
# logger.exception(e)
# md_writer.write(
# str(content_list), f"{part_file_name}.txt", AbsReaderWriter.MODE_TXT
# )
try:
content_list = pipe.pipe_mk_uni_format()
except Exception as e:
logger.exception(e)
md_writer.write(
str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
)
@click.group()
......
......@@ -201,16 +201,58 @@ def para_to_standard_format(para, img_buket_path):
return para_content
def para_to_standard_format_v2(para_block, img_buket_path):
    """Convert one paragraph-level block into a standard-format content dict.

    The shape of the result depends on ``para_block['type']``:

    * ``BlockType.Text``: ``{'type': 'text', 'text': ...}``
    * ``BlockType.Title``: same as text, plus ``'text_level': 1``
    * ``BlockType.InterlineEquation``: ``{'type': 'equation', 'text': ...,
      'text_format': 'latex'}``
    * ``BlockType.Image``: ``{'type': 'image'}`` plus ``'img_path'`` and
      ``'img_caption'`` when the matching sub-blocks are present
    * ``BlockType.Table``: ``{'type': 'table'}`` plus ``'img_path'``,
      ``'table_caption'`` and ``'table_footnote'`` when the matching
      sub-blocks are present

    Args:
        para_block: Block dict with a ``'type'`` key. Image and table blocks
            must also carry a ``'blocks'`` list of sub-block dicts.
        img_buket_path: Prefix joined (via ``join_path``) with the
            ``image_path`` stored on the first span of the first line of an
            image/table body sub-block.

    Returns:
        The content dict described above, or an empty dict when the block
        type is not one of the handled types.
    """
    para_type = para_block['type']
    # Start from an empty dict so an unrecognised block type returns a value
    # instead of raising UnboundLocalError at the final ``return``.
    para_content = {}
    if para_type == BlockType.Text:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
            'text_level': 1,
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
            'text_format': "latex",
        }
    elif para_type == BlockType.Image:
        para_content = {
            'type': 'image',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
                # The image path lives on the first span of the first line.
                para_content['img_path'] = join_path(
                    img_buket_path,
                    block["lines"][0]["spans"][0]['image_path'],
                )
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'] = merge_para_with_text(block)
    elif para_type == BlockType.Table:
        para_content = {
            'type': 'table',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                # The table body is referenced through an image path, read
                # from the first span of the first line.
                para_content['img_path'] = join_path(
                    img_buket_path,
                    block["lines"][0]["spans"][0]['image_path'],
                )
            if block['type'] == BlockType.TableCaption:
                para_content['table_caption'] = merge_para_with_text(block)
            if block['type'] == BlockType.TableFootnote:
                para_content['table_footnote'] = merge_para_with_text(block)
    return para_content
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str) -> list:
    """Build the flat standard-format content list for a parsed document.

    Each page's ``"para_blocks"`` entry is treated as a flat list of
    paragraph-level blocks, and every block is converted with
    ``para_to_standard_format_v2``. The superseded nested iteration that
    fed individual paragraphs to ``para_to_standard_format`` is dropped;
    keeping it next to the v2 loop would walk each page twice.

    Args:
        pdf_info_dict: Iterable of per-page dicts. Pages whose
            ``"para_blocks"`` entry is missing or empty are skipped.
        img_buket_path: Image path prefix forwarded unchanged to
            ``para_to_standard_format_v2``.

    Returns:
        A list with one converted content entry per paragraph block, in
        page order and then block order.
    """
    content_list = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            # Nothing to convert on this page.
            continue
        content_list.extend(
            para_to_standard_format_v2(para_block, img_buket_path)
            for para_block in paras_of_layout
        )
    return content_list
......
......@@ -92,7 +92,8 @@ class AbsPipe(ABC):
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == AbsPipe.PIP_TXT:
content_list = mk_universal_format(pdf_info_list, img_buket_path)
# content_list = mk_universal_format(pdf_info_list, img_buket_path)
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
elif parse_type == AbsPipe.PIP_OCR:
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
return content_list
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment