Commit fc49f5c4 authored by myhloli's avatar myhloli

refactor(magic_pdf): remove unused parameters and simplify functions

- Remove unused parameters parse_type and lang from various functions
- Simplify function calls by removing unnecessary arguments
- Update related files to reflect these changes
parent fe21eebd
...@@ -47,19 +47,17 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, ...@@ -47,19 +47,17 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
mode, mode,
img_buket_path='', img_buket_path='',
parse_type="auto",
lang=None
): ):
page_markdown = [] page_markdown = []
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_text = '' para_text = ''
para_type = para_block['type'] para_type = para_block['type']
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]: if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang) para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
para_text = f'# {merge_para_with_text(para_block, parse_type=parse_type, lang=lang)}' para_text = f'# {merge_para_with_text(para_block)}'
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang) para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
if mode == 'nlp': if mode == 'nlp':
continue continue
...@@ -72,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -72,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang) para_text += merge_para_with_text(block)
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageFootnote: if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang) para_text += merge_para_with_text(block)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
for block in para_block['blocks']: # 1st.拼table_caption for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TableCaption: if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang) para_text += merge_para_with_text(block)
for block in para_block['blocks']: # 2nd.拼table_body for block in para_block['blocks']: # 2nd.拼table_body
if block['type'] == BlockType.TableBody: if block['type'] == BlockType.TableBody:
for line in block['lines']: for line in block['lines']:
...@@ -97,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -97,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 3rd.拼table_footnote for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote: if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang) para_text += merge_para_with_text(block)
if para_text.strip() == '': if para_text.strip() == '':
continue continue
...@@ -120,7 +118,7 @@ def detect_language(text): ...@@ -120,7 +118,7 @@ def detect_language(text):
return 'empty' return 'empty'
def merge_para_with_text(para_block, parse_type="auto", lang=None): def merge_para_with_text(para_block):
para_text = '' para_text = ''
for i, line in enumerate(para_block['lines']): for i, line in enumerate(para_block['lines']):
...@@ -161,24 +159,24 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None): ...@@ -161,24 +159,24 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
return para_text return para_text
def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None): def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
para_type = para_block['type'] para_type = para_block['type']
para_content = {} para_content = {}
if para_type == BlockType.Text: if para_type == BlockType.Text:
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang), 'text': merge_para_with_text(para_block),
} }
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang), 'text': merge_para_with_text(para_block),
'text_level': 1, 'text_level': 1,
} }
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_content = { para_content = {
'type': 'equation', 'type': 'equation',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang), 'text': merge_para_with_text(para_block),
'text_format': 'latex', 'text_format': 'latex',
} }
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
...@@ -189,9 +187,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type= ...@@ -189,9 +187,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
img_buket_path, img_buket_path,
block['lines'][0]['spans'][0]['image_path']) block['lines'][0]['spans'][0]['image_path'])
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_content['img_caption'] = merge_para_with_text(block, parse_type=parse_type, lang=lang) para_content['img_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.ImageFootnote: if block['type'] == BlockType.ImageFootnote:
para_content['img_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang) para_content['img_footnote'] = merge_para_with_text(block)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
para_content = {'type': 'table'} para_content = {'type': 'table'}
for block in para_block['blocks']: for block in para_block['blocks']:
...@@ -202,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type= ...@@ -202,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n" para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path']) para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
if block['type'] == BlockType.TableCaption: if block['type'] == BlockType.TableCaption:
para_content['table_caption'] = merge_para_with_text(block, parse_type=parse_type, lang=lang) para_content['table_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.TableFootnote: if block['type'] == BlockType.TableFootnote:
para_content['table_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang) para_content['table_footnote'] = merge_para_with_text(block)
para_content['page_idx'] = page_idx para_content['page_idx'] = page_idx
...@@ -218,8 +216,7 @@ def union_make(pdf_info_dict: list, ...@@ -218,8 +216,7 @@ def union_make(pdf_info_dict: list,
make_mode: str, make_mode: str,
drop_mode: str, drop_mode: str,
img_buket_path: str = '', img_buket_path: str = '',
parse_type: str = "auto", ):
lang=None):
output_content = [] output_content = []
for page_info in pdf_info_dict: for page_info in pdf_info_dict:
drop_reason_flag = False drop_reason_flag = False
...@@ -246,20 +243,20 @@ def union_make(pdf_info_dict: list, ...@@ -246,20 +243,20 @@ def union_make(pdf_info_dict: list,
continue continue
if make_mode == MakeMode.MM_MD: if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2( page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path, parse_type=parse_type, lang=lang) paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD: elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2( page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp', parse_type=parse_type, lang=lang) paras_of_layout, 'nlp')
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout: for para_block in paras_of_layout:
if drop_reason_flag: if drop_reason_flag:
para_content = para_to_standard_format_v2( para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang, drop_reason=drop_reason) para_block, img_buket_path, page_idx)
else: else:
para_content = para_to_standard_format_v2( para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang) para_block, img_buket_path, page_idx)
output_content.append(para_content) output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content) return '\n\n'.join(output_content)
......
...@@ -95,9 +95,7 @@ class AbsPipe(ABC): ...@@ -95,9 +95,7 @@ class AbsPipe(ABC):
""" """
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
parse_type = pdf_mid_data["_parse_type"] content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
lang = pdf_mid_data.get("_lang", None)
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
return content_list return content_list
@staticmethod @staticmethod
...@@ -107,9 +105,7 @@ class AbsPipe(ABC): ...@@ -107,9 +105,7 @@ class AbsPipe(ABC):
""" """
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
parse_type = pdf_mid_data["_parse_type"] md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
lang = pdf_mid_data.get("_lang", None)
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
return md_content return md_content
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment