Commit ec1a6ef7 authored by 赵小蒙's avatar 赵小蒙

增加生成多模态markdown逻辑

parent 3c8b2545
...@@ -21,3 +21,31 @@ def mk_nlp_markdown(pdf_info_dict: dict): ...@@ -21,3 +21,31 @@ def mk_nlp_markdown(pdf_info_dict: dict):
# 在行末添加两个空格以强制换行 # 在行末添加两个空格以强制换行
markdown.append(line_text.strip() + ' ') markdown.append(line_text.strip() + ' ')
return '\n'.join(markdown) return '\n'.join(markdown)
def mk_mm_markdown(pdf_info_dict: dict):
    """Build a multimodal Markdown string from parsed PDF page info.

    Every page value in ``pdf_info_dict`` is searched for a
    ``"preproc_blocks"`` list; pages without one (or with an empty one)
    are skipped. Each line of each block becomes one Markdown line made
    of its spans joined by single spaces:

    * a span with non-empty ``content`` is emitted as text with every
      ``$`` escaped as ``\\$``; it is then wrapped in ``$...$`` when its
      type is ``inline_equation`` or in a ``$$`` fence when its type is
      ``displayed_equation``;
    * a span without content but with an ``image_path`` is emitted as a
      Markdown image reference ``![](path)``;
    * a span with neither is dropped.

    Args:
        pdf_info_dict: mapping whose values are per-page dicts.

    Returns:
        The Markdown lines joined with ``"\\n"``; each line ends with two
        spaces so that Markdown renders a hard line break.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                parts = []
                for span in line['spans']:
                    text = span.get('content')
                    if text:
                        # Escape literal dollar signs so they are not
                        # mistaken for math delimiters.
                        content = text.replace('$', '\\$')
                        span_type = span.get('type')
                        if span_type == 'inline_equation':
                            content = f"${content}$"
                        elif span_type == 'displayed_equation':
                            content = f"$$\n{content}\n$$"
                    elif span.get('image_path'):
                        content = f"![]({span['image_path']})"
                    else:
                        # Nothing renderable in this span.
                        continue
                    parts.append(content)
                # Two trailing spaces force a hard line break in Markdown.
                markdown.append(' '.join(parts).strip() + '  ')
    return '\n'.join(markdown)
...@@ -12,8 +12,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path): ...@@ -12,8 +12,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for span in spans: for span in spans:
span_type = span['type'] span_type = span['type']
if span_type == 'image': if span_type == 'image':
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('image')) span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
elif span_type == 'table': elif span_type == 'table':
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('table')) span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
return spans return spans
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment