Commit 99055af3 authored by 赵小蒙

英文文本拼接时,如果单个单词超过15个字符,则对该单词进行切分处理。行间公式/图片/表格独立占有一行

parent 1d5d7781
from magic_pdf.libs.commons import s3_image_save_path, join_path from magic_pdf.libs.commons import s3_image_save_path, join_path
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
import wordninja
import re
def split_long_words(text, max_word_len=15):
    """Insert spaces into over-long run-together English words in *text*.

    Every maximal run of word characters (``\\w+``) longer than
    ``max_word_len`` is handed to ``wordninja.split`` and replaced by the
    resulting pieces joined with single spaces. Everything else in *text*
    (punctuation and all whitespace, including tabs and newlines) is left
    exactly as it was.

    Args:
        text: The text to process.
        max_word_len: Longest token length that is left untouched; only
            tokens strictly longer than this are segmented. Defaults to 15.

    Returns:
        The text with over-long ASCII tokens segmented into words.
    """

    def _segment(match):
        word = match.group(0)
        if len(word) <= max_word_len:
            return word
        # wordninja segments with an English model and throws away every
        # character outside [A-Za-z0-9'], so feeding it a long CJK/accented
        # token would silently delete content. Leave such tokens alone.
        if re.fullmatch(r'\w+', word, re.ASCII) is None:
            return word
        pieces = list(wordninja.split(word))
        # An empty result (e.g. a long run of underscores) means wordninja
        # discarded the whole token; keep the original rather than erase it.
        return ' '.join(pieces) if pieces else word

    # Substituting in place (instead of tokenising and re-joining) keeps all
    # characters between word tokens byte-for-byte intact.
    return re.sub(r'\w+', _segment, text)
def ocr_mk_nlp_markdown(pdf_info_dict: dict): def ocr_mk_nlp_markdown(pdf_info_dict: dict):
...@@ -67,17 +80,18 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict): ...@@ -67,17 +80,18 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
for span in line['spans']: for span in line['spans']:
span_type = span.get('type') span_type = span.get('type')
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = span['content'] content = split_long_words(span['content'])
# content = span['content']
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
content = f" ${span['content']}$ " content = f"${span['content']}$"
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"$$\n{span['content']}\n$$ " content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]: elif span_type in [ContentType.Image, ContentType.Table]:
content = f"![]({join_path(s3_image_save_path, span['image_path'])})" content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
para_text += content + ' ' para_text += content + ' '
markdown.append(para_text.strip() + ' ') markdown.append(para_text.strip() + ' ')
return '\n'.join(markdown) return '\n\n'.join(markdown)
def make_standard_format_with_para(pdf_info_dict: dict): def make_standard_format_with_para(pdf_info_dict: dict):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment