Commit 66e3ce9c (unverified), authored by Xiaomeng Zhao, committed by GitHub

fix(ocr_mkcontent): improve language detection and content formatting (#458)

Optimize the language detection logic to enhance content formatting. This
change addresses issues with long-word segmentation. Language detection now classifies
a text as English when the proportion of English characters reaches a threshold.
Formatting rules for content have been updated to consider a list of languages (initially
Chinese, Japanese, and Korean) for which no space is added between content segments
such as inline equations and text spans, improving the handling of Asian languages.

The impact of these changes includes improved accuracy in language detection, better
segmentation of long words, and more appropriate spacing in content formatting for multiple
languages.
parent f4316f02
...@@ -14,7 +14,7 @@ def split_long_words(text): ...@@ -14,7 +14,7 @@ def split_long_words(text):
for i in range(len(segments)): for i in range(len(segments)):
words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE) words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
for j in range(len(words)): for j in range(len(words)):
if len(words[j]) > 15: if len(words[j]) > 10:
words[j] = ' '.join(wordninja.split(words[j])) words[j] = ' '.join(wordninja.split(words[j]))
segments[i] = ''.join(words) segments[i] = ''.join(words)
return ' '.join(segments) return ' '.join(segments)
...@@ -147,6 +147,18 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -147,6 +147,18 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
def detect_language(text, threshold=0.5):
    """Classify ``text`` as English based on its share of ASCII letters.

    The total length of all runs of ``a-z``/``A-Z`` characters is divided by
    the full length of ``text`` (every character counts in the denominator,
    including spaces, digits and punctuation).

    Args:
        text: The string to classify.
        threshold: Minimum ratio of ASCII letters to total characters
            required to report English. Defaults to 0.5.

    Returns:
        ``'en'`` when the ratio is at least ``threshold``, ``"unknown"``
        when it is lower, and ``"empty"`` when ``text`` is empty.
    """
    # Guard first: an empty string would otherwise divide by zero below.
    if not text:
        return "empty"
    en_length = sum(len(match) for match in re.findall(r'[a-zA-Z]+', text))
    if en_length / len(text) >= threshold:
        return 'en'
    return "unknown"
para_text = '' para_text = ''
for line in para_block['lines']: for line in para_block['lines']:
line_text = "" line_text = ""
...@@ -162,7 +174,8 @@ def merge_para_with_text(para_block): ...@@ -162,7 +174,8 @@ def merge_para_with_text(para_block):
content = '' content = ''
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = span['content'] content = span['content']
language = detect_lang(content) # language = detect_lang(content)
language = detect_language(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content)) content = ocr_escape_special_markdown_char(split_long_words(content))
else: else:
...@@ -171,12 +184,12 @@ def merge_para_with_text(para_block): ...@@ -171,12 +184,12 @@ def merge_para_with_text(para_block):
content = f" ${span['content']}$ " content = f" ${span['content']}$ "
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
if content != '': if content != '':
if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断 langs = ['zh', 'ja', 'ko']
para_text += content # 中文语境下,content间不需要空格分隔 if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
else: else:
para_text += content + ' ' # 英文语境下 content间需要空格分隔 para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
return para_text return para_text
...@@ -202,7 +215,6 @@ def para_to_standard_format(para, img_buket_path): ...@@ -202,7 +215,6 @@ def para_to_standard_format(para, img_buket_path):
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$" content = f"${span['content']}$"
inline_equation_num += 1 inline_equation_num += 1
if language == 'en': # 英文语境下 content间需要空格分隔 if language == 'en': # 英文语境下 content间需要空格分隔
para_text += content + ' ' para_text += content + ' '
else: # 中文语境下,content间不需要空格分隔 else: # 中文语境下,content间不需要空格分隔
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment