Unverified Commit 83e0d55a authored by drunkpig's avatar drunkpig Committed by GitHub

fix: replace \u0002, \u0003 in common text (#521)

* fix replace \u0002, \u0003 in common text

* fix(para): When an English line ends with a hyphen, do not add a space at the end.
parent c0704f75
...@@ -30,10 +30,10 @@ tmp/ ...@@ -30,10 +30,10 @@ tmp/
tmp tmp
.vscode .vscode
.vscode/ .vscode/
/tests/
ocr_demo ocr_demo
/app/common/__init__.py /app/common/__init__.py
/magic_pdf/config/__init__.py /magic_pdf/config/__init__.py
source.dev.env source.dev.env
tmp
...@@ -9,6 +9,20 @@ import wordninja ...@@ -9,6 +9,20 @@ import wordninja
import re import re
def __is_hyphen_at_line_end(line):
    """
    Check whether a line ends with an ASCII letter followed by a hyphen.

    Trailing whitespace after the hyphen is ignored, so both ``"math-"`` and
    ``"math- "`` count as hyphenated line ends. Only ASCII letters qualify:
    a digit, symbol or CJK character directly before the hyphen does not.

    Args:
        line (str): The line of text to check.
    Returns:
        bool: True if the line ends with a letter followed by a hyphen
            (optionally followed by whitespace), False otherwise.
    """
    # Empty or missing text cannot end with a hyphenated word.
    if not line:
        return False
    # One letter directly before the hyphen is enough to decide; matching the
    # whole run of letters would give the same answer with extra backtracking.
    return re.search(r'[A-Za-z]-\s*$', line) is not None
def split_long_words(text): def split_long_words(text):
segments = text.split(' ') segments = text.split(' ')
for i in range(len(segments)): for i in range(len(segments)):
...@@ -184,10 +198,17 @@ def merge_para_with_text(para_block): ...@@ -184,10 +198,17 @@ def merge_para_with_text(para_block):
content = f" ${span['content']}$ " content = f" ${span['content']}$ "
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
if content != '': if content != '':
langs = ['zh', 'ja', 'ko'] langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断 if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔 para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
elif line_lang == 'en':
# 如果是前一行带有-连字符,那么末尾不应该加空格
if __is_hyphen_at_line_end(para_text):
para_text += content
else:
para_text += content + ' '
else: else:
para_text += content + ' ' # 西方文本语境下 content间需要空格分隔 para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
return para_text return para_text
......
...@@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes): ...@@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
return is_useful_block_horz_overlap, all_bboxes return is_useful_block_horz_overlap, all_bboxes
def __replace_STX_ETX(text_str: str):
    r"""Replace the control characters \u0002 (STX) and \u0003 (ETX) with a single quote.

    These characters show up as garbled output when text is extracted with
    pymupdf; in the original document they were quotation marks.
    Drawback: this has only been observed in English text so far; it has not
    been found in Chinese text.

    Args:
        text_str (str): Raw extracted text.

    Returns:
        str: The text with every \u0002 and \u0003 replaced by ``'``. A falsy
            input (empty string or None) is returned unchanged.
    """
    # Nothing to replace in empty/missing text; hand it back as-is.
    if not text_str:
        return text_str
    return text_str.replace('\u0002', "'").replace('\u0003', "'")
def txt_spans_extract(pdf_page, inline_equations, interline_equations): def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
...@@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations): ...@@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
spans.append( spans.append(
{ {
"bbox": list(span["bbox"]), "bbox": list(span["bbox"]),
"content": span["text"], "content": __replace_STX_ETX(span["text"]),
"type": ContentType.Text, "type": ContentType.Text,
"score": 1.0, "score": 1.0,
} }
......
from magic_pdf.dict2md.ocr_mkcontent import __is_hyphen_at_line_end
def test_hyphen_at_line_end():
    """
    Test whether the end of a line is detected as a hyphen.

    Lines in ``test_cases_ok`` end with an ASCII letter followed by a hyphen
    (optionally with trailing whitespace) and must be accepted; lines in
    ``test_cases_bad`` have a symbol, digit or CJK character directly before
    the hyphen and must be rejected.
    """
    test_cases_ok = [
        "I am zhang-",
        "you are zhang- ",
        "math-",
        "This is a TEST-",
        "This is a TESTing-",
        "美国人 hello-",
    ]
    test_cases_bad = [
        "This is a TEST$-",
        "This is a TEST21-",
        "中国人-",
        "美国人 hello人-",
        "this is 123-",
    ]
    # Include the offending input in the message so a failure is easy to locate.
    for test_case in test_cases_ok:
        assert __is_hyphen_at_line_end(test_case), f"expected hyphen at line end: {test_case!r}"
    for test_case in test_cases_bad:
        assert not __is_hyphen_at_line_end(test_case), f"unexpected hyphen at line end: {test_case!r}"
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment