Commit f856695c authored by 赵小蒙

Merge remote-tracking branch 'origin/master'

parents 1ee81a9a 3726f689
import os
import json
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
# Demo: convert "<demo_name>.pdf" (located next to this script) to Markdown,
# using the model output stored alongside it in "<demo_name>.json".
current_script_dir = os.path.dirname(os.path.abspath(__file__))
demo_name = "demo1"
pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
model_path = os.path.join(current_script_dir, f"{demo_name}.json")

# Read both inputs through context managers so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open(pdf_path, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()
with open(model_path, "r", encoding="utf-8") as model_file:
    model_json = json.load(model_file)

# "_pdf_type" is passed in empty; presumably pipe_classify() fills it in --
# TODO confirm against UNIPipe.
jso_useful_key = {"_pdf_type": "", "model_list": model_json}

# Images go to "<script dir>/images"; only the directory's base name
# ("images") is handed to the Markdown step.
local_image_dir = os.path.join(current_script_dir, 'images')
image_dir = os.path.basename(local_image_dir)
image_writer = DiskReaderWriter(local_image_dir)

pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")

# NOTE: the output path is relative, so the .md file is created in the
# current working directory, not necessarily next to this script.
with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
    f.write(md_content)
# Demo: convert "<demo_name>.pdf" (located next to this script) to Markdown,
# using the model output stored alongside it in "<demo_name>.json".
# The whole run is wrapped in one try/except so that any failure is logged
# with its traceback via loguru instead of terminating the script.
try:
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    demo_name = "demo1"
    pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
    model_path = os.path.join(current_script_dir, f"{demo_name}.json")

    # Read both inputs through context managers so the file handles are
    # closed deterministically, even when a later step raises.
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    with open(model_path, "r", encoding="utf-8") as model_file:
        model_json = json.load(model_file)

    # "_pdf_type" is passed in empty; presumably pipe_classify() fills it
    # in -- TODO confirm against UNIPipe.
    jso_useful_key = {"_pdf_type": "", "model_list": model_json}

    # Images go to "<script dir>/images"; only the directory's base name
    # ("images") is handed to the Markdown step.
    local_image_dir = os.path.join(current_script_dir, 'images')
    image_dir = os.path.basename(local_image_dir)
    image_writer = DiskReaderWriter(local_image_dir)

    pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
    pipe.pipe_classify()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")

    # NOTE: the output path is relative, so the .md file is created in the
    # current working directory, not necessarily next to this script.
    with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
        f.write(md_content)
except Exception as e:
    # Top-level boundary of a demo script: log the error and traceback.
    logger.exception(e)
\ No newline at end of file
import unicodedata
from fast_langdetect import detect_langs
from fast_langdetect import detect_language
def detect_lang(text: str) -> str:
if len(text) == 0:
return ""
try:
lang_upper = detect_langs(text)
lang_upper = detect_language(text)
except:
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
lang_upper = detect_langs(html_no_ctrl_chars)
lang_upper = detect_language(html_no_ctrl_chars)
try:
lang = lang_upper.lower()
except:
......
__version__ = "0.5.12"
__version__ = "0.5.13"
......@@ -8,4 +8,5 @@ fast-langdetect>=0.1.1
wordninja>=2.0.0
scikit-learn>=1.0.2
pdfminer.six>=20231228
numpy<2.0.0 #2.0版本与fasttext不兼容
# requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员
\ No newline at end of file
......@@ -35,7 +35,7 @@ if __name__ == '__main__':
description="A practical tool for converting PDF to Markdown", # 简短描述
long_description=long_description, # 详细描述
long_description_content_type="text/markdown", # 如果README是Markdown格式
url="https://github.com/magicpdf/Magic-PDF",
url="https://github.com/opendatalab/MinerU",
python_requires=">=3.9", # 项目依赖的 Python 版本
entry_points={
"console_scripts": [
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment