Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
00f16239
Commit
00f16239
authored
Apr 10, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
实现parse_ocr_pdf api,切图逻辑s3使用平铺地址,本地使用层级地址,删除预设s3_image_save_path
parent
cfac3b25
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
139 additions
and
22 deletions
+139
-22
python-package.yml
.github/workflows/python-package.yml
+77
-0
ocr_demo.py
demo/ocr_demo.py
+1
-1
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+4
-5
commons.py
magic_pdf/libs/commons.py
+1
-1
hash_utils.py
magic_pdf/libs/hash_utils.py
+15
-0
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+23
-11
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+6
-3
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+3
-0
spark_api.py
magic_pdf/spark/spark_api.py
+9
-1
No files found.
.github/workflows/python-package.yml
0 → 100644
View file @
00f16239
# Builds a wheel for this package and publishes it as a GitHub release.
# NOTE: despite the template this file started from, no test or lint step is
# defined here, and the matrix contains a single Python version.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

# Runs when a tag matching '*released' is pushed, or when started manually.
on:
  push:
    tags:
      - '*released'
  workflow_dispatch:

jobs:
  # Job 1: build the wheel and hand it to the release job as an artifact.
  build:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # presumably 0 means "fetch full history" — confirm against actions/checkout docs
          fetch-depth: 0

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # requirements.txt is optional: it is installed only when the file exists.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

      - name: Install wheel
        run: |
          python -m pip install wheel

      - name: Build wheel
        run: |
          python setup.py bdist_wheel

      # The artifact name must match the one downloaded by the release job below.
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: wheel-file
          path: dist/*.whl
          retention-days: 30

  # Job 2: attach the wheel built above to a GitHub release.
  release:
    needs: [build]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download artifact
        uses: actions/download-artifact@v4
        with:
          name: wheel-file
          path: dist

      # The release action is pinned to a full commit SHA rather than a tag.
      - name: Create and Upload Release
        id: create_release
        uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981
        with:
          files: './dist/*.whl'
        env:
          # Uses the repository secret RELEASE_TOKEN as the GitHub token.
          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}

      # Publishing to PyPI is currently disabled:
      # - name: Publish to PyPI
      #   uses: pypa/gh-action-pypi-publish@release/v1
      #   with:
      #     user: __token__
      #     password: ${{ secrets.PYPI_TOKEN }}
demo/ocr_demo.py
View file @
00f16239
...
...
@@ -116,7 +116,7 @@ if __name__ == '__main__':
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path)
book_name
=
"
科数网/edu_00011318
"
book_name
=
"
数学新星网/edu_00001236
"
ocr_online_parse
(
book_name
)
pass
magic_pdf/dict2md/ocr_mkcontent.py
View file @
00f16239
from
magic_pdf.libs.commons
import
s3_image_save_path
,
join_path
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.ocr_content_type
import
ContentType
...
...
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if
not
span
.
get
(
'image_path'
):
continue
else
:
content
=
f
"
})"
content
=
f
""
else
:
content
=
ocr_escape_special_markdown_char
(
span
[
'content'
])
# 转义特殊符号
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
...
...
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
if
mode
==
'mm'
:
content
=
f
"
\n

})
\n
"
content
=
f
"
\n

\n
"
elif
mode
==
'nlp'
:
pass
if
content
!=
''
:
...
...
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
if
span
[
'type'
]
==
ContentType
.
Image
:
content
=
{
'type'
:
'image'
,
'img_path'
:
join_path
(
s3_image_save_path
,
span
[
'image_path'
])
'img_path'
:
span
[
'image_path'
]
}
return
content
elif
span
[
'type'
]
==
ContentType
.
Table
:
content
=
{
'type'
:
'table'
,
'img_path'
:
join_path
(
s3_image_save_path
,
span
[
'image_path'
])
'img_path'
:
span
[
'image_path'
]
}
return
content
else
:
...
...
magic_pdf/libs/commons.py
View file @
00f16239
...
...
@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path
=
"s3://llm-pdf-text/json_dump/"
s3_image_save_path
=
"s3://mllm-raw-media/pdf2md_img/"
# TODO
基础库不应该有这些存在的路径,应该在业务代码中定义
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" #
基础库不应该有这些存在的路径,应该在业务代码中定义
def
get_top_percent_list
(
num_list
,
percent
):
...
...
magic_pdf/libs/hash_utils.py
0 → 100644
View file @
00f16239
import
hashlib
def compute_md5(file_bytes):
    """Return the MD5 digest of ``file_bytes`` as an uppercase hex string.

    Args:
        file_bytes: The raw bytes to hash.

    Returns:
        The 32-character hexadecimal digest, upper-cased.
    """
    digest = hashlib.md5(file_bytes).hexdigest()
    return digest.upper()
def compute_sha256(input_string):
    """Return the SHA-256 digest of ``input_string`` as a lowercase hex string.

    Args:
        input_string: The value to hash. A ``str`` is encoded as UTF-8 before
            hashing; ``bytes``, ``bytearray`` and ``memoryview`` are hashed
            as-is.

    Returns:
        The 64-character lowercase hexadecimal digest.

    Raises:
        TypeError: If ``input_string`` is neither ``str`` nor bytes-like
            (for example ``None``). Raised explicitly so the caller sees the
            offending type instead of an opaque ``AttributeError`` on
            ``.encode``.
    """
    if isinstance(input_string, str):
        # Hash functions operate on bytes, so text has to be encoded first.
        input_bytes = input_string.encode('utf-8')
    elif isinstance(input_string, (bytes, bytearray, memoryview)):
        input_bytes = input_string
    else:
        raise TypeError(
            "compute_sha256 expects str or bytes-like input, got "
            f"{type(input_string).__name__}"
        )
    return hashlib.sha256(input_bytes).hexdigest()
magic_pdf/libs/pdf_image_tools.py
View file @
00f16239
...
...
@@ -7,6 +7,7 @@ import io
from
magic_pdf.libs.commons
import
fitz
from
loguru
import
logger
from
magic_pdf.libs.commons
import
parse_bucket_key
,
join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_return_path
=
None
,
img_s3_client
=
None
,
upload_switch
=
True
):
...
...
@@ -16,9 +17,13 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
"""
# 拼接文件名
filename
=
f
"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
# 拼接路径
image_save_path
=
join_path
(
save_parent_path
,
filename
)
# 老版本返回不带bucket的路径
s3_img_path
=
join_path
(
s3_return_path
,
filename
)
if
s3_return_path
is
not
None
else
None
# 新版本生成s3的平铺路径
s3_img_hash256_path
=
f
"{compute_sha256(s3_img_path)}.jpg"
# 打印图片文件名
# print(f"Saved {image_save_path}")
...
...
@@ -42,12 +47,16 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
# 截取图片
pix
=
page
.
get_pixmap
(
clip
=
rect
,
matrix
=
zoom
)
if
image_save
_path
.
startswith
(
"s3://"
):
if
save_parent
_path
.
startswith
(
"s3://"
):
if
not
upload_switch
:
pass
else
:
# 图片保存到s3
bucket_name
,
bucket_key
=
parse_bucket_key
(
image_save_path
)
"""图片保存到s3"""
# 从save_parent_path获取bucket_name
bucket_name
,
bucket_key
=
parse_bucket_key
(
save_parent_path
)
# 平铺路径赋值给bucket_key
bucket_key
=
s3_img_hash256_path
# 将字节流上传到s3
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
file_obj
=
io
.
BytesIO
(
byte_data
)
...
...
@@ -58,18 +67,21 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
# img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
else
:
logger
.
exception
(
"must input img_s3_client"
)
return
s3_img_path
# return s3_img_path # 早期版本要求返回不带bucket的路径
s3_image_save_path
=
f
"s3://{bucket_name}/{s3_img_hash256_path}"
# 新版本返回平铺的s3路径
return
s3_image_save_path
else
:
# 保存图片到本地
# 先检查一下image_save_path的父目录是否存在,如果不存在,就创建
parent_dir
=
os
.
path
.
dirname
(
image_save_path
)
local_image_save_path
=
join_path
(
save_parent_path
,
filename
)
parent_dir
=
os
.
path
.
dirname
(
local_image_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
pix
.
save
(
image_save_path
,
jpg_quality
=
95
)
pix
.
save
(
local_
image_save_path
,
jpg_quality
=
95
)
# 为了直接能在markdown里看,这里把地址改为相对于mardown的地址
pth
=
Path
(
image_save_path
)
image_save_path
=
f
"{pth.parent.name}/{pth.name}"
return
image_save_path
pth
=
Path
(
local_
image_save_path
)
local_
image_save_path
=
f
"{pth.parent.name}/{pth.name}"
return
local_
image_save_path
def
save_images_by_bboxes
(
book_name
:
str
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_path
:
str
,
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
00f16239
...
...
@@ -15,6 +15,7 @@ from magic_pdf.libs.commons import (
)
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.para.para_split
import
para_split
...
...
@@ -39,18 +40,18 @@ def parse_pdf_by_ocr(
pdf_bytes
,
pdf_model_output
,
save_path
,
book_name
,
book_name
=
""
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
debug_mode
=
False
,
):
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
book_name
=
sanitize_filename
(
book_name
)
md_bookname_save_path
=
""
if
debug_mode
:
book_name
=
sanitize_filename
(
book_name
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
pdf_local_path
=
join_path
(
save_tmp_path
,
"download-pdfs"
,
book_name
)
...
...
@@ -179,6 +180,8 @@ def parse_pdf_by_ocr(
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
'''对image和table截图'''
if
book_name
==
""
:
book_name
=
pdf_bytes_md5
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
...
...
magic_pdf/pre_proc/ocr_cut_image.py
View file @
00f16239
...
...
@@ -4,6 +4,9 @@ from magic_pdf.libs.pdf_image_tools import cut_image
def
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
):
"""spark环境book_name为pdf_bytes_md5,本地环境会传正常bookname"""
def
s3_return_path
(
type
):
return
join_path
(
book_name
,
type
)
...
...
magic_pdf/spark/spark_api.py
View file @
00f16239
...
...
@@ -15,6 +15,7 @@
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
...
...
@@ -28,7 +29,14 @@ def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWrite
"""
解析ocr类pdf
"""
pass
pdf_info_dict
=
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
return
pdf_info_dict
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment