Commit 6b6f40f3 authored by liukaiwen

Merge branch 'master' of github.com:papayalove/Magic-PDF

parents 851d191d b7710723
# This workflow runs the Magic-PDF CLI tests on the `pdf` runner; Python dependencies are
# (re)installed only when requirements.txt changed in the latest commit.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: PDF
on:
  push:
    branches:
      - "master"
    paths-ignore:
      - "cmds/**"
      - "**.md"
  pull_request:
    branches:
      - "master"
    paths-ignore:
      - "cmds/**"
      - "**.md"
  workflow_dispatch:
jobs:
  cli-test:
    runs-on: pdf
    timeout-minutes: 40
    strategy:
      fail-fast: true
    steps:
      - name: PDF cli
        uses: actions/checkout@v3
        with:
          # Two commits are required so that `git diff HEAD~1 HEAD` below has a parent to compare with.
          fetch-depth: 2
      - name: check-requirements
        run: |
          changed_files=$(git diff --name-only -r HEAD~1 HEAD)
          echo $changed_files
          if [[ $changed_files =~ "requirements.txt" ]]; then
            # Every `run` step starts its own shell, so the proxy has to be exported in the
            # same step that needs it; exporting it in a separate step has no effect here.
            # The variables disappear when this step ends, so no explicit reset step is needed.
            # SECURITY(review): these proxy credentials are committed in plain text. Rotate them
            # and move the URL into an encrypted repository secret.
            export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
            export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
            pip install -r requirements.txt
          fi
      - name: test_cli
        run: |
          echo $GITHUB_WORKSPACE
          cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py
  notify_to_feishu:
    # Runs only when a needed job failed on master and the run was not cancelled.
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
    needs: [cli-test]
    runs-on: pdf
    steps:
      - name: get_actor
        run: |
          # Map the GitHub actor to the chat user to mention; anyone not listed falls back to "quyuan".
          metion_list="quyuan"
          echo $GITHUB_ACTOR
          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
            metion_list="xuchao"
          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
            metion_list="zhaoxiaomeng"
          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
            metion_list="xurui1"
          fi
          echo $metion_list
          # Values written to GITHUB_ENV become visible only in the following steps, so
          # env.METIONS cannot be echoed from within this step.
          echo "METIONS=$metion_list" >> "$GITHUB_ENV"
      - name: notify
        run: |
          curl ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json' -d '{
            "msgtype": "text",
            "text": {
              "mentioned_list": ["${{ env.METIONS }}"] , "content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看:https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
            }
          }'
\ No newline at end of file
......@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
page_markdown = []
for para_block in paras_of_layout:
para_text = ''
para_type = para_block.get('type')
para_type = para_block['type']
if para_type == BlockType.Text:
para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Title:
......@@ -106,32 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if mode == 'nlp':
continue
elif mode == 'mm':
img_blocks = para_block.get('blocks')
for img_block in img_blocks:
if img_block.get('type') == BlockType.ImageBody:
for line in img_block.get('lines'):
for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span.get('type') == ContentType.Image:
if span['type'] == ContentType.Image:
para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
for img_block in img_blocks:
if img_block.get('type') == BlockType.ImageCaption:
para_text += merge_para_with_text(img_block)
for block in para_block['blocks']:
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block)
elif para_type == BlockType.Table:
if mode == 'nlp':
continue
elif mode == 'mm':
table_blocks = para_block.get('blocks')
for table_block in table_blocks:
if table_block.get('type') == BlockType.TableBody:
for line in table_block.get('lines'):
for block in para_block['blocks']:
if block['type'] == BlockType.TableBody:
for line in block['lines']:
for span in line['spans']:
if span.get('type') == ContentType.Table:
if span['type'] == ContentType.Table:
para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
for table_block in table_blocks:
if table_block.get('type') == BlockType.TableCaption:
para_text += merge_para_with_text(table_block)
elif table_block.get('type') == BlockType.TableFootnote:
para_text += merge_para_with_text(table_block)
for block in para_block['blocks']:
if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block)
elif block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
if para_text.strip() == '':
continue
......@@ -141,11 +139,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
return page_markdown
def merge_para_with_text(para):
def merge_para_with_text(para_block):
para_text = ''
for line in para['lines']:
for line in para_block['lines']:
for span in line['spans']:
span_type = span.get('type')
span_type = span['type']
content = ''
language = ''
if span_type == ContentType.Text:
......@@ -159,6 +157,7 @@ def merge_para_with_text(para):
content = f"${span['content']}$"
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
if content != '':
if language == 'en': # 英文语境下 content间需要空格分隔
para_text += content + ' '
......
......@@ -132,7 +132,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
pdf_docs = fitz.open("pdf", pdf_bytes)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0], True)
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
......@@ -142,7 +142,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_without_number(i, interequations_list, page, [160, 160, 160], True)
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
# Save the PDF
pdf_docs.save(f"{out_path}/layout.pdf")
......
......@@ -61,7 +61,7 @@ def parse_pdf_by_ocr(pdf_bytes,
'''将所有区块的bbox整理到一起'''
all_bboxes = ocr_prepare_bboxes_for_layout_split(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equation_blocks, page_w, page_h)
interline_equations, page_w, page_h)
'''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h]
......
......@@ -57,8 +57,8 @@ def fix_text_overlap_title_blocks(all_bboxes):
for text_block in text_blocks:
for title_block in title_blocks:
text_block_bbox = text_block[0], text_block[1], text_block[2], text_block[3]
title_block_bbox = title_block[0], title_block[1], title_block[2], title_block[3]
text_block_bbox = text_block[:4]
title_block_bbox = title_block[:4]
if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
all_bboxes.remove(title_block)
......@@ -66,27 +66,37 @@ def fix_text_overlap_title_blocks(all_bboxes):
def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove from ``all_bboxes`` every block that overlaps a discarded block.

    The first four items of each block are passed as its bbox to
    ``calculate_overlap_area_in_bbox1_area_ratio`` together with each
    discarded block's ``'bbox'``; a block is dropped as soon as one of those
    ratios exceeds 0.6.

    Args:
        all_bboxes: list of blocks (sequences whose first four items form the
            bbox). Modified in place.
        discarded_blocks: iterable of dicts that carry a ``'bbox'`` entry.

    Returns:
        The same ``all_bboxes`` list, after removal.
    """
    # Collect first and remove afterwards: removing while iterating would skip
    # elements, and a block matching several discarded blocks must be removed
    # only once.
    need_remove = []
    for block in all_bboxes:
        if block in need_remove:
            continue
        block_bbox = block[:4]
        for discarded_block in discarded_blocks:
            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
                need_remove.append(block)
                break

    for block in need_remove:
        all_bboxes.remove(block)
    return all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
    """Remove the smaller block of every heavily overlapping pair of blocks.

    Each ordered pair of unequal blocks is passed (as the first four items of
    each block) to ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8.
    When that helper returns a box, the first block in ``all_bboxes`` whose
    first four items equal the returned box is scheduled for removal.

    Args:
        all_bboxes: list of blocks (sequences whose first four items form the
            bbox). Modified in place.

    Returns:
        The same ``all_bboxes`` list, after removal.
    """
    # Collect first and remove afterwards so the list is never mutated while
    # it is being iterated, and so each block is removed at most once.
    need_remove = []
    for block1 in all_bboxes:
        for block2 in all_bboxes:
            if block1 == block2:
                continue
            # presumably the helper returns the smaller of the two boxes, or
            # None when the overlap is below the ratio -- verify in the helper
            overlap_box = get_minbox_if_overlap_by_ratio(block1[:4], block2[:4], 0.8)
            if overlap_box is None:
                continue
            bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
            if bbox_to_remove is not None and bbox_to_remove not in need_remove:
                need_remove.append(bbox_to_remove)

    for block in need_remove:
        all_bboxes.remove(block)
    return all_bboxes
......@@ -9,16 +9,20 @@ from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def remove_overlaps_min_spans(spans):
    """Remove the smaller span of every heavily overlapping pair of spans.

    Each ordered pair of unequal spans has its ``'bbox'`` values passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.65. When that helper
    returns a box, the first span whose ``'bbox'`` equals the returned box is
    scheduled for removal.

    Args:
        spans: list of span dicts that carry a ``'bbox'`` entry. Modified in
            place.

    Returns:
        A tuple ``(spans, dropped_spans)``: the same ``spans`` list after
        removal, and the removed spans, each tagged with
        ``DropTag.SPAN_OVERLAP`` under the ``'tag'`` key.
    """
    # Collect first and remove afterwards so the list is never mutated while
    # it is being iterated, and so each span is dropped at most once.
    dropped_spans = []
    for span1 in spans:
        for span2 in spans:
            if span1 == span2:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
            if overlap_box is None:
                continue
            span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
            if span_need_remove is not None and span_need_remove not in dropped_spans:
                dropped_spans.append(span_need_remove)

    for span_need_remove in dropped_spans:
        spans.remove(span_need_remove)
        span_need_remove['tag'] = DropTag.SPAN_OVERLAP
    return spans, dropped_spans
......@@ -29,11 +33,13 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
for span in spans:
for removed_bbox in need_remove_spans_bboxes:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
need_remove_spans.append(span)
break
if span not in need_remove_spans:
need_remove_spans.append(span)
break
for span in need_remove_spans:
spans.remove(span)
if len(need_remove_spans) > 0:
for span in need_remove_spans:
spans.remove(span)
return spans
......
......@@ -74,7 +74,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
debug_mode=is_debug,
)
except Exception as e:
logger.error(f"{method.__name__} error: {e}")
logger.exception(e)
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
......
......@@ -17,4 +17,6 @@ zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_
scikit-learn>=1.0.2
nltk==3.8.1
s3pathlib>=2.1.1
pytest
# NOTE: `subprocess` is part of the Python standard library and must not be listed as a pip requirement.
import os
# Root of the checked-out repository. GitHub Actions provides GITHUB_WORKSPACE;
# fall back to the current directory so that importing this module outside CI
# does not fail with a TypeError when the variable is unset.
_workspace = os.environ.get('GITHUB_WORKSPACE') or os.getcwd()

# Paths used by the CLI tests.
conf = {
    # directory the CLI commands are run from
    "code_path": _workspace,
    # directory searched for the input PDF files
    "pdf_dev_path": _workspace + "/tests/test_cli/pdf_dev",
    # directory whose sub-folders are checked after the run
    # NOTE(review): absolute path tied to one machine -- confirm it matches the runner.
    "pdf_res_path": "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/data",
}
import subprocess
def check_shell(cmd):
    """Run ``cmd`` through the shell and require that it exits with status 0.

    Args:
        cmd: the shell command line to execute.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    # check_call returns the exit status (check_output would return the
    # captured output bytes, which never compare equal to 0).
    res = subprocess.check_call(cmd, shell=True)
    assert res == 0
def count_folders_and_check_contents(directory, expected_count=3):
    """Check that ``directory`` holds the expected number of non-empty folders.

    Args:
        directory: path of the directory to inspect.
        expected_count: number of sub-folders that must be present
            (defaults to 3).

    Raises:
        AssertionError: if a sub-folder is empty or the number of sub-folders
            differs from ``expected_count``.
    """
    folder_count = 0
    # Walk every entry directly inside the directory.
    for item in os.listdir(directory):
        folder_path = os.path.join(directory, item)
        # Only folders are counted; plain files are ignored.
        if not os.path.isdir(folder_path):
            continue
        folder_count += 1
        # os.listdir never returns None, so test for an empty listing instead.
        assert os.listdir(folder_path), f"folder is empty: {folder_path}"
    assert folder_count == expected_count, (
        f"expected {expected_count} folders in {directory}, found {folder_count}"
    )
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
import pytest
import os
from conf import conf
import subprocess
from lib import common
import logging
# Paths shared by the tests below, read from the `conf` dict of the conf module.
pdf_res_path = conf.conf["pdf_res_path"]
code_path = conf.conf["code_path"]
pdf_dev_path = conf.conf["pdf_dev_path"]
class TestCli:
    """Tests that drive the magicpdf command-line entry point."""

    def test_pdf_specify_dir(self):
        """
        Input a pdf together with the model results of a specified directory.
        """
        # Run `pdf-command` once for every *.pdf file found under pdf_dev_path.
        template = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}'
        cmd = template % (code_path, pdf_dev_path)
        logging.info(cmd)
        common.check_shell(cmd)
        # Then verify the folders under pdf_res_path.
        common.count_folders_and_check_contents(pdf_res_path)

    def test_pdf_specify_jsonl(self):
        """
        Input a jsonl file.
        """
        # TODO: unfinished -- the command is only a prefix and is never executed.
        cmd = "cd %s && export PYTHONPATH=. && python "
# Allow running this file directly as a script; delegates to pytest.
if __name__ == "__main__":
    pytest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment