Unverified Commit 03469909 authored by icecraft's avatar icecraft Committed by GitHub

Feat/support footnote in figure (#532)

* feat: support figure footnote

* feat: using the relative position to combine footnote, table, image

* feat: add the readme of projects

* fix: code spell in unittest

---------
Co-authored-by: 's avataricecraft <xurui1@pjlab.org.cn>
parent 4331b837
...@@ -3,6 +3,7 @@ repos: ...@@ -3,6 +3,7 @@ repos:
rev: 5.0.4 rev: 5.0.4
hooks: hooks:
- id: flake8 - id: flake8
args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
rev: 5.11.5 rev: 5.11.5
hooks: hooks:
...@@ -11,6 +12,7 @@ repos: ...@@ -11,6 +12,7 @@ repos:
rev: v0.32.0 rev: v0.32.0
hooks: hooks:
- id: yapf - id: yapf
args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.2.1 rev: v2.2.1
hooks: hooks:
...@@ -41,4 +43,4 @@ repos: ...@@ -41,4 +43,4 @@ repos:
rev: v1.3.1 rev: v1.3.1
hooks: hooks:
- id: docformatter - id: docformatter
args: ["--in-place", "--wrap-descriptions", "79"] args: ["--in-place", "--wrap-descriptions", "119"]
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -71,6 +71,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -71,6 +71,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
tables_list, tables_body_list = [], [] tables_list, tables_body_list = [], []
tables_caption_list, tables_footnote_list = [], [] tables_caption_list, tables_footnote_list = [], []
imgs_list, imgs_body_list, imgs_caption_list = [], [], [] imgs_list, imgs_body_list, imgs_caption_list = [], [], []
imgs_footnote_list = []
titles_list = [] titles_list = []
texts_list = [] texts_list = []
interequations_list = [] interequations_list = []
...@@ -78,7 +79,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -78,7 +79,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
page_layout_list = [] page_layout_list = []
page_dropped_list = [] page_dropped_list = []
tables, tables_body, tables_caption, tables_footnote = [], [], [], [] tables, tables_body, tables_caption, tables_footnote = [], [], [], []
imgs, imgs_body, imgs_caption = [], [], [] imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
titles = [] titles = []
texts = [] texts = []
interequations = [] interequations = []
...@@ -108,6 +109,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -108,6 +109,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
imgs_body.append(bbox) imgs_body.append(bbox)
elif nested_block['type'] == BlockType.ImageCaption: elif nested_block['type'] == BlockType.ImageCaption:
imgs_caption.append(bbox) imgs_caption.append(bbox)
elif nested_block['type'] == BlockType.ImageFootnote:
imgs_footnote.append(bbox)
elif block['type'] == BlockType.Title: elif block['type'] == BlockType.Title:
titles.append(bbox) titles.append(bbox)
elif block['type'] == BlockType.Text: elif block['type'] == BlockType.Text:
...@@ -121,6 +124,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -121,6 +124,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
imgs_list.append(imgs) imgs_list.append(imgs)
imgs_body_list.append(imgs_body) imgs_body_list.append(imgs_body)
imgs_caption_list.append(imgs_caption) imgs_caption_list.append(imgs_caption)
imgs_footnote_list.append(imgs_footnote)
titles_list.append(titles) titles_list.append(titles)
texts_list.append(texts) texts_list.append(texts)
interequations_list.append(interequations) interequations_list.append(interequations)
...@@ -142,6 +146,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -142,6 +146,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True) draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
True) True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True),
draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True) draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True) draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
...@@ -241,7 +247,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -241,7 +247,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
dropped_bbox_list = [] dropped_bbox_list = []
tables_body_list, tables_caption_list, tables_footnote_list = [], [], [] tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
imgs_body_list, imgs_caption_list = [], [] imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
titles_list = [] titles_list = []
texts_list = [] texts_list = []
interequations_list = [] interequations_list = []
...@@ -250,7 +256,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -250,7 +256,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
for i in range(len(model_list)): for i in range(len(model_list)):
page_dropped_list = [] page_dropped_list = []
tables_body, tables_caption, tables_footnote = [], [], [] tables_body, tables_caption, tables_footnote = [], [], []
imgs_body, imgs_caption = [], [] imgs_body, imgs_caption, imgs_footnote = [], [], []
titles = [] titles = []
texts = [] texts = []
interequations = [] interequations = []
...@@ -277,6 +283,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -277,6 +283,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
interequations.append(bbox) interequations.append(bbox)
elif layout_det['category_id'] == CategoryId.Abandon: elif layout_det['category_id'] == CategoryId.Abandon:
page_dropped_list.append(bbox) page_dropped_list.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageFootnote:
imgs_footnote.append(bbox)
tables_body_list.append(tables_body) tables_body_list.append(tables_body)
tables_caption_list.append(tables_caption) tables_caption_list.append(tables_caption)
...@@ -287,6 +295,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -287,6 +295,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
texts_list.append(texts) texts_list.append(texts)
interequations_list.append(interequations) interequations_list.append(interequations)
dropped_bbox_list.append(page_dropped_list) dropped_bbox_list.append(page_dropped_list)
imgs_footnote_list.append(imgs_footnote)
for i, page in enumerate(pdf_docs): for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
...@@ -299,6 +308,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -299,6 +308,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True) draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
True) True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True)
draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True) draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True) draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True) draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
......
class ContentType: class ContentType:
Image = "image" Image = 'image'
Table = "table" Table = 'table'
Text = "text" Text = 'text'
InlineEquation = "inline_equation" InlineEquation = 'inline_equation'
InterlineEquation = "interline_equation" InterlineEquation = 'interline_equation'
class BlockType: class BlockType:
Image = "image" Image = 'image'
ImageBody = "image_body" ImageBody = 'image_body'
ImageCaption = "image_caption" ImageCaption = 'image_caption'
Table = "table" ImageFootnote = 'image_footnote'
TableBody = "table_body" Table = 'table'
TableCaption = "table_caption" TableBody = 'table_body'
TableFootnote = "table_footnote" TableCaption = 'table_caption'
Text = "text" TableFootnote = 'table_footnote'
Title = "title" Text = 'text'
InterlineEquation = "interline_equation" Title = 'title'
Footnote = "footnote" InterlineEquation = 'interline_equation'
Discarded = "discarded" Footnote = 'footnote'
Discarded = 'discarded'
class CategoryId: class CategoryId:
...@@ -33,3 +35,4 @@ class CategoryId: ...@@ -33,3 +35,4 @@ class CategoryId:
InlineEquation = 13 InlineEquation = 13
InterlineEquation_YOLO = 14 InterlineEquation_YOLO = 14
OcrText = 15 OcrText = 15
ImageFootnote = 101
This diff is collapsed.
This diff is collapsed.
# 欢迎来到 MinerU 项目列表
## 项目列表
- [llama_index_rag](./llama_index_rag/README.md): 基于 llama_index 构建轻量级 RAG 系统
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment