Unverified Commit 82dd7ac5 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #782 from icecraft/feat/data_api

Feat/data api
parents e36627be c200effc
...@@ -43,3 +43,8 @@ projects/web/node_modules
projects/web/dist
projects/web_demo/web_demo/static/
+cli_debug/
+debug_utils/
+
+# sphinx docs
+_build/
...@@ -3,7 +3,7 @@ repos:
    rev: 5.0.4
    hooks:
      - id: flake8
-       args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
+       args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
...@@ -12,11 +12,12 @@ repos:
    rev: v0.32.0
    hooks:
      - id: yapf
-       args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
+       args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.1
    hooks:
      - id: codespell
+       args: ['--skip', '*.json']
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
...
Data Api
------------------

.. toctree::
   :maxdepth: 2

   api/dataset.rst
   api/data_reader_writer.rst
   api/read_api.rst
Data Reader Writer
--------------------

.. autoclass:: magic_pdf.data.data_reader_writer.DataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
   :members:
   :inherited-members:
.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
   :members:
   :inherited-members:

Dataset Api
------------------

.. autoclass:: magic_pdf.data.dataset.PageableData
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.dataset.Dataset
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.dataset.ImageDataset
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.dataset.PymuDocDataset
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.dataset.Doc
   :members:
   :inherited-members:

read_api Api
------------------

.. automodule:: magic_pdf.data.read_api
   :members:
   :inherited-members:
...@@ -24,3 +24,15 @@ Welcome to the MinerU Documentation
   <a class="github-button" href="https://github.com/opendatalab/MinerU/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
   <a class="github-button" href="https://github.com/opendatalab/MinerU/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
   </p>
+
+API Reference
+-------------
+
+If you are looking for information on a specific function, class or
+method, this part of the documentation is for you.
+
+.. toctree::
+   :maxdepth: 2
+
+   api
+boto3>=1.28.43
+loguru>=0.6.0
myst-parser
+Pillow==8.4.0
+pydantic>=2.7.2,<2.8.0
+PyMuPDF>=1.24.9
sphinx
sphinx-argparse
sphinx-book-theme
...
import enum


class SupportedPdfParseMethod(enum.Enum):
    OCR = 'ocr'
    TXT = 'txt'


class FileNotExisted(Exception):

    def __init__(self, path):
        self.path = path

    def __str__(self):
        return f'File {self.path} does not exist.'


class InvalidConfig(Exception):

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return f'Invalid config: {self.msg}'


class InvalidParams(Exception):

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return f'Invalid params: {self.msg}'


class EmptyData(Exception):

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return f'Empty data: {self.msg}'
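A minimal sketch (not part of this diff) of how the new enum and exceptions behave:

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.exceptions import InvalidParams

print(SupportedPdfParseMethod.OCR.value)  # -> 'ocr'
try:
    raise InvalidParams('s3_client is required when s3_path is provided')
except InvalidParams as e:
    print(e)  # -> Invalid params: s3_client is required when s3_path is provided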
from magic_pdf.data.data_reader_writer.filebase import \
    FileBasedDataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.filebase import \
    FileBasedDataWriter  # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
    MultiBucketS3DataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
    MultiBucketS3DataWriter  # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataWriter  # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataWriter  # noqa: F401
\ No newline at end of file
from abc import ABC, abstractmethod


class DataReader(ABC):

    def read(self, path: str) -> bytes:
        """Read the file.

        Args:
            path (str): file path to read

        Returns:
            bytes: the content of the file
        """
        return self.read_at(path)

    @abstractmethod
    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read the file at offset and limit.

        Args:
            path (str): the file path
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes to read. Defaults to -1.

        Returns:
            bytes: the content of the file
        """
        pass


class DataWriter(ABC):

    @abstractmethod
    def write(self, path: str, data: bytes) -> None:
        """Write the data to the file.

        Args:
            path (str): the target file to write to
            data (bytes): the data to write
        """
        pass

    def write_string(self, path: str, data: str) -> None:
        """Write the data to a file; the data is encoded to bytes first.

        Args:
            path (str): the target file to write to
            data (str): the data to write
        """
        self.write(path, data.encode())
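A minimal sketch (hypothetical in-memory subclasses, not part of this diff) showing the division of labor: read() delegates to the abstract read_at(), and write_string() encodes and delegates to the abstract write():

from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter

class MemoryReader(DataReader):
    def __init__(self, blobs: dict):
        self._blobs = blobs

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        data = self._blobs[path][offset:]
        return data if limit == -1 else data[:limit]

class MemoryWriter(DataWriter):
    def __init__(self):
        self.blobs = {}

    def write(self, path: str, data: bytes) -> None:
        self.blobs[path] = data

reader = MemoryReader({'a.txt': b'hello world'})
assert reader.read('a.txt') == b'hello world'     # read() -> read_at()
assert reader.read_at('a.txt', 6, 5) == b'world'  # offset/limit honored
writer = MemoryWriter()
writer.write_string('b.txt', 'hi')                # write_string() -> write()
assert writer.blobs['b.txt'] == b'hi'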
import os

from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter


class FileBasedDataReader(DataReader):

    def __init__(self, parent_dir: str = ''):
        """Initialized with parent_dir.

        Args:
            parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read at offset and limit.

        Args:
            path (str): the file path; a relative path is joined with parent_dir.
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes to read. Defaults to -1.

        Returns:
            bytes: the content of the file
        """
        fn_path = path
        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
            fn_path = os.path.join(self._parent_dir, path)

        with open(fn_path, 'rb') as f:
            f.seek(offset)
            if limit == -1:
                return f.read()
            else:
                return f.read(limit)


class FileBasedDataWriter(DataWriter):

    def __init__(self, parent_dir: str = '') -> None:
        """Initialized with parent_dir.

        Args:
            parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def write(self, path: str, data: bytes) -> None:
        """Write file with data.

        Args:
            path (str): the file path; a relative path is joined with parent_dir.
            data (bytes): the data to write
        """
        fn_path = path
        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
            fn_path = os.path.join(self._parent_dir, path)

        with open(fn_path, 'wb') as f:
            f.write(data)
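A usage sketch (not part of this diff; the /tmp paths are illustrative): relative paths are joined with parent_dir, absolute paths are used as-is:

from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter

writer = FileBasedDataWriter('/tmp')
writer.write_string('demo.txt', 'hello')          # writes /tmp/demo.txt
reader = FileBasedDataReader('/tmp')
assert reader.read('demo.txt') == b'hello'
assert reader.read_at('demo.txt', offset=1, limit=3) == b'ell'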
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from magic_pdf.data.io.s3 import S3Reader, S3Writer
from magic_pdf.data.schemas import S3Config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)


class MultiS3Mixin:

    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
        """Initialized with multiple s3 configs.

        Args:
            default_bucket (str): the default bucket name used for relative paths
            s3_configs (list[S3Config]): list of s3 configs; bucket_name must be unique in the list

        Raises:
            InvalidConfig: default bucket config not in s3_configs
            InvalidConfig: bucket name not unique in s3_configs
            InvalidConfig: default bucket must be provided
        """
        if len(default_bucket) == 0:
            raise InvalidConfig('default_bucket must be provided')

        found_default_bucket_config = False
        for conf in s3_configs:
            if conf.bucket_name == default_bucket:
                found_default_bucket_config = True
                break

        if not found_default_bucket_config:
            raise InvalidConfig(
                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
            )

        uniq_bucket = set([conf.bucket_name for conf in s3_configs])
        if len(uniq_bucket) != len(s3_configs):
            raise InvalidConfig(
                f'the bucket_name in s3_configs: {s3_configs} must be unique'
            )

        self.default_bucket = default_bucket
        self.s3_configs = s3_configs
        self._s3_clients_h: dict = {}


class MultiBucketS3DataReader(DataReader, MultiS3Mixin):

    def read(self, path: str) -> bytes:
        """Read the path from s3, selecting a different bucket client for each
        request based on the path; range reads are also supported.

        Args:
            path (str): the s3 path of the file, in the format s3://bucket_name/path?offset,limit,
                for example: s3://bucket_name/path?0,100

        Returns:
            bytes: the content of the s3 file
        """
        may_range_params = parse_s3_range_params(path)
        if may_range_params is None or 2 != len(may_range_params):
            byte_start, byte_len = 0, -1
        else:
            byte_start, byte_len = int(may_range_params[0]), int(may_range_params[1])
        path = remove_non_official_s3_args(path)
        return self.read_at(path, byte_start, byte_len)

    def __get_s3_client(self, bucket_name: str):
        if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]):
            raise InvalidParams(
                f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}'
            )
        if bucket_name not in self._s3_clients_h:
            conf = next(
                filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs)
            )
            self._s3_clients_h[bucket_name] = S3Reader(
                bucket_name,
                conf.access_key,
                conf.secret_key,
                conf.endpoint_url,
                conf.addressing_style,
            )
        return self._s3_clients_h[bucket_name]

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read the file with offset and limit, selecting a different bucket
        client for each request based on the path.

        Args:
            path (str): the file path
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes to read. Defaults to -1, which means read to the end.

        Returns:
            bytes: the file content
        """
        if path.startswith('s3://'):
            bucket_name, path = parse_s3path(path)
            s3_reader = self.__get_s3_client(bucket_name)
        else:
            s3_reader = self.__get_s3_client(self.default_bucket)
        return s3_reader.read_at(path, offset, limit)


class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):

    def __get_s3_client(self, bucket_name: str):
        if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]):
            raise InvalidParams(
                f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}'
            )
        if bucket_name not in self._s3_clients_h:
            conf = next(
                filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs)
            )
            self._s3_clients_h[bucket_name] = S3Writer(
                bucket_name,
                conf.access_key,
                conf.secret_key,
                conf.endpoint_url,
                conf.addressing_style,
            )
        return self._s3_clients_h[bucket_name]

    def write(self, path: str, data: bytes) -> None:
        """Write data to a file, selecting a different bucket client for each
        request based on the path.

        Args:
            path (str): the path of the file; a relative path is resolved against the default bucket
            data (bytes): the data to write
        """
        if path.startswith('s3://'):
            bucket_name, path = parse_s3path(path)
            s3_writer = self.__get_s3_client(bucket_name)
        else:
            s3_writer = self.__get_s3_client(self.default_bucket)
        return s3_writer.write(path, data)
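A usage sketch (not part of this diff; bucket names, keys and the endpoint are placeholders, and the s3 routing assumes the path_utils helpers behave as their names suggest):

from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config

configs = [
    S3Config(bucket_name='bucket-a', access_key='AK', secret_key='SK',
             endpoint_url='https://s3.example.com'),
    S3Config(bucket_name='bucket-b', access_key='AK2', secret_key='SK2',
             endpoint_url='https://s3.example.com'),
]
reader = MultiBucketS3DataReader('bucket-a', configs)
data = reader.read('path/to/file.pdf')               # relative path -> default bucket-a
other = reader.read('s3://bucket-b/file.pdf')        # routed to bucket-b's client
head = reader.read('s3://bucket-a/file.pdf?0,100')   # range read: offset 0, 100 bytes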
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import (
    MultiBucketS3DataReader, MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config


class S3DataReader(MultiBucketS3DataReader):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options are 'path' and 'virtual';
                refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        super().__init__(
            bucket,
            [
                S3Config(
                    bucket_name=bucket,
                    access_key=ak,
                    secret_key=sk,
                    endpoint_url=endpoint_url,
                    addressing_style=addressing_style,
                )
            ],
        )


class S3DataWriter(MultiBucketS3DataWriter):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 writer client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options are 'path' and 'virtual';
                refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        super().__init__(
            bucket,
            [
                S3Config(
                    bucket_name=bucket,
                    access_key=ak,
                    secret_key=sk,
                    endpoint_url=endpoint_url,
                    addressing_style=addressing_style,
                )
            ],
        )
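The single-bucket wrappers reduce to a one-element multi-bucket setup; a sketch with placeholder values (not part of this diff):

from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter

reader = S3DataReader('my-bucket', 'AK', 'SK', 'https://s3.example.com')
writer = S3DataWriter('my-bucket', 'AK', 'SK', 'https://s3.example.com')
writer.write_string('notes/hello.txt', 'hi')   # inherited from DataWriter
assert reader.read('notes/hello.txt') == b'hi'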
from abc import ABC, abstractmethod
from typing import Iterator

import fitz

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.schemas import PageInfo
from magic_pdf.data.utils import fitz_doc_to_image


class PageableData(ABC):

    @abstractmethod
    def get_image(self) -> dict:
        """Transform data to image."""
        pass

    @abstractmethod
    def get_doc(self) -> fitz.Page:
        """Get the pymudoc page."""
        pass

    @abstractmethod
    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: the page info of this page
        """
        pass


class Dataset(ABC):

    @abstractmethod
    def __len__(self) -> int:
        """The length of the dataset."""
        pass

    @abstractmethod
    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page data."""
        pass

    @abstractmethod
    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The methods that this dataset supports.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods; valid methods are OCR and TXT
        """
        pass

    @abstractmethod
    def data_bits(self) -> bytes:
        """The bits used to create this dataset."""
        pass

    @abstractmethod
    def get_page(self, page_id: int) -> PageableData:
        """Get the page indexed by page_id.

        Args:
            page_id (int): the index of the page

        Returns:
            PageableData: the page doc object
        """
        pass


class PymuDocDataset(Dataset):

    def __init__(self, bits: bytes):
        """Initialize the dataset, which wraps the pymudoc documents.

        Args:
            bits (bytes): the bytes of the pdf
        """
        self._records = [Doc(v) for v in fitz.open('pdf', bits)]
        self._data_bits = bits
        self._raw_data = bits

    def __len__(self) -> int:
        """The page number of the pdf."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page doc object."""
        return iter(self._records)

    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The methods supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]


class ImageDataset(Dataset):

    def __init__(self, bits: bytes):
        """Initialize the dataset, which wraps the pymudoc documents.

        Args:
            bits (bytes): the bytes of the image, which is converted to pdf first, then to pymudoc.
        """
        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
        self._records = [Doc(v) for v in fitz.open('pdf', pdf_bytes)]
        self._raw_data = bits
        self._data_bits = pdf_bytes

    def __len__(self) -> int:
        """The length of the dataset."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page object."""
        return iter(self._records)

    def supported_methods(self):
        """The methods supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]


class Doc(PageableData):
    """Initialized with a pymudoc page object."""

    def __init__(self, doc: fitz.Page):
        self._doc = doc

    def get_image(self):
        """Return the image info.

        Returns:
            dict: {
                img: np.ndarray,
                width: int,
                height: int
            }
        """
        return fitz_doc_to_image(self._doc)

    def get_doc(self) -> fitz.Page:
        """Get the pymudoc object.

        Returns:
            fitz.Page: the pymudoc object
        """
        return self._doc

    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: the page info of this page
        """
        page_w = self._doc.rect.width
        page_h = self._doc.rect.height
        return PageInfo(w=page_w, h=page_h)

    def __getattr__(self, name):
        if hasattr(self._doc, name):
            return getattr(self._doc, name)
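A usage sketch (not part of this diff; example.pdf is a placeholder) of the dataset wrappers:

from magic_pdf.data.dataset import PymuDocDataset

with open('example.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

print(len(ds))                 # page count
page = ds.get_page(0)
info = page.get_page_info()
print(info.w, info.h)          # page size as PageInfo
img = page.get_image()         # {'img': np.ndarray, 'width': int, 'height': int}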
from abc import ABC, abstractmethod


class IOReader(ABC):

    @abstractmethod
    def read(self, path: str) -> bytes:
        """Read the file.

        Args:
            path (str): file path to read

        Returns:
            bytes: the content of the file
        """
        pass

    @abstractmethod
    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read at offset and limit.

        Args:
            path (str): the file path
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes to read. Defaults to -1.

        Returns:
            bytes: the content of the file
        """
        pass


class IOWriter:

    @abstractmethod
    def write(self, path: str, data: bytes) -> None:
        """Write file with data.

        Args:
            path (str): the file path
            data (bytes): the data to write
        """
        pass
import io

import requests

from magic_pdf.data.io.base import IOReader, IOWriter


class HttpReader(IOReader):

    def read(self, url: str) -> bytes:
        """Read the file.

        Args:
            url (str): file url to read

        Returns:
            bytes: the content of the file
        """
        return requests.get(url).content

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Not Implemented."""
        raise NotImplementedError


class HttpWriter(IOWriter):

    def write(self, url: str, data: bytes) -> None:
        """Write file with data.

        Args:
            url (str): the target url to post to
            data (bytes): the data to write
        """
        files = {'file': io.BytesIO(data)}
        response = requests.post(url, files=files)
        assert 300 > response.status_code and response.status_code > 199
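A usage sketch (not part of this diff; URLs are placeholders). Note that HttpReader.read_at is intentionally unimplemented:

from magic_pdf.data.io.http import HttpReader, HttpWriter

content = HttpReader().read('https://example.com/sample.pdf')
HttpWriter().write('https://example.com/upload', content)  # POSTs the bytes as multipart field 'file'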
import boto3
from botocore.config import Config

from magic_pdf.data.io.base import IOReader, IOWriter


class S3Reader(IOReader):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options are 'path' and 'virtual';
                refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def read(self, key: str) -> bytes:
        """Read the file.

        Args:
            key (str): the s3 object key

        Returns:
            bytes: the content of the file
        """
        return self.read_at(key)

    def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read at offset and limit.

        Args:
            key (str): the s3 object key
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes to read. Defaults to -1.

        Returns:
            bytes: the content of the file
        """
        if limit > -1:
            range_header = f'bytes={offset}-{offset+limit-1}'
            res = self._s3_client.get_object(
                Bucket=self._bucket, Key=key, Range=range_header
            )
        else:
            res = self._s3_client.get_object(
                Bucket=self._bucket, Key=key, Range=f'bytes={offset}-'
            )
        return res['Body'].read()


class S3Writer(IOWriter):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 writer client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options are 'path' and 'virtual';
                refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def write(self, key: str, data: bytes):
        """Write file with data.

        Args:
            key (str): the s3 object key
            data (bytes): the data to write
        """
        self._s3_client.put_object(Bucket=self._bucket, Key=key, Body=data)
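A sketch (not part of this diff; values are placeholders) of how read_at maps offset/limit onto an HTTP Range header:

from magic_pdf.data.io.s3 import S3Reader

reader = S3Reader('my-bucket', 'AK', 'SK', 'https://s3.example.com')
head = reader.read_at('docs/file.pdf', offset=0, limit=1024)  # Range: bytes=0-1023
tail = reader.read_at('docs/file.pdf', offset=2048)           # Range: bytes=2048-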
import json
import os
from pathlib import Path

from magic_pdf.config.exceptions import EmptyData, InvalidParams
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                               MultiBucketS3DataReader)
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset


def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Read a jsonl file and return a list of PymuDocDataset.

    Args:
        s3_path_or_local (str): local file or s3 path
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that supports multiple buckets. Defaults to None.

    Raises:
        InvalidParams: if s3_path_or_local is an s3 path but s3_client is not provided.
        EmptyData: if a line of the jsonl file provides no pdf file location.
        InvalidParams: if a file location is an s3 path but s3_client is not provided.

    Returns:
        list[PymuDocDataset]: each line in the jsonl file is converted to a PymuDocDataset
    """
    bits_arr = []
    if s3_path_or_local.startswith('s3://'):
        if s3_client is None:
            raise InvalidParams('s3_client is required when s3_path is provided')
        jsonl_bits = s3_client.read(s3_path_or_local)
    else:
        jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)

    jsonl_d = [
        json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
    ]
    for d in jsonl_d[:5]:  # note: only the first 5 records are processed
        pdf_path = d.get('file_location', '') or d.get('path', '')
        if len(pdf_path) == 0:
            raise EmptyData('pdf file location is empty')
        if pdf_path.startswith('s3://'):
            if s3_client is None:
                raise InvalidParams('s3_client is required when s3_path is provided')
            bits_arr.append(s3_client.read(pdf_path))
        else:
            bits_arr.append(FileBasedDataReader('').read(pdf_path))
    return [PymuDocDataset(bits) for bits in bits_arr]


def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Read pdf from a path or directory.

    Args:
        path (str): pdf file path or a directory that contains pdf files

    Returns:
        list[PymuDocDataset]: each pdf file is converted to a PymuDocDataset
    """
    if os.path.isdir(path):
        reader = FileBasedDataReader(path)
        return [
            PymuDocDataset(reader.read(doc_path.name))
            for doc_path in Path(path).glob('*.pdf')
        ]
    else:
        reader = FileBasedDataReader()
        bits = reader.read(path)
        return [PymuDocDataset(bits)]


def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
    """Read images from a path or directory.

    Args:
        path (str): image file path or a directory that contains image files
        suffixes (list[str]): file suffixes used to filter the files. Example: ['jpg', 'png']

    Returns:
        list[ImageDataset]: each image file is converted to an ImageDataset
    """
    if os.path.isdir(path):
        imgs_bits = []
        s_suffixes = set(suffixes)
        reader = FileBasedDataReader(path)
        for root, _, files in os.walk(path):
            for file in files:
                suffix = file.split('.')
                if suffix[-1] in s_suffixes:
                    imgs_bits.append(reader.read(file))
        return [ImageDataset(bits) for bits in imgs_bits]
    else:
        reader = FileBasedDataReader()
        bits = reader.read(path)
        return [ImageDataset(bits)]
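A usage sketch (not part of this diff; paths are placeholders) for the three readers:

from magic_pdf.data.read_api import read_jsonl, read_local_images, read_local_pdfs

pdf_datasets = read_local_pdfs('path/to/pdf_dir')            # one dataset per *.pdf
img_datasets = read_local_images('path/to/img_dir', suffixes=['jpg', 'png'])
jsonl_datasets = read_jsonl('path/to/index.jsonl')           # local file, no s3_client needed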
from pydantic import BaseModel, Field


class S3Config(BaseModel):
    bucket_name: str = Field(description='s3 bucket name', min_length=1)
    access_key: str = Field(description='s3 access key', min_length=1)
    secret_key: str = Field(description='s3 secret key', min_length=1)
    endpoint_url: str = Field(description='s3 endpoint url', min_length=1)
    addressing_style: str = Field(description='s3 addressing style', default='auto', min_length=1)


class PageInfo(BaseModel):
    w: float = Field(description='the width of page')
    h: float = Field(description='the height of page')
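A construction sketch (not part of this diff; values are placeholders); pydantic enforces min_length=1 on the string fields:

from magic_pdf.data.schemas import PageInfo, S3Config

cfg = S3Config(bucket_name='my-bucket', access_key='AK', secret_key='SK',
               endpoint_url='https://s3.example.com')  # addressing_style defaults to 'auto'
info = PageInfo(w=595.0, h=842.0)  # A4 page size in points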
import fitz
import numpy as np

from magic_pdf.utils.annotations import ImportPIL


@ImportPIL
def fitz_doc_to_image(doc, dpi=200) -> dict:
    """Convert a fitz page to an image, then convert the image to a numpy array.

    Args:
        doc (_type_): pymudoc page
        dpi (int, optional): the rendering resolution (dpi). Defaults to 200.

    Returns:
        dict: {'img': numpy array, 'width': width, 'height': height}
    """
    from PIL import Image
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pm = doc.get_pixmap(matrix=mat, alpha=False)

    # If the width or height exceeds 9000 after scaling, do not scale further.
    if pm.width > 9000 or pm.height > 9000:
        pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
    img = np.array(img)
    img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
    return img_dict
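A usage sketch (not part of this diff; example.pdf is a placeholder): render page 0 at the default 200 dpi:

import fitz

from magic_pdf.data.utils import fitz_doc_to_image

doc = fitz.open('example.pdf')
page_img = fitz_doc_to_image(doc[0])
print(page_img['width'], page_img['height'], page_img['img'].shape)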
""" """根据bucket的名字返回对应的s3 AK, SK,endpoint三元组."""
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
"""
import json import json
import os import os
...@@ -12,36 +9,36 @@ from magic_pdf.libs.Constants import MODEL_NAME ...@@ -12,36 +9,36 @@ from magic_pdf.libs.Constants import MODEL_NAME
from magic_pdf.libs.commons import parse_bucket_key from magic_pdf.libs.commons import parse_bucket_key
# 定义配置文件名常量 # 定义配置文件名常量
CONFIG_FILE_NAME = "magic-pdf.json" CONFIG_FILE_NAME = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'magic-pdf.json')
def read_config(): def read_config():
home_dir = os.path.expanduser("~") if os.path.isabs(CONFIG_FILE_NAME):
config_file = CONFIG_FILE_NAME
else:
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, CONFIG_FILE_NAME) config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
if not os.path.exists(config_file): if not os.path.exists(config_file):
raise FileNotFoundError(f"{config_file} not found") raise FileNotFoundError(f'{config_file} not found')
with open(config_file, "r", encoding="utf-8") as f: with open(config_file, 'r', encoding='utf-8') as f:
config = json.load(f) config = json.load(f)
return config return config
def get_s3_config(bucket_name: str): def get_s3_config(bucket_name: str):
""" """~/magic-pdf.json 读出来."""
~/magic-pdf.json 读出来
"""
config = read_config() config = read_config()
bucket_info = config.get("bucket_info") bucket_info = config.get('bucket_info')
if bucket_name not in bucket_info: if bucket_name not in bucket_info:
access_key, secret_key, storage_endpoint = bucket_info["[default]"] access_key, secret_key, storage_endpoint = bucket_info['[default]']
else: else:
access_key, secret_key, storage_endpoint = bucket_info[bucket_name] access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
if access_key is None or secret_key is None or storage_endpoint is None: if access_key is None or secret_key is None or storage_endpoint is None:
raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}") raise Exception(f'ak, sk or endpoint not found in {CONFIG_FILE_NAME}')
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}") # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
...@@ -50,7 +47,7 @@ def get_s3_config(bucket_name: str): ...@@ -50,7 +47,7 @@ def get_s3_config(bucket_name: str):
def get_s3_config_dict(path: str): def get_s3_config_dict(path: str):
access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path)) access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint} return {'ak': access_key, 'sk': secret_key, 'endpoint': storage_endpoint}
def get_bucket_name(path): def get_bucket_name(path):
...@@ -60,20 +57,20 @@ def get_bucket_name(path): ...@@ -60,20 +57,20 @@ def get_bucket_name(path):
def get_local_models_dir(): def get_local_models_dir():
config = read_config() config = read_config()
models_dir = config.get("models-dir") models_dir = config.get('models-dir')
if models_dir is None: if models_dir is None:
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default") logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
return "/tmp/models" return '/tmp/models'
else: else:
return models_dir return models_dir
def get_local_layoutreader_model_dir(): def get_local_layoutreader_model_dir():
config = read_config() config = read_config()
layoutreader_model_dir = config.get("layoutreader-model-dir") layoutreader_model_dir = config.get('layoutreader-model-dir')
if layoutreader_model_dir is None or not os.path.exists(layoutreader_model_dir): if layoutreader_model_dir is None or not os.path.exists(layoutreader_model_dir):
home_dir = os.path.expanduser("~") home_dir = os.path.expanduser('~')
layoutreader_at_modelscope_dir_path = os.path.join(home_dir, ".cache/modelscope/hub/ppaanngggg/layoutreader") layoutreader_at_modelscope_dir_path = os.path.join(home_dir, '.cache/modelscope/hub/ppaanngggg/layoutreader')
logger.warning(f"'layoutreader-model-dir' not exists, use {layoutreader_at_modelscope_dir_path} as default") logger.warning(f"'layoutreader-model-dir' not exists, use {layoutreader_at_modelscope_dir_path} as default")
return layoutreader_at_modelscope_dir_path return layoutreader_at_modelscope_dir_path
else: else:
...@@ -82,17 +79,17 @@ def get_local_layoutreader_model_dir(): ...@@ -82,17 +79,17 @@ def get_local_layoutreader_model_dir():
def get_device(): def get_device():
config = read_config() config = read_config()
device = config.get("device-mode") device = config.get('device-mode')
if device is None: if device is None:
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default") logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
return "cpu" return 'cpu'
else: else:
return device return device
def get_table_recog_config(): def get_table_recog_config():
config = read_config() config = read_config()
table_config = config.get("table-config") table_config = config.get('table-config')
if table_config is None: if table_config is None:
logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default") logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
return json.loads(f'{{"model": "{MODEL_NAME.TABLE_MASTER}","enable": false, "max_time": 400}}') return json.loads(f'{{"model": "{MODEL_NAME.TABLE_MASTER}","enable": false, "max_time": 400}}')
......
+from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
...@@ -62,7 +63,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox
            overlay=True,
        )  # Draw the rectangle
        page.insert_text(
-            (x1+2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
+            (x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
        )  # Insert the index in the top left corner of the rectangle
...@@ -86,7 +87,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        texts = []
        interequations = []
        lists = []
-        indexs = []
+        indices = []
        for dropped_bbox in page['discarded_blocks']:
            page_dropped_list.append(dropped_bbox['bbox'])
...@@ -122,7 +123,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
            elif block['type'] == BlockType.List:
                lists.append(bbox)
            elif block['type'] == BlockType.Index:
-                indexs.append(bbox)
+                indices.append(bbox)
        tables_list.append(tables)
        tables_body_list.append(tables_body)
...@@ -136,7 +137,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        texts_list.append(texts)
        interequations_list.append(interequations)
        lists_list.append(lists)
-        indexs_list.append(indexs)
+        indexs_list.append(indices)
    layout_bbox_list = []
...@@ -151,30 +152,24 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    for i, page in enumerate(pdf_docs):
-        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
-                                 True)
-        draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
-                                 True)  # color !
-        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
-                                 True)
-        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
-                                 True)
-        draw_bbox_without_number(i, tables_footnote_list, page,
-                                 [229, 255, 204], True)
+        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
+        draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
+        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
+        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
+        draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
        draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
-        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
-                                 True)
-        draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102],
-                                 True),
+        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
+        draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
-        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
-                                 True)
+        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
        draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
        draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
-        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False)
+        draw_bbox_with_number(
+            i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
+        )
    # Save the PDF
    pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
...@@ -275,7 +270,7 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
    texts_list = []
    interequations_list = []
    pdf_docs = fitz.open('pdf', pdf_bytes)
-    magic_model = MagicModel(model_list, pdf_docs)
+    magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
    for i in range(len(model_list)):
        page_dropped_list = []
        tables_body, tables_caption, tables_footnote = [], [], []
...@@ -301,8 +296,7 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
                imgs_body.append(bbox)
            elif layout_det['category_id'] == CategoryId.ImageCaption:
                imgs_caption.append(bbox)
-            elif layout_det[
-                    'category_id'] == CategoryId.InterlineEquation_YOLO:
+            elif layout_det['category_id'] == CategoryId.InterlineEquation_YOLO:
                interequations.append(bbox)
            elif layout_det['category_id'] == CategoryId.Abandon:
                page_dropped_list.append(bbox)
...@@ -321,18 +315,15 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
        imgs_footnote_list.append(imgs_footnote)
    for i, page in enumerate(pdf_docs):
-        draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
-                              True)  # color !
+        draw_bbox_with_number(
+            i, dropped_bbox_list, page, [158, 158, 158], True
+        )  # color !
        draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
-        draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
-                              True)
-        draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
-                              True)
+        draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
+        draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
        draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
-        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
-                              True)
-        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
-                              True)
+        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
+        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102], True)
        draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
...
import json

+from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                    bbox_relative_pos, box_area, calculate_iou,
                                    calculate_overlap_area_in_bbox1_area_ratio,
...@@ -24,7 +25,7 @@ class MagicModel:
            need_remove_list = []
            page_no = model_page_info['page_info']['page_no']
            horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
-                model_page_info, self.__docs[page_no]
+                model_page_info, self.__docs.get_page(page_no)
            )
            layout_dets = model_page_info['layout_dets']
            for layout_det in layout_dets:
...@@ -99,7 +100,7 @@ class MagicModel:
            for need_remove in need_remove_list:
                layout_dets.remove(need_remove)

-    def __init__(self, model_list: list, docs: fitz.Document):
+    def __init__(self, model_list: list, docs: Dataset):
        self.__model_list = model_list
        self.__docs = docs
        """Add bbox info to all model data (scaling, poly -> bbox)."""
...@@ -123,7 +124,8 @@ class MagicModel:
        l1 = bbox1[2] - bbox1[0]
        l2 = bbox2[2] - bbox2[0]

-        if l2 > l1 and (l2 - l1) / l1 > 0.5:
+        min_l, max_l = min(l1, l2), max(l1, l2)
+        if (max_l - min_l) * 1.0 / max_l > 0.4:
            return float('inf')

        return bbox_distance(bbox1, bbox2)
...@@ -213,9 +215,8 @@ class MagicModel:
        Select all subjects whose overlap with the merged bbox is larger than the object's own area,
        then take the shortest distance between the selected subjects and the object.
        """

-        def search_overlap_between_boxes(
-                subject_idx, object_idx
-        ):
+        def search_overlap_between_boxes(subject_idx, object_idx):
            idxes = [subject_idx, object_idx]
            x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
            y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
...@@ -243,9 +244,9 @@ class MagicModel:
            for other_object in other_objects:
                ratio = max(
                    ratio,
-                    get_overlap_area(
-                        merged_bbox, other_object['bbox']
-                    ) * 1.0 / box_area(all_bboxes[object_idx]['bbox'])
+                    get_overlap_area(merged_bbox, other_object['bbox'])
+                    * 1.0
+                    / box_area(all_bboxes[object_idx]['bbox']),
                )
                if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
                    break
...@@ -363,12 +364,17 @@ class MagicModel:
                    if all_bboxes[j]['category_id'] == subject_category_id:
                        subject_idx, object_idx = j, i

-                    if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
+                    if (
+                        search_overlap_between_boxes(subject_idx, object_idx)
+                        >= MERGE_BOX_OVERLAP_AREA_RATIO
+                    ):
                        dis[i][j] = float('inf')
                        dis[j][i] = dis[i][j]
                        continue

-                    dis[i][j] = self._bbox_distance(all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox'])
+                    dis[i][j] = self._bbox_distance(
+                        all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox']
+                    )
                    dis[j][i] = dis[i][j]

        used = set()
...@@ -584,6 +590,97 @@ class MagicModel:
                    with_caption_subject.add(j)
        return ret, total_subject_object_dis
    def __tie_up_category_by_distance_v2(
        self, page_no, subject_category_id, object_category_id
    ):
        subjects = self.__reduct_overlap(
            list(
                map(
                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
                    filter(
                        lambda x: x['category_id'] == subject_category_id,
                        self.__model_list[page_no]['layout_dets'],
                    ),
                )
            )
        )
        objects = self.__reduct_overlap(
            list(
                map(
                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
                    filter(
                        lambda x: x['category_id'] == object_category_id,
                        self.__model_list[page_no]['layout_dets'],
                    ),
                )
            )
        )

        subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
        objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)

        dis = [[float('inf')] * len(subjects) for _ in range(len(objects))]
        for i, obj in enumerate(objects):
            for j, sub in enumerate(subjects):
                dis[i][j] = self._bbox_distance(sub['bbox'], obj['bbox'])

        sub_obj_map_h = {i: [] for i in range(len(subjects))}
        for i in range(len(objects)):
            min_l_idx = 0
            for j in range(1, len(subjects)):
                if dis[i][j] == float('inf'):
                    continue
                if dis[i][j] < dis[i][min_l_idx]:
                    min_l_idx = j
            if dis[i][min_l_idx] < float('inf'):
                sub_obj_map_h[min_l_idx].append(i)
            else:
                print(i, 'no nearest')

        ret = []
        for i in sub_obj_map_h.keys():
            ret.append(
                {
                    'sub_bbox': subjects[i]['bbox'],
                    'obj_bboxes': [objects[j]['bbox'] for j in sub_obj_map_h[i]],
                    'sub_idx': i,
                }
            )
        return ret

    def get_imgs_v2(self, page_no: int):
        with_captions = self.__tie_up_category_by_distance_v2(page_no, 3, 4)
        with_footnotes = self.__tie_up_category_by_distance_v2(
            page_no, 3, CategoryId.ImageFootnote
        )
        ret = []
        for v in with_captions:
            record = {
                'image_bbox': v['sub_bbox'],
                'image_caption_bbox_list': v['obj_bboxes'],
            }
            filter_idx = v['sub_idx']
            d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
            record['image_footnote_bbox_list'] = d['obj_bboxes']
            ret.append(record)
        return ret

    def get_tables_v2(self, page_no: int) -> list:
        with_captions = self.__tie_up_category_by_distance_v2(page_no, 5, 6)
        with_footnotes = self.__tie_up_category_by_distance_v2(page_no, 5, 7)
        ret = []
        for v in with_captions:
            record = {
                'table_bbox': v['sub_bbox'],
                'table_caption_bbox_list': v['obj_bboxes'],
            }
            filter_idx = v['sub_idx']
            d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
            record['table_footnote_bbox_list'] = d['obj_bboxes']
            ret.append(record)
        return ret
    def get_imgs(self, page_no: int):
        with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
        with_footnotes, _ = self.__tie_up_category_by_distance(
...@@ -717,10 +814,10 @@ class MagicModel:
    def get_page_size(self, page_no: int):  # get the page width and height
        # get the page object of the current page
-        page = self.__docs[page_no]
+        page = self.__docs.get_page(page_no).get_page_info()
        # get the width and height of the current page
-        page_w = page.rect.width
-        page_h = page.rect.height
+        page_w = page.w
+        page_h = page.h
        return page_w, page_h

    def __get_blocks_by_type(
...
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
...@@ -8,10 +10,11 @@ def parse_pdf_by_ocr(pdf_bytes,
                     end_page_id=None,
                     debug_mode=False,
                     ):
-    return pdf_parse_union(pdf_bytes,
+    dataset = PymuDocDataset(pdf_bytes)
+    return pdf_parse_union(dataset,
                           model_list,
                           imageWriter,
-                          "ocr",
+                          SupportedPdfParseMethod.OCR,
                           start_page_id=start_page_id,
                           end_page_id=end_page_id,
                           debug_mode=debug_mode,
...
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
...@@ -9,10 +11,11 @@ def parse_pdf_by_txt(
    end_page_id=None,
    debug_mode=False,
):
-    return pdf_parse_union(pdf_bytes,
+    dataset = PymuDocDataset(pdf_bytes)
+    return pdf_parse_union(dataset,
                           model_list,
                           imageWriter,
-                          "txt",
+                          SupportedPdfParseMethod.TXT,
                           start_page_id=start_page_id,
                           end_page_id=end_page_id,
                           debug_mode=debug_mode,
...
import os import os
import statistics import statistics
import time import time
from loguru import logger
from typing import List from typing import List
import torch import torch
from loguru import logger
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.clean_memory import clean_memory from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.commons import fitz, get_delta_time from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
...@@ -19,27 +19,35 @@ from magic_pdf.libs.ocr_content_type import ContentType ...@@ -19,27 +19,35 @@ from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v3 import para_split from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2 from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \ from magic_pdf.pre_proc.equations_replace import (
combine_chars_to_pymudict combine_chars_to_pymudict, remove_chars_in_text_blocks,
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2 replace_equations_in_textblock)
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans, fix_discarded_block from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \ ocr_prepare_bboxes_for_layout_split_v2
remove_overlaps_low_confidence_spans from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap fix_block_spans,
fix_discarded_block)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
from magic_pdf.pre_proc.resolve_bbox_conflict import \
check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes): def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks = [] useful_blocks = []
for bbox in all_bboxes: for bbox in all_bboxes:
useful_blocks.append({ useful_blocks.append({'bbox': bbox[:4]})
"bbox": bbox[:4] is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
}) check_useful_block_horizontal_overlap(useful_blocks)
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks) )
if is_useful_block_horz_overlap: if is_useful_block_horz_overlap:
logger.warning( logger.warning(
f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}") f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
) # noqa: E501
for bbox in all_bboxes.copy(): for bbox in all_bboxes.copy():
if smaller_bbox == bbox[:4]: if smaller_bbox == bbox[:4]:
all_bboxes.remove(bbox) all_bboxes.remove(bbox)
...@@ -47,27 +55,27 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes): ...@@ -47,27 +55,27 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
return is_useful_block_horz_overlap, all_bboxes return is_useful_block_horz_overlap, all_bboxes
def __replace_STX_ETX(text_str:str): def __replace_STX_ETX(text_str: str):
""" Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks. """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
Drawback: This issue is only observed in English text; it has not been found in Chinese text so far. Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
Args: Args:
text_str (str): raw text text_str (str): raw text
Returns: Returns:
_type_: replaced text _type_: replaced text
""" """ # noqa: E501
if text_str: if text_str:
s = text_str.replace('\u0002', "'") s = text_str.replace('\u0002', "'")
s = s.replace("\u0003", "'") s = s.replace('\u0003', "'")
return s return s
return text_str return text_str
def txt_spans_extract(pdf_page, inline_equations, interline_equations): def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
"blocks" 'blocks'
] ]
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks) text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
text_blocks = replace_equations_in_textblock( text_blocks = replace_equations_in_textblock(
...@@ -77,54 +85,63 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations): ...@@ -77,54 +85,63 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_blocks = remove_chars_in_text_blocks(text_blocks) text_blocks = remove_chars_in_text_blocks(text_blocks)
spans = [] spans = []
for v in text_blocks: for v in text_blocks:
for line in v["lines"]: for line in v['lines']:
for span in line["spans"]: for span in line['spans']:
bbox = span["bbox"] bbox = span['bbox']
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]): if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue continue
if span.get('type') not in (ContentType.InlineEquation, ContentType.InterlineEquation): if span.get('type') not in (
ContentType.InlineEquation,
ContentType.InterlineEquation,
):
spans.append( spans.append(
{ {
"bbox": list(span["bbox"]), 'bbox': list(span['bbox']),
"content": __replace_STX_ETX(span["text"]), 'content': __replace_STX_ETX(span['text']),
"type": ContentType.Text, 'type': ContentType.Text,
"score": 1.0, 'score': 1.0,
} }
) )
return spans return spans
def replace_text_span(pymu_spans, ocr_spans): def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
def model_init(model_name: str):
    from transformers import LayoutLMv3ForTokenClassification

    if torch.cuda.is_available():
        device = torch.device('cuda')
        if torch.cuda.is_bf16_supported():
            supports_bfloat16 = True
        else:
            supports_bfloat16 = False
    else:
        device = torch.device('cpu')
        supports_bfloat16 = False

    if model_name == 'layoutreader':
        # Check whether the modelscope cache directory exists
        layoutreader_model_dir = get_local_layoutreader_model_dir()
        if os.path.exists(layoutreader_model_dir):
            model = LayoutLMv3ForTokenClassification.from_pretrained(
                layoutreader_model_dir
            )
        else:
            logger.warning(
                'local layoutreader model does not exist, using online model from Hugging Face'
            )
            model = LayoutLMv3ForTokenClassification.from_pretrained(
                'hantian/layoutreader'
            )
        # Check whether the device supports bfloat16
        if supports_bfloat16:
            model.bfloat16()
        model.to(device).eval()
    else:
        logger.error('model name not allowed')
        exit(1)
    return model
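A hedged usage sketch (assumes the layoutreader weights are available locally or can be fetched from Hugging Face):

    model = model_init('layoutreader')
    assert not model.training  # model_init returns the model in eval mode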
...@@ -145,7 +162,9 @@ class ModelSingleton:
def do_predict(boxes: List[List[int]], model) -> List[int]:
    from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits,
                                            prepare_inputs)

    inputs = boxes2inputs(boxes)
    inputs = prepare_inputs(inputs, model)
    logits = model(**inputs).logits.cpu().squeeze(0)
...@@ -193,19 +212,21 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    block_weight = x1 - x0

    # If the block height is less than n lines of body text, return the block's bbox directly
    if line_height * 3 < block_height:
        if (
            block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
        ):  # Possibly a two-column layout, so it can be split more finely
            lines = int(block_height / line_height) + 1
        else:
            # If the block width exceeds 0.4 of the page width, split the block into 3 lines
            if block_weight > page_w * 0.4:
                line_height = (y1 - y0) / 3
                lines = 3
            elif block_weight > page_w * 0.25:  # Otherwise split the block into two lines
                line_height = (y1 - y0) / 2
                lines = 2
            else:  # Check the aspect ratio
                if block_height / block_weight > 1.2:  # Tall and narrow: do not split
                    return [[x0, y0, x1, y1]]
                else:  # Not tall and narrow: still split into two lines
                    line_height = (y1 - y0) / 2
...@@ -256,19 +277,23 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
    for left, top, right, bottom in page_line_list:
        if left < 0:
            logger.warning(
                f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            left = 0
        if right > page_w:
            logger.warning(
                f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            right = page_w
        if top < 0:
            logger.warning(
                f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            top = 0
        if bottom > page_h:
            logger.warning(
                f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            bottom = page_h

        left = round(left * x_scale)
...@@ -277,10 +302,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
        bottom = round(bottom * y_scale)
        assert (
            1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
        ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}'  # noqa: E126, E121
        boxes.append([left, top, right, bottom])
    model_manager = ModelSingleton()
    model = model_manager.get_model('layoutreader')
    with torch.no_grad():
        orders = do_predict(boxes, model)
    sorted_bboxes = [page_line_list[i] for i in orders]
...@@ -294,146 +319,195 @@ def get_line_height(blocks):
        if block['type'] in ['text', 'title', 'interline_equation']:
            for line in block['lines']:
                bbox = line['bbox']
                page_line_height_list.append(int(bbox[3] - bbox[1]))
    if len(page_line_height_list) > 0:
        return statistics.median(page_line_height_list)
    else:
        return 10
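A worked example of the median and its fallback (the block dict is a hypothetical minimal case):

    blocks = [{
        'type': 'text',
        'lines': [{'bbox': [0, 0, 100, 12]}, {'bbox': [0, 14, 100, 28]}],
    }]
    assert get_line_height(blocks) == 13  # median of the line heights [12, 14]
    assert get_line_height([]) == 10      # default when no text lines are found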
def parse_page_core(
    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
):
    need_drop = False
    drop_reason = []

    """Get the block information that will be used later from the magic_model object"""
    img_blocks = magic_model.get_imgs(page_id)
    table_blocks = magic_model.get_tables(page_id)
    discarded_blocks = magic_model.get_discarded(page_id)
    text_blocks = magic_model.get_text_blocks(page_id)
    title_blocks = magic_model.get_title_blocks(page_id)
    inline_equations, interline_equations, interline_equation_blocks = (
        magic_model.get_equations(page_id)
    )
    page_w, page_h = magic_model.get_page_size(page_id)
    spans = magic_model.get_all_spans(page_id)

    """Build spans according to parse_mode"""
    if parse_mode == SupportedPdfParseMethod.TXT:
        """Replace the text-type spans from OCR with pymu spans!"""
        pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
        spans = replace_text_span(pymu_spans, spans)
    elif parse_mode == SupportedPdfParseMethod.OCR:
        pass
    else:
        raise Exception('parse_mode must be txt or ocr')

    """Among overlapping spans, drop the ones with lower confidence"""
    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
    """Among overlapping spans, drop the smaller ones"""
    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)

    """Crop screenshots of images and tables"""
    spans = ocr_cut_image_and_table(
        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
    )

    """Gather the bboxes of all blocks together"""
    # The interline_equation_blocks parameter is not accurate enough; switch to interline_equations later
    interline_equation_blocks = []
    if len(interline_equation_blocks) > 0:
        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_blocks,
            table_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            interline_equation_blocks,
            page_w,
            page_h,
        )
    else:
        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_blocks,
            table_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            interline_equations,
            page_w,
            page_h,
        )

    """First handle the discarded_blocks, which need no layout"""
    discarded_block_with_spans, spans = fill_spans_in_blocks(
        all_discarded_blocks, spans, 0.4
    )
    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)

    """Skip the current page if it has no bbox"""
    if len(all_bboxes) == 0:
        logger.warning(f'skip this page, no useful bbox found, page_id: {page_id}')
        return ocr_construct_page_component_v2(
            [],
            [],
            page_id,
            page_w,
            page_h,
            [],
            [],
            [],
            interline_equations,
            fix_discarded_blocks,
            need_drop,
            drop_reason,
        )

    """Fill the spans into the blocks"""
    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)

    """Fix the blocks"""
    fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)

    """Get all lines and compute the height of body-text lines"""
    line_height = get_line_height(fix_blocks)

    """Get all lines and sort them"""
    sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)

    """Compute the block ordering from the line medians"""
    fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)

    """Re-order the blocks"""
    sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])

    """Get the lists that QA needs to externalize"""
    images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)

    """Build pdf_info_dict"""
    page_info = ocr_construct_page_component_v2(
        sorted_blocks,
        [],
        page_id,
        page_w,
        page_h,
        [],
        images,
        tables,
        interline_equations,
        fix_discarded_blocks,
        need_drop,
        drop_reason,
    )
    return page_info
def pdf_parse_union(
    dataset: Dataset,
    model_list,
    imageWriter,
    parse_mode,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    pdf_bytes_md5 = compute_md5(dataset.data_bits())

    """Initialize an empty pdf_info_dict"""
    pdf_info_dict = {}

    """Initialize magic_model with model_list and the docs object"""
    magic_model = MagicModel(model_list, dataset)

    """Parse the pdf over the requested page range"""
    # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
    end_page_id = (
        end_page_id
        if end_page_id is not None and end_page_id >= 0
        else len(dataset) - 1
    )
    if end_page_id > len(dataset) - 1:
        logger.warning('end_page_id is out of range, using dataset length')
        end_page_id = len(dataset) - 1

    """Initialize the start time"""
    start_time = time.time()

    for page_id, page in enumerate(dataset):
        """In debug mode, log the parsing time of each page."""
        if debug_mode:
            time_now = time.time()
            logger.info(
                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
            )
            start_time = time_now

        """Parse each page of the pdf"""
        if start_page_id <= page_id <= end_page_id:
            page_info = parse_page_core(
                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
            )
        else:
            page_info = page.get_page_info()
            page_w = page_info.w
            page_h = page_info.h
            page_info = ocr_construct_page_component_v2(
                [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
            )
        pdf_info_dict[f'page_{page_id}'] = page_info

    """Paragraph splitting"""
    para_split(pdf_info_dict, debug_mode=debug_mode)
...@@ -441,7 +515,7 @@ def pdf_parse_union(pdf_bytes,
"""dict转list""" """dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict) pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = { new_pdf_info_dict = {
"pdf_info": pdf_info_list, 'pdf_info': pdf_info_list,
} }
clean_memory() clean_memory()
......
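For context, a minimal sketch of how pdf_parse_union might be driven after this change; the input path and empty model_list are illustration-only assumptions, while PymuDocDataset and FileBasedDataWriter come from the data API tested below:

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.data.dataset import PymuDocDataset

    with open('demo.pdf', 'rb') as f:  # hypothetical input pdf
        dataset = PymuDocDataset(f.read())
    image_writer = FileBasedDataWriter('/tmp/images')
    model_list = []  # assumption: per-page model output produced elsewhere
    result = pdf_parse_union(dataset, model_list, image_writer,
                             SupportedPdfParseMethod.TXT)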
...@@ -6,8 +6,8 @@ import click
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
                                      draw_model_bbox, draw_span_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
......
from loguru import logger


def ImportPIL(f):
    try:
        import PIL  # noqa: F401
    except ImportError:
        logger.error('Pillow is not installed; please install it via pip.')
        exit(1)
    return f
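A minimal usage sketch of the decorator (the decorated function is hypothetical); note the Pillow check runs once, at decoration time:

    @ImportPIL
    def render_thumbnail():
        from PIL import Image  # safe: the decorator verified Pillow is importable
        ...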
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
import os
import shutil

from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                               FileBasedDataWriter)


def test_filebased_reader_writer():
    unitest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
    sub_dir = os.path.join(unitest_dir, 'sub')
    abs_fn = os.path.join(unitest_dir, 'abspath.txt')
    os.makedirs(sub_dir, exist_ok=True)

    writer = FileBasedDataWriter(sub_dir)
    reader = FileBasedDataReader(sub_dir)

    writer.write('test.txt', b'hello world')
    assert reader.read('test.txt') == b'hello world'

    writer.write(abs_fn, b'hello world')
    assert reader.read(abs_fn) == b'hello world'

    shutil.rmtree(unitest_dir)
import json
import os

import fitz
import pytest

from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
                                               MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config


@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer():
    """Test the multi-bucket s3 reader/writer.

    The s3 config must be set in the environment:
        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
        export S3_BUCKET_2=xxx
        export S3_ACCESS_KEY_2=xxx
        export S3_SECRET_KEY_2=xxx
        export S3_ENDPOINT_2=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    bucket_2 = os.getenv('S3_BUCKET_2', '')
    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')

    s3configs = [
        S3Config(
            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=ak_2,
            secret_key=sk_2,
            endpoint_url=endpoint_url_2,
        ),
    ]

    reader = MultiBucketS3DataReader(default_bucket=bucket, s3_configs=s3configs)
    writer = MultiBucketS3DataWriter(default_bucket=bucket, s3_configs=s3configs)

    bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
    assert bits == reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )

    bits = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    docs = fitz.open('pdf', bits)
    assert len(docs) == 10

    bits = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert bits == reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert len(json.loads(bits)) > 0

    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )
    assert 'abc'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    writer.write(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
import json
import os

import pytest

from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter


@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
)
def test_s3_reader_writer():
    """Test the single-bucket s3 reader/writer.

    The s3 config must be set in the environment:
        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    reader = S3DataReader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
    writer = S3DataWriter(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)

    bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
    assert bits == reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )

    bits = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert bits == reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert len(json.loads(bits)) > 0

    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )
    assert 'abc'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    writer.write(
        f'{bucket}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
import json
import os

import pytest

from magic_pdf.data.io.s3 import S3Reader, S3Writer


@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY', None) is None, reason='s3 config not found'
)
def test_s3_reader():
    """Test the s3 reader.

    The s3 config must be set in the environment:
        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    reader = S3Reader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
    bits = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert len(bits) > 0
    bits = reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
        566,
        713,
    )
    assert len(json.loads(bits)) > 0


@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY', None) is None, reason='s3 config not found'
)
def test_s3_writer():
    """Test the s3 writer.

    The s3 config must be set in the environment:
        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    writer = S3Writer(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
    test_fn = 'unittest/io/test.jsonl'
    writer.write(test_fn, '123'.encode())
    reader = S3Reader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
    bits = reader.read(test_fn)
    assert bits.decode() == '123'
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset


def test_pymudataset():
    with open('tests/test_data/assets/pdfs/test_01.pdf', 'rb') as f:
        bits = f.read()
    datasets = PymuDocDataset(bits)
    assert len(datasets) > 0
    assert datasets.get_page(0).get_page_info().h > 100


def test_imagedataset():
    with open('tests/test_data/assets/pngs/test_01.png', 'rb') as f:
        bits = f.read()
    datasets = ImageDataset(bits)
    assert len(datasets) == 1
    assert datasets.get_page(0).get_page_info().w > 100
import os

import pytest

from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.read_api import (read_jsonl, read_local_images,
                                     read_local_pdfs)
from magic_pdf.data.schemas import S3Config


def test_read_local_pdfs():
    datasets = read_local_pdfs('tests/test_data/assets/pdfs')
    assert len(datasets) == 2
    assert len(datasets[0]) > 0
    assert len(datasets[1]) > 0
    assert datasets[0].get_page(0).get_page_info().w > 0
    assert datasets[0].get_page(0).get_page_info().h > 0


def test_read_local_images():
    datasets = read_local_images('tests/test_data/assets/pngs', suffixes=['png'])
    assert len(datasets) == 2
    assert len(datasets[0]) == 1
    assert len(datasets[1]) == 1
    assert datasets[0].get_page(0).get_page_info().w > 0
    assert datasets[0].get_page(0).get_page_info().h > 0


@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
)
def test_read_jsonl():
    """Test read_jsonl against s3 and local paths.

    The s3 config must be set in the environment:
        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
        export S3_BUCKET_2=xxx
        export S3_ACCESS_KEY_2=xxx
        export S3_SECRET_KEY_2=xxx
        export S3_ENDPOINT_2=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    bucket_2 = os.getenv('S3_BUCKET_2', '')
    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')

    s3configs = [
        S3Config(
            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=ak_2,
            secret_key=sk_2,
            endpoint_url=endpoint_url_2,
        ),
    ]

    reader = MultiBucketS3DataReader(bucket, s3configs)

    datasets = read_jsonl(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
        reader,
    )
    assert len(datasets) > 0
    assert len(datasets[0]) == 10

    datasets = read_jsonl('tests/test_data/assets/jsonl/test_01.jsonl', reader)
    assert len(datasets) == 1
    assert len(datasets[0]) == 10

    datasets = read_jsonl('tests/test_data/assets/jsonl/test_02.jsonl')
    assert len(datasets) == 1
    assert len(datasets[0]) == 1
[
{
"layout_dets": [
{
"category_id": 3,
"poly": [
776.7277221679688,
688.448974609375,
1242.224365234375,
688.448974609375,
1242.224365234375,
1182.0628662109375,
776.7277221679688,
1182.0628662109375
],
"score": 0.999997079372406
},
{
"category_id": 3,
"poly": [
775.9269409179688,
1389.754638671875,
1243.672119140625,
1389.754638671875,
1243.672119140625,
1859.716064453125,
775.9269409179688,
1859.716064453125
],
"score": 0.9999949932098389
},
{
"category_id": 1,
"poly": [
752.11572265625,
1939.3634033203125,
1430.1146240234375,
1939.3634033203125,
1430.1146240234375,
2041.1771240234375,
752.11572265625,
2041.1771240234375
],
"score": 0.999975323677063
},
{
"category_id": 3,
"poly": [
46.55152893066406,
686.12939453125,
638.8861083984375,
686.12939453125,
638.8861083984375,
1803.419189453125,
46.55152893066406,
1803.419189453125
],
"score": 0.999961256980896
},
{
"category_id": 3,
"poly": [
33.684722900390625,
150.77980041503906,
1238.0679931640625,
150.77980041503906,
1238.0679931640625,
524.98291015625,
33.684722900390625,
524.98291015625
],
"score": 0.9999504089355469
},
{
"category_id": 1,
"poly": [
24.685693740844727,
1875.9998779296875,
703.5064697265625,
1875.9998779296875,
703.5064697265625,
2050.7431640625,
24.685693740844727,
2050.7431640625
],
"score": 0.9999105334281921
},
{
"category_id": 1,
"poly": [
750.97705078125,
1252.206787109375,
1430.0809326171875,
1252.206787109375,
1430.0809326171875,
1357.2947998046875,
750.97705078125,
1357.2947998046875
],
"score": 0.999853789806366
},
{
"category_id": 4,
"poly": [
904.842041015625,
1213.027099609375,
1273.5655517578125,
1213.027099609375,
1273.5655517578125,
1242.717529296875,
904.842041015625,
1242.717529296875
],
"score": 0.9995817542076111
},
{
"category_id": 4,
"poly": [
905.3208618164062,
1898.5325927734375,
1273.1282958984375,
1898.5325927734375,
1273.1282958984375,
1928.9906005859375,
905.3208618164062,
1928.9906005859375
],
"score": 0.9986443519592285
},
{
"category_id": 4,
"poly": [
372.0135498046875,
556.02685546875,
1084.9647216796875,
556.02685546875,
1084.9647216796875,
586.6792602539062,
372.0135498046875,
586.6792602539062
],
"score": 0.9985352754592896
},
{
"category_id": 2,
"poly": [
1350.63671875,
79.77919006347656,
1379.6220703125,
79.77919006347656,
1379.6220703125,
99.83788299560547,
1350.63671875,
99.83788299560547
],
"score": 0.9973036646842957
},
{
"category_id": 4,
"poly": [
203.2659912109375,
597.2034912109375,
1251.0240478515625,
597.2034912109375,
1251.0240478515625,
657.985595703125,
203.2659912109375,
657.985595703125
],
"score": 0.9622809886932373
},
{
"category_id": 0,
"poly": [
70.87332916259766,
1834.5714111328125,
657.8504638671875,
1834.5714111328125,
657.8504638671875,
1865.07373046875,
70.87332916259766,
1865.07373046875
],
"score": 0.8580453395843506
},
{
"category_id": 1,
"poly": [
189.0360870361328,
597.2406616210938,
1252.3204345703125,
597.2406616210938,
1252.3204345703125,
658.4781494140625,
189.0360870361328,
658.4781494140625
],
"score": 0.3083903193473816
},
{
"category_id": 13,
"poly": [
1190,
1980,
1206,
1980,
1206,
1997,
1190,
1997
],
"score": 0.51,
"latex": ":"
},
{
"category_id": 13,
"poly": [
1219,
1331,
1235,
1331,
1235,
1348,
1219,
1348
],
"score": 0.49,
"latex": ":"
},
{
"category_id": 13,
"poly": [
798,
2016,
813,
2016,
813,
2033,
798,
2033
],
"score": 0.41,
"latex": ":"
},
{
"category_id": 13,
"poly": [
135,
1991,
148,
1991,
148,
2006,
135,
2006
],
"score": 0.39,
"latex": ":"
},
{
"category_id": 13,
"poly": [
400,
1916,
416,
1916,
416,
1933,
400,
1933
],
"score": 0.38,
"latex": ":"
},
{
"category_id": 13,
"poly": [
1148,
1944,
1162,
1944,
1162,
1961,
1148,
1961
],
"score": 0.31,
"latex": ":"
},
{
"category_id": 15,
"poly": [
798.0,
1943.0,
1147.0,
1943.0,
1147.0,
1968.0,
798.0,
1968.0
],
"score": 0.95,
"text": "Fig 4 SSCP analysis of FHIT exon 4. T"
},
{
"category_id": 15,
"poly": [
1163.0,
1943.0,
1425.0,
1943.0,
1425.0,
1968.0,
1163.0,
1968.0
],
"score": 0.96,
"text": "Tumor tissue ; N :Corresponding"
},
{
"category_id": 15,
"poly": [
755.0,
1979.0,
1189.0,
1979.0,
1189.0,
2004.0,
755.0,
2004.0
],
"score": 0.92,
"text": "normal tissue ; M : PBR322/Hae II Marker ; ssDNA"
},
{
"category_id": 15,
"poly": [
1207.0,
1979.0,
1422.0,
1979.0,
1422.0,
2004.0,
1207.0,
2004.0
],
"score": 0.97,
"text": "Single-stranded DNA ; ds-"
},
{
"category_id": 15,
"poly": [
755.0,
2015.0,
797.0,
2015.0,
797.0,
2038.0,
755.0,
2038.0
],
"score": 1.0,
"text": "DNA"
},
{
"category_id": 15,
"poly": [
814.0,
2015.0,
996.0,
2015.0,
996.0,
2038.0,
814.0,
2038.0
],
"score": 0.98,
"text": "Double-stranded DNA"
},
{
"category_id": 15,
"poly": [
71.0,
1880.0,
698.0,
1880.0,
698.0,
1902.0,
71.0,
1902.0
],
"score": 0.96,
"text": "Fig 2Alterations of PCR amplified products of FHIT exon 3,4,5 and"
},
{
"category_id": 15,
"poly": [
28.0,
1916.0,
399.0,
1916.0,
399.0,
1937.0,
28.0,
1937.0
],
"score": 0.98,
"text": "microsatellite marker D3S1300、D3S1312.A"
},
{
"category_id": 15,
"poly": [
417.0,
1916.0,
701.0,
1916.0,
701.0,
1937.0,
417.0,
1937.0
],
"score": 0.9,
"text": "Deletion of exon5(arrows);B :"
},
{
"category_id": 15,
"poly": [
29.0,
1953.0,
700.0,
1953.0,
700.0,
1974.0,
29.0,
1974.0
],
"score": 0.95,
"text": "Deletion of exon 3 A( arrows);C : Deletion of microsatellite marker D3S1300,"
},
{
"category_id": 15,
"poly": [
28.0,
1989.0,
134.0,
1989.0,
134.0,
2014.0,
28.0,
2014.0
],
"score": 1.0,
"text": "D3S1312.T"
},
{
"category_id": 15,
"poly": [
149.0,
1989.0,
696.0,
1989.0,
696.0,
2014.0,
149.0,
2014.0
],
"score": 0.96,
"text": "Tumor ; N : Corresponding normal tissue ; L : Corresponding lymph"
},
{
"category_id": 15,
"poly": [
30.0,
2027.0,
634.0,
2027.0,
634.0,
2047.0,
30.0,
2047.0
],
"score": 0.94,
"text": "node tissue;M :DL2000 DNA marker;L1:Lewis ;A :A549;S SPAC-1"
},
{
"category_id": 15,
"poly": [
801.0,
1259.0,
1427.0,
1259.0,
1427.0,
1280.0,
801.0,
1280.0
],
"score": 0.94,
"text": "Fig 3SSCP analysis of FHIT exon 3.The arrow indicateda deletion of"
},
{
"category_id": 15,
"poly": [
757.0,
1294.0,
1424.0,
1294.0,
1424.0,
1318.0,
757.0,
1318.0
],
"score": 0.96,
"text": "exon 3 of 41T. T : Tumor tissue ; N : Corresponding normal tissue ; M PBR322/"
},
{
"category_id": 15,
"poly": [
755.0,
1329.0,
1218.0,
1329.0,
1218.0,
1355.0,
755.0,
1355.0
],
"score": 0.95,
"text": "Hae Il Marker / ssDNA : Single-stranded DNA ; dsDNA"
},
{
"category_id": 15,
"poly": [
1236.0,
1329.0,
1418.0,
1329.0,
1418.0,
1355.0,
1236.0,
1355.0
],
"score": 1.0,
"text": "Double-strandedDNA"
},
{
"category_id": 15,
"poly": [
910.0,
1217.0,
1269.0,
1217.0,
1269.0,
1241.0,
910.0,
1241.0
],
"score": 1.0,
"text": "图3FHIT基因外显子3的SSCP分析"
},
{
"category_id": 15,
"poly": [
909.0,
1904.0,
1269.0,
1904.0,
1269.0,
1927.0,
909.0,
1927.0
],
"score": 1.0,
"text": "图4FHIT基因外显子4的SSCP分析"
},
{
"category_id": 15,
"poly": [
374.0,
563.0,
1077.0,
563.0,
1077.0,
583.0,
374.0,
583.0
],
"score": 0.99,
"text": "图1FHIT基因外显子3、4、5、8和微卫星灶的PCR扩增产物琼脂糖电泳图"
},
{
"category_id": 15,
"poly": [
1351.0,
81.0,
1376.0,
81.0,
1376.0,
102.0,
1351.0,
102.0
],
"score": 1.0,
"text": "13"
},
{
"category_id": 15,
"poly": [
207.0,
600.0,
1245.0,
600.0,
1245.0,
624.0,
207.0,
624.0
],
"score": 0.96,
"text": "Fig 1 Agarose electrophoresis of PCR products of exor( A)3 ,4 ,5 ,8 and three microsatellite markers( B)of FHIT gene"
},
{
"category_id": 15,
"poly": [
309.0,
634.0,
1142.0,
634.0,
1142.0,
662.0,
309.0,
662.0
],
"score": 0.97,
"text": "M1 :DL2000 DNA marker ; M2 PBR322/Hae Il marker ; T :Tumor ; N :Corresponding normal tissue"
},
{
"category_id": 15,
"poly": [
73.0,
1840.0,
651.0,
1840.0,
651.0,
1864.0,
73.0,
1864.0
],
"score": 1.0,
"text": "图2FHIT基因外显子和微卫星灶PCR扩增产物缺失电泳图"
},
{
"category_id": 15,
"poly": [
207.0,
600.0,
1245.0,
600.0,
1245.0,
625.0,
207.0,
625.0
],
"score": 0.96,
"text": "Fig 1 Agarose electrophoresis of PCR products of exor A)3 ,4 ,5 ,8 and three microsatellite markers( B)of FHIT gene"
},
{
"category_id": 15,
"poly": [
309.0,
635.0,
1142.0,
635.0,
1142.0,
661.0,
309.0,
661.0
],
"score": 0.97,
"text": "M1 :DL2000 DNA marker ; M2 PBR322/Hae Il marker ; T Tumor ; N :Corresponding normal tissue"
}
],
"page_info": {
"page_no": 0,
"height": 2080,
"width": 1472
}
}
]
import json

from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.model.magic_model import MagicModel


def test_magic_model_image_v2():
    datasets = read_local_pdfs('tests/test_model/assets/test_01.pdf')
    with open('tests/test_model/assets/test_01.model.json') as f:
        model_json = json.load(f)
    magic_model = MagicModel(model_json, datasets[0])
    imgs = magic_model.get_imgs_v2(0)
    print(imgs)
    tables = magic_model.get_tables_v2(0)
    print(tables)


def test_magic_model_table_v2():
    datasets = read_local_pdfs('tests/test_model/assets/test_02.pdf')
    with open('tests/test_model/assets/test_02.model.json') as f:
        model_json = json.load(f)
    magic_model = MagicModel(model_json, datasets[0])
    tables = magic_model.get_tables_v2(5)
    print(tables)
    tables = magic_model.get_tables_v2(8)
    print(tables)