Commit 59bc15e0 authored by 赵小蒙

Merge remote-tracking branch 'origin/master'

parents 7f0371da b725e72c
*.tar
*.tar.gz
venv*/
envs/
slurm_logs/
sync1.sh
data_preprocess_pj1
data-preparation1
__pycache__
*.log
*.pyc
.vscode
debug/
*.ipynb
.idea
spark/__init__.py
# vscode history
.history
.DS_Store
.env
bad_words/
bak/
app/tests/*
temp/
tmp/
tmp
.vscode
.vscode/
/test/
/app/pdf_toolbox/test/test_bookname.txt
......@@ -271,9 +271,8 @@ def parse_pdf_by_model(
""""以下进入到公式替换环节 """
char_level_text_blocks = page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)['blocks']
remain_text_blocks = combine_chars_to_pymudict(remain_text_blocks, char_level_text_blocks)# 合并chars
remain_text_blocks = remove_citation_marker(remain_text_blocks) # 先把角标去掉
remain_text_blocks = replace_equations_in_textblock(remain_text_blocks, inline_eq_info, interline_eq_info)
remain_text_blocks = remove_citation_marker(remain_text_blocks) # 公式替换之后去角标,防止公式无法替换成功。但是这样也会带来个问题就是把角标当公式。各有优劣。
remain_text_blocks = remove_chars_in_text_blocks(remain_text_blocks) # 减少中间态数据体积
#debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3)
......
......@@ -114,12 +114,16 @@ def remove_citation_marker(with_char_text_blcoks):
# 找到高度最高的span作为位置比较的基准
max_hi_span = line['spans'][0]['bbox']
min_font_sz = 10000
min_font_sz = 10000 # line里最小的字体
max_font_sz = 0 # line里最大的字体
for s in line['spans']:
if max_hi_span[3]-max_hi_span[1]<s['bbox'][3]-s['bbox'][1]:
max_hi_span = s['bbox']
if min_font_sz>s['size']:
min_font_sz = s['size']
if max_font_sz<s['size']:
max_font_sz = s['size']
base_span_mid_y = (max_hi_span[3]+max_hi_span[1])/2
......@@ -130,6 +134,9 @@ def remove_citation_marker(with_char_text_blcoks):
span_mid_y = (span['bbox'][3]+span['bbox'][1])/2
span_font_sz = span['size']
if max_font_sz-span_font_sz<1: # 先以字体过滤正文,如果是正文就不再继续判断了
continue
if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
"""
1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式
......
import os
import collections # 统计库
import re # 正则
import re
from libs.boxbase import _is_in # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
from pathlib import Path
def __solve_contain_bboxs(all_bbox_list: list):
    """Remove every bbox that is contained in another bbox of the same list.

    Each entry is indexable; its first four items are passed to ``_is_in`` as
    the box coordinates. Any further items (for example the fifth element the
    caller carries along) are kept untouched.

    The list is filtered in place and the same list object is returned, with
    the surviving entries in their original relative order.

    :param all_bbox_list: list of bbox entries to filter.
    :return: ``all_bbox_list`` itself, without the contained boxes.
    """
    # Track what to drop by index rather than by value. Removing by value
    # deletes *every* entry equal to a dropped one, so exact duplicates could
    # wipe each other out entirely.
    # NOTE(review): that only happens if `_is_in` treats equal boxes as
    # contained in each other -- confirm against libs.boxbase. When no two
    # entries share coordinates the result is the same as removing by value.
    contained_indexes = set()
    total = len(all_bbox_list)
    for i in range(total):
        bbox1 = all_bbox_list[i][:4]
        for j in range(i + 1, total):
            bbox2 = all_bbox_list[j][:4]
            # Drop the smaller (inner) box of the pair.
            if _is_in(bbox1, bbox2):
                contained_indexes.add(i)
            elif _is_in(bbox2, bbox1):
                contained_indexes.add(j)

    # Slice assignment keeps the caller's list object, as before.
    all_bbox_list[:] = [
        bbox
        for index, bbox in enumerate(all_bbox_list)
        if index not in contained_indexes
    ]
    return all_bbox_list
def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
"""
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
......@@ -101,4 +127,5 @@ def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict
for eq_box in equationIsolated_from_DocXChain_bboxs:
eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]]
return equationEmbedding_from_DocXChain_bboxs, equationIsolated_from_DocXChain_bboxs
deduped_embedding_eq_bboxes = __solve_contain_bboxs(equationEmbedding_from_DocXChain_bboxs)
return deduped_embedding_eq_bboxes, equationIsolated_from_DocXChain_bboxs
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment