Commit 4cf7e9a2 authored by myhloli

refactor(pdf_parse): adjust block splitting logic for wide blocks

- Modify the logic for splitting wide blocks exceeding 0.4 page width
- Replace the fixed two-line split for blocks exceeding 0.25 page width with a line-height-based split (`int(block_height / line_height) + 1` lines)
- Add comments to explain the reasoning behind different splitting strategies
parent acab8de5
...@@ -208,13 +208,12 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h): ...@@ -208,13 +208,12 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
): # 可能是双列结构,可以切细点 ): # 可能是双列结构,可以切细点
lines = int(block_height / line_height) + 1 lines = int(block_height / line_height) + 1
else: else:
# 如果block的宽度超过0.4页面宽度,则将block分成3行 # 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细)
if block_weight > page_w * 0.4: if block_weight > page_w * 0.4:
line_height = (y1 - y0) / 3 line_height = (y1 - y0) / 3
lines = 3 lines = 3
elif block_weight > page_w * 0.25: # 否则将block分成两行 elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点)
line_height = (y1 - y0) / 2 lines = int(block_height / line_height) + 1
lines = 2
else: # 判断长宽比 else: # 判断长宽比
if block_height / block_weight > 1.2: # 细长的不分 if block_height / block_weight > 1.2: # 细长的不分
return [[x0, y0, x1, y1]] return [[x0, y0, x1, y1]]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment