Unverified Commit ad5596fc authored by icecraft, committed by GitHub

fix: #366 the broken call chain after the refactor of AbsReaderWriter led to the wrong API being invoked (#371)

Co-authored-by: shenguanlin <shenguanlin@pjlab.org.cn>
parent 048e0952
...@@ -30,12 +30,10 @@ def read_s3_path(s3path): ...@@ -30,12 +30,10 @@ def read_s3_path(s3path):
byte_start, byte_end = 0, None byte_start, byte_end = 0, None
else: else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1]) byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1 return s3_rw.read_offset(
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path), remove_non_official_s3_args(s3path),
byte_start, byte_start,
byte_end, byte_end,
AbsReaderWriter.MODE_BIN,
) )
...@@ -71,24 +69,23 @@ def cli(): ...@@ -71,24 +69,23 @@ def cli():
default="", default="",
) )
def jsonl(jsonl, method, output_dir): def jsonl(jsonl, method, output_dir):
print("haha")
model_config.__use_inside_model__ = False model_config.__use_inside_model__ = False
full_jsonl_path = os.path.realpath(jsonl)
if output_dir == "":
output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
if jsonl.startswith("s3://"): if jsonl.startswith("s3://"):
jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8")) jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
full_jsonl_path = "."
else: else:
full_jsonl_path = os.path.realpath(jsonl)
with open(jsonl) as f: with open(jsonl) as f:
jso = json_parse.loads(f.readline()) jso = json_parse.loads(f.readline())
if output_dir == "":
output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
s3_file_path = jso.get("file_location") s3_file_path = jso.get("file_location")
if s3_file_path is None: if s3_file_path is None:
s3_file_path = jso.get("path") s3_file_path = jso.get("path")
pdf_file_name = Path(s3_file_path).stem pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path) pdf_data = read_s3_path(s3_file_path)
print(pdf_file_name, jso, method) print(pdf_file_name, jso, method)
do_parse( do_parse(
output_dir, output_dir,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment