My file is a subset split from the file
However, my file is just a Parquet file split from a Parquet file that I created myself with the `datasets` library, so I wonder what is wrong with it. Is there a way to rescan it and turn off this flag?
About the script:
# Split and upload
#
# Slices `df` into `num_chunks` consecutive row ranges of at most `chunk_size`
# rows, writes each slice to a temporary Parquet file, and uploads it to
# `target_repo` under data/vie_Latn/train/. A failure on one chunk is reported
# and the loop continues with the next chunk. The temporary file is always
# removed afterwards so chunk files do not accumulate on local disk.
import os  # local import: only needed for temporary-file cleanup below

for i in range(num_chunks):
    start_idx = i * chunk_size
    # Clamp the end so the final chunk does not run past the last row.
    end_idx = min((i + 1) * chunk_size, num_rows)
    chunk_df = df.iloc[start_idx:end_idx]

    # Build the file name: original_name_k.parquet
    output_filename = f"{base_name}_{i}.parquet"
    temp_file = f"temp_{file_idx}_{i}.parquet"

    try:
        chunk_df.to_parquet(temp_file, index=False)

        # Upload while preserving the directory structure
        target_path = f"data/vie_Latn/train/{output_filename}"
        print(f" ⬆️ Uploading {output_filename} ({len(chunk_df):,} samples)...")
        api.upload_file(
            path_or_fileobj=temp_file,
            path_in_repo=target_path,
            repo_id=target_repo,
            repo_type="dataset",
            commit_message=f"Add {output_filename}",
        )
        print(f" ✅ Uploaded {output_filename}")
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next chunk.
        print(f" ❌ Failed to process chunk {i}: {e}")
    finally:
        # The temporary file is no longer needed once the upload call has
        # returned (or failed); it may not exist if to_parquet itself failed.
        if os.path.exists(temp_file):
            os.remove(temp_file)