## Data used is "lotusacharya/nepalinewsdataset" from Kaggle.
##Code:
%%time
def preprocess_function(rows):
    """Tokenize the ``text`` field of a batch of dataset rows.

    Args:
        rows: A batch of examples exposing a ``"text"`` entry (the function
            is applied through ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch, with
        every sequence truncated to at most 128 tokens.
    """
    # No padding here: padding inside a batched map pads each map batch to its
    # own longest row, leaving rows of different lengths across the dataset.
    # The data collator pads every training batch to its longest sequence.
    return tokenizer(rows["text"], truncation=True, max_length=128)
print("Tokenizing the data")

# Drop every raw column except "label": the tokenizer output replaces the
# text, while the label must survive so it can be handed to the model.
columns_to_drop = [
    column for column in data["train"].column_names if column != "label"
]
tokenized_inputs = data.map(
    preprocess_function,
    batched=True,
    num_proc=2,
    remove_columns=columns_to_drop,
)

# Downstream code expects the targets in a column called "labels". Renaming
# the surviving column avoids re-attaching it split by split.
# NOTE(review): the reported "Unrecognized array dtype object" error means a
# collated column became a NumPy object array. Presumably the label column
# holds strings or missing values rather than integer class ids -- inspect
# data["train"].features["label"] to confirm before training.
tokenized_inputs = tokenized_inputs.rename_column("label", "labels")
tokenized_inputs
from transformers import DataCollatorWithPadding

print("Initializing Data Collator")
# The collator pads each batch to the longest sequence in that batch (its
# default padding mode). `max_length` is ignored in that mode and only
# triggers a UserWarning, so it is not passed; sequence length is already
# capped by the truncation applied at tokenization time.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)
print("Preparing Training and Testing sets to TRAIN the MODEL")

# Settings shared by both splits; only the shuffling differs between them.
_tf_dataset_kwargs = {
    "batch_size": 16,
    "collate_fn": data_collator,
}

# Training batches are shuffled.
tf_train_set = model.prepare_tf_dataset(
    tokenized_inputs["train"],
    shuffle=True,
    **_tf_dataset_kwargs,
)

# Test batches keep the dataset order.
tf_test_set = model.prepare_tf_dataset(
    tokenized_inputs["test"],
    shuffle=False,
    **_tf_dataset_kwargs,
)
##Error:
Preparing Training and Testing sets to TRAIN the MODEL
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:2395: UserWarning: `max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.
warnings.warn(
RuntimeError Traceback (most recent call last)
in <cell line: 2>()
1 print("Preparing Training and Testing sets to TRAIN the MODEL")
----> 2 tf_train_set = model.prepare_tf_dataset(
3 tokenized_inputs["train"],
4 shuffle=True,
5 batch_size=16,
1 frames
/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in _get_output_signature(dataset, collate_fn, collate_fn_args, cols_to_retain, batch_size, num_test_batches)
289 tf_dtype = tf.string
290 else:
--> 291 raise RuntimeError(
292 f"Unrecognized array dtype {np_arrays[0].dtype}. \n"
293 "Nested types and image/audio types are not supported yet."
RuntimeError: Unrecognized array dtype object.
Nested types and image/audio types are not supported yet.