##Data used is " lotusacharya/nepalinewsdataset " from kaggle.
##Code:
%%time
def preprocess_function(rows):
return tokenizer(rows[‘text’],truncation=True,max_length=128,padding=True)
print(“Tokenizing the data”)
tokenized_inputs = data.map(
preprocess_function,
batched=True,
num_proc=2,
remove_columns=data[“train”].column_names,
)
tokenized_inputs[‘train’] = tokenized_inputs[‘train’].add_column(
name=“labels”,column=data[‘train’][‘label’]
)
tokenized_inputs[‘test’] = tokenized_inputs[‘test’].add_column(
name=“labels”,column=data[‘test’][‘label’]
)
tokenized_inputs
from transformers import DataCollatorWithPadding
print(“Initializing Data Collator”)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,
max_length=256,
return_tensors=“tf”)
print(“Preparing Training and Testing sets to TRAIN the MODEL”)
tf_train_set = model.prepare_tf_dataset(
tokenized_inputs[“train”],
shuffle=True,
batch_size=16,
collate_fn=data_collator,
)
tf_test_set = model.prepare_tf_dataset(
tokenized_inputs[“test”],
shuffle=False,
batch_size=16,
collate_fn=data_collator,
)
##Error:
Preparing Training and Testing sets to TRAIN the MODEL
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:2395: UserWarning: max_length
is ignored when padding
=True
and there is no truncation strategy. To pad to max length, use padding='max_length'
.
warnings.warn(
RuntimeError Traceback (most recent call last)
in <cell line: 2>()
1 print(“Preparing Training and Testing sets to TRAIN the MODEL”)
----> 2 tf_train_set = model.prepare_tf_dataset(
3 tokenized_inputs[“train”],
4 shuffle=True,
5 batch_size=16,
1 frames
/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in _get_output_signature(dataset, collate_fn, collate_fn_args, cols_to_retain, batch_size, num_test_batches)
289 tf_dtype = tf.string
290 else:
→ 291 raise RuntimeError(
292 f"Unrecognized array dtype {np_arrays[0].dtype}. \n"
293 “Nested types and image/audio types are not supported yet.”
RuntimeError: Unrecognized array dtype object.
Nested types and image/audio types are not supported yet.