I have an imbalanced dataset to which I want to apply SMOTE algorithm.
train_df = pd.read_csv("../input/euos-slas/train.csv")
raw_datasets = Dataset.from_pandas(train_df).train_test_split(test_size=0.2)
def tokenize_function(examples):
encoding = tokenizer(examples["smiles"], padding="max_length", add_special_tokens=False, truncation=True, max_length=256)
if "sol_category" in examples.keys():
encoding["labels"] = F.one_hot(torch.tensor(examples["sol_category"])).to(torch.float32).numpy()
return encoding
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["smiles", "Id", "sol_category"])
Here is what I have tried:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X = tokenized_datasets["train"]["input_ids"]
y = tokenized_datasets["train"]["labels"]
X = np.array(X)
y = np.array(y)
X_res, y_res = sm.fit_resample(X, y)
tokenized_datasets["train"]["input_ids"] = X_res
tokenized_datasets["train"]["labels"] = y_res
But I am getting this error:
TypeError: 'Dataset' object does not support item assignment
How can I apply SMOTE to only the train part.