How to apply SMOTE to a Dataset

I have an imbalanced dataset to which I want to apply SMOTE algorithm.

# Load the raw CSV and split it 80/20 into train/test splits (HF DatasetDict).
train_df = pd.read_csv("../input/euos-slas/train.csv")
raw_datasets = Dataset.from_pandas(train_df).train_test_split(test_size=0.2)
def tokenize_function(examples):
    """Tokenize SMILES strings to fixed-length ids; attach one-hot labels when present."""
    # Pad/truncate every sequence to exactly 256 tokens, without special tokens.
    encoding = tokenizer(examples["smiles"], padding="max_length", add_special_tokens=False, truncation=True, max_length=256)
    if "sol_category" in examples.keys():
        # One-hot encode the integer class and cast to float32
        # (presumably for a BCE/soft-label-style loss — verify against the model).
        encoding["labels"] = F.one_hot(torch.tensor(examples["sol_category"])).to(torch.float32).numpy()
    return encoding

# batched=True: tokenize_function receives dicts of lists, not single rows.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Drop the raw text/id/label columns; the one-hot "labels" column remains.
tokenized_datasets = tokenized_datasets.remove_columns(["smiles", "Id", "sol_category"])

Here is what I have tried:

from imblearn.over_sampling import SMOTE

# Fixed seed so the resampling is reproducible.
sm = SMOTE(random_state=42)

X = np.array(tokenized_datasets["train"]["input_ids"])
y = np.array(tokenized_datasets["train"]["labels"])

# SMOTE expects 1-D class labels; the "labels" column is one-hot,
# so collapse each row back to its class index first.
y_1d = y.argmax(axis=1) if y.ndim > 1 else y
X_res, y_res = sm.fit_resample(X, y_1d)

# A `Dataset` is immutable — item assignment on it raises the TypeError.
# Build a fresh Dataset from the resampled arrays and replace the "train"
# split in the DatasetDict (DatasetDict *does* support item assignment).
# NOTE(review): only input_ids/labels are kept here; re-add other columns
# (e.g. attention_mask) the same way if the model needs them.
tokenized_datasets["train"] = Dataset.from_dict({
    "input_ids": X_res.tolist(),
    # Restore the one-hot float32 format so it matches the original column.
    "labels": F.one_hot(torch.tensor(y_res)).to(torch.float32).numpy().tolist(),
})

But I am getting this error:

TypeError: 'Dataset' object does not support item assignment

How can I apply SMOTE only to the train split of the dataset?

I believe you found your answer by now.
Anyhow, I faced the same issue, so here is one way to do it:

def upsample_dataset(dataset: datasets.Dataset, label_col: str = 'label'):
    '''Balance a HF Dataset by oversampling minority classes with SMOTE.

    Loads the whole dataset into memory (no HF streaming), resamples it,
    and returns a new balanced Dataset with the same columns.

    NOTE(review): assumes every column holds scalar numeric values — a
    list-valued column (e.g. token id sequences) would yield an object
    array that SMOTE cannot interpolate.
    '''
    # X contains all the data (labels too); one row per example after .T.
    X = np.array([
        dataset[col_name]
        for col_name in dataset.column_names
    ]).T

    # 1-D class labels driving the resampling
    y = dataset[label_col]

    sm = SMOTE(random_state=42)
    X_balanced, y_balanced = sm.fit_resample(X, y)

    columns = {
        col_name: X_balanced.T[i]
        for i, col_name in enumerate(dataset.column_names)
    }
    # Use the resampled labels directly instead of the interpolated label
    # column, so synthetic rows carry exact class labels (and the label
    # dtype is preserved).
    columns[label_col] = y_balanced

    return datasets.Dataset.from_dict(columns)

This approach requires loading the whole dataset into memory, which is not great, but it's a workable way so far…