Chapter 7 questions

raw_datasets = load_dataset("conll2003")

barfs with:

Unable to find 'hf://datasets/conll2003@365071bdbe92d4b8e4f153d7df57706a5be793ac/conll2003/train/0000.parquet'

etc

Looks like it has simply been scrubbed, which rather breaks the example notebook.
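
If the data has merely moved rather than vanished, pointing load_dataset at the new repo may be enough. The repo name below is an assumption on my part, so check that it actually exists on the Hub before relying on it:

from datasets import load_dataset

# Assumption: the CoNLL-2003 data is now hosted under a user namespace on the Hub
# rather than as the canonical "conll2003" dataset. Verify the repo before use.
raw_datasets = load_dataset("eriktks/conll2003")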

When I call to_tf_dataset, I get this error.

/opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197: FutureWarning: In the future `np.object` will be defined as the corresponding NumPy scalar.
  (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[24], line 1
----> 1 tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
      2     columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
      3     collate_fn=data_collator,
      4     shuffle=True,
      5     batch_size=16,
      6 )
      8 tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
      9     columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
     10     collate_fn=data_collator,
     11     shuffle=False,
     12     batch_size=16,
     13 )

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:381, in TensorflowDatasetMixin.to_tf_dataset(self, columns, batch_size, shuffle, collate_fn, drop_remainder, collate_fn_args, label_cols, dummy_labels, prefetch)
    378 retained_columns = [key for key in self.features.keys() if key in cols_to_retain]
    379 dataset = self.with_format("numpy", columns=retained_columns)
--> 381 columns_to_dtypes, output_signature = self._get_output_signature(
    382     dataset, collate_fn, collate_fn_args, batch_size=batch_size if drop_remainder else None
    383 )
    384 all_columns = list(columns_to_dtypes.keys())
    385 all_dtypes = list(columns_to_dtypes.values())

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:244, in TensorflowDatasetMixin._get_output_signature(dataset, collate_fn, collate_fn_args, batch_size)
    242     raise ValueError("Unable to get the output signature because the dataset is empty.")
    243 test_batch_size = min(len(dataset), 4)
--> 244 test_batch = dataset[:test_batch_size]
    245 test_batch = [{key: value[i] for key, value in test_batch.items()} for i in range(test_batch_size)]
    246 test_batch = collate_fn(test_batch, **collate_fn_args)

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1764, in Dataset.__getitem__(self, key)
   1762 def __getitem__(self, key):  # noqa: F811
   1763     """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 1764     return self._getitem(
   1765         key,
   1766     )

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1749, in Dataset._getitem(self, key, decoded, **kwargs)
   1747 formatter = get_formatter(format_type, features=self.features, decoded=decoded, **format_kwargs)
   1748 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
-> 1749 formatted_output = format_table(
   1750     pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
   1751 )
   1752 return formatted_output

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:540, in format_table(table, key, formatter, format_columns, output_all_columns)
    538 else:
    539     pa_table_to_format = pa_table.drop(col for col in pa_table.column_names if col not in format_columns)
--> 540     formatted_output = formatter(pa_table_to_format, query_type=query_type)
    541     if output_all_columns:
    542         if isinstance(formatted_output, MutableMapping):

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:285, in Formatter.__call__(self, pa_table, query_type)
    283     return self.format_column(pa_table)
    284 elif query_type == "batch":
--> 285     return self.format_batch(pa_table)

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:346, in NumpyFormatter.format_batch(self, pa_table)
    345 def format_batch(self, pa_table: pa.Table) -> dict:
--> 346     batch = self.numpy_arrow_extractor(**self.np_array_kwargs).extract_batch(pa_table)
    347     if self.decoded:
    348         batch = self.python_features_decoder.decode_batch(batch)

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in NumpyArrowExtractor.extract_batch(self, pa_table)
    159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160     return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in <dictcomp>(.0)
    159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160     return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:196, in NumpyArrowExtractor._arrow_array_to_numpy(self, pa_array)
    194         array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    195 if len(array) > 0:
--> 196     if any(
    197         (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
    198         or (isinstance(x, float) and np.isnan(x))
    199         for x in array
    200     ):
    201         return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
    202 return np.array(array, copy=False, **self.np_array_kwargs)

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197, in <genexpr>(.0)
    194         array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    195 if len(array) > 0:
    196     if any(
--> 197         (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
    198         or (isinstance(x, float) and np.isnan(x))
    199         for x in array
    200     ):
    201         return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
    202 return np.array(array, copy=False, **self.np_array_kwargs)

File /opt/conda/lib/python3.10/site-packages/numpy/__init__.py:324, in __getattr__(attr)
    319     warnings.warn(
    320         f"In the future `np.{attr}` will be defined as the "
    321         "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
    323 if attr in __former_attrs__:
--> 324     raise AttributeError(__former_attrs__[attr])
    326 if attr == 'testing':
    327     import numpy.testing as testing

AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
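
For what it's worth, this looks like a version mismatch rather than a bug in the notebook: np.object was removed in NumPy 1.24, and the installed datasets build still references it. Either upgrading datasets or pinning NumPy below 1.24 should clear the error (the exact commands below are my suggestion, not from the course):

# Run in a notebook cell, then restart the kernel.
!pip install --upgrade datasets
# or, if you need to keep the older datasets version:
!pip install "numpy<1.24"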

Chapter 7, Token classification has a compute_metrics() function that uses argmax() instead of softmax(). I have attached it below. I'm wondering why this doesn't cause problems with gradient back-propagation. Argmax is non-differentiable, so I would assume there would be problems, but the example clearly works, so I must be missing something here.

link:

code:
import numpy as np

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
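
One way to see why the argmax never interferes with back-propagation: compute_metrics is only called by the Trainer during evaluation, on logits that have already been detached and converted to NumPy arrays, while the loss used for gradients is the cross-entropy computed from the raw logits inside the model. A small self-contained illustration (toy shapes, my own example rather than course code):

import numpy as np
import torch
import torch.nn.functional as F

# Toy logits for a batch of 1 sequence of length 4 with 9 NER labels
logits = torch.randn(1, 4, 9, requires_grad=True)
labels = torch.tensor([[0, 3, -100, 5]])

# Training path: differentiable cross-entropy on the raw logits
loss = F.cross_entropy(logits.view(-1, 9), labels.view(-1), ignore_index=-100)
loss.backward()  # gradients flow through the logits; argmax is never involved

# Evaluation path: argmax on detached NumPy arrays, used only for the metric
predictions = np.argmax(logits.detach().numpy(), axis=-1)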

I’ve found several issues with the “Training a causal language model from scratch” tutorial; you can find them below:

  • In the “Training with 🤗 Accelerate” section, the variable ‘tokenized_dataset’ should be ‘tokenized_datasets’. This appears to be a typo.
  • There is a deprecation issue to note: replace Accelerator(fp16=True) with Accelerator(mixed_precision='fp16') to use the updated syntax.
  • Regarding evaluation with the accelerator, the correct code should be losses.append(accelerator.gather(outputs.loss.view(-1))), since outputs.loss returns a scalar and does not have a batch dimension.
  • In the accelerator training loop, the variable ‘samples_per_step’ is undefined. I assume it should equal ‘batch_size’, which was 32. (A consolidated sketch of these fixes follows this list.)
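
A consolidated sketch of the fixes listed above, using the notebook's variable names (treat this as a suggestion rather than the official corrected code):

from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="fp16")  # replaces the deprecated Accelerator(fp16=True)

samples_per_step = 32  # assumed to equal the batch_size used in the notebook

# In the evaluation loop, gather a 1-element tensor instead of a bare scalar:
#     losses.append(accelerator.gather(outputs.loss.view(-1)))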

I hope this helps. Best.

This seems like a typo to me. In the Question Answering section, inside the function preprocess_training_examples(examples), there is a loop that finds the end position of the answer; the following code is on the course page.

# Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

This gives an index out-of-range error. I think the correct version is

while sequence_ids[idx] != 1:
    idx += 1
context_end = idx - 1
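
For what it's worth, another option is to keep context_start (the course function still uses it further down when locating the answer's start token) and simply guard both loops against running past the end of sequence_ids. This is my own defensive variant, meant as a drop-in inside preprocess_training_examples, not the official fix:

# Find the start and end of the context, without ever indexing past the end
idx = 0
while idx < len(sequence_ids) and sequence_ids[idx] != 1:
    idx += 1
context_start = idx
while idx < len(sequence_ids) and sequence_ids[idx] == 1:
    idx += 1
context_end = idx - 1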

When training a causal language model from scratch, I am getting this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[258], line 3
      1 from transformers import Trainer, TrainingArguments
----> 3 args = TrainingArguments(
      4     output_dir="codeparrot-ds",
      5     per_device_train_batch_size=32,
      6     per_device_eval_batch_size=32,
      7     evaluation_strategy="steps",
      8     eval_steps=5_000,
      9     logging_steps=5_000,
     10     gradient_accumulation_steps=8,
     11     num_train_epochs=1,
     12     weight_decay=0.1,
     13     warmup_steps=1_000,
     14     lr_scheduler_type="cosine",
     15     learning_rate=5e-4,
     16     save_steps=5_000,
     17     fp16=True,
     18     push_to_hub=True,
     19 )
     21 trainer = Trainer(
     22     model=model,
     23     tokenizer=tokenizer,
   (...)
     27     eval_dataset=tokenized_datasets["valid"],
     28 )

File :125, in init(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, dataloader_prefetch_factor, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, accelerator_config, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, eval_do_concat_batches, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha, optim_target_modules)

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\training_args.py:1612, in TrainingArguments.__post_init__(self)
   1600     raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0")
   1602 if (
   1603     self.framework == "pt"
   1604     and is_torch_available()
   (...)
   1610     and (self.fp16 or self.fp16_full_eval)
   1611 ):
-> 1612     raise ValueError(
   1613         "FP16 Mixed precision training with AMP or APEX (--fp16) and FP16 half precision evaluation"
   1614         " (--fp16_full_eval) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX)."
   1615     )
   1617 if (
   1618     self.framework == "pt"
   1619     and is_torch_available()
   (...)
   1627     and (self.bf16 or self.bf16_full_eval)
   1628 ):
   1629     raise ValueError(
   1630         "BF16 Mixed precision training with AMP (--bf16) and BF16 half precision evaluation"
   1631         " (--bf16_full_eval) can only be used on CUDA, XPU (with IPEX), NPU, MLU or CPU/TPU/NeuronCore devices."
   1632     )

ValueError: FP16 Mixed precision training with AMP or APEX (--fp16) and FP16 half precision evaluation (--fp16_full_eval) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX).
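
The error itself just says that fp16=True was requested on a machine where PyTorch cannot see a CUDA (or other supported) device. If you want the same cell to run on both CPU and GPU machines, one option (my suggestion, not from the course) is to make the flag conditional; note that training this model on CPU will be extremely slow either way:

import torch
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=torch.cuda.is_available(),  # only request mixed precision when a GPU is present
    push_to_hub=True,
)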

I can't train on the whole codeparrot dataset, so I tried to use a subset before training:

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"][:10000],
    eval_dataset=tokenized_datasets["valid"],
)

but it gives me this error:
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[36], line 1
----> 1 trainer.train()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1771, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1768 try:
   1769     # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
   1770     hf_hub_utils.disable_progress_bars()
-> 1771     return inner_training_loop(
   1772         args=args,
   1773         resume_from_checkpoint=resume_from_checkpoint,
   1774         trial=trial,
   1775         ignore_keys_for_eval=ignore_keys_for_eval,
   1776     )
   1777 finally:
   1778     hf_hub_utils.enable_progress_bars()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2085, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2082     rng_to_sync = True
   2084 step = -1
-> 2085 for step, inputs in enumerate(epoch_iterator):
   2086     total_batched_samples += 1
   2088     if self.args.include_num_input_tokens_seen:

File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:452, in DataLoaderShard.__iter__(self)
    450 # We iterate one batch ahead to check when we are at the end
    451 try:
--> 452     current_batch = next(dataloader_iter)
    453 except StopIteration:
    454     yield

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
    627 if self._sampler_iter is None:
    628     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    629     self._reset()  # type: ignore[call-arg]
--> 630 data = self._next_data()
    631 self._num_yielded += 1
    632 if self._dataset_kind == _DatasetKind.Iterable and \
    633         self._IterableDataset_len_called is not None and \
    634         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
    672 def _next_data(self):
    673     index = self._next_index()  # may raise StopIteration
--> 674     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    675     if self._pin_memory:
    676         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in <listcomp>(.0)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

KeyError: 0
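
The KeyError comes from the slicing: tokenized_datasets["train"][:10000] returns a plain dict of lists, not a Dataset, so the DataLoader's integer indexing fails. Using Dataset.select keeps the subset as a proper Dataset (a sketch based on the course's variable names):

# Take the first 10,000 examples while keeping a Dataset object
# (optionally call .shuffle(seed=42) first for a random subset)
train_subset = tokenized_datasets["train"].select(range(10_000))

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=train_subset,
    eval_dataset=tokenized_datasets["valid"],
)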

How can I modify the preprocessing functions (both train and validation) and the compute_metrics function for SQuAD v2? I am having some difficulty implementing them because of the unanswerable questions. I also received “KeyError: no_answer_probability” during evaluation. How can I handle that, and what should I do?
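
On the metric error specifically: the squad_v2 metric expects every prediction to carry a no_answer_probability field in addition to id and prediction_text, which is where the KeyError comes from. Below is a minimal sketch, assuming predicted_answers and theoretical_answers are built the same way as in the SQuAD v1 course code; if your post-processing always outputs an answer, 0.0 is a crude but workable placeholder. For the preprocessing side, the usual approach for unanswerable questions is to label them with start and end positions pointing at the [CLS] token (index 0), as in the official question-answering examples.

import evaluate

metric = evaluate.load("squad_v2")

# Add the field the v2 metric requires; predicted_answers is assumed to be the
# list of {"id": ..., "prediction_text": ...} dicts from the course notebook.
predicted_answers = [
    {**pred, "no_answer_probability": 0.0} for pred in predicted_answers
]
metric.compute(predictions=predicted_answers, references=theoretical_answers)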

Kiwihead15
You could try to create a new model under your account.

I got confused about a code snippet in the “Fine-tuning a masked language model” part. Why do we need to repeat the loss in the code below?

for step, batch in enumerate(eval_dataloader):
    with torch.no_grad():
        outputs = model(**batch)

    loss = outputs.loss
    losses.append(accelerator.gather(loss.repeat(batch_size)))

losses = torch.cat(losses)
losses = losses[: len(eval_dataset)]
try:
    perplexity = math.exp(torch.mean(losses))
except OverflowError:
    perplexity = float("inf")

Is it about the added samples, as this thread said?
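
My understanding (not an official answer): outputs.loss is a single scalar, the mean loss over the batch, so gathering it directly yields one number per process rather than one per sample. Repeating it batch_size times produces a per-sample tensor, and after gathering from all processes the slice losses[: len(eval_dataset)] can trim off the duplicate samples that distributed evaluation pads onto the last batch. A commented sketch using the loop's existing variables:

loss = outputs.loss                             # scalar: mean loss over this batch
per_sample = loss.repeat(batch_size)            # shape (batch_size,): one copy per sample
losses.append(accelerator.gather(per_sample))   # shape (batch_size * num_processes,)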

Hi - I would greatly appreciate any advice regarding the following…

I have CUDA 12.2 installed, but the CUDA-enabled PyTorch builds are only offered for 11.8, 12.1 and 12.4. Is this the reason PyTorch is not recognising my CUDA installation, i.e., CUDA 12.2 being incompatible?

The issue is compounded, however, because TensorFlow (TF) also does not recognise my CUDA device.

The result is that I am struggling very much to get my transformer model trained, due to not having access to my NVIDIA GPU.

I definitely have CUDA and a GPU! However, I do not have cuDNN. Can anybody please lend some advice?
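
A couple of quick checks that usually narrow this down (for PyTorch at least): the CUDA version reported by torch.version.cuda is the runtime bundled inside the wheel, not your system toolkit, so a cu121 build is generally fine with a 12.2 driver; and the pip wheels ship their own cuDNN, so a missing system cuDNN should not matter for PyTorch. If torch.version.cuda prints None, you have a CPU-only build and need to reinstall from the CUDA wheel index:

import torch

print(torch.__version__)          # e.g. "2.3.0+cu121"
print(torch.version.cuda)         # CUDA runtime bundled with the wheel; None on CPU-only builds
print(torch.cuda.is_available())  # False -> CPU-only wheel, or a driver problem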

Best wishes,

The dataset has been taken down; could you change it for easier follow-up?

Hi - I am a bit confused about whole_word_masking_data_collator - it doesn't seem like we actually use it in either of the training runs. When I try to use this collator in my trainer, I get an index error on word_ids, on the line word_ids = feature.pop("word_ids") in the function.
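
One thing worth checking (my guess at the cause, not verified against the course notebook): the Trainer by default drops dataset columns that the model's forward() does not accept, so "word_ids" may never reach the collator, and feature.pop("word_ids") then fails. Keeping the unused columns should let the whole-word-masking collator see them; the collator pops "word_ids" before the batch is handed to the model, so the model itself is unaffected:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="distilbert-finetuned-imdb-wwm",  # hypothetical output directory
    remove_unused_columns=False,  # keep "word_ids" so whole_word_masking_data_collator can pop it
)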

Possible mistake in Summarization:
In “Preprocessing the data” section, it says:

The tokenizers in 🤗 Transformers provide a nifty text_target argument that allows you to tokenize the labels in parallel to the inputs. Here is an example of how the inputs and targets are processed for mT5:

Then it provides the code below, but the code doesn't use the text_target argument when tokenizing the labels. Is that a mistake?

max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["review_title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
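
For comparison, a version that actually uses text_target would look like the sketch below (assuming a transformers release recent enough for the tokenizer's __call__ to accept it). For mT5 the two variants should produce the same labels, which may be why the example still works as written:

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        text_target=examples["review_title"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs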

I had the same error. Seems as if the step to create the repository is missing from the tutorial.

# Lastly, to push our model to the Hub, we will need to create a Repository object in a 
# working folder. First log in to Hugging Face, if you’re not logged in already. We’ll 
# determine the repository name from the model ID we want to give our model (feel free 
# to replace the repo_name with your own choice; it just needs to contain your username, 
# which is what the function get_full_repo_name() does):

from huggingface_hub import Repository, get_full_repo_name, create_repo

model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)

# Adding this line resolved the issue for me
# Also be sure to import "create_repo"
# Only need to run once, comment after first run
repo_url = create_repo(repo_id=repo_name)

# Clone
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

Getting an error:

Error: You must call wandb.init() before wandb.log()

This happens in “Main NLP Tasks”, in the section “Fine-tuning a Masked Language Model”, at the following line:

trainer.train()

The solution is to update the training arguments defined a few lines above.

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    report_to="none" # ADD THIS LINE!
)

This is covered in another thread, but the initial answer given there does not work; another poster corrects None to “none” in a response. Other alternative solutions that could be tried are also offered there.


The exercises in this course no longer seem practical, as there have been too many changes in the datasets, models, and Python module functions between when the course was written and today.

I’m now spending more time debugging problems than I am learning the intended topic of each section. I think from here to the end of the course I will just read through the information instead of attempting to solve the bugs in the exercise.

For example, I got an error in the code evaluating the ROUGE score.

AttributeError: 'numpy.float64' object has no attribute 'mid'

This is because the rouge metric no longer returns a collection of low, mid, and high aggregate scores as the course code expects; it now returns a simple dict of ROUGE types (see “ROUGE - a Hugging Face Space by evaluate-metric”).
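
A minimal adjustment, assuming the newer evaluate rouge that returns plain floats and the course notebook's variable names (rouge_score, decoded_preds, decoded_labels):

scores = rouge_score.compute(
    predictions=decoded_preds, references=decoded_labels, use_stemmer=True
)
# The old course code accessed value.mid.fmeasure; the newer metric already returns floats
result = {key: round(value * 100, 2) for key, value in scores.items()}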

Also, the Amazon Reviews dataset is no longer available. I spent quite a bit of time recreating a similar dataset from the Wikipedia dataset so I could continue the course exercise.

# Run once. Can take a very long time.
# Make sure the saved files persist.

from datasets import load_dataset, Dataset, load_from_disk
import random

# Amazon Review Dataset is defunct, need a replacement
# According to the wikipedia dataset, it has the title and text needed where the title can be
# assumed to be the summary of the text. It won't be as good as the original, but will allow
# the exercises of the course to move forward
# https://huggingface.co/datasets/wikimedia/wikipedia
# But it does not have the train, test, and validation splits, and is HUGE.
# So manually faking the splits, renaming columns, and adding a missing column with random values

spanish_dataset_raw = load_dataset(
    path="wikimedia/wikipedia",
    name="20231101.es",
    trust_remote_code=True
)

english_dataset_raw = load_dataset(
    path="wikimedia/wikipedia",
    name="20231101.en",
    trust_remote_code=True
)

# english_dataset = english_dataset_raw
# spanish_dataset = spanish_dataset_raw

# At the time of this writing the english wikipedia dataset was 6.4 million records
# But the amazon reviews dataset was only 200,000/5,000/5,000 for train/valid/test

# Get a smaller portion of records, split into "test" & "train"
english_dataset = english_dataset_raw["train"].train_test_split(test_size=10_000, train_size=200_000)
# Divide the test split in half for "test" and validation
english_dataset_test_split = english_dataset["test"].train_test_split(test_size=0.5, train_size=0.5)
# Assemble the various splits into one dictionary
english_dataset['test'] = english_dataset_test_split['test']
english_dataset['validation'] = english_dataset_test_split['train']



# Repeat for the spanish dataset
spanish_dataset = spanish_dataset_raw["train"].train_test_split(test_size=10_000, train_size=200_000)
# Divide the test split in half for "test" and validation
spanish_dataset_test_split = spanish_dataset["test"].train_test_split(test_size=0.5, train_size=0.5)
# Assemble the various splits into one dictionary
spanish_dataset['test'] = spanish_dataset_test_split['test']
spanish_dataset['validation'] = spanish_dataset_test_split['train']

# add the missing product_category column
product_categories = ["home","apparel","wireless","other","beauty","drugstore","kitchen","toy","sports","automotive","lawn_and_garden","home_improvement","pet_products","digital_ebook_purchase","pc","electronics","office_product","shoes","grocery","book"]

def add_product_category(example):
    example["product_category"] = random.choice(product_categories)
    return example
    
english_dataset = english_dataset.map(add_product_category)
spanish_dataset = spanish_dataset.map(add_product_category)

# Rename columns to match course data
english_dataset = english_dataset.rename_column("text", "review_body")
english_dataset = english_dataset.rename_column("title", "review_title")
spanish_dataset = spanish_dataset.rename_column("text", "review_body")
spanish_dataset = spanish_dataset.rename_column("title", "review_title")

english_dataset['train'][0]

english_dataset.save_to_disk("english_dataset")
spanish_dataset.save_to_disk("spanish_dataset")

# I'm working in Kaggle so I made sure to save a version of the Notebook so that the files persisted between loads

next cell

# If the cell above has been executed and the datasets are available in the notebook's persistent disk memory,
# then do not run the cell above. Run this cell to load from disk.

english_dataset = load_from_disk("english_dataset")
spanish_dataset = load_from_disk("spanish_dataset")

Thank you for sharing these fixes!
