raw_datasets = load_dataset("conll2003")
fails with:
Unable to find 'hf://datasets/conll2003@365071bdbe92d4b8e4f153d7df57706a5be793ac/conll2003/train/0000.parquet'
etc.
Looks like the dataset has simply been scrubbed, which rather breaks the example notebook.
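In case anyone else hits this, a minimal workaround sketch: try the original repo id first and fall back to a mirror. The mirror name "eriktks/conll2003" is an assumption on my part — substitute whichever copy of CoNLL-2003 you can find on the Hub.

```
from datasets import load_dataset

try:
    raw_datasets = load_dataset("conll2003")
except Exception:
    # Fall back to a mirror of the dataset (repo name assumed, check the Hub)
    raw_datasets = load_dataset("eriktks/conll2003")
```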
When I call to_tf_dataset, I get this error.
/opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197: FutureWarning: In the future `np.object` will be defined as the corresponding NumPy scalar.
(isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[24], line 1
----> 1 tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
2 columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
3 collate_fn=data_collator,
4 shuffle=True,
5 batch_size=16,
6 )
8 tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
9 columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
10 collate_fn=data_collator,
11 shuffle=False,
12 batch_size=16,
13 )
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:381, in TensorflowDatasetMixin.to_tf_dataset(self, columns, batch_size, shuffle, collate_fn, drop_remainder, collate_fn_args, label_cols, dummy_labels, prefetch)
378 retained_columns = [key for key in self.features.keys() if key in cols_to_retain]
379 dataset = self.with_format("numpy", columns=retained_columns)
--> 381 columns_to_dtypes, output_signature = self._get_output_signature(
382 dataset, collate_fn, collate_fn_args, batch_size=batch_size if drop_remainder else None
383 )
384 all_columns = list(columns_to_dtypes.keys())
385 all_dtypes = list(columns_to_dtypes.values())
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:244, in TensorflowDatasetMixin._get_output_signature(dataset, collate_fn, collate_fn_args, batch_size)
242 raise ValueError("Unable to get the output signature because the dataset is empty.")
243 test_batch_size = min(len(dataset), 4)
--> 244 test_batch = dataset[:test_batch_size]
245 test_batch = [{key: value[i] for key, value in test_batch.items()} for i in range(test_batch_size)]
246 test_batch = collate_fn(test_batch, **collate_fn_args)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1764, in Dataset.__getitem__(self, key)
1762 def __getitem__(self, key): # noqa: F811
1763 """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 1764 return self._getitem(
1765 key,
1766 )
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1749, in Dataset._getitem(self, key, decoded, **kwargs)
1747 formatter = get_formatter(format_type, features=self.features, decoded=decoded, **format_kwargs)
1748 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
-> 1749 formatted_output = format_table(
1750 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
1751 )
1752 return formatted_output
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:540, in format_table(table, key, formatter, format_columns, output_all_columns)
538 else:
539 pa_table_to_format = pa_table.drop(col for col in pa_table.column_names if col not in format_columns)
--> 540 formatted_output = formatter(pa_table_to_format, query_type=query_type)
541 if output_all_columns:
542 if isinstance(formatted_output, MutableMapping):
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:285, in Formatter.__call__(self, pa_table, query_type)
283 return self.format_column(pa_table)
284 elif query_type == "batch":
--> 285 return self.format_batch(pa_table)
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:346, in NumpyFormatter.format_batch(self, pa_table)
345 def format_batch(self, pa_table: pa.Table) -> dict:
--> 346 batch = self.numpy_arrow_extractor(**self.np_array_kwargs).extract_batch(pa_table)
347 if self.decoded:
348 batch = self.python_features_decoder.decode_batch(batch)
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in NumpyArrowExtractor.extract_batch(self, pa_table)
159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160 return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in <dictcomp>(.0)
159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160 return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:196, in NumpyArrowExtractor._arrow_array_to_numpy(self, pa_array)
194 array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
195 if len(array) > 0:
--> 196 if any(
197 (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
198 or (isinstance(x, float) and np.isnan(x))
199 for x in array
200 ):
201 return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
202 return np.array(array, copy=False, **self.np_array_kwargs)
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197, in <genexpr>(.0)
194 array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
195 if len(array) > 0:
196 if any(
--> 197 (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
198 or (isinstance(x, float) and np.isnan(x))
199 for x in array
200 ):
201 return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
202 return np.array(array, copy=False, **self.np_array_kwargs)
File /opt/conda/lib/python3.10/site-packages/numpy/__init__.py:324, in __getattr__(attr)
319 warnings.warn(
320 f"In the future `np.{attr}` will be defined as the "
321 "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
323 if attr in __former_attrs__:
--> 324 raise AttributeError(__former_attrs__[attr])
326 if attr == 'testing':
327 import numpy.testing as testing
AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
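For reference, this traceback comes from an older datasets release combined with NumPy ≥ 1.24, where the deprecated np.object alias was finally removed. A sketch of two possible workarounds (the version pins are assumptions about compatibility, not tested guarantees):

```
import numpy as np
import datasets

# Either upgrade datasets (newer releases no longer reference np.object):
#     pip install -U datasets
# or pin NumPy below 1.24, where the alias still existed:
#     pip install "numpy<1.24"
print("numpy:", np.__version__, "| datasets:", datasets.__version__)   # check what you have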
Chapter 7, Token Classification, has a compute_metrics() function that uses argmax() instead of softmax(); I have attached it below. I'm wondering why this does not cause problems with gradient back-propagation. Argmax is non-differentiable, so I would assume there'd be problems, but the example clearly works, so I must be missing something here.
code:
import numpy as np

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
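For context, compute_metrics never participates in backpropagation: the Trainer only calls it at evaluation time on detached numpy arrays, while the loss that is backpropagated is the differentiable cross-entropy the model computes from its logits. A minimal sketch of the two separate paths:

```
import torch

logits = torch.randn(2, 5, 9, requires_grad=True)   # (batch, seq_len, num_labels)
labels = torch.randint(0, 9, (2, 5))

# Training path: differentiable cross-entropy on the logits
loss = torch.nn.functional.cross_entropy(logits.view(-1, 9), labels.view(-1))
loss.backward()                                      # gradients flow through the loss only

# Metrics path: argmax on detached arrays, no gradients involved
preds = logits.detach().numpy().argmax(-1)
print(preds.shape)                                   # (2, 5)
```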
I've found several issues with the "Training a causal language model from scratch" tutorial; you can find them below:
- Replace Accelerator(fp16=True) with Accelerator(mixed_precision='fp16') to use the updated syntax.
- Use losses.append(accelerator.gather(outputs.loss.view(-1))), since outputs.loss returns a scalar and does not have a shape.

I hope this helps, best.
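Here is a quick runnable sketch of both points, outside the notebook's training loop (on a GPU machine you would pass mixed_precision="fp16" in place of the old Accelerator(fp16=True)):

```
import torch
from accelerate import Accelerator

accelerator = Accelerator()              # or Accelerator(mixed_precision="fp16") on a CUDA machine

loss = torch.tensor(2.5)                 # outputs.loss is a 0-dim scalar like this
print(loss.shape, loss.view(-1).shape)   # torch.Size([]) vs. torch.Size([1])
losses = [accelerator.gather(loss.view(-1))]   # gather a tensor that actually has a shape
```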
This seems like a typo to me. In the Question Answering section, inside the function preprocess_training_examples(examples), there is a loop that finds the end position of the answer; the following code is on the course page:
# Find the start and end of the context
idx = 0
while sequence_ids[idx] != 1:
    idx += 1
context_start = idx
while sequence_ids[idx] == 1:
    idx += 1
context_end = idx - 1
This gives an index out-of-range error. I think the correct version is
while sequence_ids[idx] != 1:
    idx += 1
context_end = idx - 1
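For what it's worth, another way around the out-of-range error is to bound the second loop, which handles truncated inputs whose last token still belongs to the context. A minimal sketch with a made-up toy sequence_ids:

```
# Toy sequence_ids for a truncated question+context pair that ends inside the context
sequence_ids = [None, 0, 0, None, 1, 1, 1]

idx = 0
while sequence_ids[idx] != 1:
    idx += 1
context_start = idx
while idx < len(sequence_ids) and sequence_ids[idx] == 1:
    idx += 1
context_end = idx - 1

print(context_start, context_end)  # 4 6
```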
When training a causal language model from scratch, I am getting this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[258], line 3
1 from transformers import Trainer, TrainingArguments
----> 3 args = TrainingArguments(
4 output_dir="codeparrot-ds",
5 per_device_train_batch_size=32,
6 per_device_eval_batch_size=32,
7 evaluation_strategy="steps",
8 eval_steps=5_000,
9 logging_steps=5_000,
10 gradient_accumulation_steps=8,
11 num_train_epochs=1,
12 weight_decay=0.1,
13 warmup_steps=1_000,
14 lr_scheduler_type="cosine",
15 learning_rate=5e-4,
16 save_steps=5_000,
17 fp16=True,
18 push_to_hub=True,
19 )
21 trainer = Trainer(
22 model=model,
23 tokenizer=tokenizer,
(...)
27 eval_dataset=tokenized_datasets["valid"],
28 )
File :125, in init(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, dataloader_prefetch_factor, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, accelerator_config, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, eval_do_concat_batches, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha, optim_target_modules)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\training_args.py:1612, in TrainingArguments.__post_init__(self)
1600 raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0")
1602 if (
1603 self.framework == "pt"
1604 and is_torch_available()
(...)
1610 and (self.fp16 or self.fp16_full_eval)
1611 ):
--> 1612 raise ValueError(
1613 "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
1614 " (`--fp16_full_eval`) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX)."
1615 )
1617 if (
1618 self.framework == "pt"
1619 and is_torch_available()
(...)
1627 and (self.bf16 or self.bf16_full_eval)
1628 ):
1629 raise ValueError(
1630 "BF16 Mixed precision training with AMP (`--bf16`) and BF16 half precision evaluation"
1631 " (`--bf16_full_eval`) can only be used on CUDA, XPU (with IPEX), NPU, MLU or CPU/TPU/NeuronCore devices."
1632 )
ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX).
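The error itself says fp16 needs a CUDA (or similar) device, so it usually means the notebook is running on CPU. A minimal sketch of one way to guard against that (only the fp16 line differs from the course's arguments; the rest is elided):

```
import torch
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    fp16=torch.cuda.is_available(),   # only request mixed precision when a GPU is present
    # ... keep the remaining arguments from the snippet above ...
)
print(args.fp16)
```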
I can't train on the whole codeparrot dataset, so I tried to use a subset before training:
```
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"][:10000],
    eval_dataset=tokenized_datasets["valid"],
)
```
but it gives me this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[36], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1771, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1768 try:
1769 # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
1770 hf_hub_utils.disable_progress_bars()
-> 1771 return inner_training_loop(
1772 args=args,
1773 resume_from_checkpoint=resume_from_checkpoint,
1774 trial=trial,
1775 ignore_keys_for_eval=ignore_keys_for_eval,
1776 )
1777 finally:
1778 hf_hub_utils.enable_progress_bars()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2085, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2082 rng_to_sync = True
2084 step = -1
-> 2085 for step, inputs in enumerate(epoch_iterator):
2086 total_batched_samples += 1
2088 if self.args.include_num_input_tokens_seen:
File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:452, in DataLoaderShard.__iter__(self)
450 # We iterate one batch ahead to check when we are at the end
451 try:
--> 452 current_batch = next(dataloader_iter)
453 except StopIteration:
454 yield
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
672 def _next_data(self):
673 index = self._next_index() # may raise StopIteration
--> 674 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
675 if self._pin_memory:
676 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
49 data = self.dataset.__getitems__(possibly_batched_index)
50 else:
---> 51 data = [self.dataset[idx] for idx in possibly_batched_index]
52 else:
53 data = self.dataset[possibly_batched_index]
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in <listcomp>(.0)
49 data = self.dataset.__getitems__(possibly_batched_index)
50 else:
---> 51 data = [self.dataset[idx] for idx in possibly_batched_index]
52 else:
53 data = self.dataset[possibly_batched_index]
KeyError: 0
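In case it helps: slicing a Dataset with [:10000] returns a plain dict of lists rather than a Dataset, so the DataLoader's row indexing fails with KeyError: 0. A minimal sketch using Dataset.select instead, reusing the objects from the snippet above:

```
from transformers import Trainer

small_train = tokenized_datasets["train"].select(range(10_000))  # still a Dataset, not a dict

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=small_train,
    eval_dataset=tokenized_datasets["valid"],
)
```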
How can I modify the preprocessing functions (both train and validation) and the compute_metrics function for SQuAD v2? I am having some difficulties implementing them because of the unanswerable questions. I also received "KeyError: no_answer_probability" during evaluation. How can I handle that? What should I do?
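A minimal sketch of the prediction format the squad_v2 metric expects (assuming the evaluate library's "squad_v2" metric; the ids and answers below are made-up examples): each prediction needs a "no_answer_probability" field in addition to "id" and "prediction_text", which is what the KeyError is about. Setting it to 0.0 treats every question as answerable; a real pipeline would derive it from the null-answer score.

```
import evaluate

metric = evaluate.load("squad_v2")

predicted_answers = [
    {"id": "q1", "prediction_text": "France", "no_answer_probability": 0.0},
    {"id": "q2", "prediction_text": "", "no_answer_probability": 1.0},  # predicted unanswerable
]
theoretical_answers = [
    {"id": "q1", "answers": {"text": ["France"], "answer_start": [12]}},
    {"id": "q2", "answers": {"text": [], "answer_start": []}},          # gold: unanswerable
]
print(metric.compute(predictions=predicted_answers, references=theoretical_answers))
```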
Kiwihead15
You could try to create a new model under your account.
I'm confused about a code snippet in the "Fine-tuning a masked language model" part. Why do we need to repeat the loss in the code below?
for step, batch in enumerate(eval_dataloader):
    with torch.no_grad():
        outputs = model(**batch)

    loss = outputs.loss
    losses.append(accelerator.gather(loss.repeat(batch_size)))

losses = torch.cat(losses)
losses = losses[: len(eval_dataset)]
try:
    perplexity = math.exp(torch.mean(losses))
except OverflowError:
    perplexity = float("inf")
Is it about adding samples, as this thread said?
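My understanding (a sketch, not from the course text): outputs.loss is the mean over the batch, so repeating it batch_size times yields one loss value per example. That keeps the final mean correct after accelerator.gather() pads the last batch across processes and the concatenated tensor is truncated to len(eval_dataset).

```
import torch

# Toy illustration of what the repeat produces
batch_size = 4
loss = torch.tensor(2.5)                 # per-batch mean loss (a 0-dim scalar)
per_example = loss.repeat(batch_size)    # tensor([2.5000, 2.5000, 2.5000, 2.5000])
print(per_example.shape)                 # torch.Size([4])
```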
Hi - I would greatly appreciate any advice regarding the following…
I have CUDA 12.2 installed, but the CUDA-enabled PyTorch builds only target 11.8, 12.1 and 12.4. Is this the reason PyTorch is not recognising my CUDA installation, i.e., is CUDA 12.2 incompatible?
The issue goes further, however, because TensorFlow (TF) also does not recognise my CUDA device.
The result is that I am struggling very much to get my transformer model trained, because I cannot access my NVIDIA GPU.
I definitely have CUDA and a GPU! However, I do not have cuDNN. Can anybody please lend some advice?
Best wishes,
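For the CUDA question above, a minimal diagnostic sketch (my understanding, not official guidance: the PyTorch wheel bundles its own CUDA runtime, so the system toolkit being 12.2 is usually not the blocker — a CPU-only wheel or an outdated NVIDIA driver is the more common cause, and cuDNN ships inside the CUDA-enabled wheels as well):

```
import torch

# Check whether the installed PyTorch build is CUDA-enabled and can see the GPU
print(torch.__version__)                      # e.g. "2.x.x+cu121" (CUDA build) vs. "2.x.x+cpu"
print(torch.version.cuda)                     # CUDA runtime the wheel was built against (None for CPU builds)
print(torch.cuda.is_available())              # True only if a compatible driver and GPU are visible
print(torch.backends.cudnn.is_available())    # cuDNN bundled with the CUDA wheels
```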
The dataset has been taken down; could you change it so the course is easier to follow?
Hi - I am a bit confused about whole_word_masking_data_collator: it doesn't seem like we actually use it in either of the training runs. When I try to use this collator in my trainer, I get an index error on word_ids at the line word_ids = feature.pop("word_ids") in the function.
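One likely cause, offered as an assumption rather than a confirmed fix: the Trainer drops dataset columns the model doesn't accept, including word_ids, before the collator ever sees them, so using this collator with Trainer needs remove_unused_columns=False. A minimal sketch reusing the notebook's names (model, downsampled_dataset, whole_word_masking_data_collator):

```
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="distilbert-finetuned-imdb-wwm",   # hypothetical output dir
    remove_unused_columns=False,                  # keep the "word_ids" column for the custom collator
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=whole_word_masking_data_collator,
)
```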
Possible mistake in Summarization:
In the "Preprocessing the data" section, it says:
The tokenizers in Transformers provide a nifty text_target argument that allows you to tokenize the labels in parallel to the inputs. Here is an example of how the inputs and targets are processed for mT5:
Then it provides the below code, but the code doesn't use the text_target argument for tokenizing the labels. Is that a mistake?
```
max_input_length = 512
max_target_length = 30

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["review_title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
```
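For comparison, a sketch of what the text_target version would look like, assuming the same tokenizer and column names as the snippet above:

```
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    # Tokenize the labels via the text_target argument mentioned in the text
    labels = tokenizer(
        text_target=examples["review_title"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
```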
I had the same error. Seems as if the step to create the repository is missing from the tutorial.
# Lastly, to push our model to the Hub, we will need to create a Repository object in a
# working folder. First log in to Hugging Face, if you’re not logged in already. We’ll
# determine the repository name from the model ID we want to give our model (feel free
# to replace the repo_name with your own choice; it just needs to contain your username,
# which is what the function get_full_repo_name() does):
from huggingface_hub import Repository, get_full_repo_name, create_repo
model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
# Adding this line resolved the issue for me
# Also be sure to import "create_repo"
# Only need to run once, comment after first run
repo_url = create_repo(repo_id=repo_name)
# Clone
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
Getting an error in "Main NLP Tasks", section "Fine-tuning a masked language model", at the following line:
trainer.train()
Error: You must call wandb.init() before wandb.log()
The solution is to update the training arguments defined a few lines above:
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    report_to="none",  # ADD THIS LINE!
)
This is covered in another thread, but an initial non-working answer is given there, with another poster correcting None to "none" in a response. Other alternative solutions are also offered that could be tried.
The exercises in this course no longer seem practical, as there have been too many changes in the datasets, models, and Python library functions between when it was written and today.
I'm now spending more time debugging problems than learning the intended topic of each section. I think from here to the end of the course I will just read through the material instead of attempting to fix the bugs in the exercises.
For example, I got an error in the code evaluating the ROUGE score:
AttributeError: 'numpy.float64' object has no attribute 'mid'
This is because the rouge metric no longer returns a collection of low/mid/high aggregate scores as the code in the course expects; it now returns a simple dict of ROUGE types (see ROUGE - a Hugging Face Space by evaluate-metric).
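A minimal sketch of adapting to the current output, which is a flat dict of floats rather than objects with .mid (keys assumed to be the standard rouge1/rouge2/rougeL/rougeLsum):

```
import evaluate

rouge_score = evaluate.load("rouge")
scores = rouge_score.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
)
# No more value.mid.fmeasure — the values are already the aggregated F-measures
result = {key: round(value * 100, 4) for key, value in scores.items()}
print(result)
```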
Also, the Amazon Reviews dataset is no longer available. I spent quite a bit of time recreating a similar dataset from the Wikipedia dataset so I could continue the course exercise.
# Run once. Can take a very long time.
# Make sure the saved files persist.
from datasets import load_dataset, Dataset, load_from_disk
import random
# Amazon Review Dataset is defunct, need a replacement
# Looking at the wikipedia dataset, it has the title and text needed, where the title can be
# assumed to be a summary of the text. It won't be as good as the original, but it will allow
# the exercises of the course to move forward
# https://huggingface.co/datasets/wikimedia/wikipedia
# But it does not have the train, test, and validation splits, and is HUGE.
# So manually faking the splits, renaming columns, and adding a missing column with random values
spanish_dataset_raw = load_dataset(
path="wikimedia/wikipedia",
name="20231101.es",
trust_remote_code=True
)
english_dataset_raw = load_dataset(
path="wikimedia/wikipedia",
name="20231101.en",
trust_remote_code=True
)
# english_dataset = english_dataset_raw
# spanish_dataset = spanish_dataset_raw
# At the time of this writing the English wikipedia dataset was 6.4 million records,
# but the amazon reviews dataset was only 200,000/5,000/5,000 for train/valid/test.
# Get a smaller portion of records, split into "test" & "train"
english_dataset = english_dataset_raw["train"].train_test_split(test_size=10_000, train_size=200_000)
# Divide the test split in half for "test" and validation
english_dataset_test_split = english_dataset["test"].train_test_split(test_size=0.5, train_size=0.5)
# Assemble the various splits into one dictionary
english_dataset['test'] = english_dataset_test_split['test']
english_dataset['validation'] = english_dataset_test_split['train']
# Repeat for the spanish dataset
spanish_dataset = spanish_dataset_raw["train"].train_test_split(test_size=10_000, train_size=200_000)
# Divide the test split in half for "test" and validation
spanish_dataset_test_split = spanish_dataset["test"].train_test_split(test_size=0.5, train_size=0.5)
# Assemble the various splits into one dictionary
spanish_dataset['test'] = spanish_dataset_test_split['test']
spanish_dataset['validation'] = spanish_dataset_test_split['train']
# add the missing product_category column
product_categories = ["home","apparel","wireless","other","beauty","drugstore","kitchen","toy","sports","automotive","lawn_and_garden","home_improvement","pet_products","digital_ebook_purchase","pc","electronics","office_product","shoes","grocery","book"]
def add_product_category(example):
    example["product_category"] = random.choice(product_categories)
    return example
english_dataset = english_dataset.map(add_product_category)
spanish_dataset = spanish_dataset.map(add_product_category)
# Rename columns to match course data
english_dataset = english_dataset.rename_column("text", "review_body")
english_dataset = english_dataset.rename_column("title", "review_title")
spanish_dataset = spanish_dataset.rename_column("text", "review_body")
spanish_dataset = spanish_dataset.rename_column("title", "review_title")
english_dataset['train'][0]
english_dataset.save_to_disk("english_dataset")
spanish_dataset.save_to_disk("spanish_dataset")
# I'm working in Kaggle, so I made sure to save a version of the notebook so that the files persisted between loads
Next cell:
# If the cell above has been executed and the datasets are available in the notebook's persistent disk storage,
# then do not run the cell above; run this cell to load from disk
english_dataset = load_from_disk("english_dataset")
spanish_dataset = load_from_disk("spanish_dataset")
Thank you for sharing these fixes!