raw_datasets = load_dataset(“conll2003”)
fails with:
Unable to find ‘hf://datasets/conll2003@365071bdbe92d4b8e4f153d7df57706a5be793ac/conll2003/train/0000.parquet’
etc
Looks like it has simply been scrubbed, which rather breaks the example notebook
raw_datasets = load_dataset(“conll2003”)
fails with:
Unable to find ‘hf://datasets/conll2003@365071bdbe92d4b8e4f153d7df57706a5be793ac/conll2003/train/0000.parquet’
etc
Looks like it has simply been scrubbed, which rather breaks the example notebook
When I call to_tf_dataset, I get this error:
/opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197: FutureWarning: In the future `np.object` will be defined as the corresponding NumPy scalar.
(isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[24], line 1
----> 1 tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
2 columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
3 collate_fn=data_collator,
4 shuffle=True,
5 batch_size=16,
6 )
8 tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
9 columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
10 collate_fn=data_collator,
11 shuffle=False,
12 batch_size=16,
13 )
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:381, in TensorflowDatasetMixin.to_tf_dataset(self, columns, batch_size, shuffle, collate_fn, drop_remainder, collate_fn_args, label_cols, dummy_labels, prefetch)
378 retained_columns = [key for key in self.features.keys() if key in cols_to_retain]
379 dataset = self.with_format("numpy", columns=retained_columns)
--> 381 columns_to_dtypes, output_signature = self._get_output_signature(
382 dataset, collate_fn, collate_fn_args, batch_size=batch_size if drop_remainder else None
383 )
384 all_columns = list(columns_to_dtypes.keys())
385 all_dtypes = list(columns_to_dtypes.values())
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:244, in TensorflowDatasetMixin._get_output_signature(dataset, collate_fn, collate_fn_args, batch_size)
242 raise ValueError("Unable to get the output signature because the dataset is empty.")
243 test_batch_size = min(len(dataset), 4)
--> 244 test_batch = dataset[:test_batch_size]
245 test_batch = [{key: value[i] for key, value in test_batch.items()} for i in range(test_batch_size)]
246 test_batch = collate_fn(test_batch, **collate_fn_args)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1764, in Dataset.__getitem__(self, key)
1762 def __getitem__(self, key): # noqa: F811
1763 """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 1764 return self._getitem(
1765 key,
1766 )
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1749, in Dataset._getitem(self, key, decoded, **kwargs)
1747 formatter = get_formatter(format_type, features=self.features, decoded=decoded, **format_kwargs)
1748 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
-> 1749 formatted_output = format_table(
1750 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
1751 )
1752 return formatted_output
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:540, in format_table(table, key, formatter, format_columns, output_all_columns)
538 else:
539 pa_table_to_format = pa_table.drop(col for col in pa_table.column_names if col not in format_columns)
--> 540 formatted_output = formatter(pa_table_to_format, query_type=query_type)
541 if output_all_columns:
542 if isinstance(formatted_output, MutableMapping):
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:285, in Formatter.__call__(self, pa_table, query_type)
283 return self.format_column(pa_table)
284 elif query_type == "batch":
--> 285 return self.format_batch(pa_table)
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:346, in NumpyFormatter.format_batch(self, pa_table)
345 def format_batch(self, pa_table: pa.Table) -> dict:
--> 346 batch = self.numpy_arrow_extractor(**self.np_array_kwargs).extract_batch(pa_table)
347 if self.decoded:
348 batch = self.python_features_decoder.decode_batch(batch)
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in NumpyArrowExtractor.extract_batch(self, pa_table)
159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160 return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in <dictcomp>(.0)
159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160 return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:196, in NumpyArrowExtractor._arrow_array_to_numpy(self, pa_array)
194 array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
195 if len(array) > 0:
--> 196 if any(
197 (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
198 or (isinstance(x, float) and np.isnan(x))
199 for x in array
200 ):
201 return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
202 return np.array(array, copy=False, **self.np_array_kwargs)
File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197, in <genexpr>(.0)
194 array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
195 if len(array) > 0:
196 if any(
--> 197 (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
198 or (isinstance(x, float) and np.isnan(x))
199 for x in array
200 ):
201 return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
202 return np.array(array, copy=False, **self.np_array_kwargs)
File /opt/conda/lib/python3.10/site-packages/numpy/__init__.py:324, in __getattr__(attr)
319 warnings.warn(
320 f"In the future `np.{attr}` will be defined as the "
321 "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
323 if attr in __former_attrs__:
--> 324 raise AttributeError(__former_attrs__[attr])
326 if attr == 'testing':
327 import numpy.testing as testing
AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Chapter 7, “Token classification”, has a compute_metrics() function that uses argmax() instead of softmax(). I have attached it below. I’m wondering why this does not cause problems with gradient back-propagation. Argmax is non-differentiable, so I would assume there’d be problems, but the example clearly works, so I must be missing something here.
link:
code:
import numpy as np
def compute_metrics(eval_preds):
    """Score token-classification predictions against the reference labels.

    ``eval_preds`` is unpacked into ``(logits, labels)``. The predicted class
    for each position is the arg-max of the logits over the last axis. This
    runs on evaluation outputs only; nothing here computes gradients.

    Relies on two names defined outside this function: ``label_names``
    (indexed by class id) and ``metric`` (its ``compute`` method is called
    with ``predictions`` and ``references``).

    Returns a dict with the keys ``precision``, ``recall``, ``f1`` and
    ``accuracy``, read from the ``overall_*`` entries of the metric result.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Keep a prediction only where the matching reference label is not -100,
    # so predictions and references stay aligned position by position.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [
            label_names[pred]
            for pred, tag in zip(pred_row, label_row)
            if tag != -100
        ]
        true_predictions.append(kept)

    all_metrics = metric.compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
I’ve found several issues with the “Training a causal language model from scratch” tutorial, you can find those below:
- Replace Accelerator(fp16=True) with Accelerator(mixed_precision='fp16') to use the updated syntax.
- Use losses.append(accelerator.gather(outputs.loss.view(-1))), since outputs.loss returns a scalar and does not have a shape.
I hope this helps, best.
This seems like a typo to me. In the Question Answering section, inside the function preprocess_training_examples(examples),
there is a loop that finds the end position of the answer; the following code is from the course page.
# Find the start and end of the context
idx = 0
while sequence_ids[idx] != 1:
idx += 1
context_start = idx
while sequence_ids[idx] == 1:
idx += 1
context_end = idx - 1
This gives an index out-of-range error. I think the correct version is
while sequence_ids[idx] != 1:
idx += 1
context_end = idx - 1
When training a causal language model from scratch, I am getting this error:
‘’---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[258], line 3
1 from transformers import Trainer, TrainingArguments
----> 3 args = TrainingArguments(
4 output_dir=“codeparrot-ds”,
5 per_device_train_batch_size=32,
6 per_device_eval_batch_size=32,
7 evaluation_strategy=“steps”,
8 eval_steps=5_000,
9 logging_steps=5_000,
10 gradient_accumulation_steps=8,
11 num_train_epochs=1,
12 weight_decay=0.1,
13 warmup_steps=1_000,
14 lr_scheduler_type=“cosine”,
15 learning_rate=5e-4,
16 save_steps=5_000,
17 fp16=True,
18 push_to_hub=True,
19 )
21 trainer = Trainer(
22 model=model,
23 tokenizer=tokenizer,
(…)
27 eval_dataset=tokenized_datasets[“valid”],
28 )
File :125, in init(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, dataloader_prefetch_factor, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, accelerator_config, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, eval_do_concat_batches, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, 
torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha, optim_target_modules)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\training_args.py:1612, in TrainingArguments.post_init(self)
1600 raise ValueError(“–optim adamw_torch_fused with --fp16 requires PyTorch>2.0”)
1602 if (
1603 self.framework == “pt”
1604 and is_torch_available()
(…)
1610 and (self.fp16 or self.fp16_full_eval)
1611 ):
→ 1612 raise ValueError(
1613 “FP16 Mixed precision training with AMP or APEX (--fp16
) and FP16 half precision evaluation”
1614 " (--fp16_full_eval
) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX)."
1615 )
1617 if (
1618 self.framework == “pt”
1619 and is_torch_available()
(…)
1627 and (self.bf16 or self.bf16_full_eval)
1628 ):
1629 raise ValueError(
1630 “BF16 Mixed precision training with AMP (--bf16
) and BF16 half precision evaluation”
1631 " (--bf16_full_eval
) can only be used on CUDA, XPU (with IPEX), NPU, MLU or CPU/TPU/NeuronCore devices."
1632 )
ValueError: FP16 Mixed precision training with AMP or APEX (--fp16
) and FP16 half precision evaluation (--fp16_full_eval
) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX).
‘’
I can’t train on the whole codeparrot dataset, so I tried to use a subset here before training:
model=model,
tokenizer=tokenizer,
args=args,
data_collator=data_collator,
train_dataset=tokenized_datasets["train"][:10000],
eval_dataset=tokenized_datasets["valid"],
)```
but it gives me this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[36], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1771, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1768 try:
1769 # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
1770 hf_hub_utils.disable_progress_bars()
-> 1771 return inner_training_loop(
1772 args=args,
1773 resume_from_checkpoint=resume_from_checkpoint,
1774 trial=trial,
1775 ignore_keys_for_eval=ignore_keys_for_eval,
1776 )
1777 finally:
1778 hf_hub_utils.enable_progress_bars()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2085, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2082 rng_to_sync = True
2084 step = -1
-> 2085 for step, inputs in enumerate(epoch_iterator):
2086 total_batched_samples += 1
2088 if self.args.include_num_input_tokens_seen:
File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:452, in DataLoaderShard.__iter__(self)
450 # We iterate one batch ahead to check when we are at the end
451 try:
--> 452 current_batch = next(dataloader_iter)
453 except StopIteration:
454 yield
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
672 def _next_data(self):
673 index = self._next_index() # may raise StopIteration
--> 674 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
675 if self._pin_memory:
676 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
49 data = self.dataset.__getitems__(possibly_batched_index)
50 else:
---> 51 data = [self.dataset[idx] for idx in possibly_batched_index]
52 else:
53 data = self.dataset[possibly_batched_index]
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in <listcomp>(.0)
49 data = self.dataset.__getitems__(possibly_batched_index)
50 else:
---> 51 data = [self.dataset[idx] for idx in possibly_batched_index]
52 else:
53 data = self.dataset[possibly_batched_index]
KeyError: 0
How can I modify the preprocessing functions (both train and validation) and the compute_metrics function for SQuAD v2? I am having some difficulties implementing them because of the unanswerable questions. Also, I received “KeyError: no_answer_probabiliy” during evaluation. How can I handle that, and what should I do?
Kiwihead15
You could try to create a new model under your account.
I am confused about a code snippet in the “Fine-tuning a masked language model” section. Why do we need to repeat “loss” in the code below?
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
losses.append(accelerator.gather(loss.repeat(batch_size)))
losses = torch.cat(losses)
losses = losses[: len(eval_dataset)]
try:
perplexity = math.exp(torch.mean(losses))
except OverflowError:
perplexity = float("inf")
Is it about the added samples, as this thread said?