I followed the example in examples/pytorch/text-classification/run_glue.py and, as described in the README.md, ran this command:
export TASK_NAME=mrpc
python run_glue.py \
--model_name_or_path bert-base-cased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/
But I get a CUDA error during training. The full log is:
2023-07-20 22:13:53.394768: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-20 22:13:53.875943: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
[2023-07-20 22:13:54,224] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
07/20/2023 22:13:54 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 4distributed training: True, 16-bits training: False
07/20/2023 22:13:54 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=2e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/tmp/mrpc/runs/Jul20_22-13-54_Fairfax4way04RTX4090,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
optim=OptimizerNames.ADAMW_HF,
optim_args=None,
output_dir=/tmp/mrpc/,
overwrite_output_dir=True,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=4,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['tensorboard', 'wandb'],
resume_from_checkpoint=None,
run_name=/tmp/mrpc/,
save_on_each_node=False,
save_safetensors=False,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
xpu_backend=None,
)
07/20/2023 22:13:55 - INFO - datasets.info - Loading Dataset Infos from /home/xiaofeng.wu/.cache/huggingface/modules/datasets_modules/datasets/glue/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad
07/20/2023 22:13:55 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
07/20/2023 22:13:55 - INFO - datasets.info - Loading Dataset info from /home/xiaofeng.wu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad
07/20/2023 22:13:55 - WARNING - datasets.builder - Found cached dataset glue (/home/xiaofeng.wu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
07/20/2023 22:13:55 - INFO - datasets.info - Loading Dataset info from /home/xiaofeng.wu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad
100%|████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1536.94it/s]
[INFO|configuration_utils.py:712] 2023-07-20 22:13:55,393 >> loading configuration file config.json from cache at /home/xiaofeng.wu/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-20 22:13:55,395 >> Model config BertConfig {
"_name_or_path": "bert-base-cased",
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"finetuning_task": "mrpc",
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.31.0.dev0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 28996
}
[INFO|configuration_utils.py:712] 2023-07-20 22:13:55,408 >> loading configuration file config.json from cache at /home/xiaofeng.wu/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-20 22:13:55,409 >> Model config BertConfig {
"_name_or_path": "bert-base-cased",
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.31.0.dev0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 28996
}
[INFO|tokenization_utils_base.py:1855] 2023-07-20 22:13:55,409 >> loading file vocab.txt from cache at /home/xiaofeng.wu/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
[INFO|tokenization_utils_base.py:1855] 2023-07-20 22:13:55,409 >> loading file tokenizer.json from cache at /home/xiaofeng.wu/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer.json
[INFO|tokenization_utils_base.py:1855] 2023-07-20 22:13:55,409 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1855] 2023-07-20 22:13:55,409 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:1855] 2023-07-20 22:13:55,409 >> loading file tokenizer_config.json from cache at /home/xiaofeng.wu/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer_config.json
[INFO|configuration_utils.py:712] 2023-07-20 22:13:55,409 >> loading configuration file config.json from cache at /home/xiaofeng.wu/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-20 22:13:55,409 >> Model config BertConfig {
"_name_or_path": "bert-base-cased",
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.31.0.dev0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 28996
}
[INFO|modeling_utils.py:2617] 2023-07-20 22:13:55,433 >> loading weights file model.safetensors from cache at /home/xiaofeng.wu/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/model.safetensors
[INFO|modeling_utils.py:3330] 2023-07-20 22:13:55,890 >> Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[WARNING|modeling_utils.py:3342] 2023-07-20 22:13:55,890 >> Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
07/20/2023 22:13:55 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/xiaofeng.wu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-2a6454e11c1b5865.arrow
07/20/2023 22:13:55 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/xiaofeng.wu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-45c8996e4f22b027.arrow
07/20/2023 22:13:55 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/xiaofeng.wu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1abee9d0411db8a4.arrow
07/20/2023 22:13:55 - INFO - __main__ - Sample 2619 of the training set: {'sentence1': 'The proceedings were taken up with prosecutors outlining their case against Amrozi , reading 33 pages of documents outlining allegations against him .', 'sentence2': 'Proceedings were taken up with prosecutors outlining their case against Amrozi , reading a 33-page accusation letter to the court .', 'label': 1, 'idx': 2916, 'input_ids': [101, 1109, 10830, 1127, 1678, 1146, 1114, 24987, 1149, 13260, 1147, 1692, 1222, 7277, 2180, 5303, 117, 3455, 3081, 5097, 1104, 4961, 1149, 13260, 9966, 1222, 1140, 119, 102, 20661, 1127, 1678, 1146, 1114, 24987, 1149, 13260, 1147, 1692, 1222, 7277, 2180, 5303, 117, 3455, 170, 3081, 118, 3674, 21100, 2998, 1106, 1103, 2175, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.
07/20/2023 22:13:55 - INFO - __main__ - Sample 456 of the training set: {'sentence1': "Chechen officials working for the Moscow-backed government are a frequent target for rebels and tension is running high ahead of next Sunday 's presidential election in war-torn Chechnya .", 'sentence2': "Officials in Chechnya 's Moscow-backed government are a frequent target for rebels , and tension is running high ahead of Sunday 's presidential election in the war-ravaged region .", 'label': 1, 'idx': 509, 'input_ids': [101, 20394, 11252, 1424, 3878, 1684, 1111, 1103, 4116, 118, 5534, 1433, 1132, 170, 6539, 4010, 1111, 9283, 1105, 6646, 1110, 1919, 1344, 3075, 1104, 1397, 3625, 112, 188, 5200, 1728, 1107, 1594, 118, 7820, 20394, 11252, 15449, 119, 102, 9018, 1116, 1107, 20394, 11252, 15449, 112, 188, 4116, 118, 5534, 1433, 1132, 170, 6539, 4010, 1111, 9283, 117, 1105, 6646, 1110, 1919, 1344, 3075, 1104, 3625, 112, 188, 5200, 1728, 1107, 1103, 1594, 118, 187, 15677, 3660, 1805, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.
07/20/2023 22:13:55 - INFO - __main__ - Sample 102 of the training set: {'sentence1': "Standard & Poor 's 500 stock index futures declined 4.40 points to 983.50 , while Nasdaq futures fell 6.5 points to 1,206.50 .", 'sentence2': "The Standard & Poor 's 500 Index was up 1.75 points , or 0.18 percent , to 977.68 .", 'label': 0, 'idx': 116, 'input_ids': [101, 6433, 111, 11767, 112, 188, 2260, 4482, 7448, 2174, 1116, 5799, 125, 119, 1969, 1827, 1106, 5103, 1495, 119, 1851, 117, 1229, 11896, 1116, 1810, 4426, 2174, 1116, 2204, 127, 119, 126, 1827, 1106, 122, 117, 20278, 119, 1851, 119, 102, 1109, 6433, 111, 11767, 112, 188, 2260, 10146, 1108, 1146, 122, 119, 3453, 1827, 117, 1137, 121, 119, 1407, 3029, 117, 1106, 5311, 1559, 119, 5599, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.
[INFO|trainer.py:777] 2023-07-20 22:13:57,241 >> The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.
[INFO|trainer.py:1021] 2023-07-20 22:13:57,244 >> Saving optimizer_grouped_parameter_names checkpoint to /tmp/mrpc/
/home/xiaofeng.wu/prjs/transformers/src/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
[INFO|trainer.py:1729] 2023-07-20 22:13:57,335 >> ***** Running training *****
[INFO|trainer.py:1730] 2023-07-20 22:13:57,335 >> Num examples = 3,668
[INFO|trainer.py:1731] 2023-07-20 22:13:57,335 >> Num Epochs = 3
[INFO|trainer.py:1732] 2023-07-20 22:13:57,335 >> Instantaneous batch size per device = 4
[INFO|trainer.py:1734] 2023-07-20 22:13:57,335 >> Training with DataParallel so batch size has been adjusted to: 16
[INFO|trainer.py:1735] 2023-07-20 22:13:57,335 >> Total train batch size (w. parallel, distributed & accumulation) = 16
[INFO|trainer.py:1736] 2023-07-20 22:13:57,335 >> Gradient Accumulation steps = 1
[INFO|trainer.py:1737] 2023-07-20 22:13:57,336 >> Total optimization steps = 690
[INFO|trainer.py:1738] 2023-07-20 22:13:57,336 >> Number of trainable parameters = 108,311,810
[INFO|integrations.py:716] 2023-07-20 22:13:57,337 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: Tracking run with wandb version 0.15.4
wandb: W&B syncing is set to `offline` in this directory.
wandb: Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.
0%| | 0/690 [00:00<?, ?it/s]/home/xiaofeng.wu/anaconda3/envs/py311/lib/python3.11/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
warnings.warn('Was asked to gather along dimension 0, but all '
0%| | 1/690 [00:04<57:21, 4.99s/it]Traceback (most recent call last):
File "/home/xiaofeng.wu/prjs/transformers/dev/eval_bert/run_glue.py", line 626, in <module>
main()
File "/home/xiaofeng.wu/prjs/transformers/dev/eval_bert/run_glue.py", line 534, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/xiaofeng.wu/prjs/transformers/src/transformers/trainer.py", line 1586, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/home/xiaofeng.wu/prjs/transformers/src/transformers/trainer.py", line 1854, in _inner_training_loop
if (
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
wandb: Waiting for W&B process to finish... (failed 1).
wandb: You can sync this run to the cloud by running:
wandb: wandb sync /home/xiaofeng.wu/prjs/transformers/dev/eval_bert/wandb/offline-run-20230720_221357-uus3tiiq
wandb: Find logs at: ./wandb/offline-run-20230720_221357-uus3tiiq/logs
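The log itself suggests two things: the error message recommends a synchronous rerun with CUDA_LAUNCH_BLOCKING=1, and the integrations line shows how to disable W&B. Here is what I plan to try next to get an accurate stack trace. This is an untested sketch; pinning to a single GPU is my own assumption, to rule out the DataParallel gather path the trainer log mentions (n_gpu=4, batch size adjusted to 16):

# Synchronous kernel launches, so the traceback points at the failing op
# (suggested by the error message itself)
export CUDA_LAUNCH_BLOCKING=1
# Disable the W&B callback to simplify the repro
# (the integrations log line above documents WANDB_DISABLED)
export WANDB_DISABLED=true
# Pin to one GPU to take DataParallel out of the picture (my assumption)
CUDA_VISIBLE_DEVICES=0 python run_glue.py \
--model_name_or_path bert-base-cased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/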
Environment versions:
- Driver Version: 530.30.02
- CUDA Version: 12.1
- torch version: 2.1.0.dev20230708+cu118
- transformers version: 4.31.0.dev0
Thu Jul 20 23:29:56 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 4090 On | 00000000:21:00.0 Off | Off |
| 0% 49C P8 33W / 450W| 6MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce RTX 4090 On | 00000000:22:00.0 Off | Off |
| 0% 39C P8 33W / 480W| 6MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce RTX 4090 On | 00000000:41:00.0 Off | Off |
| 0% 43C P8 43W / 480W| 6MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce RTX 4090 On | 00000000:43:00.0 Off | Off |
| 0% 40C P8 36W / 480W| 6MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
Name: torch
Version: 2.1.0.dev20230708+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /home/xiaofeng.wu/anaconda3/envs/py311/lib/python3.11/site-packages
Requires: filelock, fsspec, jinja2, networkx, pytorch-triton, sympy, typing-extensions
Required-by: accelerate, deepspeed, fairseq, torchaudio, torchvision, triton
Name: transformers
Version: 4.31.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /home/xiaofeng.wu/anaconda3/envs/py311/lib/python3.11/site-packages
Editable project location: /home/xiaofeng.wu/prjs/transformers
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by:
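If the synchronous rerun above doesn't pin down the failing op, a CPU-only run should turn the asynchronous device-side assert into a readable Python exception. --no_cuda is an existing TrainingArguments flag (it appears as no_cuda=False in the parameter dump above); this variant is untested:

# CPU-only run: slower, but surfaces the underlying error as a Python exception
python run_glue.py \
--model_name_or_path bert-base-cased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--no_cuda \
--output_dir /tmp/$TASK_NAME/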