rirv938:
A100
Here is the config. The only difference is that gradient accumulation steps was changed from 1 to 4 for the single-node version (the config below shows `gradient_accumulation_steps` = 4). Both runs were launched using torchrun.
{
"bf16": {
"desc": null,
"value": false
},
"fp16": {
"desc": null,
"value": true
},
"fsdp": {
"desc": null,
"value": []
},
"seed": {
"desc": null,
"value": 100
},
"tf32": {
"desc": null,
"value": null
},
"debug": {
"desc": null,
"value": []
},
"n_ctx": {
"desc": null,
"value": 1024
},
"optim": {
"desc": null,
"value": "adamw_hf"
},
"top_k": {
"desc": null,
"value": 50
},
"top_p": {
"desc": null,
"value": 1
},
"_wandb": {
"desc": null,
"value": {
"m": [
{
"1": "train/global_step",
"6": [
3
]
},
{
"1": "eval/loss",
"5": 1,
"6": [
1
]
},
{
"1": "eval/pairwise_accuracy",
"5": 1,
"6": [
1
]
},
{
"1": "eval/runtime",
"5": 1,
"6": [
1
]
},
{
"1": "eval/samples_per_second",
"5": 1,
"6": [
1
]
},
{
"1": "eval/steps_per_second",
"5": 1,
"6": [
1
]
},
{
"1": "train/loss",
"5": 1,
"6": [
1
]
},
{
"1": "train/grad_norm",
"5": 1,
"6": [
1
]
},
{
"1": "train/learning_rate",
"5": 1,
"6": [
1
]
},
{
"1": "train/epoch",
"5": 1,
"6": [
1
]
}
],
"t": {
"1": [
1,
11,
49,
55
],
"2": [
1,
5,
11,
49,
51,
53,
55,
71,
98
],
"3": [
7,
13,
23,
66
],
"4": "3.10.12",
"5": "0.17.0",
"6": "4.41.2",
"8": [
5
],
"9": {
"1": "transformers_trainer"
},
"13": "linux-x86_64"
},
"framework": "huggingface",
"start_time": 1717389249,
"cli_version": "0.17.0",
"is_jupyter_run": false,
"python_version": "3.10.12",
"is_kaggle_kernel": false,
"huggingface_version": "4.41.2"
}
},
"n_embd": {
"desc": null,
"value": 1024
},
"n_head": {
"desc": null,
"value": 16
},
"prefix": {
"desc": null,
"value": null
},
"do_eval": {
"desc": null,
"value": true
},
"n_inner": {
"desc": null,
"value": null
},
"n_layer": {
"desc": null,
"value": 24
},
"no_cuda": {
"desc": null,
"value": false
},
"use_cpu": {
"desc": null,
"value": false
},
"do_train": {
"desc": null,
"value": false
},
"id2label": {
"desc": null,
"value": {
"0": "LABEL_0"
}
},
"label2id": {
"desc": null,
"value": {
"LABEL_0": 0
}
},
"run_name": {
"desc": null,
"value": "/tmp/reward_model_100_gpt2-medium_433860e1"
},
"use_ipex": {
"desc": null,
"value": false
},
"adafactor": {
"desc": null,
"value": false
},
"data_seed": {
"desc": null,
"value": null
},
"deepspeed": {
"desc": null,
"value": null
},
"do_sample": {
"desc": null,
"value": false
},
"hub_token": {
"desc": null,
"value": "<HUB_TOKEN>"
},
"log_level": {
"desc": null,
"value": "passive"
},
"max_steps": {
"desc": null,
"value": -1
},
"n_special": {
"desc": null,
"value": 0
},
"num_beams": {
"desc": null,
"value": 1
},
"ray_scope": {
"desc": null,
"value": "last"
},
"report_to": {
"desc": null,
"value": [
"wandb"
]
},
"typical_p": {
"desc": null,
"value": 1
},
"use_cache": {
"desc": null,
"value": true
},
"adam_beta1": {
"desc": null,
"value": 0.9
},
"adam_beta2": {
"desc": null,
"value": 0.999
},
"attn_pdrop": {
"desc": null,
"value": 0.1
},
"do_predict": {
"desc": null,
"value": false
},
"embd_pdrop": {
"desc": null,
"value": 0.1
},
"eval_delay": {
"desc": null,
"value": 0
},
"eval_steps": {
"desc": null,
"value": 1953
},
"is_decoder": {
"desc": null,
"value": false
},
"local_rank": {
"desc": null,
"value": 0
},
"max_length": {
"desc": null,
"value": 20
},
"min_length": {
"desc": null,
"value": 0
},
"model_type": {
"desc": null,
"value": "gpt2"
},
"optim_args": {
"desc": null,
"value": null
},
"output_dir": {
"desc": null,
"value": "/tmp/reward_model_100_gpt2-medium_433860e1"
},
"past_index": {
"desc": null,
"value": -1
},
"save_steps": {
"desc": null,
"value": 1953
},
"vocab_size": {
"desc": null,
"value": 50257
},
"ddp_backend": {
"desc": null,
"value": null
},
"ddp_timeout": {
"desc": null,
"value": 1800
},
"fsdp_config": {
"desc": null,
"value": {
"xla": false,
"xla_fsdp_v2": false,
"min_num_params": 0,
"xla_fsdp_grad_ckpt": false
}
},
"label_names": {
"desc": null,
"value": null
},
"logging_dir": {
"desc": null,
"value": "/tmp/reward_model_100_gpt2-medium_433860e1/runs/Jun03_04-34-17_training-bs-fix-master-0"
},
"n_positions": {
"desc": null,
"value": 1024
},
"push_to_hub": {
"desc": null,
"value": false
},
"resid_pdrop": {
"desc": null,
"value": 0.1
},
"return_dict": {
"desc": null,
"value": true
},
"temperature": {
"desc": null,
"value": 1
},
"torch_dtype": {
"desc": null,
"value": null
},
"torchdynamo": {
"desc": null,
"value": null
},
"torchscript": {
"desc": null,
"value": false
},
"adam_epsilon": {
"desc": null,
"value": 1e-8
},
"bos_token_id": {
"desc": null,
"value": 50256
},
"disable_tqdm": {
"desc": null,
"value": false
},
"eos_token_id": {
"desc": null,
"value": 50256
},
"fp16_backend": {
"desc": null,
"value": "auto"
},
"hub_model_id": {
"desc": null,
"value": null
},
"hub_strategy": {
"desc": null,
"value": "every_save"
},
"pad_token_id": {
"desc": null,
"value": 50256
},
"problem_type": {
"desc": null,
"value": null
},
"pruned_heads": {
"desc": null,
"value": {}
},
"sep_token_id": {
"desc": null,
"value": null
},
"summary_type": {
"desc": null,
"value": "cls_index"
},
"use_bfloat16": {
"desc": null,
"value": false
},
"warmup_ratio": {
"desc": null,
"value": 0.05
},
"warmup_steps": {
"desc": null,
"value": 0
},
"weight_decay": {
"desc": null,
"value": 0
},
"_name_or_path": {
"desc": null,
"value": "openai-community/gpt2-medium"
},
"architectures": {
"desc": null,
"value": [
"GPT2LMHeadModel"
]
},
"bad_words_ids": {
"desc": null,
"value": null
},
"eval_strategy": {
"desc": null,
"value": "steps"
},
"jit_mode_eval": {
"desc": null,
"value": false
},
"learning_rate": {
"desc": null,
"value": 0.00004
},
"logging_steps": {
"desc": null,
"value": 1953
},
"max_grad_norm": {
"desc": null,
"value": 1
},
"mp_parameters": {
"desc": null,
"value": ""
},
"output_scores": {
"desc": null,
"value": false
},
"save_strategy": {
"desc": null,
"value": "steps"
},
"split_batches": {
"desc": null,
"value": null
},
"torch_compile": {
"desc": null,
"value": false
},
"tpu_num_cores": {
"desc": null,
"value": null
},
"bf16_full_eval": {
"desc": null,
"value": false
},
"early_stopping": {
"desc": null,
"value": false
},
"fp16_full_eval": {
"desc": null,
"value": false
},
"fp16_opt_level": {
"desc": null,
"value": "O1"
},
"length_penalty": {
"desc": null,
"value": 1
},
"tf_legacy_loss": {
"desc": null,
"value": false
},
"use_mps_device": {
"desc": null,
"value": false
},
"finetuning_task": {
"desc": null,
"value": null
},
"group_by_length": {
"desc": null,
"value": false
},
"hub_always_push": {
"desc": null,
"value": false
},
"num_beam_groups": {
"desc": null,
"value": 1
},
"save_only_model": {
"desc": null,
"value": false
},
"suppress_tokens": {
"desc": null,
"value": null
},
"tokenizer_class": {
"desc": null,
"value": null
},
"dispatch_batches": {
"desc": null,
"value": null
},
"full_determinism": {
"desc": null,
"value": false
},
"hub_private_repo": {
"desc": null,
"value": false
},
"ignore_data_skip": {
"desc": null,
"value": false
},
"log_on_each_node": {
"desc": null,
"value": true
},
"logging_strategy": {
"desc": null,
"value": "steps"
},
"num_train_epochs": {
"desc": null,
"value": 1
},
"save_safetensors": {
"desc": null,
"value": true
},
"save_total_limit": {
"desc": null,
"value": 4
},
"summary_use_proj": {
"desc": null,
"value": true
},
"ddp_bucket_cap_mb": {
"desc": null,
"value": 100
},
"diversity_penalty": {
"desc": null,
"value": 0
},
"greater_is_better": {
"desc": null,
"value": null
},
"initializer_range": {
"desc": null,
"value": 0.02
},
"log_level_replica": {
"desc": null,
"value": "warning"
},
"lr_scheduler_type": {
"desc": null,
"value": "cosine"
},
"output_attentions": {
"desc": null,
"value": false
},
"push_to_hub_token": {
"desc": null,
"value": "<PUSH_TO_HUB_TOKEN>"
},
"save_on_each_node": {
"desc": null,
"value": false
},
"tpu_metrics_debug": {
"desc": null,
"value": false
},
"accelerator_config": {
"desc": null,
"value": {
"even_batches": true,
"non_blocking": false,
"split_batches": false,
"dispatch_batches": null,
"use_seedable_sampler": true,
"gradient_accumulation_kwargs": null
}
},
"batch_eval_metrics": {
"desc": null,
"value": false
},
"is_encoder_decoder": {
"desc": null,
"value": false
},
"layer_norm_epsilon": {
"desc": null,
"value": 0.00001
},
"length_column_name": {
"desc": null,
"value": "length"
},
"logging_first_step": {
"desc": null,
"value": true
},
"repetition_penalty": {
"desc": null,
"value": 1
},
"scale_attn_weights": {
"desc": null,
"value": true
},
"summary_activation": {
"desc": null,
"value": null
},
"torch_compile_mode": {
"desc": null,
"value": null
},
"activation_function": {
"desc": null,
"value": "gelu_new"
},
"add_cross_attention": {
"desc": null,
"value": false
},
"evaluation_strategy": {
"desc": null,
"value": "steps"
},
"forced_bos_token_id": {
"desc": null,
"value": null
},
"forced_eos_token_id": {
"desc": null,
"value": null
},
"fsdp_min_num_params": {
"desc": null,
"value": 0
},
"lr_scheduler_kwargs": {
"desc": null,
"value": {}
},
"neftune_noise_alpha": {
"desc": null,
"value": null
},
"skip_memory_metrics": {
"desc": null,
"value": true
},
"tie_encoder_decoder": {
"desc": null,
"value": false
},
"tie_word_embeddings": {
"desc": null,
"value": true
},
"auto_find_batch_size": {
"desc": null,
"value": false
},
"dataloader_drop_last": {
"desc": null,
"value": false
},
"no_repeat_ngram_size": {
"desc": null,
"value": 0
},
"num_return_sequences": {
"desc": null,
"value": 1
},
"optim_target_modules": {
"desc": null,
"value": null
},
"output_hidden_states": {
"desc": null,
"value": false
},
"overwrite_output_dir": {
"desc": null,
"value": false
},
"prediction_loss_only": {
"desc": null,
"value": false
},
"push_to_hub_model_id": {
"desc": null,
"value": null
},
"task_specific_params": {
"desc": null,
"value": {
"text-generation": {
"do_sample": true,
"max_length": 50
}
}
},
"transformers_version": {
"desc": null,
"value": "4.41.2"
},
"begin_suppress_tokens": {
"desc": null,
"value": null
},
"dataloader_pin_memory": {
"desc": null,
"value": true
},
"ddp_broadcast_buffers": {
"desc": null,
"value": false
},
"metric_for_best_model": {
"desc": null,
"value": null
},
"remove_invalid_values": {
"desc": null,
"value": false
},
"remove_unused_columns": {
"desc": null,
"value": false
},
"summary_first_dropout": {
"desc": null,
"value": 0.1
},
"torch_compile_backend": {
"desc": null,
"value": null
},
"dataloader_num_workers": {
"desc": null,
"value": 12
},
"decoder_start_token_id": {
"desc": null,
"value": null
},
"eval_do_concat_batches": {
"desc": null,
"value": true
},
"gradient_checkpointing": {
"desc": null,
"value": false
},
"half_precision_backend": {
"desc": null,
"value": "auto"
},
"label_smoothing_factor": {
"desc": null,
"value": 0
},
"load_best_model_at_end": {
"desc": null,
"value": false
},
"logging_nan_inf_filter": {
"desc": null,
"value": true
},
"predict_special_tokens": {
"desc": null,
"value": true
},
"resume_from_checkpoint": {
"desc": null,
"value": null
},
"summary_proj_to_labels": {
"desc": null,
"value": true
},
"chunk_size_feed_forward": {
"desc": null,
"value": 0
},
"eval_accumulation_steps": {
"desc": null,
"value": null
},
"per_gpu_eval_batch_size": {
"desc": null,
"value": null
},
"reorder_and_upcast_attn": {
"desc": null,
"value": false
},
"return_dict_in_generate": {
"desc": null,
"value": false
},
"per_gpu_train_batch_size": {
"desc": null,
"value": null
},
"push_to_hub_organization": {
"desc": null,
"value": null
},
"include_tokens_per_second": {
"desc": null,
"value": false
},
"dataloader_prefetch_factor": {
"desc": null,
"value": null
},
"ddp_find_unused_parameters": {
"desc": null,
"value": null
},
"include_inputs_for_metrics": {
"desc": null,
"value": false
},
"per_device_eval_batch_size": {
"desc": null,
"value": 8
},
"use_legacy_prediction_loop": {
"desc": null,
"value": false
},
"cross_attention_hidden_size": {
"desc": null,
"value": null
},
"gradient_accumulation_steps": {
"desc": null,
"value": 4
},
"per_device_train_batch_size": {
"desc": null,
"value": 16
},
"encoder_no_repeat_ngram_size": {
"desc": null,
"value": 0
},
"dataloader_persistent_workers": {
"desc": null,
"value": false
},
"gradient_checkpointing_kwargs": {
"desc": null,
"value": null
},
"include_num_input_tokens_seen": {
"desc": null,
"value": false
},
"scale_attn_by_inverse_layer_idx": {
"desc": null,
"value": false
},
"exponential_decay_length_penalty": {
"desc": null,
"value": null
},
"fsdp_transformer_layer_cls_to_wrap": {
"desc": null,
"value": null
},
"restore_callback_states_from_checkpoint": {
"desc": null,
"value": false
}
}