Hi all,
I have a Python script that trains a Hugging Face model. However, when it comes to pushing the model to the Hub, I run into an error related to Git LFS locking. Could anyone help me fix it, please?
Here are the error logs:
Training completed. Do not forget to share your model on huggingface.co/models =)
Saving model checkpoint to /tmp/models/MODEL_NAME
Configuration saved in /tmp/models/MODEL_NAME/config.json
Configuration saved in /tmp/models/MODEL_NAME/generation_config.json
Model weights saved in /tmp/models/MODEL_NAME/pytorch_model.bin
tokenizer config file saved in /tmp/models/MODEL_NAME/tokenizer_config.json
Special tokens file saved in /tmp/models/MODEL_NAME/special_tokens_map.json
Remote "origin" does not support the Git LFS locking API. Consider disabling it with:
$ git config lfs.https://user:TOKEN@huggingface.co/ORGANIZATION_NAME/MODEL_NAME.git/info/lfs.locksverify false
batch response: Post "https://user:***@huggingface.co/ORGANIZATION_NAME/MODEL_NAME.git/info/lfs/objects/batch": tls: failed to verify certificate: x509: certificate signed by unknown authority
error: failed to push some refs to 'https://huggingface.co/ORGANIZATION_NAME/MODEL_NAME'
WARNING:huggingface_hub.repository:Remote "origin" does not support the Git LFS locking API. Consider disabling it with:
$ git config lfs.https://user:TOKEN@huggingface.co/ORGANIZATION_NAME/MODEL_NAME.git/info/lfs.locksverify false
batch response: Post "https://user:***@huggingface.co/ORGANIZATION_NAME/MODEL_NAME.git/info/lfs/objects/batch": tls: failed to verify certificate: x509: certificate signed by unknown authority
error: failed to push some refs to 'https://huggingface.co/ORGANIZATION_NAME/MODEL_NAME'
[other reports on model performance]
Traceback (most recent call last):
File "/path-to-my-repo/src/train_model.py", line 5, in <module>
CLI(train_model)
File "/opt/conda/lib/python3.10/site-packages/jsonargparse/_cli.py", line 85, in CLI
return _run_component(component, cfg_init)
File "/opt/conda/lib/python3.10/site-packages/jsonargparse/_cli.py", line 147, in _run_component
return component(**cfg)
File "/path-to-my-repo/my-package/llms/training.py", line 213, in train_model
trainer.push_to_hub(
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 3698, in push_to_hub
git_head_commit_url = self.repo.push_to_hub(
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub-0.16.4-py3.8.egg/huggingface_hub/repository.py", line 1307, in push_to_hub
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub-0.16.4-py3.8.egg/huggingface_hub/repository.py", line 1102, in git_push
OSError: Remote "origin" does not support the Git LFS locking API. Consider disabling it with:
$ git config lfs.https://user:TOKEN@huggingface.co/ORGANIZATION_NAME/MODEL_NAME.git/info/lfs.locksverify false
batch response: Post "https://user:***@huggingface.co/ORGANIZATION_NAME/MODEL_NAME.git/info/lfs/objects/batch": tls: failed to verify certificate: x509: certificate signed by unknown authority
error: failed to push some refs to 'https://huggingface.co/ORGANIZATION_NAME/MODEL_NAME'
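For what it's worth, the warning itself suggests disabling the LFS locking check, but the "x509: certificate signed by unknown authority" part makes me think the real blocker is TLS: git-lfs does not trust whatever certificate it is being served (typically a proxy or self-signed certificate sitting in between). Below is a sketch of what I am considering running from the script before the push; the repo path and CA bundle path are placeholders, and I am not sure whether the locking workaround, the CA bundle, or both are actually needed:

import subprocess

# Hedged sketch, not a confirmed fix. REPO_DIR and CA_BUNDLE are placeholders
# for my local clone (the Trainer's output_dir) and for a CA bundle that
# contains the certificate actually presented to the client.
REPO_DIR = "/tmp/models/MODEL_NAME"
CA_BUNDLE = "/etc/ssl/certs/ca-certificates.crt"

# 1) The workaround the warning suggests: disable the Git LFS locking API
#    check for this remote (same config key as in the warning, with my real
#    remote URL substituted in).
subprocess.run(
    [
        "git", "config",
        "lfs.https://user:TOKEN@huggingface.co/ORGANIZATION_NAME/MODEL_NAME.git/info/lfs.locksverify",
        "false",
    ],
    cwd=REPO_DIR,
    check=True,
)

# 2) If the TLS failure is the real cause, point git (and git-lfs, which I
#    believe reuses git's HTTP settings) at the right CA bundle.
subprocess.run(
    ["git", "config", "http.sslCAInfo", CA_BUNDLE],
    cwd=REPO_DIR,
    check=True,
)

Does that look like the right direction, or is the locksverify warning a red herring here?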
And here is my training code, in case it provides additional insight:
def train_model(
    model_id: str,
    ds_name: str,
    ds_config: str = 'default',
    learning_rate: float = 5e-5,
    max_steps: int = -1,
    num_train_epochs: int = 40,
    batch_size: int = 128,
    batch_size_tokenizer: int = 512,
    gradient_accumulation_steps: int = 4,
    hub_token: Optional[str] = None,
    organization: Optional[str] = None,
    output_dir: str = "./models/",
    tokenizer: AutoTokenizer | str = "seyonec/ChemBERTa-zinc-base-v1",
    pretrained_encoder: str = "seyonec/ChemBERTa-zinc-base-v1",
    pretrained_decoder: str = "seyonec/ChemBERTa-zinc-base-v1",
    encoder_max_length: int = 512,
    decoder_max_length: int = 512,
    tie_encoder_decoder: bool = False,
    delete_repo_first: bool = False,
    training_args: Optional[Seq2SeqTrainingArguments] = None,
    resume_from_checkpoint: Optional[str] = None,
    optuna_n_trials: int = 0,
):
    """Trains a model on a given dataset.

    Args:
        ...
    """
    if hub_token is not None:
        hf.login(token=hub_token)

    # Setup output directory and Hugging Face repository
    output_dir += f"/{model_id}"
    if organization is not None:
        hub_model_id = f"{organization}/{model_id}"
        if delete_repo_first:
            delete_hf_repository(repo_id=hub_model_id, token=hub_token)
    else:
        hub_model_id = None

    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    elif tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_encoder)

    dataset_tokenized = load_tokenized_dataset(
        ds_name,
        ds_config,
        tokenizer,
        batch_size_tokenizer,
        encoder_max_length,
        decoder_max_length,
        token=hub_token,
    )

    per_device_batch_size = batch_size // gradient_accumulation_steps
    if training_args is None:
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            # Optimizer-related configs
            learning_rate=learning_rate,
            optim="adamw_torch",
            lr_scheduler_type="cosine",  # Default: "linear"
            # Generation configs
            predict_with_generate=True,
            generation_num_beams=1,  # Greedy strategy
            # Batch size and device configs
            per_device_train_batch_size=per_device_batch_size,
            per_device_eval_batch_size=per_device_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            auto_find_batch_size=True,
            # torch_compile=True,
            fp16=True,
            # Evaluation and checkpointing configs
            evaluation_strategy="steps",
            max_steps=max_steps,
            num_train_epochs=num_train_epochs,
            eval_steps=100,
            save_steps=200,
            # eval_steps=7500,
            # warmup_steps=2000,
            save_strategy="steps",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="valid_smiles",
            # Logging configs
            log_level="info",
            logging_steps=50,
            disable_tqdm=True,
            # Hub information configs
            push_to_hub=True,  # NOTE: Done manually further down
            hub_token=hub_token,
            hub_model_id=hub_model_id,
            hub_strategy="checkpoint",  # NOTE: Allows to resume training from last checkpoint
            hub_private_repo=True,
            report_to=["tensorboard"],
            # Other configs
            seed=42,
            data_seed=42,
        )

    rouge = evaluate.load("rouge")
    metric = partial(
        compute_metrics_with_chem,
        rouge=rouge,
        tokenizer=tokenizer,
    )
    bert2bert = lambda: get_model(
        pretrained_encoder=pretrained_encoder,
        pretrained_decoder=pretrained_decoder,
        max_length=encoder_max_length,
        tie_encoder_decoder=tie_encoder_decoder,
    )
    trainer = Seq2SeqTrainer(
        model_init=bert2bert,
        tokenizer=tokenizer,
        args=training_args,
        compute_metrics=metric,
        train_dataset=dataset_tokenized["train"],
        eval_dataset=dataset_tokenized["test"],
    )

    if optuna_n_trials > 0:
        def optuna_hp_space(trial):
            return {
                "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
                "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]),
                "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine"]),
            }

        def compute_objective(metrics: Dict[str, float]):
            return metrics["eval_loss"], metrics["eval_reassembly"]

        best_trials = trainer.hyperparameter_search(
            direction=["minimize", "maximize"],
            backend="optuna",
            hp_space=optuna_hp_space,
            n_trials=optuna_n_trials,
            compute_objective=compute_objective,
        )
        print("-" * 80)
        print(f"Best trials:\n{best_trials}")
        print("-" * 80)
    else:
        trainer.train(
            resume_from_checkpoint=resume_from_checkpoint,  # "last-checkpoint",
        )

    if hub_model_id is not None:
        trainer.push_to_hub(
            commit_message="Initial version",
            model_name=hub_model_id,
            license="mit",
            finetuned_from=f"{pretrained_encoder}",
            tasks=["Text2Text Generation"],
            dataset=ds_name,
            dataset_args=ds_config,
        )
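As a last resort, I was also wondering whether I could sidestep the git/git-lfs push entirely and upload the saved checkpoint over the plain HTTP API, roughly as sketched below (repo id and folder are placeholders, and I assume upload_folder is available in my huggingface_hub 0.16.4). Would that be a reasonable workaround, or would I lose things that trainer.push_to_hub handles for me, such as the generated model card?

from huggingface_hub import HfApi

# Hypothetical alternative: upload the files saved by the Trainer over HTTP,
# bypassing the local git clone and git-lfs entirely. Values are placeholders.
api = HfApi(token=hub_token)  # same token passed to train_model
api.upload_folder(
    folder_path="/tmp/models/MODEL_NAME",     # the Trainer's output_dir
    repo_id="ORGANIZATION_NAME/MODEL_NAME",   # placeholder repo id
    repo_type="model",
    commit_message="Initial version",
)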
Thanks in advance for your time and support!