KeyError: 'url' when pushing a Hugging Face tokenizer to the Hub with Accelerate (multi-GPU, multi-process)

I get a strange error when trying to push the tokenizer to the Hugging Face Hub with self.tokenizer.push_to_hub().

I am currently using 2 T4 GPUs on Kaggle, and the launch command is:
!accelerate launch --multi_gpu --num_processes 2 --config_file 'config/ds_zero3_cpu.yaml' train/fact_verify.py \

 File "/kaggle/working/fact-verification/train/fact_verify.py", line 10, in <module>
   join_fact_verify_run()
 File "/kaggle/working/fact-verification/model/claim_verification/joint_cross_encoder/trainer.py", line 389, in join_fact_verify_run
   main(args=args)
 File "/kaggle/working/fact-verification/model/claim_verification/joint_cross_encoder/trainer.py", line 344, in main
   trainer(
 File "/kaggle/working/fact-verification/model/claim_verification/joint_cross_encoder/trainer.py", line 137, in __call__
   self.tokenizer.push_to_hub(model_name, token='TOKEN', private=True)
 File "/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py", line 882, in push_to_hub
   repo_id = self._create_repo(
 File "/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py", line 709, in _create_repo
   url = create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True)
 File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn
   return fn(*args, **kwargs)
 File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py", line 2326, in create_repo
   return RepoUrl(d["url"], endpoint=self.endpoint)
KeyError: 'url'

My code is quite long, so I will post the lines I think are causing the trouble:

self.accelerator = Accelerator(
    log_with="wandb",
    mixed_precision='fp16',
    kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)],
)
self.accelerator.wait_for_everyone()
self.model = JointCrossEncoder.from_pretrained(pretrained_model, token='TOKEN')
self.accelerator.wait_for_everyone()
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, token='TOKEN')
self.accelerator.wait_for_everyone()
self.tokenizer.push_to_hub(model_name, token='TOKEN', private=True)

My DeepSpeed config file ds_zero3_cpu.yaml is:

compute_environment: LOCAL_MACHINE
deepspeed_config:
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
mixed_precision: fp16
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'no'
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
use_cpu: false

I am using the Kaggle environment and have installed the latest accelerate and deepspeed versions with:
!pip install deepspeed accelerate
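
For reference, a quick way to print the exact library versions the notebook ends up with (a minimal check, assuming the standard package names):

import accelerate, deepspeed, transformers, huggingface_hub
print("accelerate:", accelerate.__version__)
print("deepspeed:", deepspeed.__version__)
print("transformers:", transformers.__version__)
print("huggingface_hub:", huggingface_hub.__version__)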

My notebook: Fact checking | Kaggle
In case you need it, here is the full version of my code:

from .model import JointCrossEncoder, JointCrossEncoderConfig
from .dataloader import FactVerifyDataloader, RerankDataloaderConfig, FactVerificationBatch
from accelerate import Accelerator, DeepSpeedPlugin
from transformers import AutoModel, AutoConfig, AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.optim import Optimizer
import torch.nn.functional as F
from torcheval.metrics import MulticlassF1Score, BinaryF1Score, MulticlassConfusionMatrix
import argparse
import math
from typing import Type, Dict, List
import os
from tqdm import tqdm
from accelerate import DistributedDataParallelKwargs
from sentence_transformers import SentenceTransformer, CrossEncoder
from peft import get_peft_model, LoraConfig, TaskType
import wandb

os.environ["WANDB_API_KEY"] = "TOKEN"
wandb.login()

class BinaryFocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super(BinaryFocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss
        return torch.mean(F_loss)
    
class JointCrossEncoderTrainer:
    def __init__(
            self,
            args,
            config:JointCrossEncoderConfig,
            pretrained_model = None,
            use_lora:bool=False
    ):
        # deepspeed_plugin = DeepSpeedPlugin(
        #     gradient_accumulation_steps=1,
        #     gradient_clipping=1,
        #     offload_optimizer_device='cpu',
        #     offload_param_device='cpu',
        #     zero3_init_flag=True,
        #     zero3_save_16bit_model=True,
        #     zero_stage=3,
        # )
        self.accelerator = Accelerator(
            log_with="wandb",
            mixed_precision='fp16',
            kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)],
            #deepspeed_plugin=deepspeed_plugin,
        )
        self.accelerator.init_trackers(
            args.project_name,
            config={
                "num_epochs": args.epochs,
                "batch_size": args.batch_size,
                "pretrained_model": args.model,
                "num hard negative": args.num_hard_negatives,
                "tokenize": args.word_tokenize,
                "batch_size": args.batch_size,
                "use_focal_loss": args.use_focal_loss,
                "weight of class": args.weight,
                "patient": args.patient,
            },
            init_kwargs={"wandb": {
                "name": "nguyen-brat",
                "entity": "uit-challenge"
            }}
        )
        self.device = self.accelerator.device

        self.config = config
        self.args = args
        if not pretrained_model:
            self.accelerator.wait_for_everyone()
            self.model = JointCrossEncoder(config=config)
            self.accelerator.wait_for_everyone()
            self.tokenizer = AutoTokenizer.from_pretrained(config.model)
        else:
            self.accelerator.wait_for_everyone()
            self.model = JointCrossEncoder.from_pretrained(pretrained_model, token='TOKEN')
            self.accelerator.wait_for_everyone()
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, token='TOKEN')
        if use_lora:
            peft_config = LoraConfig(
                #task_type=TaskType.FEATURE_EXTRACTION,
                inference_mode=False,
                r=8,
                lora_alpha=32,
                bias='all',
                lora_dropout=0.1,
                target_modules='feature_extractor.*.query_key_value*|feature_extractor.*.mlp.dense_h_to_4h|feature_extractor.*.mlp.dense_4h_to_h|feature_extractor.*.dense*|evident_aggrerators.*.out_proj',
                modules_to_save=[
                    'aggerator',
                    #'single_evident_linear',
                ]
            )
            self.model = get_peft_model(self.model, peft_config)
            print('*********************')
            print(self.model.print_trainable_parameters())
            print('*********************')

    
    def smart_batching_collate(self, batch):
        batch = batch[0]
        fact_claims_ids = self.tokenizer(*batch.claims_facts, padding='max_length', truncation='only_second', return_tensors="pt", max_length=self.config.max_length)

        return fact_claims_ids, batch.label, batch.is_positive, batch.is_positive_ohot


    def __call__(
            self,
            train_dataloader: DataLoader,
            val_dataloader: DataLoader=None,
            epochs: int = 10,
            scheduler: str = 'WarmupLinear',
            warmup_steps: int = 10000,
            optimizer_class: Type[Optimizer] = torch.optim.AdamW,
            optimizer_params: Dict[str, object] = {'lr': 2e-5},
            weight_decay: float = 0.01,
            use_focal_loss = False,
            weight = [.3, .3, .3],
            output_path: str = None,
            save_best_model: bool = True,
            show_progress_bar: bool = True,
            patient: int = 4,
            evaluation_steps = 500,
            model_name="claim_verify_join_encoder_v2",
            push_to_hub=False,
    ):
        if push_to_hub:
            self.accelerator.wait_for_everyone()
            self.tokenizer.push_to_hub(model_name, token='TOKEN', private=True)
        self.accelerator.wait_for_everyone()
        self.tokenizer.save_pretrained(output_path)

        wandb_tracker = self.accelerator.get_tracker("wandb")
        train_dataloader.collate_fn = self.smart_batching_collate
        if val_dataloader != None:
            val_dataloader.collate_fn = self.smart_batching_collate

        if output_path is not None:
            os.makedirs(output_path, exist_ok=True)

        self.best_score = -9999999
        self.best_losses = 9999999
        patient_count = 0
        num_train_steps = int(len(train_dataloader) * epochs)

        # Prepare optimizers
        param_optimizer = list(self.model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)

        if isinstance(scheduler, str):
            scheduler = SentenceTransformer._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps)

        if use_focal_loss:
            multi_loss_fct = torch.hub.load(
                'adeelh/pytorch-multi-class-focal-loss',
                model='focal_loss',
                alpha=weight,
                gamma=2,
                reduction='mean',
                device=self.device,
                dtype=torch.float32,
                force_reload=False
            )
            binary_loss_fct = BinaryFocalLoss()
        else:
            multi_loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(weight).to(self.device))
            binary_loss_fct = nn.BCEWithLogitsLoss()

        if val_dataloader == None:
            self.model, optimizer, scheduler, train_dataloader = self.accelerator.prepare(self.model, optimizer, scheduler, train_dataloader)
        else:
            self.model, optimizer, scheduler, train_dataloader, val_dataloader = self.accelerator.prepare(self.model, optimizer, scheduler, train_dataloader, val_dataloader)

        train_loss_list = []
        acc_list = []

        metrics = [MulticlassF1Score(num_classes=3), MulticlassConfusionMatrix(num_classes=3)]
        for epoch in range(epochs):
            training_steps = 0
            self.accelerator.print(f'epoch: {epoch+1}/{epochs} ')
            self.model.zero_grad()
            self.model.train()

            for fact_claims_ids, labels, is_positive, is_positive_ohot in tqdm(train_dataloader, desc="Iteration", smoothing=0.05, disable=not show_progress_bar):
                optimizer.zero_grad()
                with self.accelerator.autocast(): ###########
                    multi_evident_logits, single_evident_logits = self.model(fact_claims_ids, is_positive)
                    multi_evident_loss_value = multi_loss_fct(multi_evident_logits, labels)
                    single_evident_loss_value = multi_loss_fct(single_evident_logits, labels)
                    # is_positive_loss_value = binary_loss_fct(positive_logits, is_positive_ohot)
                    loss_value = (multi_evident_loss_value*0.9 + single_evident_loss_value*0.1)
                    #loss_value = multi_evident_loss_value
                self.accelerator.backward(loss_value)
                optimizer.step()
                scheduler.step()
                training_steps += 1

                # step evaluation
                if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                    self.model.eval()
                    train_result = {
                        "multiple evident loss":multi_evident_loss_value.item(),
                        "single evident loss":single_evident_loss_value.item(),
                    }
                    if val_dataloader is not None:
                        acc = self.val_evaluation(val_dataloader, metrics=metrics)
                        train_result["f1 score"] = acc[0]
                    wandb_tracker.log(train_result, step=evaluation_steps)
                    if val_dataloader is not None:
                        # only build/log the confusion matrix when a validation set is available
                        table = wandb.Table(data=acc[1].tolist(), columns=["supported", "refuted", "nei"])
                        wandb_tracker.log({"predictions confusion matrix": table}, commit=False, step=evaluation_steps)
                    self.model.train()

            # epoch evaluation
            if val_dataloader is not None:
                self.model.eval()
                acc = self.val_evaluation(val_dataloader, metrics=metrics)
                acc_list.append(acc)
                if (acc[0] > self.best_score) and save_best_model:
                    patient_count = 0
                    self.best_score = acc[0]
                    self.save_during_training(output_path)
                    if push_to_hub:
                        self.save_to_hub(model_name)
                elif patient_count == patient:
                    break
                else:
                    patient_count += 1
                self.model.zero_grad()
                self.model.train()
            else:
                if (loss_value.item() < self.best_losses) and save_best_model:
                    patient_count = 0
                    self.best_losses = loss_value.item()
                    self.save_during_training(output_path)
                    if push_to_hub:
                        self.save_to_hub(model_name)
                elif patient_count == patient:
                    break
                else:
                    patient_count += 1


            #self.accelerator.print(f'loss value is {loss_value.item()}')
            self.accelerator.print(f'multiple evident loss value is {multi_evident_loss_value.item()}')
            self.accelerator.print(f'single evident loss value is {single_evident_loss_value.item()}')
            if val_dataloader != None:
                self.accelerator.print(f'f1 score is: {acc[0]}')
                self.accelerator.print(f'confusion matrix is {acc[1]}')
            train_loss_list.append(loss_value.item())
            self.accelerator.wait_for_everyone()

        if not save_best_model:
            self.save_during_training(output_path)
            if push_to_hub:
                self.save_to_hub(model_name)

        self.accelerator.end_training()
        return train_loss_list, acc_list
    
    def val_evaluation(self,
                       val_dataloader,
                       metrics,
                       ):
        with torch.no_grad():
            with self.accelerator.autocast(): ###################
                self.accelerator.print('Val evaluation processing !')
                output = []
                for fact_claims_ids, labels, is_positive, _ in val_dataloader:
                    multi_evident_logits, _ = self.model(fact_claims_ids, is_positive)
                    for metric in metrics:
                        metric.update(multi_evident_logits, labels)
                for metric in metrics:
                    output.append(metric.compute())
                    metric.reset()
        return output
    
    def save_during_training(self, output_path):
        unwrapped_model = self.accelerator.unwrap_model(self.model)
        self.accelerator.wait_for_everyone()
        unwrapped_model.save_pretrained(
            output_path,
            is_main_process=self.accelerator.is_main_process,
            save_function=self.accelerator.save,
            state_dict=self.accelerator.get_state_dict(self.model),
        )

    def save_to_hub(
            self,
            model_name='claim_verify_join_encoder_v2',
    ):
        unwrapped_model = self.accelerator.unwrap_model(self.model)
        self.accelerator.wait_for_everyone()
        unwrapped_model.push_to_hub(model_name, token='TOKEN', private=True)
    

def main(args):
    dataloader_config = RerankDataloaderConfig(
        num_hard_negatives = args.num_hard_negatives,
        batch_size = args.batch_size,
        remove_duplicate_context = args.remove_duplicate_context,
        word_tokenize=args.word_tokenize,
    )

    train_data = FactVerifyDataloader(
        config=dataloader_config,
        data_path=args.train_data_path,
    )
    val_dataloader = None
    if args.val_data_path != None:
        val_data = FactVerifyDataloader(
            config=dataloader_config,
            data_path=args.val_data_path,
        )
        val_dataloader = DataLoader(val_data) # batch size is always 1 because the data is already batched when it is created
    train_dataloader = DataLoader(train_data)
    model_config = JointCrossEncoderConfig(
        model=args.model,
        nins=args.num_hard_negatives+1,
    )
    model_config.max_length = args.max_length
    trainer = JointCrossEncoderTrainer(
        args=args,
        config=model_config,
        pretrained_model=args.pretrained_model,
        use_lora=args.use_lora
    )
    warmup_step = math.ceil(len(train_dataloader) * 10 * 0.1)
    weight = args.weight if args.weight else [.3, .3, .3]
    trainer(
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        epochs=args.epochs,
        use_focal_loss=args.use_focal_loss,
        weight=weight,
        warmup_steps=warmup_step,
        output_path = args.save_model_path,
        patient=args.patient,
        model_name=args.model_name,
        push_to_hub=args.push_to_hub,
        evaluation_steps=args.evaluation_steps,
    )

def parse_args():
    """
    Parse arguments from command line.
    """
    parser = argparse.ArgumentParser(description="Arguments for fact verify Trainning")
    parser.add_argument("--model", default='amberoad/bert-multilingual-passage-reranking-msmarco', type=str)
    parser.add_argument("--pretrained_model", default=None, type=str)
    parser.add_argument("--use_lora", default=False, action=argparse.BooleanOptionalAction)
    parser.add_argument("--max_length", default=256, type=int)
    parser.add_argument("--num_label", default=2, type=int)
    parser.add_argument("--train_data_path", default='data/clean_data/train.json', type=str)
    parser.add_argument("--model_name", default='claim_verify_join_encoder_v2', type=str)
    parser.add_argument("--val_data_path", default=None, type=str)
    parser.add_argument("--num_hard_negatives", default=4, type=int)
    parser.add_argument("--batch_size", default=8, type=int)
    parser.add_argument("--remove_duplicate_context", default=False, action=argparse.BooleanOptionalAction)
    parser.add_argument("--word_tokenize", default=False, action=argparse.BooleanOptionalAction)
    parser.add_argument("--epochs", default=30, type=int)
    parser.add_argument("--use_focal_loss", default=False, action=argparse.BooleanOptionalAction, help='whether to use focal loss or not')
    parser.add_argument("--weight", nargs='+', type=float, help="weight of label in loss")
    parser.add_argument("--save_model_path", default="model/claim_verification/joint_cross_encoder/saved_model", type=str)
    parser.add_argument("--patient", default=4, type=int)
    parser.add_argument("--device", type=str, default="cuda:0", help="Specify which gpu device to use.")
    parser.add_argument("--push_to_hub", default=True, action=argparse.BooleanOptionalAction, help='whether to use focal loss or not')
    parser.add_argument("--evaluation_steps", default=400, type=int)
    parser.add_argument("--project_name", default='fact verify UIT', type=str)
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = parse_args()
    main(args=args)

Hi, please block out all of your token keys; leaving them in is a critical security risk you are exposing yourself to.

I have fixed the issue by adding an accelerator.is_main_process check around the pushes of the model and the tokenizer to the hub. I guess there is some conflict when two processes push to the hub at the same time, but I am afraid that if only the main process pushes to the hub, some parameters would be missing :thinking: :thinking:

if self.accelerator.is_main_process:
    self.tokenizer.push_to_hub(model_name, token=TOKEN, private=True)

unwrapped_model = self.accelerator.unwrap_model(self.model)
self.accelerator.wait_for_everyone()
if self.accelerator.is_main_process:
    unwrapped_model.push_to_hub(model_name, token=TOKEN, private=True)
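
About the missing-parameter worry: accelerator.get_state_dict() gathers the full, unpartitioned state dict from all ZeRO-3 shards, so pushing from only the main process should not drop any weights, and keeping the repo creation on a single process likely avoids the race that produced the KeyError: 'url' in the first place. A minimal sketch of that pattern, assuming the output_path, model_name and TOKEN placeholders from the code above:

from huggingface_hub import HfApi

# every process participates in gathering the ZeRO-3 shards into a full state dict
self.accelerator.wait_for_everyone()
unwrapped_model = self.accelerator.unwrap_model(self.model)
unwrapped_model.save_pretrained(
    output_path,
    is_main_process=self.accelerator.is_main_process,
    save_function=self.accelerator.save,
    state_dict=self.accelerator.get_state_dict(self.model),  # consolidated, nothing missing
)
if self.accelerator.is_main_process:
    self.tokenizer.save_pretrained(output_path)
    # only one process talks to the Hub, so create_repo runs exactly once
    api = HfApi(token=TOKEN)
    api.create_repo(repo_id=model_name, private=True, exist_ok=True)
    api.upload_folder(repo_id=model_name, folder_path=output_path)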