XLNet pre-training fails with multiple GPUs on SageMaker

I’m launching the run_plm example on SageMaker from my local machine, but the training job fails with the following error message:

RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument find_unused_parameters=True to torch.nn.parallel.DistributedDataParallel

It seems like I’m already setting that flag via “ddp_find_unused_parameters” in the training args.
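
For reference, here’s my understanding of how that training argument relates to the flag the error mentions, written out as a minimal sketch (this is an assumption about the Trainer internals on my part, not the actual Trainer code):

from transformers import TrainingArguments

# Assumption: the Trainer forwards ddp_find_unused_parameters to the
# find_unused_parameters kwarg of torch.nn.parallel.DistributedDataParallel
# when it wraps the model for distributed training.
args = TrainingArguments(
    output_dir="/opt/ml/checkpoints",
    ddp_find_unused_parameters=True,  # the error asks for True; I pass False below
)
print(args.ddp_find_unused_parameters)

So my expectation was that the “ddp_find_unused_parameters” hyperparameter in my setup below would control exactly the flag the error message refers to.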

Here is my setup:

import os
import logging
import sagemaker
import sys
import boto3
from sagemaker.huggingface import HuggingFace

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.getLevelName("INFO"),
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

def main():
    bucket = "<BUCKET>"
    session = sagemaker.Session(default_bucket=bucket)
    role = "<ROLE>"

    base_job_name = f"pre-training"
    checkpoint_s3_uri = "<S3URL>"
    checkpoint_local_path="/opt/ml/checkpoints"
    source_dir = "./code"
    entry_point = "run_plm.py"

    train_dataset_s3_uri = "<S3URL>"

    instance_type = "ml.p3.16xlarge"
    
    model_args = {
        "tokenizer_name": "<CUSTOM-UNIGRAM-TOKENIZER>",
        "use_auth_token": True
    }

    data_training_args = {
        "max_seq_length": 1024,
        "plm_probability": 1/6,
        "max_span_length": 5,
        "line_by_line": True,
        "pad_to_max_length": True,
        "max_train_samples": 100,
        "preprocessing_num_workers": 32
    }

    training_args = {
        "num_train_epochs": 5,
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,
        "gradient_accumulation_steps": 1,
        "eval_accumulation_steps": 1,
        "fp16": "True",
        "optim": "adamw_torch",
        "ddp_find_unused_parameters": False,
        "output_dir": checkpoint_local_path,
        "overwrite_output_dir": True,
        "report_to": "mlflow",
        "save_steps": 500,
        "save_total_limit": 2,
        "logging_strategy": "epoch",
        "logging_steps": 50,
        "evaluation_strategy": "epoch",
        "eval_steps": 1,
        "learning_rate": 5e-5,
        "do_train": 1,
        "do_eval": 0,
        "do_predict": 0,
        "push_to_hub": True,
        "hub_private_repo": True,
        "hub_model_id": "<PRIVATE-MODEL-PATH>",
        "hub_token": os.environ["HF_WRITE_ACCESS_TOKEN"],
        "hub_strategy": "every_save",
        "seed": 42,
    }
    
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

    metric_definitions = [
        {"Name": "loss", "Regex": "'loss': ([0-9]+(.|e\-)[0-9]+),?"},
        {"Name": "eval_loss", "Regex": "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"}
    ]

    huggingface_estimator = HuggingFace(
        entry_point=entry_point,
        source_dir=source_dir,
        base_job_name=base_job_name,
        output_path=f"s3://{bucket}/",
        instance_type=instance_type,
        sagemaker_session=session,
        instance_count=1,
        role=role,
        transformers_version='4.28.1',
        pytorch_version='2.0.0',
        py_version='py310',
        hyperparameters={**model_args, **data_training_args, **training_args},
        distribution=distribution,
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
        volume_size=100,
        metric_definitions=metric_definitions
    )

    huggingface_estimator.fit(
        {
            "train": train_dataset_s3_uri
        },
        logs="All"
    )

if __name__ == "__main__":
    main()
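
In case it helps to spell out what I think the error is asking for, here’s how I read its suggestion if one were wrapping the model by hand (purely illustrative and based on my reading of the message; I rely on the Trainer and the SageMaker launcher to do this wrapping, so I never construct DDP myself):

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_for_ddp(model: torch.nn.Module, local_rank: int) -> DDP:
    # Illustrative only: enable unused-parameter detection on the DDP wrapper,
    # as the error message suggests. Assumes torch.distributed.init_process_group()
    # has already been called by the launcher.
    return DDP(
        model.to(local_rank),
        device_ids=[local_rank],
        output_device=local_rank,
        find_unused_parameters=True,
    )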

Anyone have an idea? Thanks.