I’m launching the run_plm example on SageMaker from my local machine, but I’m getting the following error message:
RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument
find_unused_parameters=True to torch.nn.parallel.DistributedDataParallel
It seems like I’m already setting that flag through “ddp_find_unused_parameters” in the training args.
Here is my setup:
import os
import logging
import sagemaker
import sys
import boto3
from sagemaker.huggingface import HuggingFace
# Route INFO-and-above records to stdout with a timestamped format, then
# create this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def main():
    """Configure and launch the ``run_plm.py`` pre-training job on SageMaker.

    Builds the model, data and training hyperparameters, creates a
    ``HuggingFace`` estimator from them and calls ``fit`` with the training
    dataset as the ``train`` channel.

    Raises:
        KeyError: If the ``HF_WRITE_ACCESS_TOKEN`` environment variable is
            not set.
    """
    bucket = "<BUCKET>"
    session = sagemaker.Session(default_bucket=bucket)
    role = "<ROLE>"
    base_job_name = "pre-training"
    checkpoint_s3_uri = "<S3URL>"
    checkpoint_local_path = "/opt/ml/checkpoints"
    source_dir = "./code"
    entry_point = "run_plm.py"
    train_dataset_s3_uri = "<S3URL>"
    instance_type = "ml.p3.16xlarge"

    model_args = {
        "tokenizer_name": "<CUSTOM-UNIGRAM-TOKENIZER>",
        "use_auth_token": True,
    }
    data_training_args = {
        "max_seq_length": 1024,
        "plm_probability": 1 / 6,
        "max_span_length": 5,
        "line_by_line": True,
        "pad_to_max_length": True,
        "max_train_samples": 100,
        "preprocessing_num_workers": 32,
    }
    training_args = {
        "num_train_epochs": 5,
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,
        "gradient_accumulation_steps": 1,
        "eval_accumulation_steps": 1,
        "fp16": "True",
        "optim": "adamw_torch",
        # The DDP error being debugged asks for unused-parameter detection,
        # so the flag has to be enabled rather than disabled.
        # NOTE(review): confirm that the Trainer version in the container
        # forwards this flag when the smdistributed data-parallel backend is
        # active; if it does not, a different distribution backend may be
        # required.
        "ddp_find_unused_parameters": True,
        "output_dir": checkpoint_local_path,
        "overwrite_output_dir": True,
        "report_to": "mlflow",
        "save_steps": 500,
        "save_total_limit": 2,
        "logging_strategy": "epoch",
        "logging_steps": 50,
        "evaluation_strategy": "epoch",
        "eval_steps": 1,
        "learning_rate": 5e-5,
        "do_train": 1,
        "do_eval": 0,
        "do_predict": 0,
        "push_to_hub": True,
        "hub_private_repo": True,
        "hub_model_id": "<PRIVATE-MODEL-PATH>",
        # NOTE(review): hyperparameters are presumably stored with the
        # training job description, which would expose this token to anyone
        # able to describe the job - confirm this is acceptable.
        "hub_token": os.environ["HF_WRITE_ACCESS_TOKEN"],
        "hub_strategy": "every_save",
        "seed": 42,
    }

    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

    # Raw strings keep the backslash escapes intact for the regex engine, and
    # the decimal point is escaped so it no longer matches any character.
    metric_definitions = [
        {"Name": "loss", "Regex": r"'loss': ([0-9]+(\.|e\-)[0-9]+),?"},
        {"Name": "eval_loss", "Regex": r"'eval_loss': ([0-9]+(\.|e\-)[0-9]+),?"},
    ]

    huggingface_estimator = HuggingFace(
        entry_point=entry_point,
        source_dir=source_dir,
        base_job_name=base_job_name,
        output_path=f"s3://{bucket}/",
        instance_type=instance_type,
        sagemaker_session=session,
        instance_count=1,
        role=role,
        transformers_version='4.28.1',
        pytorch_version='2.0.0',
        py_version='py310',
        # All three argument groups are passed to the entry point as one
        # flat hyperparameter mapping.
        hyperparameters={**model_args, **data_training_args, **training_args},
        distribution=distribution,
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
        volume_size=100,
        metric_definitions=metric_definitions,
    )
    huggingface_estimator.fit(
        {
            "train": train_dataset_s3_uri,
        },
        logs="All",
    )
if __name__ == "__main__":
    # Launch the training job only when this file is executed as a script.
    main()
Does anyone have an idea? Thanks.