# Launch `train_and_evaluate` via Accelerate's notebook_launcher, one process
# per GPU (num_processes=2 for the Kaggle T4x2 setup).
# NOTE(review): if a child process dies (e.g. CUDA OOM or an uncaught
# exception in the worker), the launch can end silently with no traceback in
# the notebook — confirm by logging inside the launched function itself.
notebook_launcher(train_and_evaluate, num_processes=2)
I’m calling the above from within a Kaggle T4x2 notebook, and it does start training the model for 2 epochs on both GPUs, but after that, training just abruptly ends without logging anything. Is there some way to get more information about what might be going wrong? I’m using a custom training loop similar to the following (the miniai framework hides a lot of details — tell me if something is missing that would help):
def train_and_evaluate():
    """Train and evaluate a Hugging Face model with K-fold cross-validation.

    Intended to be run under ``accelerate.notebook_launcher`` (one process per
    GPU).  The free variables ``n_splits`` and ``df_train`` must be defined at
    module level so every launched process can see them.

    NOTE(review): the original snippet's placeholder lines
    (``model = #automodelfrompretrained`` etc.) were syntax errors; they are
    replaced with explicit ``...`` TODO placeholders so the function parses.
    """
    from sklearn.model_selection import KFold

    kfold = KFold(n_splits=n_splits, shuffle=True)
    eval_metrics = []  # NOTE(review): original never appends to or returns this
    for fold, (train_ids, val_ids) in enumerate(kfold.split(df_train)):
        # Wrap each fold so a crash inside a spawned worker is logged instead
        # of killing the process silently — "training just abruptly ends with
        # no output" is the classic symptom of an unlogged exception (often a
        # CUDA OOM) in a child process launched by notebook_launcher.
        try:
            dls = DataLoaders(
                # TODO: train DataLoader built from df_train rows at train_ids
                # TODO: valid DataLoader built from df_train rows at val_ids
            )
            model = ...   # TODO: AutoModelFor*.from_pretrained(...)
            lr = ...      # TODO: peak learning rate
            epochs = ...  # TODO: number of epochs
            # Total optimizer steps for the one-cycle schedule.
            tmax = epochs * len(dls.train)
            sched = partial(
                optim.lr_scheduler.OneCycleLR,
                anneal_strategy='cos',
                pct_start=0.01,
                max_lr=lr,
                total_steps=tmax,
            )
            cbs = [
                HFTrainCB(),            # adapt interface for HF models and use accelerate
                # Inherits from: https://github.com/johnowhitaker/miniminiai/blob/e84407d11ec2d9d244f7d32b4052b988e887ae0c/miniminiai/miniminiai.py#L384C27-L384C27
                ProgressCB(plot=True),  # print/plot progress
                HFMetricsCB(),          # compute metrics
                BatchSchedCB(sched),    # step the LR scheduler each batch
            ]
            opt_func = partial(optim.AdamW)
            learn = Learner(model, dls, lr=lr, cbs=cbs, opt_func=opt_func)
            learn.fit(epochs)
            # TODO: eval_metrics.append(<this fold's validation score>)
            # NOTE(review): consider `del model, learn` plus
            # torch.cuda.empty_cache() here — without it, GPU memory
            # presumably accumulates across folds and a later fold can OOM;
            # confirm with torch.cuda.memory_summary().
        except Exception:
            # Make worker failures visible in the launcher's output before
            # propagating; do NOT swallow the exception.
            import traceback
            traceback.print_exc()
            raise
    # Returning the collected metrics is backward-compatible: the original
    # implicitly returned None and notebook_launcher ignores the return value.
    return eval_metrics