I implemented a “sliding” eval dataset using a TrainerCallback and an indefinitely repeating IterableDataset; here it is:
from pathlib import Path
from typing import Callable, Optional

import joblib
from datasets import Dataset, IterableDataset, IterableDatasetDict
from sklearn.preprocessing import OrdinalEncoder
from transformers import Trainer, TrainerCallback

class DatasetSetting:
    def __init__(self, do_encode: bool, per_eval_size: int, process_fn: Optional[Callable] = None):
        user_enc, item_enc = None, None
        if do_encode:
            pkl_path = Path('data/user_item_encoder.pkl')
            if not pkl_path.exists():
                fit_encoder()
            enc: OrdinalEncoder = joblib.load(pkl_path)
            user_enc, item_enc = (MyEncoder(cate) for cate in enc.categories_)
        # endless batched iterator over the eval split: repeat indefinitely with .repeat(None),
        # then yield dicts of per_eval_size examples at a time
        self.eval_iter = IterableDataset.from_generator(
            lambda: generate_with_neg('eval', user_enc, item_enc)
        ).repeat(None).map(process_fn).iter(per_eval_size)
        self.dataset = IterableDatasetDict({
            'train': IterableDataset.from_generator(
                lambda: generate_with_neg('train', user_enc, item_enc)
            ),
            'test': IterableDataset.from_generator(lambda: generate_with_neg('test', user_enc, item_enc, 2)),
        }).map(process_fn)

    @property
    def eval_dataset(self):
        # materialize the next per_eval_size examples as an in-memory Dataset;
        # you could also chain an identity .map(None) here to have map log each time this property is accessed
        return Dataset.from_dict(next(self.eval_iter))
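(generate_with_neg, fit_encoder and MyEncoder are my own helpers and aren’t shown here. Just to illustrate the shape that IterableDataset.from_generator expects, here is a minimal hypothetical sketch only, with toy data, placeholder column names, and an assumed MyEncoder.encode method, not my actual implementation:)

import random

_TOY_INTERACTIONS = {                      # split -> (user, positive item) pairs; placeholder data
    'train': [('u1', 'i1'), ('u2', 'i3')],
    'eval':  [('u1', 'i2')],
    'test':  [('u2', 'i1')],
}
_TOY_ITEMS = ['i1', 'i2', 'i3', 'i4']

def generate_with_neg(split, user_enc=None, item_enc=None, num_neg=1):
    # yields one dict per example: each positive followed by num_neg random negatives
    encode_u = user_enc.encode if user_enc else (lambda x: x)   # MyEncoder.encode is an assumption
    encode_i = item_enc.encode if item_enc else (lambda x: x)
    for user, pos_item in _TOY_INTERACTIONS[split]:
        yield {'user': encode_u(user), 'item': encode_i(pos_item), 'label': 1}
        for _ in range(num_neg):
            neg = random.choice([i for i in _TOY_ITEMS if i != pos_item])
            yield {'user': encode_u(user), 'item': encode_i(neg), 'label': 0}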
class EvalSlidingCallback(TrainerCallback):
    """Swap in a fresh slice of eval data after every in-training evaluation."""
    def __init__(self, trainer: Trainer, ds_setting: DatasetSetting):
        self.trainer = trainer
        self.ds_setting = ds_setting
        self.in_trainloop = False

    def on_train_begin(self, args, state, control, **kwargs):
        self.in_trainloop = True

    def on_train_end(self, args, state, control, **kwargs):
        self.in_trainloop = False

    def on_evaluate(self, args, state, control, **kwargs):
        # only slide the eval window for evaluations triggered inside the training loop
        if not self.in_trainloop:
            return
        # on_evaluate fires after an evaluation finishes, so the next evaluation sees the new slice
        self.trainer.eval_dataset = self.ds_setting.eval_dataset
And after you initialize the Trainer, register the callback:
trainer.add_callback(EvalSlidingCallback(trainer, ds_setting))
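For completeness, here’s a rough end-to-end wiring sketch; the model, process_fn, output_dir and step counts are placeholders, and on older transformers versions the eval_strategy argument is called evaluation_strategy. Since on_evaluate runs after each evaluation inside trainer.train(), every periodic eval after the first one gets a fresh slice, while a standalone trainer.evaluate() outside training keeps whatever eval_dataset is currently set.

from transformers import TrainingArguments

ds_setting = DatasetSetting(do_encode=True, per_eval_size=1024, process_fn=process_fn)

args = TrainingArguments(
    output_dir='out',
    max_steps=10_000,        # an IterableDataset has no length, so give max_steps explicitly
    eval_strategy='steps',   # evaluation_strategy on older transformers versions
    eval_steps=500,
)
trainer = Trainer(
    model=model,                               # placeholder: your model
    args=args,
    train_dataset=ds_setting.dataset['train'],
    eval_dataset=ds_setting.eval_dataset,      # first slice; the callback slides it afterwards
)
trainer.add_callback(EvalSlidingCallback(trainer, ds_setting))
trainer.train()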
Anyway, it’s possible to make use of the callback mechanism that Hugging Face Transformers already provides. Hope it helps!