Evaluate subset of data during training

I implemented a “sliding” eval dataset using a TrainerCallback and an indefinitely repeating IterableDataset. Here it is:

class DatasetSetting:
    """Bundle the train/test streams with a "sliding" evaluation stream.

    ``dataset`` is an ``IterableDatasetDict`` with the 'train' and 'test'
    splits. ``eval_iter`` iterates endlessly over the 'eval' split in
    batches of ``per_eval_size``, so each read of ``eval_dataset`` moves on
    to the next slice of evaluation data.

    Args:
        do_encode: When true, load the fitted encoder from
            ``data/user_item_encoder.pkl`` (``fit_encoder()`` is called
            first if the file is missing) and wrap each entry of its
            ``categories_`` in ``MyEncoder``; exactly two entries are
            expected (user, item). When false, both encoders are ``None``.
        per_eval_size: Batch size handed to ``.iter()`` for the eval stream.
        process_fn: Function handed to ``.map()`` on every split. The
            default ``None`` is passed through to ``.map()`` unchanged.
    """

    def __init__(
        self,
        do_encode: bool,
        per_eval_size: int,
        process_fn: Callable = None,
    ):
        user_enc = item_enc = None
        if do_encode:
            encoder_file = Path('data/user_item_encoder.pkl')
            if not encoder_file.exists():
                # NOTE(review): presumably fit_encoder() writes encoder_file;
                # confirm, otherwise the load below fails.
                fit_encoder()
            fitted: OrdinalEncoder = joblib.load(encoder_file)
            user_enc, item_enc = map(MyEncoder, fitted.categories_)

        # Evaluation stream: repeat(None) repeats without end, so the batch
        # iterator below is never exhausted.
        eval_stream = IterableDataset.from_generator(
            lambda: generate_with_neg('eval', user_enc, item_enc)
        )
        endless_eval = eval_stream.repeat(None)
        self.eval_iter = endless_eval.map(process_fn).iter(per_eval_size)

        # Train / test streams share the same encoders and processing.
        train_stream = IterableDataset.from_generator(
            lambda: generate_with_neg('train', user_enc, item_enc)
        )
        test_stream = IterableDataset.from_generator(
            lambda: generate_with_neg('test', user_enc, item_enc, 2)
        )
        splits = IterableDatasetDict({'train': train_stream, 'test': test_stream})
        self.dataset = splits.map(process_fn)

    @property
    def eval_dataset(self):
        """Return the next evaluation slice as an in-memory ``Dataset``.

        Every access advances ``eval_iter`` by one batch, so two reads give
        two different slices.
        """
        # NOTE(review): the original author was unsure whether this yields a
        # "new" dataset as far as the Trainer is concerned and suggested
        # appending .map(None) (identity) if needed; that would make map log
        # on every access.
        batch = next(self.eval_iter)
        return Dataset.from_dict(batch)
class EvalSlidingCallback(TrainerCallback):
    """Swap the trainer's eval dataset for a new slice after each evaluation.

    The swap only happens while a training loop is running (between
    ``on_train_begin`` and ``on_train_end``); evaluations triggered outside
    of training leave ``trainer.eval_dataset`` untouched.

    Args:
        trainer: The trainer whose ``eval_dataset`` attribute is replaced.
        ds_setting: Object whose ``eval_dataset`` attribute is read to get
            the replacement dataset.
    """

    def __init__(self, trainer: Trainer, ds_setting: DatasetSetting):
        self.trainer, self.ds_setting = trainer, ds_setting
        # Flipped on/off by the train-begin / train-end hooks below.
        self.in_trainloop = False

    def on_train_begin(self, args, state, control, **kwargs):
        """Mark that a training loop has started."""
        self.in_trainloop = True

    def on_train_end(self, args, state, control, **kwargs):
        """Mark that the training loop has finished."""
        self.in_trainloop = False

    def on_evaluate(self, args, state, control, **kwargs):
        """Install a fresh eval dataset, but only during training."""
        # NOTE(review): transformers documents on_evaluate as firing after an
        # evaluation phase, so the dataset assigned here would be used by the
        # next evaluation — confirm against the installed version.
        if self.in_trainloop:
            self.trainer.eval_dataset = self.ds_setting.eval_dataset

After you initialize the trainer, add the callback like this:

# Register the callback on the trainer; it receives both the trainer and the
# ds_setting object so it can reassign trainer.eval_dataset from ds_setting.
trainer.add_callback(EvalSlidingCallback(trainer, ds_setting))

Anyway, it’s possible to make use of the callback mechanism in Hugging Face transformers. Hope it helps!

1 Like