Hello,
I would like to pass small batches of data to a Flaubert model to avoid an out-of-memory error on the system. While looking around for a way to do this, I came up with the code below, but somehow I am getting an error:
```python
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# flaubert (the model) and flaubert_tokenizer are loaded earlier in the script;
# dataset is a Hugging Face Dataset with a 'verbatim' text column
tokenized_dataset = dataset.map(
    lambda x: flaubert_tokenizer(x['verbatim'], padding=True, truncation=True, max_length=512),
    batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
data_loader = DataLoader(tokenized_dataset, batch_size=23)

list_hidden_state = []
for n, batch in enumerate(tqdm(data_loader)):
    # print(len(batch))
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # batch = {k: v.to(device) for k, v in batch.items()}
    hidden_state = flaubert(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    # keep only the [CLS] (first token) embedding of each sequence
    cls_embedding = hidden_state[0]
    cls_embedding = cls_embedding[:, 0].detach().numpy()
    list_hidden_state.append(cls_embedding)

list_hidden_state = np.concatenate(list_hidden_state)
print(len(list_hidden_state))
print(list_hidden_state)
```
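For what it's worth, once the batching itself works, this is roughly how I intend to handle the GPU transfer (adapted from the commented-out lines above); wrapping the loop in `torch.no_grad()` should also cut memory use, since I only need the embeddings, not gradients:

```python
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
flaubert.to(device)
flaubert.eval()  # inference mode: disables dropout

list_hidden_state = []
with torch.no_grad():  # no gradient buffers -> much lower memory use
    for batch in tqdm(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        output = flaubert(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        # move the [CLS] embeddings back to CPU before collecting them with numpy
        list_hidden_state.append(output[0][:, 0].cpu().numpy())
```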
The stack trace looks like this:
```
Traceback (most recent call last):
File "/gpfsdswork/projects/rech/kpf/umg16uw/expe_5/test/knn.py", line 168, in <module>
main()
File "/gpfsdswork/projects/rech/kpf/umg16uw/expe_5/test/knn.py", line 163, in main
testing(traindir, language_model_dir, outdir, testdir, modeldir, resultdir)
File "/gpfsdswork/projects/rech/kpf/umg16uw/expe_5/test/knn.py", line 54, in testing
model, train_acc = train_model(train_file, path_to_model_lge, out_dir, model_dir)
File "/gpfsdswork/projects/rech/kpf/umg16uw/expe_5/test/knn.py", line 35, in train_model
Xtrain, lge_size = get_flaubert_layer(Xtrain, path_to_model_lge)
File "/gpfsdswork/projects/rech/kpf/umg16uw/expe_5/test/../traitements/functions_for_processing.py", line 206, in get_flaubert_layer
for n, batch in enumerate(tqdm(data_loader)):
File "/linkhome/rech/genlig01/umg16uw/.conda/envs/bert/lib/python3.9/site-packages/tqdm/std.py", line 1133, in __iter__
for obj in iterable:
File "/linkhome/rech/genlig01/umg16uw/.conda/envs/bert/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/linkhome/rech/genlig01/umg16uw/.conda/envs/bert/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 561, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/linkhome/rech/genlig01/umg16uw/.conda/envs/bert/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/linkhome/rech/genlig01/umg16uw/.conda/envs/bert/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/linkhome/rech/genlig01/umg16uw/.conda/envs/bert/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/linkhome/rech/genlig01/umg16uw/.conda/envs/bert/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [197] at entry 0 and [194] at entry 11
```
From the trace, it looks like `default_collate` is trying to stack `input_ids` tensors of different lengths (197 vs. 194) into a single batch tensor, which fails. Do you have any idea how I can resolve this error?
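One thing I suspect: since I call the tokenizer with `padding=True` inside `dataset.map(..., batched=True)`, each map chunk is padded to its own longest sequence, so examples coming from different chunks can end up with different lengths. A fix I am considering is to drop the padding from `map()` and let `DataCollatorWithPadding` from transformers pad each DataLoader batch instead; a sketch of what I mean:

```python
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# tokenize WITHOUT padding here; only truncate long sequences
tokenized_dataset = dataset.map(
    lambda x: flaubert_tokenizer(x['verbatim'], truncation=True, max_length=512),
    batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])

# the collator pads every batch to the longest sequence in that batch
data_collator = DataCollatorWithPadding(tokenizer=flaubert_tokenizer)
data_loader = DataLoader(tokenized_dataset, batch_size=23, collate_fn=data_collator)
```

Alternatively, I could pass `padding='max_length'` in `map()` so every example is exactly 512 tokens, but that wastes memory on short texts, which is what I am trying to avoid. Does the collator approach look right?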