While creating my dataset, I would like to add sent2vec representations of the input sentences. The code would look like this:
import sent2vec
from datasets import load_dataset
sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model(sent2vec_path, inference_mode=True)
datasets = load_dataset("text", data_files={"train": train_f, "validation": valid_f})
def preprocess(sentences):
    embedded_sents = sent2vec_model.embed_sentences(sentences["text"])
    return {"text": sentences["text"], "embeddings": embedded_sents}
datasets.map(preprocess, batch_size=None, batched=True)
Unfortunately this won’t work because the sent2vec model can’t be pickled (it seems), and the fingerprint generation therefore fails. At first I thought the issue was that map uses multiprocessing by default, but using num_proc=1 does not help either. From the error trace it appears that the error arises during the fingerprint/hash update, when the sent2vec model is being pickled:
File "/mnt/c/dev/python/neural-fuzzy-repair/nfr/finetuning.py", line 48, in create_datasets
datasets.map(preprocess, batch_size=None, batched=True)
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/dataset_dict.py", line 283, in map
{
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/dataset_dict.py", line 284, in <dictcomp>
k: dataset.map(
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 1240, in map
return self._map_single(
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 156, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/fingerprint.py", line 157, in wrapper
kwargs[fingerprint_name] = update_fingerprint(
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/fingerprint.py", line 105, in update_fingerprint
hasher.update(transform_args[key])
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/fingerprint.py", line 57, in update
self.m.update(self.hash(value).encode("utf-8"))
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/fingerprint.py", line 53, in hash
return cls.hash_default(value)
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/fingerprint.py", line 46, in hash_default
return cls.hash_bytes(dumps(value))
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 367, in dumps
dump(obj, file)
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 339, in dump
Pickler(file, recurse=True).dump(obj)
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/dill/_dill.py", line 446, in dump
StockPickler.dump(self, obj)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 485, in dump
self.save(obj)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 558, in save
f(self, obj) # Call unbound method with explicit self
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/dill/_dill.py", line 1435, in save_function
pickler.save_reduce(_create_function, (obj.__code__,
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 690, in save_reduce
save(args)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 558, in save
f(self, obj) # Call unbound method with explicit self
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 899, in save_tuple
save(element)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 558, in save
f(self, obj) # Call unbound method with explicit self
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 884, in save_tuple
save(element)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 558, in save
f(self, obj) # Call unbound method with explicit self
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/dill/_dill.py", line 1170, in save_cell
pickler.save_reduce(_create_cell, (f,), obj=obj)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 690, in save_reduce
save(args)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 558, in save
f(self, obj) # Call unbound method with explicit self
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 884, in save_tuple
save(element)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 601, in save
self.save_reduce(obj=obj, *rv)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 715, in save_reduce
save(state)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 558, in save
f(self, obj) # Call unbound method with explicit self
File "/home/bram/.local/share/virtualenvs/neural-fuzzy-repair-b49KnSNp/lib/python3.8/site-packages/dill/_dill.py", line 933, in save_module_dict
StockPickler.save_dict(pickler, obj)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 969, in save_dict
self._batch_setitems(obj.items())
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 995, in _batch_setitems
save(v)
File "/home/bram/.pyenv/versions/3.8.6/lib/python3.8/pickle.py", line 576, in save
rv = reduce(self.proto)
File "stringsource", line 2, in sent2vec.Sent2vecModel.__reduce_cython__
TypeError: no default __reduce__ due to non-trivial __cinit__
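If I read the trace correctly, the same failure can be reproduced in isolation by hashing the mapped function with the Hasher class from datasets.fingerprint (the class whose hash/hash_default methods show up in the trace), since dill with recurse=True also pulls in the module-level sent2vec_model that preprocess references. A minimal check, assuming the preprocess function and sent2vec_model from above:

from datasets.fingerprint import Hasher

# Raises the same TypeError: pickling preprocess drags in the module-level
# sent2vec_model, whose Cython __cinit__ has no default __reduce__.
Hasher.hash(preprocess)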
Is there any way around this? For instance, by completely disabling the fingerprinting?
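One workaround I have been considering (I am not sure it is the intended approach) is to keep the model out of the function's globals at hashing time by loading it lazily inside preprocess, so that the hasher only ever sees a None placeholder. A rough sketch, reusing the same sent2vec_path, train_f and valid_f variables as above:

import sent2vec
from datasets import load_dataset

sent2vec_model = None  # loaded lazily so the fingerprint hasher never sees the actual model

def preprocess(sentences):
    global sent2vec_model
    if sent2vec_model is None:
        # Load the model on first use, i.e. only inside the actual map() call
        sent2vec_model = sent2vec.Sent2vecModel()
        sent2vec_model.load_model(sent2vec_path, inference_mode=True)
    embedded_sents = sent2vec_model.embed_sentences(sentences["text"])
    return {"text": sentences["text"], "embeddings": embedded_sents}

datasets = load_dataset("text", data_files={"train": train_f, "validation": valid_f})
datasets = datasets.map(preprocess, batch_size=None, batched=True)

The downside is that the fingerprint then no longer reflects the model at all, so the cache would not be invalidated if I swap in a different model. Alternatively, I noticed that Dataset.map (though apparently not DatasetDict.map) accepts a new_fingerprint argument; passing one per split might skip hashing the function entirely, but I am not sure that is how it is meant to be used.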