I am trying to map a dataset based on the-stack-smol-xs, which was first transformed using this map function:
def tokenize(code_file):
    """Add the tokenizer's string tokens and integer ids to one dataset example.

    Args:
        code_file: A dataset example (dict-like) whose "content" entry holds
            the text to tokenize.

    Returns:
        The same ``code_file`` object, with "tokens" and "token_ids" filled in
        from a single ``tokenizer.encode`` call.
    """
    # Encode once and read both views from the same result instead of running
    # the tokenizer twice over identical content.
    encoding = tokenizer.encode(code_file["content"])
    code_file["tokens"] = encoding.tokens
    code_file["token_ids"] = encoding.ids
    return code_file
# Run tokenize over the dataset with 8 worker processes and keep the mapped result.
data = data.map(tokenize, num_proc=8)
I uploaded the tokenizer used here. It is a Unigram tokenizer that was trained on the above dataset using the tokenizers library.
After performing the mapping function, I re-save the data (the steps that follow then run in a different file):
# Persist the mapped dataset under the "tokenized_data" path so it can be reloaded later.
data.save_to_disk("tokenized_data")
I then try to apply another map function, but it fails with a very long error:
# Reload the previously tokenized dataset; the path is built from vocab_size and self.data_path.
data = load_from_disk(f"data/tokenized/{vocab_size}_{self.data_path}")
# I also remove the top 1% largest files, but that's a whole separate process that doesn't affect this error
def _multisetify(code_file):
# code_file["tokens"] is always a list (I checked)
# and the type of the class(code_file["tokens"]) is always class (I checked)
code_file['tokens'] = Multiset(code_file["tokens"])
# other types I have tried that also don't work:
# Counter(code_file["tokens"])
# dict(Counter(code_file["tokens"])
# np.array((code_file["tokens"])) # this fails at 999 instead of the usual 1999 and also generates a different error, I don't want a numpy array, I just used this to see if it happens with non dict-like types
# set(code_file["tokens"]) # this does work, but I can't use a set for my needs
return code_file
# Dataset.shuffle returns a new dataset instead of shuffling in place, so the
# result must be assigned back or the shuffle has no effect.
data = data.shuffle()
# Apply _multisetify to every example in a single process and keep the result on the instance.
self.data = data.map(_multisetify, num_proc=1)
This code raises the following error every time, exactly when the map reaches example 1999. My real code is slightly different because I have a very large code base, but the above is my minimal reproducible example:
Map:  23%|██▎       | 1999/8700 [00:14<00:50, 133.90 examples/s]
Traceback (most recent call last):
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3538, in _map_single
writer.write(example)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 500, in write
self.write_examples_on_file()
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 458, in write_examples_on_file
self.write_batch(batch_examples=batch_examples)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 568, in write_batch
arrays.append(pa.array(typed_sequence))
File "pyarrow/array.pxi", line 248, in pyarrow.lib.array
File "pyarrow/array.pxi", line 112, in pyarrow.lib._handle_arrow_array_protocol
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 208, in __arrow_array__
out = cast_array_to_feature(
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/table.py", line 1804, in wrapper
return func(array, *args, **kwargs)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/table.py", line 2122, in cast_array_to_feature
raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
TypeError: Couldn't cast array of type
struct<a: int64, aa: int64, aaa: int64, aaaa: int64, aaaaa: int64, aaaaaa: int64, aaaaaaa: int64, aaaaaaaaa: int64, aaaaaaaaaaa: int64, aaab: int64, aaac: int64, aaad: int64, aaae: int64, aab: int64, aabb: int64, aabbmax: int64, aabbmin: int64, aabf: int64, aac: int64, aacc: int64, aad: int64, aadf: int64, aae: int64, aaf: int64, aalda: int64, aam: int64, aao: int64, aarch: int64, aav: int64, aawa: int64, ab: int64, aba: int64, abaa: int64, abac: int64, abaf: int64, abandon: int64, abb: int64, abba: int64, abbc: int64, abbe: int64, abbr: int64, abbrev: int64, abbrevabbrev: int64, abbrevcode: int64, abbreviated: int64, abbreviation: int64, abbreviations: int64, abbrevoffset: int64, abbrtitle: int64, abc: int64, abca: int64, abcabc: int64, abcd: int64, abcde: int64, abcdef: int64, abcdefg: int64, abcdefgh: int64, abcdefghijklmnop: int64, abd: int64, abda: int64, abe: int64, abeb: int64, abfb: int64, abfd: int64, abff: int64, abi: int64, abilities: int64, ability: int64, abindexlabeltext: int64, abits: int64, able: int64, abledasyncuser: int64, abledirecti: int64, ableheadersorted: int64, ablequeryable: int64, ableuse: int64, ablock: int64, ably: int64, abm: int64, abn: int64, abort: int64, aborted: int64, about: int64, above: int64, abp: int64, abra: int64, abri: int64, abrightsreserved: int64, abs: int64, absabs: int64, absabsabs: int64, absence: int64, absent: int64, absf: int64, absfr: int64, absl: int64, absmax: int64, absnonneg: int64, absolute: int64, absolutely: int64, a
...
b: int64, zbit: int64, zbl: int64, zc: int64, zca: int64, zch: int64, zd: int64, zde: int64, ze: int64, zeal: int64, zed: int64, zedatasizeresult: int64, zee: int64, zei: int64, zek: int64, zel: int64, zen: int64, zeni: int64, zeo: int64, zep: int64, zer: int64, zero: int64, zeroes: int64, zeroextend: int64, zerop: int64, zeropadding: int64, zeroprescription: int64, zeror: int64, zerorewrite: int64, zerozero: int64, zessizesizedsize: int64, zeta: int64, zeu: int64, zew: int64, zext: int64, zey: int64, zez: int64, zf: int64, zg: int64, zge: int64, zh: int64, zhang: int64, zi: int64, zia: int64, zie: int64, zif: int64, zig: int64, zil: int64, zilla: int64, zim: int64, zindex: int64, zio: int64, zip: int64, zipfile: int64, ziplist: int64, zipped: int64, zipunzip: int64, zir: int64, zj: int64, zk: int64, zl: int64, zlib: int64, zlibcompress: int64, zm: int64, zmzmzm: int64, zmzmzmzm: int64, zmzmzmzmzm: int64, zn: int64, zne: int64, zo: int64, zone: int64, zonefile: int64, zonesubzone: int64, zonezone: int64, zoo: int64, zoom: int64, zos: int64, zp: int64, zpo: int64, zq: int64, zr: int64, zs: int64, zsc: int64, zsh: int64, zshapeinfo: int64, zso: int64, zt: int64, zu: int64, zur: int64, zv: int64, zvmessage: int64, zvmessageerror: int64, zvmessagesprintf: int64, zvmessaget: int64, zw: int64, zwa: int64, zx: int64, zxj: int64, zy: int64, zyb: int64, zyc: int64, zyg: int64, zym: int64, zyr: int64, zyu: int64, zz: int64, zzanosupervisort: int64, zzle: int64, zzz: int64, zzzz: int64>
to
{'a': Value(dtype='int64', id=None), 'aa': Value(dtype='int64', id=None), 'aaa': Value(dtype='int64', id=None), 'aaaa': Value(dtype='int64', id=None), 'aaaaa': Value(dtype='int64', id=None), 'aaaaaa': Value(dtype='int64', id=None), 'aaab': Value(dtype='int64', id=None), 'aaad': Value(dtype='int64', id=None), 'aaae': Value(dtype='int64', id=None), 'aab': Value(dtype='int64', id=None), 'aabaa': Value(dtype='int64', id=None), 'aabb': Value(dtype='int64', id=None), 'aac': Value(dtype='int64', id=None), 'aaca': Value(dtype='int64', id=None), 'aad': Value(dtype='int64', id=None), 'aae': Value(dtype='int64', id=None), 'aaf': Value(dtype='int64', id=None), 'aarch': Value(dtype='int64', id=None), 'ab': Value(dtype='int64', id=None), 'aba': Value(dtype='int64', id=None), 'abab': Value(dtype='int64', id=None), 'aband': Value(dtype='int64', id=None), 'abandon': Value(dtype='int64', id=None), 'abb': Value(dtype='int64', id=None), 'abbr': Value(dtype='int64', id=None), 'abbreviation': Value(dtype='int64', id=None), 'abc': Value(dtype='int64', id=None), 'abcabc': Value(dtype='int64', id=None), 'abcd': Value(dtype='int64', id=None), 'abcdef': Value(dtype='int64', id=None), 'abcdefghijklmnop': Value(dtype='int64', id=None), 'abd': Value(dtype='int64', id=None), 'abda': Value(dtype='int64', id=None), 'abe': Value(dtype='int64', id=None), 'abeladdressstore': Value(dtype='int64', id=None), 'abelcommentstore': Value(dtype='int64', id=None), 'abfb': Value(dtype='int64', id=None), 'abfd': Value(dty
...
'zerozero': Value(dtype='int64', id=None), 'zerozerorefl': Value(dtype='int64', id=None), 'zerozerosuc': Value(dtype='int64', id=None), 'zerozerozero': Value(dtype='int64', id=None), 'zerozerozerozero': Value(dtype='int64', id=None), 'zerva': Value(dtype='int64', id=None), 'zeta': Value(dtype='int64', id=None), 'zext': Value(dtype='int64', id=None), 'zg': Value(dtype='int64', id=None), 'zh': Value(dtype='int64', id=None), 'zhi': Value(dtype='int64', id=None), 'zi': Value(dtype='int64', id=None), 'zif': Value(dtype='int64', id=None), 'zig': Value(dtype='int64', id=None), 'zilla': Value(dtype='int64', id=None), 'ziness': Value(dtype='int64', id=None), 'zip': Value(dtype='int64', id=None), 'zipwith': Value(dtype='int64', id=None), 'zk': Value(dtype='int64', id=None), 'zl': Value(dtype='int64', id=None), 'zlib': Value(dtype='int64', id=None), 'zm': Value(dtype='int64', id=None), 'zmap': Value(dtype='int64', id=None), 'zo': Value(dtype='int64', id=None), 'zone': Value(dtype='int64', id=None), 'zonezone': Value(dtype='int64', id=None), 'zonk': Value(dtype='int64', id=None), 'zoom': Value(dtype='int64', id=None), 'zp': Value(dtype='int64', id=None), 'zr': Value(dtype='int64', id=None), 'zs': Value(dtype='int64', id=None), 'zu': Value(dtype='int64', id=None), 'zv': Value(dtype='int64', id=None), 'zw': Value(dtype='int64', id=None), 'zx': Value(dtype='int64', id=None), 'zz': Value(dtype='int64', id=None), 'zzz': Value(dtype='int64', id=None), 'zzzzzzzz': Value(dtype='int64', id=None)}
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/caden/Documents/ncs/src/embedding_models/tf_idf.py", line 193, in <module>
test = Tfidf(vocab_size=vocab_size)
File "/Users/caden/Documents/ncs/src/embedding_models/tf_idf.py", line 69, in __init__
self.data = data.map(_multisetify, num_proc=1)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 602, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 567, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3161, in map
for rank, done, content in Dataset._map_single(**dataset_kwargs):
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3587, in _map_single
writer.finalize()
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 599, in finalize
self.write_examples_on_file()
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 458, in write_examples_on_file
self.write_batch(batch_examples=batch_examples)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 568, in write_batch
arrays.append(pa.array(typed_sequence))
File "pyarrow/array.pxi", line 248, in pyarrow.lib.array
File "pyarrow/array.pxi", line 112, in pyarrow.lib._handle_arrow_array_protocol
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/arrow_writer.py", line 208, in __arrow_array__
out = cast_array_to_feature(
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/table.py", line 1804, in wrapper
return func(array, *args, **kwargs)
File "/Users/caden/Documents/ncs/ncs-env/lib/python3.10/site-packages/datasets/table.py", line 2122, in cast_array_to_feature
raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
TypeError: Couldn't cast array of type
struct<a: int64, aa: int64, aaa: int64, aaaa: int64, aaaaa: int64, aaaaaa: int64, aaaaaaa: int64, aaaaaaaaa: int64, aaaaaaaaaaa: int64, aaab: int64, aaac: int64, aaad: int64, aaae: int64, aab: int64, aabb: int64, aabbmax: int64, aabbmin: int64, aabf: int64, aac: int64, aacc: int64, aad: int64, aadf: int64, aae: int64, aaf: int64, aalda: int64, aam: int64, aao: int64, aarch: int64, aav: int64, aawa: int64, ab: int64, aba: int64, abaa: int64, abac: int64, abaf: int64, abandon: int64, abb: int64, abba: int64, abbc: int64, abbe: int64, abbr: int64, abbrev: int64, abbrevabbrev: int64, abbrevcode: int64, abbreviated: int64, abbreviation: int64, abbreviations: int64, abbrevoffset: int64, abbrtitle: int64, abc: int64, abca: int64, abcabc: int64, abcd: int64, abcde: int64, abcdef: int64, abcdefg: int64, abcdefgh: int64, abcdefghijklmnop: int64, abd: int64, abda: int64, abe: int64, abeb: int64, abfb: int64, abfd: int64, abff: int64, abi: int64, abilities: int64, ability: int64, abindexlabeltext: int64, abits: int64, able: int64, abledasyncuser: int64, abledirecti: int64, ableheadersorted: int64, ablequeryable: int64, ableuse: int64, ablock: int64, ably: int64, abm: int64, abn: int64, abort: int64, aborted: int64, about: int64, above: int64, abp: int64, abra: int64, abri: int64, abrightsreserved: int64, abs: int64, absabs: int64, absabsabs: int64, absence: int64, absent: int64, absf: int64, absfr: int64, absl: int64, absmax: int64, absnonneg: int64, absolute: int64, absolutely: int64, a
...
b: int64, zbit: int64, zbl: int64, zc: int64, zca: int64, zch: int64, zd: int64, zde: int64, ze: int64, zeal: int64, zed: int64, zedatasizeresult: int64, zee: int64, zei: int64, zek: int64, zel: int64, zen: int64, zeni: int64, zeo: int64, zep: int64, zer: int64, zero: int64, zeroes: int64, zeroextend: int64, zerop: int64, zeropadding: int64, zeroprescription: int64, zeror: int64, zerorewrite: int64, zerozero: int64, zessizesizedsize: int64, zeta: int64, zeu: int64, zew: int64, zext: int64, zey: int64, zez: int64, zf: int64, zg: int64, zge: int64, zh: int64, zhang: int64, zi: int64, zia: int64, zie: int64, zif: int64, zig: int64, zil: int64, zilla: int64, zim: int64, zindex: int64, zio: int64, zip: int64, zipfile: int64, ziplist: int64, zipped: int64, zipunzip: int64, zir: int64, zj: int64, zk: int64, zl: int64, zlib: int64, zlibcompress: int64, zm: int64, zmzmzm: int64, zmzmzmzm: int64, zmzmzmzmzm: int64, zn: int64, zne: int64, zo: int64, zone: int64, zonefile: int64, zonesubzone: int64, zonezone: int64, zoo: int64, zoom: int64, zos: int64, zp: int64, zpo: int64, zq: int64, zr: int64, zs: int64, zsc: int64, zsh: int64, zshapeinfo: int64, zso: int64, zt: int64, zu: int64, zur: int64, zv: int64, zvmessage: int64, zvmessageerror: int64, zvmessagesprintf: int64, zvmessaget: int64, zw: int64, zwa: int64, zx: int64, zxj: int64, zy: int64, zyb: int64, zyc: int64, zyg: int64, zym: int64, zyr: int64, zyu: int64, zz: int64, zzanosupervisort: int64, zzle: int64, zzz: int64, zzzz: int64>
to
{'a': Value(dtype='int64', id=None), 'aa': Value(dtype='int64', id=None), 'aaa': Value(dtype='int64', id=None), 'aaaa': Value(dtype='int64', id=None), 'aaaaa': Value(dtype='int64', id=None), 'aaaaaa': Value(dtype='int64', id=None), 'aaab': Value(dtype='int64', id=None), 'aaad': Value(dtype='int64', id=None), 'aaae': Value(dtype='int64', id=None), 'aab': Value(dtype='int64', id=None), 'aabaa': Value(dtype='int64', id=None), 'aabb': Value(dtype='int64', id=None), 'aac': Value(dtype='int64', id=None), 'aaca': Value(dtype='int64', id=None), 'aad': Value(dtype='int64', id=None), 'aae': Value(dtype='int64', id=None), 'aaf': Value(dtype='int64', id=None), 'aarch': Value(dtype='int64', id=None), 'ab': Value(dtype='int64', id=None), 'aba': Value(dtype='int64', id=None), 'abab': Value(dtype='int64', id=None), 'aband': Value(dtype='int64', id=None), 'abandon': Value(dtype='int64', id=None), 'abb': Value(dtype='int64', id=None), 'abbr': Value(dtype='int64', id=None), 'abbreviation': Value(dtype='int64', id=None), 'abc': Value(dtype='int64', id=None), 'abcabc': Value(dtype='int64', id=None), 'abcd': Value(dtype='int64', id=None), 'abcdef': Value(dtype='int64', id=None), 'abcdefghijklmnop': Value(dtype='int64', id=None), 'abd': Value(dtype='int64', id=None), 'abda': Value(dtype='int64', id=None), 'abe': Value(dtype='int64', id=None), 'abeladdressstore': Value(dtype='int64', id=None), 'abelcommentstore': Value(dtype='int64', id=None), 'abfb': Value(dtype='int64', id=None), 'abfd': Value(dty
...
'zerozero': Value(dtype='int64', id=None), 'zerozerorefl': Value(dtype='int64', id=None), 'zerozerosuc': Value(dtype='int64', id=None), 'zerozerozero': Value(dtype='int64', id=None), 'zerozerozerozero': Value(dtype='int64', id=None), 'zerva': Value(dtype='int64', id=None), 'zeta': Value(dtype='int64', id=None), 'zext': Value(dtype='int64', id=None), 'zg': Value(dtype='int64', id=None), 'zh': Value(dtype='int64', id=None), 'zhi': Value(dtype='int64', id=None), 'zi': Value(dtype='int64', id=None), 'zif': Value(dtype='int64', id=None), 'zig': Value(dtype='int64', id=None), 'zilla': Value(dtype='int64', id=None), 'ziness': Value(dtype='int64', id=None), 'zip': Value(dtype='int64', id=None), 'zipwith': Value(dtype='int64', id=None), 'zk': Value(dtype='int64', id=None), 'zl': Value(dtype='int64', id=None), 'zlib': Value(dtype='int64', id=None), 'zm': Value(dtype='int64', id=None), 'zmap': Value(dtype='int64', id=None), 'zo': Value(dtype='int64', id=None), 'zone': Value(dtype='int64', id=None), 'zonezone': Value(dtype='int64', id=None), 'zonk': Value(dtype='int64', id=None), 'zoom': Value(dtype='int64', id=None), 'zp': Value(dtype='int64', id=None), 'zr': Value(dtype='int64', id=None), 'zs': Value(dtype='int64', id=None), 'zu': Value(dtype='int64', id=None), 'zv': Value(dtype='int64', id=None), 'zw': Value(dtype='int64', id=None), 'zx': Value(dtype='int64', id=None), 'zz': Value(dtype='int64', id=None), 'zzz': Value(dtype='int64', id=None), 'zzzzzzzz': Value(dtype='int64', id=None)}
I have tried adding multiprocessing and batching, and nothing fixes it. I don't want to go digging into the Hugging Face code to find out why it is happening.