class IngenxPreTokenizer(PreTokenizer):
def __init__(self):
super().__init__()
self.base_tokenizer = IngenxTokenizer()
def pre_tokenize(self, pretok: PreTokenizedString):
# text = pretok.normalized
processed = self.base_tokenizer.process_text(pretok)
normalized_tokens = []
current_offset = 0
for token in processed:
token_len = len(token)
normalized_tokens.append((
token,
(current_offset, current_offset + token_len)
))
current_offset += token_len + 1
pretok.tokens = normalized_tokens
return pretok
class IngenxTokenTrainer:
def __init__(self,df,size_dataset =240340,vocab_size=150000,min_freq = 5,batch_size=1000):
self.tokenizer = IngenxTokenizer()
self.df = df
self.size_dataset = size_dataset
self.vocab_size = vocab_size
self.min_freq = min_freq
self.batch_size=1000
self.special_tokens = ["<|unk|>","<|pad|>","</|eos|>",
"<|var|>","</|var|>","<|val|>","<|val|>",
"<|func|>","<|func|>","<|op|>","</|op|>"
]
self.training_corpus = self.preprare_dataset()
def preprare_dataset(self):
X2 = np.random.choice(len(self.df), size=self.size_dataset, replace=False)
training_texts = [f"{self.df.iloc[i]['problem']} {self.df.iloc[i]['solution']}" for i in X2]
return training_texts
def get_training_corpus(self):
dataset = self.training_corpus
with tqdm(total=len(dataset), desc="Processing training corpus", unit="batch") as pbar:
for start_idx in range(0, len(dataset), self.batch_size):
batch = dataset[start_idx : start_idx + self.batch_size]
pbar.update(len(batch))
yield batch
def train_tokenizer(self):
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = PreTokenizer.custom(IngenxPreTokenizer)
# tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(
vocab_size=self.vocab_size,
min_frequency=self.min_freq,
special_tokens=self.special_tokens
)
tokenizer.train_from_iterator(self.get_training_corpus(),trainer=trainer, length=len(self.training_corpus))
tokenizer.save("ingenx_tokenizer.json")
return tokenizer
when i run this code
i got this error
Exception Traceback (most recent call last)
<ipython-input-30-3f931020c7fd> in <cell line: 1>()
----> 1 a.train_tokenizer()
<ipython-input-27-f7ff35c251b2> in train_tokenizer(self)
37 special_tokens=self.special_tokens
38 )
---> 39 tokenizer.train_from_iterator(self.get_training_corpus(),trainer=trainer, length=len(self.training_corpus))
40 tokenizer.save("ingenx_tokenizer.json")
41 return tokenizer
Exception: TypeError: IngenxPreTokenizer.pre_tokenize() missing 1 required positional argument: 'pretok'