Train_from_iterator throwing TypeError: expected string or buffer errir

im not able to figure out what im doing wrong here

class IngenxPreTokenizer:
    def __init__(self):
        super().__init__()
        self.base_tokenizer = IngenxTokenizer()
    
    def pre_tokenize(self, pretok: PreTokenizedString):
        processed = self.base_tokenizer.process_text(pretok)
        normalized_tokens = []
        current_offset = 0
        for token in processed:
            token_len = len(token)
            normalized_tokens.append((
                token,
                (current_offset, current_offset + token_len)
            ))
            current_offset += token_len + 1
            
        pretok.tokens = normalized_tokens
        return pretok
    

class IngenxTokenTrainer:
    def __init__(self,df,size_dataset =240340,vocab_size=150000,min_freq = 5,batch_size=1000):
        self.tokenizer = IngenxTokenizer()
        self.df = df
        self.size_dataset = size_dataset
        self.vocab_size = vocab_size
        self.min_freq = min_freq
        self.batch_size=1000
        self.special_tokens = ["<|unk|>","<|pad|>","</|eos|>",
                               "<|var|>","</|var|>","<|val|>","<|val|>",
                               "<|func|>","<|func|>","<|op|>","</|op|>"
                              ]
        self.training_corpus = self.preprare_dataset()
        
    def preprare_dataset(self):
        X2 = np.random.choice(len(self.df), size=self.size_dataset, replace=False)
        examples = []
        for i in X2:
            # Convert to string and handle None/NaN values
            problem = str(self.df.iloc[i]['problem']) if pd.notna(self.df.iloc[i]['problem']) else ""
            solution = str(self.df.iloc[i]['solution']) if pd.notna(self.df.iloc[i]['solution']) else ""
            example = f"{problem} {solution}".strip()
            examples.append(example)
        return examples
        
    def get_training_corpus(self):
        dataset = self.training_corpus
        with tqdm(total=len(dataset), desc="Processing training corpus", unit="text") as pbar:
            for text in dataset:
                pbar.update(1)
                yield text
                
                
    def train_tokenizer(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = PreTokenizer.custom(IngenxPreTokenizer())

        trainer = BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_freq,
            special_tokens=self.special_tokens
        )
        tokenizer.train_from_iterator(self.get_training_corpus(),trainer=trainer, length=len(self.training_corpus))
        tokenizer.save("ingenx_tokenizewr.json")
        return tokenizer

error

Exception                                 Traceback (most recent call last)
<ipython-input-47-3f931020c7fd> in <cell line: 1>()
----> 1 a.train_tokenizer()

<ipython-input-44-3011da9bf75a> in train_tokenizer(self)
     41             special_tokens=self.special_tokens
     42         )
---> 43         tokenizer.train_from_iterator(self.get_training_corpus(),trainer=trainer, length=len(self.training_corpus))
     44         tokenizer.save("ingenx_tokenizewr.json")
     45         return tokenizer

Exception: TypeError: expected string or buffer
1 Like

Could you please ensure that the dataset is not empty?

1 Like

when i use Whitespace as a PreTokenizer it work just fine but when i try to do same with my custom PreTokenizer it throw’s that error

1 Like