Hello, I'm making a tokenizer based purely on entropy; here is the idea, open for discussion.
No model is needed. I'm looking for a better, fast way to encode text so that sentences can be split into input/output classifications from nothing (no training data).
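For context, the "entropy" here is just character-level Shannon entropy, H = -sum(p * log2 p) over symbol frequencies, which is the same quantity the entropy() method in the code below computes. A minimal standalone sketch (char_entropy is only an illustrative name, not part of the code below):

import math
from collections import Counter

def char_entropy(text):
    # Shannon entropy over character frequencies: H = -sum(p * log2(p))
    counts = Counter(text)
    total = len(text)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

# Entropy of the sample sentence used in the example further down.
print(char_entropy("cuando te digo vete , te aburres , corres o andas ? "
                   "cuando me dices vete , me aburro, corro y ando"))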
The code
import sys
import math
import re
class TextProcessor:
    def __init__(self, texto):
        self.texto = texto

    def entropy(self):
        # Character-level Shannon entropy of the text.
        simbolos = {}
        total_caracteres = len(self.texto)
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)
        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        # Longest common substring of the two strings.
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        subcadenas_comunes = []
        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2 and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                if k > 0:
                    subcadenas_comunes.append(cadena1[i:i + k])
        if subcadenas_comunes:
            comun = max(subcadenas_comunes, key=len)
        return comun
    def magic_split(self):
        # Pick the repeated symbol whose gaps between occurrences vary the least
        # (ignoring variations of 0 and 1) and use it as the splitter.
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                distances = [indices[i + 1] - indices[i] for i in range(len(indices) - 1)]
                symbol_distances[symbol] = distances
        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items() if distances}
        mins = {}
        for v in variation:
            if variation[v] != 0 and variation[v] != 1:
                mins[v] = variation[v]
        best_symbol = min(mins, key=mins.get)
        return best_symbol

    def rotate_string(self, string, n):
        indice = n % len(string)
        string_rotado = string[indice:] + string[:indice]
        return string_rotado
    def rotate_compare(self, tokiA, tokiB):
        if tokiA >= tokiB:
            tokA = tokiA
            tokB = tokiB
            ltokA = len(tokA)
        else:
            tokA = tokiB
            tokB = tokiA
            ltokA = len(tokB)
        i = 0
        rotations = {}
        while i < ltokA:
            tokrotated = self.rotate_string(tokA, i)
            rotations[str(i)] = self.common_string(tokrotated, tokB)
            i += 1
        # Keep the longest shared fragment found across all rotations.
        best_r = ""
        for x in rotations:
            lb = len(best_r)
            rot = rotations[x]
            lrot = len(rot)
            if lrot > 1 and lrot < ltokA and lrot > lb:
                best_r = rot
        return best_r

    def get_subTokens(self, spl):
        # Compare every chunk against every other chunk and collect the shared fragments.
        sub_tokens = self.texto.split(spl)
        toks = []
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.append(self.rotate_compare(tok, tok2))
        return list(set(toks))
    def tokenize(self, spliter_optimo):
        # Split on the optimal splitter, then mark the longest known sub-token inside each chunk.
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        chunk = self.texto.split(spliter_optimo)
        for txt in chunk:
            best_split = ""
            if len(txt) < 3:
                tokenized_sentence[txt] = txt
            else:
                for tok in tokens:
                    if tok != "":
                        lt = len(tok)
                        lb = len(best_split)
                        spltxt = txt.split(tok)
                        if len(spltxt) > 1:
                            if lt < len(txt) and lt > lb:
                                best_split = tok
                                tokenized_sentence[txt] = " " + spltxt[0] + "-" + tok + "-" + spltxt[1]
        return tokenized_sentence

    def symbol_distances(self, texto, tokens):
        # Mark every known token in the text and split on the marks.
        # (Ideally the tokens would be applied longest-first to guarantee the longest
        # possible split; here they are applied in whatever order the set yields them.)
        txt = texto
        for tok in tokens:
            if tok != '':
                txt = txt.replace(tok, "-" + tok + "-")
        arr = txt.split("-")
        return [elem for elem in arr if elem != '']
    def distances(self, tokens):
        tokens_unicos = {}
        for i, token in enumerate(tokens):
            if token not in tokens_unicos:
                tokens_unicos[token] = [i]
            else:
                tokens_unicos[token].append(i)
        return tokens_unicos

    def from_distances(self, tokens_distancias):
        rebuild = {}
        for tok in tokens_distancias:
            for dis in tokens_distancias[tok]:
                rebuild[dis] = tok
        return {k: rebuild[k] for k in sorted(rebuild)}
# Example usage:
texto_ejemplo = "cuando te digo vete , te aburres , corres o andas ? cuando me dices vete , me aburro, corro y ando"
processor = TextProcessor(texto_ejemplo)
spliter_optimo = processor.magic_split()
tokenized_sentence = processor.tokenize(spliter_optimo)
token_txt = ""
for token in tokenized_sentence:
    token_txt += "-" + tokenized_sentence[token]
tokens = set(token_txt.split("-"))
symb = processor.symbol_distances(texto_ejemplo, tokens)
print("Tokens")
print(tokens)
print("Number of symbols in tokens:")
print(len(tokens))
print("Number of symbols in chars:")
print(len(set(texto_ejemplo)))
print("Length of text", len(texto_ejemplo))
print("Original text:", texto_ejemplo)
print("Optimal splitter:", spliter_optimo)
print("Tokenized sentence:", tokenized_sentence)
print("Length tokenized", len(tokenized_sentence))
print("Token Sentences", symb)
print("Length Token Sentence", len(symb))
distances = processor.distances(symb)
print("Token Distances", distances)
print("Token Distance Length", len(distances))
print(processor.from_distances(distances))
The Result
Tokens
{'', ' a', '?', 'o,', 'me', ' ', ' co', 'aburr', 'o', 'ndo', 'rres', 'corr', 'di', 'ando', 'es', 'and', ' cu', 'go', 'y', ',', 'ces', 'te', ' ve', 'as'}
Number of symbols in tokens:
24
Number of symbols in chars:
19
Length of text 99
Original text: cuando te digo vete , te aburres , corres o andas ? cuando me dices vete , me aburro, corro y ando
Optimal splitter:
Tokenized sentence: {'cuando': ' cu-ando-', 'te': 'te', 'digo': ' -di-go', 'vete': ' ve-te-', ',': ',', 'aburres': ' -aburr-es', 'corres': ' co-rres-', 'o': 'o', 'andas': ' -and-as', '': '', '?': '?', 'me': 'me', 'dices': ' -di-ces', 'aburro,': ' -aburr-o,', 'corro': ' -corr-o', 'y': 'y', 'ando': ' a-ndo-'}
Length tokenized 17
Token Sentences ['cu', 'and', 'o', ' ', 'te', ' ', 'di', 'g', 'o', ' ', 've', 'te', ' ', ',', ' ', 'te', ' ', 'a', 'bu', 'rr', 'es', ' ', ',', ' ', 'c', 'o', 'rr', 'es', ' ', 'o', ' ', 'a', 'nd', 'as', ' ', ' ', '?', ' ', 'cu', 'and', 'o', ' ', 'me', ' ', 'di', 'c', 'es', ' ', 've', 'te', ' ', ',', ' ', 'me', ' ', 'a', 'burr', 'o', ',', ' ', 'c', 'o', 'rr', 'o', ' ', 'y', ' ', 'a', 'nd', 'o']
Length Token Sentence 70
Token Distances {'cu': [0, 38], 'and': [1, 39], 'o': [2, 8, 25, 29, 40, 57, 61, 63, 69], ' ': [3, 5, 9, 12, 14, 16, 21, 23, 28, 30, 34, 35, 37, 41, 43, 47, 50, 52, 54, 59, 64, 66], 'te': [4, 11, 15, 49], 'di': [6, 44], 'g': [7], 've': [10, 48], ',': [13, 22, 51, 58], 'a': [17, 31, 55, 67], 'bu': [18], 'rr': [19, 26, 62], 'es': [20, 27, 46], 'c': [24, 45, 60], 'nd': [32, 68], 'as': [33], '?': [36], 'me': [42, 53], 'burr': [56], 'y': [65]}
Token Distance Length 20
{0: 'cu', 1: 'and', 2: 'o', 3: ' ', 4: 'te', 5: ' ', 6: 'di', 7: 'g', 8: 'o', 9: ' ', 10: 've', 11: 'te', 12: ' ', 13: ',', 14: ' ', 15: 'te', 16: ' ', 17: 'a', 18: 'bu', 19: 'rr', 20: 'es', 21: ' ', 22: ',', 23: ' ', 24: 'c', 25: 'o', 26: 'rr', 27: 'es', 28: ' ', 29: 'o', 30: ' ', 31: 'a', 32: 'nd', 33: 'as', 34: ' ', 35: ' ', 36: '?', 37: ' ', 38: 'cu', 39: 'and', 40: 'o', 41: ' ', 42: 'me', 43: ' ', 44: 'di', 45: 'c', 46: 'es', 47: ' ', 48: 've', 49: 'te', 50: ' ', 51: ',', 52: ' ', 53: 'me', 54: ' ', 55: 'a', 56: 'burr', 57: 'o', 58: ',', 59: ' ', 60: 'c', 61: 'o', 62: 'rr', 63: 'o', 64: ' ', 65: 'y', 66: ' ', 67: 'a', 68: 'nd', 69: 'o'}
The idea is to group information into the best encoding available, preferring the biggest symbols with the most repetitions. Splitting words this way, we can find patterns to split sentences into input/output pairs with no models.
This can be really fast, and it may also have applications in compression; a rough size estimate is sketched right after this paragraph.
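To make the compression angle concrete, here is a back-of-the-envelope sketch, not a real compressor: it assumes fixed-length codes, reuses texto_ejemplo and symb from the example run above, and ignores the cost of storing the token table itself.

import math

# Rough size comparison: raw characters vs. the token sequence from symbol_distances().
char_vocab = len(set(texto_ejemplo))    # 19 distinct characters in the sample text
token_vocab = len(set(symb))            # 20 distinct tokens in the token sequence
char_bits = len(texto_ejemplo) * math.log2(char_vocab)   # 99 chars at log2(19) bits each
token_bits = len(symb) * math.log2(token_vocab)           # 70 tokens at log2(20) bits each
print(f"chars : {len(texto_ejemplo)} symbols x {math.log2(char_vocab):.2f} bits = {char_bits:.0f} bits")
print(f"tokens: {len(symb)} symbols x {math.log2(token_vocab):.2f} bits = {token_bits:.0f} bits")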
What do you think?
Suggestions, ideas?