# Entropy tokenizer

Hello, I'm making a tokenizer based purely on entropy; here is the idea, for discussion.

It does not need a model. I'm looking for a better and faster way to encode text, so that sentences can be split for input/output classification starting from nothing.

### The code

``````python
import sys
import math
import re

class TextProcessor:
    """Experimental, model-free tokenizer driven by symbol statistics.

    The processor holds one text (``self.texto``) and offers helpers to:

    * measure the character entropy of the text (``entropy``),
    * pick a separator symbol from the regularity of its occurrences
      (``magic_split``),
    * derive candidate sub-word tokens from substrings shared between the
      separated chunks (``get_subTokens`` / ``tokenize``),
    * cut a text into token pieces and index them by position
      (``symbol_distances`` / ``distances`` / ``from_distances``).
    """

    def __init__(self, texto):
        """Store the text to analyse."""
        self.texto = texto

    def entropy(self):
        """Return ``(counts, entropy)`` for the characters of the text.

        ``counts`` maps each character to its number of occurrences and
        ``entropy`` is the Shannon entropy in bits per character.  An empty
        text yields ``({}, 0)``.
        """
        simbolos = {}
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        total_caracteres = len(self.texto)
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest substring shared by ``cadena1`` and ``cadena2``.

        When several substrings share the maximal length, the one found
        first (scanning ``cadena1`` left to right, then ``cadena2``) wins.
        Returns ``''`` when the strings have no character in common.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''

        for i in range(longitud1):
            for j in range(longitud2):
                # Extend the match starting at (i, j) as far as it goes.
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Strict comparison keeps the earliest match on ties.
                if k > len(comun):
                    comun = cadena1[i:i + k]

        return comun

    def magic_split(self):
        """Choose the separator symbol for the text.

        For every symbol occurring at least twice, the gaps between
        consecutive occurrences are measured and their spread
        (``max - min``) is computed.  Spreads of 0 and 1 are discarded and
        the symbol with the smallest remaining spread is returned; ties go
        to the symbol that appears first in the text.

        Raises:
            ValueError: if no symbol qualifies (for example an empty text
                or a text in which no symbol repeats).
        """
        # One pass over the text; dict order records first appearance,
        # which makes the tie-break below deterministic.
        posiciones = {}
        for i, char in enumerate(self.texto):
            posiciones.setdefault(char, []).append(i)

        mins = {}
        for symbol, indices in posiciones.items():
            if len(indices) < 2:
                continue
            gaps = [b - a for a, b in zip(indices, indices[1:])]
            variation = max(gaps) - min(gaps)
            if variation != 0 and variation != 1:
                mins[symbol] = variation

        if not mins:
            raise ValueError(
                "magic_split: no symbol has a gap variation greater than 1"
            )

        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Return ``string`` rotated left by ``n`` positions (modulo length).

        An empty string is returned unchanged.
        """
        if not string:
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Return the best substring shared by ``tokiB``/``tokiA`` rotations.

        The lexicographically greater of the two strings is rotated through
        ``len(tokiA)`` positions and each rotation is compared with the
        other string using ``common_string``.  The longest shared substring
        that is longer than one character and shorter than ``tokiA`` is
        returned (first one found on ties), or ``''`` if there is none.
        """
        # NOTE(review): the operands are ordered with a lexicographic
        # string comparison and the rotation count / length limit always
        # comes from ``tokiA``; kept as written - confirm whether a length
        # comparison was intended.
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        ltokA = len(tokiA)

        best_r = ""
        for i in range(ltokA):
            rot = self.common_string(self.rotate_string(tokA, i), tokB)
            if 1 < len(rot) < ltokA and len(rot) > len(best_r):
                best_r = rot

        return best_r

    def get_subTokens(self, spl):
        """Return candidate tokens shared between chunks of the text.

        The text is split on ``spl`` and every ordered pair of distinct
        chunks is passed to ``rotate_compare``.  The distinct results are
        returned as a sorted list so that callers see the same order on
        every run (plain set order varies with string hash randomization).
        The list may contain ``''`` when some pair shares nothing usable.
        """
        # Repeated chunks would only repeat identical comparisons.
        chunks = list(dict.fromkeys(self.texto.split(spl)))
        toks = {
            self.rotate_compare(tok, tok2)
            for tok in chunks
            for tok2 in chunks
            if tok != tok2
        }
        return sorted(toks)

    def tokenize(self, spliter_optimo):
        """Split each chunk of the text around its longest candidate token.

        Returns a dict mapping each distinct chunk to its split form:

        * chunks shorter than 3 characters map to themselves;
        * otherwise the longest candidate token that occurs in the chunk
          and is shorter than it is used, and the value is
          ``" " + before + "-" + token + "-" + after`` where ``before`` and
          ``after`` are the text around the first occurrence of the token;
        * chunks of 3 or more characters containing no candidate token get
          no entry.
        """
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}

        for txt in self.texto.split(spliter_optimo):
            if len(txt) < 3:
                tokenized_sentence[txt] = txt
                continue

            best_split = ""
            for tok in tokens:
                if tok == "":
                    continue
                # partition keeps everything after the first occurrence,
                # so repeated occurrences of tok do not lose text.
                antes, encontrado, despues = txt.partition(tok)
                if not encontrado:
                    continue
                if len(best_split) < len(tok) < len(txt):
                    best_split = tok
                    tokenized_sentence[txt] = (
                        " " + antes + "-" + tok + "-" + despues
                    )

        return tokenized_sentence

    def symbol_distances(self, texto, tokens):
        """Cut ``texto`` into pieces around the given tokens.

        Every occurrence of each non-empty token is wrapped in ``-`` marks
        and the text is then split on ``-``; empty pieces are dropped.

        NOTE: ``-`` is used as the internal marker, so any ``-`` already
        present in ``texto`` is consumed by the final split.
        """
        # Sort the tokens by descending length (ties alphabetically) so the
        # longest tokens are marked first and the result does not depend on
        # the iteration order of ``tokens``.  Shorter tokens applied later
        # can still subdivide a longer token that contains them.
        ordenados = sorted(tokens, key=lambda tok: (-len(tok), tok))

        txt = texto
        for tok in ordenados:
            if tok != '':
                txt = txt.replace(tok, "-" + tok + "-")

        return [elem for elem in txt.split("-") if elem != '']

    def distances(self, tokens):
        """Map each distinct token to the list of positions where it occurs.

        Positions are indices into ``tokens``, in increasing order.
        """
        tokens_unicos = {}
        for i, token in enumerate(tokens):
            tokens_unicos.setdefault(token, []).append(i)
        return tokens_unicos

    def from_distances(self, tokens_distancias):
        """Invert ``distances``: return ``{position: token}`` sorted by position.

        If the same position is listed under several tokens, the token
        iterated last wins.
        """
        rebuild = {}
        for tok, posiciones in tokens_distancias.items():
            for dis in posiciones:
                rebuild[dis] = tok
        return {k: rebuild[k] for k in sorted(rebuild)}

# Example usage: tokenize a sample sentence and print the intermediate results.
texto_ejemplo = "cuando te digo vete , te aburres , corres o andas  ? cuando me dices vete , me aburro, corro y ando"
processor = TextProcessor(texto_ejemplo)
spliter_optimo = processor.magic_split()
tokenized_sentence = processor.tokenize(spliter_optimo)

# Join every split form with "-" and split again to collect the distinct
# pieces (the set also contains '' and pieces carrying the leading space
# that tokenize() adds to split forms).
token_txt = "".join("-" + piece for piece in tokenized_sentence.values())
tokens = set(token_txt.split("-"))
symb = processor.symbol_distances(texto_ejemplo, tokens)

print("Tokens")
print(tokens)

print("Number of symbols in tokens:")
print(len(tokens))

print("Number of symbols in chars:")
print(len(set(texto_ejemplo)))
print("Length of text", len(texto_ejemplo))

print("Texto original:", texto_ejemplo)
print("Spliter óptimo:", spliter_optimo)
# The posted sample output shows this line, so the mapping is printed here.
print("Frase tokenizada:", tokenized_sentence)
print("Length tokenized", len(tokenized_sentence))
print("Token Sentences", symb)
print("Lenght Token Sentence", len(symb))

# Index each piece by the positions at which it occurs, then rebuild the
# position -> piece sequence from that index.
distances = processor.distances(symb)

print("Token Distances", distances)
print("Token Distance Length", len(distances))

print(processor.from_distances(distances))

``````

### The Result

``````
Tokens
{'', ' a', '?', 'o,', 'me', ' ', ' co', 'aburr', 'o', 'ndo', 'rres', 'corr', 'di', 'ando', 'es', 'and', ' cu', 'go', 'y', ',', 'ces', 'te', ' ve', 'as'}
Number of symbols in tokens:
24
Number of symbols in chars:
19
Length of text 99
Texto original: cuando te digo vete , te aburres , corres o andas  ? cuando me dices vete , me aburro, corro y ando
Spliter óptimo:
Frase tokenizada: {'cuando': ' cu-ando-', 'te': 'te', 'digo': ' -di-go', 'vete': ' ve-te-', ',': ',', 'aburres': ' -aburr-es', 'corres': ' co-rres-', 'o': 'o', 'andas': ' -and-as', '': '', '?': '?', 'me': 'me', 'dices': ' -di-ces', 'aburro,': ' -aburr-o,', 'corro': ' -corr-o', 'y': 'y', 'ando': ' a-ndo-'}
Length tokenized 17
Token Sentences ['cu', 'and', 'o', ' ', 'te', ' ', 'di', 'g', 'o', ' ', 've', 'te', ' ', ',', ' ', 'te', ' ', 'a', 'bu', 'rr', 'es', ' ', ',', ' ', 'c', 'o', 'rr', 'es', ' ', 'o', ' ', 'a', 'nd', 'as', ' ', ' ', '?', ' ', 'cu', 'and', 'o', ' ', 'me', ' ', 'di', 'c', 'es', ' ', 've', 'te', ' ', ',', ' ', 'me', ' ', 'a', 'burr', 'o', ',', ' ', 'c', 'o', 'rr', 'o', ' ', 'y', ' ', 'a', 'nd', 'o']
Lenght Token Sentence 70
Token Distances {'cu': [0, 38], 'and': [1, 39], 'o': [2, 8, 25, 29, 40, 57, 61, 63, 69], ' ': [3, 5, 9, 12, 14, 16, 21, 23, 28, 30, 34, 35, 37, 41, 43, 47, 50, 52, 54, 59, 64, 66], 'te': [4, 11, 15, 49], 'di': [6, 44], 'g': [7], 've': [10, 48], ',': [13, 22, 51, 58], 'a': [17, 31, 55, 67], 'bu': [18], 'rr': [19, 26, 62], 'es': [20, 27, 46], 'c': [24, 45, 60], 'nd': [32, 68], 'as': [33], '?': [36], 'me': [42, 53], 'burr': [56], 'y': [65]}
Token Distance Length 20
{0: 'cu', 1: 'and', 2: 'o', 3: ' ', 4: 'te', 5: ' ', 6: 'di', 7: 'g', 8: 'o', 9: ' ', 10: 've', 11: 'te', 12: ' ', 13: ',', 14: ' ', 15: 'te', 16: ' ', 17: 'a', 18: 'bu', 19: 'rr', 20: 'es', 21: ' ', 22: ',', 23: ' ', 24: 'c', 25: 'o', 26: 'rr', 27: 'es', 28: ' ', 29: 'o', 30: ' ', 31: 'a', 32: 'nd', 33: 'as', 34: ' ', 35: ' ', 36: '?', 37: ' ', 38: 'cu', 39: 'and', 40: 'o', 41: ' ', 42: 'me', 43: ' ', 44: 'di', 45: 'c', 46: 'es', 47: ' ', 48: 've', 49: 'te', 50: ' ', 51: ',', 52: ' ', 53: 'me', 54: ' ', 55: 'a', 56: 'burr', 57: 'o', 58: ',', 59: ' ', 60: 'c', 61: 'o', 62: 'rr', 63: 'o', 64: ' ', 65: 'y', 66: ' ', 67: 'a', 68: 'nd', 69: 'o'}

``````

The idea is to group information in the most efficient encoding, using the largest available symbols that repeat most often. By splitting words this way we can find patterns for splitting sentences into inputs/outputs with no models.
This approach can be really fast, and may also have applications in compression.

What do you think ?

Suggestions, ideas ?