Hello, I'm making a tokenizer based purely on entropy; here is the idea, open for discussion.
No model is needed. I'm looking for a better, fast way to encode text so that sentences can be split into input/output classifications from nothing (no training data).
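For context, the "entropy" here is just character-level Shannon entropy, H = -sum(p * log2 p) over symbol frequencies, which is the same quantity the entropy() method in the code below computes. A minimal standalone sketch (char_entropy is only an illustrative name, not part of the code below):

import math
from collections import Counter

def char_entropy(text):
    # Shannon entropy over character frequencies: H = -sum(p * log2(p))
    counts = Counter(text)
    total = len(text)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

# Entropy of the sample sentence used in the example further down.
print(char_entropy("cuando te digo vete , te aburres , corres o andas ? "
                   "cuando me dices vete , me aburro, corro y ando"))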
The code
import sys
import math
import re
class TextProcessor:
    def __init__(self, texto):
        self.texto = texto

    def entropy(self):
        # Character-level Shannon entropy of the text.
        simbolos = {}
        total_caracteres = len(self.texto)
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)
        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        # Longest common substring of the two strings.
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        subcadenas_comunes = []
        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2 and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                if k > 0:
                    subcadenas_comunes.append(cadena1[i:i + k])
        if subcadenas_comunes:
            comun = max(subcadenas_comunes, key=len)
        return comun
    def magic_split(self):
        # Pick the repeated symbol whose gaps between occurrences vary the least
        # (ignoring variations of 0 and 1) and use it as the splitter.
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                distances = [indices[i + 1] - indices[i] for i in range(len(indices) - 1)]
                symbol_distances[symbol] = distances
        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items() if distances}
        mins = {}
        for v in variation:
            if variation[v] != 0 and variation[v] != 1:
                mins[v] = variation[v]
        best_symbol = min(mins, key=mins.get)
        return best_symbol

    def rotate_string(self, string, n):
        indice = n % len(string)
        string_rotado = string[indice:] + string[:indice]
        return string_rotado
    def rotate_compare(self, tokiA, tokiB):
        if tokiA >= tokiB:
            tokA = tokiA
            tokB = tokiB
            ltokA = len(tokA)
        else:
            tokA = tokiB
            tokB = tokiA
            ltokA = len(tokB)
        i = 0
        rotations = {}
        while i < ltokA:
            tokrotated = self.rotate_string(tokA, i)
            rotations[str(i)] = self.common_string(tokrotated, tokB)
            i += 1
        # Keep the longest shared fragment found across all rotations.
        best_r = ""
        for x in rotations:
            lb = len(best_r)
            rot = rotations[x]
            lrot = len(rot)
            if lrot > 1 and lrot < ltokA and lrot > lb:
                best_r = rot
        return best_r

    def get_subTokens(self, spl):
        # Compare every chunk against every other chunk and collect the shared fragments.
        sub_tokens = self.texto.split(spl)
        toks = []
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.append(self.rotate_compare(tok, tok2))
        return list(set(toks))
    def tokenize(self, spliter_optimo):
        # Split on the optimal splitter, then mark the longest known sub-token inside each chunk.
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        chunk = self.texto.split(spliter_optimo)
        for txt in chunk:
            best_split = ""
            if len(txt) < 3:
                tokenized_sentence[txt] = txt
            else:
                for tok in tokens:
                    if tok != "":
                        lt = len(tok)
                        lb = len(best_split)
                        spltxt = txt.split(tok)
                        if len(spltxt) > 1:
                            if lt < len(txt) and lt > lb:
                                best_split = tok
                                tokenized_sentence[txt] = " " + spltxt[0] + "-" + tok + "-" + spltxt[1]
        return tokenized_sentence

    def symbol_distances(self, texto, tokens):
        # Mark every known token in the text and split on the marks.
        # (Ideally the tokens would be applied longest-first to guarantee the longest
        # possible split; here they are applied in whatever order the set yields them.)
        txt = texto
        for tok in tokens:
            if tok != '':
                txt = txt.replace(tok, "-" + tok + "-")
        arr = txt.split("-")
        return [elem for elem in arr if elem != '']
    def distances(self, tokens):
        tokens_unicos = {}
        for i, token in enumerate(tokens):
            if token not in tokens_unicos:
                tokens_unicos[token] = [i]
            else:
                tokens_unicos[token].append(i)
        return tokens_unicos

    def from_distances(self, tokens_distancias):
        rebuild = {}
        for tok in tokens_distancias:
            for dis in tokens_distancias[tok]:
                rebuild[dis] = tok
        return {k: rebuild[k] for k in sorted(rebuild)}
# Example usage:
texto_ejemplo = "cuando te digo vete , te aburres , corres o andas ? cuando me dices vete , me aburro, corro y ando"
processor = TextProcessor(texto_ejemplo)
spliter_optimo = processor.magic_split()
tokenized_sentence = processor.tokenize(spliter_optimo)
token_txt = ""
for token in tokenized_sentence:
    token_txt += "-" + tokenized_sentence[token]
tokens = set(token_txt.split("-"))
symb = processor.symbol_distances(texto_ejemplo, tokens)
print("Tokens")
print(tokens)
print("Number of symbols in tokens:")
print(len(tokens))
print("Number of symbols in chars:")
print(len(set(texto_ejemplo)))
print("Length of text", len(texto_ejemplo))
print("Original text:", texto_ejemplo)
print("Optimal splitter:", spliter_optimo)
print("Tokenized sentence:", tokenized_sentence)
print("Length tokenized", len(tokenized_sentence))
print("Token Sentences", symb)
print("Length Token Sentence", len(symb))
distances = processor.distances(symb)
print("Token Distances", distances)
print("Token Distance Length", len(distances))
print(processor.from_distances(distances))
The Result
Tokens
{'', ' a', '?', 'o,', 'me', ' ', ' co', 'aburr', 'o', 'ndo', 'rres', 'corr', 'di', 'ando', 'es', 'and', ' cu', 'go', 'y', ',', 'ces', 'te', ' ve', 'as'}
Number of symbols in tokens:
24
Number of symbols in chars:
19
Length of text 99
Original text: cuando te digo vete , te aburres , corres o andas ? cuando me dices vete , me aburro, corro y ando
Optimal splitter:
Tokenized sentence: {'cuando': ' cu-ando-', 'te': 'te', 'digo': ' -di-go', 'vete': ' ve-te-', ',': ',', 'aburres': ' -aburr-es', 'corres': ' co-rres-', 'o': 'o', 'andas': ' -and-as', '': '', '?': '?', 'me': 'me', 'dices': ' -di-ces', 'aburro,': ' -aburr-o,', 'corro': ' -corr-o', 'y': 'y', 'ando': ' a-ndo-'}
Length tokenized 17
Token Sentences ['cu', 'and', 'o', ' ', 'te', ' ', 'di', 'g', 'o', ' ', 've', 'te', ' ', ',', ' ', 'te', ' ', 'a', 'bu', 'rr', 'es', ' ', ',', ' ', 'c', 'o', 'rr', 'es', ' ', 'o', ' ', 'a', 'nd', 'as', ' ', ' ', '?', ' ', 'cu', 'and', 'o', ' ', 'me', ' ', 'di', 'c', 'es', ' ', 've', 'te', ' ', ',', ' ', 'me', ' ', 'a', 'burr', 'o', ',', ' ', 'c', 'o', 'rr', 'o', ' ', 'y', ' ', 'a', 'nd', 'o']
Length Token Sentence 70
Token Distances {'cu': [0, 38], 'and': [1, 39], 'o': [2, 8, 25, 29, 40, 57, 61, 63, 69], ' ': [3, 5, 9, 12, 14, 16, 21, 23, 28, 30, 34, 35, 37, 41, 43, 47, 50, 52, 54, 59, 64, 66], 'te': [4, 11, 15, 49], 'di': [6, 44], 'g': [7], 've': [10, 48], ',': [13, 22, 51, 58], 'a': [17, 31, 55, 67], 'bu': [18], 'rr': [19, 26, 62], 'es': [20, 27, 46], 'c': [24, 45, 60], 'nd': [32, 68], 'as': [33], '?': [36], 'me': [42, 53], 'burr': [56], 'y': [65]}
Token Distance Length 20
{0: 'cu', 1: 'and', 2: 'o', 3: ' ', 4: 'te', 5: ' ', 6: 'di', 7: 'g', 8: 'o', 9: ' ', 10: 've', 11: 'te', 12: ' ', 13: ',', 14: ' ', 15: 'te', 16: ' ', 17: 'a', 18: 'bu', 19: 'rr', 20: 'es', 21: ' ', 22: ',', 23: ' ', 24: 'c', 25: 'o', 26: 'rr', 27: 'es', 28: ' ', 29: 'o', 30: ' ', 31: 'a', 32: 'nd', 33: 'as', 34: ' ', 35: ' ', 36: '?', 37: ' ', 38: 'cu', 39: 'and', 40: 'o', 41: ' ', 42: 'me', 43: ' ', 44: 'di', 45: 'c', 46: 'es', 47: ' ', 48: 've', 49: 'te', 50: ' ', 51: ',', 52: ' ', 53: 'me', 54: ' ', 55: 'a', 56: 'burr', 57: 'o', 58: ',', 59: ' ', 60: 'c', 61: 'o', 62: 'rr', 63: 'o', 64: ' ', 65: 'y', 66: ' ', 67: 'a', 68: 'nd', 69: 'o'}
The idea is to group information into the best encoding available, preferring the biggest symbols with the most repetitions. Splitting words this way, we can find patterns to split sentences into input/output pairs with no models.
This can be really fast, and it may also have applications in compression; a rough size estimate is sketched right after this paragraph.
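To make the compression angle concrete, here is a back-of-the-envelope sketch, not a real compressor: it assumes fixed-length codes, reuses texto_ejemplo and symb from the example run above, and ignores the cost of storing the token table itself.

import math

# Rough size comparison: raw characters vs. the token sequence from symbol_distances().
char_vocab = len(set(texto_ejemplo))    # 19 distinct characters in the sample text
token_vocab = len(set(symb))            # 20 distinct tokens in the token sequence
char_bits = len(texto_ejemplo) * math.log2(char_vocab)   # 99 chars at log2(19) bits each
token_bits = len(symb) * math.log2(token_vocab)           # 70 tokens at log2(20) bits each
print(f"chars : {len(texto_ejemplo)} symbols x {math.log2(char_vocab):.2f} bits = {char_bits:.0f} bits")
print(f"tokens: {len(symb)} symbols x {math.log2(token_vocab):.2f} bits = {token_bits:.0f} bits")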
What do you think?
Suggestions, ideas?