shithub: zelda3

ref: 43db8cc0f529a2b8004a684526d91582d0d6d5d7
dir: /other/make_text_dict.py/

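# Builds a candidate text-compression dictionary for the game's dialogue:
# dialogue.txt is tokenized, the most profitable n-grams are greedily pulled
# out into dictionary entries, and the re-tokenized lines are printed.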
import array
import collections

# Token interning: memos maps token string -> id, memoslist maps id -> string.
memos = {}
memoslist = []
def memo(s):
  """Return the integer id for token string s, assigning a new one if needed."""
  m = memos.get(s)
  if m is None:
    m = len(memoslist)
    memos[s] = m
    memoslist.append(s)
  return m

def tos(s): return "".join(memoslist[c] for c in s)  # token ids -> string

lines = []
for line in open('dialogue.txt', 'r').read().splitlines():
  line = line.split(': ', 1)[1]  # keep only the text after the first ': '

  r = array.array('H')

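  # Tokenize the line: a bracketed control code like [...] becomes a single
  # token, every other character becomes its own token.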
  i = 0
  while i < len(line):
    if line[i] == '[':
      j = line.index(']', i + 1)
      r.append(memo(line[i:j+1]))
      i = j + 1
    else:
      r.append(memo(line[i]))
      i += 1
      
  #print(repr(line))
  #print(r)
  lines.append(list(r))


def find_all_ngrams(lines, N, cost):
  """Return the most frequent length-N token sequence and its estimated savings.

  Each of the `count` occurrences shrinks from N tokens to `cost`, and the
  dictionary entry itself costs N plus 2 of overhead for the dict.
  """
  ctr = collections.Counter()
  for line in lines:
    for i in range(len(line) - N + 1):
      # Skip n-grams that start with two identical tokens, presumably because
      # overlapping matches in runs of repeated characters (e.g. spaces) would
      # inflate the count.
      if line[i] != line[i+1]:
        ctr[tuple(line[i:i+N])] += 1
  r = [(count, ngram) for ngram, count in ctr.items() if count >= 2]
  if not r:
    return None, 0
  count, ngram = max(r)
  return ngram, (N - cost) * count - N - 2

def find_best_ngram(cost):
  """Try n-gram lengths 2..31 and return the best (score, ngram) pair."""
  best_score = 0
  best_text = None
  for i in range(2, 32):
    text, score = find_all_ngrams(lines, i, cost)
    if score > best_score:
      best_score = score
      best_text = text
  return best_score, best_text

def update_ngrams(lines, replace_from, replace_to):
  """Replace every occurrence of the token tuple replace_from with replace_to, in place."""
  for line in lines:
    # replace_to is shorter than replace_from here, so the line only shrinks
    # and iterating over the original range is safe: stale indices near the
    # end yield short slices that never match.
    for i in range(len(line) - len(replace_from) + 1):
      if tuple(line[i:i+len(replace_from)]) == replace_from:
        line[i:i+len(replace_from)] = replace_to

total_gain = 0

original_tokens = sum(len(line) for line in lines)


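# Presumably the game's existing US text dictionary (judging by the name),
# kept here for reference; it is not referenced elsewhere in this script.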
kTextDictionary_US = [
'    ', '   ', '  ', "'s ", 'and ', 
'are ', 'all ', 'ain', 'and', 'at ', 
'ast', 'an', 'at', 'ble', 'ba', 
'be', 'bo', 'can ', 'che', 'com', 
'ck', 'des', 'di', 'do', 'en ', 
'er ', 'ear', 'ent', 'ed ', 'en', 
'er', 'ev', 'for', 'fro', 'give ', 
'get', 'go', 'have', 'has', 'her', 
'hi', 'ha', 'ight ', 'ing ', 'in', 
'is', 'it', 'just', 'know', 'ly ', 
'la', 'lo', 'man', 'ma', 'me', 
'mu', "n't ", 'non', 'not', 'open', 
'ound', 'out ', 'of', 'on', 'or', 
'per', 'ple', 'pow', 'pro', 're ', 
're', 'some', 'se', 'sh', 'so', 
'st', 'ter ', 'thin', 'ter', 'tha', 
'the', 'thi', 'to', 'tr', 'up', 
'ver', 'with', 'wa', 'we', 'wh', 
'wi', 'you', 'Her', 'Tha', 'The', 
'Thi', 'You', 
]


dictionary = []

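# Greedily extract up to 111+256 dictionary entries. The cost passed in is the
# size of a reference to the entry: entries below index 111 appear to be
# addressable with one token, later ones with two.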
for i in range(111+256):
  best_score, best_text = find_best_ngram(1 if i < 111 else 2)
  if best_score == 0:
    break

  total_gain += best_score

  print(f'Removed best n-gram "{tos(best_text)}" with gain {best_score}, total gain {total_gain} / {original_tokens}')

  dictionary.append(best_text)

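  # Replace every occurrence with a single fresh token, rendered as "{text}",
  # so that later passes treat the extracted phrase as one symbol.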
  update_ngrams(lines, best_text, [memo('{%s}' % tos(best_text))])

#print('kTextDictionary_NEW = [')
#for i, d in enumerate(dictionary):
#  repl = tos(d).replace('{', '').replace('}', '')
#  print(f'{repr(repl)},')
#print(']')


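# Dump the re-tokenized lines so the compressed text can be inspected.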
for i, a in enumerate(lines):
  print(i, tos(a))