# Re-encode the tokens of beeradvocate.vw as integer ids.
#
# Reads beeradvocate.vw line by line, gives every distinct
# whitespace-separated token an integer id in order of first appearance
# (starting at 0), and writes two files:
#   beeradvocate.vw2    - one line per input line: "| <id>:1 <id>:1 ..."
#   beeradvocate.words2 - one "<id> <token>" line per distinct token,
#                         in increasing id order
#
# The context managers close all three files even if processing fails
# part-way through, so no handle is leaked and buffered output is flushed.
with open("beeradvocate.vw", 'r') as src, \
        open("beeradvocate.vw2", 'w') as encoded, \
        open("beeradvocate.words2", 'w') as vocab_out:
    word_ids = {}  # token -> integer id
    vocab = []     # id -> token; a token's list index is its id

    # Iterate the file object directly so the whole input is never
    # materialized in memory at once.
    for line_no, line in enumerate(src, 1):
        # Progress indicator: print the running line count every 1000 lines.
        if line_no % 1000 == 0:
            print(line_no)

        # The first character of every line is discarded before tokenizing
        # (presumably the leading '|' of the input format -- TODO confirm).
        ids = []
        for token in line[1:].split():
            if token not in word_ids:
                word_ids[token] = len(vocab)
                vocab.append(token)
            ids.append(word_ids[token])

        # A line with no tokens is still written, as "| " plus newline.
        encoded.write('| ' + ' '.join([str(i) + ':1' for i in ids]) + '\n')

    # Dump the vocabulary so the ids can be mapped back to tokens later.
    for word_id, token in enumerate(vocab):
        vocab_out.write(str(word_id) + ' ' + token + '\n')

