import gzip
import sys

# f = gzip.open("aggressive_dedup.json.gz", 'w')

# rs = set()

# c = 0
# c2 = 0

# for l in gzip.open("user_dedup.json.gz"):
#   c += 1
#   r = eval(l)
#   h = hash(r['reviewText'])
#   if not (h in rs):
#     c2 += 1
#     f.write(l)
#   if not (c % 100000):
#     print c, c2
#   rs.add(h)

  
def dedup_reviews(in_path, out_path, report_every=100000):
    """Copy a gzipped review file, keeping one record per (reviewerID, asin).

    Each line of ``in_path`` is evaluated as a Python literal that must be
    indexable by the keys ``'reviewerID'`` and ``'asin'``.  The first line
    seen for every (reviewerID, asin) pair is written unchanged to
    ``out_path``; later lines with the same pair are dropped.

    Args:
        in_path: Path of the gzip-compressed input file, one record per line.
        out_path: Path of the gzip-compressed output file (overwritten).
        report_every: Print "<lines read> <lines kept>" each time this many
            input lines have been read.  A falsy value disables the
            progress output.

    Returns:
        A ``(total, kept)`` tuple: the number of lines read and the number
        of lines written.

    Raises:
        KeyError: If a record lacks ``'reviewerID'`` or ``'asin'``.
        Any exception raised by ``eval`` on a malformed line.
    """
    seen_pairs = set()
    total = 0
    kept = 0

    # Context managers guarantee both files are closed; without an explicit
    # close the gzip writer may never flush its final block and trailer,
    # leaving a truncated archive.
    with gzip.open(in_path, 'rb') as src, gzip.open(out_path, 'wb') as dst:
        for line in src:
            total += 1
            # SECURITY: eval() executes arbitrary code found in the input.
            # Only run this on trusted files; ast.literal_eval or json.loads
            # would be safer if the data format allows it.
            record = eval(line)
            pair = (record['reviewerID'], record['asin'])
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                dst.write(line)
                kept += 1
            if report_every and total % report_every == 0:
                # %-formatting prints the same text on Python 2 and 3.
                print("%d %d" % (total, kept))

    return total, kept


if __name__ == '__main__':
    # Usage: script.py INPUT.gz  ->  writes the de-duplicated INPUT.gz2
    c, c2 = dedup_reviews(sys.argv[1], sys.argv[1] + '2')
    print("%d %d" % (c, c2))
