"""Join book categories, product descriptions and image features by ASIN.

Usage: script CATEGORIES_GZ DESCRIPTIONS_GZ IMAGE_FEATURES_BIN

Writes two files into the current directory, one record per line:
``book_descriptions_50k.json`` (the description records) and
``book_images_50k.json`` (the same records plus an ``image_feature`` list).

NOTE: records are written with ``str(dict)`` (a Python literal), not strict
JSON, despite the ``.json`` file names.
"""
import gzip
try:
    import simplejson
except ImportError:
    # simplejson is not referenced below; the import is kept so the name stays
    # available, falling back to the stdlib module when it is not installed.
    import json as simplejson
import string
import sys
import struct

ASIN_LENGTH = 10        # bytes/characters taken as the product id
FEATURE_DIM = 4096      # floats stored after each id in the feature file
FEATURE_FORMAT = "%df" % FEATURE_DIM   # native byte order, 4-byte floats
MAX_RECORDS = 50000     # stop after writing this many matched records
DESCRIPTIONS_OUT = "book_descriptions_50k.json"
IMAGES_OUT = "book_images_50k.json"

_PUNCTUATION = frozenset(string.punctuation)
_PRODUCT_ID_TAG = 'product/productId:'
_DESCRIPTION_TAG = 'product/description:'


def _openGzipText(filename):
    """Open a gzip file for line-wise reading as native ``str`` lines."""
    if sys.version_info[0] < 3:
        return gzip.open(filename, 'r')
    # latin-1 maps every byte to the code point of the same value, so the
    # ord(c) < 128 filter in removeNonAscii sees the same values as the raw
    # bytes would give; newline='\n' keeps line endings untranslated.
    return gzip.open(filename, 'rt', encoding='latin-1', newline='\n')


def removeNonAscii(s):
    """Return *s* with every character whose ordinal is >= 128 removed."""
    return "".join(i for i in s if ord(i) < 128)


def processText(s):
    """Normalize free text.

    Drops non-ASCII characters, lower-cases, deletes every character in
    ``string.punctuation`` and collapses whitespace runs to single spaces.
    """
    s = str(removeNonAscii(s)).lower()
    s = "".join(ch for ch in s if ch not in _PUNCTUATION)
    return ' '.join(s.split())


def parse(filename):
    """Yield one dict per record of a gzip'd ``name: value`` file.

    A line without a colon ends the current record (which is yielded even if
    empty); whatever has been collected at end of file is yielded last.
    """
    with _openGzipText(filename) as f:
        entry = {}
        for l in f:
            l = l.strip()
            colonPos = l.find(':')
            if colonPos == -1:
                yield entry
                entry = {}
                continue
            eName = l[:colonPos]
            rest = l[colonPos + 2:]   # skip the colon and the following space
            entry[eName] = rest
        yield entry


def parseCategories(filename):
    """Map ASIN -> ``{"asin/ID": ..., "categories": [...]}`` for book products.

    Lines that do not start with a space begin a new product; their first 10
    characters are the id. Indented lines starting with ``Books,`` contribute
    their second comma-separated field as a category. Products without any
    such category are omitted. Category lists are sorted.
    """
    entries = {}
    entry = None

    def flush(e):
        # Store a finished product only when it collected a Books category.
        if e is not None and e['categories']:
            e['categories'] = sorted(e['categories'])
            entries[e['asin/ID']] = e

    with _openGzipText(filename) as f:
        for l in f:
            if not l.strip():
                continue   # a blank line is neither an id nor a category
            if not l.startswith(' '):
                flush(entry)
                entry = {"asin/ID": l[:ASIN_LENGTH], "categories": set()}
            elif entry is not None and l.lstrip().startswith('Books,'):
                # Any indentation width is accepted before "Books,".
                cat1 = l.split(',')
                if len(cat1) > 1:
                    entry['categories'].add(cat1[1].strip())
    # The last product has no following header line to trigger its flush.
    flush(entry)
    return entries


def parseDescriptions(filename, es):
    """Map ASIN -> description record for products whose id is a key of *es*.

    *es* is the mapping returned by ``parseCategories``. Each stored record
    has ``asin/ID``, ``url``, ``categories`` and, when a description line was
    seen for it, ``description``. A record is stored when a line that is
    neither a product-id nor a description line follows it.
    """
    entries = {}
    entry = {}
    with _openGzipText(filename) as f:
        for l in f:
            if l.startswith(_PRODUCT_ID_TAG):
                asin = l[len(_PRODUCT_ID_TAG):].strip()[:ASIN_LENGTH]
                entry = {
                    'asin/ID': asin,
                    'url': "http://www.amazon.com/dp/" + asin,
                }
            elif l.startswith(_DESCRIPTION_TAG):
                entry['description'] = processText(l[len(_DESCRIPTION_TAG):])
            elif entry.get('asin/ID') in es:
                # Any other line (e.g. the blank separator) closes the record.
                entry['categories'] = list(es[entry['asin/ID']]['categories'])
                entries[entry['asin/ID']] = entry
    return entries


def _readImageFeatures(stream):
    """Yield ``(asin, features)`` pairs from a binary feature stream.

    Each record is ASIN_LENGTH id bytes followed by FEATURE_DIM native-order
    4-byte floats. Raises ``struct.error`` on a truncated record.
    """
    featureBytes = struct.calcsize(FEATURE_FORMAT)
    while True:
        asin = stream.read(ASIN_LENGTH)
        if not asin:
            return
        if len(asin) != ASIN_LENGTH:
            raise struct.error("truncated ASIN at end of feature file")
        features = list(struct.unpack(FEATURE_FORMAT, stream.read(featureBytes)))
        if not isinstance(asin, str):
            asin = asin.decode('latin-1')   # Python 3: bytes -> str dict key
        yield asin, features


def main(argv=None):
    """Run the join; *argv* defaults to ``sys.argv``.

    ``argv[1]`` is the categories file, ``argv[2]`` the descriptions file and
    ``argv[3]`` the binary image-feature file. Stops after MAX_RECORDS
    matched records have been written, printing the running count.
    """
    if argv is None:
        argv = sys.argv
    cats = parseCategories(argv[1])
    descs = parseDescriptions(argv[2], cats)

    count = 0
    with open(argv[3], "rb") as featureFile, \
            open(DESCRIPTIONS_OUT, 'w') as descOut, \
            open(IMAGES_OUT, 'w') as imageOut:
        for asin, imfeat in _readImageFeatures(featureFile):
            if asin not in descs:
                continue
            descOut.write(str(descs[asin]) + '\n')
            d = dict(descs[asin])   # copy so descs itself stays feature-free
            d['image_feature'] = imfeat
            imageOut.write(str(d) + '\n')
            count += 1
            print(count)
            if count >= MAX_RECORDS:
                break


if __name__ == "__main__":
    main()