"""Turn crawled Flickr XML into the flat text/binary files used downstream.

The functions here read per-photo / per-user ``.etree`` files from a RAM-disk
cache (falling back to the LFS copy), merge them into one XML document, and
then derive word, tag, group, label and clique files from that document.

NOTE(review): the copy of this file that was reviewed had lost its indentation
and every ``<...>`` span inside string literals.  Block structure was rebuilt
from syntax, and the XML tag literals were rebuilt from how the elements are
read elsewhere in this module.  Everything reconstructed that way is marked
``NOTE(review)`` -- please confirm against the data files before relying on it.
"""

import commands
import md5
import os
import random
import re
import string
import struct
import sys
import thread
import threading
import time
import urllib2
from commands import getstatusoutput

import flickrapi
from elementtree.ElementTree import ElementTree
from elementtree.ElementTree import dump
from elementtree.ElementTree import fromstring
from elementtree.ElementTree import parse
from elementtree.ElementTree import tostring
from flickrapi import FlickrError
from nltk.stem.porter import PorterStemmer as stemmer

random.seed(0)

# Locations of the sharded per-item files: RAM-disk cache first, LFS second.
_RAM_ROOT = "/mnt/ram/flickr/"
_LFS_ROOT = "/lfs/1/flickr/flickr/data/"

# NOTE(review): reconstructed tag literals.  "photo" is grounded by
# infoTree.find('photo') and photo.attrib['id']; the root and per-person
# wrapper names are best guesses (readers in this module never parse the root,
# but peopleIterator depends on the per-person closing tag) -- confirm.
_PHOTOS_ROOT_OPEN = "<photos>\n"
_PHOTOS_ROOT_CLOSE = "</photos>\n"
_PHOTO_OPEN = "<photo "
_PHOTO_CLOSE = "</photo>"
_PEOPLE_ROOT_OPEN = "<people>\n"
_PEOPLE_ROOT_CLOSE = "</people>\n"
_PERSON_TAG = "person"

_PHOTO_ID_RE = re.compile(r'\sid="([^"]*)"')

_LABEL_SOURCES = ('PASCAL', 'CLEF', 'MIR', 'NUS')

# Label vocabularies of the four annotated collections, keyed by source.
_ALL_LABELS_BY_SOURCE = {
    'PASCAL': set([
        'sheep', 'horse', 'bicycle', 'aeroplane', 'cow', 'sofa', 'motorbike',
        'dog', 'cat', 'person', 'train', 'diningtable', 'bottle', 'bus',
        'pottedplant', 'tvmonitor', 'chair', 'bird', 'boat', 'car']),
    'MIR': set([
        'night', 'people', 'clouds', 'sea', 'river', 'portrait', 'male',
        'female', 'baby', 'tree', 'flower', 'dog', 'bird', 'car']),
    'CLEF': set([
        'cute', 'Partylife', 'Winter', 'Sky', 'inactive', 'Vehicle',
        'Overexposed', 'No_Blur', 'Family_Friends', 'Underexposed', 'Trees',
        'Landscape_Nature', 'Day', 'MusicalInstrument', 'River',
        'Motion_Blur', 'Summer', 'Visual_Arts', 'car', 'Out_of_focus',
        'Small_Group', 'bicycle', 'fish', 'Sports', 'Citylife',
        'Building_Sights', 'artificial', 'Water', 'Baby', 'bird', 'Plants',
        'Overall_Quality', 'Clouds', 'Work', 'Architecture', 'active',
        'bodypart', 'Still_Life', 'melancholic', 'scary', 'technical',
        'skateboard', 'Autumn', 'Portrait', 'Aesthetic_Impression', 'Toy',
        'Travel', 'Snow', 'Adult', 'female', 'Mountains', 'horse', 'Sunny',
        'euphoric', 'unpleasant', 'calm', 'Child', 'Flowers', 'Desert',
        'happy', 'Animals', 'Partly_Blurred', 'train', 'Night',
        'Single_Person', 'Painting', 'Neutral_Illumination',
        'Sunset_Sunrise', 'boring', 'Indoor', 'cat', 'Sea', 'Church',
        'Birthday', 'male', 'abstract', 'Lake', 'Teenager', 'airplane',
        'ship', 'funny', 'Food', 'Shadow', 'Rain', 'Big_Group', 'Macro',
        'insect', 'Beach_Holidays', 'Bridge', 'Outdoor', 'natural', 'Spring',
        'old_person', 'No_Persons', 'dog', 'Fancy', 'Street', 'Graffiti',
        'Park_Garden']),
    'NUS': set([
        'bridge', 'map', 'glacier', 'house', 'fish', 'sign', 'snow', 'fox',
        'protest', 'street', 'zebra', 'train', 'rocks', 'birds',
        'earthquake', 'surf', 'mountain', 'toy', 'police', 'dancing',
        'whales', 'sun', 'sky', 'lake', 'tree', 'moon', 'fire', 'window',
        'book', 'animal', 'vehicle', 'boats', 'flags', 'valley', 'beach',
        'temple', 'horses', 'plants', 'buildings', 'elk', 'garden', 'tattoo',
        'food', 'cars', 'frost', 'bear', 'water', 'sports', 'running',
        'waterfall', 'sand', 'statue', 'wedding', 'cityscape', 'leaf',
        'railroad', 'flowers', 'castle', 'rainbow', 'town', 'tiger',
        'soccer', 'clouds', 'reflection', 'cow', 'harbor', 'coral',
        'nighttime', 'dog', 'ocean', 'swimmers', 'person', 'airport',
        'plane', 'sunset', 'cat', 'military', 'tower', 'grass', 'computer',
        'road']),
}


def _all_label_names():
    """Return every "SOURCE:label" name (sorted per source) + 'taken_at_home'."""
    names = []
    for source in _LABEL_SOURCES:
        names.extend(sorted(source + ':' + l
                            for l in _ALL_LABELS_BY_SOURCE[source]))
    names.append('taken_at_home')
    return names


def _sharded_path(root, kind, ident, ext):
    """Build root/kind/<first two chars of ident>/ident+ext."""
    return root + kind + '/' + ident[:2] + '/' + ident + ext


def _cached_path(kind, ident, ext=".etree"):
    """Return the RAM-disk path if that file exists, otherwise the LFS path."""
    ram = _sharded_path(_RAM_ROOT, kind, ident, ext)
    if os.path.exists(ram):
        return ram
    return _sharded_path(_LFS_ROOT, kind, ident, ext)


def _photo_id_from_line(line):
    """Extract the id attribute from a '<photo ...>' opening-tag line.

    NOTE(review): the original extraction code was lost; this regex is a
    reconstruction that assumes the opening tag sits on one line.
    """
    match = _PHOTO_ID_RE.search(line)
    if match is None:
        raise ValueError("no id attribute in photo tag: %r" % (line,))
    return match.group(1)


def _children(element, parent, child):
    """Return element.find(parent).findall(child), or [] if parent is absent."""
    node = element.find(parent)
    if node is None:
        return []
    return node.findall(child)


def _count_photos_and_labels(photoXml, allLabels):
    """Count the photos in photoXml and how many carry each label.

    Returns (photo_count, {label_name: count}); 'taken_at_home' is counted
    for photos where takenAtHome() returns '1'.
    """
    countLabel = dict((l, 0) for l in allLabels)
    count = 0
    for photo in photoIterator(photoXml):
        for labels in photo.findall("labels"):
            for label in labels.findall("label"):
                countLabel[label.attrib['source'] + ':' + label.text] += 1
        if takenAtHome(photo) == '1':
            countLabel['taken_at_home'] += 1
        count += 1
    return count, countLabel


def _ranked_ratios(labelCounts, globalCounts, nPositive, nTotal):
    """Rank features by (frequency among positives) / (overall frequency).

    Only features seen more than once with the label are kept.  Returns
    "id:ratio" strings, highest ratio first.
    """
    ranked = []
    for k in labelCounts:
        if k not in globalCounts:
            continue
        frequencyPositive = labelCounts[k] * 1.0 / nPositive
        frequencyAll = globalCounts[k] * 1.0 / nTotal
        d = frequencyPositive * 1.0 / frequencyAll
        if labelCounts[k] > 1:
            ranked.append((d, k))
    ranked.sort()
    ranked.reverse()
    return [str(k) + ':' + str(d) for d, k in ranked]


def gso(command):
    """Run a shell command; print its output and exit(1) on nonzero status.

    Returns the (status, output) pair from commands.getstatusoutput.
    """
    s, o = commands.getstatusoutput(command)
    if s != 0:
        print("Got nonzero exit status for %s" % (command,))
        print(o)
        sys.exit(1)
    return s, o


def photoXml(idfile, output):
    """Concatenate the <photo> element of every id in idfile into output.

    Each photo's own closing tag is dropped and re-written on its own line so
    that augment() can later insert children just before it.
    """
    with open(idfile) as x:
        items = set([l.strip() for l in x])
    b = open(output, 'w')
    try:
        b.write(_PHOTOS_ROOT_OPEN)
        count = 0
        for l in items:
            l = l.strip()
            count += 1
            print(count)
            infoTree = parse(_cached_path("photos", l))
            photoTree = infoTree.find('photo')
            # [:-2] removes the closing tag line and the empty string left
            # after the trailing newline.
            for l2 in tostring(photoTree).split('\n')[:-2]:
                b.write(l2 + '\n')
            b.write(_PHOTO_CLOSE + "\n")
        b.write(_PHOTOS_ROOT_CLOSE)
    finally:
        b.close()


def _write_wrapped(b, kind, photoID, tag, dropWrapper):
    """Copy the body of a photo's <kind>.etree file into b inside <tag>.

    The first and last lines of the file are always skipped.

    NOTE(review): the original galleries/comments branches filtered lines
    with a single startswith() test whose literal was lost.  Dropping the
    file's own <tag ...> / </tag> wrapper lines is a reconstruction chosen so
    that photo.find(tag).findall(child) works on the result -- confirm.
    """
    with open(_cached_path(kind, photoID)) as x:
        lines = x.readlines()
    b.write("\t<" + tag + ">\n")
    for line in lines[1:-1]:
        if dropWrapper and line.startswith(("<" + tag, "</" + tag)):
            continue
        b.write('\t\t' + line)
    b.write("\t</" + tag + ">\n")


def _write_labels(b, photoID):
    """Write a <labels> element holding every label file found for photoID."""
    labels = []
    for source in ("NUS", "CLEF", "MIR", "PASCAL"):
        fname = _cached_path("labels" + source, photoID, ".labels")
        if os.path.exists(fname):
            with open(fname) as x:
                for line in x:
                    labels.append((line.strip(), source))
    if labels:
        b.write("\t<labels>\n")
        for label, source in labels:
            b.write('\t\t<label source="' + source + '">' + label
                    + "</label>\n")
        b.write("\t</labels>\n")


def augment(photoXml, output, which=None):
    """Copy photoXml to output, adding one kind of child data to each photo.

    which is one of "contexts", "galleries", "comments", "labels" (anything
    else copies the file unchanged).  The extra element is inserted just
    before each photo's closing tag.
    """
    a = open(photoXml)
    b = open(output, 'w')
    try:
        photoID = None
        for l in a:
            if l.startswith(_PHOTO_OPEN):
                photoID = _photo_id_from_line(l)
            if l.startswith(_PHOTO_CLOSE):
                if which == "contexts":
                    # NOTE(review): wrapper name "groups" inferred from the
                    # readers (photo.findall("groups") -> "pool"/"set").
                    _write_wrapped(b, "contexts", photoID, "groups", False)
                if which == "galleries":
                    _write_wrapped(b, "galleries", photoID, "galleries", True)
                if which == "comments":
                    _write_wrapped(b, "comments", photoID, "comments", True)
                if which == "labels":
                    _write_labels(b, photoID)
            b.write(l)
    finally:
        b.close()
        a.close()


def buildIncremental(idfile, output):
    """Run the currently enabled build stage: add comments to output.

    Writes output + "_comments".  The earlier stages are kept below,
    disabled, as a record of the full pipeline.
    """
    #gso("cd /mnt/ram/flickr/ && tar xf /lfs/1/flickr/flickr/data/photos.tar")
    #photoXml(idfile, output)
    #gso("rm /mnt/ram/flickr/* -rf")
    #for which in ("contexts", "galleries", "labels"):
    #    gso("cd /mnt/ram/flickr/ && tar xf /lfs/1/flickr/flickr/data/" + which + ".tar")
    #    augment(output, output + "_temp", which)
    #    gso("mv " + output + "_temp " + output)
    #    gso("rm /mnt/ram/flickr/* -rf")
    augment(output, output + "_comments", "comments")


def getSubset(idfile, output):
    """Copy only the photos whose id is listed in idfile into output.

    Reads /lfs/1/flickr/flickr/labeledPhotos.xml_comments.  Lines outside any
    photo (e.g. the root tags) are copied through.

    NOTE(review): the id extraction / validity test was lost from the
    reviewed source and is reconstructed here -- confirm.
    """
    with open(idfile) as x:
        items = set([l.strip() for l in x])
    a = open("/lfs/1/flickr/flickr/labeledPhotos.xml_comments")
    b = open(output, 'w')
    try:
        count = 0
        currentlyValid = True
        lastline = ""
        for l in a:
            if l.startswith(_PHOTO_CLOSE):
                if not (count % 1000):
                    print(count)
                count += 1
                if not currentlyValid:
                    # End of a skipped photo: resume copying afterwards.
                    currentlyValid = True
                    lastline = l
                    continue
            if l.startswith(_PHOTO_OPEN):
                currentlyValid = _photo_id_from_line(l) in items
            if not currentlyValid:
                continue
            # Drop the blank line that follows a photo's closing tag.
            if lastline.startswith(_PHOTO_CLOSE) and l == '\n':
                lastline = l
                continue
            b.write(l)
            lastline = l
    finally:
        b.close()
        a.close()


def photoIterator(path):
    """Yield one parsed <photo> element at a time from a merged photo file.

    The first line (root opening tag) is skipped.  A photo that fails to
    parse is printed and the process exits with status 1.
    """
    with open(path) as a:
        a.readline()
        linebuf = []
        count = 0
        for l in a:
            linebuf.append(l)
            if _PHOTO_CLOSE not in l:
                continue
            x = ''.join(linebuf)
            linebuf = []
            count += 1
            if not (count % 1000):
                print("Read %s photos" % (count,))
            # Parse outside the yield so that only parse errors are caught.
            try:
                photo = fromstring(x)
            except Exception:
                print(x)
                sys.exit(1)
            yield photo


def peopleIterator(path):
    """Yield one parsed per-person element at a time from a peopleXml file.

    The first buffered line of every record is discarded: the root tag for
    the first record, the blank separator line for the rest.
    """
    closing = "</" + _PERSON_TAG + ">"
    with open(path) as a:
        linebuf = []
        count = 0
        for l in a:
            linebuf.append(l)
            if closing in l:
                x = ''.join(linebuf[1:])
                linebuf = []
                count += 1
                if not (count % 1000):
                    print("Read %s contacts" % (count,))
                yield fromstring(x)


def peopleXml(photoXml, output):
    """Write the contact lists of every photo owner in photoXml to output."""
    people = set()
    for photo in photoIterator(photoXml):
        people.add(photo.find('owner').attrib['nsid'])

    b = open(output, 'w')
    try:
        b.write(_PEOPLE_ROOT_OPEN)
        count = 0
        for person in people:
            count += 1
            if not (count % 1000):
                print("Wrote %s people" % (count,))
            with open(_sharded_path(_RAM_ROOT, "contacts", person,
                                    ".etree")) as x:
                body = x.readlines()[1:-1]
            b.write('<' + _PERSON_TAG + ' nsid="' + person + '">\n')
            for l in body:
                b.write(l)
            # The blank line is what peopleIterator drops via linebuf[1:].
            b.write("</" + _PERSON_TAG + ">\n\n")
        b.write(_PEOPLE_ROOT_CLOSE)
    finally:
        b.close()


currentPersonId = 0
personIds = {}


def getPersonId(person):
    """Return the integer id for person, assigning the next free one if new."""
    global currentPersonId
    if person in personIds:
        return personIds[person]
    personIds[person] = currentPersonId
    currentPersonId += 1
    return personIds[person]


currentPhotoId = 0
photoIds = {}


def getPhotoId(photo):
    """Return the integer id for photo, assigning the next free one if new."""
    global currentPhotoId
    if photo in photoIds:
        return photoIds[photo]
    photoIds[photo] = currentPhotoId
    currentPhotoId += 1
    return photoIds[photo]


def peopleGraph(peopleXml, output):
    """Write the contact graph restricted to people present in peopleXml.

    Output: number of registered people, one nsid per line, then one line
    per person: "nsid degree contact contact ...".
    """
    # First pass: we only care about people who have photos.
    for contactss in peopleIterator(peopleXml):
        getPersonId(contactss.attrib["nsid"])

    lines = {}
    for contactss in peopleIterator(peopleXml):
        conts = set()
        for contacts in contactss.findall("contacts"):
            for contact in contacts.findall("contact"):
                if contact.attrib['nsid'] in personIds:
                    conts.add(contact.attrib["nsid"])
        lines[contactss.attrib["nsid"]] = list(conts)

    c = open(output, 'w')
    try:
        c.write(str(currentPersonId) + '\n')
        for k in personIds.keys():
            c.write(str(k) + '\n')
        for k in lines.keys():
            c.write(str(k) + ' ' + str(len(lines[k])))
            for k2 in lines[k]:
                c.write(' ' + str(k2))
            c.write('\n')
    finally:
        c.close()


def addSetCount(s, key):
    """Increment the counter stored under key in dict s (starting from 0)."""
    if key not in s:
        s[key] = 0
    s[key] += 1


def addToSet(s, key, value):
    """Add value to the set stored under key in dict s, creating it if new."""
    if key not in s:
        s[key] = set()
    s[key].add(value)


def wordsForPhoto(photo, stopwords):
    """Return the set of non-stopword tokens in a photo's text.

    Text is the title, description and comment bodies, lower-cased, with all
    punctuation except the apostrophe removed.
    """
    exclude = set(string.punctuation)
    exclude.remove("'")

    titleWords = []
    for field in ('title', 'description'):
        text = photo.findtext(field)
        if text:
            titleWords.append(text)
    commentWords = []
    for comments in photo.findall('comments'):
        for c in comments.findall('comment'):
            if c.text is not None:
                commentWords.append(c.text)

    words = (' ' + ' '.join(titleWords).lower()
             + ' ' + ' '.join(commentWords).lower())
    words = ''.join(ch for ch in words if ch not in exclude)

    ret = set()
    for w in words.split():
        # Kept broad on purpose: the original guarded this membership test
        # (presumably against non-ASCII tokens) and only reported the word.
        try:
            if w not in stopwords:
                ret.add(w)
        except Exception:
            print(w)
    return ret


def photoText(xmlOutputList, groupIds=None):
    """Write bag-of-words files and per-label word rankings.

    xmlOutputList holds (photoXml, output, outputIDs) triples.  The
    vocabulary (up to 20000 words occurring more than once) is built from
    the third-from-last entry.  For each triple, output receives the
    vocabulary, the photo count and one sparse histogram line per photo;
    outputIDs receives, per label, the words ranked by how over-represented
    they are among photos with that label.

    groupIds maps label name -> id; when None, ids starting at 40000 are
    assigned and a warning is printed.
    """
    with open("stopwords.txt") as a:
        stopwords = set([l.strip() for l in a])

    allLabels = _all_label_names()
    if groupIds is None:
        groupIds = dict(zip(allLabels,
                            range(40000, 40000 + len(allLabels))))
        print("WARNING: SETTING GROUP IDS MANUALLY")

    wordCount = {}
    for photo in photoIterator(xmlOutputList[-3][0]):
        for w in wordsForPhoto(photo, stopwords):
            addSetCount(wordCount, w)

    counts = {}
    countsLabel = {}
    for photoXml, output, outputIDs in xmlOutputList:
        counts[photoXml], countsLabel[photoXml] = \
            _count_photos_and_labels(photoXml, allLabels)

    frequencies = [(wordCount[w], w) for w in wordCount if wordCount[w] > 1]
    frequencies.sort()
    frequencies.reverse()
    frequencies = frequencies[:20000]

    wordIds = {}
    wordCount_ = {}
    for idx, (c, w) in enumerate(frequencies):
        wordIds[w] = idx
        wordCount_[idx] = c
    vocabulary = [w for _, w in frequencies]

    for photoXml, output, outputIDs in xmlOutputList:
        wordCountLabel = dict((l, {}) for l in allLabels)
        out = open(output, 'w')
        try:
            out.write(str(len(vocabulary)) + '\n')
            unprintable = 0
            for idx, f in enumerate(vocabulary):
                # Words that cannot be written get a placeholder name.
                try:
                    out.write(str(idx) + ' ' + f + '\n')
                except Exception:
                    out.write(str(idx) + ' ' + "unknown_word"
                              + str(unprintable) + '\n')
                    unprintable += 1
            # The original omitted this newline, gluing the count onto the
            # first photo line; photoInfo writes it, so do the same here.
            out.write(str(counts[photoXml]) + '\n')

            for photo in photoIterator(photoXml):
                words = wordsForPhoto(photo, stopwords)
                known = [wordIds[w] for w in words if w in wordIds]

                for labels in photo.findall("labels"):
                    for label in labels.findall("label"):
                        l = label.attrib['source'] + ':' + label.text
                        for wid in known:
                            addSetCount(wordCountLabel[l], wid)

                # Built per photo so that unlabeled photos get a correct
                # (not stale) histogram.
                hist = {}
                for wid in known:
                    addSetCount(hist, wid)
                entries = [str(h) + ':' + str(hist[h]) for h in hist.keys()]
                out.write(photo.attrib['id'] + ' '
                          + photo.find('owner').attrib['nsid'] + ' ')
                out.write(str(len(entries)) + ' ' + ' '.join(entries) + '\n')

                # Label totals were already computed above; the original
                # incremented a stale countLabel here, which inflated the
                # last file's 'taken_at_home' total.
                if takenAtHome(photo) == '1':
                    for wid in known:
                        addSetCount(wordCountLabel['taken_at_home'], wid)
        finally:
            out.close()

        ids = open(outputIDs, 'w')
        try:
            for l in allLabels:
                wordsToUse = _ranked_ratios(wordCountLabel[l], wordCount_,
                                            countsLabel[photoXml][l],
                                            counts[photoXml])
                ids.write(str(groupIds[l]) + ' ' + l + ' '
                          + str(len(wordsToUse)) + ' '
                          + ' '.join(wordsToUse) + '\n')
        finally:
            ids.close()


def takenAtHome(photo):
    """Compare a photo's location with its owner's declared location.

    Returns '1' if the first component of the owner's location appears among
    the photo's locality/county/region/country names, '0' if both sides have
    more than one component but do not match, and '.' otherwise.
    """
    ownerLocation = []
    ownerAttrib = photo.find('owner').attrib
    if 'location' in ownerAttrib:
        ownerLocation = (ownerAttrib['location']
                         .replace(', ', ',')
                         .replace("USA", "United States")
                         .lower()
                         .split(','))

    photoLocation = []
    for location in photo.findall("location"):
        for part in ('locality', 'county', 'region', 'country'):
            for node in location.findall(part):
                # Empty elements have text None; skip them instead of
                # crashing on .lower().
                if node.text is not None:
                    photoLocation.append(node.text.lower())

    if len(ownerLocation) and ownerLocation[0] in photoLocation:
        return '1'
    if len(ownerLocation) > 1 and len(photoLocation) > 1:
        return '0'
    return '.'


# Tags and groups
def photoInfo(xmlOutputList):
    """Write group/tag/label indicator vectors and per-label rankings.

    xmlOutputList holds (photoXml, output, outputIDs) triples.  The group and
    tag vocabularies (up to 20000 each, most frequent first) come from the
    third-from-last entry.  Ids are assigned groups first, then tags, then
    labels.  Each output line is "photo_id owner_nsid <indicator string>"
    where each position is '1', '0' or '.'.

    Returns the name -> id mapping (groups, tags and labels share it).
    """
    allLabels = _all_label_names()

    trainingTestingPhotos = {}
    for s in _LABEL_SOURCES:
        with open("data/sets/_" + s + ".txt") as a:
            trainingTestingPhotos[s] = set([p.strip() for p in a])

    groupCount = {}
    tagCount = {}
    for photo in photoIterator(xmlOutputList[-3][0]):
        for tag in _children(photo, "tags", "tag"):
            addSetCount(tagCount, tag.text)
        for groups in photo.findall("groups"):
            for g in groups.findall("pool"):
                addSetCount(groupCount, g.attrib['id'])

    counts = {}
    countsLabel = {}
    for photoXml, _, _ in xmlOutputList:
        counts[photoXml], countsLabel[photoXml] = \
            _count_photos_and_labels(photoXml, allLabels)

    NGroups = min(20000, len(groupCount))
    NTags = min(20000, len(tagCount))

    currentId = 0
    groupIds = {}
    groupCount_ = {}
    tagCount_ = {}

    # Find the most common groups
    vk = [(groupCount[k], k) for k in groupCount]
    vk.sort()
    vk.reverse()
    for c, k in vk[:NGroups]:
        groupIds[k] = currentId
        groupCount_[currentId] = c
        currentId += 1

    # Find the most common tags
    vk = [(tagCount[k], k) for k in tagCount]
    vk.sort()
    vk.reverse()
    for c, k in vk[:NTags]:
        groupIds[k] = currentId
        tagCount_[currentId] = c
        currentId += 1

    for l in allLabels:
        groupIds[l] = currentId
        currentId += 1

    idGroup = dict((groupIds[x], x) for x in groupIds)
    print("CURRENT ID= %s" % (currentId,))

    width = NGroups + NTags + len(allLabels)
    for photoXml, output, outputIDs in xmlOutputList:
        tagsForLabel = dict((l, {}) for l in allLabels)
        groupsForLabel = dict((l, {}) for l in allLabels)

        # Vector: id user group_indicator tag_indicator label_indicator
        out = open(output, 'w')
        try:
            out.write(str(NGroups) + ' ' + str(NTags) + ' '
                      + str(len(allLabels)) + '\n')
            unknown_id = 1
            for i in range(0, NGroups + NTags):
                # Missing or unwritable names get a placeholder.
                try:
                    out.write(str(i) + ' ' + idGroup[i] + '\n')
                except Exception:
                    out.write(str(i) + ' ' + "unknown_group"
                              + str(unknown_id) + '\n')
                    unknown_id += 1
            for i in range(NGroups + NTags, width):
                out.write(str(i) + ' ' + idGroup[i] + '\n')
            out.write(str(counts[photoXml]) + '\n')

            for photo in photoIterator(photoXml):
                photoIndicator = ['.'] * width
                tagsForPhoto = set()
                groupsForPhoto = set()

                for tag in _children(photo, "tags", "tag"):
                    if tag.text in groupIds:
                        photoIndicator[groupIds[tag.text]] = '1'
                        tagsForPhoto.add(groupIds[tag.text])
                for groups in photo.findall("groups"):
                    for g in groups.findall("pool"):
                        if g.attrib['id'] in groupIds:
                            photoIndicator[groupIds[g.attrib['id']]] = '1'
                            groupsForPhoto.add(groupIds[g.attrib['id']])

                # A photo in a labeled collection is a known negative for
                # all of that collection's labels unless marked below.
                for s in _LABEL_SOURCES:
                    if photo.attrib['id'] in trainingTestingPhotos[s]:
                        for l in _ALL_LABELS_BY_SOURCE[s]:
                            photoIndicator[groupIds[s + ':' + l]] = '0'

                for labels in photo.findall("labels"):
                    for label in labels.findall("label"):
                        l = label.attrib['source'] + ':' + label.text
                        photoIndicator[groupIds[l]] = '1'
                        for t in tagsForPhoto:
                            addSetCount(tagsForLabel[l], t)
                        for g in groupsForPhoto:
                            addSetCount(groupsForLabel[l], g)

                sim = takenAtHome(photo)
                photoIndicator[groupIds['taken_at_home']] = sim
                if sim == '1':
                    for t in tagsForPhoto:
                        addSetCount(tagsForLabel['taken_at_home'], t)
                    for g in groupsForPhoto:
                        addSetCount(groupsForLabel['taken_at_home'], g)

                out.write(photo.attrib['id'] + ' '
                          + photo.find('owner').attrib['nsid'] + ' '
                          + ''.join([str(x) for x in photoIndicator]) + '\n')
        finally:
            out.close()

        ids = open(outputIDs, 'w')
        try:
            for l in allLabels:
                nPositive = countsLabel[photoXml][l]
                nTotal = counts[photoXml]
                groupsToUse = _ranked_ratios(groupsForLabel[l], groupCount_,
                                             nPositive, nTotal)
                tagsToUse = _ranked_ratios(tagsForLabel[l], tagCount_,
                                           nPositive, nTotal)
                ids.write(str(groupIds[l]) + ' ' + l + ' '
                          + str(len(groupsToUse)) + ' '
                          + str(len(tagsToUse)) + ' '
                          + ' '.join(groupsToUse) + ' '
                          + ' '.join(tagsToUse) + '\n')
        finally:
            ids.close()

    return groupIds


def _photo_things(photo):
    """Return the gallery, place, owner and set ids attached to a photo."""
    things = [g.attrib['id'] for g in _children(photo, "galleries", "gallery")]
    for location in photo.findall("location"):
        if 'place_id' in location.attrib:
            things.append(location.attrib['place_id'])
    things.append(photo.find('owner').attrib['nsid'])
    for groups in photo.findall("groups"):
        for s in groups.findall("set"):
            things.append(s.attrib['id'])
    return things


def otherStuffIndicator(xmlOutputList):
    """Write a 0/1 vector per photo over sets, galleries, places and owners.

    xmlOutputList holds (photoXml, output) pairs.  Ids are assigned over all
    files in the order sets, galleries, locations, persons.
    """
    sets = set()
    galleries = set()
    locations = set()
    persons = set()
    for photoXml, _ in xmlOutputList:
        for photo in photoIterator(photoXml):
            for gallery in _children(photo, "galleries", "gallery"):
                galleries.add(gallery.attrib['id'])
            for location in photo.findall("location"):
                if 'place_id' in location.attrib:
                    locations.add(location.attrib['place_id'])
            persons.add(photo.find('owner').attrib['nsid'])
            for groups in photo.findall("groups"):
                for s in groups.findall("set"):
                    sets.add(s.attrib['id'])

    thingID = {}
    currentID = 0
    for collection in (sets, galleries, locations, persons):
        for thing in collection:
            thingID[thing] = currentID
            currentID += 1

    for photoXml, output in xmlOutputList:
        out = open(output, 'w')
        try:
            out.write(str(currentID) + '\n')
            for photo in photoIterator(photoXml):
                # Fresh vector per photo: the original allocated it once per
                # file, so bits set by earlier photos leaked into later ones.
                indicator = [0] * currentID
                for thing in _photo_things(photo):
                    indicator[thingID[thing]] = 1
                out.write(photo.attrib['id'] + ' '
                          + ''.join([str(x) for x in indicator]) + '\n')
        finally:
            out.close()


def _write_members(out, table):
    """Write one "key size member member ..." line per entry of table."""
    for k in table.keys():
        s = table[k]
        out.write(k + ' ' + str(len(s)) + ' ' + ' '.join(s) + '\n')


# People, sets, galleries, and locations
def photoCliques(photoXml, output):
    """Write the groups of photos that share an owner, set, gallery or place.

    Resets the module-level photo id table.  Output: the number of photos,
    the four clique-table sizes, every photo id, then the four tables.
    """
    global currentPhotoId
    global photoIds
    currentPhotoId = 0
    photoIds = {}

    personPhotos = {}
    setPhotos = {}
    galleryPhotos = {}
    locationPhotos = {}
    for photo in photoIterator(photoXml):
        pid = photo.attrib['id']
        getPhotoId(pid)
        for groups in photo.findall("groups"):
            for s in groups.findall("set"):
                addToSet(setPhotos, s.attrib['id'], pid)
        addToSet(personPhotos, photo.find('owner').attrib['nsid'], pid)
        for gallery in _children(photo, "galleries", "gallery"):
            addToSet(galleryPhotos, gallery.attrib['id'], pid)
        for location in photo.findall("location"):
            if 'place_id' in location.attrib:
                addToSet(locationPhotos, location.attrib['place_id'], pid)

    tables = (personPhotos, setPhotos, galleryPhotos, locationPhotos)
    out = open(output, 'w')
    try:
        out.write(str(currentPhotoId) + '\n')
        for table in tables:
            out.write(str(len(table)) + '\n')
        for k in photoIds.keys():
            out.write(str(k) + '\n')
        for table in tables:
            _write_members(out, table)
    finally:
        out.close()


def binaryFeatures(idfile, output):
    """Concatenate per-photo feature files into one binary file.

    For every id in idfile, writes the id packed with struct format 'l'
    followed by the raw bytes of data/features_1024/<id[:2]>/<id>.vec.
    """
    with open(idfile, 'r') as a:
        ids = [int(x.strip()) for x in a]

    o = open(output, 'wb')
    try:
        count = 0
        for i in ids:
            count += 1
            if not (count % 1000):
                print(count)
            o.write(struct.pack('l', i))
            with open("data/features_1024/" + str(i)[:2] + '/' + str(i)
                      + ".vec", 'rb') as x:
                o.write(x.read())
    finally:
        o.close()


def binaryFeaturesNUS(idfile, output, which):
    """Write "id features" lines for the ids in idfile from a NUS-WIDE file.

    Line k of which is paired with the id in the second column of line k+1
    of NUS-WIDE-urls.txt (whose first line is skipped).
    """
    with open("/lfs/1/flickr/flickr/NUSwide/NUS-WIDE-urls.txt", 'r') as a:
        imids = [x.split()[1] for x in a.readlines()[1:]]

    imfeats = {}
    with open(which, 'r') as a:
        for ind, l in enumerate(a):
            imfeats[imids[ind]] = l.strip()

    a = open(idfile, 'r')
    b = open(output, 'w')
    try:
        for l in a:
            l = l.strip()
            b.write(l + ' ' + imfeats[l] + '\n')
    finally:
        b.close()
        a.close()