import flickrapi
from flickrapi import FlickrError
from elementtree.ElementTree import parse
from elementtree.ElementTree import ElementTree
import os
import random
import commands
import md5
import sys
import struct
from commands import getstatusoutput
import string
from nltk.stem.porter import PorterStemmer as stemmer
random.seed(0)
from elementtree.ElementTree import dump
from elementtree.ElementTree import tostring
from elementtree.ElementTree import fromstring
import urllib2
import os
import threading
import thread
import time
def gso(command):
    """Run `command` through the shell and return ``(status, output)``.

    stderr is merged into the captured output and one trailing newline is
    removed, which is what ``commands.getstatusoutput`` used to deliver.

    If the command exits with a nonzero status, the command and its output
    are printed and the whole process is terminated with ``sys.exit(1)``;
    the function therefore only ever returns with ``status == 0``.
    """
    # subprocess replaces the `commands` module, which is deprecated and does
    # not exist on Python 3.  Imported locally so the block is self-contained.
    import subprocess

    proc = subprocess.Popen(command, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    o = proc.communicate()[0]
    s = proc.returncode
    if o.endswith('\n'):
        o = o[:-1]
    if s != 0:
        print("Got nonzero exit status for " + str(command))
        print(o)
        sys.exit(1)
    return s, o
# Write the serialized 'photo' element of every photo id listed in `idfile`
# to `output`.
#
# idfile: path of a text file with one photo id per line; lines are stripped
#         and de-duplicated through a set.
# output: path of the file that is (over)written.
#
# NOTE(review): this copy of the file has lost its indentation, and the three
# b.write("\n") calls look like wrapper-tag writes whose markup was stripped
# from the string literals.  Restore both from the original before running;
# in particular it is not visible here whether the first of the two trailing
# writes belongs inside the per-photo loop.
def photoXml(idfile, output):
b = open(output, 'w')
b.write("\n")
x = open(idfile)
# Using a set removes duplicate ids; iteration order over it is arbitrary.
items = set([l.strip() for l in x.readlines()])
x.close()
count = 0
# currentlyValid, lastline and photoID are assigned but never read in this
# function.
currentlyValid = True
lastline = ""
photoID = None
for l in items:
l = l.strip()
count += 1
# Progress output: the running item count.
print count
infoTree = None
# Prefer the copy under /mnt/ram and fall back to the copy under /lfs.  Files
# are sharded into directories named after the first two characters of the id.
if os.path.exists("/mnt/ram/flickr/photos/" + l[:2] + '/' + l + ".etree"):
infoTree = parse("/mnt/ram/flickr/photos/" + l[:2] + '/' + l + ".etree")
else:
infoTree = parse("/lfs/1/flickr/flickr/data/photos/" + l[:2] + '/' + l + ".etree")
photoTree = infoTree.find('photo')
# Re-emit the serialized element line by line, dropping the last two pieces
# of the newline split.
for l2 in tostring(photoTree).split('\n')[:-2]:
b.write(l2 + '\n')
b.write("\n")
b.write("\n")
b.close()
# Copy `photoXml` to `output` line by line, inserting one kind of extra
# per-photo data selected by `which`: "contexts", "galleries", "comments" or
# "labels".  With the default which=None none of the branches below matches.
#
# NOTE(review): this copy of the function is damaged and is not valid Python
# as shown.  Indentation is gone, and text that looks like XML markup has been
# stripped out of string literals, in places together with the code that
# followed it (see the two `startswith("\n")` lines and the
# `b.write('\t\t\n")` line).  Restore the function from the original source
# before changing any logic.
def augment(photoXml, output, which = None):
# The input handle `a` is not closed anywhere in the visible code.
a = open(photoXml)
b = open(output, 'w')
# NOTE(review): photoID is never assigned anything but None in the visible
# code, yet it is sliced below; presumably the line that extracted the id was
# lost with the markup -- confirm.
photoID = None
# count is initialised but not used in the visible code.
count = 0
for l in a.xreadlines():
# NOTE(review): startswith("") is true for every line; the marker text appears
# to have been stripped from the literal.
if l.startswith(""):
x = None
if which == "contexts":
# Prefer the per-photo file under /mnt/ram, otherwise use the /lfs copy.
if os.path.exists("/mnt/ram/flickr/contexts/" + photoID[:2] + '/' + photoID + ".etree"):
x = open("/mnt/ram/flickr/contexts/" + photoID[:2] + '/' + photoID + ".etree")
else:
x = open("/lfs/1/flickr/flickr/data/contexts/" + photoID[:2] + '/' + photoID + ".etree")
lines = x.readlines()
b.write("\t\n")
# Copy everything except the first and last line of the per-photo file,
# prefixed with two tabs.
for line in lines[1:-1]:
b.write('\t\t' + line)
b.write("\t\n")
x.close()
if which == "galleries":
if os.path.exists("/mnt/ram/flickr/galleries/" + photoID[:2] + '/' + photoID + ".etree"):
x = open("/mnt/ram/flickr/galleries/" + photoID[:2] + '/' + photoID + ".etree")
else:
x = open("/lfs/1/flickr/flickr/data/galleries/" + photoID[:2] + '/' + photoID + ".etree")
lines = x.readlines()
b.write("\t\n")
for line in lines[1:-1]:
# NOTE(review): the next line is truncated (no colon, no body); it looks like
# a filter on the line prefix followed by writes that were stripped.
if not line.startswith("\n")
x.close()
if which == "comments":
if os.path.exists("/mnt/ram/flickr/comments/" + photoID[:2] + '/' + photoID + ".etree"):
x = open("/mnt/ram/flickr/comments/" + photoID[:2] + '/' + photoID + ".etree")
else:
x = open("/lfs/1/flickr/flickr/data/comments/" + photoID[:2] + '/' + photoID + ".etree")
lines = x.readlines()
b.write("\t\n")
for line in lines[1:-1]:
# NOTE(review): truncated in the same way as the galleries branch.
if not line.startswith("\n")
x.close()
if which == "labels":
# Collect (label, source) pairs from the per-source .labels files.
labels = []
for source in ("NUS", "CLEF", "MIR", "PASCAL"):
fname = None
# The directory name is "labels" directly followed by the source name
# (there is no separator between them).
if os.path.exists("/mnt/ram/flickr/labels" + source + '/' + photoID[:2] + '/' + photoID + ".labels"):
fname = "/mnt/ram/flickr/labels" + source + '/' + photoID[:2] + '/' + photoID + ".labels"
else:
fname = "/lfs/1/flickr/flickr/data/labels" + source + '/' + photoID[:2] + '/' + photoID + ".labels"
# A source without a labels file for this photo is skipped.
if os.path.exists(fname):
# This handle is not closed in the visible code.
x = open(fname)
for line in x.readlines():
labels.append((line.strip(), source))
# The labels section is only written when at least one label was found.
if len(labels):
b.write("\t\n")
for label,source in labels:
# NOTE(review): the next line has mismatched quotes; the expression that
# wrote the label and its source was evidently stripped.
b.write('\t\t\n")
b.write("\t\n")
# Every input line is copied through to the output.
b.write(l)
b.close()
def buildIncremental(idfile, output):
    """Run the currently enabled stage of the incremental build.

    Only the comments augmentation is active: `output` is read and the
    augmented copy is written to `output` + "_comments".  `idfile` is only
    referenced by the disabled stages kept below.
    """
    # Disabled earlier stages, kept for reference:
    #gso("cd /mnt/ram/flickr/ && tar xf /lfs/1/flickr/flickr/data/photos.tar")
    #photoXml(idfile, output)
    #gso("rm /mnt/ram/flickr/* -rf")
    #for which in ("contexts", "galleries", "labels"):
    #gso("cd /mnt/ram/flickr/ && tar xf /lfs/1/flickr/flickr/data/" + which + ".tar")
    #augment(output, output + "_temp", which)
    #gso("mv " + output + "_temp " + output)
    #gso("rm /mnt/ram/flickr/* -rf")
    commentsOutput = output + "_comments"
    augment(output, commentsOutput, "comments")
# Copy lines from the fixed input file
# "/lfs/1/flickr/flickr/labeledPhotos.xml_comments" to `output`, skipping some
# of them.
#
# idfile: path of a text file with one id per line.  It is read into the set
#         `items`, but `items` is not referenced again in the visible code.
# output: path of the file that is (over)written.
#
# NOTE(review): indentation is missing in this copy and the startswith("")
# markers look like tag prefixes whose text was stripped from the literals.
# The code that decided which photos to keep (presumably using `items`,
# `photoID` and `currentlyValid`) is not visible here -- restore it from the
# original before relying on this function.
def getSubset(idfile, output):
a = open("/lfs/1/flickr/flickr/labeledPhotos.xml_comments")
b = open(output, 'w')
x = open(idfile)
items = set([l.strip() for l in x.readlines()])
x.close()
count = 0
currentlyValid = True
lastline = ""
# photoID is never assigned anything but None in the visible code.
photoID = None
for l in a.xreadlines():
# NOTE(review): startswith("") is true for every line.
if l.startswith(""):
# Progress output every 1000 counted lines.
if not (count % 1000): print count
count += 1
# When the previous state was "not valid", reset it and skip this line.
if not currentlyValid:
currentlyValid = True
lastline = l
continue
#else:
#x = open("/lfs/1/flickr/flickr/data/comments/" + photoID[:2] + '/' + photoID + ".etree")
#lines = x.readlines()[1:-1]
#try:
#fromstring('\n'.join(lines))
#except Exception as e:
#print photoID
#print '\n'.join(lines)
#sys.exit(1)
#for l2 in x.readlines()[1:-1]:
#b.write('\t' + l2)
# Lines that consist of a single newline are skipped instead of copied.
if l.startswith("") and l == '\n':
lastline = l
continue
b.write(l)
# lastline is updated on every path but never read in the visible code.
lastline = l
b.close()
a.close()
def photoIterator(path):
    """Yield one parsed element per record read from the file at `path`.

    The first line of the file is skipped.  The following lines are buffered;
    whenever a line contains the end-of-record marker, the buffered text is
    handed to ``fromstring`` and the resulting element is yielded.  A
    progress message is printed every 1000 records.

    If a record cannot be parsed, its text is printed and the process is
    terminated with ``sys.exit(1)``.
    """
    # NOTE(review): the marker is an empty string in this copy of the file,
    # which matches every line, so each line is parsed on its own.  It looks
    # like a closing tag whose text was lost -- confirm against the original.
    marker = ""
    linebuf = []
    count = 0
    # The with-block closes the file once the generator is exhausted or
    # closed; the original never closed it.
    with open(path) as a:
        a.readline()
        for l in a:
            linebuf.append(l)
            if marker not in l:
                continue
            x = ''.join(linebuf)
            linebuf = []
            count += 1
            if not (count % 1000):
                print("Read " + str(count) + " photos")
            try:
                photo = fromstring(x)
            except Exception:
                # Show the offending record before giving up.
                print(x)
                sys.exit(1)
            yield photo
def peopleIterator(path):
    """Yield one parsed element per record read from the file at `path`.

    Lines are buffered; whenever a line contains the end-of-record marker,
    the buffered lines except the first one are joined and handed to
    ``fromstring``, and the resulting element is yielded.  A progress message
    is printed every 1000 records.
    """
    # NOTE(review): the marker is an empty string in this copy of the file,
    # which matches every line; the buffer then never holds more than one
    # line and nothing is left to parse after dropping the first.  It looks
    # like a closing tag whose text was lost -- confirm against the original.
    marker = ""
    linebuf = []
    count = 0
    # The with-block closes the file once the generator is exhausted or
    # closed; the original never closed it.
    with open(path) as a:
        for l in a:
            linebuf.append(l)
            if marker not in l:
                continue
            x = ''.join(linebuf[1:])
            linebuf = []
            count += 1
            if not (count % 1000):
                print("Read " + str(count) + " contacts")
            yield fromstring(x)
def peopleXml(photoXml, output):
    """Write the contact data of every photo owner in `photoXml` to `output`.

    Owner ids are the 'nsid' attributes of the 'owner' elements of the photos
    produced by ``photoIterator(photoXml)``.  For each distinct owner, the
    file "/mnt/ram/flickr/contacts/<id[:2]>/<id>.etree" is read and all of
    its lines except the first and last are copied to `output`.  A progress
    message is printed every 1000 owners.
    """
    people = set()
    for photo in photoIterator(photoXml):
        people.add(photo.find('owner').attrib['nsid'])

    # NOTE(review): the bare newline writes below look like wrapper tags whose
    # markup was stripped from the literals in this copy -- confirm against
    # the original.  They are reproduced unchanged.
    # Both files are now closed deterministically; the original closed neither.
    with open(output, 'w') as b:
        b.write("\n")
        count = 0
        for person in people:
            count += 1
            if not (count % 1000):
                print("Wrote " + str(count) + " people")
            contactPath = "/mnt/ram/flickr/contacts/" + person[:2] + '/' + person + ".etree"
            with open(contactPath) as x:
                contactLines = x.readlines()[1:-1]
            b.write('\n')
            for l in contactLines:
                b.write(l)
            b.write("\n\n")
        b.write("\n")
# Next integer id to hand out to a person that has not been seen yet.
currentPersonId = 0
# Maps a person key (callers in this file pass 'nsid' strings) to its id.
personIds = {}


def getPersonId(person):
    """Return the integer id of `person`, assigning the next free id if new.

    Ids are handed out consecutively starting at 0 and are remembered in the
    module-level `personIds` dict.  The original returned None the first time
    a key was seen; the freshly assigned id is now returned in that case too.
    """
    global currentPersonId
    if person not in personIds:
        personIds[person] = currentPersonId
        currentPersonId += 1
    return personIds[person]
# Next integer id to hand out to a photo that has not been seen yet.
currentPhotoId = 0
# Maps a photo key (callers in this file pass photo 'id' strings) to its id.
photoIds = {}


def getPhotoId(photo):
    """Return the integer id of `photo`, assigning the next free id if new.

    Ids are handed out consecutively starting at 0 and are remembered in the
    module-level `photoIds` dict.  The original returned None the first time
    a key was seen; the freshly assigned id is now returned in that case too.
    """
    global currentPhotoId
    if photo not in photoIds:
        photoIds[photo] = currentPhotoId
        currentPhotoId += 1
    return photoIds[photo]
def peopleGraph(peopleXml, output):
    """Write the contact graph of the people in `peopleXml` to `output`.

    Output layout:
      * one line with the value of the global `currentPersonId`,
      * one line per key of the global `personIds`,
      * one line per person: "<nsid> <number of contacts> <contact nsid> ...".

    Only contacts that are themselves registered in `personIds` are kept.
    `personIds` is module-level state and is not reset here, so entries
    registered before this call are included as well.
    """
    # Pass 1: register every person so that pass 2 can filter contacts.
    for contactss in peopleIterator(peopleXml):
        # We only care about people who have photos.
        getPersonId(contactss.attrib["nsid"])

    # Pass 2: collect, per person, the distinct registered contacts.
    lines = {}
    for contactss in peopleIterator(peopleXml):
        conts = set()
        for contacts in contactss.findall("contacts"):
            for contact in contacts.findall("contact"):
                nsid = contact.attrib['nsid']
                if nsid in personIds:
                    conts.add(nsid)
        lines[contactss.attrib["nsid"]] = list(conts)

    # The with-block guarantees the output is flushed and closed; the
    # original never closed the file.
    with open(output, 'w') as c:
        c.write(str(currentPersonId) + '\n')
        for k in personIds:
            c.write(str(k) + '\n')
        for k in lines:
            c.write(str(k) + ' ' + str(len(lines[k])))
            for k2 in lines[k]:
                c.write(' ' + str(k2))
            c.write('\n')
def addSetCount(s, key):
    """Increment the counter stored under `key` in dict `s` (missing keys start at 0)."""
    s[key] = s.get(key, 0) + 1
def addToSet(s, key, value):
    """Add `value` to the set stored under `key` in dict `s`, creating the set if needed."""
    s.setdefault(key, set()).add(value)
def wordsForPhoto(photo, stopwords):
    """Return the set of words used in a photo's title, description and comments.

    photo:     element with optional 'title' and 'description' children and
               'comments' children that contain 'comment' elements.
    stopwords: container of words to leave out (tested with ``in``).

    The text is lower-cased, all punctuation except the apostrophe is removed,
    and the result is split on whitespace.  Elements without text are ignored.
    """
    # Keep the apostrophe so that contractions and possessives stay one word.
    exclude = set(string.punctuation)
    exclude.remove("'")

    texts = []
    for tag in ('title', 'description'):
        node = photo.find(tag)
        # A missing element is treated like an empty one instead of raising
        # AttributeError on None.
        if node is not None and node.text:
            texts.append(node.text)
    for comments in photo.findall('comments'):
        for c in comments.findall('comment'):
            if c.text is not None:
                texts.append(c.text)

    words = ' '.join(texts).lower()
    words = ''.join(ch for ch in words if ch not in exclude)

    ret = set()
    for w in words.split():
        # Best effort kept from the original: a word whose stop-word test
        # raises is printed and skipped rather than aborting the run.
        try:
            if w not in stopwords:
                ret.add(w)
        except Exception:
            print(w)
    return ret
# Build bag-of-words text features per photo and per-label word scores.
#
# xmlOutputList: sequence of (photoXml, output, outputIDs) path triples.  For
#     each triple, `output` receives the vocabulary followed by per-photo
#     word histograms, and `outputIDs` receives one line of word scores per
#     label.
# groupIds: optional mapping from label name to id; when None, ids are
#     assigned consecutively from 40000 and a warning is printed.
#
# NOTE(review): indentation is absent in this copy of the file, so the loop
# nesting cannot be read off the text below -- restore it from the original
# before editing any logic.
def photoText(xmlOutputList, groupIds=None):
# Stop words are read from "stopwords.txt", resolved against the current
# working directory.
a = open("stopwords.txt")
stopwords = set([l.strip() for l in a.readlines()])
a.close()
# Label names per annotation source.
allLabels_ = {'PASCAL': set(['sheep', 'horse', 'bicycle', 'aeroplane', 'cow', 'sofa', 'motorbike', 'dog', 'cat', 'person', 'train', 'diningtable', 'bottle', 'bus', 'pottedplant', 'tvmonitor', 'chair', 'bird', 'boat', 'car']), 'MIR': set(['night', 'people', 'clouds', 'sea', 'river', 'portrait', 'male', 'female', 'baby', 'tree', 'flower', 'dog', 'bird', 'car']), 'CLEF': set(['cute', 'Partylife', 'Winter', 'Sky', 'inactive', 'Vehicle', 'Overexposed', 'No_Blur', 'Family_Friends', 'Underexposed', 'Trees', 'Landscape_Nature', 'Day', 'MusicalInstrument', 'River', 'Motion_Blur', 'Summer', 'Visual_Arts', 'car', 'Out_of_focus', 'Small_Group', 'bicycle', 'fish', 'Sports', 'Citylife', 'Building_Sights', 'artificial', 'Water', 'Baby', 'bird', 'Plants', 'Overall_Quality', 'Clouds', 'Work', 'Architecture', 'active', 'bodypart', 'Still_Life', 'melancholic', 'scary', 'technical', 'skateboard', 'Autumn', 'Portrait', 'Aesthetic_Impression', 'Toy', 'Travel', 'Snow', 'Adult', 'female', 'Mountains', 'horse', 'Sunny', 'euphoric', 'unpleasant', 'calm', 'Child', 'Flowers', 'Desert', 'happy', 'Animals', 'Partly_Blurred', 'train', 'Night', 'Single_Person', 'Painting', 'Neutral_Illumination', 'Sunset_Sunrise', 'boring', 'Indoor', 'cat', 'Sea', 'Church', 'Birthday', 'male', 'abstract', 'Lake', 'Teenager', 'airplane', 'ship', 'funny', 'Food', 'Shadow', 'Rain', 'Big_Group', 'Macro', 'insect', 'Beach_Holidays', 'Bridge', 'Outdoor', 'natural', 'Spring', 'old_person', 'No_Persons', 'dog', 'Fancy', 'Street', 'Graffiti', 'Park_Garden']), 'NUS': set(['bridge', 'map', 'glacier', 'house', 'fish', 'sign', 'snow', 'fox', 'protest', 'street', 'zebra', 'train', 'rocks', 'birds', 'earthquake', 'surf', 'mountain', 'toy', 'police', 'dancing', 'whales', 'sun', 'sky', 'lake', 'tree', 'moon', 'fire', 'window', 'book', 'animal', 'vehicle', 'boats', 'flags', 'valley', 'beach', 'temple', 'horses', 'plants', 'buildings', 'elk', 'garden', 'tattoo', 'food', 'cars', 'frost', 'bear', 'water', 'sports', 'running', 
'waterfall', 'sand', 'statue', 'wedding', 'cityscape', 'leaf', 'railroad', 'flowers', 'castle', 'rainbow', 'town', 'tiger', 'soccer', 'clouds', 'reflection', 'cow', 'harbor', 'coral', 'nighttime', 'dog', 'ocean', 'swimmers', 'person', 'airport', 'plane', 'sunset', 'cat', 'military', 'tower', 'grass', 'computer', 'road'])}
# Prefix every label with its source, sort within each source, then
# concatenate in the order PASCAL, CLEF, MIR, NUS and append 'taken_at_home'.
PASCAL = ["PASCAL:" + l for l in allLabels_["PASCAL"]]
CLEF = ["CLEF:" + l for l in allLabels_["CLEF"]]
MIR = ["MIR:" + l for l in allLabels_["MIR"]]
NUS = ["NUS:" + l for l in allLabels_["NUS"]]
allLabels = [PASCAL, CLEF, MIR, NUS]
for i in range(4):
allLabels[i].sort()
allLabels = allLabels[0] + allLabels[1] + allLabels[2] + allLabels[3] + ['taken_at_home']
# Without caller-supplied ids, labels are numbered consecutively from 40000.
if groupIds == None:
groupIds = dict(zip(allLabels, range(40000, 40000 + len(allLabels))))
print "WARNING: SETTING GROUP IDS MANUALLY"
# Word statistics are taken from the third-from-last entry of xmlOutputList
# only.  wordsForPhoto returns a set, so a photo counts each word at most once.
wordCount = {}
for photo in photoIterator(xmlOutputList[-3][0]):
words = wordsForPhoto(photo, stopwords)
for w in words:
addSetCount(wordCount, w)
# Per input file: total number of photos (counts) and number of labelled
# photos per label (countsLabel).
counts = {}
countsLabel = {}
for photoXml, output, outputIDs in xmlOutputList:
countLabel = dict([(l, 0) for l in allLabels])
count = 0
for photo in photoIterator(photoXml):
for labels in photo.findall("labels"):
for label in labels.findall("label"):
# Label key has the form "<source>:<label text>".
l = label.attrib['source'] + ':' + label.text
countLabel[l] += 1
if takenAtHome(photo) == '1':
countLabel['taken_at_home'] += 1
count += 1
counts[photoXml] = count
countsLabel[photoXml] = countLabel
# Keep words counted more than once, most frequent first, capped at 20000.
frequencies = [(wordCount[w], w) for w in wordCount.keys() if wordCount[w] > 1]
frequencies.sort()
frequencies.reverse()
frequencies = frequencies[:20000]
# wordIds: word -> index in frequency order; wordCount_: index -> count.
wordIds = {}
wordCount_ = {}
count = 0
for c,w in frequencies:
wordIds[w] = count
wordCount_[count] = c
count += 1
# From here on `frequencies` holds only the words, in index order.
frequencies = [f[1] for f in frequencies]
for photoXml, output, outputIDs in xmlOutputList:
# `output` is rebound from the path to the open file object.
output = open(output, 'w')
# Header: vocabulary size, then one "<index> <word>" line per word.
output.write(str(len(frequencies)) + '\n')
count = 0
unprintable = 0
for f in frequencies:
# If writing the word raises (presumably an encoding problem -- confirm), a
# numbered placeholder name is written instead.
try:
output.write(str(count) + ' ' + f + '\n')
except Exception as e:
output.write(str(count) + ' ' + "unknown_word" + str(unprintable) + '\n')
unprintable += 1
count += 1
# NOTE(review): no newline is appended after the photo count here, unlike the
# other header lines -- confirm this is intended.
output.write(str(counts[photoXml]))
wordCountLabel = dict([(l,{}) for l in allLabels])
for photo in photoIterator(photoXml):
words = wordsForPhoto(photo, stopwords)
for labels in photo.findall("labels"):
for label in labels.findall("label"):
l = label.attrib['source'] + ':' + label.text
# hist: word index -> count for the current photo.
hist = {}
for w in words:
if w in wordIds:
addSetCount(wordCountLabel[l], wordIds[w])
addSetCount(hist, wordIds[w])
# Photo line: "<photo id> <owner nsid> <number of entries> <index:count> ...".
output.write(photo.attrib['id'] + ' ' + photo.find('owner').attrib['nsid'] + ' ')
out = []
for h in hist.keys():
out.append(str(h) + ':' + str(hist[h]))
output.write(str(len(out)) + ' ' + ' '.join(out) + '\n')
if takenAtHome(photo) == '1':
# NOTE(review): countLabel is not re-created in this loop; at this point it
# is whatever dict the counting pass above left bound -- confirm intended.
countLabel['taken_at_home'] += 1
for w in words:
if w in wordIds:
addSetCount(wordCountLabel['taken_at_home'], wordIds[w])
output.close()
# `outputIDs` is rebound from the path to the open file object.
outputIDs = open(outputIDs, 'w')
for l in allLabels:
vk = []
for k in wordCountLabel[l].keys():
# Skip indices that have no global count.
if not k in wordCount_.keys(): continue
# Score: the word's count among photos with this label, relative to the
# label's photo count, divided by the word's global count relative to this
# file's photo count.
frequencyPositive = wordCountLabel[l][k] * 1.0/countsLabel[photoXml][l]
frequencyAll = wordCount_[k] * 1.0/counts[photoXml]
d = frequencyPositive * 1.0/frequencyAll
# Only words counted more than once for this label are kept.
if wordCountLabel[l][k] > 1:
vk.append((d, k))
#vk.append((wordCountLabel[l][k], k))
# Highest score first.
vk.sort()
vk.reverse()
wordsToUse = [str(v[1]) + ':' + str(v[0]) for v in vk]
# Line format: "<label id> <label> <number of words> <index:score> ...".
outputIDs.write(str(groupIds[l]) + ' ' + l + ' ' + str(len(wordsToUse)) + ' ' + ' '.join([str(x) for x in wordsToUse]) + '\n')
outputIDs.close()
def takenAtHome(photo):
    """Tell whether `photo` was taken at its owner's home location.

    The owner's 'location' attribute is split on commas (after replacing
    "USA" with "United States" and lower-casing); the photo's place names are
    the lower-cased texts of the locality, county, region and country
    children of its 'location' elements.

    Returns:
        '1' if the first component of the owner's location is one of the
            photo's place names,
        '0' if it is not, but both the owner's location and the photo's
            place names have more than one entry,
        '.' otherwise.
    """
    ownerLocation = []
    ownerAttrib = photo.find('owner').attrib
    if 'location' in ownerAttrib:
        ownerLocation = ownerAttrib['location'].replace(', ', ',').replace("USA", "United States").lower().split(',')

    photoLocation = []
    for location in photo.findall("location"):
        for level in ('locality', 'county', 'region', 'country'):
            for place in location.findall(level):
                # An empty element has text None; skip it instead of failing
                # on None.lower().
                if place.text is not None:
                    photoLocation.append(place.text.lower())

    if ownerLocation and ownerLocation[0] in photoLocation:
        return '1'
    if len(ownerLocation) > 1 and len(photoLocation) > 1:
        return '0'
    return '.'
# Tags and groups
# Build per-photo indicator vectors over groups, tags and labels, plus
# per-label group/tag scores, and return the id mapping that was used.
#
# xmlOutputList: sequence of (photoXml, output, outputIDs) path triples.  For
#     each triple, `output` receives the id table followed by one indicator
#     line per photo, and `outputIDs` receives one score line per label.
# Returns: groupIds, a dict mapping group ids, tag texts and label names to
#     integer ids.
#
# NOTE(review): indentation is absent in this copy of the file, so the loop
# nesting cannot be read off the text below -- restore it from the original
# before editing any logic.
def photoInfo(xmlOutputList):
# Label names per annotation source.
allLabels_ = {'PASCAL': set(['sheep', 'horse', 'bicycle', 'aeroplane', 'cow', 'sofa', 'motorbike', 'dog', 'cat', 'person', 'train', 'diningtable', 'bottle', 'bus', 'pottedplant', 'tvmonitor', 'chair', 'bird', 'boat', 'car']), 'MIR': set(['night', 'people', 'clouds', 'sea', 'river', 'portrait', 'male', 'female', 'baby', 'tree', 'flower', 'dog', 'bird', 'car']), 'CLEF': set(['cute', 'Partylife', 'Winter', 'Sky', 'inactive', 'Vehicle', 'Overexposed', 'No_Blur', 'Family_Friends', 'Underexposed', 'Trees', 'Landscape_Nature', 'Day', 'MusicalInstrument', 'River', 'Motion_Blur', 'Summer', 'Visual_Arts', 'car', 'Out_of_focus', 'Small_Group', 'bicycle', 'fish', 'Sports', 'Citylife', 'Building_Sights', 'artificial', 'Water', 'Baby', 'bird', 'Plants', 'Overall_Quality', 'Clouds', 'Work', 'Architecture', 'active', 'bodypart', 'Still_Life', 'melancholic', 'scary', 'technical', 'skateboard', 'Autumn', 'Portrait', 'Aesthetic_Impression', 'Toy', 'Travel', 'Snow', 'Adult', 'female', 'Mountains', 'horse', 'Sunny', 'euphoric', 'unpleasant', 'calm', 'Child', 'Flowers', 'Desert', 'happy', 'Animals', 'Partly_Blurred', 'train', 'Night', 'Single_Person', 'Painting', 'Neutral_Illumination', 'Sunset_Sunrise', 'boring', 'Indoor', 'cat', 'Sea', 'Church', 'Birthday', 'male', 'abstract', 'Lake', 'Teenager', 'airplane', 'ship', 'funny', 'Food', 'Shadow', 'Rain', 'Big_Group', 'Macro', 'insect', 'Beach_Holidays', 'Bridge', 'Outdoor', 'natural', 'Spring', 'old_person', 'No_Persons', 'dog', 'Fancy', 'Street', 'Graffiti', 'Park_Garden']), 'NUS': set(['bridge', 'map', 'glacier', 'house', 'fish', 'sign', 'snow', 'fox', 'protest', 'street', 'zebra', 'train', 'rocks', 'birds', 'earthquake', 'surf', 'mountain', 'toy', 'police', 'dancing', 'whales', 'sun', 'sky', 'lake', 'tree', 'moon', 'fire', 'window', 'book', 'animal', 'vehicle', 'boats', 'flags', 'valley', 'beach', 'temple', 'horses', 'plants', 'buildings', 'elk', 'garden', 'tattoo', 'food', 'cars', 'frost', 'bear', 'water', 'sports', 'running', 
'waterfall', 'sand', 'statue', 'wedding', 'cityscape', 'leaf', 'railroad', 'flowers', 'castle', 'rainbow', 'town', 'tiger', 'soccer', 'clouds', 'reflection', 'cow', 'harbor', 'coral', 'nighttime', 'dog', 'ocean', 'swimmers', 'person', 'airport', 'plane', 'sunset', 'cat', 'military', 'tower', 'grass', 'computer', 'road'])}
groupCount = {}
tagCount = {}
# NPASCAL, NCLEF, NMIR and NNUS are computed but not referenced again in this
# function.
NPASCAL = len(allLabels_["PASCAL"])
NCLEF = len(allLabels_["CLEF"])
NMIR = len(allLabels_["MIR"])
NNUS = len(allLabels_["NUS"])
# Prefix every label with its source, sort within each source, then
# concatenate in the order PASCAL, CLEF, MIR, NUS and append 'taken_at_home'.
PASCAL = ["PASCAL:" + l for l in allLabels_["PASCAL"]]
CLEF = ["CLEF:" + l for l in allLabels_["CLEF"]]
MIR = ["MIR:" + l for l in allLabels_["MIR"]]
NUS = ["NUS:" + l for l in allLabels_["NUS"]]
allLabels = [PASCAL, CLEF, MIR, NUS]
for i in range(4):
allLabels[i].sort()
allLabels = allLabels[0] + allLabels[1] + allLabels[2] + allLabels[3] + ['taken_at_home']
# Photo ids per source, read from "data/sets/_<SOURCE>.txt" (relative path).
trainingTestingPhotos = {'PASCAL': set(), 'CLEF': set(), 'MIR': set(), 'NUS': set()}
for s in ['PASCAL', 'CLEF', 'MIR', 'NUS']:
a = open("data/sets/_" + s + ".txt")
for p in a.readlines():
trainingTestingPhotos[s].add(p.strip())
a.close()
# Tag and group frequencies are taken from the third-from-last entry of
# xmlOutputList only.
for photo in photoIterator(xmlOutputList[-3][0]):
for tag in photo.find("tags").findall("tag"):
addSetCount(tagCount, tag.text)
for groups in photo.findall("groups"):
for g in groups.findall("pool"):
addSetCount(groupCount, g.attrib['id'])
# Per input file: total number of photos (counts) and number of labelled
# photos per label (countsLabel).
counts = {}
countsLabel = {}
for photoXml,_,_ in xmlOutputList:
countLabel = dict([(l, 0) for l in allLabels])
count = 0
for photo in photoIterator(photoXml):
for labels in photo.findall("labels"):
for label in labels.findall("label"):
# Label key has the form "<source>:<label text>".
l = label.attrib['source'] + ':' + label.text
countLabel[l] += 1
if takenAtHome(photo) == '1':
countLabel['taken_at_home'] += 1
count += 1
counts[photoXml] = count
countsLabel[photoXml] = countLabel
# At most 20000 groups and 20000 tags are kept.
NGroups = min(20000, len(groupCount.keys()))
NTags = min(20000, len(tagCount.keys()))
# Ids are assigned consecutively: groups first, then tags, then labels.
# Groups, tags and labels all share the single groupIds dict.
currentId = 0
groupIds = {}
groupCount_ = {}
tagCount_ = {}
# Find the most common groups
vk = []
for k in groupCount.keys():
vk.append((groupCount[k], k))
vk.sort()
vk.reverse()
for v in vk[:NGroups]:
groupIds[v[1]] = currentId
groupCount_[currentId] = v[0]
currentId += 1
# Find the most common tags
vk = []
for k in tagCount.keys():
vk.append((tagCount[k], k))
vk.sort()
vk.reverse()
for v in vk[:NTags]:
groupIds[v[1]] = currentId
tagCount_[currentId] = v[0]
currentId += 1
for l in allLabels:
groupIds[l] = currentId
currentId += 1
# Inverse mapping: integer id -> group id / tag text / label name.
idGroup = dict([(groupIds[x], x) for x in groupIds.keys()])
print "CURRENT ID=", currentId
for photoXml, output, outputIDs in xmlOutputList:
# Per label: how often each tag id / group id occurs on photos carrying it.
tagsForLabel = dict([(l,dict()) for l in allLabels])
groupsForLabel = dict([(l,dict()) for l in allLabels])
# Vector: id user group_indicator tag_indicator label_indicator
# `output` is rebound from the path to the open file object.
output = open(output, 'w')
# Header: "<NGroups> <NTags> <number of labels>", then one "<id> <name>" line
# per group, tag and label.
output.write(str(NGroups) + ' ' + str(NTags) + ' ' + str(len(allLabels)) + '\n')
unknown_id = 1
for i in range(0, NGroups):
# If writing the name raises, a numbered placeholder is written instead.
try:
output.write(str(i) + ' ' + idGroup[i] + '\n')
except Exception as e:
output.write(str(i) + ' ' + "unknown_group" + str(unknown_id) + '\n')
unknown_id += 1
for i in range(NGroups, NGroups + NTags):
# The tag section uses the same "unknown_group" placeholder and counter.
try:
output.write(str(i) + ' ' + idGroup[i] + '\n')
except Exception as e:
output.write(str(i) + ' ' + "unknown_group" + str(unknown_id) + '\n')
unknown_id += 1
for i in range(NGroups + NTags, NGroups + NTags + len(allLabels)):
output.write(str(i) + ' ' + idGroup[i] + '\n')
output.write(str(counts[photoXml]) + '\n')
# count is incremented per photo below but not read afterwards.
count = 0
for photo in photoIterator(photoXml):
count += 1
# One slot per group, tag and label; '.' is the initial value of every slot.
photoIndicator = ['.']*(NGroups+NTags+len(allLabels))
tagsForPhoto = set()
groupsForPhoto = set()
for tag in photo.find("tags").findall("tag"):
if tag.text in groupIds:
photoIndicator[groupIds[tag.text]] = '1'
tagsForPhoto.add(groupIds[tag.text])
for groups in photo.findall("groups"):
for g in groups.findall("pool"):
if g.attrib['id'] in groupIds:
photoIndicator[groupIds[g.attrib['id']]] = '1'
groupsForPhoto.add(groupIds[g.attrib['id']])
# For a photo listed in a source's set file, all of that source's labels are
# first set to '0'; labels actually present are set to '1' afterwards.
for s in ['PASCAL', 'CLEF', 'MIR', 'NUS']:
if photo.attrib['id'] in trainingTestingPhotos[s]:
for l in allLabels_[s]:
photoIndicator[groupIds[s + ':' + l]] = '0'
for labels in photo.findall("labels"):
for label in labels.findall("label"):
l = label.attrib['source'] + ':' + label.text
photoIndicator[groupIds[l]] = '1'
for t in tagsForPhoto:
addSetCount(tagsForLabel[l], t)
for g in groupsForPhoto:
addSetCount(groupsForLabel[l], g)
# The 'taken_at_home' slot holds takenAtHome's return value ('1', '0' or '.').
sim = takenAtHome(photo)
photoIndicator[groupIds['taken_at_home']] = sim
if sim == '1':
for t in tagsForPhoto:
addSetCount(tagsForLabel['taken_at_home'], t)
for g in groupsForPhoto:
addSetCount(groupsForLabel['taken_at_home'], g)
# Photo line: "<photo id> <owner nsid> <indicator characters, no separator>".
output.write(photo.attrib['id'] + ' ' + photo.find('owner').attrib['nsid'] + ' ' + ''.join([str(x) for x in photoIndicator]) + '\n')
output.close()
# `outputIDs` is rebound from the path to the open file object.
outputIDs = open(outputIDs, 'w')
for l in allLabels:
vk = []
for k in groupsForLabel[l].keys():
# Skip ids that are not group ids.
if not k in groupCount_.keys(): continue
frequencyPositive = groupsForLabel[l][k] * 1.0/countsLabel[photoXml][l] # How many times does this term occur in positively labeled images?
frequencyAll = groupCount_[k] * 1.0/counts[photoXml] # How many times does it occur altogether?
d = frequencyPositive * 1.0/frequencyAll
# Only groups counted more than once for this label are kept.
if groupsForLabel[l][k] > 1:
vk.append((d, k))
# Highest score first.
vk.sort()
vk.reverse()
groupsToUse = [str(v[1]) + ':' + str(v[0]) for v in vk]
# Same scoring for tags.
vk = []
for k in tagsForLabel[l].keys():
if not k in tagCount_.keys(): continue
frequencyPositive = tagsForLabel[l][k] * 1.0/countsLabel[photoXml][l]
frequencyAll = tagCount_[k] * 1.0/counts[photoXml]
d = frequencyPositive * 1.0/frequencyAll
if tagsForLabel[l][k] > 1:
vk.append((d, k))
vk.sort()
vk.reverse()
tagsToUse = [str(v[1]) + ':' + str(v[0]) for v in vk]
# Line format: "<label id> <label> <number of groups> <number of tags>
# <groupId:score> ... <tagId:score> ...".
outputIDs.write(str(groupIds[l]) + ' ' + l + ' ' + str(len(groupsToUse)) + ' ' + str(len(tagsToUse)) + ' ' + ' '.join([str(x) for x in groupsToUse]) + ' ' + ' '.join([str(x) for x in tagsToUse]) + '\n')
outputIDs.close()
return groupIds
def otherStuffIndicator(xmlOutputList):
    """Write per-photo 0/1 indicator vectors over sets, galleries, places and owners.

    xmlOutputList: sequence of (photoXml, output) path pairs.  It is traversed
        twice, so it must be re-iterable.

    A first pass over all inputs collects every set id, gallery id, location
    'place_id' and owner 'nsid' and numbers them consecutively in that order.
    For each pair, `output` then receives one line with the total number of
    ids, followed by one line per photo: "<photo id> <indicator digits>".
    """
    sets = set()
    galleries = set()
    locations = set()
    persons = set()
    for photoXml, _ in xmlOutputList:
        for photo in photoIterator(photoXml):
            for gallery in photo.find("galleries").findall("gallery"):
                galleries.add(gallery.attrib['id'])
            for location in photo.findall("location"):
                if 'place_id' in location.attrib:
                    locations.add(location.attrib['place_id'])
            persons.add(photo.find('owner').attrib['nsid'])
            for groups in photo.findall("groups"):
                for s in groups.findall("set"):
                    sets.add(s.attrib['id'])

    # Number all collected things: sets first, then galleries, locations and
    # persons.
    thingID = {}
    currentID = 0
    for things in (sets, galleries, locations, persons):
        for thing in things:
            thingID[thing] = currentID
            currentID += 1

    for photoXml, output in xmlOutputList:
        with open(output, 'w') as out:
            out.write(str(currentID) + '\n')
            for photo in photoIterator(photoXml):
                # Start from a clean vector for every photo.  The original
                # allocated it once per file, so flags set for one photo
                # leaked into the lines of all following photos.
                indicator = [0] * currentID
                for gallery in photo.find("galleries").findall("gallery"):
                    indicator[thingID[gallery.attrib['id']]] = 1
                for location in photo.findall("location"):
                    if 'place_id' in location.attrib:
                        indicator[thingID[location.attrib['place_id']]] = 1
                indicator[thingID[photo.find('owner').attrib['nsid']]] = 1
                for groups in photo.findall("groups"):
                    for s in groups.findall("set"):
                        indicator[thingID[s.attrib['id']]] = 1
                out.write(photo.attrib['id'] + ' ' + ''.join([str(x) for x in indicator]) + '\n')
# People, sets, galleries, and locations
def photoCliques(photoXml, output):
    """Group the photos of `photoXml` by owner, set, gallery and place, and write the groups to `output`.

    The module-level photo id registry (`currentPhotoId`, `photoIds`) is reset
    and then refilled with the photos of this file.

    Output layout: the number of photos; the number of owners, sets,
    galleries and places (one per line); one line per registered photo id;
    then, for owners, sets, galleries and places in that order, one line per
    group: "<key> <size> <photo id> ...".
    """
    global currentPhotoId
    global photoIds
    currentPhotoId = 0
    photoIds = {}

    byPerson, bySet, byGallery, byLocation = {}, {}, {}, {}
    for photo in photoIterator(photoXml):
        pid = photo.attrib['id']
        getPhotoId(pid)
        for groups in photo.findall("groups"):
            for s in groups.findall("set"):
                addToSet(bySet, s.attrib['id'], pid)
        addToSet(byPerson, photo.find('owner').attrib['nsid'], pid)
        for gallery in photo.find("galleries").findall("gallery"):
            addToSet(byGallery, gallery.attrib['id'], pid)
        for location in photo.findall("location"):
            if 'place_id' in location.attrib:
                addToSet(byLocation, location.attrib['place_id'], pid)

    sections = (byPerson, bySet, byGallery, byLocation)
    out = open(output, 'w')
    out.write(str(currentPhotoId) + '\n')
    for section in sections:
        out.write(str(len(section)) + '\n')
    for k in photoIds:
        out.write(str(k) + '\n')
    for section in sections:
        for k, members in section.items():
            out.write(k + ' ' + str(len(members)) + ' ' + ' '.join(members) + '\n')
    out.close()
def binaryFeatures(idfile, output):
    """Concatenate per-photo feature files into one binary file.

    idfile: path of a text file with one integer photo id per line.
    output: path of the binary file to (over)write.

    For every id, in file order, the id packed with ``struct.pack('l', id)``
    is written, followed by the raw bytes of
    "data/features_1024/<first two characters of the id>/<id>.vec" (relative
    to the current working directory).  The running count is printed every
    1000 ids.
    """
    with open(idfile, 'r') as a:
        ids = [int(x.strip()) for x in a.readlines()]

    with open(output, 'wb') as o:
        for count, i in enumerate(ids, 1):
            if not (count % 1000):
                print(count)
            # 'l' is a native C long, so the record layout follows the
            # platform the file is written on.
            o.write(struct.pack('l', i))
            name = str(i)
            # Close each feature file right away; the original left every one
            # of them open.
            with open("data/features_1024/" + name[:2] + '/' + name + ".vec", 'rb') as x:
                o.write(x.read())
def binaryFeaturesNUS(idfile, output, which, urlfile="/lfs/1/flickr/flickr/NUSwide/NUS-WIDE-urls.txt"):
    """Write the feature line of every id in `idfile` to `output`.

    idfile:  path of a text file with one image id per line.
    output:  path of the text file to (over)write; each line is
             "<id> <feature line>".
    which:   path of the feature file; its n-th line belongs to the n-th
             image of `urlfile`.
    urlfile: path of the URL list.  Its first line is skipped and the second
             whitespace-separated field of every other line is the image id.
             Defaults to the location that used to be hard-coded.

    Raises IndexError if `which` has more lines than `urlfile` has images and
    KeyError if `idfile` names an id without a feature line.
    """
    with open(urlfile, 'r') as a:
        imids = [x.split()[1] for x in a.readlines()[1:]]

    imfeats = {}
    with open(which, 'r') as a:
        for ind, l in enumerate(a):
            imfeats[imids[ind]] = l.strip()

    # Both handles are closed on exit; the original left the id file open.
    with open(idfile, 'r') as a:
        with open(output, 'w') as b:
            for l in a:
                l = l.strip()
                b.write(l + ' ' + imfeats[l] + '\n')