import flickrapi
from flickrapi import FlickrError
from elementtree.ElementTree import parse
from elementtree.ElementTree import ElementTree
import os
import random
import commands
import md5
import sys
import struct
from commands import getstatusoutput
import string
from nltk.stem.porter import PorterStemmer as stemmer

# Fix the seed so that any use of `random` is repeatable from run to run.
# NOTE(review): nothing in the visible part of this file calls into `random`.
random.seed(0)

from elementtree.ElementTree import dump
from elementtree.ElementTree import tostring
from elementtree.ElementTree import fromstring

import urllib2
import os
import threading
import thread
import time

def gso(command):
  """Run a shell command and return its (status, output) pair.

  Thin wrapper around commands.getstatusoutput.  When the command exits with
  a nonzero status, the command line and its output are printed and the whole
  script stops via sys.exit(1), so a status that is returned is always 0.
  """
  status, text = commands.getstatusoutput(command)
  if status == 0:
    return status, text
  print("Got nonzero exit status for %s" % (command,))
  print(text)
  sys.exit(1)

def photoXml(idfile, output):
  """Collect the <photo> records of the listed ids into one XML file.

  idfile holds one photo id per line; duplicates are collapsed and blank
  lines are ignored.  For every id the per-photo ".etree" file is parsed,
  preferring the copy under /mnt/ram/flickr/photos and falling back to
  /lfs/1/flickr/flickr/data/photos.  The <photo> element of each file is
  written to output inside a single <photos> root element.

  A running count is printed for every photo processed.
  """
  b = open(output, 'w')
  try:
    b.write("<photos>\n")

    x = open(idfile)
    try:
      items = set([l.strip() for l in x.readlines()])
    finally:
      x.close()
    # A blank line (e.g. at the end of the id file) is not a photo id.
    items.discard('')

    count = 0
    for l in items:
      count += 1
      print(count)
      relative = l[:2] + '/' + l + ".etree"
      path = "/mnt/ram/flickr/photos/" + relative
      if not os.path.exists(path):
        path = "/lfs/1/flickr/flickr/data/photos/" + relative
      photoTree = parse(path).find('photo')
      # The last two pieces of the serialised element are dropped and the
      # closing tag is written explicitly on a line of its own.
      # NOTE(review): presumably those pieces are the original closing tag
      # and a trailing empty string -- confirm against the .etree files.
      for l2 in tostring(photoTree).split('\n')[:-2]:
        b.write(l2 + '\n')
      b.write("</photo>\n")
    b.write("</photos>\n")
  finally:
    b.close()

def _augmentSourcePath(kind, photoID, extension):
  """Return the per-photo data file of the given kind, preferring the RAM-disk copy."""
  relative = kind + '/' + photoID[:2] + '/' + photoID + extension
  ramPath = "/mnt/ram/flickr/" + relative
  if os.path.exists(ramPath):
    return ramPath
  return "/lfs/1/flickr/flickr/data/" + relative

def _augmentCopyEtree(b, kind, wrapper, indent, skipOwnTags, photoID):
  """Copy a per-photo .etree file, minus its first and last line, into b inside <wrapper>."""
  x = open(_augmentSourcePath(kind, photoID, ".etree"))
  try:
    lines = x.readlines()
  finally:
    x.close()
  b.write("\t<" + wrapper + ">\n")
  for line in lines[1:-1]:
    # The wrapper element is written here, so the file's own opening and
    # closing tags of the same name are not copied a second time.
    if skipOwnTags and (line.startswith("<" + kind) or line.startswith("</" + kind)):
      continue
    b.write(indent + line)
  b.write("\t</" + wrapper + ">\n")

def _augmentWriteLabels(b, photoID):
  """Write a <labels> element holding every label found for photoID; nothing if there are none."""
  labels = []
  for source in ("NUS", "CLEF", "MIR", "PASCAL"):
    fname = _augmentSourcePath("labels" + source, photoID, ".labels")
    if os.path.exists(fname):
      x = open(fname)
      try:
        for line in x.readlines():
          labels.append((line.strip(), source))
      finally:
        x.close()
  if labels:
    b.write("\t<labels>\n")
    for label, source in labels:
      b.write('\t\t<label source="' + source + '">' + label + "</label>\n")
    b.write("\t</labels>\n")

def augment(photoXml, output, which = None):
  """Copy a photo XML file, inserting one extra section into every <photo>.

  photoXml -- path of the input file; a photo starts on a line beginning
              with "<photo " and ends on a line beginning with "</photo>".
  output   -- path of the file to write.
  which    -- section to insert just before each closing "</photo>" line:
              "contexts"  -> <groups>    from contexts/<id>.etree
              "galleries" -> <galleries> from galleries/<id>.etree
              "comments"  -> <comments>  from comments/<id>.etree
              "labels"    -> <labels>    from labels{NUS,CLEF,MIR,PASCAL}/<id>.labels
              Any other value (including the default None) copies the
              file unchanged.

  Data files are looked up under /mnt/ram/flickr first and under
  /lfs/1/flickr/flickr/data otherwise.  A missing .etree file raises
  IOError; missing .labels files are simply skipped.  The number of photos
  seen so far is printed every 1000 photos.
  """
  # which -> (wrapper element, indentation of copied lines, drop the file's own wrapper tags)
  etreeSections = {
    "contexts": ("groups", '\t\t', False),
    "galleries": ("galleries", '\t', True),
    "comments": ("comments", '\t', True),
  }
  a = open(photoXml)
  try:
    b = open(output, 'w')
    try:
      photoID = None
      count = 0
      for l in a:
        if l.startswith("<photo "):
          # The id is the text between the quotes following the first "id=".
          idstart = l.find("id=") + 4
          idend = l.find('"', idstart)
          photoID = l[idstart:idend]
          count += 1
          if not (count % 1000): print(count)
        if l.startswith("</photo>"):
          if which in etreeSections:
            wrapper, indent, skipOwnTags = etreeSections[which]
            _augmentCopyEtree(b, which, wrapper, indent, skipOwnTags, photoID)
          elif which == "labels":
            _augmentWriteLabels(b, photoID)
        b.write(l)
    finally:
      b.close()
  finally:
    a.close()

def buildIncremental(idfile, output):
  """Run the (currently only) active stage of the incremental build.

  Reads the photo XML file at output and writes a copy with a comments
  section added to every photo to output + "_comments".  idfile is not
  used while the earlier stages below remain commented out.
  """
  # Earlier stages, disabled:
  #gso("cd /mnt/ram/flickr/ && tar xf /lfs/1/flickr/flickr/data/photos.tar")
  #photoXml(idfile, output)
  #gso("rm /mnt/ram/flickr/* -rf")
  #for which in ("contexts", "galleries", "labels"):
    #gso("cd /mnt/ram/flickr/ && tar xf /lfs/1/flickr/flickr/data/" + which + ".tar")
    #augment(output, output + "_temp", which)
    #gso("mv " + output + "_temp " + output)
    #gso("rm /mnt/ram/flickr/* -rf")
  commentsOutput = output + "_comments"
  augment(output, commentsOutput, "comments")
  

def getSubset(idfile, output, source="/lfs/1/flickr/flickr/labeledPhotos.xml_comments", commentsDir="/lfs/1/flickr/flickr/data/comments/"):
  """Copy the photos listed in idfile from a photo XML file into output.

  idfile      -- file with one photo id per line.
  output      -- path of the XML file to write.
  source      -- photo XML file to filter (defaults to the labelled photo
                 dump this script was written for).
  commentsDir -- directory holding <id[:2]>/<id>.etree comment files.

  A photo is kept only if its id is listed in idfile AND its comments file
  exists under commentsDir.  Every line outside a rejected photo is copied,
  except a blank line directly following a "</photo>" line.  The number of
  photos seen so far is printed every 1000 photos (starting with 0).
  """
  a = open(source)
  try:
    b = open(output, 'w')
    try:
      x = open(idfile)
      try:
        items = set([l.strip() for l in x.readlines()])
      finally:
        x.close()

      count = 0
      currentlyValid = True   # False while inside a rejected photo
      lastline = ""
      for l in a:
        closing = l.startswith("</photo>")
        if closing:
          if not (count % 1000): print(count)
          count += 1
        if closing and not currentlyValid:
          # End of a rejected photo: drop its closing tag and resume copying.
          currentlyValid = True
        else:
          if l.startswith("<photo "):
            # The id is the text between the quotes following the first "id=".
            idstart = l.find("id=") + 4
            idend = l.find('"', idstart)
            photoID = l[idstart:idend]
            commentsFile = os.path.join(commentsDir, photoID[:2], photoID + ".etree")
            if photoID not in items or not os.path.exists(commentsFile):
              currentlyValid = False
          blankAfterPhoto = lastline.startswith("</photo>") and l == '\n'
          if currentlyValid and not blankAfterPhoto:
            b.write(l)
        lastline = l
    finally:
      b.close()
  finally:
    a.close()


def photoIterator(path):
  """Yield every photo record of a photo XML file as a parsed element.

  The first line of the file is discarded (photoXml() writes the opening
  <photos> tag there).  Lines are then accumulated until one contains
  "</photo>", at which point the accumulated text is parsed with
  fromstring() and yielded.  Anything after the last "</photo>" line is
  ignored.

  Progress is printed every 1000 photos.  If a record cannot be parsed its
  text is printed and the script exits with status 1.  The file is closed
  when the generator finishes or is discarded.
  """
  a = open(path)
  try:
    a.readline()
    linebuf = []
    count = 0
    for l in a:
      linebuf.append(l)
      if "</photo>" not in l:
        continue
      x = ''.join(linebuf)
      linebuf = []
      count += 1
      if not (count % 1000): print("Read " + str(count) + " photos")
      # Only the parse is guarded; errors raised by the consumer are not ours.
      try:
        photo = fromstring(x)
      except Exception:
        print(x)
        sys.exit(1)
      yield photo
  finally:
    a.close()

def peopleIterator(path):
  """Yield every <contactlist> record of a contact XML file as a parsed element.

  Lines are accumulated until one contains "</contactlist>".  The first
  accumulated line is then dropped -- in the files written by peopleXml()
  that is the opening <contactlists> tag for the first record and the blank
  separator line for every later one -- and the rest is parsed with
  fromstring() and yielded.

  Progress is printed every 1000 records.  The file is closed when the
  generator finishes or is discarded.
  """
  a = open(path)
  try:
    linebuf = []
    count = 0
    for l in a:
      linebuf.append(l)
      if "</contactlist>" not in l:
        continue
      x = ''.join(linebuf[1:])
      linebuf = []
      count += 1
      if not (count % 1000): print("Read " + str(count) + " contacts")
      yield fromstring(x)
  finally:
    a.close()

def peopleXml(photoXml, output):
  """Write the contact lists of every photo owner into one XML file.

  photoXml -- photo XML file; the owner nsid of each photo is collected.
  output   -- file to write: a <contactlists> root holding one
              <contactlist nsid="..."> element per distinct owner,
              each followed by a blank line.

  The body of each element is /mnt/ram/flickr/contacts/<nsid[:2]>/<nsid>.etree
  without its first and last line.  Progress is printed every 1000 people.
  """
  people = set()
  for photo in photoIterator(photoXml):
    people.add(photo.find('owner').attrib['nsid'])

  b = open(output, 'w')
  try:
    b.write("<contactlists>\n")
    count = 0
    for person in people:
      count += 1
      if not (count % 1000): print("Wrote " + str(count) + " people")
      x = open("/mnt/ram/flickr/contacts/" + person[:2] + '/' + person + ".etree")
      try:
        contactLines = x.readlines()
      finally:
        x.close()
      b.write('<contactlist nsid="' + person + '">\n')
      for l in contactLines[1:-1]:
        b.write(l)
      b.write("</contactlist>\n\n")
    b.write("</contactlists>\n")
  finally:
    b.close()

# Next dense integer id to hand out, and the nsid -> id table filled so far.
currentPersonId = 0
personIds = {}
def getPersonId(person):
  """Return the dense integer id of person, allocating the next free one on first sight.

  Ids are handed out in order of first appearance, starting at 0, and are
  remembered in the module-level personIds table.
  """
  global currentPersonId
  if person in personIds: return personIds[person]
  newId = currentPersonId
  personIds[person] = newId
  currentPersonId += 1
  # Return the id on first sight as well, not only on later lookups.
  return newId
  
# Next dense integer id to hand out, and the photo id -> integer table filled so far.
currentPhotoId = 0
photoIds = {}
def getPhotoId(photo):
  """Return the dense integer id of photo, allocating the next free one on first sight.

  Ids are handed out in order of first appearance, starting at 0, and are
  remembered in the module-level photoIds table.
  """
  global currentPhotoId
  if photo in photoIds: return photoIds[photo]
  newId = currentPhotoId
  photoIds[photo] = newId
  currentPhotoId += 1
  # Return the id on first sight as well, not only on later lookups.
  return newId

def peopleGraph(peopleXml, output):
  """Write the contact graph restricted to people that own a contact list.

  peopleXml -- contact XML file as read by peopleIterator().
  output    -- text file to write:
                 line 1: the value of the global currentPersonId
                 then:   one registered nsid per line
                 then:   "<nsid> <n> <contact nsid> ..." per contact list

  The input is read twice: first to register every contactlist nsid via
  getPersonId(), then to keep, for each list, only the contacts that were
  registered.
  """
  for contactss in peopleIterator(peopleXml):
    # We only care about people who have photos.
    getPersonId(contactss.attrib["nsid"])

  lines = {}
  for contactss in peopleIterator(peopleXml):
    conts = set()
    for contacts in contactss.findall("contacts"):
      for contact in contacts.findall("contact"):
        if contact.attrib['nsid'] in personIds:
          conts.add(contact.attrib["nsid"])
    lines[contactss.attrib["nsid"]] = list(conts)

  c = open(output, 'w')
  try:
    c.write(str(currentPersonId) + '\n')
    for k in personIds:
      c.write(str(k) + '\n')
    for k in lines:
      c.write(str(k) + ' ' + str(len(lines[k])))
      for k2 in lines[k]:
        c.write(' ' + str(k2))
      c.write('\n')
  finally:
    c.close()

def addSetCount(s, key):
  """Increment the counter stored under key in the mapping s, starting from 0."""
  s[key] = s.get(key, 0) + 1

def addToSet(s, key, value):
  """Add value to the set stored under key in the mapping s, creating the set if needed."""
  members = s.setdefault(key, set())
  members.add(value)
  
def wordsForPhoto(photo, stopwords):
  """Return the set of distinct non-stopword words describing a photo.

  photo     -- parsed <photo> element.
  stopwords -- container of lower-case words to leave out.

  The text of the <title>, the <description> and every <comments>/<comment>
  is lower-cased, stripped of all punctuation except the apostrophe, and
  split on whitespace.  A missing or empty title, description or comment
  contributes nothing.
  """
  exclude = set(string.punctuation)
  exclude.remove("'")

  texts = []
  for field in ('title', 'description'):
    # findtext() yields None when the element is absent, so a photo without
    # a title or description no longer raises AttributeError.
    text = photo.findtext(field)
    if text:
      texts.append(text)
  for comments in photo.findall('comments'):
    for c in comments.findall('comment'):
      if c.text is not None:
        texts.append(c.text)

  words = ' '.join(texts).lower()
  words = ''.join(ch for ch in words if ch not in exclude)
  ret = set()
  for w in words.split():
    # Best effort, as before: a word whose stopword test fails is printed
    # and skipped rather than aborting the run.
    try:
      if w not in stopwords:
        ret.add(w)
    except Exception:
      print(w)
  return ret

def photoText(xmlOutputList, groupIds=None):
  """Write bag-of-words features and per-label word rankings for each photo file.

  xmlOutputList -- list of (photoXml, output, outputIDs) path triples.
  groupIds      -- mapping from label name to integer id (e.g. the value
                   returned by photoInfo()).  When None, ids 40000, 40001,
                   ... are assigned in label order and a warning is printed.

  Stopwords are read from "stopwords.txt" in the current directory.  The
  vocabulary is built from the THIRD-FROM-LAST file of xmlOutputList: the
  (at most) 20000 words found in the most photos, ignoring words found in
  a single photo.
  NOTE(review): the index -3 is hard-coded; confirm which file is meant.

  For every triple, output receives
      <vocabulary size>
      <word id> <word>                 (one line per vocabulary word;
                                        "unknown_wordN" if it cannot be written)
      <number of photos in photoXml>
      <photo id> <owner nsid> <n> <word id>:<count> ...
                                       (one line per <labels> element of a photo)
  and outputIDs receives, for every label,
      <group id> <label> <n> <word id>:<score> ...
  sorted by descending score, where score is the word's frequency among the
  photos carrying the label divided by its vocabulary count over the number
  of photos in the file; only words seen with the label more than once are
  listed.
  """
  a = open("stopwords.txt")
  try:
    stopwords = set([l.strip() for l in a.readlines()])
  finally:
    a.close()

  allLabels_ = {'PASCAL': set(['sheep', 'horse', 'bicycle', 'aeroplane', 'cow', 'sofa', 'motorbike', 'dog', 'cat', 'person', 'train', 'diningtable', 'bottle', 'bus', 'pottedplant', 'tvmonitor', 'chair', 'bird', 'boat', 'car']), 'MIR': set(['night', 'people', 'clouds', 'sea', 'river', 'portrait', 'male', 'female', 'baby', 'tree', 'flower', 'dog', 'bird', 'car']), 'CLEF': set(['cute', 'Partylife', 'Winter', 'Sky', 'inactive', 'Vehicle', 'Overexposed', 'No_Blur', 'Family_Friends', 'Underexposed', 'Trees', 'Landscape_Nature', 'Day', 'MusicalInstrument', 'River', 'Motion_Blur', 'Summer', 'Visual_Arts', 'car', 'Out_of_focus', 'Small_Group', 'bicycle', 'fish', 'Sports', 'Citylife', 'Building_Sights', 'artificial', 'Water', 'Baby', 'bird', 'Plants', 'Overall_Quality', 'Clouds', 'Work', 'Architecture', 'active', 'bodypart', 'Still_Life', 'melancholic', 'scary', 'technical', 'skateboard', 'Autumn', 'Portrait', 'Aesthetic_Impression', 'Toy', 'Travel', 'Snow', 'Adult', 'female', 'Mountains', 'horse', 'Sunny', 'euphoric', 'unpleasant', 'calm', 'Child', 'Flowers', 'Desert', 'happy', 'Animals', 'Partly_Blurred', 'train', 'Night', 'Single_Person', 'Painting', 'Neutral_Illumination', 'Sunset_Sunrise', 'boring', 'Indoor', 'cat', 'Sea', 'Church', 'Birthday', 'male', 'abstract', 'Lake', 'Teenager', 'airplane', 'ship', 'funny', 'Food', 'Shadow', 'Rain', 'Big_Group', 'Macro', 'insect', 'Beach_Holidays', 'Bridge', 'Outdoor', 'natural', 'Spring', 'old_person', 'No_Persons', 'dog', 'Fancy', 'Street', 'Graffiti', 'Park_Garden']), 'NUS': set(['bridge', 'map', 'glacier', 'house', 'fish', 'sign', 'snow', 'fox', 'protest', 'street', 'zebra', 'train', 'rocks', 'birds', 'earthquake', 'surf', 'mountain', 'toy', 'police', 'dancing', 'whales', 'sun', 'sky', 'lake', 'tree', 'moon', 'fire', 'window', 'book', 'animal', 'vehicle', 'boats', 'flags', 'valley', 'beach', 'temple', 'horses', 'plants', 'buildings', 'elk', 'garden', 'tattoo', 'food', 'cars', 'frost', 'bear', 'water', 'sports', 'running', 
'waterfall', 'sand', 'statue', 'wedding', 'cityscape', 'leaf', 'railroad', 'flowers', 'castle', 'rainbow', 'town', 'tiger', 'soccer', 'clouds', 'reflection', 'cow', 'harbor', 'coral', 'nighttime', 'dog', 'ocean', 'swimmers', 'person', 'airport', 'plane', 'sunset', 'cat', 'military', 'tower', 'grass', 'computer', 'road'])}
  # "<SOURCE>:<label>" names, sorted within each source, plus the synthetic
  # 'taken_at_home' label at the end.
  allLabels = []
  for source in ("PASCAL", "CLEF", "MIR", "NUS"):
    allLabels += sorted([source + ":" + l for l in allLabels_[source]])
  allLabels.append('taken_at_home')

  if groupIds is None:
    groupIds = dict(zip(allLabels, range(40000, 40000 + len(allLabels))))
    print("WARNING: SETTING GROUP IDS MANUALLY")

  # Number of photos each word occurs in (wordsForPhoto() returns a set).
  wordCount = {}
  for photo in photoIterator(xmlOutputList[-3][0]):
    for w in wordsForPhoto(photo, stopwords):
      addSetCount(wordCount, w)

  # Per file: number of photos, and number of photos carrying each label.
  counts = {}
  countsLabel = {}
  for photoXml, output, outputIDs in xmlOutputList:
    countLabel = dict([(l, 0) for l in allLabels])
    count = 0
    for photo in photoIterator(photoXml):
      for labels in photo.findall("labels"):
        for label in labels.findall("label"):
          l = label.attrib['source'] + ':' + label.text
          countLabel[l] += 1
      if takenAtHome(photo) == '1':
        countLabel['taken_at_home'] += 1
      count += 1
    counts[photoXml] = count
    countsLabel[photoXml] = countLabel

  frequencies = [(wordCount[w], w) for w in wordCount.keys() if wordCount[w] > 1]
  frequencies.sort()
  frequencies.reverse()
  frequencies = frequencies[:20000]
  wordIds = {}      # word -> vocabulary id (0 = most frequent)
  wordCount_ = {}   # vocabulary id -> photo count
  count = 0
  for c,w in frequencies:
    wordIds[w] = count
    wordCount_[count] = c
    count += 1
  frequencies = [f[1] for f in frequencies]

  for photoXml, output, outputIDs in xmlOutputList:
    wordCountLabel = dict([(l,{}) for l in allLabels])
    featureFile = open(output, 'w')
    try:
      featureFile.write(str(len(frequencies)) + '\n')
      count = 0
      unprintable = 0
      for f in frequencies:
        # A word that cannot be written (e.g. not encodable) gets a placeholder.
        try:
          featureFile.write(str(count) + ' ' + f + '\n')
        except Exception:
          featureFile.write(str(count) + ' ' + "unknown_word" + str(unprintable) + '\n')
          unprintable += 1
        count += 1
      # Terminate the count line so it does not fuse with the first photo line.
      featureFile.write(str(counts[photoXml]) + '\n')
      for photo in photoIterator(photoXml):
        words = wordsForPhoto(photo, stopwords)
        photoWordIds = [wordIds[w] for w in words if w in wordIds]
        # Histogram of the photo's vocabulary words, built once per photo so
        # that it is defined even for a <labels> element without <label>s.
        hist = {}
        for wordId in photoWordIds:
          addSetCount(hist, wordId)
        for labels in photo.findall("labels"):
          for label in labels.findall("label"):
            l = label.attrib['source'] + ':' + label.text
            for wordId in photoWordIds:
              addSetCount(wordCountLabel[l], wordId)
          featureFile.write(photo.attrib['id'] + ' ' + photo.find('owner').attrib['nsid'] + ' ')
          out = [str(h) + ':' + str(hist[h]) for h in hist.keys()]
          featureFile.write(str(len(out)) + ' ' + ' '.join(out) + '\n')
        if takenAtHome(photo) == '1':
          # The label counts were already gathered above; only the word
          # statistics are accumulated in this pass.
          for wordId in photoWordIds:
            addSetCount(wordCountLabel['taken_at_home'], wordId)
    finally:
      featureFile.close()

    idFile = open(outputIDs, 'w')
    try:
      for l in allLabels:
        vk = []
        for k in wordCountLabel[l].keys():
          if not k in wordCount_: continue
          frequencyPositive = wordCountLabel[l][k] * 1.0/countsLabel[photoXml][l]
          frequencyAll = wordCount_[k] * 1.0/counts[photoXml]
          d = frequencyPositive * 1.0/frequencyAll
          if wordCountLabel[l][k] > 1:
            vk.append((d, k))

        vk.sort()
        vk.reverse()
        wordsToUse = [str(v[1]) + ':' + str(v[0]) for v in vk]

        idFile.write(str(groupIds[l]) + ' ' + l + ' ' + str(len(wordsToUse)) + ' ' + ' '.join(wordsToUse) + '\n')
    finally:
      idFile.close()
        
def takenAtHome(photo):
  """Tell whether a photo was taken in its owner's home town.

  The owner's "location" attribute is split on commas (with "USA" expanded
  to "United States") and lower-cased; the photo's places are the lower-cased
  texts of the locality, county, region and country elements under each
  <location>.  Empty place elements are ignored.

  Returns
    '1' -- the first component of the owner's location is one of the photo's places;
    '0' -- otherwise, when both the owner and the photo have more than one component;
    '.' -- otherwise (not enough information).
  """
  ownerLocation = []
  ownerAttributes = photo.find('owner').attrib
  if 'location' in ownerAttributes:
    ownerLocation = ownerAttributes['location'].replace(', ', ',').replace("USA", "United States").lower().split(',')

  photoLocation = []
  for location in photo.findall("location"):
    for level in ('locality', 'county', 'region', 'country'):
      for place in location.findall(level):
        # An element without text would otherwise fail on None.lower().
        if place.text:
          photoLocation.append(place.text.lower())

  if len(ownerLocation) and ownerLocation[0] in photoLocation:
    return '1'
  if len(ownerLocation) > 1 and len(photoLocation) > 1:
    return '0'
  return '.'
    
# Tags and groups
def _photoInfoRanking(occurrences, totals, positives, total):
  """Return "id:score" strings for the ids in occurrences, best score first.

  occurrences -- id -> number of label-positive photos having the id
  totals      -- id -> overall count of the id; ids missing here are skipped
  positives   -- number of label-positive photos
  total       -- number of photos in the file
  Only ids seen more than once with the label are listed.
  """
  vk = []
  for k in occurrences.keys():
    if not k in totals: continue
    frequencyPositive = occurrences[k] * 1.0/positives # How many times does this term occur in positively labeled images?
    frequencyAll = totals[k] * 1.0/total # How many times does it occur altogether?
    d = frequencyPositive * 1.0/frequencyAll
    if occurrences[k] > 1:
      vk.append((d, k))
  vk.sort()
  vk.reverse()
  return [str(v[1]) + ':' + str(v[0]) for v in vk]

def photoInfo(xmlOutputList):
  """Write group/tag/label indicator vectors and per-label rankings for each photo file.

  xmlOutputList -- list of (photoXml, output, outputIDs) path triples.

  Group pools and tags are counted over the THIRD-FROM-LAST file of
  xmlOutputList; the (at most) 20000 most common groups get ids
  0..NGroups-1, the (at most) 20000 most common tags the following NTags
  ids, and the labels ("<SOURCE>:<label>" plus 'taken_at_home') the ids
  after that.  Groups, tags and labels share one name -> id table.
  NOTE(review): the index -3 is hard-coded; confirm which file is meant.

  The photo ids of each label source's training/testing set are read from
  data/sets/_<SOURCE>.txt.

  For every triple, output receives
      <NGroups> <NTags> <number of labels>
      <id> <name>                    (one line per id; "unknown_groupN" when
                                      the name is missing or cannot be written)
      <number of photos in photoXml>
      <photo id> <owner nsid> <indicator string>
  where the indicator string has one character per id: '1' if the photo has
  the group/tag/label, '0' for an absent label of a source whose set
  contains the photo, '.' otherwise; the 'taken_at_home' position holds the
  value of takenAtHome().  outputIDs receives, for every label,
      <id> <label> <n groups> <n tags> <group id>:<score> ... <tag id>:<score> ...

  Returns the name -> id table.
  """
  allLabels_ = {'PASCAL': set(['sheep', 'horse', 'bicycle', 'aeroplane', 'cow', 'sofa', 'motorbike', 'dog', 'cat', 'person', 'train', 'diningtable', 'bottle', 'bus', 'pottedplant', 'tvmonitor', 'chair', 'bird', 'boat', 'car']), 'MIR': set(['night', 'people', 'clouds', 'sea', 'river', 'portrait', 'male', 'female', 'baby', 'tree', 'flower', 'dog', 'bird', 'car']), 'CLEF': set(['cute', 'Partylife', 'Winter', 'Sky', 'inactive', 'Vehicle', 'Overexposed', 'No_Blur', 'Family_Friends', 'Underexposed', 'Trees', 'Landscape_Nature', 'Day', 'MusicalInstrument', 'River', 'Motion_Blur', 'Summer', 'Visual_Arts', 'car', 'Out_of_focus', 'Small_Group', 'bicycle', 'fish', 'Sports', 'Citylife', 'Building_Sights', 'artificial', 'Water', 'Baby', 'bird', 'Plants', 'Overall_Quality', 'Clouds', 'Work', 'Architecture', 'active', 'bodypart', 'Still_Life', 'melancholic', 'scary', 'technical', 'skateboard', 'Autumn', 'Portrait', 'Aesthetic_Impression', 'Toy', 'Travel', 'Snow', 'Adult', 'female', 'Mountains', 'horse', 'Sunny', 'euphoric', 'unpleasant', 'calm', 'Child', 'Flowers', 'Desert', 'happy', 'Animals', 'Partly_Blurred', 'train', 'Night', 'Single_Person', 'Painting', 'Neutral_Illumination', 'Sunset_Sunrise', 'boring', 'Indoor', 'cat', 'Sea', 'Church', 'Birthday', 'male', 'abstract', 'Lake', 'Teenager', 'airplane', 'ship', 'funny', 'Food', 'Shadow', 'Rain', 'Big_Group', 'Macro', 'insect', 'Beach_Holidays', 'Bridge', 'Outdoor', 'natural', 'Spring', 'old_person', 'No_Persons', 'dog', 'Fancy', 'Street', 'Graffiti', 'Park_Garden']), 'NUS': set(['bridge', 'map', 'glacier', 'house', 'fish', 'sign', 'snow', 'fox', 'protest', 'street', 'zebra', 'train', 'rocks', 'birds', 'earthquake', 'surf', 'mountain', 'toy', 'police', 'dancing', 'whales', 'sun', 'sky', 'lake', 'tree', 'moon', 'fire', 'window', 'book', 'animal', 'vehicle', 'boats', 'flags', 'valley', 'beach', 'temple', 'horses', 'plants', 'buildings', 'elk', 'garden', 'tattoo', 'food', 'cars', 'frost', 'bear', 'water', 'sports', 'running', 
'waterfall', 'sand', 'statue', 'wedding', 'cityscape', 'leaf', 'railroad', 'flowers', 'castle', 'rainbow', 'town', 'tiger', 'soccer', 'clouds', 'reflection', 'cow', 'harbor', 'coral', 'nighttime', 'dog', 'ocean', 'swimmers', 'person', 'airport', 'plane', 'sunset', 'cat', 'military', 'tower', 'grass', 'computer', 'road'])}
  labelSources = ['PASCAL', 'CLEF', 'MIR', 'NUS']

  # "<SOURCE>:<label>" names, sorted within each source, plus the synthetic
  # 'taken_at_home' label at the end.
  allLabels = []
  for s in labelSources:
    allLabels += sorted([s + ":" + l for l in allLabels_[s]])
  allLabels.append('taken_at_home')

  trainingTestingPhotos = {}
  for s in labelSources:
    a = open("data/sets/_" + s + ".txt")
    try:
      trainingTestingPhotos[s] = set([p.strip() for p in a.readlines()])
    finally:
      a.close()

  groupCount = {}
  tagCount = {}
  for photo in photoIterator(xmlOutputList[-3][0]):
    for tag in photo.find("tags").findall("tag"):
      addSetCount(tagCount, tag.text)
    for groups in photo.findall("groups"):
      for g in groups.findall("pool"):
        addSetCount(groupCount, g.attrib['id'])

  # Per file: number of photos, and number of photos carrying each label.
  counts = {}
  countsLabel = {}
  for photoXml,_,_ in xmlOutputList:
    countLabel = dict([(l, 0) for l in allLabels])
    count = 0
    for photo in photoIterator(photoXml):
      for labels in photo.findall("labels"):
        for label in labels.findall("label"):
          l = label.attrib['source'] + ':' + label.text
          countLabel[l] += 1
      if takenAtHome(photo) == '1':
        countLabel['taken_at_home'] += 1
      count += 1
    counts[photoXml] = count
    countsLabel[photoXml] = countLabel

  NGroups = min(20000, len(groupCount))
  NTags = min(20000, len(tagCount))
  currentId = 0
  groupIds = {}      # group id / tag text / label name -> integer id

  groupCount_ = {}   # integer id of a group -> its count
  tagCount_ = {}     # integer id of a tag -> its count

  # Find the most common groups
  vk = [(groupCount[k], k) for k in groupCount.keys()]
  vk.sort()
  vk.reverse()
  for v in vk[:NGroups]:
    groupIds[v[1]] = currentId
    groupCount_[currentId] = v[0]
    currentId += 1

  # Find the most common tags
  vk = [(tagCount[k], k) for k in tagCount.keys()]
  vk.sort()
  vk.reverse()
  for v in vk[:NTags]:
    groupIds[v[1]] = currentId
    tagCount_[currentId] = v[0]
    currentId += 1

  for l in allLabels:
    groupIds[l] = currentId
    currentId += 1

  idGroup = dict([(groupIds[x], x) for x in groupIds.keys()])
  print("CURRENT ID= " + str(currentId))

  for photoXml, output, outputIDs in xmlOutputList:
    tagsForLabel = dict([(l,dict()) for l in allLabels])
    groupsForLabel = dict([(l,dict()) for l in allLabels])

    # Vector: id user group_indicator tag_indicator label_indicator
    vectorFile = open(output, 'w')
    try:
      vectorFile.write(str(NGroups) + ' ' + str(NTags) + ' ' + str(len(allLabels)) + '\n')
      unknown_id = 1
      for i in range(0, NGroups + NTags):
        # An id whose name is missing from the table, or cannot be written,
        # gets a numbered placeholder instead.
        try:
          vectorFile.write(str(i) + ' ' + idGroup[i] + '\n')
        except Exception:
          vectorFile.write(str(i) + ' ' + "unknown_group" + str(unknown_id) + '\n')
          unknown_id += 1
      for i in range(NGroups + NTags, NGroups + NTags + len(allLabels)):
        vectorFile.write(str(i) + ' ' + idGroup[i] + '\n')
      vectorFile.write(str(counts[photoXml]) + '\n')

      for photo in photoIterator(photoXml):
        photoIndicator = ['.']*(NGroups+NTags+len(allLabels))
        tagsForPhoto = set()
        groupsForPhoto = set()
        for tag in photo.find("tags").findall("tag"):
          if tag.text in groupIds:
            photoIndicator[groupIds[tag.text]] = '1'
            tagsForPhoto.add(groupIds[tag.text])
        for groups in photo.findall("groups"):
          for g in groups.findall("pool"):
            if g.attrib['id'] in groupIds:
              photoIndicator[groupIds[g.attrib['id']]] = '1'
              groupsForPhoto.add(groupIds[g.attrib['id']])
        # For a photo in a source's set every label of that source is known:
        # default them to '0'; the photo's own labels are set to '1' below.
        for s in labelSources:
          if photo.attrib['id'] in trainingTestingPhotos[s]:
            for l in allLabels_[s]:
              photoIndicator[groupIds[s + ':' + l]] = '0'
        for labels in photo.findall("labels"):
          for label in labels.findall("label"):
            l = label.attrib['source'] + ':' + label.text
            photoIndicator[groupIds[l]] = '1'
            for t in tagsForPhoto:
              addSetCount(tagsForLabel[l], t)
            for g in groupsForPhoto:
              addSetCount(groupsForLabel[l], g)
        sim = takenAtHome(photo)
        photoIndicator[groupIds['taken_at_home']] = sim
        if sim == '1':
          for t in tagsForPhoto:
            addSetCount(tagsForLabel['taken_at_home'], t)
          for g in groupsForPhoto:
            addSetCount(groupsForLabel['taken_at_home'], g)
        vectorFile.write(photo.attrib['id'] + ' ' + photo.find('owner').attrib['nsid'] + ' ' + ''.join(photoIndicator) + '\n')
    finally:
      vectorFile.close()

    idFile = open(outputIDs, 'w')
    try:
      for l in allLabels:
        positives = countsLabel[photoXml][l]
        total = counts[photoXml]
        groupsToUse = _photoInfoRanking(groupsForLabel[l], groupCount_, positives, total)
        tagsToUse = _photoInfoRanking(tagsForLabel[l], tagCount_, positives, total)
        idFile.write(str(groupIds[l]) + ' ' + l + ' ' + str(len(groupsToUse)) + ' ' + str(len(tagsToUse)) + ' ' + ' '.join(groupsToUse) + ' ' + ' '.join(tagsToUse) + '\n')
    finally:
      idFile.close()
  return groupIds

  
def _otherStuffMemberships(photo):
  """Return the (kind, id) pairs of the galleries, places, owner and sets of a photo."""
  things = []
  for gallery in photo.find("galleries").findall("gallery"):
    things.append(('gallery', gallery.attrib['id']))
  for location in photo.findall("location"):
    if 'place_id' in location.attrib:
      things.append(('location', location.attrib['place_id']))
  things.append(('person', photo.find('owner').attrib['nsid']))
  for groups in photo.findall("groups"):
    for s in groups.findall("set"):
      things.append(('set', s.attrib['id']))
  return things

def otherStuffIndicator(xmlOutputList):
  """Write, for each photo, a 0/1 vector over all sets, galleries, places and owners.

  xmlOutputList -- list of (photoXml, output) path pairs.

  A first pass over every input file collects the distinct set ids, gallery
  ids, location place_ids and owner nsids; they are numbered in that order
  of kinds.  Each output file then receives
      <vector length>
      <photo id> <indicator string>
  where the indicator string holds a '1' at the position of every set,
  gallery, place and owner of that photo and '0' elsewhere.
  """
  # Insertion order fixes the block order of the vector.
  kinds = ('set', 'gallery', 'location', 'person')
  collected = dict([(kind, set()) for kind in kinds])
  for photoXml,_ in xmlOutputList:
    for photo in photoIterator(photoXml):
      for kind, thing in _otherStuffMemberships(photo):
        collected[kind].add(thing)

  # Keyed by (kind, id) so that equal id strings of different kinds cannot
  # overwrite each other's position.
  thingID = {}
  for kind in kinds:
    for thing in collected[kind]:
      thingID[(kind, thing)] = len(thingID)
  currentID = len(thingID)

  for photoXml,output in xmlOutputList:
    indicatorFile = open(output, 'w')
    try:
      indicatorFile.write(str(currentID) + '\n')
      for photo in photoIterator(photoXml):
        # Fresh vector per photo: bits of earlier photos must not carry over.
        indicator = ['0']*currentID
        for membership in _otherStuffMemberships(photo):
          indicator[thingID[membership]] = '1'
        indicatorFile.write(photo.attrib['id'] + ' ' + ''.join(indicator) + '\n')
    finally:
      indicatorFile.close()
  

# People, sets, galleries, and locations
def photoCliques(photoXml, output):
  """Write the groups of photos that share an owner, set, gallery or place.

  photoXml -- photo XML file to read.
  output   -- text file to write:
                 the number of photos
                 the number of owners, sets, galleries and places (one per line)
                 every photo id, one per line
                 "<key> <n> <photo id> ..." for each owner, then each set,
                 each gallery and each place_id

  The module-level photo id table (photoIds / currentPhotoId) is reset and
  refilled as a side effect.
  """
  global currentPhotoId
  global photoIds
  currentPhotoId = 0
  photoIds = {}
  personPhotos = {}
  setPhotos = {}
  galleryPhotos = {}
  locationPhotos = {}

  for photo in photoIterator(photoXml):
    photoID = photo.attrib['id']
    getPhotoId(photoID)
    for groups in photo.findall("groups"):
      for s in groups.findall("set"):
        addToSet(setPhotos, s.attrib['id'], photoID)
    addToSet(personPhotos, photo.find('owner').attrib['nsid'], photoID)
    for gallery in photo.find("galleries").findall("gallery"):
      addToSet(galleryPhotos, gallery.attrib['id'], photoID)
    for location in photo.findall("location"):
      if 'place_id' in location.attrib:
        addToSet(locationPhotos, location.attrib['place_id'], photoID)

  # The header counts and the clique sections are written in this order.
  cliqueTables = (personPhotos, setPhotos, galleryPhotos, locationPhotos)
  cliqueFile = open(output, 'w')
  try:
    cliqueFile.write(str(currentPhotoId) + '\n')
    for table in cliqueTables:
      cliqueFile.write(str(len(table)) + '\n')
    for k in photoIds:
      cliqueFile.write(str(k) + '\n')
    for table in cliqueTables:
      for k in table:
        s = table[k]
        cliqueFile.write(k + ' ' + str(len(s)) + ' ' + ' '.join(s) + '\n')
  finally:
    cliqueFile.close()

def binaryFeatures(idfile, output):
  """Concatenate the feature vectors of the listed photos into one binary file.

  idfile -- text file with one integer photo id per line.
  output -- binary file to write.  For every id, in file order, it receives
            the id packed with struct format 'l' (native long) followed by
            the raw contents of data/features_1024/<id[:2]>/<id>.vec
            (path relative to the current directory).

  The number of ids processed so far is printed every 1000 ids.
  """
  a = open(idfile, 'r')
  try:
    ids = [int(x.strip()) for x in a.readlines()]
  finally:
    a.close()

  o = open(output, 'wb')
  try:
    count = 0
    for i in ids:
      count += 1
      if not (count % 1000): print(count)
      o.write(struct.pack('l', i))
      name = str(i)
      x = open("data/features_1024/" + name[:2] + '/' + name + ".vec", 'rb')
      # Close each vector file right away instead of leaving one handle per photo.
      try:
        o.write(x.read())
      finally:
        x.close()
  finally:
    o.close()
  
def binaryFeaturesNUS(idfile, output, which, urls="/lfs/1/flickr/flickr/NUSwide/NUS-WIDE-urls.txt"):
  """Write "<id> <features>" lines for the listed NUS-WIDE images.

  idfile -- text file with one image id per line.
  output -- text file to write, one "<id> <feature text>" line per id, in
            the order of idfile.
  which  -- feature file; its k-th line holds the features of the k-th image
            of the url list.
  urls   -- url list: the first line is skipped and the second
            whitespace-separated field of every other line is the image id.

  Raises IndexError if the feature file has more lines than the url list
  has images, and KeyError for an id that has no features.
  """
  a = open(urls, 'r')
  try:
    imids = [x.split()[1] for x in a.readlines()[1:]]
  finally:
    a.close()

  imfeats = {}
  a = open(which, 'r')
  try:
    for ind, l in enumerate(a.readlines()):
      imfeats[imids[ind]] = l.strip()
  finally:
    a.close()

  a = open(idfile, 'r')
  try:
    b = open(output, 'w')
    try:
      for l in a.readlines():
        l = l.strip()
        b.write(l + ' ' + imfeats[l] + '\n')
    finally:
      b.close()
  finally:
    a.close()