#include "common.hpp"

/// Joint model of ratings (latent user/item factors) and review text
/// (per-item topics), in the style of "hidden factors / hidden topics".
/// Owns all of its heap allocations; released in the destructor.
class topicCorpus
{
public:
  /// Build the model on top of a loaded corpus: split votes into
  /// train/validation/test, allocate the flat weight vector, and
  /// initialize biases, topic assignments and word statistics.
  ///
  /// @param corp      Parsed review corpus (votes, vocabulary, id maps).
  /// @param K         Number of topics / latent factors.
  /// @param latentReg Regularizer for the latent factors.
  /// @param wordReg   Regularizer for the topic-word weights.
  /// @param lambda    Rating/text trade-off; lambda == 0 means topic
  ///                  proportions are loaded from a pre-trained LDA model.
  /// @param ldaModel  Path to the pre-trained LDA model (used iff lambda == 0).
  topicCorpus(corpus* corp,
              int K,
              double latentReg,
              double wordReg,
              double lambda,
              char* ldaModel) :
    corp(corp), K(K), latentReg(latentReg), wordReg(wordReg), lambda(lambda)
  {
    srand(0); // fixed seed: the train/valid/test split is reproducible

    nUsers = corp->nUsers;
    nBeers = corp->nBeers;
    nWords = corp->nWords;

    votesPerUser = new std::vector<vote*>[nUsers];
    votesPerBeer = new std::vector<vote*>[nBeers];
    trainVotesPerUser = new std::vector<vote*>[nUsers];
    trainVotesPerBeer = new std::vector<vote*>[nBeers];

    // Group all votes by user, then by item.
    for (std::vector<vote*>::iterator it = corp->V->begin(); it != corp->V->end(); it++)
    {
      vote* vi = *it;
      votesPerUser[vi->user].push_back(vi);
    }
    for (int user = 0; user < nUsers; user++)
      for (std::vector<vote*>::iterator it = votesPerUser[user].begin(); it != votesPerUser[user].end(); it++)
      {
        vote* vi = *it;
        votesPerBeer[vi->item].push_back(vi);
      }

    // Split: ~10% test, ~10% validation, rest train.  Very large corpora
    // are capped at ~2M training votes by shrinking the train fraction.
    double testFraction = 0.1;
    if (corp->V->size() > 2400000)
    {
      double trainFraction = 2000000.0 / corp->V->size();
      testFraction = (1.0 - trainFraction) / 2;
    }

    for (std::vector<vote*>::iterator it = corp->V->begin(); it != corp->V->end(); it++)
    {
      double r = rand() * 1.0 / RAND_MAX;
      if (r < testFraction)
        testVotes.insert(*it);
      else if (r < 2 * testFraction)
        validVotes.push_back(*it);
      else
      {
        trainVotes.push_back(*it);
        trainVotesPerUser[(*it)->user].push_back(*it);
        trainVotesPerBeer[(*it)->item].push_back(*it);
        if (nTrainingPerUser.find((*it)->user) == nTrainingPerUser.end())
          nTrainingPerUser[(*it)->user] = 0;
        if (nTrainingPerBeer.find((*it)->item) == nTrainingPerBeer.end())
          nTrainingPerBeer[(*it)->item] = 0;
        nTrainingPerUser[(*it)->user]++;
        nTrainingPerBeer[(*it)->item]++;
      }
    }

    // Collect test votes whose user or item never appears in training.
    // NOTE(review): the erase below is deliberately commented out, so
    // cold-start votes are still evaluated — confirm this is intended.
    std::vector<vote*> remove;
    for (std::set<vote*>::iterator it = testVotes.begin(); it != testVotes.end(); it++)
    {
      if (nTrainingPerUser.find((*it)->user) == nTrainingPerUser.end())
        remove.push_back(*it);
      else if (nTrainingPerBeer.find((*it)->item) == nTrainingPerBeer.end())
        remove.push_back(*it);
    }
    for (std::vector<vote*>::iterator it = remove.begin(); it != remove.end(); it++)
    {
      // testVotes.erase(*it);
    }

    // Total weight count: alpha + kappa + (bias + K factors) per user and
    // per item + K weights per vocabulary word.
    NW = 1 + 1 + (K + 1) * (nUsers + nBeers) + K * nWords;

    // Initialize parameters and latent variables: zero the flat weight
    // vector, then carve it into the named parameter views via getG.
    W = new double[NW];
    for (int i = 0; i < NW; i++)
      W[i] = 0;
    getG(W, &alpha, &kappa, &beta_user, &beta_beer, &gamma_user, &gamma_beer, &topicWords, true);

    // Set alpha to the average training rating.
    for (std::vector<vote*>::iterator vi = trainVotes.begin(); vi != trainVotes.end(); vi++)
      *alpha += (*vi)->value;
    *alpha /= trainVotes.size();

    double train, valid, test, testSte;
    validTestError(train, valid, test, testSte);
    printf("Error w/ offset term only = %f/%f/%f (%f)\n", train, valid, test, testSte);

    // Set beta to user and product offsets (mean residual around alpha).
    for (std::vector<vote*>::iterator vi = trainVotes.begin(); vi != trainVotes.end(); vi++)
    {
      vote* v = *vi;
      beta_user[v->user] += v->value - *alpha;
      beta_beer[v->item] += v->value - *alpha;
    }
    // Guard the divisions: a user/item with zero votes would otherwise
    // produce 0/0 = NaN and poison later error computations.
    for (int u = 0; u < nUsers; u++)
      if (!votesPerUser[u].empty())
        beta_user[u] /= votesPerUser[u].size();
    for (int b = 0; b < nBeers; b++)
      if (!votesPerBeer[b].empty())
        beta_beer[b] /= votesPerBeer[b].size();

    validTestError(train, valid, test, testSte);
    printf("Error w/ offset and bias = %f/%f/%f (%f)\n", train, valid, test, testSte);

    // Actually the model works better if we initialize none of these terms
    if (lambda > 0)
    {
      *alpha = 0;
      for (int u = 0; u < nUsers; u++)
        beta_user[u] = 0;
      for (int b = 0; b < nBeers; b++)
        beta_beer[b] = 0;
    }

    wordTopicCounts = new int*[nWords];
    for (int w = 0; w < nWords; w++)
    {
      wordTopicCounts[w] = new int[K];
      for (int k = 0; k < K; k++)
        wordTopicCounts[w][k] = 0;
    }

    // Generate random topic assignments for every word of every training
    // review, updating the per-item / per-word / global topic counters.
    topicCounts = new long long[K];
    for (int k = 0; k < K; k++)
      topicCounts[k] = 0;
    beerTopicCounts = new int*[nBeers];
    beerWords = new int[nBeers];
    for (int b = 0; b < nBeers; b++)
    {
      beerTopicCounts[b] = new int[K];
      for (int k = 0; k < K; k++)
        beerTopicCounts[b][k] = 0;
      beerWords[b] = 0;
    }
    for (std::vector<vote*>::iterator vi = trainVotes.begin(); vi != trainVotes.end(); vi++)
    {
      vote* v = *vi;
      wordTopics[v] = new int[v->words.size()];
      beerWords[(*vi)->item] += v->words.size();
      for (int wp = 0; wp < (int) v->words.size(); wp++)
      {
        int wi = v->words[wp];
        int t = rand() % K;
        wordTopics[v][wp] = t;
        beerTopicCounts[(*vi)->item][t]++;
        wordTopicCounts[wi][t]++;
        topicCounts[t]++;
      }
    }

    // Initialize the background word frequency: the unigram distribution
    // over all training review text.
    totalWords = 0;
    backgroundWords = new double[nWords];
    for (int w = 0; w < nWords; w++)
      backgroundWords[w] = 0;
    for (std::vector<vote*>::iterator vi = trainVotes.begin(); vi != trainVotes.end(); vi++)
      for (std::vector<int>::iterator it = (*vi)->words.begin(); it != (*vi)->words.end(); it++)
      {
        totalWords++;
        backgroundWords[*it]++;
      }
    for (int w = 0; w < nWords; w++)
      backgroundWords[w] /= totalWords;

    if (lambda == 0)
    {
      // lambda == 0: read per-product topic proportions from a pre-trained
      // LDA model file of the form "<product> <K floats>" per line.
      FILE* f = fopen_(ldaModel, "r");
      char* product = new char[100];
      // "%99s" bounds the read to the buffer size (was an unbounded "%s").
      while (fscanf(f, "%99s", product) == 1)
      {
        int b = corp->beerIds[std::string(product)];
        for (int k = 0; k < K; k++)
        {
          float x;
          if (fscanf(f, "%f", &x) != 1)
            break; // truncated model file: stop instead of using garbage
          gamma_beer[b][k] = x;
        }
      }
      delete[] product;
      fclose(f);
    }
    else
    {
      for (int w = 0; w < nWords; w++)
        for (int k = 0; k < K; k++)
          topicWords[w][k] = 0;
    }

    normalizeWordWeights();
    if (lambda > 0)
      updateTopics(true);

    *kappa = 1.0;
  }

  /// Release every allocation made in the constructor (vote groupings,
  /// counters, per-review topic assignments, and the weight vector).
  ~topicCorpus()
  {
    delete[] votesPerBeer;
    delete[] votesPerUser;
    delete[] trainVotesPerBeer;
    delete[] trainVotesPerUser;

    for (int w = 0; w < nWords; w++)
      delete[] wordTopicCounts[w];
    delete[] wordTopicCounts;

    for (int b = 0; b < nBeers; b++)
      delete[] beerTopicCounts[b];
    delete[] beerTopicCounts;
    delete[] beerWords;
    delete[] topicCounts;
    delete[] backgroundWords;

    for (std::vector<vote*>::iterator vi = trainVotes.begin(); vi != trainVotes.end(); vi++)
      delete[] wordTopics[*vi];

    clearG(&alpha, &kappa, &beta_user, &beta_beer, &gamma_user, &gamma_beer, &topicWords);
    delete[] W;
  }

  /// Predicted rating for a single vote.
  double prediction(vote* vi);

  /// Gradient of the objective w.r.t. the flat weight vector.
  void dl(double* grad);
  /// Alternate EM-style topic updates with gradient steps.
  void train(int emIterations, int gradIterations);
  /// Current (regularized) squared-error objective.
  double lsq(void);
  /// Compute train/validation/test error (testSte = test standard error).
  void validTestError(double& train, double& valid, double& test, double& testSte);
  void normalizeWordWeights(void);
  /// Persist the model and the best validation-time predictions.
  void save(char* modelPath, char* predictionPath);

  corpus* corp;

  // Vote data split into train / validation / test.
  std::vector<vote*> trainVotes;
  std::vector<vote*> validVotes;
  std::set<vote*> testVotes;

  // NOTE(review): element types of the two maps below are inferred — they
  // are not used in this portion of the file; confirm against the .cpp.
  std::map<vote*, double> bestValidPredictions;

  // Votes grouped per product and per user (all votes / training only).
  std::vector<vote*>* votesPerBeer;
  std::vector<vote*>* votesPerUser;
  std::vector<vote*>* trainVotesPerBeer;
  std::vector<vote*>* trainVotesPerUser;

  std::map<vote*, vote*> nearestVote;

  // Pack the named parameter views into / out of the flat vector g.
  int putG(double* g, double* alpha, double* kappa, double* beta_user, double* beta_beer, double** gamma_user, double** gamma_beer, double** topicWords);
  int getG(double* g, double** alpha, double** kappa, double** beta_user, double** beta_beer, double*** gamma_user, double*** gamma_beer, double*** topicWords, bool init);
  void clearG(double** alpha, double** kappa, double** beta_user, double** beta_beer, double*** gamma_user, double*** gamma_beer, double*** topicWords);

  void wordZ(double* res);
  void topicZ(int beer, double& res);
  void updateTopics(bool sample);
  void topWords();

  // Model parameters (all views into the flat vector W)
  double* alpha;           // global rating offset
  double* kappa;           // scaling parameter (initialized to 1)
  double* beta_user;       // per-user rating bias
  double* beta_beer;       // per-item rating bias
  double** gamma_user;     // latent user factors (nUsers x K)
  double** gamma_beer;     // latent item factors (nBeers x K)
  double* W;               // flat weight vector backing the views above
  double** topicWords;     // per-word topic weights (nWords x K)
  double* backgroundWords; // background unigram word distribution

  // Latent variables
  std::map<vote*, int*> wordTopics; // per-review topic assignment per word

  // Counters
  int** beerTopicCounts;  // How many times does each topic occur for each product?
  int* beerWords;         // number of training words seen per product
  long long* topicCounts; // How many times does each topic occur?
  int** wordTopicCounts;  // How many times does this topic occur for this word?
  long long totalWords;   // How many words are there?

  int NW; // total number of weights in W
  int K;  // number of topics / latent factors

  double latentReg;
  double wordReg;
  double lambda;

  std::map<int, int> nTrainingPerUser; // training votes per user id
  std::map<int, int> nTrainingPerBeer; // training votes per item id

  int whichTopics;

  int nUsers; // number of users
  int nBeers; // number of items
  int nWords; // vocabulary size
};