#pragma once #include "stdio.h" #include "stdlib.h" #include "vector" #include "math.h" #include "string.h" #include #include #include "omp.h" #include "map" #include "set" #include "vector" #include "common.hpp" #include "algorithm" #include "lbfgs.h" #include "sstream" #include "gzstream.h" FILE* fopen_(const char* p, const char* m) { FILE* f = fopen(p, m); if (!f) { printf("Failed to open %s\n", p); exit(1); } return f; } struct vote { // stuff used by recommender system int user; int item; float value; int voteTime; std::vector words; }; typedef struct vote vote; bool wordCountCompare(std::pair p1, std::pair p2) { return p1.second > p2.second; } bool voteCompare(vote* v1, vote* v2) { return v1->item > v2->item; } template int sgn(T val) { return (val > T(0)) - (val < T(0)); } class corpus { public: corpus(std::string voteFile, int max) { std::map uCounts; std::map bCounts; std::string uName; std::string bName; float value; int voteTime; int nw; int nRead = 0; igzstream in; in.open(voteFile.c_str()); std::string line; std::string sWord; // printf("Fields are swapped\n"); while (std::getline(in, line)) { std::stringstream ss(line); ss >> uName >> bName >> value >> voteTime >> nw; if (value > 5 or value < 0) { printf("Got bad value of %f\nOther fields were %s %s %d\n", value, uName.c_str(), bName.c_str(), voteTime); exit(0); } for (int w = 0; w < nw; w++) { ss >> sWord; if (wordCount.find(sWord) == wordCount.end()) wordCount[sWord] = 0; wordCount[sWord]++; } if (uCounts.find(uName) == uCounts.end()) uCounts[uName] = 0; if (bCounts.find(bName) == bCounts.end()) bCounts[bName] = 0; uCounts[uName]++; bCounts[bName]++; nRead++; if (nRead % 100000 == 0) { printf("."); fflush(stdout); } if (max > 0 and (int) nRead >= max) break; } in.close(); printf("\nnUsers = %d, nItems = %d, nRatings = %d\n", (int) uCounts.size(), (int) bCounts.size(), nRead); V = new std::vector(); vote* v = new vote(); std::map userIds; std::map beerIds; nUsers = 0; nBeers = 0; // Comment this block to include all users // nUsers = 1; // nBeers = 1; // userIds["NOT_ENOUGH_VOTES"] = 0; // beerIds["NOT_ENOUGH_VOTES"] = 0; // rUserIds[0] = "NOT_ENOUGH_VOTES"; // rBeerIds[0] = "NOT_ENOUGH_VOTES"; // vote* v_ = new vote(); // v_->user = 0; // v_->item = 0; // v_->value = 0; // v_->voteTime = 0; // V->push_back(v_); int userMin = 0; int beerMin = 0; int maxWords = 5000; std::vector < std::pair > whichWords; for (std::map::iterator it = wordCount.begin(); it != wordCount.end(); it++) whichWords.push_back(*it); sort(whichWords.begin(), whichWords.end(), wordCountCompare); if ((int) whichWords.size() < maxWords) maxWords = (int) whichWords.size(); nWords = maxWords; for (int w = 0; w < maxWords; w++) { wordId[whichWords[w].first] = w; idWord[w] = whichWords[w].first; } igzstream in2; in2.open(voteFile.c_str()); nRead = 0; while (std::getline(in2, line)) { std::stringstream ss(line); ss >> uName >> bName >> value >> voteTime >> nw; for (int w = 0; w < nw; w++) { ss >> sWord; if (wordId.find(sWord) != wordId.end()) v->words.push_back(wordId[sWord]); } if (uCounts[uName] >= userMin) { if (userIds.find(uName) == userIds.end()) { rUserIds[nUsers] = uName; userIds[uName] = nUsers++; } v->user = userIds[uName]; } else v->user = 0; if (bCounts[bName] >= beerMin) { if (beerIds.find(bName) == beerIds.end()) { rBeerIds[nBeers] = bName; beerIds[bName] = nBeers++; } v->item = beerIds[bName]; } else v->item = 0; v->value = value; v->voteTime = voteTime; V->push_back(v); v = new vote(); nRead++; if (nRead % 100000 == 0) { printf("."); fflush( stdout); } if (max > 0 and (int) nRead >= max) break; } // sort(V->begin(), V->end(), voteCompare); printf("\n"); // printf("nUsers = %d, nItems = %d, nRatings = %d\n", (int) userIds.size(), (int) beerIds.size(), (int) V->size()); delete v; in2.close(); } ~corpus() { for (std::vector::iterator it = V->begin(); it != V->end(); it++) delete *it; delete V; } std::vector* V; int nUsers; int nBeers; int nWords; std::map userIds; std::map beerIds; std::map rUserIds; std::map rBeerIds; std::map wordCount; std::map wordId; std::map idWord; };