#include using namespace YMLL; int main(int argc, char* argv[]) { // Get training and testing sets. // If there is no testing set, just use the training set as testing set. string trainFilename; cout << "Training set file (CSV): "; cin >> trainFilename; string testFilename; cout << "Testing set file (CSV): "; cin >> testFilename; TDSpecialDataLoad loadTrain(trainFilename); TDSpecialDataLoad loadTest(testFilename); Dataset train; Dataset test; train.Load(loadTrain); test.Load(loadTest); size_t maxTrain = train.size(); size_t maxTest = test.size(); size_t maxIndependents = train.IndependentsSize(); int choice; // Get the type of SVM and set its parameters. auto_ptr machine; bool isClassification; do { cout << "(1) SVMStar\n"; cout << "(2) SVMLight classification\n"; cout << "(3) LibSVM classification\n"; cin >> choice; } while (choice < 1 || choice > 3); switch (choice) { case 1: { isClassification = true; double parameter = 1.0; bool bias = false; size_t maxIterations = 30000; double maxGamma = 0.999999; double Cplus = numeric_limits::max(); double Cminus = numeric_limits::max(); int selection; do { cout << "(1) Sigma = " << parameter << "\n"; cout << "(2) Add bias = " << bias << "\n"; cout << "(3) Max. iterations = " << maxIterations << "\n"; cout << "(4) Max. gamma = " << maxGamma << "\n"; cout << "(5) C plus = " << Cplus << "\n"; cout << "(6) C minus = " << Cminus << "\n"; cout << "Parameter to change (0 to end)"; cin >> selection; switch (selection) { case 1: cout << "Sigma: "; cin >> parameter; break; case 2: cout << "Add bias: "; cin >> bias; break; case 3: cout << "Max. iterations: "; cin >> maxIterations; break; case 4: cout << "Max. gamma: "; cin >> maxGamma; break; case 5: cout << "C plus: "; cin >> Cplus; break; case 6: cout << "C minus: "; cin >> Cminus; break; } } while (selection != 0); SVMStarMachine tempMachine; tempMachine.SetParameter(parameter); tempMachine.SetVariable("add bias", bias); tempMachine.SetVariable("max iterations", maxIterations); tempMachine.SetVariable("max gamma", maxGamma); tempMachine.SetVariable("C plus", Cplus); tempMachine.SetVariable("C minus", Cminus); machine.reset(tempMachine.ClonePtr()); break; } case 2: { isClassification = true; double parameter = 1.0; long biasHyperplane = 1; long removeInconsistent= 0; long skipFinalOptCheck = 0; long maxQPSize = 10; long newVarsInQP = 0; long iterToShrink = -9999; long kernelCacheSize = 40; double C = 0.0; double EPS = 0.1; double transductionPosRatio = -1.0; double cost = 1.0; double epsilonCrit = 0.001; double rho = 1.0; long XADepth = 0; long computeLOO = 0; long kernel = 2; bool useSigma = true; int selection; do { cout << "(1) Sigma = " << parameter << "\n"; cout << "(2) Biased hyperplane = " << biasHyperplane << "\n"; cout << "(3) Remove inconsistent = " << removeInconsistent << "\n"; cout << "(4) Skip final opt check = " << skipFinalOptCheck << "\n"; cout << "(5) Max QP size = " << maxQPSize << "\n"; cout << "(6) New vars in QP = " << newVarsInQP << "\n"; cout << "(7) Iter to shrink = " << iterToShrink << "\n"; cout << "(8) Kernel cache size = " << kernelCacheSize << "\n"; cout << "(9) C = " << C << "\n"; cout << "(10) EPS = " << EPS << "\n"; cout << "(11) Transduction pos ratio = " << transductionPosRatio << "\n"; cout << "(12) Cost ratio = " << cost << "\n"; cout << "(13) Epsilon crit = " << epsilonCrit << "\n"; cout << "(14) Rho = " << rho << "\n"; cout << "(15) XA depth = " << XADepth << "\n"; cout << "(16) Compute LOO = " << computeLOO << "\n"; cout << "(17) Kernel type = " << kernel << "\n"; cout << "(18) Use sigma instead of gamma for Gaussian kernel? = " << useSigma << "\n"; cout << "Parameter to change (0 to end)"; cin >> selection; switch (selection) { case 1: cout << "Sigma: "; cin >> parameter; break; case 2: cout << "Biased hyperplane: "; cin >> biasHyperplane; break; case 3: cout << "Remove inconsistent: "; cin >> removeInconsistent; break; case 4: cout << "Skip final opt check: "; cin >> skipFinalOptCheck; break; case 5: cout << "Max QP size: "; cin >> maxQPSize; break; case 6: cout << "New vars in QP: "; cin >> newVarsInQP; break; case 7: cout << "Iter to shrink: "; cin >> iterToShrink; break; case 8: cout << "Kernel cache size: "; cin >> kernelCacheSize; break; case 9: cout << "C: "; cin >> C; break; case 10: cout << "EPS: "; cin >> EPS; break; case 11: cout << "Transduction pos ratio: "; cin >> transductionPosRatio; break; case 12: cout << "Cost ratio: "; cin >> cost; break; case 13: cout << "Epsilon crit: "; cin >> epsilonCrit; break; case 14: cout << "Rho: "; cin >> rho; break; case 15: cout << "XA depth: "; cin >> XADepth; break; case 16: cout << "Compute LOO: "; cin >> computeLOO; break; case 17: cout << "Kernel type: "; cin >> kernel; break; case 18: cout << "Use sigma instead of gamma for Gaussian kernel?: "; cin >> useSigma; break; } } while (selection != 0); SVMLightMachine tempMachine; tempMachine.SetVariable("svm type", (long)1); tempMachine.SetVariable("biased hyperplane", biasHyperplane); tempMachine.SetVariable("remove inconsistent", removeInconsistent); tempMachine.SetVariable("skip final opt check", skipFinalOptCheck); tempMachine.SetVariable("max qp size", maxQPSize); tempMachine.SetVariable("new vars in qp", newVarsInQP); tempMachine.SetVariable("iter to shrink", iterToShrink); tempMachine.SetVariable("kernel cache size", kernelCacheSize); tempMachine.SetVariable("c", C); tempMachine.SetVariable("eps", EPS); tempMachine.SetVariable("transduction pos ratio", transductionPosRatio); tempMachine.SetVariable("cost ratio", cost); tempMachine.SetVariable("epsilon crit", epsilonCrit); tempMachine.SetVariable("rho", rho); tempMachine.SetVariable("xa depth", XADepth); tempMachine.SetVariable("compute loo", computeLOO); tempMachine.SetVariable("kernel type", kernel); tempMachine.ChangeGaussianParameter_ = useSigma; tempMachine.SetParameter(parameter); machine.reset(tempMachine.ClonePtr()); break; } case 3: { isClassification = true; double parameter = 1.0; int kernel = 2; double cost = 1e38; double P = 0.001; double cacheSize = 40.0; double epsilon = 0.1; int shrink = 1; bool useSigma = true; double posWeight = 1.0; double negWeight = 1.0; int selection; do { cout << "(1) Sigma = " << parameter << "\n"; cout << "(2) Kernel type = " << kernel << "\n"; cout << "(3) Cost = " << cost << "\n"; cout << "(4) P = " << P << "\n"; cout << "(5) Cache size = " << cacheSize << "\n"; cout << "(6) Epsilon = " << epsilon << "\n"; cout << "(7) Shrinking = " << shrink << "\n"; cout << "(8) Use sigma instead of gamma for Gaussian kernel? = " << useSigma << "\n"; cout << "(9) Class 1 weight = " << posWeight << "\n"; cout << "(10) Class -1 weight = " << negWeight << "\n"; cout << "Parameter to change (0 to end)"; cin >> selection; switch (selection) { case 1: cout << "Sigma: "; cin >> parameter; break; case 2: cout << "Kernel type: "; cin >> kernel; break; case 3: cout << "Cost: "; cin >> cost; break; case 4: cout << "P: "; cin >> P; break; case 5: cout << "Cache size: "; cin >> cacheSize; break; case 6: cout << "Epsilon: "; cin >> epsilon; break; case 7: cout << "Shrinking: "; cin >> shrink; break; case 8: cout << "Use sigma instead of gamma for Gaussian kernel?: "; cin >> useSigma; break; case 9: cout << "Class 1 weight: "; cin >> posWeight; break; case 10: cout << "Class -1 weight: "; cin >> negWeight; break; } } while (selection != 0); LibSVMMachine tempMachine; tempMachine.SetVariable("svm type", 0); tempMachine.SetVariable("kernel type", kernel); tempMachine.SetVariable("cost", cost); tempMachine.SetVariable("p", P); tempMachine.SetVariable("cache size", cacheSize); tempMachine.SetVariable("epsilon", epsilon); tempMachine.SetVariable("shrinking", shrink); tempMachine.SetVariable("pos weight", posWeight); tempMachine.SetVariable("neg weight", negWeight); tempMachine.ChangeGaussianParameter_ = useSigma; tempMachine.SetParameter(parameter); machine.reset(tempMachine.ClonePtr()); break; } } int machineChoice = choice; // Use Matthews correlation coefficient to assess the models. auto_ptr performancemeasurer(new MatthewsCorrelationCoefficientPerformanceMeasurer); string performanceMeasurerType = "Matthews cc"; bool findMin = performancemeasurer->lessIsBetter_; // Select the type of validation method. do { cout << "(1) Training\n"; cout << "(2) Testing\n"; cout << "(3) LOO\n"; cout << "(4) Boostrap\n"; cout << "(5) NFoldCV\n"; cout << "(6) StratifiedNFoldCV\n"; cout << "(7) Random subsampling\n"; cin >> choice; } while (choice < 1 || choice > 7); auto_ptr OF; switch (choice) { case 1: // Training OF.reset(new FixedTestingSetObjectiveFunction); static_cast(&*OF)->test_ = train; break; case 2: // Testing OF.reset(new FixedTestingSetObjectiveFunction); static_cast(&*OF)->test_ = test; break; case 3: // LOO { OF.reset(new ObjectiveFunction); LOODatasetSplit loo; OF->SetOuterLoopDatasetSplit(loo); break; } case 4: // Bootstrap { OF.reset(new ObjectiveFunction); BootStrapDatasetSplit bootstrap; OF->SetOuterLoopDatasetSplit(bootstrap); size_t repeat; cout << "No. of times to repeat validation: "; cin >> repeat; OF->SetVariable("outer loop repeat", (size_t)repeat); break; } case 5: // N Fold CV { OF.reset(new ObjectiveFunction); NFoldCVDatasetSplit cv; OF->SetOuterLoopDatasetSplit(cv); size_t folds; cout << "No. of folds: "; cin >> folds; size_t repeat; cout << "No. of times to repeat validation: "; cin >> repeat; OF->SetVariable("outer loop repeat", (size_t)repeat); OF->SetVariable("outer loop parameter", (double)folds); break; } case 6: // Stratified N Fold CV { OF.reset(new ObjectiveFunction); StratifiedNFoldCVDatasetSplit scv; OF->SetOuterLoopDatasetSplit(scv); size_t folds; cout << "No. of folds: "; cin >> folds; size_t repeat; cout << "No. of times to repeat validation: "; cin >> repeat; OF->SetVariable("outer loop repeat", (size_t)repeat); OF->SetVariable("outer loop parameter", (double)folds); break; } case 7: // Random sampling { OF.reset(new ObjectiveFunction); RandomDatasetSplit random; OF->SetOuterLoopDatasetSplit(random); double proportion; cout << "Proportion of testing set: "; cin >> proportion; size_t repeat; cout << "No. of times to repeat validation: "; cin >> repeat; OF->SetVariable("outer loop repeat", (size_t)repeat); OF->SetVariable("outer loop parameter", (double)(size_t(proportion*train.size()))); break; } } OF->SetMachine(*machine); // Prepare the objective function. auto_ptr objectiveFunction(OF->ClonePtr()); objectiveFunction->SetPerformanceMeasurer(*performancemeasurer); // Prepare RFE. auto_ptr ds(new RFEDescriptorSelection); ds->SetVariable("find minimum", findMin); ds->SetObjectiveFunction(*objectiveFunction); trainFilename = "finalTrain.csv"; // Filename for the RFE-selected training set. testFilename = "finalTest.csv"; // Filename for the RFE-selected testing set. size_t minDescriptors = 5; // Minimum number of descriptors in the final datasets. // descripThreshold and descripToRemove are used to control how many descriptors to remove in each round. // For descripToRemove, values < 1 indicates that the proportion of descriptors to remove in each round and // values >= 1 indicates the actual number of descriptors to remove in each round. // For example, if descripThreshold is "100000,100,50" and descripToRemove is "0.1,5,1", it means that // when the number of descriptors is 101 to 100000, RFE will remove 10% of the descriptors in each round // until the number of descriptors is between 50 (inclusive) to 100, then it will remove 5 descriptors in each round // and when the number of descriptors drops below 50, it will remove 1 descriptor in each round. string descripThreshold = "1000000"; string descripToRemove = "0.1"; int selection; do { cout << "(1) Final train filename = " << trainFilename << "\n"; cout << "(2) Final test filename = " << testFilename << "\n"; cout << "(3) Min. descriptors = " << minDescriptors << "\n"; cout << "(4) Descriptors threshold = " << descripThreshold << "\n"; cout << "(5) Descriptors to remove = " << descripToRemove << "\n"; cout << "Parameter to change (0 to end)"; cin >> selection; switch (selection) { case 1: cout << "Final train filename: "; cin >> trainFilename; break; case 2: cout << "Final test filename: "; cin >> testFilename; break; case 3: cout << "Min. descriptors: "; cin >> minDescriptors; break; case 4: cout << "Descriptors threshold: "; cin >> descripThreshold; break; case 5: cout << "Descriptors to remove: "; cin >> descripToRemove; break; } } while (selection != 0); ds->SetVariable("minimum descriptors", minDescriptors); RFEDescriptorSelection* rfe = static_cast(&*ds); // Set descriptors to remove int ipos = 0; int npos; vector thresholds; while ((npos=descripThreshold.find(',',ipos)) != -1) { thresholds.push_back(atoi(descripThreshold.substr(ipos, npos-ipos).c_str())); ipos = npos + 1; } thresholds.push_back(atoi(descripThreshold.substr(ipos).c_str())); ipos = 0; vector descriptorsToRemove; while ((npos=descripToRemove.find(',',ipos)) != -1) { descriptorsToRemove.push_back(atof(descripToRemove.substr(ipos, npos-ipos).c_str())); ipos = npos + 1; } descriptorsToRemove.push_back(atof(descripToRemove.substr(ipos).c_str())); const size_t maxThresholds = std::min(thresholds.size(), descriptorsToRemove.size()); rfe->descriptorsToRemove_.clear(); for (size_t i=0; idescriptorsToRemove_.insert(pair(thresholds[i],descriptorsToRemove[i])); } // Start RFE. ds->SetDataset(train); ds->Initialize(); while (ds->Next()) { int i; double error; while (ds->Iteration()) { ds->GetVariable("current descriptor index", i); ds->GetVariable("current error", error); string output = ntos(i+1) + ": " + "DJ(i): " + ntos(error) + "\n"; ofstream f("iterationOutput.txt", ios::app); f << output; f.close(); } size_t size; ds->GetVariable("current descriptor subset size", size); string output; output = "Variables used: " + ntos(size) + ", " + performanceMeasurerType + ": " + ntos(ds->bestErrors_[size-1]) + "\n\n"; ofstream f("iterationSummary.txt", ios::app); f << output; f.close(); TDSpecialDataSave tdspecialsavetrain(trainFilename); tdspecialsavetrain.IndependentsSize(ds->dataset_.IndependentsSize()); ds->dataset_.Save(tdspecialsavetrain); TDSpecialDataSave tdspecialsavetest(testFilename); tdspecialsavetest.IndependentsSize(ds->dataset_.IndependentsSize()); Dataset resumeTest = test; resumeTest.RemoveDescriptors(ds->dataset_); resumeTest.Save(tdspecialsavetest); } TDSpecialDataSave tdspecialsavetrain(trainFilename); tdspecialsavetrain.IndependentsSize(ds->dataset_.IndependentsSize()); ds->dataset_.Save(tdspecialsavetrain); TDSpecialDataSave tdspecialsavetest(testFilename); tdspecialsavetest.IndependentsSize(ds->dataset_.IndependentsSize()); Dataset resumeTest = test; resumeTest.RemoveDescriptors(ds->dataset_); resumeTest.Save(tdspecialsavetest); return 0; }