diff --git a/Modules/IO/TestKernel/include/otbReadDataFile.h b/Modules/IO/TestKernel/include/otbReadDataFile.h new file mode 100644 index 0000000000000000000000000000000000000000..5ba16330a2581fd991077f5e7fb3d5b10ab4d4a7 --- /dev/null +++ b/Modules/IO/TestKernel/include/otbReadDataFile.h @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2005-2017 Centre National d'Etudes Spatiales (CNES) + * + * This file is part of Orfeo Toolbox + * + * https://www.orfeo-toolbox.org/ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "otbStringUtils.h" +#include "otb_boost_string_header.h" + +#include "itkListSample.h" +#include <fstream> +#include <string> +#include <algorithm> + +namespace otb +{ +/** Utility function to read the data file letter.scale, a CSV type file + * (whitespace separators) with the letter index in first column, followed by + * 16 descriptors. Each descriptor is a pair 'index:value' + */ +template <typename TInput, typename TTarget> +bool ReadDataFile( + const std::string & infname, + itk::SmartPointer<itk::Statistics::ListSample<TInput> > samples, + itk::SmartPointer<itk::Statistics::ListSample<TTarget> > labels) +{ + typedef typename itk::Statistics::ListSample<TInput>::MeasurementType IValueType; + typedef typename itk::Statistics::ListSample<TTarget>::MeasurementType TValueType; + + std::ifstream ifs; + ifs.open(infname.c_str()); + + if(!ifs) + { + std::cout<<"Could not read file "<<infname<<std::endl; + return false; + } + + unsigned int nbfeatures = 0; + + while (!ifs.eof()) + { + std::string line; + std::getline(ifs, line); + boost::algorithm::trim(line); + + if(nbfeatures == 0) + { + nbfeatures = std::count(line.begin(),line.end(),' '); + samples->SetMeasurementVectorSize(nbfeatures); + } + + if(line.size()>1) + { + TInput sample; + itk::NumericTraits<TInput>::SetLength(sample, nbfeatures); + + std::string::size_type pos = line.find_first_of(" ", 0); + + // Parse label + TTarget label; + itk::NumericTraits<TTarget>::SetLength(label,1); + label[0] = boost::lexical_cast<TValueType>(line.substr(0, pos)); + + bool endOfLine = false; + unsigned int id = 0; + + while(!endOfLine) + { + std::string::size_type nextpos = line.find_first_of(" ", pos+1); + if(nextpos == std::string::npos) + { + endOfLine = true; + nextpos = line.size(); + } + + std::string::size_type semicolonpos = line.find_first_of(":", pos+1, nextpos-pos-1); + if (semicolonpos == std::string::npos) + { + id++; + sample[id - 1] = boost::lexical_cast<IValueType>(line.substr(pos+1,nextpos-pos-1)); + } + else + { + id = boost::lexical_cast<unsigned int>(line.substr(pos+1,semicolonpos-pos-1)); + sample[id - 1] = boost::lexical_cast<IValueType>( + line.substr(semicolonpos+1,nextpos-semicolonpos-1)); + } + pos = nextpos; + + } + samples->PushBack(sample); + labels->PushBack(label); + } + } + + ifs.close(); + return true; +} + +} // end of namespace otb diff --git a/Modules/Learning/Supervised/test/otbTrainMachineLearningModel.cxx b/Modules/Learning/Supervised/test/otbTrainMachineLearningModel.cxx index 0633cd0f53a7acbf991cf2d727b1e766c9a753a3..dd34b41b8b3a4f9711530f99892f5780515e2566 100644 --- a/Modules/Learning/Supervised/test/otbTrainMachineLearningModel.cxx +++ b/Modules/Learning/Supervised/test/otbTrainMachineLearningModel.cxx @@ -26,6 +26,8 @@ #include <otbMachineLearningModel.h> #include "otbConfusionMatrixCalculator.h" +#include "otbReadDataFile.h" + #include "otb_boost_string_header.h" typedef otb::MachineLearningModel<float,short> MachineLearningModelType; @@ -46,142 +48,6 @@ typedef MachineLearningModelRegressionType::TargetListSampleType TargetListSampl typedef otb::ConfusionMatrixCalculator<TargetListSampleType, TargetListSampleType> ConfusionMatrixCalculatorType; -bool ReadDataFile(const std::string & infname, InputListSampleType * samples, TargetListSampleType * labels) -{ - std::ifstream ifs; - ifs.open(infname.c_str()); - - if(!ifs) - { - std::cout<<"Could not read file "<<infname<<std::endl; - return false; - } - - unsigned int nbfeatures = 0; - - while (!ifs.eof()) - { - std::string line; - std::getline(ifs, line); - boost::algorithm::trim(line); - - if(nbfeatures == 0) - { - nbfeatures = std::count(line.begin(),line.end(),' '); - } - - if(line.size()>1) - { - InputSampleType sample(nbfeatures); - sample.Fill(0); - - std::string::size_type pos = line.find_first_of(" ", 0); - - // Parse label - TargetSampleType label; - label[0] = atoi(line.substr(0, pos).c_str()); - - bool endOfLine = false; - unsigned int id = 0; - - while(!endOfLine) - { - std::string::size_type nextpos = line.find_first_of(" ", pos+1); - - if(pos == std::string::npos) - { - endOfLine = true; - nextpos = line.size()-1; - } - else - { - std::string feature = line.substr(pos,nextpos-pos); - std::string::size_type semicolonpos = feature.find_first_of(":"); - id = atoi(feature.substr(0,semicolonpos).c_str()); - sample[id - 1] = atof(feature.substr(semicolonpos+1,feature.size()-semicolonpos).c_str()); - pos = nextpos; - } - - } - samples->SetMeasurementVectorSize(itk::NumericTraits<InputSampleType>::GetLength(sample)); - samples->PushBack(sample); - labels->PushBack(label); - } - } - - //std::cout<<"Retrieved "<<samples->Size()<<" samples"<<std::endl; - ifs.close(); - return true; -} - -bool ReadDataRegressionFile(const std::string & infname, InputListSampleRegressionType * samples, TargetListSampleRegressionType * labels) -{ - std::ifstream ifs; - ifs.open(infname.c_str()); - - if(!ifs) - { - std::cout<<"Could not read file "<<infname<<std::endl; - return false; - } - - unsigned int nbfeatures = 0; - - while (!ifs.eof()) - { - std::string line; - std::getline(ifs, line); - - if(nbfeatures == 0) - { - nbfeatures = std::count(line.begin(),line.end(),' ')-1; - //std::cout<<"Found "<<nbfeatures<<" features per samples"<<std::endl; - } - - if(line.size()>1) - { - InputSampleRegressionType sample(nbfeatures); - sample.Fill(0); - - std::string::size_type pos = line.find_first_of(" ", 0); - - // Parse label - TargetSampleRegressionType label; - label[0] = atof(line.substr(0, pos).c_str()); - - bool endOfLine = false; - unsigned int id = 0; - - while(!endOfLine) - { - std::string::size_type nextpos = line.find_first_of(" ", pos+1); - - if(nextpos == std::string::npos) - { - endOfLine = true; - nextpos = line.size()-1; - } - else - { - std::string feature = line.substr(pos,nextpos-pos); - std::string::size_type semicolonpos = feature.find_first_of(":"); - id = atoi(feature.substr(0,semicolonpos).c_str()); - sample[id - 1] = atof(feature.substr(semicolonpos+1,feature.size()-semicolonpos).c_str()); - pos = nextpos; - } - - } - samples->SetMeasurementVectorSize(itk::NumericTraits<InputSampleRegressionType>::GetLength(sample)); - samples->PushBack(sample); - labels->PushBack(label); - } - } - - //std::cout<<"Retrieved "<<samples->Size()<<" samples"<<std::endl; - ifs.close(); - return true; -} - #ifdef OTB_USE_LIBSVM #include "otbLibSVMMachineLearningModel.h" int otbLibSVMMachineLearningModelNew(int itkNotUsed(argc), char * itkNotUsed(argv) []) @@ -205,7 +71,7 @@ int otbLibSVMMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if (!ReadDataFile(argv[1], samples, labels)) + if (!otb::ReadDataFile(argv[1], samples, labels)) { std::cout << "Failed to read samples file " << argv[1] << std::endl; return EXIT_FAILURE; @@ -294,7 +160,7 @@ int otbSVMMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!ReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -364,7 +230,7 @@ int otbSVMMachineLearningRegressionModel(int argc, char * argv[]) InputListSampleRegressionType::Pointer samples = InputListSampleRegressionType::New(); TargetListSampleRegressionType::Pointer labels = TargetListSampleRegressionType::New(); - if(!ReadDataRegressionFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -439,7 +305,7 @@ int otbKNearestNeighborsMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!ReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -516,7 +382,7 @@ int otbRandomForestsMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!ReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -603,7 +469,7 @@ int otbBoostMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!ReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -689,7 +555,7 @@ int otbANNMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if (!ReadDataFile(argv[1], samples, labels)) + if (!otb::ReadDataFile(argv[1], samples, labels)) { std::cout << "Failed to read samples file " << argv[1] << std::endl; return EXIT_FAILURE; @@ -779,7 +645,7 @@ int otbNormalBayesMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!ReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -856,7 +722,7 @@ int otbDecisionTreeMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!ReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -934,7 +800,7 @@ int otbGradientBoostedTreeMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!ReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE; @@ -995,139 +861,6 @@ int otbGradientBoostedTreeMachineLearningModel(int argc, char * argv[]) #ifdef OTB_USE_SHARK #include <chrono> // If shark is on, then we are using c++11 -bool SharkReadDataFile(const std::string & infname, InputListSampleType * samples, TargetListSampleType * labels) -{ - std::ifstream ifs(infname.c_str()); - - if(!ifs) - { - std::cout<<"Could not read file "<<infname<<std::endl; - return false; - } - - unsigned int nbfeatures = 0; - - std::string line; - while (std::getline(ifs, line)) - { - boost::algorithm::trim(line); - - if(nbfeatures == 0) - { - nbfeatures = std::count(line.begin(),line.end(),' '); - } - - if(line.size()>1) - { - InputSampleType sample(nbfeatures); - sample.Fill(0); - - std::string::size_type pos = line.find_first_of(" ", 0); - - // Parse label - TargetSampleType label; - label[0] = std::stoi(line.substr(0, pos).c_str()); - - bool endOfLine = false; - unsigned int id = 0; - - while(!endOfLine) - { - std::string::size_type nextpos = line.find_first_of(" ", pos+1); - - if(pos == std::string::npos) - { - endOfLine = true; - nextpos = line.size()-1; - } - else - { - std::string feature = line.substr(pos,nextpos-pos); - std::string::size_type semicolonpos = feature.find_first_of(":"); - id = std::stoi(feature.substr(0,semicolonpos).c_str()); - sample[id - 1] = atof(feature.substr(semicolonpos+1,feature.size()-semicolonpos).c_str()); - pos = nextpos; - } - - } - samples->SetMeasurementVectorSize(itk::NumericTraits<InputSampleType>::GetLength(sample)); - samples->PushBack(sample); - labels->PushBack(label); - } - } - - //std::cout<<"Retrieved "<<samples->Size()<<" samples"<<std::endl; - ifs.close(); - return true; -} - -bool SharkReadDataRegressionFile(const std::string & infname, InputListSampleRegressionType * samples, TargetListSampleRegressionType * labels) -{ - std::ifstream ifs(infname.c_str()); - if(!ifs) - { - std::cout<<"Could not read file "<<infname<<std::endl; - return false; - } - - unsigned int nbfeatures = 0; - - while (!ifs.eof()) - { - std::string line; - std::getline(ifs, line); - - if(nbfeatures == 0) - { - nbfeatures = std::count(line.begin(),line.end(),' ')-1; - //std::cout<<"Found "<<nbfeatures<<" features per samples"<<std::endl; - } - - if(line.size()>1) - { - InputSampleRegressionType sample(nbfeatures); - sample.Fill(0); - - std::string::size_type pos = line.find_first_of(" ", 0); - - // Parse label - TargetSampleRegressionType label; - label[0] = atof(line.substr(0, pos).c_str()); - - bool endOfLine = false; - unsigned int id = 0; - - while(!endOfLine) - { - std::string::size_type nextpos = line.find_first_of(" ", pos+1); - - if(nextpos == std::string::npos) - { - endOfLine = true; - nextpos = line.size()-1; - } - else - { - std::string feature = line.substr(pos,nextpos-pos); - std::string::size_type semicolonpos = feature.find_first_of(":"); - id = std::stoi(feature.substr(0,semicolonpos).c_str()); - sample[id - 1] = atof(feature.substr(semicolonpos+1,feature.size()-semicolonpos).c_str()); - pos = nextpos; - } - - } - samples->SetMeasurementVectorSize(itk::NumericTraits<InputSampleRegressionType>::GetLength(sample)); - samples->PushBack(sample); - labels->PushBack(label); - } - } - - //std::cout<<"Retrieved "<<samples->Size()<<" samples"<<std::endl; - ifs.close(); - return true; -} - - #include "otbSharkRandomForestsMachineLearningModel.h" int otbSharkRFMachineLearningModelNew(int itkNotUsed(argc), char * itkNotUsed(argv) []) { @@ -1149,7 +882,7 @@ int otbSharkRFMachineLearningModel(int argc, char * argv[]) InputListSampleType::Pointer samples = InputListSampleType::New(); TargetListSampleType::Pointer labels = TargetListSampleType::New(); - if(!SharkReadDataFile(argv[1],samples,labels)) + if(!otb::ReadDataFile(argv[1],samples,labels)) { std::cout<<"Failed to read samples file "<<argv[1]<<std::endl; return EXIT_FAILURE;