From 825a0b3d6ab432917fb7b928040805e5ea803a0a Mon Sep 17 00:00:00 2001 From: Jordi Inglada <jordi.inglada@cesbio.cnes.fr> Date: Tue, 27 Feb 2018 15:48:06 +0100 Subject: [PATCH] ENH: implementation of 3 augmentation algorithms --- .../app/otbSampleAugmentation.cxx | 129 +++++------ .../include/otbSampleAugmentation.h | 203 ++++++++++++++++++ .../AppClassification/test/CMakeLists.txt | 2 +- 3 files changed, 272 insertions(+), 62 deletions(-) create mode 100644 Modules/Applications/AppClassification/include/otbSampleAugmentation.h diff --git a/Modules/Applications/AppClassification/app/otbSampleAugmentation.cxx b/Modules/Applications/AppClassification/app/otbSampleAugmentation.cxx index c182711c77..36fc02f8bf 100644 --- a/Modules/Applications/AppClassification/app/otbSampleAugmentation.cxx +++ b/Modules/Applications/AppClassification/app/otbSampleAugmentation.cxx @@ -21,6 +21,7 @@ #include "otbWrapperApplication.h" #include "otbWrapperApplicationFactory.h" #include "otbOGRDataSourceWrapper.h" +#include "otbSampleAugmentation.h" namespace otb { @@ -43,8 +44,8 @@ public: itkTypeMacro(SampleAugmentation, otb::Application); /** Filters typedef */ - using SampleType = std::vector<double>; - using SampleVectorType = std::vector<SampleType>; + using SampleType = sampleAugmentation::SampleType; + using SampleVectorType = sampleAugmentation::SampleVectorType; private: @@ -120,7 +121,7 @@ private: ogr::Layer layer = ogrDS->GetLayer(this->GetParameterInt("layer")); ogr::Feature feature = layer.ogr().GetNextFeature(); - ClearChoices( "exclude" ); + ClearChoices("exclude"); ClearChoices("field"); for(int iField=0; iField<feature.ogr().GetFieldCount(); iField++) @@ -176,14 +177,21 @@ private: std::vector<std::string> cFieldNames = GetChoiceNames("field"); std::string fieldName = cFieldNames[selectedCFieldIdx.front()]; - std::vector<std::string> excludedFeatures = GetExcludedFeatures( GetChoiceNames( "exclude" ), GetSelectedItems( "exclude" )); + std::vector<std::string> excludedFeatures = + GetExcludedFeatures( GetChoiceNames( "exclude" ), + GetSelectedItems( "exclude" )); for(const auto& ef : excludedFeatures) - std::cout << ef << " excluded\n"; + otbAppLogINFO("Excluding feature " << ef << '\n'); auto inSamples = extractSamples(vectors, this->GetParameterInt("layer"), fieldName, this->GetParameterInt("label"), excludedFeatures); - auto newSamples = augmentSamples(inSamples, this->GetParameterInt("samples")); + SampleVectorType newSamples; + // sampleAugmentation::replicateSamples(inSamples, this->GetParameterInt("samples"), + // newSamples); + sampleAugmentation::smote(inSamples, this->GetParameterInt("samples"), + newSamples, + 4); writeSamples(vectors, output, newSamples, this->GetParameterInt("layer"), fieldName, this->GetParameterInt("label"), @@ -194,8 +202,9 @@ private: /** Extracts the samples of a single class from the vector data to a * vector and excludes some unwanted features. */ - SampleVectorType extractSamples(const ogr::DataSource::Pointer vectors, size_t layerName, - std::string classField, int label, + SampleVectorType extractSamples(const ogr::DataSource::Pointer vectors, + size_t layerName, + const std::string& classField, const int label, const std::vector<std::string>& excludedFeatures = {}) { ogr::Layer layer = vectors->GetLayer(layerName); @@ -212,15 +221,7 @@ private: } auto numberOfFields = feature.ogr().GetFieldCount(); - std::set<size_t> excludedIds; - if( excludedFeatures.size() != 0) - { - for(const auto& fieldName : excludedFeatures) - { - auto idx = feature.ogr().GetFieldIndex( fieldName.c_str() ); - excludedIds.insert(idx); - } - } + auto excludedIds = getExcludedFeaturesIds(excludedFeatures, layer); otbAppLogINFO("The vector file contains " << numberOfFields << " fields.\n"); SampleVectorType samples; bool goesOn{feature.addr() != 0}; @@ -229,14 +230,12 @@ private: // Retrieve all the features for each field in the ogr layer. if(feature.ogr().GetFieldAsInteger(classField.c_str()) == label) { + SampleType mv; for(auto idx=0; idx<numberOfFields; ++idx) { - OGRFieldType fieldType = feature.ogr().GetFieldDefnRef(idx)->GetType(); if(excludedIds.find(idx) == excludedIds.cend() && - (fieldType == OFTInteger - || ogr::version_proxy::IsOFTInteger64( fieldType ) - || fieldType == OFTReal)) + isNumericField(feature, idx)) mv.push_back(feature.ogr().GetFieldAsDouble(idx)); } samples.push_back(mv); @@ -247,37 +246,16 @@ private: return samples; } - SampleVectorType augmentSamples(const SampleVectorType& inSamples, - const size_t nbSamples) - { - SampleVectorType newSamples; - for(size_t i=0; i<nbSamples; ++i) - { - newSamples.push_back(inSamples[i%inSamples.size()]); - } - return newSamples; - } - - void writeSamples(const ogr::DataSource::Pointer vectors, - ogr::DataSource::Pointer output, + void writeSamples(const ogr::DataSource::Pointer& vectors, + ogr::DataSource::Pointer& output, const SampleVectorType& samples, - size_t layerName, - std::string classField, int label, + const size_t layerName, + const std::string& classField, int label, const std::vector<std::string>& excludedFeatures = {}) { auto inputLayer = vectors->GetLayer(layerName); - std::set<size_t> excludedIds; - if( excludedFeatures.size() != 0) - { - auto feature = *(inputLayer).begin(); - for(const auto& fieldName : excludedFeatures) - { - auto idx = feature.ogr().GetFieldIndex( fieldName.c_str() ); - excludedIds.insert(idx); - } - } - + auto excludedIds = getExcludedFeaturesIds(excludedFeatures, inputLayer); OGRSpatialReference * oSRS = nullptr; if (inputLayer.GetSpatialRef()) @@ -296,7 +274,7 @@ private: } auto featureCount = outputLayer.GetFeatureCount(false); - auto templateFeature = *(inputLayer).begin(); + auto templateFeature = selectTemplateFeature(inputLayer, classField, label); for(const auto& sample : samples) { ogr::Feature dstFeature(outputLayer.GetLayerDefn()); @@ -305,27 +283,18 @@ private: auto sampleFieldCounter = 0; for (int k=0 ; k < layerDefn.GetFieldCount() ; k++) { - OGRFieldType fieldType = dstFeature.ogr().GetFieldDefnRef(k)->GetType(); if(excludedIds.find(k) == excludedIds.cend() && - (fieldType == OFTInteger - || ogr::version_proxy::IsOFTInteger64( fieldType ) - || fieldType == OFTReal)) + isNumericField(dstFeature, k)) { dstFeature.ogr().SetField(k, sample[sampleFieldCounter++]); } } - // for (unsigned int i=0 ; i<nbBand ; ++i) - // { - // imgComp = static_cast<double>(itk::DefaultConvertPixelTraits<PixelType>::GetNthComponent(i,imgPixel)); - // // Fill the output OGRDataSource - // dstFeature[m_SampleFieldNames[i]].SetValue(imgComp); - // } outputLayer.CreateFeature( dstFeature ); } } - std::vector<std::string> GetExcludedFeatures(std::vector <std::string> fieldNames, - std::vector<int> selectedIdx) + std::vector<std::string> GetExcludedFeatures(const std::vector<std::string>& fieldNames, + const std::vector<int>& selectedIdx) { auto nbFeatures = static_cast<unsigned int>(selectedIdx.size()); std::vector<std::string> result( nbFeatures ); @@ -335,7 +304,45 @@ private: } return result; } -}; + ogr::Feature selectTemplateFeature(const ogr::Layer& inputLayer, + const std::string& classField, int label) + { + auto featureIt = inputLayer.begin(); + bool goesOn{(*featureIt).addr() != 0}; + while( goesOn ) + { + if((*featureIt).ogr().GetFieldAsInteger(classField.c_str()) == label) + { + return *featureIt; + } + ++featureIt; + } + return *(inputLayer.begin()); + } + std::set<size_t> getExcludedFeaturesIds(const std::vector<std::string>& excludedFeatures, + const ogr::Layer& inputLayer) + { + auto feature = *(inputLayer).begin(); + std::set<size_t> excludedIds; + if( excludedFeatures.size() != 0) + { + for(const auto& fieldName : excludedFeatures) + { + auto idx = feature.ogr().GetFieldIndex( fieldName.c_str() ); + excludedIds.insert(idx); + } + } + return excludedIds; + } + bool isNumericField(const ogr::Feature& feature, + const int idx) + { + OGRFieldType fieldType = feature.ogr().GetFieldDefnRef(idx)->GetType(); + return (fieldType == OFTInteger + || ogr::version_proxy::IsOFTInteger64( fieldType ) + || fieldType == OFTReal); + } + }; } // end of namespace Wrapper } // end of namespace otb diff --git a/Modules/Applications/AppClassification/include/otbSampleAugmentation.h b/Modules/Applications/AppClassification/include/otbSampleAugmentation.h new file mode 100644 index 0000000000..27b9f07847 --- /dev/null +++ b/Modules/Applications/AppClassification/include/otbSampleAugmentation.h @@ -0,0 +1,203 @@ +/* + * Copyright (C) 2005-2017 Centre National d'Etudes Spatiales (CNES) + * + * This file is part of Orfeo Toolbox + * + * https://www.orfeo-toolbox.org/ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef otbSampleAugmentation_h +#define otbSampleAugmentation_h + +#include <vector> +#include <algorithm> +#include <random> +#include <ctime> +#include <cassert> +#include <iostream> + +namespace otb +{ + +namespace sampleAugmentation +{ +using SampleType = std::vector<double>; +using SampleVectorType = std::vector<SampleType>; + +/** +Estimate standard deviations of the components in one pass using +Welford's algorithm +*/ +SampleType estimateStds(SampleVectorType samples) +{ + const auto nbSamples = samples.size(); + const auto nbComponents = samples[0].size(); + SampleType stds(nbComponents, 0.0); + SampleType means(nbComponents, 0.0); + for(size_t i=0; i<nbSamples; ++i) + { + for(size_t j=0; j<nbComponents; ++j) + { + const auto mu = means[j]; + const auto x = samples[i][j]; + auto muNew = mu+(x-mu)/(i+1); + stds[j] += (x-mu)*(x-muNew); + means[j] = muNew; + } + } + for(auto std : stds) + std = std::sqrt(std/nbSamples); + return stds; +} + +/** Create new samples by replicating input samples. We loop through +* the input samples and add them to the new data set until nbSamples +* are added. The elements of newSamples are removed before proceeding. +*/ +void replicateSamples(const SampleVectorType& inSamples, + const size_t nbSamples, + SampleVectorType& newSamples) +{ + newSamples.resize(nbSamples); + for(size_t i=0; i<nbSamples; ++i) + { + newSamples[i] = inSamples[i%inSamples.size()]; + } +} + +/** Create new samples by adding noise to existing samples. Gaussian +* noise is added to randomly selected samples. The standard deviation +* of the noise added to each component is the same as the one of the +* input variables multiplied by stdFactor (defaults to 1). The +* elements of newSamples are removed before proceeding. +*/ +void jitterSamples(const SampleVectorType& inSamples, + const size_t nbSamples, + SampleVectorType& newSamples, + float stdFactor=1.0, + const int seed = std::time(nullptr)) +{ + newSamples.resize(nbSamples); + const auto nbComponents = inSamples[0].size(); + std::random_device rd; + std::mt19937 gen(rd()); + // The input samples are selected randomly with replacement + std::srand(seed); + // We use one gaussian distribution per component since they may + // have different stds + auto stds = estimateStds(inSamples); + std::vector<std::normal_distribution<double>> gaussDis; + for(size_t i=0; i<nbComponents; ++i) + gaussDis.emplace_back(std::normal_distribution<double>{0.0, stds[i]*stdFactor}); + for(size_t i=0; i<nbSamples; ++i) + { + newSamples[i] = inSamples[std::rand()%nbSamples]; + for(size_t j=0; j<nbComponents; ++j) + newSamples[i][j] += gaussDis[j](gen); + } +} + + +struct NeighborType +{ + size_t index; + double distance; +}; + +struct NeighborSorter +{ + constexpr bool operator ()(const NeighborType& a, const NeighborType& b) const + { + return b.distance > a.distance; + } +}; + +double computeDistance(const SampleType& x, const SampleType& y) +{ + assert(x.size()==y.size()); + double dist{0}; + for(size_t i=0; i<x.size(); ++i) + { + dist += (x[i]-y[i])*(x[i]-y[i])/(x.size()*x.size()); + } + return std::sqrt(dist); +} + +using NNIndicesType = std::vector<NeighborType>; +using NNVectorType = std::vector<NNIndicesType>; +/** Returns the indices of the nearest neighbors for each input sample +*/ +void findKNNIndices(const SampleVectorType& inSamples, + const size_t nbNeighbors, + NNVectorType& nnVector) +{ + const auto nbSamples = inSamples.size(); + nnVector.resize(nbSamples); + for(size_t sampleIdx=0; sampleIdx<nbSamples; ++sampleIdx) + { + NNIndicesType nns; + for(size_t neighborIdx=0; neighborIdx<nbSamples; ++neighborIdx) + { + if(sampleIdx!=neighborIdx) + nns.push_back({neighborIdx, computeDistance(inSamples[sampleIdx], + inSamples[neighborIdx])}); + } + std::partial_sort(nns.begin(), nns.begin()+nbNeighbors, nns.end(), NeighborSorter{}); + nns.resize(nbNeighbors); + nnVector[sampleIdx] = nns; + } +} + +/** Generate the new sample in the line linking s1 and s2 +*/ +SampleType smoteCombine(SampleType s1, SampleType s2, double position) +{ + auto result = s1; + for(size_t i=0; i<s1.size(); ++i) + result[i] = s1[i]+(s2[i]-s1[i])*position; + return result; +} + +/** Create new samples using the SMOTE algorithm +Chawla, N. V., Bowyer, K. W., Hall, L. O., & Kegelmeyer, W. P., Smote: +synthetic minority over-sampling technique, Journal of artificial +intelligence research, 16(), 321–357 (2002). +http://dx.doi.org/10.1613/jair.953 +*/ +void smote(const SampleVectorType& inSamples, + const size_t nbSamples, + SampleVectorType& newSamples, + const int nbNeighbors, + const int seed = std::time(nullptr)) +{ + newSamples.resize(nbSamples); + NNVectorType nnVector; + findKNNIndices(inSamples, nbNeighbors, nnVector); + // The input samples are selected randomly with replacement + std::srand(seed); + for(size_t i=0; i<nbSamples; ++i) + { + const auto sampleIdx = std::rand()%nbSamples; + const auto sample = inSamples[sampleIdx]; + const auto neighborIdx = nnVector[sampleIdx][std::rand()%nbNeighbors].index; + const auto neighbor = inSamples[neighborIdx]; + newSamples[i] = smoteCombine(sample, neighbor, std::rand()/double{RAND_MAX}); + } +} + +} +} + +#endif diff --git a/Modules/Applications/AppClassification/test/CMakeLists.txt b/Modules/Applications/AppClassification/test/CMakeLists.txt index c30773decb..c0ec37fea3 100644 --- a/Modules/Applications/AppClassification/test/CMakeLists.txt +++ b/Modules/Applications/AppClassification/test/CMakeLists.txt @@ -981,5 +981,5 @@ otb_test_application(NAME apTvClSampleAugmentation -label 3 -samples 100 -out ${TEMP}/apTvClSampleAugmentation.sqlite - # -excluded_features OGC_FID name class originfid + -exclude originfid ) -- GitLab