diff --git a/CMakeLists.txt b/CMakeLists.txt
index b99a2f814ef782ba94f31ac30dd0733203e7f057..28c2db78a72b9dbe6260c1749ce96e28244e1e89 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,7 @@ INCLUDE_DIRECTORIES(
 ${Hyperspectral_SOURCE_DIR}/Code/BasicFilters
 ${Hyperspectral_SOURCE_DIR}/Code/BasicFilters/GPUCuda
 ${Hyperspectral_SOURCE_DIR}/Code/Hyperspectral
+${Hyperspectral_SOURCE_DIR}/Code/Hyperspectral/GPUCuda
 ${Hyperspectral_SOURCE_DIR}/Code/Vahine
 ${Hyperspectral_SOURCE_DIR}/Code/RX
 )
diff --git a/Code/Hyperspectral/CMakeLists.txt b/Code/Hyperspectral/CMakeLists.txt
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..54297795346c792cf088d84d5ba8dcab3b1fb511 100644
--- a/Code/Hyperspectral/CMakeLists.txt
+++ b/Code/Hyperspectral/CMakeLists.txt
@@ -0,0 +1 @@
+ADD_SUBDIRECTORY(GPUCuda)
diff --git a/Code/Hyperspectral/GPUCuda/CMakeLists.txt b/Code/Hyperspectral/GPUCuda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2c935cf55df391efba0e7dea5f5c83cbd637c3a4
--- /dev/null
+++ b/Code/Hyperspectral/GPUCuda/CMakeLists.txt
@@ -0,0 +1,13 @@
+FILE(GLOB OTBHyperspectralGPUCuda_CUDASRCS "*.cu" )
+CUDA_ADD_LIBRARY(OTBHyperspectralGPUCuda_nvcc ${OTBHyperspectralGPUCuda_CUDASRCS})
+
+
+FILE(GLOB OTBHyperspectralGPUCuda_CPPSRCS "*.cxx" )
+
+ADD_LIBRARY(OTBHyperspectralGPUCuda ${OTBHyperspectralGPUCuda_CPPSRCS})
+TARGET_LINK_LIBRARIES (OTBHyperspectralGPUCuda OTBHyperspectralGPUCuda_nvcc OTBCommon OTBBasicFilters ITKBasicFilters )
+
+IF(OTB_LIBRARY_PROPERTIES)
+  SET_TARGET_PROPERTIES(OTBHyperspectralGPUCuda PROPERTIES ${OTB_LIBRARY_PROPERTIES})
+ENDIF(OTB_LIBRARY_PROPERTIES)
+
diff --git a/Code/Hyperspectral/GPUCuda/foo.cxx b/Code/Hyperspectral/GPUCuda/foo.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.cu b/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ee92148b23cac03d6a51b194417ab3ccbf3359aa
--- /dev/null
+++ b/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.cu
@@ -0,0 +1,356 @@
+/*
+ * otbCudaFCLSFilter.cu
+ *
+ */
+
+#include <cstdio>
+#include <cuda.h>
+#include <driver_types.h>
+#include <vector_types.h>
+
+//Block thread size
+#define BLOCK_SIZE (16*16)
+
+#define num_bands 152
+#define num_endmembers 13
+
+#define CUDA_SAFE_CALL( call ) \
+  do \
+  { \
+  cudaError_t status = call; \
+  check_error(status); \
+  } while(0)
+
+void check_error(cudaError_t error)
+{
+
+#define ELSEIF_CUDA_HANDLE_ERROR_CODE( code ) \
+  else if(error == code) \
+    printf("Cuda error : %d %s\n", error, #code);
+
+  if(error == cudaSuccess) {}
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorMissingConfiguration)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorMemoryAllocation)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInitializationError)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorLaunchFailure)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorPriorLaunchFailure)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorLaunchTimeout)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorLaunchOutOfResources)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidDeviceFunction)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidConfiguration)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidDevice)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidValue)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidPitchValue)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidSymbol)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorMapBufferObjectFailed)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorUnmapBufferObjectFailed)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidHostPointer)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidDevicePointer)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidTexture)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidTextureBinding)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidChannelDescriptor)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorInvalidMemcpyDirection)
+  ELSEIF_CUDA_HANDLE_ERROR_CODE(cudaErrorAddressOfConstant)
+
+#undef ELSEIF_HANDLE_ERROR_CODE
+}
+
+
+__global__ void UnconstrainedKernel(float* d_image_vector,
+                                    float* d_image_unmixed,
+                                    float* d_endmembersInv,
+                                    int numSamples)
+{
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  float l_abu[num_endmembers];
+  
+  if (x < numSamples)
+    {
+    float l_pixel[num_bands];
+  
+    for (int t=0; t < num_bands; t++)
+      {
+      l_pixel[t] = d_image_vector[ t + num_bands*x ];
+      }
+  
+    for(int e = 0; e < num_endmembers; e++)
+      {
+      l_abu[e] = 0;
+      for (int k = 0; k < num_bands; k++)
+        {
+        l_abu[e] += d_endmembersInv[k + e*num_bands] * l_pixel[k];
+        }
+      d_image_unmixed[e + num_endmembers*x] = l_abu[e];
+      }
+    }
+    //syncthreads();
+}
+
+
+__global__ void ISRAKernel(float* d_image_vector, float* d_image_unmixed, float* d_endmembers, 
+                           float* d_endmembersT, int numSamples, int maxiter)
+{
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (x < numSamples)
+    {
+    float l_pixel[num_bands];
+    float l_abu[num_endmembers];
+    
+    float numerator;
+    float denominator;
+    float dot = 0.0;
+    
+    for (int t=0; t < num_bands; t++)
+      {
+      l_pixel[t] = d_image_vector[ t + num_bands*x ];
+      }
+      
+    for (int t=0; t < num_endmembers; t++)
+      {
+      l_abu[t] = d_image_unmixed[t + num_endmembers*x];
+      }
+      
+    for(int it = 0; it < maxiter; it++)
+      {      
+      for(int e = 0; e < num_endmembers; e++)
+        {
+        numerator = 0;
+        denominator = 0;
+        
+        // For all bands
+        for (int k = 0; k < num_bands; k++)
+          {
+          numerator = numerator + d_endmembers[k + e*num_bands] * l_pixel[k];
+  
+          // Calculate dot product
+          dot = 0;
+          for (int s = 0; s < num_endmembers; s++)
+            {
+            dot += d_endmembersT[s + k*num_endmembers] * l_abu[s];
+            }
+  
+          denominator += dot * d_endmembers[k + e*num_bands];        
+          
+          }
+  
+        l_abu[e] = l_abu[e] * (numerator/denominator);
+        l_abu[e] = numerator/denominator;
+        }
+      }
+        
+    for(int k=0; k < num_endmembers;k++)
+      {
+      d_image_unmixed[k + num_endmembers*x] = l_abu[k];
+      }
+    }
+    //syncthreads();
+}
+
+__global__ void FCLSUKernel(float* d_image_unmixed,int numSamples)
+{
+  float sum;
+  // float pixel[num_endmembers];
+  
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  if (x < numSamples)
+    {
+    sum = 0;
+    for(int i = 0; i < num_endmembers; i++)
+      {
+      //pixel[i] = d_image_unmixed[i + num_endmembers*x ];
+      //sum += pixel[i];
+      sum += d_image_unmixed[ i + num_endmembers*x ];
+      }
+      
+    for (int k = 0; k < num_endmembers; k++)
+      {
+      //pixel[k] = pixel[k] / sum;
+      d_image_unmixed[k + num_endmembers*x] = d_image_unmixed[k + num_endmembers*x ] / sum;
+      }
+    }
+    //syncthreads();
+}
+
+
+__global__ void UnconstrainedISRAKernel(float* d_image_vector,
+                                        float* d_image_unmixed,
+                                        float* d_endmembers, 
+                                        float* d_endmembersT,
+                                        float* d_endmembersInv,
+                                        int numSamples,
+                                        int maxiter)
+{
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+  float l_pixel[num_bands];
+  float l_abu[num_endmembers];
+  float l_abu2[num_endmembers];
+  
+  float numerator = 0;
+  float denominator = 0;
+  float dot = 0;
+  
+  if (x < numSamples)
+    {
+    // Unconstrained  
+    for (int t=0; t < num_bands; t++)
+      {
+      l_pixel[t] = d_image_vector[ t + num_bands*x ];
+      }
+  
+    for(int e = 0; e < num_endmembers; e++)
+      {
+      l_abu[e] = 0;
+      for (int k1 = 0; k1 < num_bands; k1++)
+        {
+        l_abu[e] += d_endmembersInv[k1 + e*num_bands] * l_pixel[k1];
+        }
+      }
+
+    // ISRA
+    /*
+    for(int it = 0; it < maxiter; it++)
+      {      
+      for(int e = 0; e < num_endmembers; e++)
+        {
+        numerator   = 0;
+        denominator = 0;
+        
+        // For all bands
+        for (int k = 0; k < num_bands; k++)
+          {
+          numerator = numerator + d_endmembers[k + e*num_bands] * l_pixel[k];
+  
+          // Calculate dot product
+          dot = 0;
+          for (int s = 0; s < num_endmembers; s++)
+            {
+            dot += d_endmembersT[s + k*num_endmembers] * l_abu[s];
+            }
+  
+          denominator += dot * d_endmembers[k + e*num_bands];
+          //denominator = dot;        
+          }
+        l_abu2[e] = l_abu[e] * (numerator/denominator);
+        }
+        
+       for(int e = 0; e < num_endmembers; e++)
+        {
+        l_abu[e] = l_abu2[e];
+        }       
+      }
+    syncthreads();
+    */
+
+    for(int e = 0; e < num_endmembers; e++)
+      {
+      d_image_unmixed[e + num_endmembers*x] = l_abu[e];
+      }
+    }
+}
+
+extern "C" void fclsProcessing( float* d_image_vector, 
+                                float* d_image_unmixed,
+                                float* d_endmembers,
+                                float* d_endmembersT,             
+                                float* d_endmembersInv,  
+                                int numSamples,
+                                int numBands,
+                                int nbEndmembers,
+                                int maxIter,
+                                int blockSize)
+{
+  dim3 dimBlock( blockSize );
+  dim3 dimGrid ( (numSamples + blockSize) / blockSize );
+  
+  printf( " %d \n " , (numSamples + blockSize) / blockSize );
+  
+  UnconstrainedISRAKernel<<<dimGrid,dimBlock>>>(d_image_vector, d_image_unmixed, d_endmembers, d_endmembersT, d_endmembersInv, numSamples, maxIter);
+  cudaThreadSynchronize();
+
+  //UnconstrainedKernel<<<dimGrid,dimBlock>>>(d_image_vector, d_image_unmixed, d_endmembersInv, numSamples);
+  //cudaThreadSynchronize();
+
+  //ISRAKernel<<<dimGrid,dimBlock>>>(d_image_vector, d_image_unmixed, d_endmembers, d_endmembersT, numSamples, maxIter);
+  //cudaThreadSynchronize();
+  
+  //FCLSUKernel<<<dimGrid,dimBlock>>>(d_image_unmixed, numSamples);
+  //cudaThreadSynchronize();
+}
+
+
+extern "C" void fclsMallocEndMembers( 
+                            float** d_endmembers,
+                            float** d_endmembersT,
+                            float** d_endmembersInv,
+                            int numBands,
+                            int nbEndmembers)
+{
+
+  float* d_endmembers_ = 0;
+  float* d_endmembersT_ = 0;
+  float* d_endmembersInv_ = 0;
+
+//  printf( "Allocating %d KB\n", numBands*nbEndmembers*sizeof(float) /  1024 );
+  CUDA_SAFE_CALL( cudaMalloc((void**) &d_endmembers_, nbEndmembers*numBands*sizeof(float)));
+
+//  printf( "Allocating %d KB\n", numBands*nbEndmembers*sizeof(float) /  1024 );
+  CUDA_SAFE_CALL( cudaMalloc((void**) &d_endmembersT_, nbEndmembers*numBands*sizeof(float)));
+
+//  printf( "Allocating %d KB\n", numBands*nbEndmembers*sizeof(float) /  1024 );
+  CUDA_SAFE_CALL( cudaMalloc((void**) &d_endmembersInv_, nbEndmembers*numBands*sizeof(float)));
+  
+  
+  *d_endmembers = d_endmembers_;
+  *d_endmembersT = d_endmembersT_;
+  *d_endmembersInv = d_endmembersInv_;
+}
+
+extern "C" void fclsMallocImage( float** d_image_vector,
+                            float** d_image_unmixed,
+                            int imageWidth,
+                            int imageHeight,
+                            int numBands,
+                            int nbEndmembers)
+{
+  float* d_image_vector_ = 0;
+  float* d_image_unmixed_ = 0;
+
+//  printf( "Allocating %d KB\n", numBands*imageWidth*imageHeight*sizeof(float) /  1024);
+  CUDA_SAFE_CALL(cudaMalloc((void**) &d_image_vector_, numBands*imageWidth*imageHeight*sizeof(float)));
+
+//  printf( "Allocating %d KB\n", nbEndmembers*imageWidth*imageHeight*sizeof(float) /  1024 /1024);
+  CUDA_SAFE_CALL( cudaMalloc((void**) &d_image_unmixed_, nbEndmembers*imageWidth*imageHeight*sizeof(float)));
+
+  *d_image_vector = d_image_vector_;
+  *d_image_unmixed = d_image_unmixed_;
+}
+
+         
+
+extern "C" void fclsCopyHostToDevice( float* d_ptr,
+                                      const float* h_ptr,
+                                      int nb_bytes)
+{
+  CUDA_SAFE_CALL( cudaMemcpy(d_ptr, h_ptr, nb_bytes, cudaMemcpyHostToDevice));
+}
+
+extern "C" void fclsCopyDeviceToHost( float* h_ptr,
+                                      const float* d_ptr,
+                                      int nb_bytes)
+{
+  CUDA_SAFE_CALL( cudaMemcpy(h_ptr, d_ptr, nb_bytes, cudaMemcpyDeviceToHost));
+}
+
+extern "C" void fclsFree( float* d_ptr)
+{
+  CUDA_SAFE_CALL( cudaFree(d_ptr) );
+}
+
+extern "C" void fclsInit( void )
+{
+  if (cuInit(0) != CUDA_SUCCESS)
+    exit (0);
+}
+
diff --git a/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.h b/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.h
new file mode 100644
index 0000000000000000000000000000000000000000..248f1ac743195868fd8154484e09122ba3098863
--- /dev/null
+++ b/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.h
@@ -0,0 +1,176 @@
+/*=========================================================================
+
+  Program:   ORFEO Toolbox
+  Language:  C++
+  Date:      $Date$
+  Version:   $Revision$
+
+
+  Copyright (c) Centre National d'Etudes Spatiales. All rights reserved.
+  See OTBCopyright.txt for details.
+
+  Some parts of this code are derived from ITK. See ITKCopyright.txt
+  for details.
+
+
+     This software is distributed WITHOUT ANY WARRANTY; without even
+     the implied warranty of MERCHANT2ABILITY or FITNESS FOR A PARTICULAR
+     PURPOSE.  See the above copyright notices for more information.
+
+=========================================================================*/
+#ifndef __otbCudaFCLSFilter_h
+#define __otbCudaFCLSFilter_h
+
+#include "itkMacro.h"
+#include "otbUnaryFunctorImageFilter.h"
+#include "vnl/algo/vnl_svd.h"
+
+namespace otb
+{
+namespace Functor
+{
+template <class TInputPixel, class TOutputPixel>
+class Dummy
+{
+public:
+
+  typedef vnl_vector<float> VectorType;
+  typedef vnl_matrix<float> MatrixType;
+
+  Dummy(){}
+  virtual ~Dummy(){}
+
+  void SetMatrix(const MatrixType& U)
+  {
+    m_U = U;
+    m_Ut = m_U.transpose();
+    m_Uinv = vnl_svd<float>(m_U).inverse();
+  }
+
+  unsigned int GetOutputSize() const
+  {
+    return m_U.cols();
+  }
+
+  const MatrixType& GetMatrix() const
+  {
+    return m_U;
+  }
+
+  const MatrixType& GetMatrixT() const
+  {
+    return m_Ut;
+  }
+
+  const MatrixType& GetMatrixInv() const
+  {
+    return m_Uinv;
+  }
+
+  bool operator !=(const Dummy&) const
+  {
+    return true;
+  }
+  bool operator ==(const Dummy& other) const
+  {
+    return !(*this != other);
+  }
+
+  inline TOutputPixel operator ()(const TInputPixel& A) const
+  {
+    TOutputPixel B;
+    return static_cast<TOutputPixel>(B);
+  }
+
+private:
+  MatrixType m_U;
+  MatrixType m_Ut;
+  MatrixType m_Uinv;
+};
+}
+
+
+template<class TInputImage, class TOutputImage>
+class ITK_EXPORT CudaFCLSFilter: public otb::UnaryFunctorImageFilter<TInputImage, TOutputImage, Functor::Dummy<
+    typename TInputImage::PixelType, typename TOutputImage::PixelType> >
+{
+public:
+  /** Standard class typedefs. */
+  typedef CudaFCLSFilter Self;
+  typedef itk::UnaryFunctorImageFilter<TInputImage, TOutputImage,Functor::Dummy<
+  typename TInputImage::PixelType, typename TOutputImage::PixelType> > Superclass;
+  typedef itk::SmartPointer<Self> Pointer;
+  typedef itk::SmartPointer<const Self> ConstPointer;
+
+  /** Method for creation through the object factory. */
+  itkNewMacro(Self);
+
+  /** Run-time type information (and related methods). */
+  itkTypeMacro(CudaFCLSFilter, UnaryFunctorImageFilter);
+
+  typedef TInputImage  InputImageType;
+  typedef TOutputImage OutputImageType;
+
+  typedef itk::ImageRegionConstIterator<InputImageType> InputImageConstIterator;
+  typedef itk::ImageRegionIterator<InputImageType> InputImageIterator;
+
+  typedef itk::ImageRegionConstIterator<OutputImageType> OutputImageConstIterator;
+  typedef itk::ImageRegionIterator<OutputImageType> OutputImageIterator;
+  typedef typename  Superclass::OutputImageRegionType   OutputImageRegionType;
+
+  typedef typename OutputImageType::RegionType RegionType;
+  typedef typename OutputImageType::SizeType   SizeType;
+
+  typedef Functor::Dummy<
+      typename TInputImage::PixelType,
+      typename TOutputImage::PixelType
+      > FunctorType;
+
+  typedef typename FunctorType::VectorType VectorType;
+  typedef typename FunctorType::MatrixType MatrixType;
+
+  itkSetMacro(BlockSize, int);
+  itkGetMacro(BlockSize, int);
+
+  itkSetMacro(MaxIter, int);
+  itkGetMacro(MaxIter, int);
+
+  void SetMatrix(const MatrixType& U)
+  {
+    this->GetFunctor().SetMatrix(U);
+    this->Modified();
+  }
+
+protected:
+  /// Constructor
+  CudaFCLSFilter();
+
+  /// Destructor
+  virtual ~CudaFCLSFilter() {}
+
+  void ThreadedGenerateData(const OutputImageRegionType& outputRegionForThread,
+                            int threadId );
+
+private:
+  CudaFCLSFilter(const Self &); //purposely not implemented
+  void operator =(const Self&); //purposely not implemented
+
+  int m_BlockSize;
+  int m_MaxIter;
+
+  float*   m_GPUInputImage;
+  float*   m_GPUOutputImage;
+  SizeType m_GPUImageSize;
+
+  float*   m_GPUU;
+  float*   m_GPUUt;
+  float*   m_GPUUinv;
+
+};
+} // end namespace otb
+
+#ifndef OTB_MANUAL_INSTANTIATION
+#include "otbCudaFCLSFilter.txx"
+#endif
+
+#endif
diff --git a/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.txx b/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.txx
new file mode 100644
index 0000000000000000000000000000000000000000..75a4a2084c6b07bfeab0d35112df074b1cf4fd76
--- /dev/null
+++ b/Code/Hyperspectral/GPUCuda/otbCudaFCLSFilter.txx
@@ -0,0 +1,139 @@
+/*=========================================================================
+
+  Program:   ORFEO Toolbox
+  Language:  C++
+  Date:      $Date$
+  Version:   $Revision$
+
+
+  Copyright (c) Centre National d'Etudes Spatiales. All rights reserved.
+  See OTBCopyright.txt for details.
+
+  Some parts of this code are derived from ITK. See ITKCopyright.txt
+  for details.
+
+
+     This software is distributed WITHOUT ANY WARRANTY; without even
+     the implied warranty of MERCHANT2ABILITY or FITNESS FOR A PARTICULAR
+     PURPOSE.  See the above copyright notices for more information.
+
+=========================================================================*/
+#ifndef __otbCudaFCLSFilter_txx
+#define __otbCudaFCLSFilter_txx
+
+#include "otbCudaFCLSFilter.h"
+
+extern "C" void fclsInit();
+
+extern "C" void fclsMallocEndMembers(
+                            float** d_endmembers,
+                            float** d_endmembersT,
+                            float** d_endmembersInv,
+                            int numBands,
+                            int nbEndmembers);
+
+extern "C" void fclsMallocImage( float** d_image_vector,
+                            float** d_image_unmixed,
+                            int imageWidth,
+                            int imageHeight,
+                            int numBands,
+                            int nbEndmembers);
+
+extern "C" void fclsProcessing(float* d_image_vector,
+                               float* d_image_unmixed,
+                               float* d_endmembers,
+                               float* d_endmembersT,
+                               float* d_endmembersInv,
+                               int numSamples,
+                               int numBands,
+                               int nbEndmembers,
+                               int maxIter,
+                               int blockSize);
+
+extern "C" void fclsCopyHostToDevice( float* d_ptr,
+                                      const float* h_ptr,
+                                      int nb_bytes);
+
+extern "C" void fclsCopyDeviceToHost( float* h_ptr,
+                                      const float* d_ptr,
+                                      int nb_bytes);
+
+extern "C" void fclsFree( float* d_ptr);
+
+namespace otb
+{
+
+template <class TInputPointSet, class TOutputImage>
+CudaFCLSFilter<TInputPointSet, TOutputImage>
+::CudaFCLSFilter()
+{
+  fclsInit();
+  m_BlockSize = 512;
+  m_GPUU = 0;
+  m_GPUUt = 0;
+  m_GPUUinv = 0;
+  m_GPUInputImage = 0;
+  m_GPUOutputImage = 0;
+  m_GPUImageSize.Fill(0);
+  this->SetNumberOfThreads(1);
+}
+
+template<class TInputPointSet, class TOutputImage>
+void
+CudaFCLSFilter<TInputPointSet, TOutputImage>
+::ThreadedGenerateData(
+    const   OutputImageRegionType&     outputRegionForThread,
+    int   threadId)
+{
+  //std::cout << "Region : " << outputRegionForThread << std::endl;
+  typename InputImageType::Pointer inputPtr = dynamic_cast<InputImageType *> (this->itk::ProcessObject::GetInput(0));
+  typename OutputImageType::Pointer outputPtr = dynamic_cast<OutputImageType *> (this->itk::ProcessObject::GetOutput(0));
+
+  int numBands = inputPtr->GetNumberOfComponentsPerPixel();
+  int numEndmembers = outputPtr->GetNumberOfComponentsPerPixel();
+
+  int imageWidth = outputPtr->GetBufferedRegion().GetSize()[0];
+  int imageHeight = outputPtr->GetBufferedRegion().GetSize()[1];
+
+  float * pix = inputPtr->GetBufferPointer();
+  float * unmixed = outputPtr->GetBufferPointer();
+
+  if( !m_GPUU )
+    {
+    fclsMallocEndMembers(&m_GPUU, &m_GPUUt, &m_GPUUinv, numBands, numEndmembers);
+    fclsCopyHostToDevice(m_GPUU, this->GetFunctor().GetMatrix().data_block(), numBands * numEndmembers * sizeof(float));
+    fclsCopyHostToDevice(m_GPUUt, this->GetFunctor().GetMatrixT().data_block(), numBands * numEndmembers * sizeof(float));
+    fclsCopyHostToDevice(m_GPUUinv, this->GetFunctor().GetMatrixInv().data_block(), numBands * numEndmembers * sizeof(float));
+    }
+
+  if (m_GPUImageSize != inputPtr->GetBufferedRegion().GetSize())
+    {
+    if(m_GPUInputImage)
+      {
+      fclsFree(m_GPUInputImage);
+      fclsFree(m_GPUOutputImage);
+      }
+
+    fclsMallocImage(&m_GPUInputImage, &m_GPUOutputImage, imageWidth, imageHeight, numBands, numEndmembers);
+    m_GPUImageSize = inputPtr->GetBufferedRegion().GetSize();
+    }
+
+  fclsCopyHostToDevice(m_GPUInputImage, pix, numBands * imageWidth * imageHeight * sizeof(float));
+
+  fclsProcessing(m_GPUInputImage,
+                 m_GPUOutputImage,
+                 m_GPUU,
+                 m_GPUUt,
+                 m_GPUUinv,
+                 imageWidth * imageHeight,
+                 numBands,
+                 numEndmembers,
+                 m_MaxIter,
+                 m_BlockSize);
+
+  fclsCopyDeviceToHost(unmixed, m_GPUOutputImage, numEndmembers * imageWidth * imageHeight * sizeof(float));
+}
+
+}
+
+#endif
diff --git a/Testing/CMakeLists.txt b/Testing/CMakeLists.txt
index b34eaa27bf0f76991e95441980351cff94eb9173..abe5aa77fc06925cb9cd8ddf2b88c44ea685f2d1 100644
--- a/Testing/CMakeLists.txt
+++ b/Testing/CMakeLists.txt
@@ -109,7 +109,7 @@ ADD_TEST(bfTvFullyConstrainedLeastSquareImageFilter_SYNTHETIC
          otbFullyConstrainedLeastSquareImageFilterTest
          ${DATA}/SYNTHETIC100/hsi_cube.tif
          ${DATA}/SYNTHETIC100/endmembers.tif
-         ${TEMP}/bfFullyConstrainedLeastSquareImageFilter.tif)
+         ${TEMP}/bfFullyConstrainedLeastSquareImageFilter_SYNTHETIC.tif)
 
 
 ADD_TEST(bfTvFullyConstrainedLeastSquareImageFilter_AVIRIS512
@@ -202,10 +202,11 @@ IF( OTB_USE_CUBLAS )
 
 SET( otbHyperCublasTests1_SRC
   otbCublasStreamingStatisticsVectorImageFilter.cxx
+  otbCudaFCLSImageFilter.cxx
 )
 
 ADD_EXECUTABLE(otbHyperCublasTests1 ${otbHyperCublasTests1_SRC} otbHyperCublasTests1.cxx)
-TARGET_LINK_LIBRARIES(otbHyperCublasTests1 OTBCommon OTBIO OTBBasicFilters OTBTesting OTBBasicFiltersGPUCuda)
+TARGET_LINK_LIBRARIES(otbHyperCublasTests1 OTBCommon OTBIO OTBBasicFilters OTBTesting OTBBasicFiltersGPUCuda OTBHyperspectralGPUCuda )
 
 ADD_TEST(bfTuCublasStreamingStatisticsVectorImageFilterNew
          ${TESTEXE_DIR}/otbHyperCublasTests1
@@ -223,6 +224,33 @@ ADD_TEST(bfTvCublasStreamingStatisticsVectorImageFilter_AVIRIS512
          ${DATA}/AVIRIS/extract512/f970619t01p02_r02_sc01_goodbands.hdr
          ${TEMP}/aviris512_stats_cublas.txt)
 
+
+ADD_TEST(bfTuCudaFullyConstrainedLeastSquareImageFilterNew
+         ${TESTEXE_DIR}/otbHyperCublasTests1
+         otbCudaFullyConstrainedLeastSquareImageFilterNewTest)
+
+ADD_TEST(bfTvCudaFullyConstrainedLeastSquareImageFilter_SYNTHETIC
+         ${TESTEXE_DIR}/otbHyperCublasTests1
+         otbCudaFullyConstrainedLeastSquareImageFilterTest
+         ${DATA}/SYNTHETIC100/hsi_cube.tif
+         ${DATA}/SYNTHETIC100/endmembers.tif
+         ${TEMP}/bfCudaFullyConstrainedLeastSquareImageFilter_SYNTHETIC.hdr
+         500 # maxiter
+         512 # GPU block size
+         1 # stream size in MB
+         )
+
+ADD_TEST(bfTvCudaFullyConstrainedLeastSquareImageFilter_AVIRIS512_block512
+         ${TESTEXE_DIR}/otbHyperCublasTests1
+         otbCudaFullyConstrainedLeastSquareImageFilterTest
+         ${DATA}/AVIRIS/extract512/f970619t01p02_r02_sc01_goodbands.hdr
+         ${TEMP}/aviris_vcaimagefilter_13.hdr
+         ${TEMP}/bfCudaFullyConstrainedLeastSquareImageFilter_AVIRIS512.hdr
+         500 # maxiter
+         512 # GPU block size
+         1 # stream size in MB
+         )
+
 ENDIF( OTB_USE_CUBLAS )
 
 ###################
diff --git a/Testing/otbCudaFCLSImageFilter.cxx b/Testing/otbCudaFCLSImageFilter.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..a611d8a2e787b0690c2dc2cf3354d0e415b4cf6f
--- /dev/null
+++ b/Testing/otbCudaFCLSImageFilter.cxx
@@ -0,0 +1,84 @@
+/*=========================================================================
+
+  Program:   ORFEO Toolbox
+  Language:  C++
+  Date:      $Date$
+  Version:   $Revision$
+
+
+  Copyright (c) Centre National d'Etudes Spatiales. All rights reserved.
+  See OTBCopyright.txt for details.
+
+
+  This software is distributed WITHOUT ANY WARRANTY; without even
+  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+  PURPOSE.  See the above copyright notices for more information.
+
+=========================================================================*/
+#include "otbCudaFCLSFilter.h"
+
+#include "otbVectorImage.h"
+#include "otbImageFileReader.h"
+#include "otbStreamingImageFileWriter.h"
+#include "otbVectorImageToMatrixImageFilter.h"
+#include "otbStandardWriterWatcher.h"
+#include <vnl/vnl_inverse.h>
+
+const unsigned int Dimension = 2;
+typedef float PixelType;
+
+typedef otb::VectorImage<PixelType, Dimension> ImageType;
+typedef otb::ImageFileReader<ImageType> ReaderType;
+typedef otb::CudaFCLSFilter<ImageType,ImageType> FullyConstrainedLeastSquareSolverType;
+typedef otb::VectorImageToMatrixImageFilter<ImageType> VectorImageToMatrixImageFilterType;
+typedef otb::StreamingImageFileWriter<ImageType> WriterType;
+
+int otbCudaFullyConstrainedLeastSquareImageFilterNewTest(int argc, char * argv[])
+{
+  FullyConstrainedLeastSquareSolverType::Pointer filter = FullyConstrainedLeastSquareSolverType::New();
+  std::cout << filter << std::endl;
+  return EXIT_SUCCESS;
+}
+
+int otbCudaFullyConstrainedLeastSquareImageFilterTest(int argc, char * argv[])
+{
+  const char * inputImage = argv[1];
+  const char * inputEndmembers = argv[2];
+  const char * outputImage = argv[3];
+  int maxIter = atoi(argv[4]);
+  int blockSize = atoi(argv[5]);
+  int streamSizeMB = atoi(argv[6]);
+
+  ReaderType::Pointer readerImage = ReaderType::New();
+  readerImage->SetFileName(inputImage);
+
+  ReaderType::Pointer readerEndMembers = ReaderType::New();
+  readerEndMembers->SetFileName(inputEndmembers);
+  VectorImageToMatrixImageFilterType::Pointer endMember2Matrix = VectorImageToMatrixImageFilterType::New();
+  endMember2Matrix->SetInput(readerEndMembers->GetOutput());
+
+  endMember2Matrix->Update();
+
+  typedef VectorImageToMatrixImageFilterType::MatrixType MatrixType;
+  MatrixType endMembers = endMember2Matrix->GetMatrix();
+  MatrixType pinv = vnl_matrix_inverse<PixelType>(endMembers);
+
+  FullyConstrainedLeastSquareSolverType::Pointer unmixer = \
+      FullyConstrainedLeastSquareSolverType::New();
+
+  unmixer->SetInput(readerImage->GetOutput());
+  unmixer->SetMatrix(endMembers);
+  unmixer->SetMaxIter(maxIter);
+  unmixer->SetBlockSize(blockSize);
+
+  WriterType::Pointer writer = WriterType::New();
+  writer->SetFileName(outputImage);
+  writer->SetInput(unmixer->GetOutput());
+  writer->SetBufferMemorySize(streamSizeMB * 1024 * 1024);
+
+  otb::StandardWriterWatcher w4(writer,unmixer,"FullyConstrainedLeastSquare");
+
+  writer->Update();
+
+  return EXIT_SUCCESS;
+}
diff --git a/Testing/otbHyperCublasTests1.cxx b/Testing/otbHyperCublasTests1.cxx
index 0f6d19be70f05ad146ebed75ef50e02f4abd686a..c832283500936a948d523c1a388d0abb22f9fe12 100644
--- a/Testing/otbHyperCublasTests1.cxx
+++ b/Testing/otbHyperCublasTests1.cxx
@@ -25,5 +25,6 @@ void RegisterTests()
 {
   REGISTER_TEST(otbCublasStreamingStatisticsVectorImageFilterNewTest);
   REGISTER_TEST(otbCublasStreamingStatisticsVectorImageFilterTest);
-//  REGISTER_TEST(otbCudaCorrelationNoStreamingTest);
+  REGISTER_TEST(otbCudaFullyConstrainedLeastSquareImageFilterNewTest);
+  REGISTER_TEST(otbCudaFullyConstrainedLeastSquareImageFilterTest);
 }