Complete (and final) revamp of the CUDA random engine.

New:
* based entirely on cuRAND (see the sketch after this list)

* reproduces the seed interface from Torch:
    cutorch.manualSeed(seed)     : set a seed for repeatable experiments
    seed = cutorch.initialSeed() : retrieve initial seed
    cutorch.seed()               : force seeding (this is done once on startup)

* I only implemented the distributions that are meaningful
  for floating-point tensors (CUDA tensors are float-only for now):
    uniform(a,b)
    normal(mean,std)
    logNormal(mean,std)
    bernoulli(p)
    cauchy(median,sigma)
    exponential(lambda)
    geometric(p)
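
A minimal standalone sketch (not part of this patch) of the pattern the
new generators use: draw uniforms with cuRAND's MTGP32 engine, then map
them in place through a Thrust functor to obtain another distribution
(here exponential, via inverse-transform sampling, as in the patch below).
Names and sizes in the sketch are illustrative only.

    // build with: nvcc -o sample sample.cu -lcurand
    #include <cstdio>
    #include <cmath>
    #include <cuda_runtime.h>
    #include <curand.h>
    #include <thrust/device_ptr.h>
    #include <thrust/transform.h>

    // Inverse-CDF map: uniform x in (0,1] -> exponential(lambda)
    struct exponential_functor
    {
      const double lambda;
      exponential_functor(double lambda_) : lambda(lambda_) {}
      __host__ __device__ float operator()(const float& x) const
      {
        return (float)(-1. / lambda * log(1 - x));
      }
    };

    int main()
    {
      const size_t n = 8;
      float *d_data;
      cudaMalloc(&d_data, n * sizeof(float));

      curandGenerator_t gen;
      curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32);
      curandSetPseudoRandomGeneratorSeed(gen, 1234UL);  // manual seed
      curandGenerateUniform(gen, d_data, n);            // uniforms in (0,1]

      // post-process the uniform draws in place, as the tensor functions do
      thrust::device_ptr<float> p(d_data);
      thrust::transform(p, p + n, p, exponential_functor(0.5));

      float h[n];
      cudaMemcpy(h, d_data, n * sizeof(float), cudaMemcpyDeviceToHost);
      for (size_t i = 0; i < n; ++i) printf("%f\n", h[i]);

      curandDestroyGenerator(gen);
      cudaFree(d_data);
      return 0;
    }
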
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c9b3fe..23bb46c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 
 CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
 CUDA_ADD_CUBLAS_TO_TARGET(THC)
-TARGET_LINK_LIBRARIES(THC TH)
+TARGET_LINK_LIBRARIES(THC TH ${CUDA_curand_LIBRARY})
 
 INSTALL(TARGETS THC
           RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
diff --git a/THCTensorRandom.cu b/THCTensorRandom.cu
index 330c6ef..a860d36 100644
--- a/THCTensorRandom.cu
+++ b/THCTensorRandom.cu
@@ -1,448 +1,168 @@
 #include "THCTensorRandom.h"
 #include "THCGeneral.h"
 
-#include <thrust/random.h>
-#include <thrust/fill.h>
 #include <thrust/functional.h>
-#include <thrust/reduce.h>
-#include <thrust/inner_product.h>
-#include <thrust/sequence.h>
+#include <curand.h>
 
-/* The initial seed. */
-__device__ static int initf = 0;
-__device__ static unsigned long the_initial_seed = 0;
-__device__ static unsigned long step = 0;
+/* Generator */
+static curandGenerator_t gen;
 
-/* Max float precision */
-/* This is the max block size when drawing from a unique seed */
-#define MAXBLOCK (1<<24)
+/* Initial seed */
+static int initf = 0;
+static unsigned long initial_seed = 0;
 
-/* Seeds */
+/* Random seed (this must be called once) */
 __host__ unsigned long THCRandom_seed()
 {
-  unsigned long s = (unsigned long)1; // TODO: this should be random
+  unsigned long s = (unsigned long)time(0);
   THCRandom_manualSeed(s);
   return s;
 }
 
-__host__ void THCRandom_manualSeed(unsigned long the_seed_)
+/* Manually set the seed */
+__host__ void THCRandom_manualSeed(unsigned long seed)
 {
-  the_initial_seed = the_seed_;
+  initial_seed = seed;
+  if (initf == 1) curandDestroyGenerator(gen);
+  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32);
+  curandSetPseudoRandomGeneratorSeed(gen, initial_seed);
   initf = 1;
 }
 
+/* Get the initial seed */
 __host__ unsigned long THCRandom_initialSeed()
 {
-  if(initf == 0) THCRandom_seed();
-  return the_initial_seed;
+  return initial_seed;
 }
 
-__host__ __device__ unsigned long THCRandom_random()
-{
-  thrust::default_random_engine rng(the_initial_seed); rng.discard(step++);
-  thrust::uniform_int_distribution<unsigned long> ufm(0,(((unsigned long)1)<<31)-1);
-  return ufm(rng);
-}
-
-/* generates a random number on [0,1)-double-interval */
-__host__ __device__ static double __uniform__()
-{
-  thrust::default_random_engine rng(the_initial_seed); rng.discard(step++);
-  thrust::uniform_real_distribution<double> ufm(0,1);
-  return ufm(rng);
-}
-
-__host__ __device__ unsigned long THCRandom_random1(long b)
-{
-  //THArgCheck(b > 0, 1, "upper bound must be strictly positive");
-  return(THCRandom_random() % b + 1);
-}
-
-__host__ __device__ unsigned long THCRandom_random2(long a, long b)
-{
-  //THArgCheck(b >= a, 2, "upper bound must be larger than lower bound");
-  return((THCRandom_random() % (b+1-a)) + a);
-}
-
-__host__ __device__ double THCRandom_uniform(double a, double b)
-{
-  return(__uniform__() * (b - a) + a);
-}
-
-__host__ __device__ double THCRandom_normal(double mean, double stdv)
-{
-  //THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive");
-  thrust::default_random_engine rng(the_initial_seed); rng.discard(step++);
-  thrust::random::experimental::normal_distribution<double> normal(mean,stdv);
-  return normal(rng);
-}
-
-__host__ __device__ double THCRandom_exponential(double lambda)
-{
-  return(-1. / lambda * log(1-__uniform__()));
-}
-
-__host__ __device__ double THCRandom_cauchy(double median, double sigma)
-{
-  return(median + sigma * tan(M_PI*(__uniform__()-0.5)));
-}
-
-__host__ __device__ double THCRandom_logNormal(double mean, double stdv)
-{
-  //THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive");
-  double zm = mean*mean;
-  double zs = stdv*stdv;
-  thrust::default_random_engine rng(the_initial_seed); rng.discard(step++);
-  thrust::random::experimental::normal_distribution<double> normal(log(zm/sqrt(zs + zm)), sqrt(log(zs/zm+1)));
-  return exp(normal(rng));
-}
-
-__host__ __device__ int THCRandom_geometric(double p)
-{
-  //THArgCheck(p > 0 && p < 1, 1, "must be > 0 and < 1");
-  return((int)(log(1-__uniform__()) / log(p)) + 1);
-}
-
-__host__ __device__ int THCRandom_bernoulli(double p)
-{
-  //THArgCheck(p > 0 && p < 1, 1, "must be > 0 and < 1");
-  return(__uniform__() <= p);
-}
-
-struct random_functor
-{
-  const long step;
-
-  random_functor(long step_) : step(step_) {}
-
-  __host__ __device__ float operator()(const float& x) const
-  {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_int_distribution<unsigned long> ufm(0,(((unsigned long)1)<<31)-1);
-    unsigned long r = ufm(rng);
-    return (float)(r % ((1UL << FLT_MANT_DIG)+1));
-  }
-};
-
-TH_API void THCudaTensor_random(THCudaTensor *self_) { 
-  THCudaTensor *self = THCudaTensor_newContiguous(self_);
-  long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, random_functor(step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
-
-  THCudaTensor_freeCopyTo(self, self_);
-};
-
-struct random1_functor
-{
-  const long b;
-  const long step;
-
-  random1_functor(long b_, long step_) : b(b_),step(step_) {}
-
-  __host__ __device__ float operator()(const float& x) const
-  {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_int_distribution<unsigned long> ufm(0,(((unsigned long)1)<<31)-1);
-    unsigned long r = ufm(rng);
-    return (float)(r % b + 1);
-  }
-};
-
-TH_API void THCudaTensor_random1(THCudaTensor *self_, long b) {
-  THCudaTensor *self = THCudaTensor_newContiguous(self_);
-  long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, random1_functor(b,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
-
-  THCudaTensor_freeCopyTo(self, self_);
-};
-
-struct random2_functor
-{
-  const long a,b;
-  const long step;
-
-  random2_functor(long a_, long b_, long step_) : a(a_),b(b_),step(step_) {}
-
-  __host__ __device__ float operator()(const float& x) const
-  {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_int_distribution<unsigned long> ufm(0,(((unsigned long)1)<<31)-1);
-    unsigned long r = ufm(rng);
-    return (float)((r % (b+1-a)) + a);
-  }
-};
-
-TH_API void THCudaTensor_random2(THCudaTensor *self_, long a, long b) {
-  THCudaTensor *self = THCudaTensor_newContiguous(self_);
-  long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-  
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, random2_functor(a,b,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
-  
-  THCudaTensor_freeCopyTo(self, self_);
-};
-
+/* The following functors are used to transform uniform draws into other distributions */
 struct bernoulli_functor
 {
   const double p;
-  const long step;
-
-  bernoulli_functor(double p_, long step_) : p(p_),step(step_) {}
+  bernoulli_functor(double p_) : p(p_) {}
 
   __host__ __device__ float operator()(const float& x) const
   {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_real_distribution<float> uniform(0,1);
-    return (float)(uniform(rng) <= p);
+    return (float)(x <= p);
   }
 };
 
-TH_API void THCudaTensor_bernoulli(THCudaTensor *self_, double p) {
-  THCudaTensor *self = THCudaTensor_newContiguous(self_);
-  long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-  
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, bernoulli_functor(p,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
-
-  THCudaTensor_freeCopyTo(self, self_);
-};
-
-struct uniform_functor
+struct geometric_functor
 {
-  const double a,b;
-  const long step;
-
-  uniform_functor(double a_, double b_, long step_) : a(a_),b(b_),step(step_) {}
+  const double p;
+  geometric_functor(double p_) : p(p_) {}
 
   __host__ __device__ float operator()(const float& x) const
   {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_real_distribution<float> uniform(a,b);
-    return uniform(rng);
+    return (float)((log(1-x) / log(p)) + 1);
+  }
+};
+
+struct exponential_functor
+{
+  const double lambda;
+  exponential_functor(double lambda_) : lambda(lambda_) {}
+
+  __host__ __device__ float operator()(const float& x) const
+  {
+    return (float)(-1. / lambda * log(1-x));
+  }
+};
+
+struct cauchy_functor
+{
+  const double median,sigma;
+  cauchy_functor(double median_, double sigma_) : median(median_),sigma(sigma_) {}
+
+  __host__ __device__ float operator()(const float& x) const
+  {
+    return (float)(median + sigma * tan(M_PI*(x-0.5)));
   }
 };
 
 TH_API void THCudaTensor_uniform(THCudaTensor *self_, double a, double b) {
   THCudaTensor *self = THCudaTensor_newContiguous(self_);
   long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-  
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, uniform_functor(a,b,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
+  float *data = THCudaTensor_data(self);
+
+  curandGenerateUniform(gen, data, size);
+
+  if ((a != 0) || (b != 1)) {
+      THCudaTensor_mul(self, b-a);
+      THCudaTensor_add(self, a);
   }
 
   THCudaTensor_freeCopyTo(self, self_);
 };
 
-struct normal_functor
-{
-  const double mean,stdv;
-  const long step;
+TH_API void THCudaTensor_bernoulli(THCudaTensor *self_, double p) {
+  THCudaTensor *self = THCudaTensor_newContiguous(self_);
+  long size = THCudaTensor_nElement(self);
+  float *data = THCudaTensor_data(self);
+  thrust::device_ptr<float> tdata(data);
+  
+  curandGenerateUniform(gen, data, size);
+  
+  thrust::transform(tdata, tdata+size, tdata, bernoulli_functor(p));
 
-  normal_functor(double mean_, double stdv_, long step_) : mean(mean_),stdv(stdv_),step(step_) {}
-
-  __host__ __device__ 
-  float operator()(const float& x) const
-  {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::random::experimental::normal_distribution<float> normal(mean,stdv);
-    return normal(rng);
-  }
+  THCudaTensor_freeCopyTo(self, self_);
 };
 
 TH_API void THCudaTensor_normal(THCudaTensor *self_, double mean, double stdv) {
   THCudaTensor *self = THCudaTensor_newContiguous(self_);
   long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
+  float *data = THCudaTensor_data(self);
 
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, normal_functor(mean,stdv,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
+  curandGenerateNormal(gen, data, size, mean, stdv);
 
   THCudaTensor_freeCopyTo(self, self_);
 };
 
-struct geometric_functor
-{
-  const double p;
-  const long step;
-
-  geometric_functor(double p_, long step_) : p(p_),step(step_) {}
-
-  __host__ __device__ float operator()(const float& x) const
-  {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_real_distribution<float> uniform(0,1);
-    float u = uniform(rng);
-    return (float)((log(1-u) / log(p)) + 1);
-  }
-};
-
-TH_API void THCudaTensor_geometric(THCudaTensor *self_, double p) {
-  THCudaTensor *self = THCudaTensor_newContiguous(self_);
-  long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-  
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, geometric_functor(p,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
-
-  THCudaTensor_freeCopyTo(self, self_);
-};
-
-struct exponential_functor
-{
-  const double lambda;
-  const long step;
-
-  exponential_functor(double lambda_, long step_) : lambda(lambda_),step(step_) {}
-
-  __host__ __device__ float operator()(const float& x) const
-  {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_real_distribution<float> uniform(0,1);
-    float u = uniform(rng);
-    return (float)(-1. / lambda * log(1-u));
-  }
-};
-
-TH_API void THCudaTensor_exponential(THCudaTensor *self_, double lambda) {
-  THCudaTensor *self = THCudaTensor_newContiguous(self_);
-  long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-  
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, exponential_functor(lambda,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
-
-  THCudaTensor_freeCopyTo(self, self_);
-};
-
-struct cauchy_functor
-{
-  const double median,sigma;
-  const long step;
-
-  cauchy_functor(double median_, double sigma_, long step_) : median(median_),sigma(sigma_),step(step_) {}
-
-  __host__ __device__ float operator()(const float& x) const
-  {
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::uniform_real_distribution<float> uniform(0,1);
-    float u = uniform(rng);
-    return (float)(median + sigma * tan(M_PI*(u-0.5)));
-  }
-};
-
-TH_API void THCudaTensor_cauchy(THCudaTensor *self_, double median, double sigma) {
-  THCudaTensor *self = THCudaTensor_newContiguous(self_);
-  long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
-  
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, cauchy_functor(median,sigma,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
-
-  THCudaTensor_freeCopyTo(self, self_);
-};
-
-struct logNormal_functor
-{
-  const double mean,stdv;
-  const long step;
-
-  logNormal_functor(double mean_, double stdv_, long step_) : mean(mean_),stdv(stdv_),step(step_) {}
-
-  __host__ __device__ float operator()(const float& x) const
-  {
-    double zm = mean*mean;
-    double zs = stdv*stdv;
-    thrust::default_random_engine rng(the_initial_seed+step); rng.discard(x);
-    thrust::random::experimental::normal_distribution<double> normal(log(zm/sqrt(zs + zm)), sqrt(log(zs/zm+1)));
-    return exp(normal(rng));
-  }
-};
-
 TH_API void THCudaTensor_logNormal(THCudaTensor *self_, double mean, double stdv) {
   THCudaTensor *self = THCudaTensor_newContiguous(self_);
   long size = THCudaTensor_nElement(self);
-  thrust::device_ptr<float> self_data(THCudaTensor_data(self));
+  float *data = THCudaTensor_data(self);
   
-  while (true) {
-      long bsize = size < MAXBLOCK ? size : MAXBLOCK;
-      thrust::sequence(self_data, self_data+bsize, 0);
-      thrust::transform(self_data, self_data+bsize, self_data, logNormal_functor(mean,stdv,step));
-      step+=bsize;
-      self_data+=bsize;
-      size-=bsize;
-      if (bsize == 0) break;
-  }
+  curandGenerateLogNormal(gen, data, size, mean, stdv);
+
+  THCudaTensor_freeCopyTo(self, self_);
+};
+
+TH_API void THCudaTensor_geometric(THCudaTensor *self_, double p) {
+  THCudaTensor *self = THCudaTensor_newContiguous(self_);
+  long size = THCudaTensor_nElement(self);
+  float *data = THCudaTensor_data(self);
+  thrust::device_ptr<float> tdata(data);
+  
+  curandGenerateUniform(gen, data, size);
+  
+  thrust::transform(tdata, tdata+size, tdata, geometric_functor(p));
+
+  THCudaTensor_freeCopyTo(self, self_);
+};
+
+TH_API void THCudaTensor_exponential(THCudaTensor *self_, double lambda) {
+  THCudaTensor *self = THCudaTensor_newContiguous(self_);
+  long size = THCudaTensor_nElement(self);
+  float *data = THCudaTensor_data(self);
+  thrust::device_ptr<float> tdata(data);
+  
+  curandGenerateUniform(gen, data, size);
+  
+  thrust::transform(tdata, tdata+size, tdata, exponential_functor(lambda));
+
+  THCudaTensor_freeCopyTo(self, self_);
+};
+
+TH_API void THCudaTensor_cauchy(THCudaTensor *self_, double median, double sigma) {
+  THCudaTensor *self = THCudaTensor_newContiguous(self_);
+  long size = THCudaTensor_nElement(self);
+  float *data = THCudaTensor_data(self);
+  thrust::device_ptr<float> tdata(data);
+  
+  curandGenerateUniform(gen, data, size);
+  
+  thrust::transform(tdata, tdata+size, tdata, cauchy_functor(median, sigma));
 
   THCudaTensor_freeCopyTo(self, self_);
 };
diff --git a/THCTensorRandom.h b/THCTensorRandom.h
index dd25295..6eb39a6 100644
--- a/THCTensorRandom.h
+++ b/THCTensorRandom.h
@@ -6,20 +6,10 @@
 TH_API unsigned long THCRandom_seed();
 TH_API void THCRandom_manualSeed(unsigned long the_seed_);
 TH_API unsigned long THCRandom_initialSeed();
-TH_API unsigned long THCRandom_random();
-TH_API unsigned long THCRandom_random1(long b);
-TH_API unsigned long THCRandom_random2(long a, long b);
-TH_API double THCRandom_uniform(double a, double b);
-TH_API double THCRandom_normal(double mean, double stdv);
-TH_API double THCRandom_exponential(double lambda);
-TH_API double THCRandom_cauchy(double median, double sigma);
-TH_API double THCRandom_logNormal(double mean, double stdv);
-TH_API int THCRandom_geometric(double p);
-TH_API int THCRandom_bernoulli(double p);
 
-TH_API void THCudaTensor_random(THCudaTensor *self);
-TH_API void THCudaTensor_random1(THCudaTensor *self, long b);
-TH_API void THCudaTensor_random2(THCudaTensor *self, long a, long b);
+// TH_API void THCudaTensor_random(THCudaTensor *self);
+// TH_API void THCudaTensor_random1(THCudaTensor *self, long b);
+// TH_API void THCudaTensor_random2(THCudaTensor *self, long a, long b);
 TH_API void THCudaTensor_geometric(THCudaTensor *self, double p);
 TH_API void THCudaTensor_bernoulli(THCudaTensor *self, double p);
 TH_API void THCudaTensor_uniform(THCudaTensor *self, double a, double b);
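
As a usage note (not part of the patch): the exported calls above can be
driven from C roughly as sketched below. THCudaTensor_newWithSize1d and
THCudaTensor_free are assumed to exist as in the rest of the THCudaTensor
API, and "THC.h" is assumed to be the umbrella header; only the seeding
and distribution calls come from THCTensorRandom.h.

    #include "THC.h"

    void fill_examples(void)
    {
      THCudaTensor *t = THCudaTensor_newWithSize1d(1000);  /* assumed ctor */

      THCRandom_manualSeed(1234UL);       /* repeatable experiments */
      THCudaTensor_uniform(t, 0, 1);      /* uniform samples in (0,1] */
      THCudaTensor_normal(t, 0, 1);       /* N(0,1) */
      THCudaTensor_bernoulli(t, 0.5);     /* 0/1 coin flips */

      THCudaTensor_free(t);               /* assumed dtor */
    }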