| #include "THCTensorMath.h" | 
 | #include "THCGeneral.h" | 
 | #include "THCHalf.h" | 
 | #include "THCTensorCopy.h" | 
 | #include "THCApply.cuh" | 
 | #include "THCNumerics.cuh" | 
 |  | 
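// Adds a scalar to every element. The two-pointer overload serves
// out-of-place application via THC_pointwiseApply2 (*out = *in + val); the
// one-pointer overload serves in-place application via THC_pointwiseApply1.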
 | template <typename T> | 
 | struct TensorAddConstantOp { | 
 |   TensorAddConstantOp(T v) : val(v) {} | 
 |   __device__ __forceinline__ void operator()(T* out, T* in) { | 
 |     *out = *in + val; | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(T* v) { | 
 |     *v += val; | 
 |   } | 
 |  | 
 |   const T val; | 
 | }; | 
 |  | 
 | #ifdef CUDA_HALF_TENSOR | 
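// half specialization. With native fp16 arithmetic (CUDA_HALF_INSTRUCTIONS)
// the constant is kept as half and added with __hadd; otherwise it is
// widened to float once at construction and each element takes a
// half -> float -> half round trip.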
 | template <> | 
 | struct TensorAddConstantOp<half> { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |   TensorAddConstantOp(half v) : val(v) {} | 
 | #else | 
 |   TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} | 
 | #endif | 
 |  | 
 |   __device__ __forceinline__ void operator()(half* out, half* in) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *out = __hadd(*in, val); | 
 | #else | 
 |     float fin = __half2float(*in); | 
 |     float fout = fin + fval; | 
 |     *out = __float2half(fout); | 
 | #endif | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(half* v) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *v = __hadd(*v, val); | 
 | #else | 
 |     float fv = __half2float(*v); | 
 |     fv += fval; | 
 |     *v = __float2half(fv); | 
 | #endif | 
 |   } | 
 |  | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |   const half val; | 
 | #else | 
 |   const float fval; | 
 | #endif | 
 | }; | 
 | #endif // CUDA_HALF_TENSOR | 
 |  | 
 |  | 
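// Subtracts a scalar from every element; same usage as TensorAddConstantOp.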
 | template <typename T> | 
 | struct TensorSubConstantOp { | 
 |   TensorSubConstantOp(T v) : val(v) {} | 
 |   __device__ __forceinline__ void operator()(T* out, T* in) { | 
 |     *out = *in - val; | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(T* v) { | 
 |     *v -= val; | 
 |   } | 
 |  | 
 |   const T val; | 
 | }; | 
 |  | 
 |  | 
 | #ifdef CUDA_HALF_TENSOR | 
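// half specialization. The constant is negated at construction (through
// float, since there is no host-side half negation here), so the device
// code is the same addition used by TensorAddConstantOp<half>.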
 | template <> | 
 | struct TensorSubConstantOp<half> { | 
#ifdef CUDA_HALF_INSTRUCTIONS
  TensorSubConstantOp(half v) : val(THC_float2half(-(THC_half2float(v)))) {}
#else
  TensorSubConstantOp(half v) : fval(-(THC_half2float(v))) {}
#endif
 |  | 
 |   __device__ __forceinline__ void operator()(half* out, half* in) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *out = __hadd(*in, val); | 
 | #else | 
 |     float fin = __half2float(*in); | 
 |     float fout = fin + fval; | 
 |     *out = __float2half(fout); | 
 | #endif | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(half* v) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *v = __hadd(*v, val); | 
 | #else | 
 |     float fv = __half2float(*v); | 
 |     fv += fval; | 
 |     *v = __float2half(fv); | 
 | #endif | 
 |   } | 
 |  | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |   const half val; | 
 | #else | 
 |   const float fval; | 
 | #endif | 
 | }; | 
 | #endif // CUDA_HALF_TENSOR | 
 |  | 
 |  | 
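// Multiplies every element by a scalar; same usage as TensorAddConstantOp.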
 | template <typename T> | 
 | struct TensorMulConstantOp { | 
 |   TensorMulConstantOp(T v) : val(v) {} | 
 |   __device__ __forceinline__ void operator()(T* out, T* in) { | 
 |     *out = *in * val; | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(T* v) { | 
 |     *v *= val; | 
 |   } | 
 |  | 
 |   const T val; | 
 | }; | 
 |  | 
 | #ifdef CUDA_HALF_TENSOR | 
 | template <> | 
 | struct TensorMulConstantOp<half> { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |   TensorMulConstantOp(half v) : val(v) {} | 
 | #else | 
 |   TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} | 
 | #endif | 
 |  | 
 |   __device__ __forceinline__ void operator()(half* out, half* in) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *out = __hmul(*in, val); | 
 | #else | 
 |     float fin = __half2float(*in); | 
 |     float fout = fin * fval; | 
 |     *out = __float2half(fout); | 
 | #endif | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(half* v) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *v = __hmul(*v, val); | 
 | #else | 
 |     float fv = __half2float(*v); | 
 |     fv *= fval; | 
 |     *v = __float2half(fv); | 
 | #endif | 
 |   } | 
 |  | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |   const half val; | 
 | #else | 
 |   const float fval; | 
 | #endif | 
 | }; | 
 | #endif // CUDA_HALF_TENSOR | 
 |  | 
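// Divides every element by a scalar. This generic version does a true
// per-element division; the floating-point specializations below multiply
// by a precomputed reciprocal instead.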
 | template <typename T> | 
 | struct TensorDivConstantOp { | 
 |   TensorDivConstantOp(T v) : val(v) {} | 
 |   __device__ __forceinline__ void operator()(T* out, T* in) { | 
 |     *out = *in / val; | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(T* v) { | 
 |     *v /= val; | 
 |   } | 
 |  | 
 |   const T val; | 
 | }; | 
 |  | 
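// float/double specializations: the reciprocal is computed once on the host
// so each element costs a multiply instead of a divide. Faster, but not
// guaranteed bit-identical to true division.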
 | template <> | 
 | struct TensorDivConstantOp<float> { | 
 |   TensorDivConstantOp(float v) : val(1.f / v) {} | 
 |   __device__ __forceinline__ void operator()(float* out, float* in) { | 
 |     *out = *in * val; | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(float* v) { | 
 |     *v *= val; | 
 |   } | 
 |  | 
 |   const float val; | 
 | }; | 
 |  | 
 | template <> | 
 | struct TensorDivConstantOp<double> { | 
 |   TensorDivConstantOp(double v) : val(1. / v) {} | 
 |   __device__ __forceinline__ void operator()(double* out, double* in) { | 
 |     *out = *in * val; | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(double* v) { | 
 |     *v *= val; | 
 |   } | 
 |  | 
 |   const double val; | 
 | }; | 
 |  | 
 | #ifdef CUDA_HALF_TENSOR | 
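// half specialization of the reciprocal trick: the inverse is precomputed
// either in half (via ScalarInv) or in float.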
 | template <> | 
 | struct TensorDivConstantOp<half> { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |   TensorDivConstantOp(half v) : val(ScalarInv<half>::to(v)) {} | 
 | #else | 
 |   TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} | 
 | #endif | 
 |   __device__ __forceinline__ void operator()(half* out, half* in) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *out = __hmul(*in, val); | 
 | #else | 
 |     float fin = __half2float(*in); | 
 |     float fout = fin * fval; | 
 |     *out = __float2half(fout); | 
 | #endif | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(half* v) { | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |     *v = __hmul(*v, val); | 
 | #else | 
 |     float fv = __half2float(*v); | 
 |     fv *= fval; | 
 |     *v = __float2half(fv); | 
 | #endif | 
 |   } | 
 |  | 
 | #ifdef CUDA_HALF_INSTRUCTIONS | 
 |   const half val; | 
 | #else | 
 |   const float fval; | 
 | #endif | 
 | }; | 
 | #endif // CUDA_HALF_TENSOR | 
 |  | 
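// Keeps or zeroes elements of a 2D tensor based on their position relative
// to the k-th diagonal. Upper == 1 keeps col - row >= k (triu); Upper == 0
// keeps col - row <= k (tril). mask() recovers (row, col) from an element's
// linear offset in storage via the strides: e.g. for a contiguous 3x4 matrix
// (stride0 = 4, stride1 = 1), offset n gives row = n / 4, col = n % 4.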
 | template <int Upper> | 
 | struct TensorTriOp { | 
 |   TensorTriOp(float *start_, long stride0_, long stride1_, long k_) | 
 |     : start(start_), stride0(stride0_), stride1(stride1_), k(k_) {} | 
 |  | 
 |   __device__ __forceinline__ int mask(float *in) { | 
 |     ptrdiff_t n = in - start; | 
 |     long row, col; | 
 |     if (stride0 > stride1) | 
 |     { | 
 |       row = (long) (n / stride0); | 
 |       col = (long) ((n % stride0) / stride1); | 
 |     } | 
 |     else | 
 |     { | 
 |       row = (long) ((n % stride1) / stride0); | 
 |       col = (long) (n / stride1); | 
 |     } | 
 |  | 
 |     return Upper ? (col - row >= k) : (col - row <= k); | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(float* out, float* in) { | 
 |     *out = mask(in) ? *in : 0; | 
 |   } | 
 |  | 
 |   __device__ __forceinline__ void operator()(float* v) { | 
 |     if (!mask(v)) | 
 |       *v = 0; | 
 |   } | 
 |  | 
 |   const float *start; | 
 |   const long stride0, stride1, k; | 
 | }; | 
 |  | 
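// self_ = tril(src_, k): zero out the part of the matrix above the k-th
// diagonal. For the in-place case src_ is replaced by a contiguous copy
// (so mask()'s offset arithmetic is well defined) and copied back at the end.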
 | void THCudaTensor_tril(THCState *state, THCudaTensor *self_, THCudaTensor *src_, long k) | 
 | { | 
 |   THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); | 
 |   THArgCheck(src_->nDimension == 2, 1, "expected a matrix"); | 
 |  | 
 |   THCudaTensor *src = src_; | 
 |   if (self_ == src_) | 
 |     src = THCudaTensor_newContiguous(state, src_); | 
 |  | 
 |   long stride0 = src->stride[0]; | 
 |   long stride1 = src->stride[1]; | 
  // THCudaTensor_data already includes the storage offset; adding
  // src->storageOffset again would double-count it.
  float *start = THCudaTensor_data(state, src);
 |  | 
 |   TensorTriOp<0> op(start, stride0, stride1, k); | 
 |  | 
 |   if (self_ == src_) { | 
 |     if (!THC_pointwiseApply1(state, src, op)) { | 
 |       THArgCheck(false, 2, CUTORCH_DIM_WARNING); | 
 |     } | 
 |   } else { | 
 |     THCudaTensor_resizeAs(state, self_, src); | 
 |  | 
 |     if (!THC_pointwiseApply2(state, self_, src, op)) { | 
 |       THArgCheck(false, 2, CUTORCH_DIM_WARNING); | 
 |     } | 
 |   } | 
 |  | 
 |   if (self_ == src_) | 
 |     THCudaTensor_freeCopyTo(state, src, src_); | 
 |  | 
 |   THCudaCheck(cudaGetLastError()); | 
 | } | 
 |  | 
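// self_ = triu(src_, k): zero out the part of the matrix below the k-th
// diagonal. Mirrors THCudaTensor_tril, using TensorTriOp<1>.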
 | void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_, long k) | 
 | { | 
 |   THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); | 
 |   THArgCheck(src_->nDimension == 2, 1, "expected a matrix"); | 
 |  | 
 |   THCudaTensor *src = src_; | 
 |   if (self_ == src_) | 
 |     src = THCudaTensor_newContiguous(state, src_); | 
 |  | 
 |   long stride0 = src->stride[0]; | 
 |   long stride1 = src->stride[1]; | 
  // As in tril: THCudaTensor_data already includes the storage offset.
  float *start = THCudaTensor_data(state, src);
 |  | 
 |   TensorTriOp<1> op(start, stride0, stride1, k); | 
 |  | 
 |   if (self_ == src_) { | 
 |     if (!THC_pointwiseApply1(state, src, op)) { | 
 |       THArgCheck(false, 2, CUTORCH_DIM_WARNING); | 
 |     } | 
 |   } else { | 
 |     THCudaTensor_resizeAs(state, self_, src); | 
 |  | 
 |     if (!THC_pointwiseApply2(state, self_, src, op)) { | 
 |       THArgCheck(false, 2, CUTORCH_DIM_WARNING); | 
 |     } | 
 |   } | 
 |  | 
 |   if (self_ == src_) | 
 |     THCudaTensor_freeCopyTo(state, src, src_); | 
 |  | 
 |   THCudaCheck(cudaGetLastError()); | 
 | } | 
 |  | 
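// Instantiate the pairwise entry points for all supported tensor types via
// the generic/ mechanism.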
 | #include "generic/THCTensorMathPairwise.cu" | 
 | #include "THCGenerateAllTypes.h" | 
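
// A sketch of how the generated entry points are expected to dispatch the
// functors above, assuming the usual THC pointwise pattern (illustrative,
// not the generated code itself):
//
//   TensorAddConstantOp<real> op(value);
//   if (self_ == src_) {
//     THC_pointwiseApply1(state, self_, op);        // in-place overload
//   } else {
//     THCTensor_(resizeAs)(state, self_, src_);
//     THC_pointwiseApply2(state, self_, src_, op);  // out-of-place overload
//   }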
 |  | 
// Copy the k-th diagonal of matrix B into vector A. start is the offset of
// the first diagonal element in B, and strideSum (= stride0 + stride1)
// advances one step along the diagonal.
 | __global__ void THCudaTensor_copyFromDiagonal(float* a, float* b, long start, long size, long strideSum, long strideA) { | 
 |   for (long linearIndex = blockIdx.x * blockDim.x + threadIdx.x; | 
 |        linearIndex < size; | 
 |        linearIndex += gridDim.x * blockDim.x) { | 
 |     const long bOffset = start + strideSum * linearIndex; | 
 |     a[strideA * linearIndex] = b[bOffset]; | 
 |   } | 
 | } | 
 |  | 
// Copy vector B onto the k-th diagonal of matrix A; parameters mirror
// THCudaTensor_copyFromDiagonal.
 | __global__ void THCudaTensor_copyToDiagonal(float* a, float* b, long start, long size, long strideSum, long strideB) { | 
 |   for (long linearIndex = blockIdx.x * blockDim.x + threadIdx.x; | 
 |        linearIndex < size; | 
 |        linearIndex += gridDim.x * blockDim.x) { | 
 |     const long aOffset = start + strideSum * linearIndex; | 
 |     a[aOffset] = b[strideB * linearIndex]; | 
 |   } | 
 | } | 
 |  | 
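// self_ = diag(src_, k). Matrix input: extract the k-th diagonal into a
// vector. Vector input: build a square zero matrix with the vector on its
// k-th diagonal. k > 0 selects a superdiagonal, k < 0 a subdiagonal.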
void THCudaTensor_diag(THCState *state, THCudaTensor *self_, THCudaTensor *src_, long k)
{
 |   THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); | 
 |   int nDimension = THCudaTensor_nDimension(state, src_); | 
 |   THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector"); | 
 |   if (nDimension == 2) { | 
 |     long stride0 = THCudaTensor_stride(state, src_, 0); | 
 |     long stride1 = THCudaTensor_stride(state, src_, 1); | 
 |     long size0 = THCudaTensor_size(state, src_, 0); | 
 |     long size1 = THCudaTensor_size(state, src_, 1); | 
 |     long size = (k > 0) ? min((long long)size0, (long long)size1 - k) : min((long long)size0 + k, (long long)size1); | 
 |     THCudaTensor_resize1d(state, self_, size); | 
 |     long strideSelf = THCudaTensor_stride(state, self_, 0); | 
 |     const dim3 threads(min((long long)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (long long)size)); | 
 |     dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (long)threads.x))); | 
 |     long start = (k >= 0 ? k * stride1 : -k * stride0); | 
    THCudaTensor_copyFromDiagonal<<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
      THCudaTensor_data(state, self_), THCudaTensor_data(state, src_),
      start, size, stride0 + stride1, strideSelf);
 |   } else { | 
 |     long totalElements = THCudaTensor_nElement(state, src_); | 
 |     long size = (k > 0) ? totalElements + k : totalElements - k; | 
 |     long strideSrc = THCudaTensor_stride(state, src_, 0); | 
 |     THCudaTensor_resize2d(state, self_, size, size); | 
 |     THCudaTensor_zero(state, self_); | 
 |     long stride0 = THCudaTensor_stride(state, self_, 0); | 
 |     long stride1 = THCudaTensor_stride(state, self_, 1); | 
 |     const dim3 threads(min((long long)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (long long)size)); | 
 |     dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (long)threads.x))); | 
 |     long start = (k >= 0 ? k * stride1 : -k * stride0); | 
    THCudaTensor_copyToDiagonal<<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
      THCudaTensor_data(state, self_), THCudaTensor_data(state, src_),
      start, totalElements, stride0 + stride1, strideSrc);
 |   } | 
 |   THCudaCheck(cudaGetLastError()); | 
 | } | 
 |  | 
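// trace(src_): sum of the main diagonal, computed by materializing the
// diagonal with THCudaTensor_diag and reducing it with THCudaTensor_sumall.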
 | float THCudaTensor_trace(THCState *state, THCudaTensor *src_) { | 
 |   THAssert(THCudaTensor_checkGPU(state, 1, src_)); | 
 |   THArgCheck((src_->nDimension == 2), 1, "expected a matrix"); | 
 |   THCudaTensor *diag = THCudaTensor_new(state); | 
 |   THCudaTensor_diag(state, diag, src_, 0); | 
 |   float trace = THCudaTensor_sumall(state, diag); | 
 |   THCudaTensor_free(state, diag); | 
 |   return trace; | 
 | } |