Added initial cuSparse groundwork, and fp16 (__half) GEMM function.
This commit is contained in:
parent
8c7b99cd9f
commit
0d006ae326
40
Array.h
40
Array.h
@ -1,9 +1,9 @@
|
||||
#ifndef CUDATOOLS_ARRAY_H
|
||||
#define CUDATOOLS_ARRAY_H
|
||||
|
||||
#include "Complex.h"
|
||||
#include "Core.h"
|
||||
#include "Macros.h"
|
||||
#include "Types.h"
|
||||
#include <Eigen/Dense>
|
||||
#include <cmath>
|
||||
#include <complex>
|
||||
@ -18,10 +18,9 @@
|
||||
#define POINTER pHost
|
||||
#endif
|
||||
|
||||
namespace CudaTools {
|
||||
using namespace CudaTools::Types;
|
||||
|
||||
/** Type alises and lots of metaprogramming definitions, primarily dealing with
|
||||
* the different numeric types and overrides. */
|
||||
namespace CudaTools {
|
||||
|
||||
template <typename T>
|
||||
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
|
||||
@ -32,23 +31,6 @@ template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
|
||||
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
|
||||
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
|
||||
|
||||
template <typename T> struct ComplexUnderlying_S { typedef T type; };
|
||||
template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
|
||||
template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
|
||||
template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
|
||||
|
||||
template <typename T> struct ComplexConversion_S { typedef T type; };
|
||||
template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
|
||||
template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
|
||||
template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
|
||||
|
||||
template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
|
||||
template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
|
||||
template <typename T>
|
||||
inline constexpr bool is_complex =
|
||||
std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
|
||||
template <typename T> inline constexpr bool is_num = is_int<T> or is_float<T> or is_complex<T>;
|
||||
|
||||
template <typename T> class Array;
|
||||
using Slice = std::pair<uint32_t, uint32_t>;
|
||||
|
||||
@ -576,7 +558,7 @@ template <typename T> class Array {
|
||||
* Sets the values of the entire Array to a constant. This is restricted to numerical types.
|
||||
*/
|
||||
HD void setConstant(const T value) const {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
for (auto it = begin(); it != end(); ++it) {
|
||||
*it = value;
|
||||
}
|
||||
@ -588,7 +570,7 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
void setRandom(const T min, const T max) const {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
if constexpr (is_complex<T>) {
|
||||
CT_ERROR_IF(max.real(), <, min.real(),
|
||||
"Upper bound of range cannot be larger than lower bound");
|
||||
@ -623,7 +605,7 @@ template <typename T> class Array {
|
||||
* restricted to numerical types.
|
||||
*/
|
||||
HD void setRange(T min, const T step = 1) const {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
for (auto it = begin(); it != end(); ++it) {
|
||||
*it = min;
|
||||
min += step;
|
||||
@ -650,7 +632,7 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
static Array constant(const Shape& shape, const T value) {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
Array<T> arr(shape);
|
||||
arr.setConstant(value);
|
||||
return arr;
|
||||
@ -662,7 +644,7 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
static Array random(const Shape& shape, const T min, const T max) {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
Array<T> arr(shape);
|
||||
arr.setRandom(min, max);
|
||||
return arr;
|
||||
@ -673,7 +655,7 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
static Array range(const T min, const T max, const T step = 1) {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
|
||||
Array<T> arr({(uint32_t)((max - min) / step)});
|
||||
arr.setRange(min, step);
|
||||
@ -698,7 +680,7 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
Array transposed() const {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
||||
Array<T> new_arr({mShape.rows(), mShape.cols()});
|
||||
new_arr.eigenMap() = this->eigenMap().transpose().eval();
|
||||
@ -711,7 +693,7 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
void transpose() {
|
||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
||||
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
|
||||
new_arr.eigenMap() = this->eigenMap().transpose().eval();
|
||||
|
||||
86
BLAS.h
86
BLAS.h
@ -2,16 +2,13 @@
|
||||
#define CUDATOOLS_BLAS_H
|
||||
|
||||
#include "Array.h"
|
||||
#include "Complex.h"
|
||||
#include "Core.h"
|
||||
#include "Macros.h"
|
||||
#include "Types.h"
|
||||
|
||||
#ifdef CUDACC
|
||||
#include <cuComplex.h>
|
||||
#endif
|
||||
using namespace CudaTools::Types;
|
||||
|
||||
namespace CudaTools {
|
||||
|
||||
namespace BLAS {
|
||||
|
||||
struct BatchInfo {
|
||||
@ -19,17 +16,20 @@ struct BatchInfo {
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
template <typename T> struct Check {
|
||||
struct Check {
|
||||
template <typename T>
|
||||
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
|
||||
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
|
||||
isAtLeast2D(arr, name);
|
||||
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
|
||||
};
|
||||
|
||||
static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C,
|
||||
template <typename T, typename U, typename V>
|
||||
static void isValidMatmul(const Array<T>& A, const Array<U>& B, const Array<V>& C,
|
||||
const std::string& nameA = "A", const std::string& nameB = "B",
|
||||
const std::string nameC = "C") {
|
||||
isAtLeast2D(A, nameA);
|
||||
@ -46,7 +46,7 @@ template <typename T> struct Check {
|
||||
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
|
||||
};
|
||||
|
||||
static uint32_t getUpperItems(const Array<T>& arr) {
|
||||
template <typename T> static uint32_t getUpperItems(const Array<T>& arr) {
|
||||
uint32_t upperItems = 1;
|
||||
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
|
||||
upperItems *= arr.shape().dim(iAxis);
|
||||
@ -54,7 +54,8 @@ template <typename T> struct Check {
|
||||
return upperItems;
|
||||
};
|
||||
|
||||
static void matchUpperShape(const Array<T>& A, const Array<T>& B,
|
||||
template <typename T, typename U>
|
||||
static void matchUpperShape(const Array<T>& A, const Array<U>& B,
|
||||
const std::string& nameA = "A", const std::string& nameB = "B") {
|
||||
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
|
||||
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
|
||||
@ -67,7 +68,8 @@ template <typename T> struct Check {
|
||||
}
|
||||
};
|
||||
|
||||
static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C,
|
||||
template <typename T, typename U, typename V>
|
||||
static BatchInfo isBroadcastable(const Array<T>& A, const Array<U>& B, const Array<V>& C,
|
||||
const std::string& nameA = "A", const std::string& nameB = "B",
|
||||
const std::string nameC = "C") {
|
||||
isValidMatmul(A, B, C, nameA, nameB, nameC);
|
||||
@ -130,7 +132,7 @@ template <typename T> class Batch {
|
||||
Batch(const Array<T>& arr) {
|
||||
CT_ERROR(arr.isView(), "Array cannot be a view");
|
||||
mShape = Shape({arr.shape().rows(), arr.shape().cols()});
|
||||
mBatchSize = mCount = Check<T>::getUpperItems(arr);
|
||||
mBatchSize = mCount = Check::getUpperItems(arr);
|
||||
|
||||
mBatch = Array<T*>({mBatchSize});
|
||||
|
||||
@ -159,7 +161,7 @@ template <typename T> class Batch {
|
||||
#endif
|
||||
if (mCount == 0) {
|
||||
mShape = arr.shape();
|
||||
mBatchSize = mCount = Check<T>::getUpperItems(arr);
|
||||
mBatchSize = mCount = Check::getUpperItems(arr);
|
||||
} else {
|
||||
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
|
||||
}
|
||||
@ -195,15 +197,30 @@ template <typename T> struct CudaComplexConversion_S { typedef T type; };
|
||||
#ifdef CUDACC
|
||||
template <> struct CudaComplexConversion_S<complex64> { typedef cuComplex type; };
|
||||
template <> struct CudaComplexConversion_S<complex128> { typedef cuDoubleComplex type; };
|
||||
#else
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T> using CudaComplexConversion = typename CudaComplexConversion_S<T>::type;
|
||||
|
||||
template <typename T> struct CublasTypeLetter_S { char letter; };
|
||||
template <> struct CublasTypeLetter_S<real32> { char letter = 'S'; };
|
||||
template <> struct CublasTypeLetter_S<real64> { char letter = 'D'; };
|
||||
template <> struct CublasTypeLetter_S<complex64> { char letter = 'C'; };
|
||||
template <> struct CublasTypeLetter_S<complex128> { char letter = 'Z'; };
|
||||
#ifdef CUDACC
|
||||
template <> struct CublasTypeLetter_S<real16> { char letter = 'H'; };
|
||||
#endif
|
||||
|
||||
template <typename T> char CublasTypeLetter = CublasTypeLetter_S<T>::letter;
|
||||
|
||||
// Shorthands to reduce clutter.
|
||||
|
||||
#define CAST(var) reinterpret_cast<CudaComplexConversion<T>*>(var)
|
||||
#define DCAST(var) reinterpret_cast<CudaComplexConversion<T>**>(var)
|
||||
|
||||
#define cublas(T, func) cublas##CublasTypeLetter<T>##func
|
||||
|
||||
template <typename T, typename F1, typename F2, typename F3, typename F4, typename... Args>
|
||||
constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
|
||||
if constexpr (std::is_same<T, real32>::value) {
|
||||
@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
|
||||
} else if constexpr (std::is_same<T, complex128>::value) {
|
||||
CUBLAS_CHECK(f4(args...));
|
||||
} else {
|
||||
CT_ERROR(true, "BLAS functions are not callable with that type");
|
||||
CT_ERROR(true, "This BLAS function is not callable with that type");
|
||||
}
|
||||
}
|
||||
|
||||
// If someone can think of a better solution, please tell me.
|
||||
template <typename T, typename F1, typename F2, typename F3, typename F4, typename F5,
|
||||
typename... Args>
|
||||
constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
|
||||
if constexpr (std::is_same<T, real32>::value) {
|
||||
CUBLAS_CHECK(f1(args...));
|
||||
} else if constexpr (std::is_same<T, real64>::value) {
|
||||
CUBLAS_CHECK(f2(args...));
|
||||
} else if constexpr (std::is_same<T, complex64>::value) {
|
||||
CUBLAS_CHECK(f3(args...));
|
||||
} else if constexpr (std::is_same<T, complex128>::value) {
|
||||
CUBLAS_CHECK(f4(args...));
|
||||
} else if constexpr (std::is_same<T, real16>::value) {
|
||||
CUBLAS_CHECK(f5(args...));
|
||||
} else {
|
||||
CT_ERROR(true, "This BLAS function is not callable with that type");
|
||||
}
|
||||
}
|
||||
|
||||
@ -227,7 +263,7 @@ template <typename T>
|
||||
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
|
||||
const StreamID& stream = DEF_CUBLAS_STREAM) {
|
||||
|
||||
BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y");
|
||||
BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
|
||||
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
|
||||
CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");
|
||||
|
||||
@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
|
||||
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
|
||||
CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b),
|
||||
CAST(y.dataDevice()), 1);
|
||||
|
||||
} else { // Greater than 2, so broadcast.
|
||||
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
|
||||
cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
|
||||
@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
|
||||
* Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
|
||||
* broadcast the operation if applicable.
|
||||
*/
|
||||
template <typename T>
|
||||
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
|
||||
template <typename T, typename U, typename V>
|
||||
StreamID GEMM(const T alpha, const Array<U>& A, const Array<U>& B, const T beta, const Array<V>& C,
|
||||
const StreamID& stream = DEF_CUBLAS_STREAM) {
|
||||
|
||||
BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
|
||||
BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
|
||||
// A is m x k, B is k x n.
|
||||
uint32_t m = A.shape().rows();
|
||||
uint32_t k = A.shape().cols();
|
||||
@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
|
||||
T a = alpha, b = beta;
|
||||
#ifdef CUDA
|
||||
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||
|
||||
if (bi.size == 1) {
|
||||
invoke<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
|
||||
invoke5<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
|
||||
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
|
||||
CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
|
||||
m);
|
||||
|
||||
} else { // Greater than 2, so broadcast.
|
||||
invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
|
||||
cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
|
||||
CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
|
||||
CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
|
||||
bi.strideC, bi.size);
|
||||
invoke5<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
|
||||
cublasZgemmStridedBatched, cublasHgemmStridedBatched,
|
||||
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
|
||||
CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
|
||||
CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
|
||||
}
|
||||
|
||||
#else
|
||||
@ -487,7 +523,7 @@ class PLUBatch : public Batch<T> {
|
||||
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
|
||||
*/
|
||||
PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
|
||||
Check<T>::isSquare(arr, "LU Array");
|
||||
Check::isSquare(arr, "LU Array");
|
||||
|
||||
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
|
||||
mInfoLU = Array<int32_t>({this->mBatchSize});
|
||||
|
||||
39
Core.h
39
Core.h
@ -81,6 +81,7 @@ class Manager {
|
||||
#ifdef CUDACC
|
||||
std::unordered_map<std::string, cudaStream_t> mStreams;
|
||||
cublasHandle_t mCublas;
|
||||
cusparseHandle_t mCusparse;
|
||||
#endif
|
||||
public:
|
||||
/**
|
||||
@ -94,6 +95,7 @@ class Manager {
|
||||
#ifdef CUDACC
|
||||
cudaStream_t stream(const StreamID& stream) const;
|
||||
cublasHandle_t cublasHandle() const;
|
||||
cusparseHandle_t cusparseHandle() const;
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -391,6 +393,7 @@ Manager::Manager(const std::vector<std::string>& names) {
|
||||
addStream(name);
|
||||
}
|
||||
CUBLAS_CHECK(cublasCreate(&mCublas));
|
||||
CUSPARSE_CHECK(cusparseCreate(&mCusparse));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -400,6 +403,7 @@ Manager::~Manager() {
|
||||
CUDA_CHECK(cudaStreamDestroy(it.second));
|
||||
}
|
||||
CUBLAS_CHECK(cublasDestroy(mCublas));
|
||||
CUSPARSE_CHECK(cusparseDestroy(mCusparse));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
|
||||
}
|
||||
|
||||
cublasHandle_t Manager::cublasHandle() const { return mCublas; };
|
||||
cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };
|
||||
|
||||
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
|
||||
Manager Manager::mManagerInstance =
|
||||
Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
|
||||
#else
|
||||
Manager Manager::mManagerInstance = Manager({""});
|
||||
#endif
|
||||
@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
|
||||
orig_stream.wait(*event);
|
||||
}
|
||||
|
||||
#ifdef CUDACC
|
||||
const char* cublasGetErrorString(cublasStatus_t error) {
|
||||
switch (error) {
|
||||
case CUBLAS_STATUS_SUCCESS:
|
||||
return "CUBLAS_STATUS_SUCCESS";
|
||||
|
||||
case CUBLAS_STATUS_NOT_INITIALIZED:
|
||||
return "CUBLAS_STATUS_NOT_INITIALIZED";
|
||||
|
||||
case CUBLAS_STATUS_ALLOC_FAILED:
|
||||
return "CUBLAS_STATUS_ALLOC_FAILED";
|
||||
|
||||
case CUBLAS_STATUS_INVALID_VALUE:
|
||||
return "CUBLAS_STATUS_INVALID_VALUE";
|
||||
|
||||
case CUBLAS_STATUS_ARCH_MISMATCH:
|
||||
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
||||
|
||||
case CUBLAS_STATUS_MAPPING_ERROR:
|
||||
return "CUBLAS_STATUS_MAPPING_ERROR";
|
||||
|
||||
case CUBLAS_STATUS_EXECUTION_FAILED:
|
||||
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
||||
|
||||
case CUBLAS_STATUS_INTERNAL_ERROR:
|
||||
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
||||
}
|
||||
|
||||
return "<unknown>";
|
||||
}
|
||||
#endif
|
||||
}; // namespace CudaTools
|
||||
#endif // CUDATOOLS_IMPLEMENTATION
|
||||
|
||||
|
||||
22
Macros.h
22
Macros.h
@ -9,9 +9,6 @@
|
||||
#define CUDACC
|
||||
#endif
|
||||
|
||||
using real32 = float; /**< Type alias for 32-bit floating point datatype. */
|
||||
using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
|
||||
#define DEVICE
|
||||
#endif
|
||||
@ -124,14 +121,19 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||
#ifdef CUDACC
|
||||
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cusparse.h>
|
||||
|
||||
#define DEVICE_FUNC __device__
|
||||
#define HD __host__ __device__
|
||||
#define SHARED __shared__
|
||||
|
||||
#define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
|
||||
|
||||
#else
|
||||
#define DEVICE_FUNC
|
||||
#define HD
|
||||
#define SHARED
|
||||
|
||||
@ -139,8 +141,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||
|
||||
#endif // CUDACC
|
||||
|
||||
//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
|
||||
|
||||
///////////////////
|
||||
// DEVICE MACROS //
|
||||
///////////////////
|
||||
@ -252,8 +252,16 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||
do { \
|
||||
cublasStatus_t err = (call); \
|
||||
if (err != CUBLAS_STATUS_SUCCESS) { \
|
||||
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
|
||||
CudaTools::cublasGetErrorString(err)); \
|
||||
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err)); \
|
||||
throw std::exception(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CUSPARSE_CHECK(call) \
|
||||
do { \
|
||||
cusparseStatus_t err = (call); \
|
||||
if (err != CUSPARSE_STATUS_SUCCESS) { \
|
||||
printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err)); \
|
||||
throw std::exception(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
2
Makefile
2
Makefile
@ -7,7 +7,7 @@ INCLUDE :=
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
LIBS_GPU := cuda cudart cublas
|
||||
LIBS_GPU := cuda cudart cublas cusparse
|
||||
|
||||
TARGET = tests
|
||||
SRC_DIR = .
|
||||
|
||||
@ -7,7 +7,7 @@ INCLUDE := <<Put extra include directories here, separated by a space>>
|
||||
LIBS_DIR := <<Put library directories here, separated by a space>>
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64 <<Put extra include GPU library directories here, separated by a space>>
|
||||
LIBS := <<Put the names of the libraries here, separated by a space>>
|
||||
LIBS_GPU := cuda cudart cublas <<Put extra GPU libraries here, separated by a space>>
|
||||
LIBS_GPU := cuda cudart cublas cusparse <<Put extra GPU libraries here, separated by a space>>
|
||||
|
||||
TARGET = <<Put the name of your target here>>
|
||||
SRC_DIR = .
|
||||
|
||||
10
Sparse.h
Normal file
10
Sparse.h
Normal file
@ -0,0 +1,10 @@
|
||||
#ifndef CUDATOOLS_SPARSE_H
|
||||
#define CUDATOOLS_SPARSE_H
|
||||
|
||||
#include "Array.h"
|
||||
#include "Core.h"
|
||||
#include "Macros.h"
|
||||
#include "Types.h"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -11,6 +11,25 @@
|
||||
|
||||
namespace CudaTools {
|
||||
|
||||
namespace Types {
|
||||
|
||||
using real32 = float; /**< Type alias for 32-bit floating point datatype. */
|
||||
using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||
|
||||
#ifdef CUDACC
|
||||
|
||||
using real16 = __half;
|
||||
using realb16 = __nv_bfloat16;
|
||||
|
||||
#else
|
||||
|
||||
using real16 = float; /**< Type alias for 16-bit floating point datatype, when using GPU. Otherwise,
|
||||
defaults to float. */
|
||||
using realb16 = float; /**< Type alias for the 16-bit bfloat datatype, when using GPU. Otherwise,
|
||||
defaults to float. */
|
||||
|
||||
#endif // CUDACC
|
||||
|
||||
template <typename T> class complex {
|
||||
private:
|
||||
T r = 0;
|
||||
@ -107,11 +126,10 @@ template complex<real64> operator*<real64>(const real64, const complex<real64>);
|
||||
template complex<real32> operator/<real32>(const real32, const complex<real32>);
|
||||
template complex<real64> operator/<real64>(const real64, const complex<real64>);
|
||||
|
||||
}; // namespace CudaTools
|
||||
#ifdef CUDACC
|
||||
using complex64 = complex<real32>;
|
||||
using complex128 = complex<real64>;
|
||||
|
||||
#ifdef CUDA
|
||||
using complex64 = CudaTools::complex<real32>;
|
||||
using complex128 = CudaTools::complex<real64>;
|
||||
#else
|
||||
using complex64 = std::complex<real32>; /**< Type alias for 64-bit complex floating point datatype.
|
||||
* This adapts depending on the CUDA compilation flag, and
|
||||
@ -122,4 +140,27 @@ using complex128 =
|
||||
* CudaTools::complex<real64>. */
|
||||
#endif
|
||||
|
||||
/** Type alises and lots of metaprogramming definitions, primarily dealing with
|
||||
* the different numeric types and overrides. */
|
||||
|
||||
template <typename T> struct ComplexUnderlying_S { typedef T type; };
|
||||
template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
|
||||
template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
|
||||
template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
|
||||
|
||||
template <typename T> struct ComplexConversion_S { typedef T type; };
|
||||
template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
|
||||
template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
|
||||
template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
|
||||
|
||||
template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
|
||||
template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
|
||||
template <typename T>
|
||||
inline constexpr bool is_complex =
|
||||
std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
|
||||
template <typename T> inline constexpr bool is_host_num = is_int<T> or is_float<T> or is_complex<T>;
|
||||
|
||||
}; // namespace Types
|
||||
}; // namespace CudaTools
|
||||
|
||||
#endif
|
||||
@ -21,7 +21,7 @@ for your own project, after following a few rules.
|
||||
|
||||
The usage of this libary will be illustrated through examples, and further details
|
||||
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder.
|
||||
Throughout this documentation, there are a few common terms that may appear. First,we refer to the CPU as the host, and the GPU as the device. So, a host function refers
|
||||
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
|
||||
to a function runnable on the CPU, and a device function refers to a function that is runnable
|
||||
on a device. A kernel is a specific function that the host can call to be run on the device.
|
||||
|
||||
@ -42,17 +42,17 @@ macros provided. For example,
|
||||
return 0;
|
||||
}
|
||||
|
||||
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
|
||||
The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
|
||||
The second argument in the ``KERNEL()`` macro is are the launch parameters for
|
||||
kernel. The launch parameters have several items, but for 'embarassingly parallel'
|
||||
cases, we can simply generate the settings with the number of threads. More detail with
|
||||
cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail with
|
||||
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
|
||||
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
|
||||
see :ref:`here <Macro Functions>`.
|
||||
|
||||
.. warning::
|
||||
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
|
||||
for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
|
||||
for header files, there is an additional macro ``KERNEL(name, ...)`` to declare it
|
||||
and make it available to other files.
|
||||
|
||||
Since many applications used classes, a macro is provided to 'convert' a class into
|
||||
@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
|
||||
mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.
|
||||
|
||||
.. warning::
|
||||
Notice that a view must be passed to the kernel, and not the original object. This
|
||||
Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
|
||||
would be made.
|
||||
|
||||
The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
|
||||
a few other functions.
|
||||
|
||||
@ -90,6 +90,7 @@ int main() {
|
||||
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
|
||||
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
|
||||
|
||||
// Executes process without graph.
|
||||
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
|
||||
|
||||
std::cout << A.slice({{0, 10}}) << "\n";
|
||||
@ -97,6 +98,7 @@ int main() {
|
||||
A.setConstant(50);
|
||||
B.setConstant(0);
|
||||
|
||||
// Executes process with graph.
|
||||
CudaTools::GraphManager gm;
|
||||
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||
TIME(graph.execute().wait(), ExecuteGraph);
|
||||
|
||||
@ -2,13 +2,14 @@
|
||||
#define CUDATOOLS_ARRAY_MAX_AXES 8
|
||||
#include "Array.h"
|
||||
#include "BLAS.h"
|
||||
#include "Complex.h"
|
||||
#include "Core.h"
|
||||
#include "Types.h"
|
||||
|
||||
#include <Eigen/Core>
|
||||
#include <chrono>
|
||||
#include <complex>
|
||||
|
||||
using namespace CudaTools::Types;
|
||||
namespace CT = CudaTools;
|
||||
|
||||
/////////////
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user