Added initial cuSparse groundwork, and fp16 (__half) GEMM function.
This commit is contained in:
parent
8c7b99cd9f
commit
0d006ae326
40
Array.h
40
Array.h
@ -1,9 +1,9 @@
|
|||||||
#ifndef CUDATOOLS_ARRAY_H
|
#ifndef CUDATOOLS_ARRAY_H
|
||||||
#define CUDATOOLS_ARRAY_H
|
#define CUDATOOLS_ARRAY_H
|
||||||
|
|
||||||
#include "Complex.h"
|
|
||||||
#include "Core.h"
|
#include "Core.h"
|
||||||
#include "Macros.h"
|
#include "Macros.h"
|
||||||
|
#include "Types.h"
|
||||||
#include <Eigen/Dense>
|
#include <Eigen/Dense>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
@ -18,10 +18,9 @@
|
|||||||
#define POINTER pHost
|
#define POINTER pHost
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace CudaTools {
|
using namespace CudaTools::Types;
|
||||||
|
|
||||||
/** Type alises and lots of metaprogramming definitions, primarily dealing with
|
namespace CudaTools {
|
||||||
* the different numeric types and overrides. */
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
|
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
|
||||||
@ -32,23 +31,6 @@ template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
|
|||||||
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
|
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
|
||||||
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
|
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
|
||||||
|
|
||||||
template <typename T> struct ComplexUnderlying_S { typedef T type; };
|
|
||||||
template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
|
|
||||||
template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
|
|
||||||
template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
|
|
||||||
|
|
||||||
template <typename T> struct ComplexConversion_S { typedef T type; };
|
|
||||||
template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
|
|
||||||
template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
|
|
||||||
template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
|
|
||||||
|
|
||||||
template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
|
|
||||||
template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
|
|
||||||
template <typename T>
|
|
||||||
inline constexpr bool is_complex =
|
|
||||||
std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
|
|
||||||
template <typename T> inline constexpr bool is_num = is_int<T> or is_float<T> or is_complex<T>;
|
|
||||||
|
|
||||||
template <typename T> class Array;
|
template <typename T> class Array;
|
||||||
using Slice = std::pair<uint32_t, uint32_t>;
|
using Slice = std::pair<uint32_t, uint32_t>;
|
||||||
|
|
||||||
@ -576,7 +558,7 @@ template <typename T> class Array {
|
|||||||
* Sets the values of the entire Array to a constant. This is restricted to numerical types.
|
* Sets the values of the entire Array to a constant. This is restricted to numerical types.
|
||||||
*/
|
*/
|
||||||
HD void setConstant(const T value) const {
|
HD void setConstant(const T value) const {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
for (auto it = begin(); it != end(); ++it) {
|
for (auto it = begin(); it != end(); ++it) {
|
||||||
*it = value;
|
*it = value;
|
||||||
}
|
}
|
||||||
@ -588,7 +570,7 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
void setRandom(const T min, const T max) const {
|
void setRandom(const T min, const T max) const {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
if constexpr (is_complex<T>) {
|
if constexpr (is_complex<T>) {
|
||||||
CT_ERROR_IF(max.real(), <, min.real(),
|
CT_ERROR_IF(max.real(), <, min.real(),
|
||||||
"Upper bound of range cannot be larger than lower bound");
|
"Upper bound of range cannot be larger than lower bound");
|
||||||
@ -623,7 +605,7 @@ template <typename T> class Array {
|
|||||||
* restricted to numerical types.
|
* restricted to numerical types.
|
||||||
*/
|
*/
|
||||||
HD void setRange(T min, const T step = 1) const {
|
HD void setRange(T min, const T step = 1) const {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
for (auto it = begin(); it != end(); ++it) {
|
for (auto it = begin(); it != end(); ++it) {
|
||||||
*it = min;
|
*it = min;
|
||||||
min += step;
|
min += step;
|
||||||
@ -650,7 +632,7 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
static Array constant(const Shape& shape, const T value) {
|
static Array constant(const Shape& shape, const T value) {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
Array<T> arr(shape);
|
Array<T> arr(shape);
|
||||||
arr.setConstant(value);
|
arr.setConstant(value);
|
||||||
return arr;
|
return arr;
|
||||||
@ -662,7 +644,7 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
static Array random(const Shape& shape, const T min, const T max) {
|
static Array random(const Shape& shape, const T min, const T max) {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
Array<T> arr(shape);
|
Array<T> arr(shape);
|
||||||
arr.setRandom(min, max);
|
arr.setRandom(min, max);
|
||||||
return arr;
|
return arr;
|
||||||
@ -673,7 +655,7 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
static Array range(const T min, const T max, const T step = 1) {
|
static Array range(const T min, const T max, const T step = 1) {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
|
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
|
||||||
Array<T> arr({(uint32_t)((max - min) / step)});
|
Array<T> arr({(uint32_t)((max - min) / step)});
|
||||||
arr.setRange(min, step);
|
arr.setRange(min, step);
|
||||||
@ -698,7 +680,7 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
Array transposed() const {
|
Array transposed() const {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
||||||
Array<T> new_arr({mShape.rows(), mShape.cols()});
|
Array<T> new_arr({mShape.rows(), mShape.cols()});
|
||||||
new_arr.eigenMap() = this->eigenMap().transpose().eval();
|
new_arr.eigenMap() = this->eigenMap().transpose().eval();
|
||||||
@ -711,7 +693,7 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
void transpose() {
|
void transpose() {
|
||||||
static_assert(is_num<T>, "Function only available on numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
||||||
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
|
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
|
||||||
new_arr.eigenMap() = this->eigenMap().transpose().eval();
|
new_arr.eigenMap() = this->eigenMap().transpose().eval();
|
||||||
|
|||||||
92
BLAS.h
92
BLAS.h
@ -2,16 +2,13 @@
|
|||||||
#define CUDATOOLS_BLAS_H
|
#define CUDATOOLS_BLAS_H
|
||||||
|
|
||||||
#include "Array.h"
|
#include "Array.h"
|
||||||
#include "Complex.h"
|
|
||||||
#include "Core.h"
|
#include "Core.h"
|
||||||
#include "Macros.h"
|
#include "Macros.h"
|
||||||
|
#include "Types.h"
|
||||||
|
|
||||||
#ifdef CUDACC
|
using namespace CudaTools::Types;
|
||||||
#include <cuComplex.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace CudaTools {
|
namespace CudaTools {
|
||||||
|
|
||||||
namespace BLAS {
|
namespace BLAS {
|
||||||
|
|
||||||
struct BatchInfo {
|
struct BatchInfo {
|
||||||
@ -19,17 +16,20 @@ struct BatchInfo {
|
|||||||
uint32_t size;
|
uint32_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T> struct Check {
|
struct Check {
|
||||||
|
template <typename T>
|
||||||
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
|
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
|
||||||
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
|
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
|
static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
|
||||||
isAtLeast2D(arr, name);
|
isAtLeast2D(arr, name);
|
||||||
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
|
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
|
||||||
};
|
};
|
||||||
|
|
||||||
static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C,
|
template <typename T, typename U, typename V>
|
||||||
|
static void isValidMatmul(const Array<T>& A, const Array<U>& B, const Array<V>& C,
|
||||||
const std::string& nameA = "A", const std::string& nameB = "B",
|
const std::string& nameA = "A", const std::string& nameB = "B",
|
||||||
const std::string nameC = "C") {
|
const std::string nameC = "C") {
|
||||||
isAtLeast2D(A, nameA);
|
isAtLeast2D(A, nameA);
|
||||||
@ -46,7 +46,7 @@ template <typename T> struct Check {
|
|||||||
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
|
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
|
||||||
};
|
};
|
||||||
|
|
||||||
static uint32_t getUpperItems(const Array<T>& arr) {
|
template <typename T> static uint32_t getUpperItems(const Array<T>& arr) {
|
||||||
uint32_t upperItems = 1;
|
uint32_t upperItems = 1;
|
||||||
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
|
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
|
||||||
upperItems *= arr.shape().dim(iAxis);
|
upperItems *= arr.shape().dim(iAxis);
|
||||||
@ -54,7 +54,8 @@ template <typename T> struct Check {
|
|||||||
return upperItems;
|
return upperItems;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void matchUpperShape(const Array<T>& A, const Array<T>& B,
|
template <typename T, typename U>
|
||||||
|
static void matchUpperShape(const Array<T>& A, const Array<U>& B,
|
||||||
const std::string& nameA = "A", const std::string& nameB = "B") {
|
const std::string& nameA = "A", const std::string& nameB = "B") {
|
||||||
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
|
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
|
||||||
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
|
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
|
||||||
@ -67,7 +68,8 @@ template <typename T> struct Check {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C,
|
template <typename T, typename U, typename V>
|
||||||
|
static BatchInfo isBroadcastable(const Array<T>& A, const Array<U>& B, const Array<V>& C,
|
||||||
const std::string& nameA = "A", const std::string& nameB = "B",
|
const std::string& nameA = "A", const std::string& nameB = "B",
|
||||||
const std::string nameC = "C") {
|
const std::string nameC = "C") {
|
||||||
isValidMatmul(A, B, C, nameA, nameB, nameC);
|
isValidMatmul(A, B, C, nameA, nameB, nameC);
|
||||||
@ -130,7 +132,7 @@ template <typename T> class Batch {
|
|||||||
Batch(const Array<T>& arr) {
|
Batch(const Array<T>& arr) {
|
||||||
CT_ERROR(arr.isView(), "Array cannot be a view");
|
CT_ERROR(arr.isView(), "Array cannot be a view");
|
||||||
mShape = Shape({arr.shape().rows(), arr.shape().cols()});
|
mShape = Shape({arr.shape().rows(), arr.shape().cols()});
|
||||||
mBatchSize = mCount = Check<T>::getUpperItems(arr);
|
mBatchSize = mCount = Check::getUpperItems(arr);
|
||||||
|
|
||||||
mBatch = Array<T*>({mBatchSize});
|
mBatch = Array<T*>({mBatchSize});
|
||||||
|
|
||||||
@ -159,7 +161,7 @@ template <typename T> class Batch {
|
|||||||
#endif
|
#endif
|
||||||
if (mCount == 0) {
|
if (mCount == 0) {
|
||||||
mShape = arr.shape();
|
mShape = arr.shape();
|
||||||
mBatchSize = mCount = Check<T>::getUpperItems(arr);
|
mBatchSize = mCount = Check::getUpperItems(arr);
|
||||||
} else {
|
} else {
|
||||||
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
|
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
|
||||||
}
|
}
|
||||||
@ -195,15 +197,30 @@ template <typename T> struct CudaComplexConversion_S { typedef T type; };
|
|||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
template <> struct CudaComplexConversion_S<complex64> { typedef cuComplex type; };
|
template <> struct CudaComplexConversion_S<complex64> { typedef cuComplex type; };
|
||||||
template <> struct CudaComplexConversion_S<complex128> { typedef cuDoubleComplex type; };
|
template <> struct CudaComplexConversion_S<complex128> { typedef cuDoubleComplex type; };
|
||||||
|
#else
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template <typename T> using CudaComplexConversion = typename CudaComplexConversion_S<T>::type;
|
template <typename T> using CudaComplexConversion = typename CudaComplexConversion_S<T>::type;
|
||||||
|
|
||||||
|
template <typename T> struct CublasTypeLetter_S { char letter; };
|
||||||
|
template <> struct CublasTypeLetter_S<real32> { char letter = 'S'; };
|
||||||
|
template <> struct CublasTypeLetter_S<real64> { char letter = 'D'; };
|
||||||
|
template <> struct CublasTypeLetter_S<complex64> { char letter = 'C'; };
|
||||||
|
template <> struct CublasTypeLetter_S<complex128> { char letter = 'Z'; };
|
||||||
|
#ifdef CUDACC
|
||||||
|
template <> struct CublasTypeLetter_S<real16> { char letter = 'H'; };
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename T> char CublasTypeLetter = CublasTypeLetter_S<T>::letter;
|
||||||
|
|
||||||
// Shorthands to reduce clutter.
|
// Shorthands to reduce clutter.
|
||||||
|
|
||||||
#define CAST(var) reinterpret_cast<CudaComplexConversion<T>*>(var)
|
#define CAST(var) reinterpret_cast<CudaComplexConversion<T>*>(var)
|
||||||
#define DCAST(var) reinterpret_cast<CudaComplexConversion<T>**>(var)
|
#define DCAST(var) reinterpret_cast<CudaComplexConversion<T>**>(var)
|
||||||
|
|
||||||
|
#define cublas(T, func) cublas##CublasTypeLetter<T>##func
|
||||||
|
|
||||||
template <typename T, typename F1, typename F2, typename F3, typename F4, typename... Args>
|
template <typename T, typename F1, typename F2, typename F3, typename F4, typename... Args>
|
||||||
constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
|
constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
|
||||||
if constexpr (std::is_same<T, real32>::value) {
|
if constexpr (std::is_same<T, real32>::value) {
|
||||||
@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
|
|||||||
} else if constexpr (std::is_same<T, complex128>::value) {
|
} else if constexpr (std::is_same<T, complex128>::value) {
|
||||||
CUBLAS_CHECK(f4(args...));
|
CUBLAS_CHECK(f4(args...));
|
||||||
} else {
|
} else {
|
||||||
CT_ERROR(true, "BLAS functions are not callable with that type");
|
CT_ERROR(true, "This BLAS function is not callable with that type");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If someone can think of a better solution, please tell me.
|
||||||
|
template <typename T, typename F1, typename F2, typename F3, typename F4, typename F5,
|
||||||
|
typename... Args>
|
||||||
|
constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
|
||||||
|
if constexpr (std::is_same<T, real32>::value) {
|
||||||
|
CUBLAS_CHECK(f1(args...));
|
||||||
|
} else if constexpr (std::is_same<T, real64>::value) {
|
||||||
|
CUBLAS_CHECK(f2(args...));
|
||||||
|
} else if constexpr (std::is_same<T, complex64>::value) {
|
||||||
|
CUBLAS_CHECK(f3(args...));
|
||||||
|
} else if constexpr (std::is_same<T, complex128>::value) {
|
||||||
|
CUBLAS_CHECK(f4(args...));
|
||||||
|
} else if constexpr (std::is_same<T, real16>::value) {
|
||||||
|
CUBLAS_CHECK(f5(args...));
|
||||||
|
} else {
|
||||||
|
CT_ERROR(true, "This BLAS function is not callable with that type");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -227,7 +263,7 @@ template <typename T>
|
|||||||
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
|
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
|
||||||
const StreamID& stream = DEF_CUBLAS_STREAM) {
|
const StreamID& stream = DEF_CUBLAS_STREAM) {
|
||||||
|
|
||||||
BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y");
|
BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
|
||||||
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
|
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
|
||||||
CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");
|
CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");
|
||||||
|
|
||||||
@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
|
|||||||
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
|
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
|
||||||
CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b),
|
CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b),
|
||||||
CAST(y.dataDevice()), 1);
|
CAST(y.dataDevice()), 1);
|
||||||
|
|
||||||
} else { // Greater than 2, so broadcast.
|
} else { // Greater than 2, so broadcast.
|
||||||
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
|
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
|
||||||
cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
|
cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
|
||||||
@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
|
|||||||
* Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
|
* Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
|
||||||
* broadcast the operation if applicable.
|
* broadcast the operation if applicable.
|
||||||
*/
|
*/
|
||||||
template <typename T>
|
template <typename T, typename U, typename V>
|
||||||
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
|
StreamID GEMM(const T alpha, const Array<U>& A, const Array<U>& B, const T beta, const Array<V>& C,
|
||||||
const StreamID& stream = DEF_CUBLAS_STREAM) {
|
const StreamID& stream = DEF_CUBLAS_STREAM) {
|
||||||
|
|
||||||
BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
|
BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
|
||||||
// A is m x k, B is k x n.
|
// A is m x k, B is k x n.
|
||||||
uint32_t m = A.shape().rows();
|
uint32_t m = A.shape().rows();
|
||||||
uint32_t k = A.shape().cols();
|
uint32_t k = A.shape().cols();
|
||||||
@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
|
|||||||
T a = alpha, b = beta;
|
T a = alpha, b = beta;
|
||||||
#ifdef CUDA
|
#ifdef CUDA
|
||||||
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||||
|
|
||||||
if (bi.size == 1) {
|
if (bi.size == 1) {
|
||||||
invoke<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
|
invoke5<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
|
||||||
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
|
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
|
||||||
CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
|
CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
|
||||||
m);
|
m);
|
||||||
|
|
||||||
} else { // Greater than 2, so broadcast.
|
} else { // Greater than 2, so broadcast.
|
||||||
invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
|
invoke5<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
|
||||||
cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
|
cublasZgemmStridedBatched, cublasHgemmStridedBatched,
|
||||||
CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
|
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
|
||||||
CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
|
CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
|
||||||
bi.strideC, bi.size);
|
CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
@ -487,7 +523,7 @@ class PLUBatch : public Batch<T> {
|
|||||||
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
|
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
|
||||||
*/
|
*/
|
||||||
PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
|
PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
|
||||||
Check<T>::isSquare(arr, "LU Array");
|
Check::isSquare(arr, "LU Array");
|
||||||
|
|
||||||
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
|
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
|
||||||
mInfoLU = Array<int32_t>({this->mBatchSize});
|
mInfoLU = Array<int32_t>({this->mBatchSize});
|
||||||
|
|||||||
39
Core.h
39
Core.h
@ -81,6 +81,7 @@ class Manager {
|
|||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
std::unordered_map<std::string, cudaStream_t> mStreams;
|
std::unordered_map<std::string, cudaStream_t> mStreams;
|
||||||
cublasHandle_t mCublas;
|
cublasHandle_t mCublas;
|
||||||
|
cusparseHandle_t mCusparse;
|
||||||
#endif
|
#endif
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
@ -94,6 +95,7 @@ class Manager {
|
|||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
cudaStream_t stream(const StreamID& stream) const;
|
cudaStream_t stream(const StreamID& stream) const;
|
||||||
cublasHandle_t cublasHandle() const;
|
cublasHandle_t cublasHandle() const;
|
||||||
|
cusparseHandle_t cusparseHandle() const;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -391,6 +393,7 @@ Manager::Manager(const std::vector<std::string>& names) {
|
|||||||
addStream(name);
|
addStream(name);
|
||||||
}
|
}
|
||||||
CUBLAS_CHECK(cublasCreate(&mCublas));
|
CUBLAS_CHECK(cublasCreate(&mCublas));
|
||||||
|
CUSPARSE_CHECK(cusparseCreate(&mCusparse));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -400,6 +403,7 @@ Manager::~Manager() {
|
|||||||
CUDA_CHECK(cudaStreamDestroy(it.second));
|
CUDA_CHECK(cudaStreamDestroy(it.second));
|
||||||
}
|
}
|
||||||
CUBLAS_CHECK(cublasDestroy(mCublas));
|
CUBLAS_CHECK(cublasDestroy(mCublas));
|
||||||
|
CUSPARSE_CHECK(cusparseDestroy(mCusparse));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cublasHandle_t Manager::cublasHandle() const { return mCublas; };
|
cublasHandle_t Manager::cublasHandle() const { return mCublas; };
|
||||||
|
cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };
|
||||||
|
|
||||||
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
|
Manager Manager::mManagerInstance =
|
||||||
|
Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
|
||||||
#else
|
#else
|
||||||
Manager Manager::mManagerInstance = Manager({""});
|
Manager Manager::mManagerInstance = Manager({""});
|
||||||
#endif
|
#endif
|
||||||
@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
|
|||||||
orig_stream.wait(*event);
|
orig_stream.wait(*event);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CUDACC
|
|
||||||
const char* cublasGetErrorString(cublasStatus_t error) {
|
|
||||||
switch (error) {
|
|
||||||
case CUBLAS_STATUS_SUCCESS:
|
|
||||||
return "CUBLAS_STATUS_SUCCESS";
|
|
||||||
|
|
||||||
case CUBLAS_STATUS_NOT_INITIALIZED:
|
|
||||||
return "CUBLAS_STATUS_NOT_INITIALIZED";
|
|
||||||
|
|
||||||
case CUBLAS_STATUS_ALLOC_FAILED:
|
|
||||||
return "CUBLAS_STATUS_ALLOC_FAILED";
|
|
||||||
|
|
||||||
case CUBLAS_STATUS_INVALID_VALUE:
|
|
||||||
return "CUBLAS_STATUS_INVALID_VALUE";
|
|
||||||
|
|
||||||
case CUBLAS_STATUS_ARCH_MISMATCH:
|
|
||||||
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
|
||||||
|
|
||||||
case CUBLAS_STATUS_MAPPING_ERROR:
|
|
||||||
return "CUBLAS_STATUS_MAPPING_ERROR";
|
|
||||||
|
|
||||||
case CUBLAS_STATUS_EXECUTION_FAILED:
|
|
||||||
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
|
||||||
|
|
||||||
case CUBLAS_STATUS_INTERNAL_ERROR:
|
|
||||||
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
|
||||||
}
|
|
||||||
|
|
||||||
return "<unknown>";
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}; // namespace CudaTools
|
}; // namespace CudaTools
|
||||||
#endif // CUDATOOLS_IMPLEMENTATION
|
#endif // CUDATOOLS_IMPLEMENTATION
|
||||||
|
|
||||||
|
|||||||
22
Macros.h
22
Macros.h
@ -9,9 +9,6 @@
|
|||||||
#define CUDACC
|
#define CUDACC
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
using real32 = float; /**< Type alias for 32-bit floating point datatype. */
|
|
||||||
using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
|
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
|
||||||
#define DEVICE
|
#define DEVICE
|
||||||
#endif
|
#endif
|
||||||
@ -124,14 +121,19 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
|||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
|
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
|
#include <cuda_bf16.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
#include <cusparse.h>
|
||||||
|
|
||||||
|
#define DEVICE_FUNC __device__
|
||||||
#define HD __host__ __device__
|
#define HD __host__ __device__
|
||||||
#define SHARED __shared__
|
#define SHARED __shared__
|
||||||
|
|
||||||
#define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
|
#define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
#define DEVICE_FUNC
|
||||||
#define HD
|
#define HD
|
||||||
#define SHARED
|
#define SHARED
|
||||||
|
|
||||||
@ -139,8 +141,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
|||||||
|
|
||||||
#endif // CUDACC
|
#endif // CUDACC
|
||||||
|
|
||||||
//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
|
|
||||||
|
|
||||||
///////////////////
|
///////////////////
|
||||||
// DEVICE MACROS //
|
// DEVICE MACROS //
|
||||||
///////////////////
|
///////////////////
|
||||||
@ -252,8 +252,16 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
|||||||
do { \
|
do { \
|
||||||
cublasStatus_t err = (call); \
|
cublasStatus_t err = (call); \
|
||||||
if (err != CUBLAS_STATUS_SUCCESS) { \
|
if (err != CUBLAS_STATUS_SUCCESS) { \
|
||||||
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
|
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err)); \
|
||||||
CudaTools::cublasGetErrorString(err)); \
|
throw std::exception(); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define CUSPARSE_CHECK(call) \
|
||||||
|
do { \
|
||||||
|
cusparseStatus_t err = (call); \
|
||||||
|
if (err != CUSPARSE_STATUS_SUCCESS) { \
|
||||||
|
printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err)); \
|
||||||
throw std::exception(); \
|
throw std::exception(); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|||||||
2
Makefile
2
Makefile
@ -7,7 +7,7 @@ INCLUDE :=
|
|||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
LIBS :=
|
LIBS :=
|
||||||
LIBS_GPU := cuda cudart cublas
|
LIBS_GPU := cuda cudart cublas cusparse
|
||||||
|
|
||||||
TARGET = tests
|
TARGET = tests
|
||||||
SRC_DIR = .
|
SRC_DIR = .
|
||||||
|
|||||||
@ -7,7 +7,7 @@ INCLUDE := <<Put extra include directories here, separated by a space>>
|
|||||||
LIBS_DIR := <<Put library directories here, separated by a space>>
|
LIBS_DIR := <<Put library directories here, separated by a space>>
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64 <<Put extra include GPU library directories here, separated by a space>>
|
LIBS_DIR_GPU := /usr/local/cuda/lib64 <<Put extra include GPU library directories here, separated by a space>>
|
||||||
LIBS := <<Put the names of the libraries here, separated by a space>>
|
LIBS := <<Put the names of the libraries here, separated by a space>>
|
||||||
LIBS_GPU := cuda cudart cublas <<Put extra GPU libraries here, separated by a space>>
|
LIBS_GPU := cuda cudart cublas cusparse <<Put extra GPU libraries here, separated by a space>>
|
||||||
|
|
||||||
TARGET = <<Put the name of your target here>>
|
TARGET = <<Put the name of your target here>>
|
||||||
SRC_DIR = .
|
SRC_DIR = .
|
||||||
|
|||||||
10
Sparse.h
Normal file
10
Sparse.h
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#ifndef CUDATOOLS_SPARSE_H
|
||||||
|
#define CUDATOOLS_SPARSE_H
|
||||||
|
|
||||||
|
#include "Array.h"
|
||||||
|
#include "Core.h"
|
||||||
|
#include "Macros.h"
|
||||||
|
#include "Types.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
@ -11,6 +11,25 @@
|
|||||||
|
|
||||||
namespace CudaTools {
|
namespace CudaTools {
|
||||||
|
|
||||||
|
namespace Types {
|
||||||
|
|
||||||
|
using real32 = float; /**< Type alias for 32-bit floating point datatype. */
|
||||||
|
using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||||
|
|
||||||
|
#ifdef CUDACC
|
||||||
|
|
||||||
|
using real16 = __half;
|
||||||
|
using realb16 = __nv_bfloat16;
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
using real16 = float; /**< Type alias for 16-bit floating point datatype, when using GPU. Otherwise,
|
||||||
|
defaults to float. */
|
||||||
|
using realb16 = float; /**< Type alias for the 16-bit bfloat datatype, when using GPU. Otherwise,
|
||||||
|
defaults to float. */
|
||||||
|
|
||||||
|
#endif // CUDACC
|
||||||
|
|
||||||
template <typename T> class complex {
|
template <typename T> class complex {
|
||||||
private:
|
private:
|
||||||
T r = 0;
|
T r = 0;
|
||||||
@ -107,11 +126,10 @@ template complex<real64> operator*<real64>(const real64, const complex<real64>);
|
|||||||
template complex<real32> operator/<real32>(const real32, const complex<real32>);
|
template complex<real32> operator/<real32>(const real32, const complex<real32>);
|
||||||
template complex<real64> operator/<real64>(const real64, const complex<real64>);
|
template complex<real64> operator/<real64>(const real64, const complex<real64>);
|
||||||
|
|
||||||
}; // namespace CudaTools
|
#ifdef CUDACC
|
||||||
|
using complex64 = complex<real32>;
|
||||||
|
using complex128 = complex<real64>;
|
||||||
|
|
||||||
#ifdef CUDA
|
|
||||||
using complex64 = CudaTools::complex<real32>;
|
|
||||||
using complex128 = CudaTools::complex<real64>;
|
|
||||||
#else
|
#else
|
||||||
using complex64 = std::complex<real32>; /**< Type alias for 64-bit complex floating point datatype.
|
using complex64 = std::complex<real32>; /**< Type alias for 64-bit complex floating point datatype.
|
||||||
* This adapts depending on the CUDA compilation flag, and
|
* This adapts depending on the CUDA compilation flag, and
|
||||||
@ -122,4 +140,27 @@ using complex128 =
|
|||||||
* CudaTools::complex<real64>. */
|
* CudaTools::complex<real64>. */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/** Type alises and lots of metaprogramming definitions, primarily dealing with
|
||||||
|
* the different numeric types and overrides. */
|
||||||
|
|
||||||
|
template <typename T> struct ComplexUnderlying_S { typedef T type; };
|
||||||
|
template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
|
||||||
|
template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
|
||||||
|
template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
|
||||||
|
|
||||||
|
template <typename T> struct ComplexConversion_S { typedef T type; };
|
||||||
|
template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
|
||||||
|
template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
|
||||||
|
template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
|
||||||
|
|
||||||
|
template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
|
||||||
|
template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
|
||||||
|
template <typename T>
|
||||||
|
inline constexpr bool is_complex =
|
||||||
|
std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
|
||||||
|
template <typename T> inline constexpr bool is_host_num = is_int<T> or is_float<T> or is_complex<T>;
|
||||||
|
|
||||||
|
}; // namespace Types
|
||||||
|
}; // namespace CudaTools
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
@ -21,7 +21,7 @@ for your own project, after following a few rules.
|
|||||||
|
|
||||||
The usage of this libary will be illustrated through examples, and further details
|
The usage of this libary will be illustrated through examples, and further details
|
||||||
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder.
|
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder.
|
||||||
Throughout this documentation, there are a few common terms that may appear. First,we refer to the CPU as the host, and the GPU as the device. So, a host function refers
|
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
|
||||||
to a function runnable on the CPU, and a device function refers to a function that is runnable
|
to a function runnable on the CPU, and a device function refers to a function that is runnable
|
||||||
on a device. A kernel is a specific function that the host can call to be run on the device.
|
on a device. A kernel is a specific function that the host can call to be run on the device.
|
||||||
|
|
||||||
@ -42,17 +42,17 @@ macros provided. For example,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
|
The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
|
||||||
The second argument in the ``KERNEL()`` macro is are the launch parameters for
|
The second argument in the ``KERNEL()`` macro is are the launch parameters for
|
||||||
kernel. The launch parameters have several items, but for 'embarassingly parallel'
|
kernel. The launch parameters have several items, but for 'embarassingly parallel'
|
||||||
cases, we can simply generate the settings with the number of threads. More detail with
|
cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail with
|
||||||
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
|
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
|
||||||
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
|
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
|
||||||
see :ref:`here <Macro Functions>`.
|
see :ref:`here <Macro Functions>`.
|
||||||
|
|
||||||
.. warning::
|
.. warning::
|
||||||
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
|
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
|
||||||
for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
|
for header files, there is an additional macro ``KERNEL(name, ...)`` to declare it
|
||||||
and make it available to other files.
|
and make it available to other files.
|
||||||
|
|
||||||
Since many applications used classes, a macro is provided to 'convert' a class into
|
Since many applications used classes, a macro is provided to 'convert' a class into
|
||||||
@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
|
|||||||
mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.
|
mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.
|
||||||
|
|
||||||
.. warning::
|
.. warning::
|
||||||
Notice that a view must be passed to the kernel, and not the original object. This
|
Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
|
||||||
|
would be made.
|
||||||
|
|
||||||
The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
|
The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
|
||||||
a few other functions.
|
a few other functions.
|
||||||
|
|||||||
@ -90,6 +90,7 @@ int main() {
|
|||||||
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
|
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
|
||||||
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
|
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
|
||||||
|
|
||||||
|
// Executes process without graph.
|
||||||
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
|
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
|
||||||
|
|
||||||
std::cout << A.slice({{0, 10}}) << "\n";
|
std::cout << A.slice({{0, 10}}) << "\n";
|
||||||
@ -97,6 +98,7 @@ int main() {
|
|||||||
A.setConstant(50);
|
A.setConstant(50);
|
||||||
B.setConstant(0);
|
B.setConstant(0);
|
||||||
|
|
||||||
|
// Executes process with graph.
|
||||||
CudaTools::GraphManager gm;
|
CudaTools::GraphManager gm;
|
||||||
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||||
TIME(graph.execute().wait(), ExecuteGraph);
|
TIME(graph.execute().wait(), ExecuteGraph);
|
||||||
|
|||||||
@ -2,13 +2,14 @@
|
|||||||
#define CUDATOOLS_ARRAY_MAX_AXES 8
|
#define CUDATOOLS_ARRAY_MAX_AXES 8
|
||||||
#include "Array.h"
|
#include "Array.h"
|
||||||
#include "BLAS.h"
|
#include "BLAS.h"
|
||||||
#include "Complex.h"
|
|
||||||
#include "Core.h"
|
#include "Core.h"
|
||||||
|
#include "Types.h"
|
||||||
|
|
||||||
#include <Eigen/Core>
|
#include <Eigen/Core>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
|
|
||||||
|
using namespace CudaTools::Types;
|
||||||
namespace CT = CudaTools;
|
namespace CT = CudaTools;
|
||||||
|
|
||||||
/////////////
|
/////////////
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user