Added initial cuSparse groundwork, and fp16 (__half) GEMM function.

2023-05-31 18:15:26 -05:00 · 2023-05-31 18:15:26 -05:00 · 0d006ae326
commit 0d006ae326
parent 8c7b99cd9f
11 changed files with 164 additions and 108 deletions
--- a/Array.h
+++ b/Array.h
@ -1,9 +1,9 @@
 #ifndef CUDATOOLS_ARRAY_H
 #define CUDATOOLS_ARRAY_H
 #include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
 #include "Types.h"
 #include <Eigen/Dense>
 #include <cmath>
 #include <complex>
@ -18,10 +18,9 @@
 #define POINTER pHost
 #endif
-namespace CudaTools {
+using namespace CudaTools::Types;
-/** Type alises and lots of metaprogramming definitions, primarily dealing with
+namespace CudaTools {
 * the different numeric types and overrides. */
 template <typename T>
 using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
@ -32,23 +31,6 @@ template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
 template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
 template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
 template <typename T> struct ComplexUnderlying_S { typedef T type; };
 template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
 template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
 template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
 template <typename T> struct ComplexConversion_S { typedef T type; };
 template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
 template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
 template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
 template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
 template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
 template <typename T>
 inline constexpr bool is_complex =
    std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
 template <typename T> inline constexpr bool is_num = is_int<T> or is_float<T> or is_complex<T>;
 template <typename T> class Array;
 using Slice = std::pair<uint32_t, uint32_t>;
@ -576,7 +558,7 @@ template <typename T> class Array {
     * Sets the values of the entire Array to a constant. This is restricted to numerical types.
     */
    HD void setConstant(const T value) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        for (auto it = begin(); it != end(); ++it) {
            *it = value;
        }
@ -588,7 +570,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    void setRandom(const T min, const T max) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        if constexpr (is_complex<T>) {
            CT_ERROR_IF(max.real(), <, min.real(),
                        "Upper bound of range cannot be larger than lower bound");
@ -623,7 +605,7 @@ template <typename T> class Array {
     * restricted to numerical types.
     */
    HD void setRange(T min, const T step = 1) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        for (auto it = begin(); it != end(); ++it) {
            *it = min;
            min += step;
@ -650,7 +632,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    static Array constant(const Shape& shape, const T value) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        Array<T> arr(shape);
        arr.setConstant(value);
        return arr;
@ -662,7 +644,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    static Array random(const Shape& shape, const T min, const T max) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        Array<T> arr(shape);
        arr.setRandom(min, max);
        return arr;
@ -673,7 +655,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    static Array range(const T min, const T max, const T step = 1) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
        Array<T> arr({(uint32_t)((max - min) / step)});
        arr.setRange(min, step);
@ -698,7 +680,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    Array transposed() const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
        Array<T> new_arr({mShape.rows(), mShape.cols()});
        new_arr.eigenMap() = this->eigenMap().transpose().eval();
@ -711,7 +693,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    void transpose() {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
        Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
        new_arr.eigenMap() = this->eigenMap().transpose().eval();
--- a/BLAS.h
+++ b/BLAS.h
@ -2,16 +2,13 @@
 #define CUDATOOLS_BLAS_H
 #include "Array.h"
 #include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
 #include "Types.h"
-#ifdef CUDACC
+using namespace CudaTools::Types;
 #include <cuComplex.h>
 #endif
 namespace CudaTools {
 namespace BLAS {
 struct BatchInfo {
@ -19,17 +16,20 @@ struct BatchInfo {
    uint32_t size;
 };
-template <typename T> struct Check {
+struct Check {
    template <typename T>
    static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
        CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
    };
    template <typename T>
    static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
        isAtLeast2D(arr, name);
        CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
    };
-    static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C,
+    template <typename T, typename U, typename V>
    static void isValidMatmul(const Array<T>& A, const Array<U>& B, const Array<V>& C,
                              const std::string& nameA = "A", const std::string& nameB = "B",
                              const std::string nameC = "C") {
        isAtLeast2D(A, nameA);
@ -46,7 +46,7 @@ template <typename T> struct Check {
            ("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
    };
-    static uint32_t getUpperItems(const Array<T>& arr) {
+    template <typename T> static uint32_t getUpperItems(const Array<T>& arr) {
        uint32_t upperItems = 1;
        for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
            upperItems *= arr.shape().dim(iAxis);
@ -54,7 +54,8 @@ template <typename T> struct Check {
        return upperItems;
    };
-    static void matchUpperShape(const Array<T>& A, const Array<T>& B,
+    template <typename T, typename U>
    static void matchUpperShape(const Array<T>& A, const Array<U>& B,
                                const std::string& nameA = "A", const std::string& nameB = "B") {
        CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
                    (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
@ -67,7 +68,8 @@ template <typename T> struct Check {
        }
    };
-    static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C,
+    template <typename T, typename U, typename V>
    static BatchInfo isBroadcastable(const Array<T>& A, const Array<U>& B, const Array<V>& C,
                                     const std::string& nameA = "A", const std::string& nameB = "B",
                                     const std::string nameC = "C") {
        isValidMatmul(A, B, C, nameA, nameB, nameC);
@ -130,7 +132,7 @@ template <typename T> class Batch {
    Batch(const Array<T>& arr) {
        CT_ERROR(arr.isView(), "Array cannot be a view");
        mShape = Shape({arr.shape().rows(), arr.shape().cols()});
-        mBatchSize = mCount = Check<T>::getUpperItems(arr);
+        mBatchSize = mCount = Check::getUpperItems(arr);
        mBatch = Array<T*>({mBatchSize});
@ -159,7 +161,7 @@ template <typename T> class Batch {
 #endif
        if (mCount == 0) {
            mShape = arr.shape();
-            mBatchSize = mCount = Check<T>::getUpperItems(arr);
+            mBatchSize = mCount = Check::getUpperItems(arr);
        } else {
            CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
        }
@ -195,15 +197,30 @@ template <typename T> struct CudaComplexConversion_S { typedef T type; };
 #ifdef CUDACC
 template <> struct CudaComplexConversion_S<complex64> { typedef cuComplex type; };
 template <> struct CudaComplexConversion_S<complex128> { typedef cuDoubleComplex type; };
 #else
 #endif
 template <typename T> using CudaComplexConversion = typename CudaComplexConversion_S<T>::type;
 template <typename T> struct CublasTypeLetter_S { char letter; };
 template <> struct CublasTypeLetter_S<real32> { char letter = 'S'; };
 template <> struct CublasTypeLetter_S<real64> { char letter = 'D'; };
 template <> struct CublasTypeLetter_S<complex64> { char letter = 'C'; };
 template <> struct CublasTypeLetter_S<complex128> { char letter = 'Z'; };
 #ifdef CUDACC
 template <> struct CublasTypeLetter_S<real16> { char letter = 'H'; };
 #endif
 template <typename T> char CublasTypeLetter = CublasTypeLetter_S<T>::letter;
 // Shorthands to reduce clutter.
 #define CAST(var) reinterpret_cast<CudaComplexConversion<T>*>(var)
 #define DCAST(var) reinterpret_cast<CudaComplexConversion<T>**>(var)
 #define cublas(T, func) cublas##CublasTypeLetter<T>##func
 template <typename T, typename F1, typename F2, typename F3, typename F4, typename... Args>
 constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
    if constexpr (std::is_same<T, real32>::value) {
@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
    } else if constexpr (std::is_same<T, complex128>::value) {
        CUBLAS_CHECK(f4(args...));
    } else {
-        CT_ERROR(true, "BLAS functions are not callable with that type");
+        CT_ERROR(true, "This BLAS function is not callable with that type");
    }
 }
 // If someone can think of a better solution, please tell me.
 template <typename T, typename F1, typename F2, typename F3, typename F4, typename F5,
          typename... Args>
 constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
    if constexpr (std::is_same<T, real32>::value) {
        CUBLAS_CHECK(f1(args...));
    } else if constexpr (std::is_same<T, real64>::value) {
        CUBLAS_CHECK(f2(args...));
    } else if constexpr (std::is_same<T, complex64>::value) {
        CUBLAS_CHECK(f3(args...));
    } else if constexpr (std::is_same<T, complex128>::value) {
        CUBLAS_CHECK(f4(args...));
    } else if constexpr (std::is_same<T, real16>::value) {
        CUBLAS_CHECK(f5(args...));
    } else {
        CT_ERROR(true, "This BLAS function is not callable with that type");
    }
 }
@ -227,7 +263,7 @@ template <typename T>
 StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
              const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y");
+    BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
    CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
    CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");
@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
                  Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
                  CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b),
                  CAST(y.dataDevice()), 1);
    } else { // Greater than 2, so broadcast.
        invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
                  cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
 * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
 * broadcast the operation if applicable.
 */
-template <typename T>
+template <typename T, typename U, typename V>
-StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
+StreamID GEMM(const T alpha, const Array<U>& A, const Array<U>& B, const T beta, const Array<V>& C,
              const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
+    BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
    // A is m x k, B is k x n.
    uint32_t m = A.shape().rows();
    uint32_t k = A.shape().cols();
@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
    T a = alpha, b = beta;
 #ifdef CUDA
    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
    if (bi.size == 1) {
-        invoke<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
+        invoke5<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
-                  Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                   Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
-                  CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
+                   CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
-                  m);
+                   m);
    } else { // Greater than 2, so broadcast.
-        invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
+        invoke5<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
-                  cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
+                   cublasZgemmStridedBatched, cublasHgemmStridedBatched,
-                  CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
+                   Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
-                  CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
+                   CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
-                  bi.strideC, bi.size);
+                   CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
    }
 #else
@ -487,7 +523,7 @@ class PLUBatch : public Batch<T> {
     * Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
     */
    PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
-        Check<T>::isSquare(arr, "LU Array");
+        Check::isSquare(arr, "LU Array");
        mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
        mInfoLU = Array<int32_t>({this->mBatchSize});
--- a/Core.h
+++ b/Core.h
@ -81,6 +81,7 @@ class Manager {
 #ifdef CUDACC
    std::unordered_map<std::string, cudaStream_t> mStreams;
    cublasHandle_t mCublas;
    cusparseHandle_t mCusparse;
 #endif
  public:
    /**
@ -94,6 +95,7 @@ class Manager {
 #ifdef CUDACC
    cudaStream_t stream(const StreamID& stream) const;
    cublasHandle_t cublasHandle() const;
    cusparseHandle_t cusparseHandle() const;
 #endif
 };
@ -391,6 +393,7 @@ Manager::Manager(const std::vector<std::string>& names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
    CUSPARSE_CHECK(cusparseCreate(&mCusparse));
 #endif
 }
@ -400,6 +403,7 @@ Manager::~Manager() {
        CUDA_CHECK(cudaStreamDestroy(it.second));
    }
    CUBLAS_CHECK(cublasDestroy(mCublas));
    CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 #endif
 }
@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
 }
 cublasHandle_t Manager::cublasHandle() const { return mCublas; };
 cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };
-Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
+Manager Manager::mManagerInstance =
    Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
 #else
 Manager Manager::mManagerInstance = Manager({""});
 #endif
@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
    orig_stream.wait(*event);
 }
 #ifdef CUDACC
 const char* cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    }
    return "<unknown>";
 }
 #endif
 };     // namespace CudaTools
 #endif // CUDATOOLS_IMPLEMENTATION
--- a/Macros.h
+++ b/Macros.h
@ -9,9 +9,6 @@
 #define CUDACC
 #endif
 using real32 = float;  /**< Type alias for 32-bit floating point datatype. */
 using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
 #define DEVICE
 #endif
@ -124,14 +121,19 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #ifdef CUDACC
 #include <cublas_v2.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
 #define DEVICE_FUNC __device__
 #define HD __host__ __device__
 #define SHARED __shared__
 #define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
 #else
 #define DEVICE_FUNC
 #define HD
 #define SHARED
@ -139,8 +141,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #endif // CUDACC
 //#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
 ///////////////////
 // DEVICE MACROS //
 ///////////////////
@ -252,8 +252,16 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
    do {                                                                                           \
        cublasStatus_t err = (call);                                                               \
        if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
-            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__,                                  \
+            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err));       \
-                   CudaTools::cublasGetErrorString(err));                                          \
+            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
 #define CUSPARSE_CHECK(call)                                                                       \
    do {                                                                                           \
        cusparseStatus_t err = (call);                                                             \
        if (err != CUSPARSE_STATUS_SUCCESS) {                                                      \
            printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err));    \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
--- a/2
+++ b/2
@ -7,7 +7,7 @@ INCLUDE :=
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
-LIBS_GPU := cuda cudart cublas
+LIBS_GPU := cuda cudart cublas cusparse
 TARGET = tests
 SRC_DIR = .
--- a/Makefile.template
+++ b/Makefile.template
@ -7,7 +7,7 @@ INCLUDE := <<Put extra include directories here, separated by a space>>
 LIBS_DIR := <<Put library directories here, separated by a space>>
 LIBS_DIR_GPU := /usr/local/cuda/lib64 <<Put extra include GPU library directories here, separated by a space>>
 LIBS := <<Put the names of the libraries here, separated by a space>>
-LIBS_GPU := cuda cudart cublas <<Put extra GPU libraries here, separated by a space>>
+LIBS_GPU := cuda cudart cublas cusparse <<Put extra GPU libraries here, separated by a space>>
 TARGET = <<Put the name of your target here>>
 SRC_DIR = .
--- a/Sparse.h
+++ b/Sparse.h
@ -0,0 +1,10 @@
 #ifndef CUDATOOLS_SPARSE_H
 #define CUDATOOLS_SPARSE_H
 #include "Array.h"
 #include "Core.h"
 #include "Macros.h"
 #include "Types.h"
 #endif
 #endif
--- a/Complex.h
+++ b/Complex.h
@ -11,6 +11,25 @@
 namespace CudaTools {
 namespace Types {
 using real32 = float;  /**< Type alias for 32-bit floating point datatype. */
 using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #ifdef CUDACC
 using real16 = __half;
 using realb16 = __nv_bfloat16;
 #else
 using real16 = float; /**< Type alias for 16-bit floating point datatype, when using GPU. Otherwise,
                         defaults to float. */
 using realb16 = float; /**< Type alias for the 16-bit bfloat datatype, when using GPU. Otherwise,
                          defaults to float. */
 #endif // CUDACC
 template <typename T> class complex {
  private:
    T r = 0;
@ -107,11 +126,10 @@ template complex<real64> operator*<real64>(const real64, const complex<real64>);
 template complex<real32> operator/<real32>(const real32, const complex<real32>);
 template complex<real64> operator/<real64>(const real64, const complex<real64>);
-}; // namespace CudaTools
+#ifdef CUDACC
 using complex64 = complex<real32>;
 using complex128 = complex<real64>;
 #ifdef CUDA
 using complex64 = CudaTools::complex<real32>;
 using complex128 = CudaTools::complex<real64>;
 #else
 using complex64 = std::complex<real32>; /**< Type alias for 64-bit complex floating point datatype.
                                         * This adapts depending on the CUDA compilation flag, and
@ -122,4 +140,27 @@ using complex128 =
                           * CudaTools::complex<real64>. */
 #endif
 /** Type alises and lots of metaprogramming definitions, primarily dealing with
 * the different numeric types and overrides. */
 template <typename T> struct ComplexUnderlying_S { typedef T type; };
 template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
 template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
 template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
 template <typename T> struct ComplexConversion_S { typedef T type; };
 template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
 template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
 template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
 template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
 template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
 template <typename T>
 inline constexpr bool is_complex =
    std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
 template <typename T> inline constexpr bool is_host_num = is_int<T> or is_float<T> or is_complex<T>;
 }; // namespace Types
 }; // namespace CudaTools
 #endif
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@ -21,7 +21,7 @@ for your own project, after following a few rules.
 The usage of this libary will be illustrated through examples, and further details
 can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder.
-Throughout this documentation, there are a few common terms that may appear. First,we refer to the CPU as the host, and the GPU as the device. So, a host function refers
+Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
 to a function runnable on the CPU, and a device function refers to a function that is runnable
 on a device. A kernel is a specific function that the host can call to be run on the device.
@ -42,17 +42,17 @@ macros provided. For example,
        return 0;
    }
-The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
+The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
 The second argument in the ``KERNEL()`` macro is are the launch parameters for
 kernel. The launch parameters have several items, but for 'embarassingly parallel'
-cases, we can simply generate the settings with the number of threads. More detail with
+cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail with
 creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
 there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
 see :ref:`here <Macro Functions>`.
 .. warning::
   These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
-   for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
+   for header files, there is an additional macro ``KERNEL(name, ...)`` to declare it
   and make it available to other files.
 Since many applications used classes, a macro is provided to 'convert' a class into
@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
 mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.
 .. warning::
-   Notice that a view must be passed to the kernel, and not the original object. This
+   Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
   would be made.
 The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
 a few other functions.
--- a/samples/5_SimpleGraph/main.cu.cpp
+++ b/samples/5_SimpleGraph/main.cu.cpp
@ -90,6 +90,7 @@ int main() {
    CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
    CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
    // Executes process without graph.
    TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
    std::cout << A.slice({{0, 10}}) << "\n";
@ -97,6 +98,7 @@ int main() {
    A.setConstant(50);
    B.setConstant(0);
    // Executes process with graph.
    CudaTools::GraphManager gm;
    CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
    TIME(graph.execute().wait(), ExecuteGraph);
--- a/tests.cu.cpp
+++ b/tests.cu.cpp
@ -2,13 +2,14 @@
 #define CUDATOOLS_ARRAY_MAX_AXES 8
 #include "Array.h"
 #include "BLAS.h"
 #include "Complex.h"
 #include "Core.h"
 #include "Types.h"
 #include <Eigen/Core>
 #include <chrono>
 #include <complex>
 using namespace CudaTools::Types;
 namespace CT = CudaTools;
 /////////////