Updated docs build process to make docs

Added Python library support
Corrected #ifdef for Eigen compatibility
2023-05-31 22:30:34 -05:00 · 2023-05-31 22:30:34 -05:00 · 2023-05-31 22:30:34 -05:00 · 2023-05-31 22:30:34 -05:00 · 2023-05-31 22:30:34 -05:00 · 2023-05-31 22:30:34 -05:00
21 changed files with 243 additions and 153 deletions
--- a/0
+++ b/0
--- a/110
+++ b/110
@ -1,95 +1,21 @@
-CC := g++-10
-NVCC := nvcc
-CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+.PHONY: all docs install

-INCLUDE :=
-LIBS_DIR :=
-LIBS_DIR_GPU := /usr/local/cuda/lib64
-LIBS :=
-LIBS_GPU := cuda cudart cublas
+all: install docs

-TARGET = tests
-SRC_DIR = .
-BUILD_DIR = build
+docs: SHELL:=/bin/bash
+docs:
+	if [ ! -d "docs/.venv" ]; then python3 -m venv "docs/.venv"; fi
+	( \
+		source docs/.venv/bin/activate;\
+		pip install -r docs/requirements.txt; \
+		doxygen docs/Doxyfile; \
+		sphinx-build -b html docs/source docs/build/html; \
+		deactivate \
+	)

-# Should not need to modify below.
-
-CPU_BUILD_DIR = $(BUILD_DIR)/cpu
-GPU_BUILD_DIR = $(BUILD_DIR)/gpu
-
-SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
-
-# Get source files and object files.
-GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
-NVCC_SRC = $(filter %.cu.cpp, $(SRC))
-GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
-NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
-
-# If compiling for CPU, all go to GCC. Otherwise, they are split.
-CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
-GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
-GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
-
-# $(info $$GCC_SRC is [${GCC_SRC}])
-# $(info $$NVCC_SRC is [${NVCC_SRC}])
-# $(info $$GCC_OBJ is [${GCC_OBJ}])
-# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
-
-# $(info $$CPU_OBJ is [${CPU_OBJ}])
-# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
-# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
-
-HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
-CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
-GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
-
-INC := $(INCLUDE:%=-I%)
-LIB := $(LIBS_DIR:%=-L%)
-LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
-LD := $(LIBS:%=-l%)
-LD_GPU := $(LIBS_GPU:%=-l%)
-
-# Reminder:
-# $< = first prerequisite
-# $@ = the target which matched the rule
-# $^ = all prerequisites
-
-.PHONY: all clean
-
-all : cpu gpu
-
-cpu: $(TARGET)CPU
-gpu: $(TARGET)GPU
-
-$(TARGET)CPU: $(CPU_OBJ)
-	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
-
-$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
-	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
-
-# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
-# regular ones. Then, we link them all together.
-$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
-	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
-
-$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
-	$(NVCC) --device-link $^ -o $@
-
-$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
-	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
-
-$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
-	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
-
-include $(CPU_DEPS)
-include $(GPU_DEPS)
-
-$(CPU_BUILD_DIR):
-	mkdir -p $@
-
-$(GPU_BUILD_DIR):
-	mkdir -p $@
-
-clean:
-	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
+install:
+	install -d $(DEST_DIR)/
+	install -d $(DEST_DIR)/include/
+	for file in include/*; do \
+		install -m 644 $$file $(DEST_DIR)/include/; \
+	done
--- a/Makefile.template
+++ b/Makefile.template
@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -w -Xcompiler
+CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp

 INCLUDE := <<Put extra include directories here, separated by a space>>
 LIBS_DIR := <<Put library directories here, separated by a space>>
@ -74,7 +74,7 @@ $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
 	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

 $(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
-	$(NVCC) --device-link $^ -o $@
+	$(NVCC) --device-link -lgomp $^ -o $@

 $(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
 	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
--- a/11
+++ b/11
@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+if [ -z "$1" ]
+then
+   printf "Usage: setup.sh install_directory [dependencies].\n" 1>&2
+   exit 1
+fi
+
+cd $(dirname $0)
+libname=$(basename $(pwd))
+make DEST_DIR="$1" install
--- a/2
+++ b/2
@ -1,2 +0,0 @@
-doxygen docs/Doxyfile
-sphinx-build -b html docs/source docs/build/html
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@ -778,7 +778,7 @@ WARNINGS               = YES
 # will automatically be disabled.
 # The default value is: YES.

-WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_UNDOCUMENTED   = NO

 # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
 # potential errors in the documentation, such as not documenting some parameters
@ -829,7 +829,7 @@ WARN_LOGFILE           =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.

-INPUT                  = "./"
+INPUT                  = "./include"

 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -27,8 +27,8 @@ extensions = [
    'breathe',
 ]

-breathe_projects = {"DGEMS": "../build/xml"}
-breathe_default_project = "DGEMS"
+breathe_projects = {"CudaTools": "../build/xml"}
+breathe_default_project = "CudaTools"

 bibtex_bibfiles = ['refs.bib']

--- a/docs/source/core.rst
+++ b/docs/source/core.rst
@ -42,6 +42,8 @@ Host-Device Automation
 Compilation Options
 -------------------
 .. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES
+.. doxygendefine:: CUDATOOLS_USE_EIGEN
+.. doxygendefine:: CUDATOOLS_USE_PYTHON

 Macro Functions
 ===============
--- a/include/Array.h
+++ b/include/Array.h
@ -4,7 +4,7 @@
 #include "Core.h"
 #include "Macros.h"
 #include "Types.h"
-#include <Eigen/Dense>
+
 #include <cmath>
 #include <complex>
 #include <cstdlib>
@ -12,6 +12,16 @@
 #include <random>
 #include <type_traits>

+#ifdef CUDATOOLS_USE_EIGEN
+#include <Eigen/Dense>
+#endif
+
+#ifdef CUDATOOLS_USE_PYTHON
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+#endif
+
 #ifdef DEVICE
 #define POINTER pDevice
 #else
@ -22,6 +32,7 @@ using namespace CudaTools::Types;

 namespace CudaTools {

+#ifdef CUDATOOLS_USE_EIGEN
 template <typename T>
 using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
 template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
@ -30,9 +41,16 @@ template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>;
 template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
 template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
 template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
+#endif

 template <typename T> class Array;
-using Slice = std::pair<uint32_t, uint32_t>;
+
+struct Slice {
+    uint32_t first;
+    uint32_t second;
+    HD Slice(const std::initializer_list<uint32_t> i)
+        : first(*i.begin()), second(*(i.begin() + 1)) {}
+};

 template <typename T> class ArrayIterator {
  private:
@ -181,7 +199,7 @@ template <typename T> class Array {

    uint32_t mEndOffset = 0;

-    void freeArrays() {
+    HD void freeArrays() {
 #ifndef DEVICE
        if (not mIsView) {
            if (pDevice != nullptr) CudaTools::free(pDevice);
@ -490,6 +508,7 @@ template <typename T> class Array {
     */
    HD void flatten() { reshape({mShape.mItems}); };

+#ifdef CUDATOOLS_USE_EIGEN
    /**
     * Returns the Eigen::Map of this Array.
     */
@ -501,6 +520,7 @@ template <typename T> class Array {
        return EigenAdaptConst<ComplexConversion<T>>((ComplexConversion<T>*)POINTER, mShape.rows(),
                                                     mShape.cols());
    };
+#endif

    /**
     * Gets the Shape of the Array.
@ -528,7 +548,7 @@ template <typename T> class Array {
    /**
     * Copies this Array and returns a new Array with the same memory.
     */
-    HD Array copy() const {
+    Array copy() const {
        Array<T> arr(mShape, (pDevice == nullptr));

        auto arr_it = arr.begin();
@ -679,6 +699,8 @@ template <typename T> class Array {
     * Its self assigning version is transpose. This is restricted to numerical types.
     * \brief Host only
     */
+
+#ifdef CUDATOOLS_USE_EIGEN
    Array transposed() const {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
@ -708,6 +730,7 @@ template <typename T> class Array {
        Array<T> inv(shape());
        inv.eigenMap() = this->eigenMap().inverse();
    };
+#endif

    /**
     * Pins the memory (page locks) for faster memory transfer in concurrent
@ -733,6 +756,22 @@ template <typename T> class Array {
        CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
        return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
    };
+
+#ifdef CUDATOOLS_USE_PYTHON
+    /**
+     * Returns a py::array for making an Array available as a Python numpy array.
+     */
+    py::array pyArray() const {
+        std::vector<py::ssize_t> dims, strides;
+        for (uint iAxis = 0; iAxis < mShape.axes(); ++iAxis) {
+            dims.push_back(static_cast<py::ssize_t>(mShape.dim(iAxis)));
+            strides.push_back(sizeof(T) * static_cast<py::ssize_t>(mShape.stride(iAxis)));
+        }
+        return py::array_t<T, py::array::f_style>(
+            py::buffer_info((void*)pHost, sizeof(T), py::format_descriptor<T>::format(),
+                            static_cast<py::ssize_t>(mShape.axes()), dims, strides));
+    };
+#endif
 };

 template <typename T>
@ -774,9 +813,11 @@ template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>
        bool negative = false;
        for (auto it = arr.begin(); it != arr.end(); ++it) {
            T val = *it;
-            if (*it < 0) {
-                negative = true;
-                val *= -1;
+            if constexpr (not std::is_unsigned<T>::value) {
+                if (*it < 0) {
+                    negative = true;
+                    val *= -1;
+                }
            }
            max_val = (val > max_val) ? val : max_val;
        }
--- a/include/BLAS.h
+++ b/include/BLAS.h
@ -1,6 +1,10 @@
 #ifndef CUDATOOLS_BLAS_H
 #define CUDATOOLS_BLAS_H

+#ifndef CUDATOOLS_USE_EIGEN
+#error "Cannot use CudaTools BLAS.h header without Eigen."
+#endif
+
 #include "Array.h"
 #include "Core.h"
 #include "Macros.h"
@ -138,7 +142,7 @@ template <typename T> class Batch {

        Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()});
        for (uint32_t i = 0; i < mBatchSize; ++i) {
-#ifdef CUDA
+#ifdef CUDACC
            mBatch[i] = batch[i].dataDevice();
 #else
            mBatch[i] = batch[i].data();
@ -154,7 +158,7 @@ template <typename T> class Batch {
    void add(const Array<T>& arr) {
        CT_ERROR(not arr.isView(), "Cannot add non-view Arrays");
        CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays");
-#ifdef CUDA
+#ifdef CUDACC
        mBatch[mCount] = arr.dataDevice();
 #else
        mBatch[mCount] = arr.data();
@ -270,7 +274,7 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
    uint32_t rows = A.shape().rows();
    uint32_t cols = A.shape().cols();
    T a = alpha, b = beta;
-#ifdef CUDA
+#ifdef CUDACC
    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
    if (bi.size == 1) {
        invoke<T>(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv,
@ -315,7 +319,7 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
    uint32_t n = B.shape().cols();

    T a = alpha, b = beta;
-#ifdef CUDA
+#ifdef CUDACC
    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));

    if (bi.size == 1) {
@ -368,7 +372,7 @@ StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const boo
    CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(),
                "Rows of 'A' and columns of 'C' need to match.");

-#ifdef CUDA
+#ifdef CUDACC
    uint32_t m = C.shape().rows();
    uint32_t n = C.shape().cols();
    auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
@ -544,7 +548,7 @@ class PLUBatch : public Batch<T> {
     * Computes the inplace PLU decomposition of batch of arrays.
     */
    StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) {
-#ifdef CUDA
+#ifdef CUDACC
        uint32_t n = this->mShape.rows();
        CUBLAS_CHECK(
            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
@ -575,7 +579,7 @@ class PLUBatch : public Batch<T> {
        CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(),
                    "The length of each column of b must match the matrix rank");

-#ifdef CUDA
+#ifdef CUDACC
        uint32_t n = b.shape().rows();
        uint32_t nrhs = b.shape().cols();
        CUBLAS_CHECK(
--- a/include/Core.h
+++ b/include/Core.h
@ -143,7 +143,7 @@ Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM)

 template <typename F, typename... Args>
 StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
-#ifdef CUDA
+#ifdef CUDACC
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.mId)>>>(args...);
 #else
@ -256,8 +256,8 @@ template <typename F, typename... Args> class Graph {

    ~Graph() {
 #ifdef CUDACC
-        CUDA_CHECK(cudaGraphDestroy(mGraph));
-        CUDA_CHECK(cudaGraphExecDestroy(mInstance));
+        cudaGraphDestroy(mGraph);
+        cudaGraphExecDestroy(mInstance);
 #endif
    };

@ -316,9 +316,11 @@ struct GraphManager {
 };

 }; // namespace CudaTools
+#endif // CUDATOOLS_H

 #ifdef CUDATOOLS_IMPLEMENTATION
-
+#ifndef __CUDATOOLS_IMPLEMENTED__
+#define __CUDATOOLS_IMPLEMENTED__
 namespace CudaTools {

 //////////////////////
@ -400,9 +402,9 @@ Manager::Manager(const std::vector<std::string>& names) {
 Manager::~Manager() {
 #ifdef CUDACC
    for (auto& it : mStreams) {
-        CUDA_CHECK(cudaStreamDestroy(it.second));
+        cudaStreamDestroy(it.second);
    }
-    CUBLAS_CHECK(cublasDestroy(mCublas));
+    cublasDestroy(mCublas);
    // CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 #endif
 }
@ -640,7 +642,7 @@ Event::Event() {

 Event::~Event() {
 #ifdef CUDACC
-    CUDA_CHECK(cudaEventDestroy(mEvent));
+    cudaEventDestroy(mEvent);
 #endif
 }

@ -656,9 +658,6 @@ void Event::record(const StreamID& stream) {

 GraphManager::~GraphManager() {
 #ifdef CUDACC
-    for (void* func : mHostData) {
-        delete func;
-    }
    for (Event* event : mEvents) {
        delete event;
    }
@ -680,6 +679,5 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
 }

 };     // namespace CudaTools
+#endif
 #endif // CUDATOOLS_IMPLEMENTATION
-
-#endif // CUDATOOLS_H
--- a/include/Macros.h
+++ b/include/Macros.h
@ -49,6 +49,18 @@
 */
 #define SHARED

+/**
+ * \def CUDATOOLS_USE_EIGEN
+ * Compile the CudaTools library with Eigen support.
+ */
+#define CUDATOOLS_USE_EIGEN
+
+/**
+ * \def CUDATOOLS_USE_PYTHON
+ * Compile the CudaTools library with Python support.
+ */
+#define CUDATOOLS_USE_PYTHON
+
 /**
 * \def KERNEL(call, settings, ...)
 * Used to call a CUDA kernel.
@ -218,12 +230,13 @@
 #ifdef DEVICE
 #define CT_ERROR_IF(a, op, b, msg)                                                                 \
    if (a op b) {                                                                                  \
-        printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg);     \
+        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ") " #op " (" #b ").\n",          \
+               __FILE__, __LINE__, msg);                                                           \
    }

 #define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
-        printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);                        \
+        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);   \
    }
 #else

@ -233,14 +246,14 @@
        std::ostringstream os_b;                                                                   \
        os_a << a;                                                                                 \
        os_b << b;                                                                                 \
-        printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg,  \
-               os_a.str().c_str(), os_b.str().c_str());                                            \
+        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n",      \
+               __FILE__, __LINE__, msg, os_a.str().c_str(), os_b.str().c_str());                   \
        throw std::exception();                                                                    \
    }

 #define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
-        printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);                        \
+        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);   \
        throw std::exception();                                                                    \
    }
 #endif
@ -253,7 +266,8 @@
    do {                                                                                           \
        cudaError_t err = (call);                                                                  \
        if (err != cudaSuccess) {                                                                  \
-            printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err));          \
+            printf("\033[1;31m[CUDA]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                   \
+                   cudaGetErrorString(err));                                                       \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
@ -262,7 +276,8 @@
    do {                                                                                           \
        cublasStatus_t err = (call);                                                               \
        if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
-            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err));       \
+            printf("\033[1;31m[cuBLAS]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                 \
+                   cublasGetStatusName(err));                                                      \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
--- a/include/Types.h
+++ b/include/Types.h
--- a/samples/1_CoreKernel/Makefile
+++ b/samples/1_CoreKernel/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp

-INCLUDE := ../../
+INCLUDE := ../../include/CudaTools
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/2_CoreClass/Makefile
+++ b/samples/2_CoreClass/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp

-INCLUDE := ../../
+INCLUDE := ../../include/CudaTools
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/3_ArrayKernel/Makefile
+++ b/samples/3_ArrayKernel/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp

-INCLUDE := ../../
+INCLUDE := ../../include/CudaTools
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/4_ArrayFunctions/Makefile
+++ b/samples/4_ArrayFunctions/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp

-INCLUDE := ../../
+INCLUDE := ../../include/CudaTools
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/5_SimpleGraph/Makefile
+++ b/samples/5_SimpleGraph/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp

-INCLUDE := ../../
+INCLUDE := ../../include/CudaTools
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/tests/Makefile
+++ b/tests/Makefile
@ -0,0 +1,95 @@
+CC := g++-10
+NVCC := nvcc
+CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
+
+INCLUDE := ../include/CudaTools
+LIBS_DIR :=
+LIBS_DIR_GPU := /usr/local/cuda/lib64
+LIBS :=
+LIBS_GPU := cuda cudart cublas
+
+TARGET = tests
+SRC_DIR = .
+BUILD_DIR = build
+
+# Should not need to modify below.
+
+CPU_BUILD_DIR = $(BUILD_DIR)/cpu
+GPU_BUILD_DIR = $(BUILD_DIR)/gpu
+
+SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
+
+# Get source files and object files.
+GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
+NVCC_SRC = $(filter %.cu.cpp, $(SRC))
+GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
+NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
+
+# If compiling for CPU, all go to GCC. Otherwise, they are split.
+CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
+GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
+GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
+
+# $(info $$GCC_SRC is [${GCC_SRC}])
+# $(info $$NVCC_SRC is [${NVCC_SRC}])
+# $(info $$GCC_OBJ is [${GCC_OBJ}])
+# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
+
+# $(info $$CPU_OBJ is [${CPU_OBJ}])
+# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
+# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
+
+HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
+CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
+GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
+
+INC := $(INCLUDE:%=-I%)
+LIB := $(LIBS_DIR:%=-L%)
+LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
+LD := $(LIBS:%=-l%)
+LD_GPU := $(LIBS_GPU:%=-l%)
+
+# Reminder:
+# $< = first prerequisite
+# $@ = the target which matched the rule
+# $^ = all prerequisites
+
+.PHONY: all clean
+
+all : cpu gpu
+
+cpu: $(TARGET)CPU
+gpu: $(TARGET)GPU
+
+$(TARGET)CPU: $(CPU_OBJ)
+	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
+
+$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
+	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
+
+# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
+# regular ones. Then, we link them all together.
+$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
+	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
+
+$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
+	$(NVCC) --device-link -lgomp $^ -o $@
+
+$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
+	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
+
+$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
+	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
+
+-include $(CPU_DEPS)
+-include $(GPU_DEPS)
+
+$(CPU_BUILD_DIR):
+	mkdir -p $@
+
+$(GPU_BUILD_DIR):
+	mkdir -p $@
+
+clean:
+	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
--- a/tests/tests.cu.cpp
+++ b/tests/tests.cu.cpp
@ -1,9 +1,9 @@
 #define CUDATOOLS_IMPLEMENTATION
 #define CUDATOOLS_ARRAY_MAX_AXES 8
-#include "Array.h"
-#include "BLAS.h"
-#include "Core.h"
-#include "Types.h"
+#include <Array.h>
+#include <BLAS.h>
+#include <Core.h>
+#include <Types.h>

 #include <Eigen/Core>
 #include <chrono>
Author	SHA1	Message	Date
Kenneth Jao	45e8e274cb	Updated docs build process to make docs	2023-05-31 22:30:34 -05:00
Kenneth Jao	cdd35acd34	Added Python library support	2023-05-31 22:30:34 -05:00
Kenneth Jao	0607618407	Corrected #ifdef for Eigen compatibility	2023-05-31 22:30:34 -05:00
Kenneth Jao	4371ef8162	Added Eigen compilation option	2023-05-31 22:30:34 -05:00
Kenneth Jao	828b83a139	Better header guards for including.	2023-05-31 22:30:34 -05:00
Kenneth Jao	c7ee614324	Now compatible with new library structure	2023-05-31 22:30:34 -05:00
Kenneth Jao	443929521d	Updated directory structure and Makefile	2023-05-31 22:30:34 -05:00
Kenneth Jao	fb63a6c2bc	Added CUDA Compatible slicing	2023-05-31 22:30:34 -05:00
Kenneth Jao	1248a58265	Fixed compilation macro name issue	2023-05-31 22:30:34 -05:00
Kenneth Jao	9c66a61288	Fixed compilation macro name issue	2023-05-31 22:30:34 -05:00