21 changed files with 153 additions and 243 deletions
--- a/include/Array.h
+++ b/include/Array.h
@ -4,7 +4,7 @@
 #include "Core.h"
 #include "Macros.h"
 #include "Types.h"
-
+#include <Eigen/Dense>
 #include <cmath>
 #include <complex>
 #include <cstdlib>
@ -12,16 +12,6 @@
 #include <random>
 #include <type_traits>
 #ifdef CUDATOOLS_USE_EIGEN
 #include <Eigen/Dense>
 #endif
 #ifdef CUDATOOLS_USE_PYTHON
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 namespace py = pybind11;
 #endif
 #ifdef DEVICE
 #define POINTER pDevice
 #else
@ -32,7 +22,6 @@ using namespace CudaTools::Types;
 namespace CudaTools {
 #ifdef CUDATOOLS_USE_EIGEN
 template <typename T>
 using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
 template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
@ -41,16 +30,9 @@ template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>;
 template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
 template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
 template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
 #endif
 template <typename T> class Array;
-
+using Slice = std::pair<uint32_t, uint32_t>;
 /**
  * A pair of unsigned 32-bit values that can be written as a braced list,
  * e.g. `{a, b}`.
  * NOTE(review): presumably the begin/end indices of a range along one axis —
  * confirm against the Array slicing code, which is not visible here.
  */
 struct Slice {
    uint32_t first;  /**< Element 0 of the initializer list. */
    uint32_t second; /**< Element 1 of the initializer list. */
    /**
     * Builds a Slice from the first two elements of \p i.
     * No size check is performed: the list must hold at least two elements,
     * otherwise the reads below go past the end of the list. Any elements
     * beyond the second are ignored.
     * NOTE(review): HD is a project macro, presumably the host/device
     * qualifier — confirm in Macros.h.
     */
    HD Slice(const std::initializer_list<uint32_t> i)
        : first(*i.begin()), second(*(i.begin() + 1)) {}
 };
 template <typename T> class ArrayIterator {
  private:
@ -199,7 +181,7 @@ template <typename T> class Array {
    uint32_t mEndOffset = 0;
-    HD void freeArrays() {
+    void freeArrays() {
 #ifndef DEVICE
        if (not mIsView) {
            if (pDevice != nullptr) CudaTools::free(pDevice);
@ -508,7 +490,6 @@ template <typename T> class Array {
     */
    HD void flatten() { reshape({mShape.mItems}); };
 #ifdef CUDATOOLS_USE_EIGEN
    /**
     * Returns the Eigen::Map of this Array.
     */
@ -520,7 +501,6 @@ template <typename T> class Array {
        return EigenAdaptConst<ComplexConversion<T>>((ComplexConversion<T>*)POINTER, mShape.rows(),
                                                     mShape.cols());
    };
 #endif
    /**
     * Gets the Shape of the Array.
@ -548,7 +528,7 @@ template <typename T> class Array {
    /**
     * Copies this Array and returns a new Array with the same memory.
     */
-    Array copy() const {
+    HD Array copy() const {
        Array<T> arr(mShape, (pDevice == nullptr));
        auto arr_it = arr.begin();
@ -699,8 +679,6 @@ template <typename T> class Array {
     * Its self assigning version is transpose. This is restricted to numerical types.
     * \brief Host only
     */
 #ifdef CUDATOOLS_USE_EIGEN
    Array transposed() const {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
@ -730,7 +708,6 @@ template <typename T> class Array {
        Array<T> inv(shape());
        inv.eigenMap() = this->eigenMap().inverse();
    };
 #endif
    /**
     * Pins the memory (page locks) for faster memory transfer in concurrent
@ -756,22 +733,6 @@ template <typename T> class Array {
        CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
        return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
    };
 #ifdef CUDATOOLS_USE_PYTHON
    /**
     * Returns a py::array for making an Array available as a Python numpy array.
     *
     * The result is a py::array_t<T, py::array::f_style> described by a
     * py::buffer_info over the host pointer (pHost), with one dimension and one
     * stride entry per axis of this Array's Shape.
     * NOTE(review): only pHost is read here, so the device copy is not
     * consulted; whether the returned array aliases or copies the host buffer
     * depends on pybind11's handling of buffer_info — confirm before relying
     * on either.
     */
    py::array pyArray() const {
        std::vector<py::ssize_t> dims, strides;
        for (uint iAxis = 0; iAxis < mShape.axes(); ++iAxis) {
            dims.push_back(static_cast<py::ssize_t>(mShape.dim(iAxis)));
            // Scale the Shape's stride by sizeof(T) for buffer_info
            // (assumes mShape.stride() counts elements, not bytes — TODO confirm).
            strides.push_back(sizeof(T) * static_cast<py::ssize_t>(mShape.stride(iAxis)));
        }
        // buffer_info arguments: data pointer, item size, format string,
        // number of dimensions, per-axis extents, per-axis strides.
        return py::array_t<T, py::array::f_style>(
            py::buffer_info((void*)pHost, sizeof(T), py::format_descriptor<T>::format(),
                            static_cast<py::ssize_t>(mShape.axes()), dims, strides));
    };
 #endif
 };
 template <typename T>
@ -813,12 +774,10 @@ template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>
        bool negative = false;
        for (auto it = arr.begin(); it != arr.end(); ++it) {
            T val = *it;
            if constexpr (not std::is_unsigned<T>::value) {
            if (*it < 0) {
                negative = true;
                val *= -1;
            }
            }
            max_val = (val > max_val) ? val : max_val;
        }
        width = std::to_string(max_val).size() + 1;
--- a/include/BLAS.h
+++ b/include/BLAS.h
@ -1,10 +1,6 @@
 #ifndef CUDATOOLS_BLAS_H
 #define CUDATOOLS_BLAS_H
 #ifndef CUDATOOLS_USE_EIGEN
 #error "Cannot use CudaTools BLAS.h header without Eigen."
 #endif
 #include "Array.h"
 #include "Core.h"
 #include "Macros.h"
@ -142,7 +138,7 @@ template <typename T> class Batch {
        Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()});
        for (uint32_t i = 0; i < mBatchSize; ++i) {
-#ifdef CUDACC
+#ifdef CUDA
            mBatch[i] = batch[i].dataDevice();
 #else
            mBatch[i] = batch[i].data();
@ -158,7 +154,7 @@ template <typename T> class Batch {
    void add(const Array<T>& arr) {
        CT_ERROR(not arr.isView(), "Cannot add non-view Arrays");
        CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays");
-#ifdef CUDACC
+#ifdef CUDA
        mBatch[mCount] = arr.dataDevice();
 #else
        mBatch[mCount] = arr.data();
@ -274,7 +270,7 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
    uint32_t rows = A.shape().rows();
    uint32_t cols = A.shape().cols();
    T a = alpha, b = beta;
-#ifdef CUDACC
+#ifdef CUDA
    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
    if (bi.size == 1) {
        invoke<T>(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv,
@ -319,7 +315,7 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
    uint32_t n = B.shape().cols();
    T a = alpha, b = beta;
-#ifdef CUDACC
+#ifdef CUDA
    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
    if (bi.size == 1) {
@ -372,7 +368,7 @@ StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const boo
    CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(),
                "Rows of 'A' and columns of 'C' need to match.");
-#ifdef CUDACC
+#ifdef CUDA
    uint32_t m = C.shape().rows();
    uint32_t n = C.shape().cols();
    auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
@ -548,7 +544,7 @@ class PLUBatch : public Batch<T> {
     * Computes the inplace PLU decomposition of batch of arrays.
     */
    StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) {
-#ifdef CUDACC
+#ifdef CUDA
        uint32_t n = this->mShape.rows();
        CUBLAS_CHECK(
            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
@ -579,7 +575,7 @@ class PLUBatch : public Batch<T> {
        CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(),
                    "The length of each column of b must match the matrix rank");
-#ifdef CUDACC
+#ifdef CUDA
        uint32_t n = b.shape().rows();
        uint32_t nrhs = b.shape().cols();
        CUBLAS_CHECK(
--- a/include/Core.h
+++ b/include/Core.h
@ -143,7 +143,7 @@ Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM)
 template <typename F, typename... Args>
 StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
-#ifdef CUDACC
+#ifdef CUDA
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.mId)>>>(args...);
 #else
@ -256,8 +256,8 @@ template <typename F, typename... Args> class Graph {
    ~Graph() {
 #ifdef CUDACC
-        cudaGraphDestroy(mGraph);
+        CUDA_CHECK(cudaGraphDestroy(mGraph));
-        cudaGraphExecDestroy(mInstance);
+        CUDA_CHECK(cudaGraphExecDestroy(mInstance));
 #endif
    };
@ -316,11 +316,9 @@ struct GraphManager {
 };
 }; // namespace CudaTools
 #endif // CUDATOOLS_H
 #ifdef CUDATOOLS_IMPLEMENTATION
-#ifndef __CUDATOOLS_IMPLEMENTED__
+
 #define __CUDATOOLS_IMPLEMENTED__
 namespace CudaTools {
 //////////////////////
@ -402,9 +400,9 @@ Manager::Manager(const std::vector<std::string>& names) {
 Manager::~Manager() {
 #ifdef CUDACC
    for (auto& it : mStreams) {
-        cudaStreamDestroy(it.second);
+        CUDA_CHECK(cudaStreamDestroy(it.second));
    }
-    cublasDestroy(mCublas);
+    CUBLAS_CHECK(cublasDestroy(mCublas));
    // CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 #endif
 }
@ -642,7 +640,7 @@ Event::Event() {
 Event::~Event() {
 #ifdef CUDACC
-    cudaEventDestroy(mEvent);
+    CUDA_CHECK(cudaEventDestroy(mEvent));
 #endif
 }
@ -658,6 +656,9 @@ void Event::record(const StreamID& stream) {
 GraphManager::~GraphManager() {
 #ifdef CUDACC
    for (void* func : mHostData) {
        delete func;
    }
    for (Event* event : mEvents) {
        delete event;
    }
@ -679,5 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
 }
 };     // namespace CudaTools
 #endif
 #endif // CUDATOOLS_IMPLEMENTATION
 #endif // CUDATOOLS_H
--- a/0
+++ b/0
--- a/include/Macros.h
+++ b/include/Macros.h
@ -49,18 +49,6 @@
 */
 #define SHARED
 /**
 * \def CUDATOOLS_USE_EIGEN
 * Compile the CudaTools library with Eigen support.
 */
 #define CUDATOOLS_USE_EIGEN
 /**
 * \def CUDATOOLS_USE_PYTHON
 * Compile the CudaTools library with Python support.
 */
 #define CUDATOOLS_USE_PYTHON
 /**
 * \def KERNEL(call, settings, ...)
 * Used to call a CUDA kernel.
@ -230,13 +218,12 @@
 #ifdef DEVICE
 #define CT_ERROR_IF(a, op, b, msg)                                                                 \
    if (a op b) {                                                                                  \
-        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ") " #op " (" #b ").\n",          \
+        printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg);     \
               __FILE__, __LINE__, msg);                                                           \
    }
 #define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
-        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);   \
+        printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);                        \
    }
 #else
@ -246,14 +233,14 @@
        std::ostringstream os_b;                                                                   \
        os_a << a;                                                                                 \
        os_b << b;                                                                                 \
-        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n",      \
+        printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg,  \
-               __FILE__, __LINE__, msg, os_a.str().c_str(), os_b.str().c_str());                   \
+               os_a.str().c_str(), os_b.str().c_str());                                            \
        throw std::exception();                                                                    \
    }
 #define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
-        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);   \
+        printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);                        \
        throw std::exception();                                                                    \
    }
 #endif
@ -266,8 +253,7 @@
    do {                                                                                           \
        cudaError_t err = (call);                                                                  \
        if (err != cudaSuccess) {                                                                  \
-            printf("\033[1;31m[CUDA]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                   \
+            printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err));          \
                   cudaGetErrorString(err));                                                       \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
@ -276,8 +262,7 @@
    do {                                                                                           \
        cublasStatus_t err = (call);                                                               \
        if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
-            printf("\033[1;31m[cuBLAS]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                 \
+            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err));       \
                   cublasGetStatusName(err));                                                      \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
--- a/110
+++ b/110
@ -1,21 +1,95 @@
-.PHONY: all docs install
+CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
 NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
-all: install docs
+INCLUDE :=
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
 LIBS_GPU := cuda cudart cublas
-docs: SHELL:=/bin/bash
+TARGET = tests
-docs:
+SRC_DIR = .
-	if [ ! -d "docs/.venv" ]; then python3 -m venv "docs/.venv"; fi
+BUILD_DIR = build
 	( \
 		source docs/.venv/bin/activate;\
 		pip install -r docs/requirements.txt; \
 		doxygen docs/Doxyfile; \
 		sphinx-build -b html docs/source docs/build/html; \
 		deactivate \
 	)
-install:
+# Should not need to modify below.
-	install -d $(DEST_DIR)/
+
-	install -d $(DEST_DIR)/include/
+CPU_BUILD_DIR = $(BUILD_DIR)/cpu
-	for file in include/*; do \
+GPU_BUILD_DIR = $(BUILD_DIR)/gpu
-		install -m 644 $$file $(DEST_DIR)/include/; \
+
-	done
+SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
 # Get source files and object files.
 GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
 NVCC_SRC = $(filter %.cu.cpp, $(SRC))
 GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 # If compiling for CPU, all go to GCC. Otherwise, they are split.
 CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
 GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
 GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
 # $(info $$GCC_SRC is [${GCC_SRC}])
 # $(info $$NVCC_SRC is [${NVCC_SRC}])
 # $(info $$GCC_OBJ is [${GCC_OBJ}])
 # $(info $$NVCC_OBJ is [${NVCC_OBJ}])
 # $(info $$CPU_OBJ is [${CPU_OBJ}])
 # $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
 # $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
 HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
 CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
 GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
 INC := $(INCLUDE:%=-I%)
 LIB := $(LIBS_DIR:%=-L%)
 LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
 LD := $(LIBS:%=-l%)
 LD_GPU := $(LIBS_GPU:%=-l%)
 # Reminder:
 # $< = first prerequisite
 # $@ = the target which matched the rule
 # $^ = all prerequisites
 .PHONY: all clean
 all : cpu gpu
 cpu: $(TARGET)CPU
 gpu: $(TARGET)GPU
 $(TARGET)CPU: $(CPU_OBJ)
 	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
 $(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
 	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
 # For GPU, we need to build the NVCC objects, the NVCC linked object, and the
 # regular ones. Then, we link them all together.
 $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
 	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
 $(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
 	$(NVCC) --device-link $^ -o $@
 $(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
 	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
 $(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
 	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
 -include $(CPU_DEPS)
 -include $(GPU_DEPS)
 $(CPU_BUILD_DIR):
 	mkdir -p $@
 $(GPU_BUILD_DIR):
 	mkdir -p $@
 clean:
 	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
--- a/Makefile.template
+++ b/Makefile.template
@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
+NVCC_FLAGS := -MMD -w -Xcompiler
 INCLUDE := <<Put extra include directories here, separated by a space>>
 LIBS_DIR := <<Put library directories here, separated by a space>>
@ -74,7 +74,7 @@ $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
 	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
 $(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
-	$(NVCC) --device-link -lgomp $^ -o $@
+	$(NVCC) --device-link $^ -o $@
 $(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
 	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
--- a/11
+++ b/11
@ -1,11 +0,0 @@
 #!/usr/bin/env bash
 if [ -z "$1" ]
 then
   printf "Usage: setup.sh install_directory [dependencies].\n" 1>&2
   exit 1
 fi
 cd $(dirname $0)
 libname=$(basename $(pwd))
 make DEST_DIR="$1" install
--- a/include/Types.h
+++ b/include/Types.h
--- a/2
+++ b/2
@ -0,0 +1,2 @@
 doxygen docs/Doxyfile
 sphinx-build -b html docs/source docs/build/html
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@ -778,7 +778,7 @@ WARNINGS               = YES
 # will automatically be disabled.
 # The default value is: YES.
-WARN_IF_UNDOCUMENTED   = NO
+WARN_IF_UNDOCUMENTED   = YES
 # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
 # potential errors in the documentation, such as not documenting some parameters
@ -829,7 +829,7 @@ WARN_LOGFILE           =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
-INPUT                  = "./include"
+INPUT                  = "./"
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -27,8 +27,8 @@ extensions = [
    'breathe',
 ]
-breathe_projects = {"CudaTools": "../build/xml"}
+breathe_projects = {"DGEMS": "../build/xml"}
-breathe_default_project = "CudaTools"
+breathe_default_project = "DGEMS"
 bibtex_bibfiles = ['refs.bib']
--- a/docs/source/core.rst
+++ b/docs/source/core.rst
@ -42,8 +42,6 @@ Host-Device Automation
 Compilation Options
 -------------------
 .. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES
 .. doxygendefine:: CUDATOOLS_USE_EIGEN
 .. doxygendefine:: CUDATOOLS_USE_PYTHON
 Macro Functions
 ===============
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
--- a/samples/1_CoreKernel/Makefile
+++ b/samples/1_CoreKernel/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
-INCLUDE := ../../include/CudaTools
+INCLUDE := ../../
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/2_CoreClass/Makefile
+++ b/samples/2_CoreClass/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
-INCLUDE := ../../include/CudaTools
+INCLUDE := ../../
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/3_ArrayKernel/Makefile
+++ b/samples/3_ArrayKernel/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
-INCLUDE := ../../include/CudaTools
+INCLUDE := ../../
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/4_ArrayFunctions/Makefile
+++ b/samples/4_ArrayFunctions/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
-INCLUDE := ../../include/CudaTools
+INCLUDE := ../../
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/samples/5_SimpleGraph/Makefile
+++ b/samples/5_SimpleGraph/Makefile
@ -1,9 +1,9 @@
 CC := g++-10
 NVCC := nvcc
-CFLAGS := -std=c++17 -MMD -Wall -fopenmp
+CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
-INCLUDE := ../../include/CudaTools
+INCLUDE := ../../
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
--- a/tests/tests.cu.cpp
+++ b/tests/tests.cu.cpp
@ -1,9 +1,9 @@
 #define CUDATOOLS_IMPLEMENTATION
 #define CUDATOOLS_ARRAY_MAX_AXES 8
-#include <Array.h>
+#include "Array.h"
-#include <BLAS.h>
+#include "BLAS.h"
-#include <Core.h>
+#include "Core.h"
-#include <Types.h>
+#include "Types.h"
 #include <Eigen/Core>
 #include <chrono>
--- a/tests/Makefile
+++ b/tests/Makefile
@ -1,95 +0,0 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -std=c++17 -MMD -Wall -fopenmp
 NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
 INCLUDE := ../include/CudaTools
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
 LIBS_GPU := cuda cudart cublas
 TARGET = tests
 SRC_DIR = .
 BUILD_DIR = build
 # Should not need to modify below.
 CPU_BUILD_DIR = $(BUILD_DIR)/cpu
 GPU_BUILD_DIR = $(BUILD_DIR)/gpu
 SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
 # Get source files and object files.
 GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
 NVCC_SRC = $(filter %.cu.cpp, $(SRC))
 GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 # If compiling for CPU, all go to GCC. Otherwise, they are split.
 CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
 GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
 GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
 # $(info $$GCC_SRC is [${GCC_SRC}])
 # $(info $$NVCC_SRC is [${NVCC_SRC}])
 # $(info $$GCC_OBJ is [${GCC_OBJ}])
 # $(info $$NVCC_OBJ is [${NVCC_OBJ}])
 # $(info $$CPU_OBJ is [${CPU_OBJ}])
 # $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
 # $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
 HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
 CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
 GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
 INC := $(INCLUDE:%=-I%)
 LIB := $(LIBS_DIR:%=-L%)
 LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
 LD := $(LIBS:%=-l%)
 LD_GPU := $(LIBS_GPU:%=-l%)
 # Reminder:
 # $< = first prerequisite
 # $@ = the target which matched the rule
 # $^ = all prerequisites
 .PHONY: all clean
 all : cpu gpu
 cpu: $(TARGET)CPU
 gpu: $(TARGET)GPU
 $(TARGET)CPU: $(CPU_OBJ)
 	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
 $(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
 	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
 # For GPU, we need to build the NVCC objects, the NVCC linked object, and the
 # regular ones. Then, we link them all together.
 $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
 	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
 $(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
 	$(NVCC) --device-link -lgomp $^ -o $@
 $(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
 	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
 $(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
 	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
 -include $(CPU_DEPS)
 -include $(GPU_DEPS)
 $(CPU_BUILD_DIR):
 	mkdir -p $@
 $(GPU_BUILD_DIR):
 	mkdir -p $@
 clean:
 	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
		`@ -0,0 +1,2 @@`
							`doxygen docs/Doxyfile`
							`sphinx-build -b html docs/source docs/build/html`