Compare commits

...

10 Commits

21 changed files with 243 additions and 153 deletions

0
DEPENDENCIES Normal file
View File

110
Makefile
View File

@ -1,95 +1,21 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
.PHONY: all docs install
INCLUDE :=
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas
all: install docs
TARGET = tests
SRC_DIR = .
BUILD_DIR = build
docs: SHELL:=/bin/bash
docs:
if [ ! -d "docs/.venv" ]; then python3 -m venv "docs/.venv"; fi
( \
source docs/.venv/bin/activate;\
pip install -r docs/requirements.txt; \
doxygen docs/Doxyfile; \
sphinx-build -b html docs/source docs/build/html; \
deactivate \
)
# Should not need to modify below.
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites
.PHONY: all clean
all : cpu gpu
cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
$(TARGET)CPU: $(CPU_OBJ)
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
$(NVCC) --device-link $^ -o $@
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
-include $(CPU_DEPS)
-include $(GPU_DEPS)
$(CPU_BUILD_DIR):
mkdir -p $@
$(GPU_BUILD_DIR):
mkdir -p $@
clean:
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
install:
install -d $(DEST_DIR)/
install -d $(DEST_DIR)/include/
for file in include/*; do \
install -m 644 $$file $(DEST_DIR)/include/; \
done

View File

@ -1,7 +1,7 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
INCLUDE := <<Put extra include directories here, separated by a space>>
LIBS_DIR := <<Put library directories here, separated by a space>>
@ -74,7 +74,7 @@ $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
$(NVCC) --device-link $^ -o $@
$(NVCC) --device-link -lgomp $^ -o $@
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

11
SETUP Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
if [ -z "$1" ]
then
printf "Usage: setup.sh install_directory [dependencies].\n" 1>&2
exit 1
fi
cd $(dirname $0)
libname=$(basename $(pwd))
make DEST_DIR="$1" install

View File

@ -1,2 +0,0 @@
doxygen docs/Doxyfile
sphinx-build -b html docs/source docs/build/html

View File

@ -778,7 +778,7 @@ WARNINGS = YES
# will automatically be disabled.
# The default value is: YES.
WARN_IF_UNDOCUMENTED = YES
WARN_IF_UNDOCUMENTED = NO
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
# potential errors in the documentation, such as not documenting some parameters
@ -829,7 +829,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = "./"
INPUT = "./include"
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

View File

@ -27,8 +27,8 @@ extensions = [
'breathe',
]
breathe_projects = {"DGEMS": "../build/xml"}
breathe_default_project = "DGEMS"
breathe_projects = {"CudaTools": "../build/xml"}
breathe_default_project = "CudaTools"
bibtex_bibfiles = ['refs.bib']

View File

@ -42,6 +42,8 @@ Host-Device Automation
Compilation Options
-------------------
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES
.. doxygendefine:: CUDATOOLS_USE_EIGEN
.. doxygendefine:: CUDATOOLS_USE_PYTHON
Macro Functions
===============

View File

@ -4,7 +4,7 @@
#include "Core.h"
#include "Macros.h"
#include "Types.h"
#include <Eigen/Dense>
#include <cmath>
#include <complex>
#include <cstdlib>
@ -12,6 +12,16 @@
#include <random>
#include <type_traits>
#ifdef CUDATOOLS_USE_EIGEN
#include <Eigen/Dense>
#endif
#ifdef CUDATOOLS_USE_PYTHON
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
namespace py = pybind11;
#endif
#ifdef DEVICE
#define POINTER pDevice
#else
@ -22,6 +32,7 @@ using namespace CudaTools::Types;
namespace CudaTools {
#ifdef CUDATOOLS_USE_EIGEN
template <typename T>
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
@ -30,9 +41,16 @@ template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>;
template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
#endif
template <typename T> class Array;
using Slice = std::pair<uint32_t, uint32_t>;
struct Slice {
uint32_t first;
uint32_t second;
HD Slice(const std::initializer_list<uint32_t> i)
: first(*i.begin()), second(*(i.begin() + 1)) {}
};
template <typename T> class ArrayIterator {
private:
@ -181,7 +199,7 @@ template <typename T> class Array {
uint32_t mEndOffset = 0;
void freeArrays() {
HD void freeArrays() {
#ifndef DEVICE
if (not mIsView) {
if (pDevice != nullptr) CudaTools::free(pDevice);
@ -490,6 +508,7 @@ template <typename T> class Array {
*/
HD void flatten() { reshape({mShape.mItems}); };
#ifdef CUDATOOLS_USE_EIGEN
/**
* Returns the Eigen::Map of this Array.
*/
@ -501,6 +520,7 @@ template <typename T> class Array {
return EigenAdaptConst<ComplexConversion<T>>((ComplexConversion<T>*)POINTER, mShape.rows(),
mShape.cols());
};
#endif
/**
* Gets the Shape of the Array.
@ -528,7 +548,7 @@ template <typename T> class Array {
/**
* Copies this Array and returns a new Array with the same memory.
*/
HD Array copy() const {
Array copy() const {
Array<T> arr(mShape, (pDevice == nullptr));
auto arr_it = arr.begin();
@ -679,6 +699,8 @@ template <typename T> class Array {
* Its self assigning version is transpose. This is restricted to numerical types.
* \brief Host only
*/
#ifdef CUDATOOLS_USE_EIGEN
Array transposed() const {
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
@ -708,6 +730,7 @@ template <typename T> class Array {
Array<T> inv(shape());
inv.eigenMap() = this->eigenMap().inverse();
};
#endif
/**
* Pins the memory (page locks) for faster memory transfer in concurrent
@ -733,6 +756,22 @@ template <typename T> class Array {
CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
};
#ifdef CUDATOOLS_USE_PYTHON
/**
* Returns a py::array for making an Array available as a Python numpy array.
*/
py::array pyArray() const {
std::vector<py::ssize_t> dims, strides;
for (uint iAxis = 0; iAxis < mShape.axes(); ++iAxis) {
dims.push_back(static_cast<py::ssize_t>(mShape.dim(iAxis)));
strides.push_back(sizeof(T) * static_cast<py::ssize_t>(mShape.stride(iAxis)));
}
return py::array_t<T, py::array::f_style>(
py::buffer_info((void*)pHost, sizeof(T), py::format_descriptor<T>::format(),
static_cast<py::ssize_t>(mShape.axes()), dims, strides));
};
#endif
};
template <typename T>
@ -774,9 +813,11 @@ template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>
bool negative = false;
for (auto it = arr.begin(); it != arr.end(); ++it) {
T val = *it;
if (*it < 0) {
negative = true;
val *= -1;
if constexpr (not std::is_unsigned<T>::value) {
if (*it < 0) {
negative = true;
val *= -1;
}
}
max_val = (val > max_val) ? val : max_val;
}

View File

@ -1,6 +1,10 @@
#ifndef CUDATOOLS_BLAS_H
#define CUDATOOLS_BLAS_H
#ifndef CUDATOOLS_USE_EIGEN
#error "Cannot use CudaTools BLAS.h header without Eigen."
#endif
#include "Array.h"
#include "Core.h"
#include "Macros.h"
@ -138,7 +142,7 @@ template <typename T> class Batch {
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()});
for (uint32_t i = 0; i < mBatchSize; ++i) {
#ifdef CUDA
#ifdef CUDACC
mBatch[i] = batch[i].dataDevice();
#else
mBatch[i] = batch[i].data();
@ -154,7 +158,7 @@ template <typename T> class Batch {
void add(const Array<T>& arr) {
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays");
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays");
#ifdef CUDA
#ifdef CUDACC
mBatch[mCount] = arr.dataDevice();
#else
mBatch[mCount] = arr.data();
@ -270,7 +274,7 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
uint32_t rows = A.shape().rows();
uint32_t cols = A.shape().cols();
T a = alpha, b = beta;
#ifdef CUDA
#ifdef CUDACC
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
if (bi.size == 1) {
invoke<T>(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv,
@ -315,7 +319,7 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
uint32_t n = B.shape().cols();
T a = alpha, b = beta;
#ifdef CUDA
#ifdef CUDACC
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
if (bi.size == 1) {
@ -368,7 +372,7 @@ StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const boo
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(),
"Rows of 'A' and columns of 'C' need to match.");
#ifdef CUDA
#ifdef CUDACC
uint32_t m = C.shape().rows();
uint32_t n = C.shape().cols();
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
@ -544,7 +548,7 @@ class PLUBatch : public Batch<T> {
* Computes the inplace PLU decomposition of batch of arrays.
*/
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) {
#ifdef CUDA
#ifdef CUDACC
uint32_t n = this->mShape.rows();
CUBLAS_CHECK(
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
@ -575,7 +579,7 @@ class PLUBatch : public Batch<T> {
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(),
"The length of each column of b must match the matrix rank");
#ifdef CUDA
#ifdef CUDACC
uint32_t n = b.shape().rows();
uint32_t nrhs = b.shape().cols();
CUBLAS_CHECK(

View File

@ -143,7 +143,7 @@ Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM)
template <typename F, typename... Args>
StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
#ifdef CUDA
#ifdef CUDACC
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
Manager::get()->stream(sett.stream.mId)>>>(args...);
#else
@ -256,8 +256,8 @@ template <typename F, typename... Args> class Graph {
~Graph() {
#ifdef CUDACC
CUDA_CHECK(cudaGraphDestroy(mGraph));
CUDA_CHECK(cudaGraphExecDestroy(mInstance));
cudaGraphDestroy(mGraph);
cudaGraphExecDestroy(mInstance);
#endif
};
@ -316,9 +316,11 @@ struct GraphManager {
};
}; // namespace CudaTools
#endif // CUDATOOLS_H
#ifdef CUDATOOLS_IMPLEMENTATION
#ifndef __CUDATOOLS_IMPLEMENTED__
#define __CUDATOOLS_IMPLEMENTED__
namespace CudaTools {
//////////////////////
@ -400,9 +402,9 @@ Manager::Manager(const std::vector<std::string>& names) {
Manager::~Manager() {
#ifdef CUDACC
for (auto& it : mStreams) {
CUDA_CHECK(cudaStreamDestroy(it.second));
cudaStreamDestroy(it.second);
}
CUBLAS_CHECK(cublasDestroy(mCublas));
cublasDestroy(mCublas);
// CUSPARSE_CHECK(cusparseDestroy(mCusparse));
#endif
}
@ -640,7 +642,7 @@ Event::Event() {
Event::~Event() {
#ifdef CUDACC
CUDA_CHECK(cudaEventDestroy(mEvent));
cudaEventDestroy(mEvent);
#endif
}
@ -656,9 +658,6 @@ void Event::record(const StreamID& stream) {
GraphManager::~GraphManager() {
#ifdef CUDACC
for (void* func : mHostData) {
delete func;
}
for (Event* event : mEvents) {
delete event;
}
@ -680,6 +679,5 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
}
}; // namespace CudaTools
#endif
#endif // CUDATOOLS_IMPLEMENTATION
#endif // CUDATOOLS_H

View File

@ -49,6 +49,18 @@
*/
#define SHARED
/**
* \def CUDATOOLS_USE_EIGEN
* Compile the CudaTools library with Eigen support.
*/
#define CUDATOOLS_USE_EIGEN
/**
* \def CUDATOOLS_USE_PYTHON
* Compile the CudaTools library with Python support.
*/
#define CUDATOOLS_USE_PYTHON
/**
* \def KERNEL(call, settings, ...)
* Used to call a CUDA kernel.
@ -218,12 +230,13 @@
#ifdef DEVICE
#define CT_ERROR_IF(a, op, b, msg) \
if (a op b) { \
printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", \
__FILE__, __LINE__, msg); \
}
#define CT_ERROR(a, msg) \
if (a) { \
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
}
#else
@ -233,14 +246,14 @@
std::ostringstream os_b; \
os_a << a; \
os_b << b; \
printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \
os_a.str().c_str(), os_b.str().c_str()); \
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", \
__FILE__, __LINE__, msg, os_a.str().c_str(), os_b.str().c_str()); \
throw std::exception(); \
}
#define CT_ERROR(a, msg) \
if (a) { \
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
throw std::exception(); \
}
#endif
@ -253,7 +266,8 @@
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
printf("\033[1;31m[CUDA]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
throw std::exception(); \
} \
} while (0)
@ -262,7 +276,8 @@
do { \
cublasStatus_t err = (call); \
if (err != CUBLAS_STATUS_SUCCESS) { \
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err)); \
printf("\033[1;31m[cuBLAS]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__, \
cublasGetStatusName(err)); \
throw std::exception(); \
} \
} while (0)

View File

@ -1,9 +1,9 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
INCLUDE := ../../
INCLUDE := ../../include/CudaTools
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=

View File

@ -1,9 +1,9 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
INCLUDE := ../../
INCLUDE := ../../include/CudaTools
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=

View File

@ -1,9 +1,9 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
INCLUDE := ../../
INCLUDE := ../../include/CudaTools
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=

View File

@ -1,9 +1,9 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
INCLUDE := ../../
INCLUDE := ../../include/CudaTools
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=

View File

@ -1,9 +1,9 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
INCLUDE := ../../
INCLUDE := ../../include/CudaTools
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=

95
tests/Makefile Normal file
View File

@ -0,0 +1,95 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
INCLUDE := ../include/CudaTools
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas
TARGET = tests
SRC_DIR = .
BUILD_DIR = build
# Should not need to modify below.
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites
.PHONY: all clean
all : cpu gpu
cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
$(TARGET)CPU: $(CPU_OBJ)
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
$(NVCC) --device-link -lgomp $^ -o $@
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
-include $(CPU_DEPS)
-include $(GPU_DEPS)
$(CPU_BUILD_DIR):
mkdir -p $@
$(GPU_BUILD_DIR):
mkdir -p $@
clean:
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU

View File

@ -1,9 +1,9 @@
#define CUDATOOLS_IMPLEMENTATION
#define CUDATOOLS_ARRAY_MAX_AXES 8
#include "Array.h"
#include "BLAS.h"
#include "Core.h"
#include "Types.h"
#include <Array.h>
#include <BLAS.h>
#include <Core.h>
#include <Types.h>
#include <Eigen/Core>
#include <chrono>