Compare commits
10 Commits
9d66e0905c
...
45e8e274cb
| Author | SHA1 | Date | |
|---|---|---|---|
| 45e8e274cb | |||
| cdd35acd34 | |||
| 0607618407 | |||
| 4371ef8162 | |||
| 828b83a139 | |||
| c7ee614324 | |||
| 443929521d | |||
| fb63a6c2bc | |||
| 1248a58265 | |||
| 9c66a61288 |
0
DEPENDENCIES
Normal file
0
DEPENDENCIES
Normal file
110
Makefile
110
Makefile
@ -1,95 +1,21 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
.PHONY: all docs install
|
||||
|
||||
INCLUDE :=
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
LIBS_GPU := cuda cudart cublas
|
||||
all: install docs
|
||||
|
||||
TARGET = tests
|
||||
SRC_DIR = .
|
||||
BUILD_DIR = build
|
||||
docs: SHELL:=/bin/bash
|
||||
docs:
|
||||
if [ ! -d "docs/.venv" ]; then python3 -m venv "docs/.venv"; fi
|
||||
( \
|
||||
source docs/.venv/bin/activate;\
|
||||
pip install -r docs/requirements.txt; \
|
||||
doxygen docs/Doxyfile; \
|
||||
sphinx-build -b html docs/source docs/build/html; \
|
||||
deactivate \
|
||||
)
|
||||
|
||||
# Should not need to modify below.
|
||||
|
||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||
|
||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||
|
||||
# Get source files and object files.
|
||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
|
||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
|
||||
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||
|
||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||
|
||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||
|
||||
INC := $(INCLUDE:%=-I%)
|
||||
LIB := $(LIBS_DIR:%=-L%)
|
||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||
LD := $(LIBS:%=-l%)
|
||||
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||
|
||||
# Reminder:
|
||||
# $< = first prerequisite
|
||||
# $@ = the target which matched the rule
|
||||
# $^ = all prerequisites
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
all : cpu gpu
|
||||
|
||||
cpu: $(TARGET)CPU
|
||||
gpu: $(TARGET)GPU
|
||||
|
||||
$(TARGET)CPU: $(CPU_OBJ)
|
||||
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
|
||||
|
||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||
|
||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||
# regular ones. Then, we link them all together.
|
||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||
|
||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(NVCC) --device-link $^ -o $@
|
||||
|
||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
||||
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||
|
||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||
|
||||
-include $(CPU_DEPS)
|
||||
-include $(GPU_DEPS)
|
||||
|
||||
$(CPU_BUILD_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
$(GPU_BUILD_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
clean:
|
||||
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
||||
install:
|
||||
install -d $(DEST_DIR)/
|
||||
install -d $(DEST_DIR)/include/
|
||||
for file in include/*; do \
|
||||
install -m 644 $$file $(DEST_DIR)/include/; \
|
||||
done
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
||||
|
||||
INCLUDE := <<Put extra include directories here, separated by a space>>
|
||||
LIBS_DIR := <<Put library directories here, separated by a space>>
|
||||
@ -74,7 +74,7 @@ $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||
|
||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(NVCC) --device-link $^ -o $@
|
||||
$(NVCC) --device-link -lgomp $^ -o $@
|
||||
|
||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
||||
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||
|
||||
11
SETUP
Executable file
11
SETUP
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
if [ -z "$1" ]
|
||||
then
|
||||
printf "Usage: setup.sh install_directory [dependencies].\n" 1>&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd $(dirname $0)
|
||||
libname=$(basename $(pwd))
|
||||
make DEST_DIR="$1" install
|
||||
@ -1,2 +0,0 @@
|
||||
doxygen docs/Doxyfile
|
||||
sphinx-build -b html docs/source docs/build/html
|
||||
@ -778,7 +778,7 @@ WARNINGS = YES
|
||||
# will automatically be disabled.
|
||||
# The default value is: YES.
|
||||
|
||||
WARN_IF_UNDOCUMENTED = YES
|
||||
WARN_IF_UNDOCUMENTED = NO
|
||||
|
||||
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
|
||||
# potential errors in the documentation, such as not documenting some parameters
|
||||
@ -829,7 +829,7 @@ WARN_LOGFILE =
|
||||
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
||||
# Note: If this tag is empty the current directory is searched.
|
||||
|
||||
INPUT = "./"
|
||||
INPUT = "./include"
|
||||
|
||||
# This tag can be used to specify the character encoding of the source files
|
||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
||||
|
||||
@ -27,8 +27,8 @@ extensions = [
|
||||
'breathe',
|
||||
]
|
||||
|
||||
breathe_projects = {"DGEMS": "../build/xml"}
|
||||
breathe_default_project = "DGEMS"
|
||||
breathe_projects = {"CudaTools": "../build/xml"}
|
||||
breathe_default_project = "CudaTools"
|
||||
|
||||
bibtex_bibfiles = ['refs.bib']
|
||||
|
||||
|
||||
@ -42,6 +42,8 @@ Host-Device Automation
|
||||
Compilation Options
|
||||
-------------------
|
||||
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES
|
||||
.. doxygendefine:: CUDATOOLS_USE_EIGEN
|
||||
.. doxygendefine:: CUDATOOLS_USE_PYTHON
|
||||
|
||||
Macro Functions
|
||||
===============
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#include "Core.h"
|
||||
#include "Macros.h"
|
||||
#include "Types.h"
|
||||
#include <Eigen/Dense>
|
||||
|
||||
#include <cmath>
|
||||
#include <complex>
|
||||
#include <cstdlib>
|
||||
@ -12,6 +12,16 @@
|
||||
#include <random>
|
||||
#include <type_traits>
|
||||
|
||||
#ifdef CUDATOOLS_USE_EIGEN
|
||||
#include <Eigen/Dense>
|
||||
#endif
|
||||
|
||||
#ifdef CUDATOOLS_USE_PYTHON
|
||||
#include <pybind11/numpy.h>
|
||||
#include <pybind11/pybind11.h>
|
||||
namespace py = pybind11;
|
||||
#endif
|
||||
|
||||
#ifdef DEVICE
|
||||
#define POINTER pDevice
|
||||
#else
|
||||
@ -22,6 +32,7 @@ using namespace CudaTools::Types;
|
||||
|
||||
namespace CudaTools {
|
||||
|
||||
#ifdef CUDATOOLS_USE_EIGEN
|
||||
template <typename T>
|
||||
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
|
||||
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
|
||||
@ -30,9 +41,16 @@ template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>;
|
||||
template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
|
||||
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
|
||||
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
|
||||
#endif
|
||||
|
||||
template <typename T> class Array;
|
||||
using Slice = std::pair<uint32_t, uint32_t>;
|
||||
|
||||
struct Slice {
|
||||
uint32_t first;
|
||||
uint32_t second;
|
||||
HD Slice(const std::initializer_list<uint32_t> i)
|
||||
: first(*i.begin()), second(*(i.begin() + 1)) {}
|
||||
};
|
||||
|
||||
template <typename T> class ArrayIterator {
|
||||
private:
|
||||
@ -181,7 +199,7 @@ template <typename T> class Array {
|
||||
|
||||
uint32_t mEndOffset = 0;
|
||||
|
||||
void freeArrays() {
|
||||
HD void freeArrays() {
|
||||
#ifndef DEVICE
|
||||
if (not mIsView) {
|
||||
if (pDevice != nullptr) CudaTools::free(pDevice);
|
||||
@ -490,6 +508,7 @@ template <typename T> class Array {
|
||||
*/
|
||||
HD void flatten() { reshape({mShape.mItems}); };
|
||||
|
||||
#ifdef CUDATOOLS_USE_EIGEN
|
||||
/**
|
||||
* Returns the Eigen::Map of this Array.
|
||||
*/
|
||||
@ -501,6 +520,7 @@ template <typename T> class Array {
|
||||
return EigenAdaptConst<ComplexConversion<T>>((ComplexConversion<T>*)POINTER, mShape.rows(),
|
||||
mShape.cols());
|
||||
};
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Gets the Shape of the Array.
|
||||
@ -528,7 +548,7 @@ template <typename T> class Array {
|
||||
/**
|
||||
* Copies this Array and returns a new Array with the same memory.
|
||||
*/
|
||||
HD Array copy() const {
|
||||
Array copy() const {
|
||||
Array<T> arr(mShape, (pDevice == nullptr));
|
||||
|
||||
auto arr_it = arr.begin();
|
||||
@ -679,6 +699,8 @@ template <typename T> class Array {
|
||||
* Its self assigning version is transpose. This is restricted to numerical types.
|
||||
* \brief Host only
|
||||
*/
|
||||
|
||||
#ifdef CUDATOOLS_USE_EIGEN
|
||||
Array transposed() const {
|
||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
||||
@ -708,6 +730,7 @@ template <typename T> class Array {
|
||||
Array<T> inv(shape());
|
||||
inv.eigenMap() = this->eigenMap().inverse();
|
||||
};
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Pins the memory (page locks) for faster memory transfer in concurrent
|
||||
@ -733,6 +756,22 @@ template <typename T> class Array {
|
||||
CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
|
||||
return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
||||
};
|
||||
|
||||
#ifdef CUDATOOLS_USE_PYTHON
|
||||
/**
|
||||
* Returns a py::array for making an Array available as a Python numpy array.
|
||||
*/
|
||||
py::array pyArray() const {
|
||||
std::vector<py::ssize_t> dims, strides;
|
||||
for (uint iAxis = 0; iAxis < mShape.axes(); ++iAxis) {
|
||||
dims.push_back(static_cast<py::ssize_t>(mShape.dim(iAxis)));
|
||||
strides.push_back(sizeof(T) * static_cast<py::ssize_t>(mShape.stride(iAxis)));
|
||||
}
|
||||
return py::array_t<T, py::array::f_style>(
|
||||
py::buffer_info((void*)pHost, sizeof(T), py::format_descriptor<T>::format(),
|
||||
static_cast<py::ssize_t>(mShape.axes()), dims, strides));
|
||||
};
|
||||
#endif
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
@ -774,9 +813,11 @@ template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>
|
||||
bool negative = false;
|
||||
for (auto it = arr.begin(); it != arr.end(); ++it) {
|
||||
T val = *it;
|
||||
if (*it < 0) {
|
||||
negative = true;
|
||||
val *= -1;
|
||||
if constexpr (not std::is_unsigned<T>::value) {
|
||||
if (*it < 0) {
|
||||
negative = true;
|
||||
val *= -1;
|
||||
}
|
||||
}
|
||||
max_val = (val > max_val) ? val : max_val;
|
||||
}
|
||||
@ -1,6 +1,10 @@
|
||||
#ifndef CUDATOOLS_BLAS_H
|
||||
#define CUDATOOLS_BLAS_H
|
||||
|
||||
#ifndef CUDATOOLS_USE_EIGEN
|
||||
#error "Cannot use CudaTools BLAS.h header without Eigen."
|
||||
#endif
|
||||
|
||||
#include "Array.h"
|
||||
#include "Core.h"
|
||||
#include "Macros.h"
|
||||
@ -138,7 +142,7 @@ template <typename T> class Batch {
|
||||
|
||||
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()});
|
||||
for (uint32_t i = 0; i < mBatchSize; ++i) {
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
mBatch[i] = batch[i].dataDevice();
|
||||
#else
|
||||
mBatch[i] = batch[i].data();
|
||||
@ -154,7 +158,7 @@ template <typename T> class Batch {
|
||||
void add(const Array<T>& arr) {
|
||||
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays");
|
||||
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays");
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
mBatch[mCount] = arr.dataDevice();
|
||||
#else
|
||||
mBatch[mCount] = arr.data();
|
||||
@ -270,7 +274,7 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
|
||||
uint32_t rows = A.shape().rows();
|
||||
uint32_t cols = A.shape().cols();
|
||||
T a = alpha, b = beta;
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||
if (bi.size == 1) {
|
||||
invoke<T>(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv,
|
||||
@ -315,7 +319,7 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
|
||||
uint32_t n = B.shape().cols();
|
||||
|
||||
T a = alpha, b = beta;
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||
|
||||
if (bi.size == 1) {
|
||||
@ -368,7 +372,7 @@ StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const boo
|
||||
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(),
|
||||
"Rows of 'A' and columns of 'C' need to match.");
|
||||
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
uint32_t m = C.shape().rows();
|
||||
uint32_t n = C.shape().cols();
|
||||
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
|
||||
@ -544,7 +548,7 @@ class PLUBatch : public Batch<T> {
|
||||
* Computes the inplace PLU decomposition of batch of arrays.
|
||||
*/
|
||||
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) {
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
uint32_t n = this->mShape.rows();
|
||||
CUBLAS_CHECK(
|
||||
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||
@ -575,7 +579,7 @@ class PLUBatch : public Batch<T> {
|
||||
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(),
|
||||
"The length of each column of b must match the matrix rank");
|
||||
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
uint32_t n = b.shape().rows();
|
||||
uint32_t nrhs = b.shape().cols();
|
||||
CUBLAS_CHECK(
|
||||
@ -143,7 +143,7 @@ Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM)
|
||||
|
||||
template <typename F, typename... Args>
|
||||
StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
|
||||
#ifdef CUDA
|
||||
#ifdef CUDACC
|
||||
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
|
||||
Manager::get()->stream(sett.stream.mId)>>>(args...);
|
||||
#else
|
||||
@ -256,8 +256,8 @@ template <typename F, typename... Args> class Graph {
|
||||
|
||||
~Graph() {
|
||||
#ifdef CUDACC
|
||||
CUDA_CHECK(cudaGraphDestroy(mGraph));
|
||||
CUDA_CHECK(cudaGraphExecDestroy(mInstance));
|
||||
cudaGraphDestroy(mGraph);
|
||||
cudaGraphExecDestroy(mInstance);
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -316,9 +316,11 @@ struct GraphManager {
|
||||
};
|
||||
|
||||
}; // namespace CudaTools
|
||||
#endif // CUDATOOLS_H
|
||||
|
||||
#ifdef CUDATOOLS_IMPLEMENTATION
|
||||
|
||||
#ifndef __CUDATOOLS_IMPLEMENTED__
|
||||
#define __CUDATOOLS_IMPLEMENTED__
|
||||
namespace CudaTools {
|
||||
|
||||
//////////////////////
|
||||
@ -400,9 +402,9 @@ Manager::Manager(const std::vector<std::string>& names) {
|
||||
Manager::~Manager() {
|
||||
#ifdef CUDACC
|
||||
for (auto& it : mStreams) {
|
||||
CUDA_CHECK(cudaStreamDestroy(it.second));
|
||||
cudaStreamDestroy(it.second);
|
||||
}
|
||||
CUBLAS_CHECK(cublasDestroy(mCublas));
|
||||
cublasDestroy(mCublas);
|
||||
// CUSPARSE_CHECK(cusparseDestroy(mCusparse));
|
||||
#endif
|
||||
}
|
||||
@ -640,7 +642,7 @@ Event::Event() {
|
||||
|
||||
Event::~Event() {
|
||||
#ifdef CUDACC
|
||||
CUDA_CHECK(cudaEventDestroy(mEvent));
|
||||
cudaEventDestroy(mEvent);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -656,9 +658,6 @@ void Event::record(const StreamID& stream) {
|
||||
|
||||
GraphManager::~GraphManager() {
|
||||
#ifdef CUDACC
|
||||
for (void* func : mHostData) {
|
||||
delete func;
|
||||
}
|
||||
for (Event* event : mEvents) {
|
||||
delete event;
|
||||
}
|
||||
@ -680,6 +679,5 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
|
||||
}
|
||||
|
||||
}; // namespace CudaTools
|
||||
#endif
|
||||
#endif // CUDATOOLS_IMPLEMENTATION
|
||||
|
||||
#endif // CUDATOOLS_H
|
||||
@ -49,6 +49,18 @@
|
||||
*/
|
||||
#define SHARED
|
||||
|
||||
/**
|
||||
* \def CUDATOOLS_USE_EIGEN
|
||||
* Compile the CudaTools library with Eigen support.
|
||||
*/
|
||||
#define CUDATOOLS_USE_EIGEN
|
||||
|
||||
/**
|
||||
* \def CUDATOOLS_USE_PYTHON
|
||||
* Compile the CudaTools library with Python support.
|
||||
*/
|
||||
#define CUDATOOLS_USE_PYTHON
|
||||
|
||||
/**
|
||||
* \def KERNEL(call, settings, ...)
|
||||
* Used to call a CUDA kernel.
|
||||
@ -218,12 +230,13 @@
|
||||
#ifdef DEVICE
|
||||
#define CT_ERROR_IF(a, op, b, msg) \
|
||||
if (a op b) { \
|
||||
printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \
|
||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", \
|
||||
__FILE__, __LINE__, msg); \
|
||||
}
|
||||
|
||||
#define CT_ERROR(a, msg) \
|
||||
if (a) { \
|
||||
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||
}
|
||||
#else
|
||||
|
||||
@ -233,14 +246,14 @@
|
||||
std::ostringstream os_b; \
|
||||
os_a << a; \
|
||||
os_b << b; \
|
||||
printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \
|
||||
os_a.str().c_str(), os_b.str().c_str()); \
|
||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", \
|
||||
__FILE__, __LINE__, msg, os_a.str().c_str(), os_b.str().c_str()); \
|
||||
throw std::exception(); \
|
||||
}
|
||||
|
||||
#define CT_ERROR(a, msg) \
|
||||
if (a) { \
|
||||
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||
throw std::exception(); \
|
||||
}
|
||||
#endif
|
||||
@ -253,7 +266,8 @@
|
||||
do { \
|
||||
cudaError_t err = (call); \
|
||||
if (err != cudaSuccess) { \
|
||||
printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
printf("\033[1;31m[CUDA]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__, \
|
||||
cudaGetErrorString(err)); \
|
||||
throw std::exception(); \
|
||||
} \
|
||||
} while (0)
|
||||
@ -262,7 +276,8 @@
|
||||
do { \
|
||||
cublasStatus_t err = (call); \
|
||||
if (err != CUBLAS_STATUS_SUCCESS) { \
|
||||
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err)); \
|
||||
printf("\033[1;31m[cuBLAS]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__, \
|
||||
cublasGetStatusName(err)); \
|
||||
throw std::exception(); \
|
||||
} \
|
||||
} while (0)
|
||||
@ -1,9 +1,9 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
||||
|
||||
INCLUDE := ../../
|
||||
INCLUDE := ../../include/CudaTools
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
||||
|
||||
INCLUDE := ../../
|
||||
INCLUDE := ../../include/CudaTools
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
||||
|
||||
INCLUDE := ../../
|
||||
INCLUDE := ../../include/CudaTools
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
||||
|
||||
INCLUDE := ../../
|
||||
INCLUDE := ../../include/CudaTools
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
||||
|
||||
INCLUDE := ../../
|
||||
INCLUDE := ../../include/CudaTools
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
|
||||
95
tests/Makefile
Normal file
95
tests/Makefile
Normal file
@ -0,0 +1,95 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
||||
|
||||
INCLUDE := ../include/CudaTools
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
LIBS_GPU := cuda cudart cublas
|
||||
|
||||
TARGET = tests
|
||||
SRC_DIR = .
|
||||
BUILD_DIR = build
|
||||
|
||||
# Should not need to modify below.
|
||||
|
||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||
|
||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||
|
||||
# Get source files and object files.
|
||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
|
||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
|
||||
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||
|
||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||
|
||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||
|
||||
INC := $(INCLUDE:%=-I%)
|
||||
LIB := $(LIBS_DIR:%=-L%)
|
||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||
LD := $(LIBS:%=-l%)
|
||||
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||
|
||||
# Reminder:
|
||||
# $< = first prerequisite
|
||||
# $@ = the target which matched the rule
|
||||
# $^ = all prerequisites
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
all : cpu gpu
|
||||
|
||||
cpu: $(TARGET)CPU
|
||||
gpu: $(TARGET)GPU
|
||||
|
||||
$(TARGET)CPU: $(CPU_OBJ)
|
||||
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
|
||||
|
||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||
|
||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||
# regular ones. Then, we link them all together.
|
||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||
|
||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(NVCC) --device-link -lgomp $^ -o $@
|
||||
|
||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
||||
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||
|
||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||
|
||||
-include $(CPU_DEPS)
|
||||
-include $(GPU_DEPS)
|
||||
|
||||
$(CPU_BUILD_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
$(GPU_BUILD_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
clean:
|
||||
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
||||
@ -1,9 +1,9 @@
|
||||
#define CUDATOOLS_IMPLEMENTATION
|
||||
#define CUDATOOLS_ARRAY_MAX_AXES 8
|
||||
#include "Array.h"
|
||||
#include "BLAS.h"
|
||||
#include "Core.h"
|
||||
#include "Types.h"
|
||||
#include <Array.h>
|
||||
#include <BLAS.h>
|
||||
#include <Core.h>
|
||||
#include <Types.h>
|
||||
|
||||
#include <Eigen/Core>
|
||||
#include <chrono>
|
||||
Loading…
x
Reference in New Issue
Block a user