Compare commits
No commits in common. "45e8e274cbf4414bad28f076c9d43f89963975b8" and "9d66e0905c62182df8650f754d663aadc0114267" have entirely different histories.
45e8e274cb
...
9d66e0905c
@ -4,7 +4,7 @@
|
|||||||
#include "Core.h"
|
#include "Core.h"
|
||||||
#include "Macros.h"
|
#include "Macros.h"
|
||||||
#include "Types.h"
|
#include "Types.h"
|
||||||
|
#include <Eigen/Dense>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -12,16 +12,6 @@
|
|||||||
#include <random>
|
#include <random>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
|
||||||
#ifdef CUDATOOLS_USE_EIGEN
|
|
||||||
#include <Eigen/Dense>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CUDATOOLS_USE_PYTHON
|
|
||||||
#include <pybind11/numpy.h>
|
|
||||||
#include <pybind11/pybind11.h>
|
|
||||||
namespace py = pybind11;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DEVICE
|
#ifdef DEVICE
|
||||||
#define POINTER pDevice
|
#define POINTER pDevice
|
||||||
#else
|
#else
|
||||||
@ -32,7 +22,6 @@ using namespace CudaTools::Types;
|
|||||||
|
|
||||||
namespace CudaTools {
|
namespace CudaTools {
|
||||||
|
|
||||||
#ifdef CUDATOOLS_USE_EIGEN
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
|
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
|
||||||
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
|
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
|
||||||
@ -41,16 +30,9 @@ template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>;
|
|||||||
template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
|
template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
|
||||||
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
|
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
|
||||||
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
|
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
|
||||||
#endif
|
|
||||||
|
|
||||||
template <typename T> class Array;
|
template <typename T> class Array;
|
||||||
|
using Slice = std::pair<uint32_t, uint32_t>;
|
||||||
struct Slice {
|
|
||||||
uint32_t first;
|
|
||||||
uint32_t second;
|
|
||||||
HD Slice(const std::initializer_list<uint32_t> i)
|
|
||||||
: first(*i.begin()), second(*(i.begin() + 1)) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename T> class ArrayIterator {
|
template <typename T> class ArrayIterator {
|
||||||
private:
|
private:
|
||||||
@ -199,7 +181,7 @@ template <typename T> class Array {
|
|||||||
|
|
||||||
uint32_t mEndOffset = 0;
|
uint32_t mEndOffset = 0;
|
||||||
|
|
||||||
HD void freeArrays() {
|
void freeArrays() {
|
||||||
#ifndef DEVICE
|
#ifndef DEVICE
|
||||||
if (not mIsView) {
|
if (not mIsView) {
|
||||||
if (pDevice != nullptr) CudaTools::free(pDevice);
|
if (pDevice != nullptr) CudaTools::free(pDevice);
|
||||||
@ -508,7 +490,6 @@ template <typename T> class Array {
|
|||||||
*/
|
*/
|
||||||
HD void flatten() { reshape({mShape.mItems}); };
|
HD void flatten() { reshape({mShape.mItems}); };
|
||||||
|
|
||||||
#ifdef CUDATOOLS_USE_EIGEN
|
|
||||||
/**
|
/**
|
||||||
* Returns the Eigen::Map of this Array.
|
* Returns the Eigen::Map of this Array.
|
||||||
*/
|
*/
|
||||||
@ -520,7 +501,6 @@ template <typename T> class Array {
|
|||||||
return EigenAdaptConst<ComplexConversion<T>>((ComplexConversion<T>*)POINTER, mShape.rows(),
|
return EigenAdaptConst<ComplexConversion<T>>((ComplexConversion<T>*)POINTER, mShape.rows(),
|
||||||
mShape.cols());
|
mShape.cols());
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the Shape of the Array.
|
* Gets the Shape of the Array.
|
||||||
@ -548,7 +528,7 @@ template <typename T> class Array {
|
|||||||
/**
|
/**
|
||||||
* Copies this Array and returns a new Array with the same memory.
|
* Copies this Array and returns a new Array with the same memory.
|
||||||
*/
|
*/
|
||||||
Array copy() const {
|
HD Array copy() const {
|
||||||
Array<T> arr(mShape, (pDevice == nullptr));
|
Array<T> arr(mShape, (pDevice == nullptr));
|
||||||
|
|
||||||
auto arr_it = arr.begin();
|
auto arr_it = arr.begin();
|
||||||
@ -699,8 +679,6 @@ template <typename T> class Array {
|
|||||||
* Its self assigning version is transpose. This is restricted to numerical types.
|
* Its self assigning version is transpose. This is restricted to numerical types.
|
||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef CUDATOOLS_USE_EIGEN
|
|
||||||
Array transposed() const {
|
Array transposed() const {
|
||||||
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
|
||||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
|
||||||
@ -730,7 +708,6 @@ template <typename T> class Array {
|
|||||||
Array<T> inv(shape());
|
Array<T> inv(shape());
|
||||||
inv.eigenMap() = this->eigenMap().inverse();
|
inv.eigenMap() = this->eigenMap().inverse();
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pins the memory (page locks) for faster memory transfer in concurrent
|
* Pins the memory (page locks) for faster memory transfer in concurrent
|
||||||
@ -756,22 +733,6 @@ template <typename T> class Array {
|
|||||||
CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
|
CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
|
||||||
return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef CUDATOOLS_USE_PYTHON
|
|
||||||
/**
|
|
||||||
* Returns a py::array for making an Array available as a Python numpy array.
|
|
||||||
*/
|
|
||||||
py::array pyArray() const {
|
|
||||||
std::vector<py::ssize_t> dims, strides;
|
|
||||||
for (uint iAxis = 0; iAxis < mShape.axes(); ++iAxis) {
|
|
||||||
dims.push_back(static_cast<py::ssize_t>(mShape.dim(iAxis)));
|
|
||||||
strides.push_back(sizeof(T) * static_cast<py::ssize_t>(mShape.stride(iAxis)));
|
|
||||||
}
|
|
||||||
return py::array_t<T, py::array::f_style>(
|
|
||||||
py::buffer_info((void*)pHost, sizeof(T), py::format_descriptor<T>::format(),
|
|
||||||
static_cast<py::ssize_t>(mShape.axes()), dims, strides));
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
@ -813,12 +774,10 @@ template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>
|
|||||||
bool negative = false;
|
bool negative = false;
|
||||||
for (auto it = arr.begin(); it != arr.end(); ++it) {
|
for (auto it = arr.begin(); it != arr.end(); ++it) {
|
||||||
T val = *it;
|
T val = *it;
|
||||||
if constexpr (not std::is_unsigned<T>::value) {
|
|
||||||
if (*it < 0) {
|
if (*it < 0) {
|
||||||
negative = true;
|
negative = true;
|
||||||
val *= -1;
|
val *= -1;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
max_val = (val > max_val) ? val : max_val;
|
max_val = (val > max_val) ? val : max_val;
|
||||||
}
|
}
|
||||||
width = std::to_string(max_val).size() + 1;
|
width = std::to_string(max_val).size() + 1;
|
||||||
@ -1,10 +1,6 @@
|
|||||||
#ifndef CUDATOOLS_BLAS_H
|
#ifndef CUDATOOLS_BLAS_H
|
||||||
#define CUDATOOLS_BLAS_H
|
#define CUDATOOLS_BLAS_H
|
||||||
|
|
||||||
#ifndef CUDATOOLS_USE_EIGEN
|
|
||||||
#error "Cannot use CudaTools BLAS.h header without Eigen."
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "Array.h"
|
#include "Array.h"
|
||||||
#include "Core.h"
|
#include "Core.h"
|
||||||
#include "Macros.h"
|
#include "Macros.h"
|
||||||
@ -142,7 +138,7 @@ template <typename T> class Batch {
|
|||||||
|
|
||||||
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()});
|
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()});
|
||||||
for (uint32_t i = 0; i < mBatchSize; ++i) {
|
for (uint32_t i = 0; i < mBatchSize; ++i) {
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
mBatch[i] = batch[i].dataDevice();
|
mBatch[i] = batch[i].dataDevice();
|
||||||
#else
|
#else
|
||||||
mBatch[i] = batch[i].data();
|
mBatch[i] = batch[i].data();
|
||||||
@ -158,7 +154,7 @@ template <typename T> class Batch {
|
|||||||
void add(const Array<T>& arr) {
|
void add(const Array<T>& arr) {
|
||||||
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays");
|
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays");
|
||||||
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays");
|
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays");
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
mBatch[mCount] = arr.dataDevice();
|
mBatch[mCount] = arr.dataDevice();
|
||||||
#else
|
#else
|
||||||
mBatch[mCount] = arr.data();
|
mBatch[mCount] = arr.data();
|
||||||
@ -274,7 +270,7 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
|
|||||||
uint32_t rows = A.shape().rows();
|
uint32_t rows = A.shape().rows();
|
||||||
uint32_t cols = A.shape().cols();
|
uint32_t cols = A.shape().cols();
|
||||||
T a = alpha, b = beta;
|
T a = alpha, b = beta;
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||||
if (bi.size == 1) {
|
if (bi.size == 1) {
|
||||||
invoke<T>(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv,
|
invoke<T>(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv,
|
||||||
@ -319,7 +315,7 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
|
|||||||
uint32_t n = B.shape().cols();
|
uint32_t n = B.shape().cols();
|
||||||
|
|
||||||
T a = alpha, b = beta;
|
T a = alpha, b = beta;
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||||
|
|
||||||
if (bi.size == 1) {
|
if (bi.size == 1) {
|
||||||
@ -372,7 +368,7 @@ StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const boo
|
|||||||
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(),
|
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(),
|
||||||
"Rows of 'A' and columns of 'C' need to match.");
|
"Rows of 'A' and columns of 'C' need to match.");
|
||||||
|
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
uint32_t m = C.shape().rows();
|
uint32_t m = C.shape().rows();
|
||||||
uint32_t n = C.shape().cols();
|
uint32_t n = C.shape().cols();
|
||||||
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
|
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
|
||||||
@ -548,7 +544,7 @@ class PLUBatch : public Batch<T> {
|
|||||||
* Computes the inplace PLU decomposition of batch of arrays.
|
* Computes the inplace PLU decomposition of batch of arrays.
|
||||||
*/
|
*/
|
||||||
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) {
|
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) {
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
uint32_t n = this->mShape.rows();
|
uint32_t n = this->mShape.rows();
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
|
||||||
@ -579,7 +575,7 @@ class PLUBatch : public Batch<T> {
|
|||||||
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(),
|
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(),
|
||||||
"The length of each column of b must match the matrix rank");
|
"The length of each column of b must match the matrix rank");
|
||||||
|
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
uint32_t n = b.shape().rows();
|
uint32_t n = b.shape().rows();
|
||||||
uint32_t nrhs = b.shape().cols();
|
uint32_t nrhs = b.shape().cols();
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
@ -143,7 +143,7 @@ Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM)
|
|||||||
|
|
||||||
template <typename F, typename... Args>
|
template <typename F, typename... Args>
|
||||||
StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
|
StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
|
||||||
#ifdef CUDACC
|
#ifdef CUDA
|
||||||
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
|
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
|
||||||
Manager::get()->stream(sett.stream.mId)>>>(args...);
|
Manager::get()->stream(sett.stream.mId)>>>(args...);
|
||||||
#else
|
#else
|
||||||
@ -256,8 +256,8 @@ template <typename F, typename... Args> class Graph {
|
|||||||
|
|
||||||
~Graph() {
|
~Graph() {
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
cudaGraphDestroy(mGraph);
|
CUDA_CHECK(cudaGraphDestroy(mGraph));
|
||||||
cudaGraphExecDestroy(mInstance);
|
CUDA_CHECK(cudaGraphExecDestroy(mInstance));
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -316,11 +316,9 @@ struct GraphManager {
|
|||||||
};
|
};
|
||||||
|
|
||||||
}; // namespace CudaTools
|
}; // namespace CudaTools
|
||||||
#endif // CUDATOOLS_H
|
|
||||||
|
|
||||||
#ifdef CUDATOOLS_IMPLEMENTATION
|
#ifdef CUDATOOLS_IMPLEMENTATION
|
||||||
#ifndef __CUDATOOLS_IMPLEMENTED__
|
|
||||||
#define __CUDATOOLS_IMPLEMENTED__
|
|
||||||
namespace CudaTools {
|
namespace CudaTools {
|
||||||
|
|
||||||
//////////////////////
|
//////////////////////
|
||||||
@ -402,9 +400,9 @@ Manager::Manager(const std::vector<std::string>& names) {
|
|||||||
Manager::~Manager() {
|
Manager::~Manager() {
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
for (auto& it : mStreams) {
|
for (auto& it : mStreams) {
|
||||||
cudaStreamDestroy(it.second);
|
CUDA_CHECK(cudaStreamDestroy(it.second));
|
||||||
}
|
}
|
||||||
cublasDestroy(mCublas);
|
CUBLAS_CHECK(cublasDestroy(mCublas));
|
||||||
// CUSPARSE_CHECK(cusparseDestroy(mCusparse));
|
// CUSPARSE_CHECK(cusparseDestroy(mCusparse));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -642,7 +640,7 @@ Event::Event() {
|
|||||||
|
|
||||||
Event::~Event() {
|
Event::~Event() {
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
cudaEventDestroy(mEvent);
|
CUDA_CHECK(cudaEventDestroy(mEvent));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -658,6 +656,9 @@ void Event::record(const StreamID& stream) {
|
|||||||
|
|
||||||
GraphManager::~GraphManager() {
|
GraphManager::~GraphManager() {
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
|
for (void* func : mHostData) {
|
||||||
|
delete func;
|
||||||
|
}
|
||||||
for (Event* event : mEvents) {
|
for (Event* event : mEvents) {
|
||||||
delete event;
|
delete event;
|
||||||
}
|
}
|
||||||
@ -679,5 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
|
|||||||
}
|
}
|
||||||
|
|
||||||
}; // namespace CudaTools
|
}; // namespace CudaTools
|
||||||
#endif
|
|
||||||
#endif // CUDATOOLS_IMPLEMENTATION
|
#endif // CUDATOOLS_IMPLEMENTATION
|
||||||
|
|
||||||
|
#endif // CUDATOOLS_H
|
||||||
@ -49,18 +49,6 @@
|
|||||||
*/
|
*/
|
||||||
#define SHARED
|
#define SHARED
|
||||||
|
|
||||||
/**
|
|
||||||
* \def CUDATOOLS_USE_EIGEN
|
|
||||||
* Compile the CudaTools library with Eigen support.
|
|
||||||
*/
|
|
||||||
#define CUDATOOLS_USE_EIGEN
|
|
||||||
|
|
||||||
/**
|
|
||||||
* \def CUDATOOLS_USE_PYTHON
|
|
||||||
* Compile the CudaTools library with Python support.
|
|
||||||
*/
|
|
||||||
#define CUDATOOLS_USE_PYTHON
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \def KERNEL(call, settings, ...)
|
* \def KERNEL(call, settings, ...)
|
||||||
* Used to call a CUDA kernel.
|
* Used to call a CUDA kernel.
|
||||||
@ -230,13 +218,12 @@
|
|||||||
#ifdef DEVICE
|
#ifdef DEVICE
|
||||||
#define CT_ERROR_IF(a, op, b, msg) \
|
#define CT_ERROR_IF(a, op, b, msg) \
|
||||||
if (a op b) { \
|
if (a op b) { \
|
||||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", \
|
printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \
|
||||||
__FILE__, __LINE__, msg); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CT_ERROR(a, msg) \
|
#define CT_ERROR(a, msg) \
|
||||||
if (a) { \
|
if (a) { \
|
||||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@ -246,14 +233,14 @@
|
|||||||
std::ostringstream os_b; \
|
std::ostringstream os_b; \
|
||||||
os_a << a; \
|
os_a << a; \
|
||||||
os_b << b; \
|
os_b << b; \
|
||||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", \
|
printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \
|
||||||
__FILE__, __LINE__, msg, os_a.str().c_str(), os_b.str().c_str()); \
|
os_a.str().c_str(), os_b.str().c_str()); \
|
||||||
throw std::exception(); \
|
throw std::exception(); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CT_ERROR(a, msg) \
|
#define CT_ERROR(a, msg) \
|
||||||
if (a) { \
|
if (a) { \
|
||||||
printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||||
throw std::exception(); \
|
throw std::exception(); \
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -266,8 +253,7 @@
|
|||||||
do { \
|
do { \
|
||||||
cudaError_t err = (call); \
|
cudaError_t err = (call); \
|
||||||
if (err != cudaSuccess) { \
|
if (err != cudaSuccess) { \
|
||||||
printf("\033[1;31m[CUDA]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__, \
|
printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||||
cudaGetErrorString(err)); \
|
|
||||||
throw std::exception(); \
|
throw std::exception(); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
@ -276,8 +262,7 @@
|
|||||||
do { \
|
do { \
|
||||||
cublasStatus_t err = (call); \
|
cublasStatus_t err = (call); \
|
||||||
if (err != CUBLAS_STATUS_SUCCESS) { \
|
if (err != CUBLAS_STATUS_SUCCESS) { \
|
||||||
printf("\033[1;31m[cuBLAS]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__, \
|
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err)); \
|
||||||
cublasGetStatusName(err)); \
|
|
||||||
throw std::exception(); \
|
throw std::exception(); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
110
Makefile
110
Makefile
@ -1,21 +1,95 @@
|
|||||||
.PHONY: all docs install
|
CC := g++-10
|
||||||
|
NVCC := nvcc
|
||||||
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
all: install docs
|
INCLUDE :=
|
||||||
|
LIBS_DIR :=
|
||||||
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
|
LIBS :=
|
||||||
|
LIBS_GPU := cuda cudart cublas
|
||||||
|
|
||||||
docs: SHELL:=/bin/bash
|
TARGET = tests
|
||||||
docs:
|
SRC_DIR = .
|
||||||
if [ ! -d "docs/.venv" ]; then python3 -m venv "docs/.venv"; fi
|
BUILD_DIR = build
|
||||||
( \
|
|
||||||
source docs/.venv/bin/activate;\
|
|
||||||
pip install -r docs/requirements.txt; \
|
|
||||||
doxygen docs/Doxyfile; \
|
|
||||||
sphinx-build -b html docs/source docs/build/html; \
|
|
||||||
deactivate \
|
|
||||||
)
|
|
||||||
|
|
||||||
install:
|
# Should not need to modify below.
|
||||||
install -d $(DEST_DIR)/
|
|
||||||
install -d $(DEST_DIR)/include/
|
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||||
for file in include/*; do \
|
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||||
install -m 644 $$file $(DEST_DIR)/include/; \
|
|
||||||
done
|
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||||
|
|
||||||
|
# Get source files and object files.
|
||||||
|
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||||
|
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||||
|
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
|
||||||
|
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||||
|
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||||
|
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
|
||||||
|
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||||
|
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||||
|
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||||
|
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||||
|
|
||||||
|
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||||
|
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||||
|
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||||
|
|
||||||
|
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||||
|
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||||
|
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||||
|
|
||||||
|
INC := $(INCLUDE:%=-I%)
|
||||||
|
LIB := $(LIBS_DIR:%=-L%)
|
||||||
|
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||||
|
LD := $(LIBS:%=-l%)
|
||||||
|
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||||
|
|
||||||
|
# Reminder:
|
||||||
|
# $< = first prerequisite
|
||||||
|
# $@ = the target which matched the rule
|
||||||
|
# $^ = all prerequisites
|
||||||
|
|
||||||
|
.PHONY: all clean
|
||||||
|
|
||||||
|
all : cpu gpu
|
||||||
|
|
||||||
|
cpu: $(TARGET)CPU
|
||||||
|
gpu: $(TARGET)GPU
|
||||||
|
|
||||||
|
$(TARGET)CPU: $(CPU_OBJ)
|
||||||
|
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
|
||||||
|
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||||
|
# regular ones. Then, we link them all together.
|
||||||
|
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
||||||
|
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
||||||
|
$(NVCC) --device-link $^ -o $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
||||||
|
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
|
||||||
|
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
-include $(CPU_DEPS)
|
||||||
|
-include $(GPU_DEPS)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR):
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR):
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := <<Put extra include directories here, separated by a space>>
|
INCLUDE := <<Put extra include directories here, separated by a space>>
|
||||||
LIBS_DIR := <<Put library directories here, separated by a space>>
|
LIBS_DIR := <<Put library directories here, separated by a space>>
|
||||||
@ -74,7 +74,7 @@ $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
|||||||
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||||
|
|
||||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
||||||
$(NVCC) --device-link -lgomp $^ -o $@
|
$(NVCC) --device-link $^ -o $@
|
||||||
|
|
||||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
||||||
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||||
|
|||||||
11
SETUP
11
SETUP
@ -1,11 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
if [ -z "$1" ]
|
|
||||||
then
|
|
||||||
printf "Usage: setup.sh install_directory [dependencies].\n" 1>&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
cd $(dirname $0)
|
|
||||||
libname=$(basename $(pwd))
|
|
||||||
make DEST_DIR="$1" install
|
|
||||||
2
build_docs
Executable file
2
build_docs
Executable file
@ -0,0 +1,2 @@
|
|||||||
|
doxygen docs/Doxyfile
|
||||||
|
sphinx-build -b html docs/source docs/build/html
|
||||||
@ -778,7 +778,7 @@ WARNINGS = YES
|
|||||||
# will automatically be disabled.
|
# will automatically be disabled.
|
||||||
# The default value is: YES.
|
# The default value is: YES.
|
||||||
|
|
||||||
WARN_IF_UNDOCUMENTED = NO
|
WARN_IF_UNDOCUMENTED = YES
|
||||||
|
|
||||||
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
|
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
|
||||||
# potential errors in the documentation, such as not documenting some parameters
|
# potential errors in the documentation, such as not documenting some parameters
|
||||||
@ -829,7 +829,7 @@ WARN_LOGFILE =
|
|||||||
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
||||||
# Note: If this tag is empty the current directory is searched.
|
# Note: If this tag is empty the current directory is searched.
|
||||||
|
|
||||||
INPUT = "./include"
|
INPUT = "./"
|
||||||
|
|
||||||
# This tag can be used to specify the character encoding of the source files
|
# This tag can be used to specify the character encoding of the source files
|
||||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
||||||
|
|||||||
@ -27,8 +27,8 @@ extensions = [
|
|||||||
'breathe',
|
'breathe',
|
||||||
]
|
]
|
||||||
|
|
||||||
breathe_projects = {"CudaTools": "../build/xml"}
|
breathe_projects = {"DGEMS": "../build/xml"}
|
||||||
breathe_default_project = "CudaTools"
|
breathe_default_project = "DGEMS"
|
||||||
|
|
||||||
bibtex_bibfiles = ['refs.bib']
|
bibtex_bibfiles = ['refs.bib']
|
||||||
|
|
||||||
|
|||||||
@ -42,8 +42,6 @@ Host-Device Automation
|
|||||||
Compilation Options
|
Compilation Options
|
||||||
-------------------
|
-------------------
|
||||||
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES
|
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES
|
||||||
.. doxygendefine:: CUDATOOLS_USE_EIGEN
|
|
||||||
.. doxygendefine:: CUDATOOLS_USE_PYTHON
|
|
||||||
|
|
||||||
Macro Functions
|
Macro Functions
|
||||||
===============
|
===============
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../include/CudaTools
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
LIBS :=
|
LIBS :=
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../include/CudaTools
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
LIBS :=
|
LIBS :=
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../include/CudaTools
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
LIBS :=
|
LIBS :=
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../include/CudaTools
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
LIBS :=
|
LIBS :=
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../include/CudaTools
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
LIBS :=
|
LIBS :=
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
#define CUDATOOLS_IMPLEMENTATION
|
#define CUDATOOLS_IMPLEMENTATION
|
||||||
#define CUDATOOLS_ARRAY_MAX_AXES 8
|
#define CUDATOOLS_ARRAY_MAX_AXES 8
|
||||||
#include <Array.h>
|
#include "Array.h"
|
||||||
#include <BLAS.h>
|
#include "BLAS.h"
|
||||||
#include <Core.h>
|
#include "Core.h"
|
||||||
#include <Types.h>
|
#include "Types.h"
|
||||||
|
|
||||||
#include <Eigen/Core>
|
#include <Eigen/Core>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
@ -1,95 +0,0 @@
|
|||||||
CC := g++-10
|
|
||||||
NVCC := nvcc
|
|
||||||
CFLAGS := -std=c++17 -MMD -Wall -fopenmp
|
|
||||||
NVCC_FLAGS := -std=c++17 -MMD -Xcudafe="--diag_suppress=20012" -Xcompiler -fopenmp
|
|
||||||
|
|
||||||
INCLUDE := ../include/CudaTools
|
|
||||||
LIBS_DIR :=
|
|
||||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
|
||||||
LIBS :=
|
|
||||||
LIBS_GPU := cuda cudart cublas
|
|
||||||
|
|
||||||
TARGET = tests
|
|
||||||
SRC_DIR = .
|
|
||||||
BUILD_DIR = build
|
|
||||||
|
|
||||||
# Should not need to modify below.
|
|
||||||
|
|
||||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
|
||||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
|
||||||
|
|
||||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
|
||||||
|
|
||||||
# Get source files and object files.
|
|
||||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
|
||||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
|
||||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
|
||||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
|
||||||
|
|
||||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
|
||||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
|
||||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
|
||||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
|
||||||
|
|
||||||
# $(info $$GCC_SRC is [${GCC_SRC}])
|
|
||||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
|
||||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
|
||||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
|
||||||
|
|
||||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
|
||||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
|
||||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
|
||||||
|
|
||||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
|
||||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
|
||||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
|
||||||
|
|
||||||
INC := $(INCLUDE:%=-I%)
|
|
||||||
LIB := $(LIBS_DIR:%=-L%)
|
|
||||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
|
||||||
LD := $(LIBS:%=-l%)
|
|
||||||
LD_GPU := $(LIBS_GPU:%=-l%)
|
|
||||||
|
|
||||||
# Reminder:
|
|
||||||
# $< = first prerequisite
|
|
||||||
# $@ = the target which matched the rule
|
|
||||||
# $^ = all prerequisites
|
|
||||||
|
|
||||||
.PHONY: all clean
|
|
||||||
|
|
||||||
all : cpu gpu
|
|
||||||
|
|
||||||
cpu: $(TARGET)CPU
|
|
||||||
gpu: $(TARGET)GPU
|
|
||||||
|
|
||||||
$(TARGET)CPU: $(CPU_OBJ)
|
|
||||||
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD)
|
|
||||||
|
|
||||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
|
|
||||||
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
|
||||||
|
|
||||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
|
||||||
# regular ones. Then, we link them all together.
|
|
||||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
|
||||||
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
|
||||||
|
|
||||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
|
||||||
$(NVCC) --device-link -lgomp $^ -o $@
|
|
||||||
|
|
||||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
|
||||||
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
|
||||||
|
|
||||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
|
|
||||||
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
|
||||||
|
|
||||||
-include $(CPU_DEPS)
|
|
||||||
-include $(GPU_DEPS)
|
|
||||||
|
|
||||||
$(CPU_BUILD_DIR):
|
|
||||||
mkdir -p $@
|
|
||||||
|
|
||||||
$(GPU_BUILD_DIR):
|
|
||||||
mkdir -p $@
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
|
||||||
Loading…
x
Reference in New Issue
Block a user