Updated documentation samples with modifications
This commit is contained in:
parent
167edfea44
commit
8c7b99cd9f
12
Array.h
12
Array.h
@ -556,7 +556,7 @@ template <typename T> class Array {
|
|||||||
}
|
}
|
||||||
#ifndef DEVICE
|
#ifndef DEVICE
|
||||||
if (pDevice != nullptr) {
|
if (pDevice != nullptr) {
|
||||||
CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
|
CudaTools::copy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
return arr;
|
return arr;
|
||||||
@ -739,9 +739,8 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const {
|
StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const {
|
||||||
CT_ERROR(mIsView, "Cannot update host on a view");
|
CT_ERROR(mIsSlice, "Cannot update host copy on a slice");
|
||||||
CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
return CudaTools::copy(pDevice, pHost, mShape.items() * sizeof(T), stream);
|
||||||
return stream;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -749,9 +748,8 @@ template <typename T> class Array {
|
|||||||
* \brief Host only
|
* \brief Host only
|
||||||
*/
|
*/
|
||||||
StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const {
|
StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const {
|
||||||
CT_ERROR(mIsView, "Cannot update device on a view");
|
CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
|
||||||
CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
||||||
return stream;
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
156
Core.h
156
Core.h
@ -43,32 +43,26 @@ static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
|
|||||||
*/
|
*/
|
||||||
void* malloc(const size_t size);
|
void* malloc(const size_t size);
|
||||||
|
|
||||||
/**
|
|
||||||
* Pins memory on the host.
|
|
||||||
*/
|
|
||||||
void pin(void* const pHost, const size_t size);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Pushes memory from the device to the host.
|
|
||||||
*/
|
|
||||||
StreamID push(void* const pHost, void* const pDevice, const size_t size,
|
|
||||||
const StreamID& stream = DEF_MEM_STREAM);
|
|
||||||
/**
|
|
||||||
* Pulls memory from the device back to the host.
|
|
||||||
*/
|
|
||||||
StreamID pull(void* const pHost, void* const pDevice, const size_t size,
|
|
||||||
const StreamID& stream = DEF_MEM_STREAM);
|
|
||||||
/**
|
|
||||||
* Copies memory on the device to another location on the device.
|
|
||||||
*/
|
|
||||||
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
|
|
||||||
const StreamID& stream = DEF_MEM_STREAM);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Frees memory on the device.
|
* Frees memory on the device.
|
||||||
*/
|
*/
|
||||||
void free(void* const pDevice);
|
void free(void* const pDevice);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copies memory from the source pointer to the dest pointer.
|
||||||
|
*/
|
||||||
|
StreamID copy(void* const source, void* const dest, const size_t size,
|
||||||
|
const StreamID& stream = DEF_MEM_STREAM);
|
||||||
|
/**
|
||||||
|
* Initializes or sets device memory to a value.
|
||||||
|
*/
|
||||||
|
StreamID memset(void* const pDevice, int value, const size_t size,
|
||||||
|
const StreamID& stream = DEF_MEM_STREAM);
|
||||||
|
/**
|
||||||
|
* Pins memory on the host.
|
||||||
|
*/
|
||||||
|
void pin(void* const pHost, const size_t size);
|
||||||
|
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
cudaDeviceProp getDeviceProp();
|
cudaDeviceProp getDeviceProp();
|
||||||
static cudaDeviceProp DeviceProperties = getDeviceProp();
|
static cudaDeviceProp DeviceProperties = getDeviceProp();
|
||||||
@ -181,12 +175,12 @@ class Shape {
|
|||||||
HD uint32_t axes() const; /**< Gets the number of axes. */
|
HD uint32_t axes() const; /**< Gets the number of axes. */
|
||||||
HD uint32_t items() const; /**< Gets the total number of items. */
|
HD uint32_t items() const; /**< Gets the total number of items. */
|
||||||
|
|
||||||
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
|
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
|
||||||
of the last axis. */
|
dimension of the last axis. */
|
||||||
HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
|
HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
|
||||||
dimension of the second to last axis. */
|
dimension of the second to last axis. */
|
||||||
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the
|
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets
|
||||||
dimension of the second to last axis. */
|
the dimension of the second to last axis. */
|
||||||
|
|
||||||
HD uint32_t
|
HD uint32_t
|
||||||
dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
|
dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
|
||||||
@ -227,46 +221,6 @@ template <typename F, typename... Args> struct FuncHolder {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* Accessory struct to deal with host callbacks for CUDA Graphs in a nice fashion.
|
|
||||||
*/
|
|
||||||
struct GraphTools {
|
|
||||||
std::vector<void*> mHostData;
|
|
||||||
std::vector<Event*> mEvents;
|
|
||||||
|
|
||||||
~GraphTools();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Within a function that is being stream captured, launch a host function that can
|
|
||||||
* be captured into the graph.
|
|
||||||
*/
|
|
||||||
|
|
||||||
template <typename F, typename... Args>
|
|
||||||
void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
|
|
||||||
#ifdef CUDACC
|
|
||||||
FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
|
|
||||||
mHostData.push_back((void*)fh);
|
|
||||||
cudaHostFn_t run_func = fh->run;
|
|
||||||
CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
|
|
||||||
#else
|
|
||||||
func(args...);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Makes a new branch in the graph to be run in parallel by a new stream.
|
|
||||||
* \param orig_stream the original stream to branch from.
|
|
||||||
* \param branch_stream the stream of the new branch.
|
|
||||||
*/
|
|
||||||
void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
|
||||||
/**
|
|
||||||
* Joins a existing branch in the graph to collapse a parallel block.
|
|
||||||
* \param orig_stream the original stream to join the branch to.
|
|
||||||
* \param branch_stream the stream of the branch to join.
|
|
||||||
*/
|
|
||||||
void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A class that manages CUDA Graphs.
|
* A class that manages CUDA Graphs.
|
||||||
*/
|
*/
|
||||||
@ -319,6 +273,46 @@ template <typename F, typename... Args> class Graph {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A struct to facilitate other CUDA Graphs functionality like creating branches and host callbacks.
|
||||||
|
*/
|
||||||
|
struct GraphManager {
|
||||||
|
std::vector<void*> mHostData;
|
||||||
|
std::vector<Event*> mEvents;
|
||||||
|
|
||||||
|
~GraphManager();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Within a function that is being stream captured, launch a host function that can
|
||||||
|
* be captured into the graph.
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename F, typename... Args>
|
||||||
|
void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
|
||||||
|
#ifdef CUDACC
|
||||||
|
FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
|
||||||
|
mHostData.push_back((void*)fh);
|
||||||
|
cudaHostFn_t run_func = fh->run;
|
||||||
|
CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
|
||||||
|
#else
|
||||||
|
func(args...);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Makes a new branch in the graph to be run in parallel by a new stream.
|
||||||
|
* \param orig_stream the original stream to branch from.
|
||||||
|
* \param branch_stream the stream of the new branch.
|
||||||
|
*/
|
||||||
|
void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
||||||
|
/**
|
||||||
|
* Joins a existing branch in the graph to collapse a parallel block.
|
||||||
|
* \param orig_stream the original stream to join the branch to.
|
||||||
|
* \param branch_stream the stream of the branch to join.
|
||||||
|
*/
|
||||||
|
void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
||||||
|
};
|
||||||
|
|
||||||
}; // namespace CudaTools
|
}; // namespace CudaTools
|
||||||
|
|
||||||
#ifdef CUDATOOLS_IMPLEMENTATION
|
#ifdef CUDATOOLS_IMPLEMENTATION
|
||||||
@ -357,27 +351,17 @@ void free(void* const pDevice) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
|
StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
|
CUDA_CHECK(
|
||||||
Manager::get()->stream(stream)));
|
cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
|
||||||
#endif
|
#endif
|
||||||
return stream;
|
return stream;
|
||||||
}
|
}
|
||||||
|
|
||||||
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
|
StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
|
CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
|
||||||
Manager::get()->stream(stream)));
|
|
||||||
#endif
|
|
||||||
return stream;
|
|
||||||
}
|
|
||||||
|
|
||||||
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
|
|
||||||
const StreamID& stream) {
|
|
||||||
#ifdef CUDACC
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
|
|
||||||
Manager::get()->stream(stream)));
|
|
||||||
#endif
|
#endif
|
||||||
return stream;
|
return stream;
|
||||||
}
|
}
|
||||||
@ -661,11 +645,11 @@ void Event::record(const StreamID& stream) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////
|
//////////////////////////
|
||||||
// GraphTools Methods //
|
// GraphManager Methods //
|
||||||
////////////////////////
|
//////////////////////////
|
||||||
|
|
||||||
GraphTools::~GraphTools() {
|
GraphManager::~GraphManager() {
|
||||||
#ifdef CUDACC
|
#ifdef CUDACC
|
||||||
for (void* func : mHostData) {
|
for (void* func : mHostData) {
|
||||||
delete func;
|
delete func;
|
||||||
@ -676,14 +660,14 @@ GraphTools::~GraphTools() {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void GraphTools::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
||||||
Event* event = new Event();
|
Event* event = new Event();
|
||||||
event->record(orig_stream);
|
event->record(orig_stream);
|
||||||
mEvents.push_back(event);
|
mEvents.push_back(event);
|
||||||
branch_stream.wait(*event);
|
branch_stream.wait(*event);
|
||||||
}
|
}
|
||||||
|
|
||||||
void GraphTools::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
||||||
Event* event = new Event();
|
Event* event = new Event();
|
||||||
event->record(branch_stream);
|
event->record(branch_stream);
|
||||||
mEvents.push_back(event);
|
mEvents.push_back(event);
|
||||||
|
|||||||
20
Macros.h
20
Macros.h
@ -46,22 +46,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
|||||||
*/
|
*/
|
||||||
#define SHARED
|
#define SHARED
|
||||||
|
|
||||||
/**
|
|
||||||
* \def DECLARE_KERNEL(call, ...)
|
|
||||||
* Used to declare (in header) a CUDA kernel.
|
|
||||||
* \param call the name of the kernel
|
|
||||||
* \param ... the arguments of the kernel
|
|
||||||
*/
|
|
||||||
#define DECLARE_KERNEL(call, ...)
|
|
||||||
|
|
||||||
/**
|
|
||||||
* \def DEFINE_KERNEL(call, ...)
|
|
||||||
* Used to define (in implementation) a CUDA kernel.
|
|
||||||
* \param call the name of the kernel
|
|
||||||
* \param ... the arguments of the kernel
|
|
||||||
*/
|
|
||||||
#define DEFINE_KERNEL(call, ...)
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \def KERNEL(call, settings, ...)
|
* \def KERNEL(call, settings, ...)
|
||||||
* Used to call a CUDA kernel.
|
* Used to call a CUDA kernel.
|
||||||
@ -178,11 +162,11 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
|||||||
#define UPDATE_FUNC(name) \
|
#define UPDATE_FUNC(name) \
|
||||||
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
|
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
|
||||||
CudaTools::DEF_MEM_STREAM) { \
|
CudaTools::DEF_MEM_STREAM) { \
|
||||||
return CudaTools::pull(this, that(), sizeof(name)); \
|
return CudaTools::copy(that(), this, sizeof(name)); \
|
||||||
}; \
|
}; \
|
||||||
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
|
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
|
||||||
CudaTools::DEF_MEM_STREAM) { \
|
CudaTools::DEF_MEM_STREAM) { \
|
||||||
return CudaTools::push(this, that(), sizeof(name)); \
|
return CudaTools::copy(this, that(), sizeof(name)); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CUDA
|
#ifdef CUDA
|
||||||
|
|||||||
@ -2,8 +2,9 @@
|
|||||||
Core.h
|
Core.h
|
||||||
======
|
======
|
||||||
|
|
||||||
The ``Core.h`` header file defines some useful types and some macros along with
|
The ``Core.h`` header file defines some useful types and some macros functions
|
||||||
a few core classes.
|
to faciliate the dual CPU-CUDA compilation targets. Additionally, it introduces
|
||||||
|
several classes to enable the usage of CUDA streams, kernels, and graphs.
|
||||||
|
|
||||||
Types
|
Types
|
||||||
=====
|
=====
|
||||||
@ -33,10 +34,6 @@ Compilation Options
|
|||||||
Macro Functions
|
Macro Functions
|
||||||
===============
|
===============
|
||||||
|
|
||||||
Kernel
|
|
||||||
------
|
|
||||||
.. doxygendefine:: DECLARE_KERNEL
|
|
||||||
.. doxygendefine:: DEFINE_KERNEL
|
|
||||||
.. doxygendefine:: KERNEL
|
.. doxygendefine:: KERNEL
|
||||||
|
|
||||||
Device Helpers
|
Device Helpers
|
||||||
@ -50,7 +47,21 @@ Device Class
|
|||||||
.. doxygendefine:: DEVICE_CLASS
|
.. doxygendefine:: DEVICE_CLASS
|
||||||
|
|
||||||
|
|
||||||
Classes and Structs
|
Memory Functions
|
||||||
|
================
|
||||||
|
|
||||||
|
.. doxygenfunction:: CudaTools::malloc
|
||||||
|
|
||||||
|
.. doxygenfunction:: CudaTools::free
|
||||||
|
|
||||||
|
.. doxygenfunction:: CudaTools::copy
|
||||||
|
|
||||||
|
.. doxygenfunction:: CudaTools::memset
|
||||||
|
|
||||||
|
.. doxygenfunction:: CudaTools::pin
|
||||||
|
|
||||||
|
|
||||||
|
Streams and Handles
|
||||||
===================
|
===================
|
||||||
|
|
||||||
CudaTools::StreamID
|
CudaTools::StreamID
|
||||||
@ -64,12 +75,29 @@ CudaTools::Manager
|
|||||||
.. doxygenclass:: CudaTools::Manager
|
.. doxygenclass:: CudaTools::Manager
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
Kernels
|
||||||
|
=======
|
||||||
|
|
||||||
|
.. doxygenfunction:: CudaTools::Kernel::launch
|
||||||
|
|
||||||
|
.. doxygenfunction:: CudaTools::Kernel::basic
|
||||||
|
|
||||||
CudaTools::Kernel::Settings
|
CudaTools::Kernel::Settings
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
||||||
.. doxygenstruct:: CudaTools::Kernel::Settings
|
.. doxygenstruct:: CudaTools::Kernel::Settings
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
CudaTools::Kernel::Basic
|
|
||||||
------------------------
|
Graphs
|
||||||
.. doxygenfunction:: CudaTools::Kernel::basic
|
======
|
||||||
|
|
||||||
|
CudaTools::Graph
|
||||||
|
----------------
|
||||||
|
.. doxygenclass:: CudaTools::Graph
|
||||||
|
:members:
|
||||||
|
|
||||||
|
CudaTools::GraphManager
|
||||||
|
-----------------------
|
||||||
|
.. doxygenstruct:: CudaTools::GraphManager
|
||||||
|
:members:
|
||||||
|
|||||||
@ -33,12 +33,12 @@ macros provided. For example,
|
|||||||
|
|
||||||
.. code-block:: cpp
|
.. code-block:: cpp
|
||||||
|
|
||||||
DEFINE_KERNEL(add, int x, int y) {
|
KERNEL(add, int x, int y) {
|
||||||
printf("Kernel: %i\n", x + y);
|
printf("Kernel: %i\n", x + y);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,13 +79,13 @@ being device-compatible. We follow the previous example in a similar fashion.
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
|
KERNEL(swap, intPair* const pair) { pair->swap(); }
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
intPair pair(1, 2);
|
intPair pair(1, 2);
|
||||||
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
|
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
|
||||||
|
|
||||||
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
||||||
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
|
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
|
||||||
|
|
||||||
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
|
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
|
||||||
@ -129,7 +129,7 @@ compiled for CPU, then everything will run synchronously, as per usual.
|
|||||||
|
|
||||||
Array Examples
|
Array Examples
|
||||||
==============
|
==============
|
||||||
This file introduces the ``Array`` class, which is a class that provides automatic
|
The ``Array.h`` file introduces the ``Array`` class, which is a class that provides automatic
|
||||||
memory management between device and host. In particular, it provides functionality on
|
memory management between device and host. In particular, it provides functionality on
|
||||||
both the host and device while handling proper memory destruction, with many nice
|
both the host and device while handling proper memory destruction, with many nice
|
||||||
features. In particular it supports mimics many features of the Python package NumPy.`
|
features. In particular it supports mimics many features of the Python package NumPy.`
|
||||||
@ -137,12 +137,12 @@ We can demonstrate a few here.
|
|||||||
|
|
||||||
.. code-block:: cpp
|
.. code-block:: cpp
|
||||||
|
|
||||||
DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
|
KERNEL(times2, const CudaTools::Array<int> arr) {
|
||||||
CudaTools::Array<int> flat = arr.flattened();
|
CudaTools::Array<int> flat = arr.flattened();
|
||||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
|
KERNEL(times2double, const CudaTools::Array<double> arr) {
|
||||||
CudaTools::Array<double> flat = arr.flattened();
|
CudaTools::Array<double> flat = arr.flattened();
|
||||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||||
}
|
}
|
||||||
@ -165,10 +165,10 @@ We can demonstrate a few here.
|
|||||||
// Call the kernel multiple times asynchronously. Note: since they share same
|
// Call the kernel multiple times asynchronously. Note: since they share same
|
||||||
// stream, they are not run in parallel, just queued on the device.
|
// stream, they are not run in parallel, just queued on the device.
|
||||||
// NOTE: Notice that a view is passed into the kernel, not the Array itself.
|
// NOTE: Notice that a view is passed into the kernel, not the Array itself.
|
||||||
KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
|
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
|
||||||
KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
|
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
|
||||||
KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
|
CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
|
||||||
KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
|
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
|
||||||
arrRange.updateHost();
|
arrRange.updateHost();
|
||||||
arrConst.updateHost();
|
arrConst.updateHost();
|
||||||
arrLinspace.updateHost();
|
arrLinspace.updateHost();
|
||||||
@ -239,14 +239,80 @@ view manually with the ``.view()`` function.
|
|||||||
programmer to manage this.
|
programmer to manage this.
|
||||||
|
|
||||||
|
|
||||||
|
Graph Examples
|
||||||
|
==============
|
||||||
|
Additionally, there is support for CUDA Graphs, a way of defining a series of kernel
|
||||||
|
launches and executing later, potentially reducing overhead and timing, as well as
|
||||||
|
control the specific parallel workflow between CPU and GPU. The following
|
||||||
|
snippet illustrates this
|
||||||
|
|
||||||
|
.. code-block:: cpp
|
||||||
|
|
||||||
|
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
|
||||||
|
const CudaTools::Array<uint32_t> B) {
|
||||||
|
A.updateDevice("graphStream");
|
||||||
|
gm->makeBranch("graphStream", "graphStreamBranch");
|
||||||
|
B.updateDevice("graphStreamBranch");
|
||||||
|
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||||
|
CudaTools::Kernel::launch(
|
||||||
|
collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||||
|
CudaTools::Kernel::launch(
|
||||||
|
plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
|
||||||
|
}
|
||||||
|
|
||||||
|
gm->joinBranch("graphStream", "graphStreamBranch");
|
||||||
|
CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
|
||||||
|
A.view(), B.view());
|
||||||
|
A.updateHost("graphStream");
|
||||||
|
B.updateHost("graphStream");
|
||||||
|
gm->launchHostFunction("graphStream", addNum, A.view(), 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
CudaTools::Manager::get()->addStream("graphStream");
|
||||||
|
CudaTools::Manager::get()->addStream("graphStreamBranch");
|
||||||
|
|
||||||
|
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
|
||||||
|
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
|
||||||
|
|
||||||
|
CudaTools::GraphManager gm;
|
||||||
|
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||||
|
TIME(graph.execute().wait(), ExecuteGraph);
|
||||||
|
|
||||||
|
std::cout << A.slice({{0, 10}}) << "\n";
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
We first create two new streams to be used in the graph, which define the different parallel
|
||||||
|
streams used. To use CUDA Graphs in CudaTools, we expect the graph to be created from a function, which
|
||||||
|
should be written as if it will be executed. Note that we do not need to use ``.wait()`` here, since the function
|
||||||
|
is intended to be captured into the graph. The capture process is done on the creation of the graph, with
|
||||||
|
the name of the origin stream, the function name, and the arguments of the function. Afterwards,
|
||||||
|
simply run ``graph.execute()`` to execute the captured graph. On CPU, it will simply run the function.
|
||||||
|
|
||||||
|
To access the other functionality like graph branching an capturing host functions, it is
|
||||||
|
necessary to use the ``CudaTools::GraphManager`` class, which stores a variety of necessary variables
|
||||||
|
that need to be kept during the lifetime of the graph execution. **Currently, launching host functions sometimes alters the correct blocking of the stream, in particular with copying. It is not yet known if this is an issue with the library or a technicality within CUDA Graphs itself that needs some special care to resolve.** To read more about the syntax, see :ref:`here <CudaTools::GraphManager>`.
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
|
||||||
|
A graph capture essentially 'freezes' the variables used in the capture, like
|
||||||
|
function arguments. As a result, the programmer must take care that the variables
|
||||||
|
are well-defined. This is especially relevant to variables on the heap, where you need
|
||||||
|
to make sure the variable is not a copy. Potentially, always using pointers could work,
|
||||||
|
but is not always necessary. Likely ``.view()`` should always be used when dealing with
|
||||||
|
``CudaTools::Array`` objects.
|
||||||
|
|
||||||
|
|
||||||
BLAS Examples
|
BLAS Examples
|
||||||
=============
|
=============
|
||||||
|
|
||||||
|
|
||||||
Compilation and Linking
|
Compilation and Linking
|
||||||
=======================
|
=======================
|
||||||
To compile with this library, there are only a few things necessary.
|
To compile with this library, there are only a few things necessary. First, this library depends on
|
||||||
First, it is recommended you use the provided template ``Makefile``, which can be
|
`Eigen 3.4.0+ <https://eigen.tuxfamily.org/index.php?title=Main_Page>`__, and must be
|
||||||
|
compiled with C++17. Next, it is recommended you use the provided template ``Makefile``, which can be
|
||||||
easily modified to suit your project needs. It already default handles the compilation
|
easily modified to suit your project needs. It already default handles the compilation
|
||||||
and linking with ``nvcc``, so long as you fulfill a few requirements.
|
and linking with ``nvcc``, so long as you fulfill a few requirements.
|
||||||
|
|
||||||
@ -283,7 +349,7 @@ file for the first example:
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
|
|||||||
@ -1,12 +1,9 @@
|
|||||||
#define CUDATOOLS_IMPLEMENTATION
|
#define CUDATOOLS_IMPLEMENTATION
|
||||||
#include <Core.h>
|
#include <Core.h>
|
||||||
|
|
||||||
DEFINE_KERNEL(add, int x, int y) {
|
KERNEL(add, int x, int y) { printf("Kernel: %i\n", x + y); }
|
||||||
printf("Kernel: %i\n", x + y);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
|
|||||||
@ -3,32 +3,32 @@
|
|||||||
|
|
||||||
class intPair {
|
class intPair {
|
||||||
DEVICE_CLASS(intPair)
|
DEVICE_CLASS(intPair)
|
||||||
public:
|
public:
|
||||||
int x, y;
|
int x, y;
|
||||||
|
|
||||||
intPair(const int x_, const int y_) : x(x_), y(y_) {
|
intPair(const int x_, const int y_) : x(x_), y(y_) {
|
||||||
allocateDevice(); // Allocates memory for this intPair on the device.
|
allocateDevice(); // Allocates memory for this intPair on the device.
|
||||||
updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
|
updateDevice()
|
||||||
};
|
.wait(); // Copies the memory on the host to the device and waits until finished.
|
||||||
|
};
|
||||||
|
|
||||||
HD void swap() {
|
HD void swap() {
|
||||||
int swap = x;
|
int swap = x;
|
||||||
x = y;
|
x = y;
|
||||||
y = swap;
|
y = swap;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
|
KERNEL(swap, intPair* const pair) { pair->swap(); }
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
intPair pair(1, 2);
|
intPair pair(1, 2);
|
||||||
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
|
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
|
||||||
|
|
||||||
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
||||||
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
|
pair.updateHost()
|
||||||
|
.wait(); // Copies the memory from the device back to the host and waits until finished.
|
||||||
|
|
||||||
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
|
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
|
|||||||
@ -2,12 +2,12 @@
|
|||||||
#include <Array.h>
|
#include <Array.h>
|
||||||
#include <Core.h>
|
#include <Core.h>
|
||||||
|
|
||||||
DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
|
KERNEL(times2, const CudaTools::Array<int> arr) {
|
||||||
CudaTools::Array<int> flat = arr.flattened();
|
CudaTools::Array<int> flat = arr.flattened();
|
||||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
|
KERNEL(times2double, const CudaTools::Array<double> arr) {
|
||||||
CudaTools::Array<double> flat = arr.flattened();
|
CudaTools::Array<double> flat = arr.flattened();
|
||||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||||
}
|
}
|
||||||
@ -30,10 +30,15 @@ int main() {
|
|||||||
// Call the kernel multiple times asynchronously. Note: since they share same
|
// Call the kernel multiple times asynchronously. Note: since they share same
|
||||||
// stream, they are not run in parallel, just queued on the device.
|
// stream, they are not run in parallel, just queued on the device.
|
||||||
// NOTE: Notice that a view is passed into the kernel, not the Array itself.
|
// NOTE: Notice that a view is passed into the kernel, not the Array itself.
|
||||||
KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
|
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()),
|
||||||
KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
|
arrRange.view());
|
||||||
KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
|
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()),
|
||||||
KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
|
arrConst.view());
|
||||||
|
CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()),
|
||||||
|
arrLinspace.view());
|
||||||
|
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()),
|
||||||
|
arrComma.view())
|
||||||
|
.wait();
|
||||||
arrRange.updateHost();
|
arrRange.updateHost();
|
||||||
arrConst.updateHost();
|
arrConst.updateHost();
|
||||||
arrLinspace.updateHost();
|
arrLinspace.updateHost();
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
CC := g++-10
|
CC := g++-10
|
||||||
NVCC := nvcc
|
NVCC := nvcc
|
||||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
INCLUDE := ../../
|
INCLUDE := ../../
|
||||||
LIBS_DIR :=
|
LIBS_DIR :=
|
||||||
|
|||||||
95
samples/5_SimpleGraph/Makefile
Normal file
95
samples/5_SimpleGraph/Makefile
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
CC := g++-10
|
||||||
|
NVCC := nvcc
|
||||||
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
|
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||||
|
|
||||||
|
INCLUDE := ../../
|
||||||
|
LIBS_DIR :=
|
||||||
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
|
LIBS :=
|
||||||
|
LIBS_GPU := cuda cudart cublas
|
||||||
|
|
||||||
|
TARGET = simpleGraph
|
||||||
|
SRC_DIR = .
|
||||||
|
BUILD_DIR = build
|
||||||
|
|
||||||
|
# Should not need to modify below.
|
||||||
|
|
||||||
|
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||||
|
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||||
|
|
||||||
|
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||||
|
|
||||||
|
# Get source files and object files.
|
||||||
|
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||||
|
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||||
|
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
|
||||||
|
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||||
|
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||||
|
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
|
||||||
|
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||||
|
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||||
|
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||||
|
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||||
|
|
||||||
|
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||||
|
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||||
|
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||||
|
|
||||||
|
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||||
|
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||||
|
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||||
|
|
||||||
|
INC := $(INCLUDE:%=-I%)
|
||||||
|
LIB := $(LIBS_DIR:%=-L%)
|
||||||
|
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||||
|
LD := $(LIBS:%=-l%)
|
||||||
|
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||||
|
|
||||||
|
# Reminder:
|
||||||
|
# $< = first prerequisite
|
||||||
|
# $@ = the target which matched the rule
|
||||||
|
# $^ = all prerequisites
|
||||||
|
|
||||||
|
.PHONY: all clean
|
||||||
|
|
||||||
|
all : cpu gpu
|
||||||
|
|
||||||
|
cpu: $(TARGET)CPU
|
||||||
|
gpu: $(TARGET)GPU
|
||||||
|
|
||||||
|
$(TARGET)CPU: $(CPU_OBJ)
|
||||||
|
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
|
||||||
|
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||||
|
# regular ones. Then, we link them all together.
|
||||||
|
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
||||||
|
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
||||||
|
$(NVCC) --device-link $^ -o $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
||||||
|
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
|
||||||
|
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
-include $(CPU_DEPS)
|
||||||
|
-include $(GPU_DEPS)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR):
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR):
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
||||||
106
samples/5_SimpleGraph/main.cu.cpp
Normal file
106
samples/5_SimpleGraph/main.cu.cpp
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
#define CUDATOOLS_IMPLEMENTATION
|
||||||
|
#include <Array.h>
|
||||||
|
#include <Core.h>
|
||||||
|
#include <chrono>
|
||||||
|
|
||||||
|
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()
|
||||||
|
|
||||||
|
#define TIME_END(name) \
|
||||||
|
auto end_##name = std::chrono::steady_clock::now(); \
|
||||||
|
auto time_ms_##name = \
|
||||||
|
std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count(); \
|
||||||
|
auto time_mus_##name = \
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count(); \
|
||||||
|
if (time_ms_##name == 0) { \
|
||||||
|
printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name); \
|
||||||
|
} else { \
|
||||||
|
printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define TIME(call, name) \
|
||||||
|
TIME_START(name); \
|
||||||
|
call; \
|
||||||
|
TIME_END(name);
|
||||||
|
|
||||||
|
KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
|
||||||
|
BASIC_LOOP(arr.shape().length()) {
|
||||||
|
if (arr[iThread] % 2) {
|
||||||
|
arr[iThread] = 3 * arr[iThread] + 1;
|
||||||
|
} else {
|
||||||
|
arr[iThread] = arr[iThread] >> 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
|
||||||
|
BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
|
||||||
|
}
|
||||||
|
|
||||||
|
KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
|
||||||
|
BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
|
||||||
|
}
|
||||||
|
|
||||||
|
void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
|
||||||
|
auto Aeig = A.atLeast2D().eigenMap();
|
||||||
|
Aeig = Aeig.array() + num;
|
||||||
|
}
|
||||||
|
|
||||||
|
void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
|
||||||
|
A.updateDevice("graphStream").wait();
|
||||||
|
B.updateDevice("graphStreamBranch").wait();
|
||||||
|
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||||
|
CudaTools::Kernel::launch(
|
||||||
|
collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||||
|
CudaTools::Kernel::launch(
|
||||||
|
plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
|
||||||
|
}
|
||||||
|
|
||||||
|
CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
|
||||||
|
A.view(), B.view())
|
||||||
|
.wait();
|
||||||
|
A.updateHost("graphStream");
|
||||||
|
B.updateHost("graphStream").wait();
|
||||||
|
addNum(A.view(), 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
|
||||||
|
const CudaTools::Array<uint32_t> B) {
|
||||||
|
A.updateDevice("graphStream");
|
||||||
|
gm->makeBranch("graphStream", "graphStreamBranch");
|
||||||
|
B.updateDevice("graphStreamBranch");
|
||||||
|
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||||
|
CudaTools::Kernel::launch(
|
||||||
|
collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||||
|
CudaTools::Kernel::launch(
|
||||||
|
plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
|
||||||
|
}
|
||||||
|
|
||||||
|
gm->joinBranch("graphStream", "graphStreamBranch");
|
||||||
|
CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
|
||||||
|
A.view(), B.view());
|
||||||
|
A.updateHost("graphStream");
|
||||||
|
B.updateHost("graphStream");
|
||||||
|
gm->launchHostFunction("graphStream", addNum, A.view(), 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
CudaTools::Manager::get()->addStream("graphStream");
|
||||||
|
CudaTools::Manager::get()->addStream("graphStreamBranch");
|
||||||
|
|
||||||
|
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
|
||||||
|
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
|
||||||
|
|
||||||
|
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
|
||||||
|
|
||||||
|
std::cout << A.slice({{0, 10}}) << "\n";
|
||||||
|
|
||||||
|
A.setConstant(50);
|
||||||
|
B.setConstant(0);
|
||||||
|
|
||||||
|
CudaTools::GraphManager gm;
|
||||||
|
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||||
|
TIME(graph.execute().wait(), ExecuteGraph);
|
||||||
|
|
||||||
|
std::cout << A.slice({{0, 10}}) << "\n";
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
35
tests.cu.cpp
35
tests.cu.cpp
@ -117,7 +117,7 @@ KERNEL(plusOne, const CT::Array<uint32_t> arr) {
|
|||||||
BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
|
BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
|
||||||
}
|
}
|
||||||
|
|
||||||
KERNEL(addBoth, const CT::Array<uint32_t> a, const CT::Array<uint32_t> b) {
|
KERNEL(addArray, const CT::Array<uint32_t> a, const CT::Array<uint32_t> b) {
|
||||||
BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
|
BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -491,29 +491,28 @@ template <typename T> uint32_t doBLASTests() {
|
|||||||
return failed;
|
return failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
void myHostFunc(const CT::Array<uint32_t> A, uint32_t num) {
|
void addNum(const CT::Array<uint32_t> A, uint32_t num) {
|
||||||
auto Aeig = A.atLeast2D().eigenMap();
|
auto Aeig = A.atLeast2D().eigenMap();
|
||||||
Aeig = Aeig.array() + num;
|
Aeig = Aeig.array() + num;
|
||||||
}
|
}
|
||||||
|
|
||||||
void myBasicGraph(CT::GraphTools* tools, CT::Array<uint32_t>* A, CT::Array<uint32_t>* B) {
|
void myGraph(CT::GraphManager* gm, const CT::Array<uint32_t> A, const CT::Array<uint32_t> B) {
|
||||||
// tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
|
// tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
|
||||||
A->updateDevice("graphStream");
|
A.updateDevice("graphStream");
|
||||||
tools->makeBranch("graphStream", "graphStreamBranch");
|
gm->makeBranch("graphStream", "graphStreamBranch");
|
||||||
B->updateDevice("graphStreamBranch");
|
B.updateDevice("graphStreamBranch");
|
||||||
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||||
CT::Kernel::launch(collatz, CT::Kernel::basic(A->shape().items(), "graphStream"),
|
CT::Kernel::launch(collatz, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||||
A->view());
|
CT::Kernel::launch(plusOne, CT::Kernel::basic(A.shape().items(), "graphStreamBranch"),
|
||||||
CT::Kernel::launch(plusOne, CT::Kernel::basic(A->shape().items(), "graphStreamBranch"),
|
B.view());
|
||||||
B->view());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
tools->joinBranch("graphStream", "graphStreamBranch");
|
gm->joinBranch("graphStream", "graphStreamBranch");
|
||||||
CT::Kernel::launch(addBoth, CT::Kernel::basic(A->shape().items(), "graphStream"), A->view(),
|
CT::Kernel::launch(addArray, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view(),
|
||||||
B->view());
|
B.view());
|
||||||
A->updateHost("graphStream");
|
A.updateHost("graphStream");
|
||||||
B->updateHost("graphStream");
|
B.updateHost("graphStream");
|
||||||
tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
|
gm->launchHostFunction("graphStream", addNum, A.view(), 5);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t doGraphTest() {
|
uint32_t doGraphTest() {
|
||||||
@ -523,8 +522,8 @@ uint32_t doGraphTest() {
|
|||||||
CT::Manager::get()->addStream("graphStream");
|
CT::Manager::get()->addStream("graphStream");
|
||||||
CT::Manager::get()->addStream("graphStreamBranch");
|
CT::Manager::get()->addStream("graphStreamBranch");
|
||||||
|
|
||||||
CT::GraphTools tools;
|
CT::GraphManager gm;
|
||||||
CT::Graph graph("graphStream", myBasicGraph, &tools, &A, &B);
|
CT::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||||
graph.execute().wait();
|
graph.execute().wait();
|
||||||
|
|
||||||
uint32_t errors = 0;
|
uint32_t errors = 0;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user