Updated documentation samples with modifications
This commit is contained in:
parent
167edfea44
commit
8c7b99cd9f
12
Array.h
12
Array.h
@ -556,7 +556,7 @@ template <typename T> class Array {
|
||||
}
|
||||
#ifndef DEVICE
|
||||
if (pDevice != nullptr) {
|
||||
CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
|
||||
CudaTools::copy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
|
||||
}
|
||||
#endif
|
||||
return arr;
|
||||
@ -739,9 +739,8 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const {
|
||||
CT_ERROR(mIsView, "Cannot update host on a view");
|
||||
CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
||||
return stream;
|
||||
CT_ERROR(mIsSlice, "Cannot update host copy on a slice");
|
||||
return CudaTools::copy(pDevice, pHost, mShape.items() * sizeof(T), stream);
|
||||
};
|
||||
|
||||
/**
|
||||
@ -749,9 +748,8 @@ template <typename T> class Array {
|
||||
* \brief Host only
|
||||
*/
|
||||
StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const {
|
||||
CT_ERROR(mIsView, "Cannot update device on a view");
|
||||
CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
||||
return stream;
|
||||
CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
|
||||
return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
156
Core.h
156
Core.h
@ -43,32 +43,26 @@ static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
|
||||
*/
|
||||
void* malloc(const size_t size);
|
||||
|
||||
/**
|
||||
* Pins memory on the host.
|
||||
*/
|
||||
void pin(void* const pHost, const size_t size);
|
||||
|
||||
/**
|
||||
* Pushes memory from the device to the host.
|
||||
*/
|
||||
StreamID push(void* const pHost, void* const pDevice, const size_t size,
|
||||
const StreamID& stream = DEF_MEM_STREAM);
|
||||
/**
|
||||
* Pulls memory from the device back to the host.
|
||||
*/
|
||||
StreamID pull(void* const pHost, void* const pDevice, const size_t size,
|
||||
const StreamID& stream = DEF_MEM_STREAM);
|
||||
/**
|
||||
* Copies memory on the device to another location on the device.
|
||||
*/
|
||||
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
|
||||
const StreamID& stream = DEF_MEM_STREAM);
|
||||
|
||||
/**
|
||||
* Frees memory on the device.
|
||||
*/
|
||||
void free(void* const pDevice);
|
||||
|
||||
/**
|
||||
* Copies memory from the source pointer to the dest pointer.
|
||||
*/
|
||||
StreamID copy(void* const source, void* const dest, const size_t size,
|
||||
const StreamID& stream = DEF_MEM_STREAM);
|
||||
/**
|
||||
* Initializes or sets device memory to a value.
|
||||
*/
|
||||
StreamID memset(void* const pDevice, int value, const size_t size,
|
||||
const StreamID& stream = DEF_MEM_STREAM);
|
||||
/**
|
||||
* Pins memory on the host.
|
||||
*/
|
||||
void pin(void* const pHost, const size_t size);
|
||||
|
||||
#ifdef CUDACC
|
||||
cudaDeviceProp getDeviceProp();
|
||||
static cudaDeviceProp DeviceProperties = getDeviceProp();
|
||||
@ -181,12 +175,12 @@ class Shape {
|
||||
HD uint32_t axes() const; /**< Gets the number of axes. */
|
||||
HD uint32_t items() const; /**< Gets the total number of items. */
|
||||
|
||||
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
|
||||
of the last axis. */
|
||||
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
|
||||
dimension of the last axis. */
|
||||
HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
|
||||
dimension of the second to last axis. */
|
||||
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the
|
||||
dimension of the second to last axis. */
|
||||
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets
|
||||
the dimension of the second to last axis. */
|
||||
|
||||
HD uint32_t
|
||||
dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
|
||||
@ -227,46 +221,6 @@ template <typename F, typename... Args> struct FuncHolder {
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Accessory struct to deal with host callbacks for CUDA Graphs in a nice fashion.
|
||||
*/
|
||||
struct GraphTools {
|
||||
std::vector<void*> mHostData;
|
||||
std::vector<Event*> mEvents;
|
||||
|
||||
~GraphTools();
|
||||
|
||||
/**
|
||||
* Within a function that is being stream captured, launch a host function that can
|
||||
* be captured into the graph.
|
||||
*/
|
||||
|
||||
template <typename F, typename... Args>
|
||||
void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
|
||||
#ifdef CUDACC
|
||||
FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
|
||||
mHostData.push_back((void*)fh);
|
||||
cudaHostFn_t run_func = fh->run;
|
||||
CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
|
||||
#else
|
||||
func(args...);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Makes a new branch in the graph to be run in parallel by a new stream.
|
||||
* \param orig_stream the original stream to branch from.
|
||||
* \param branch_stream the stream of the new branch.
|
||||
*/
|
||||
void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
||||
/**
|
||||
* Joins a existing branch in the graph to collapse a parallel block.
|
||||
* \param orig_stream the original stream to join the branch to.
|
||||
* \param branch_stream the stream of the branch to join.
|
||||
*/
|
||||
void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
||||
};
|
||||
|
||||
/**
|
||||
* A class that manages CUDA Graphs.
|
||||
*/
|
||||
@ -319,6 +273,46 @@ template <typename F, typename... Args> class Graph {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* A struct to facilitate other CUDA Graphs functionality like creating branches and host callbacks.
|
||||
*/
|
||||
struct GraphManager {
|
||||
std::vector<void*> mHostData;
|
||||
std::vector<Event*> mEvents;
|
||||
|
||||
~GraphManager();
|
||||
|
||||
/**
|
||||
* Within a function that is being stream captured, launch a host function that can
|
||||
* be captured into the graph.
|
||||
*/
|
||||
|
||||
template <typename F, typename... Args>
|
||||
void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
|
||||
#ifdef CUDACC
|
||||
FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
|
||||
mHostData.push_back((void*)fh);
|
||||
cudaHostFn_t run_func = fh->run;
|
||||
CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
|
||||
#else
|
||||
func(args...);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Makes a new branch in the graph to be run in parallel by a new stream.
|
||||
* \param orig_stream the original stream to branch from.
|
||||
* \param branch_stream the stream of the new branch.
|
||||
*/
|
||||
void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
||||
/**
|
||||
* Joins a existing branch in the graph to collapse a parallel block.
|
||||
* \param orig_stream the original stream to join the branch to.
|
||||
* \param branch_stream the stream of the branch to join.
|
||||
*/
|
||||
void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
|
||||
};
|
||||
|
||||
}; // namespace CudaTools
|
||||
|
||||
#ifdef CUDATOOLS_IMPLEMENTATION
|
||||
@ -357,27 +351,17 @@ void free(void* const pDevice) {
|
||||
#endif
|
||||
}
|
||||
|
||||
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
|
||||
StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
|
||||
#ifdef CUDACC
|
||||
CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
|
||||
Manager::get()->stream(stream)));
|
||||
CUDA_CHECK(
|
||||
cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
|
||||
#endif
|
||||
return stream;
|
||||
}
|
||||
|
||||
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
|
||||
StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
|
||||
#ifdef CUDACC
|
||||
CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
|
||||
Manager::get()->stream(stream)));
|
||||
#endif
|
||||
return stream;
|
||||
}
|
||||
|
||||
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
|
||||
const StreamID& stream) {
|
||||
#ifdef CUDACC
|
||||
CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
|
||||
Manager::get()->stream(stream)));
|
||||
CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
|
||||
#endif
|
||||
return stream;
|
||||
}
|
||||
@ -661,11 +645,11 @@ void Event::record(const StreamID& stream) {
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////////////////
|
||||
// GraphTools Methods //
|
||||
////////////////////////
|
||||
//////////////////////////
|
||||
// GraphManager Methods //
|
||||
//////////////////////////
|
||||
|
||||
GraphTools::~GraphTools() {
|
||||
GraphManager::~GraphManager() {
|
||||
#ifdef CUDACC
|
||||
for (void* func : mHostData) {
|
||||
delete func;
|
||||
@ -676,14 +660,14 @@ GraphTools::~GraphTools() {
|
||||
#endif
|
||||
}
|
||||
|
||||
void GraphTools::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
||||
void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
||||
Event* event = new Event();
|
||||
event->record(orig_stream);
|
||||
mEvents.push_back(event);
|
||||
branch_stream.wait(*event);
|
||||
}
|
||||
|
||||
void GraphTools::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
||||
void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
|
||||
Event* event = new Event();
|
||||
event->record(branch_stream);
|
||||
mEvents.push_back(event);
|
||||
|
||||
20
Macros.h
20
Macros.h
@ -46,22 +46,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||
*/
|
||||
#define SHARED
|
||||
|
||||
/**
|
||||
* \def DECLARE_KERNEL(call, ...)
|
||||
* Used to declare (in header) a CUDA kernel.
|
||||
* \param call the name of the kernel
|
||||
* \param ... the arguments of the kernel
|
||||
*/
|
||||
#define DECLARE_KERNEL(call, ...)
|
||||
|
||||
/**
|
||||
* \def DEFINE_KERNEL(call, ...)
|
||||
* Used to define (in implementation) a CUDA kernel.
|
||||
* \param call the name of the kernel
|
||||
* \param ... the arguments of the kernel
|
||||
*/
|
||||
#define DEFINE_KERNEL(call, ...)
|
||||
|
||||
/**
|
||||
* \def KERNEL(call, settings, ...)
|
||||
* Used to call a CUDA kernel.
|
||||
@ -178,11 +162,11 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
|
||||
#define UPDATE_FUNC(name) \
|
||||
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
|
||||
CudaTools::DEF_MEM_STREAM) { \
|
||||
return CudaTools::pull(this, that(), sizeof(name)); \
|
||||
return CudaTools::copy(that(), this, sizeof(name)); \
|
||||
}; \
|
||||
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
|
||||
CudaTools::DEF_MEM_STREAM) { \
|
||||
return CudaTools::push(this, that(), sizeof(name)); \
|
||||
return CudaTools::copy(this, that(), sizeof(name)); \
|
||||
}
|
||||
|
||||
#ifdef CUDA
|
||||
|
||||
@ -2,8 +2,9 @@
|
||||
Core.h
|
||||
======
|
||||
|
||||
The ``Core.h`` header file defines some useful types and some macros along with
|
||||
a few core classes.
|
||||
The ``Core.h`` header file defines some useful types and some macros functions
|
||||
to faciliate the dual CPU-CUDA compilation targets. Additionally, it introduces
|
||||
several classes to enable the usage of CUDA streams, kernels, and graphs.
|
||||
|
||||
Types
|
||||
=====
|
||||
@ -33,10 +34,6 @@ Compilation Options
|
||||
Macro Functions
|
||||
===============
|
||||
|
||||
Kernel
|
||||
------
|
||||
.. doxygendefine:: DECLARE_KERNEL
|
||||
.. doxygendefine:: DEFINE_KERNEL
|
||||
.. doxygendefine:: KERNEL
|
||||
|
||||
Device Helpers
|
||||
@ -50,7 +47,21 @@ Device Class
|
||||
.. doxygendefine:: DEVICE_CLASS
|
||||
|
||||
|
||||
Classes and Structs
|
||||
Memory Functions
|
||||
================
|
||||
|
||||
.. doxygenfunction:: CudaTools::malloc
|
||||
|
||||
.. doxygenfunction:: CudaTools::free
|
||||
|
||||
.. doxygenfunction:: CudaTools::copy
|
||||
|
||||
.. doxygenfunction:: CudaTools::memset
|
||||
|
||||
.. doxygenfunction:: CudaTools::pin
|
||||
|
||||
|
||||
Streams and Handles
|
||||
===================
|
||||
|
||||
CudaTools::StreamID
|
||||
@ -64,12 +75,29 @@ CudaTools::Manager
|
||||
.. doxygenclass:: CudaTools::Manager
|
||||
:members:
|
||||
|
||||
Kernels
|
||||
=======
|
||||
|
||||
.. doxygenfunction:: CudaTools::Kernel::launch
|
||||
|
||||
.. doxygenfunction:: CudaTools::Kernel::basic
|
||||
|
||||
CudaTools::Kernel::Settings
|
||||
---------------------------
|
||||
|
||||
.. doxygenstruct:: CudaTools::Kernel::Settings
|
||||
:members:
|
||||
|
||||
CudaTools::Kernel::Basic
|
||||
------------------------
|
||||
.. doxygenfunction:: CudaTools::Kernel::basic
|
||||
|
||||
Graphs
|
||||
======
|
||||
|
||||
CudaTools::Graph
|
||||
----------------
|
||||
.. doxygenclass:: CudaTools::Graph
|
||||
:members:
|
||||
|
||||
CudaTools::GraphManager
|
||||
-----------------------
|
||||
.. doxygenstruct:: CudaTools::GraphManager
|
||||
:members:
|
||||
|
||||
@ -33,12 +33,12 @@ macros provided. For example,
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
DEFINE_KERNEL(add, int x, int y) {
|
||||
KERNEL(add, int x, int y) {
|
||||
printf("Kernel: %i\n", x + y);
|
||||
}
|
||||
|
||||
int main() {
|
||||
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
||||
CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -79,13 +79,13 @@ being device-compatible. We follow the previous example in a similar fashion.
|
||||
};
|
||||
};
|
||||
|
||||
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
|
||||
KERNEL(swap, intPair* const pair) { pair->swap(); }
|
||||
|
||||
int main() {
|
||||
intPair pair(1, 2);
|
||||
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
|
||||
|
||||
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
||||
CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
||||
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
|
||||
|
||||
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
|
||||
@ -129,7 +129,7 @@ compiled for CPU, then everything will run synchronously, as per usual.
|
||||
|
||||
Array Examples
|
||||
==============
|
||||
This file introduces the ``Array`` class, which is a class that provides automatic
|
||||
The ``Array.h`` file introduces the ``Array`` class, which is a class that provides automatic
|
||||
memory management between device and host. In particular, it provides functionality on
|
||||
both the host and device while handling proper memory destruction, with many nice
|
||||
features. In particular it supports mimics many features of the Python package NumPy.`
|
||||
@ -137,12 +137,12 @@ We can demonstrate a few here.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
|
||||
KERNEL(times2, const CudaTools::Array<int> arr) {
|
||||
CudaTools::Array<int> flat = arr.flattened();
|
||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||
}
|
||||
|
||||
DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
|
||||
KERNEL(times2double, const CudaTools::Array<double> arr) {
|
||||
CudaTools::Array<double> flat = arr.flattened();
|
||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||
}
|
||||
@ -165,10 +165,10 @@ We can demonstrate a few here.
|
||||
// Call the kernel multiple times asynchronously. Note: since they share same
|
||||
// stream, they are not run in parallel, just queued on the device.
|
||||
// NOTE: Notice that a view is passed into the kernel, not the Array itself.
|
||||
KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
|
||||
KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
|
||||
KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
|
||||
KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
|
||||
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
|
||||
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
|
||||
CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
|
||||
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
|
||||
arrRange.updateHost();
|
||||
arrConst.updateHost();
|
||||
arrLinspace.updateHost();
|
||||
@ -239,14 +239,80 @@ view manually with the ``.view()`` function.
|
||||
programmer to manage this.
|
||||
|
||||
|
||||
Graph Examples
|
||||
==============
|
||||
Additionally, there is support for CUDA Graphs, a way of defining a series of kernel
|
||||
launches and executing later, potentially reducing overhead and timing, as well as
|
||||
control the specific parallel workflow between CPU and GPU. The following
|
||||
snippet illustrates this
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
|
||||
const CudaTools::Array<uint32_t> B) {
|
||||
A.updateDevice("graphStream");
|
||||
gm->makeBranch("graphStream", "graphStreamBranch");
|
||||
B.updateDevice("graphStreamBranch");
|
||||
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||
CudaTools::Kernel::launch(
|
||||
collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||
CudaTools::Kernel::launch(
|
||||
plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
|
||||
}
|
||||
|
||||
gm->joinBranch("graphStream", "graphStreamBranch");
|
||||
CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
|
||||
A.view(), B.view());
|
||||
A.updateHost("graphStream");
|
||||
B.updateHost("graphStream");
|
||||
gm->launchHostFunction("graphStream", addNum, A.view(), 5);
|
||||
}
|
||||
|
||||
int main() {
|
||||
CudaTools::Manager::get()->addStream("graphStream");
|
||||
CudaTools::Manager::get()->addStream("graphStreamBranch");
|
||||
|
||||
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
|
||||
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
|
||||
|
||||
CudaTools::GraphManager gm;
|
||||
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||
TIME(graph.execute().wait(), ExecuteGraph);
|
||||
|
||||
std::cout << A.slice({{0, 10}}) << "\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
We first create two new streams to be used in the graph, which define the different parallel
|
||||
streams used. To use CUDA Graphs in CudaTools, we expect the graph to be created from a function, which
|
||||
should be written as if it will be executed. Note that we do not need to use ``.wait()`` here, since the function
|
||||
is intended to be captured into the graph. The capture process is done on the creation of the graph, with
|
||||
the name of the origin stream, the function name, and the arguments of the function. Afterwards,
|
||||
simply run ``graph.execute()`` to execute the captured graph. On CPU, it will simply run the function.
|
||||
|
||||
To access the other functionality like graph branching an capturing host functions, it is
|
||||
necessary to use the ``CudaTools::GraphManager`` class, which stores a variety of necessary variables
|
||||
that need to be kept during the lifetime of the graph execution. **Currently, launching host functions sometimes alters the correct blocking of the stream, in particular with copying. It is not yet known if this is an issue with the library or a technicality within CUDA Graphs itself that needs some special care to resolve.** To read more about the syntax, see :ref:`here <CudaTools::GraphManager>`.
|
||||
|
||||
.. warning::
|
||||
|
||||
A graph capture essentially 'freezes' the variables used in the capture, like
|
||||
function arguments. As a result, the programmer must take care that the variables
|
||||
are well-defined. This is especially relevant to variables on the heap, where you need
|
||||
to make sure the variable is not a copy. Potentially, always using pointers could work,
|
||||
but is not always necessary. Likely ``.view()`` should always be used when dealing with
|
||||
``CudaTools::Array`` objects.
|
||||
|
||||
|
||||
BLAS Examples
|
||||
=============
|
||||
|
||||
|
||||
Compilation and Linking
|
||||
=======================
|
||||
To compile with this library, there are only a few things necessary.
|
||||
First, it is recommended you use the provided template ``Makefile``, which can be
|
||||
To compile with this library, there are only a few things necessary. First, this library depends on
|
||||
`Eigen 3.4.0+ <https://eigen.tuxfamily.org/index.php?title=Main_Page>`__, and must be
|
||||
compiled with C++17. Next, it is recommended you use the provided template ``Makefile``, which can be
|
||||
easily modified to suit your project needs. It already default handles the compilation
|
||||
and linking with ``nvcc``, so long as you fulfill a few requirements.
|
||||
|
||||
@ -283,7 +349,7 @@ file for the first example:
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
|
||||
INCLUDE := ../../
|
||||
LIBS_DIR :=
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
|
||||
INCLUDE := ../../
|
||||
LIBS_DIR :=
|
||||
|
||||
@ -1,12 +1,9 @@
|
||||
#define CUDATOOLS_IMPLEMENTATION
|
||||
#include <Core.h>
|
||||
|
||||
DEFINE_KERNEL(add, int x, int y) {
|
||||
printf("Kernel: %i\n", x + y);
|
||||
}
|
||||
KERNEL(add, int x, int y) { printf("Kernel: %i\n", x + y); }
|
||||
|
||||
int main() {
|
||||
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
||||
CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
|
||||
INCLUDE := ../../
|
||||
LIBS_DIR :=
|
||||
|
||||
@ -8,7 +8,8 @@ class intPair {
|
||||
|
||||
intPair(const int x_, const int y_) : x(x_), y(y_) {
|
||||
allocateDevice(); // Allocates memory for this intPair on the device.
|
||||
updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
|
||||
updateDevice()
|
||||
.wait(); // Copies the memory on the host to the device and waits until finished.
|
||||
};
|
||||
|
||||
HD void swap() {
|
||||
@ -18,17 +19,16 @@ class intPair {
|
||||
};
|
||||
};
|
||||
|
||||
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
|
||||
KERNEL(swap, intPair* const pair) { pair->swap(); }
|
||||
|
||||
int main() {
|
||||
intPair pair(1, 2);
|
||||
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
|
||||
|
||||
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
||||
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
|
||||
CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
|
||||
pair.updateHost()
|
||||
.wait(); // Copies the memory from the device back to the host and waits until finished.
|
||||
|
||||
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
|
||||
INCLUDE := ../../
|
||||
LIBS_DIR :=
|
||||
|
||||
@ -2,12 +2,12 @@
|
||||
#include <Array.h>
|
||||
#include <Core.h>
|
||||
|
||||
DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
|
||||
KERNEL(times2, const CudaTools::Array<int> arr) {
|
||||
CudaTools::Array<int> flat = arr.flattened();
|
||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||
}
|
||||
|
||||
DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
|
||||
KERNEL(times2double, const CudaTools::Array<double> arr) {
|
||||
CudaTools::Array<double> flat = arr.flattened();
|
||||
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
|
||||
}
|
||||
@ -30,10 +30,15 @@ int main() {
|
||||
// Call the kernel multiple times asynchronously. Note: since they share same
|
||||
// stream, they are not run in parallel, just queued on the device.
|
||||
// NOTE: Notice that a view is passed into the kernel, not the Array itself.
|
||||
KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
|
||||
KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
|
||||
KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
|
||||
KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
|
||||
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()),
|
||||
arrRange.view());
|
||||
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()),
|
||||
arrConst.view());
|
||||
CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()),
|
||||
arrLinspace.view());
|
||||
CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()),
|
||||
arrComma.view())
|
||||
.wait();
|
||||
arrRange.updateHost();
|
||||
arrConst.updateHost();
|
||||
arrLinspace.updateHost();
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
|
||||
INCLUDE := ../../
|
||||
LIBS_DIR :=
|
||||
|
||||
95
samples/5_SimpleGraph/Makefile
Normal file
95
samples/5_SimpleGraph/Makefile
Normal file
@ -0,0 +1,95 @@
|
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
|
||||
|
||||
INCLUDE := ../../
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
LIBS_GPU := cuda cudart cublas
|
||||
|
||||
TARGET = simpleGraph
|
||||
SRC_DIR = .
|
||||
BUILD_DIR = build
|
||||
|
||||
# Should not need to modify below.
|
||||
|
||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||
|
||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||
|
||||
# Get source files and object files.
|
||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
|
||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
|
||||
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||
|
||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||
|
||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||
|
||||
INC := $(INCLUDE:%=-I%)
|
||||
LIB := $(LIBS_DIR:%=-L%)
|
||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||
LD := $(LIBS:%=-l%)
|
||||
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||
|
||||
# Reminder:
|
||||
# $< = first prerequisite
|
||||
# $@ = the target which matched the rule
|
||||
# $^ = all prerequisites
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
all : cpu gpu
|
||||
|
||||
cpu: $(TARGET)CPU
|
||||
gpu: $(TARGET)GPU
|
||||
|
||||
$(TARGET)CPU: $(CPU_OBJ)
|
||||
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
|
||||
|
||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||
|
||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||
# regular ones. Then, we link them all together.
|
||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||
|
||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
|
||||
$(NVCC) --device-link $^ -o $@
|
||||
|
||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
|
||||
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||
|
||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||
|
||||
-include $(CPU_DEPS)
|
||||
-include $(GPU_DEPS)
|
||||
|
||||
$(CPU_BUILD_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
$(GPU_BUILD_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
clean:
|
||||
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
||||
106
samples/5_SimpleGraph/main.cu.cpp
Normal file
106
samples/5_SimpleGraph/main.cu.cpp
Normal file
@ -0,0 +1,106 @@
|
||||
#define CUDATOOLS_IMPLEMENTATION
|
||||
#include <Array.h>
|
||||
#include <Core.h>
|
||||
#include <chrono>
|
||||
|
||||
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()
|
||||
|
||||
#define TIME_END(name) \
|
||||
auto end_##name = std::chrono::steady_clock::now(); \
|
||||
auto time_ms_##name = \
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count(); \
|
||||
auto time_mus_##name = \
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count(); \
|
||||
if (time_ms_##name == 0) { \
|
||||
printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name); \
|
||||
} else { \
|
||||
printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name); \
|
||||
}
|
||||
|
||||
#define TIME(call, name) \
|
||||
TIME_START(name); \
|
||||
call; \
|
||||
TIME_END(name);
|
||||
|
||||
KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
|
||||
BASIC_LOOP(arr.shape().length()) {
|
||||
if (arr[iThread] % 2) {
|
||||
arr[iThread] = 3 * arr[iThread] + 1;
|
||||
} else {
|
||||
arr[iThread] = arr[iThread] >> 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
|
||||
BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
|
||||
}
|
||||
|
||||
KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
|
||||
BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
|
||||
}
|
||||
|
||||
void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
|
||||
auto Aeig = A.atLeast2D().eigenMap();
|
||||
Aeig = Aeig.array() + num;
|
||||
}
|
||||
|
||||
void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
|
||||
A.updateDevice("graphStream").wait();
|
||||
B.updateDevice("graphStreamBranch").wait();
|
||||
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||
CudaTools::Kernel::launch(
|
||||
collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||
CudaTools::Kernel::launch(
|
||||
plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
|
||||
}
|
||||
|
||||
CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
|
||||
A.view(), B.view())
|
||||
.wait();
|
||||
A.updateHost("graphStream");
|
||||
B.updateHost("graphStream").wait();
|
||||
addNum(A.view(), 5);
|
||||
}
|
||||
|
||||
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
|
||||
const CudaTools::Array<uint32_t> B) {
|
||||
A.updateDevice("graphStream");
|
||||
gm->makeBranch("graphStream", "graphStreamBranch");
|
||||
B.updateDevice("graphStreamBranch");
|
||||
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||
CudaTools::Kernel::launch(
|
||||
collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||
CudaTools::Kernel::launch(
|
||||
plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
|
||||
}
|
||||
|
||||
gm->joinBranch("graphStream", "graphStreamBranch");
|
||||
CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
|
||||
A.view(), B.view());
|
||||
A.updateHost("graphStream");
|
||||
B.updateHost("graphStream");
|
||||
gm->launchHostFunction("graphStream", addNum, A.view(), 5);
|
||||
}
|
||||
|
||||
int main() {
|
||||
CudaTools::Manager::get()->addStream("graphStream");
|
||||
CudaTools::Manager::get()->addStream("graphStreamBranch");
|
||||
|
||||
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
|
||||
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
|
||||
|
||||
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
|
||||
|
||||
std::cout << A.slice({{0, 10}}) << "\n";
|
||||
|
||||
A.setConstant(50);
|
||||
B.setConstant(0);
|
||||
|
||||
CudaTools::GraphManager gm;
|
||||
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||
TIME(graph.execute().wait(), ExecuteGraph);
|
||||
|
||||
std::cout << A.slice({{0, 10}}) << "\n";
|
||||
return 0;
|
||||
}
|
||||
35
tests.cu.cpp
35
tests.cu.cpp
@ -117,7 +117,7 @@ KERNEL(plusOne, const CT::Array<uint32_t> arr) {
|
||||
BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
|
||||
}
|
||||
|
||||
KERNEL(addBoth, const CT::Array<uint32_t> a, const CT::Array<uint32_t> b) {
|
||||
KERNEL(addArray, const CT::Array<uint32_t> a, const CT::Array<uint32_t> b) {
|
||||
BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
|
||||
}
|
||||
|
||||
@ -491,29 +491,28 @@ template <typename T> uint32_t doBLASTests() {
|
||||
return failed;
|
||||
}
|
||||
|
||||
void myHostFunc(const CT::Array<uint32_t> A, uint32_t num) {
|
||||
void addNum(const CT::Array<uint32_t> A, uint32_t num) {
|
||||
auto Aeig = A.atLeast2D().eigenMap();
|
||||
Aeig = Aeig.array() + num;
|
||||
}
|
||||
|
||||
void myBasicGraph(CT::GraphTools* tools, CT::Array<uint32_t>* A, CT::Array<uint32_t>* B) {
|
||||
void myGraph(CT::GraphManager* gm, const CT::Array<uint32_t> A, const CT::Array<uint32_t> B) {
|
||||
// tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
|
||||
A->updateDevice("graphStream");
|
||||
tools->makeBranch("graphStream", "graphStreamBranch");
|
||||
B->updateDevice("graphStreamBranch");
|
||||
A.updateDevice("graphStream");
|
||||
gm->makeBranch("graphStream", "graphStreamBranch");
|
||||
B.updateDevice("graphStreamBranch");
|
||||
for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
|
||||
CT::Kernel::launch(collatz, CT::Kernel::basic(A->shape().items(), "graphStream"),
|
||||
A->view());
|
||||
CT::Kernel::launch(plusOne, CT::Kernel::basic(A->shape().items(), "graphStreamBranch"),
|
||||
B->view());
|
||||
CT::Kernel::launch(collatz, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view());
|
||||
CT::Kernel::launch(plusOne, CT::Kernel::basic(A.shape().items(), "graphStreamBranch"),
|
||||
B.view());
|
||||
}
|
||||
|
||||
tools->joinBranch("graphStream", "graphStreamBranch");
|
||||
CT::Kernel::launch(addBoth, CT::Kernel::basic(A->shape().items(), "graphStream"), A->view(),
|
||||
B->view());
|
||||
A->updateHost("graphStream");
|
||||
B->updateHost("graphStream");
|
||||
tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
|
||||
gm->joinBranch("graphStream", "graphStreamBranch");
|
||||
CT::Kernel::launch(addArray, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view(),
|
||||
B.view());
|
||||
A.updateHost("graphStream");
|
||||
B.updateHost("graphStream");
|
||||
gm->launchHostFunction("graphStream", addNum, A.view(), 5);
|
||||
}
|
||||
|
||||
uint32_t doGraphTest() {
|
||||
@ -523,8 +522,8 @@ uint32_t doGraphTest() {
|
||||
CT::Manager::get()->addStream("graphStream");
|
||||
CT::Manager::get()->addStream("graphStreamBranch");
|
||||
|
||||
CT::GraphTools tools;
|
||||
CT::Graph graph("graphStream", myBasicGraph, &tools, &A, &B);
|
||||
CT::GraphManager gm;
|
||||
CT::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
|
||||
graph.execute().wait();
|
||||
|
||||
uint32_t errors = 0;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user