Boost C++ Libraries



CUDA

CUDA (Compute Unified Device Architecture) is a platform for parallel computing on NVIDIA GPUs. Its application programming interface gives access to the GPU's instruction set and computation resources (the execution of compute kernels).

Synchronization with CUDA streams

CUDA operations such as compute kernels or memory transfers (between host and device) can be grouped and queued on CUDA streams, which are executed on the GPU. Boost.Fiber enables a fiber to sleep (suspend) until a CUDA stream has completed its operations and to resume once the stream has finished. This lets an application run other fibers on the CPU in the meantime, without spawning additional OS threads. The following example queues memory transfers and a kernel on a stream, then suspends the calling fiber until the stream completes:

// averages three entries of a and of b (indices wrap modulo 256)
// and stores the mean of the two averages in c
__global__
void kernel( int size, int * a, int * b, int * c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx < size) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

boost::fibers::fiber f([]{
    cudaStream_t stream;
    cudaStreamCreate( & stream);
    int size = 1024 * 1024;
    int full_size = 20 * size;
    // page-locked (pinned) host memory is required for the
    // asynchronous copies below to overlap with computation
    int * host_a, * host_b, * host_c;
    cudaHostAlloc( & host_a, full_size * sizeof( int), cudaHostAllocDefault);
    cudaHostAlloc( & host_b, full_size * sizeof( int), cudaHostAllocDefault);
    cudaHostAlloc( & host_c, full_size * sizeof( int), cudaHostAllocDefault);
    // device buffers hold one chunk of `size` elements at a time
    int * dev_a, * dev_b, * dev_c;
    cudaMalloc( & dev_a, size * sizeof( int) );
    cudaMalloc( & dev_b, size * sizeof( int) );
    cudaMalloc( & dev_c, size * sizeof( int) );
    // fill the host input buffers with random values
    std::minstd_rand generator;
    std::uniform_int_distribution<> distribution(1, 6);
    for ( int i = 0; i < full_size; ++i) {
        host_a[i] = distribution( generator);
        host_b[i] = distribution( generator);
    }
    // process the data chunk-wise: queue copies and the kernel on the stream
    for ( int i = 0; i < full_size; i += size) {
        cudaMemcpyAsync( dev_a, host_a + i, size * sizeof( int), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync( dev_b, host_b + i, size * sizeof( int), cudaMemcpyHostToDevice, stream);
        kernel<<< size / 256, 256, 0, stream >>>( size, dev_a, dev_b, dev_c);
        cudaMemcpyAsync( host_c + i, dev_c, size * sizeof( int), cudaMemcpyDeviceToHost, stream);
    }
    auto result = boost::fibers::cuda::waitfor_all( stream); // suspend fiber till CUDA stream has finished
    BOOST_ASSERT( stream == std::get< 0 >( result) );
    BOOST_ASSERT( cudaSuccess == std::get< 1 >( result) );
    std::cout << "f1: GPU computation finished" << std::endl;
    cudaFreeHost( host_a);
    cudaFreeHost( host_b);
    cudaFreeHost( host_c);
    cudaFree( dev_a);
    cudaFree( dev_b);
    cudaFree( dev_c);
    cudaStreamDestroy( stream);
});
f.join();
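Because the waiting fiber only suspends itself, other fibers sharing the same OS thread keep running while the GPU works. The following sketch is illustrative and not part of the library example; the fiber names and the CPU work are hypothetical, and the queued stream operations are elided:

boost::fibers::fiber gpu_fiber([]{
    cudaStream_t stream;
    cudaStreamCreate( & stream);
    // ... queue cudaMemcpyAsync()/kernel launches on `stream` as above ...
    boost::fibers::cuda::waitfor_all( stream); // suspends only this fiber
    cudaStreamDestroy( stream);
});
boost::fibers::fiber cpu_fiber([]{
    // scheduled on the same OS thread while gpu_fiber is suspended
    std::cout << "f2: CPU work while the GPU is busy" << std::endl;
});
gpu_fiber.join();
cpu_fiber.join();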

Synopsis
#include <boost/fiber/cuda/waitfor.hpp>

namespace boost {
namespace fibers {
namespace cuda {

std::tuple< cudaStream_t, cudaError_t > waitfor_all( cudaStream_t st);
std::vector< std::tuple< cudaStream_t, cudaError_t > > waitfor_all( cudaStream_t ... st);

}}}

Non-member function cuda::waitfor_all()

#include <boost/fiber/cuda/waitfor.hpp>

namespace boost {
namespace fibers {
namespace cuda {

std::tuple< cudaStream_t, cudaError_t > waitfor_all( cudaStream_t st);
std::vector< std::tuple< cudaStream_t, cudaError_t > > waitfor_all( cudaStream_t ... st);

}}}

Effects:

Suspends the active fiber until the CUDA stream has completed its operations.

Returns:

A tuple of the passed stream and the CUDA stream status. The overload accepting several streams returns a vector containing one such tuple per stream.
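
The overload accepting several streams suspends the fiber until all of them have finished. A minimal sketch, assuming two independent streams s0 and s1 (the names and the queued work are hypothetical):

cudaStream_t s0, s1;
cudaStreamCreate( & s0);
cudaStreamCreate( & s1);
// ... queue independent copies/kernels on s0 and s1 ...
// suspend until both streams are done; one (stream, status) tuple per stream
auto results = boost::fibers::cuda::waitfor_all( s0, s1);
for ( auto & r : results) {
    BOOST_ASSERT( cudaSuccess == std::get< 1 >( r) );
}
cudaStreamDestroy( s0);
cudaStreamDestroy( s1);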

