vexcl-2017-delft

VexCL Rapid development of scientiﬁc code for GPUs Denis Demidov
Institute of System Analysis / Russian Academy of Sciences, Kazan Federal University , June 2017

VexCL — a vector expression template library for OpenCL/CUDA Fork
m e on G itH ub Created for ease of C++ based GPGPU development: Convenient notation for vector expressions OpenCL/CUDA JIT code generation Easily combined with existing libraries/code Header-only Supported backends: OpenCL (Khronos C++ bindings, Boost.Compute) NVIDIA CUDA OpenMP Maxeler FPGAs (in progress) The source code is available under the MIT license: https://github.com/ddemidov/vexcl

Hello OpenCL Compute sum of two vectors in parallel: A
and B are large vectors. Compute A = A + B. Overview of (any) OpenCL solution: 1 Initialize OpenCL context 2 Allocate memory 3 Transfer input data 4 Run computations 5 Get the results

Hello OpenCL #include <iostream> #include <vector> #include <string> #include <stdexcept>
#define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> int main() { std::vector<cl::Platform> platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector<cl::Device> device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector<cl::Device> dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector<double> a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast<cl_ulong>(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; }

#define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> int main() { std::vector<cl::Platform> platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector<cl::Device> device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector<cl::Device> dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector<double> a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast<cl_ulong>(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 1. Query platforms 10 std::vector<cl::Platform> platform; 11 cl::Platform::get(&platform); 12 13 if (platform.empty()) 14 throw std::runtime_error("No OpenCL platforms");

#define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> int main() { std::vector<cl::Platform> platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector<cl::Device> device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector<cl::Device> dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector<double> a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast<cl_ulong>(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 2. Get the ﬁrst available GPU 16 cl::Context context; 17 std::vector<cl::Device> device; 18 for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { 19 std::vector<cl::Device> dev; 20 p->getDevices(CL_DEVICE_TYPE_GPU, &dev); 21 for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { 22 if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; 23 device.push_back(*d); 24 try { 25 context = cl::Context(device); 26 } catch(...) { 27 device.clear(); 28 } 29 } 30 } 31 if (device.empty()) throw std::runtime_error("No GPUs"); 32 33 cl::CommandQueue queue(context, device[0]);

#define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> int main() { std::vector<cl::Platform> platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector<cl::Device> device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector<cl::Device> dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector<double> a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast<cl_ulong>(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 3. Prepare and copy input data 35 const size_t n = 1024 * 1024; 36 std::vector<double> a(n, 1.5), b(n, 2.7); 37 38 cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 39 a.size() * sizeof(a[0]), a.data()); 40 41 cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 42 b.size() * sizeof(b[0]), b.data());

#define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> int main() { std::vector<cl::Platform> platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector<cl::Device> device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector<cl::Device> dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector<double> a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast<cl_ulong>(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 4. Write and compile the compute kernel 44 std::string source = R"( 45 kernel void add(ulong n, global const double *a, global double *b) { 46 ulong i = get_global_id(0); 47 if (i < n) b[i] += a[i]; 48 } 49 )"; 50 51 cl::Program program(context, source); 52 53 try { 54 program.build(device); 55 } catch (const cl::Error&) { 56 std::cerr 57 << "OpenCL compilation error" << std::endl 58 << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) 59 << std::endl; 60 throw std::runtime_error("OpenCL build error"); 61 } 62 cl::Kernel add(program, "add");

#define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> int main() { std::vector<cl::Platform> platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector<cl::Device> device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector<cl::Device> dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector<double> a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast<cl_ulong>(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 5. Set kernel arguments and launch the computation 64 add.setArg(0, static_cast<cl_ulong>(n)); 65 add.setArg(1, A); 66 add.setArg(2, B); 67 68 queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange);

#define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> int main() { std::vector<cl::Platform> platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector<cl::Device> device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector<cl::Device> dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo<CL_DEVICE_AVAILABLE>()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector<double> a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast<cl_ulong>(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 6. Get the results from the compute device 70 queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); 71 std::cout << b[42] << std::endl;

Hello VexCL hello.cpp 1 #include <iostream> 2 #include <vector> 3
#include <vexcl/vexcl.hpp> 4 5 int main() { 6 vex::Context ctx( vex::Filter::GPU ); 7 if (!ctx) throw std::runtime_error("No GPUs"); 8 9 size_t n = 1024 * 1024; 10 std::vector<double> a(n, 1.5), b(n, 2.7); 11 12 vex::vector<double> A(ctx, a); 13 vex::vector<double> B(ctx, b); 14 15 B += A; 16 17 vex::copy(B, b); 18 std::cout << b[42] << std::endl; 19 } CMakeLists.txt 1 cmake_minimum_required(VERSION 3.1) 2 project(hello) 3 4 # When VexCL is installed system wide: 5 find_package(VexCL) 6 7 # When VexCL is copied into a subfolder: 8 # add_subdirectory(vexcl) 9 10 add_executable(hello hello.cpp) 11 target_link_libraries(hello VexCL::OpenCL)

Interface

Initialization Multi-device and multi-platform computations are supported. VexCL context is
initialized from combination of device ﬁlters. Initialize VexCL context on selected devices 1 vex::Context ctx( vex::Filter::Any );

initialized from combination of device ﬁlters. Initialize VexCL context on selected devices 1 vex::Context ctx( vex::Filter::GPU );

initialized from combination of device ﬁlters. Initialize VexCL context on selected devices 1 vex::Context ctx(vex::Filter::Name("Intel"));

initialized from combination of device ﬁlters. Initialize VexCL context on selected devices 1 vex::Context ctx(vex::Filter::GPU && vex::Filter::Platform("AMD"));

Memory and work splitting 1 vex::Context ctx( vex::Filter::Count(1) ); 2
3 vex::vector<double> x(ctx, N); 4 vex::vector<double> y(ctx, N); 5 6 x = vex::element_index() * (1.0 / N); 7 y = sin(2 * x) + sqrt(1 - x * x); x y

Memory and work splitting 1 vex::Context ctx( vex::Filter::GPU ); 2

Memory and work splitting 1 vex::Context ctx( vex::Filter::DoublePrecision ); 2

Copies between host and device memory 1 vex::vector<double> d(ctx, n);
2 std::vector<double> h(n); 3 double a[100]; STL-like range copies 1 vex::copy(d.begin(), d.end(), h.begin()); 2 vex::copy(d.begin(), d.begin() + 100, a); Simple copies 1 vex::copy(d, h); 2 vex::copy(h, d); Map OpenCL buﬀer to host pointer 1 auto p = d.map(devnum); 2 std::sort(&p[0], &p[d.part_size(devnum)]); Access single element (slow) 1 double v = d[42]; 2 d[0] = 0;

What vector expressions are supported? All vectors in an expression
have to be compatible: Have same size Located on same devices What may be used: Vectors, scalars, constants Arithmetic, logical operators Built-in functions User-deﬁned functions Random number generators Slicing and permutations Reduce to a scalar (sum, min, max) Reduce across chosen dimensions Stencil operations Sparse matrix – vector products Fast Fourier Transform Sort, scan, reduce by key

Builtin operations and functions This expression: 1 x = 2
* y - sin(z); export VEXCL_SHOW_KERNELS=1 to see the generated code. . . . results in this kernel: 1 kernel void vexcl_vector_kernel( 2 ulong n, 3 global double * prm_1, 4 int prm_2, 5 global double * prm_3, 6 global double * prm_4 7 ) 8 { 9 for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) { 10 prm_1[idx] = ( ( prm_2 * prm_3[idx] ) - sin( prm_4[idx] ) ); 11 } 12 } − ∗ sin 2 y z

User-deﬁned functions Deﬁning a function: 1 VEX_FUNCTION( double, sqr, (double,
x)(double, y), 2 return x * x + y * y; 3 ); Using the function: 1 Z = sqrt( sqr(X, Y) );

User functions are translated to OpenCL functions 1 Z =
sqrt( sqr(X, Y) ); . . . gets translated to: 1 double sqr(double x, double y) { 2 return x * x + y * y; 3 } 4 5 kernel void vexcl_vector_kernel( 6 ulong n, 7 global double * prm_1, 8 global double * prm_2, 9 global double * prm_3 10 ) 11 { 12 for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) { 13 prm_1[idx] = sqrt( sqr( prm_2[idx], prm_3[idx] ) ); 14 } 15 } sqrt sqr x y

Random number generation VexCL provides1 counter-based random number generators from
Random1232 suite. The generators are stateless; mixing functions are applied to element indices. Implemented families: threefry and philox. Both pass TestU01/BigCrush; up to 264 independent streams with a period of 2128. Performance: ≈ 1010 Samples/sec (Tesla K20c). vex::Random<T,G> — uniform distribution. vex::RandomNormal<T,G> — normal distribution. 1 vex::Random<double> rnd; 2 vex::vector<double> x(ctx, n); 3 4 x = rnd(vex::element_index(), std::rand()); 1Contributed by Pascal Germroth [email protected] 2D. E. Shaw Research, http://www.deshawresearch.com/resources random123.html

Reductions Class vex::Reductor<T, kind> allows to reduce arbitrary vector expression
to a single value of type T. Supported reduction kinds: SUM, SUM_Kahan, MIN, MAX Inner product 1 vex::Reductor<double, vex::SUM> sum(ctx); 2 double s = sum(x * y); Number of elements in x between 0 and 1 1 vex::Reductor<size_t, vex::SUM> sum(ctx); 2 size_t n = sum( (x > 0) && (x < 1) ); Maximum distance from origin 1 vex::Reductor<double, vex::MAX> max(ctx); 2 double d = max( sqrt(x * x + y * y) );

Sparse matrix – vector products Class vex::sparse::matrix<T> holds representation of
a sparse matrix on compute devices. Constructor accepts matrix in common CRS format: row indices, columns and values of nonzero entries. Construct matrix 1 vex::sparse::matrix<double> A(ctx, n, n, ptr, col, val); Compute residual value 2 // vex::vector<double> u, f, r; 3 r = f - A * u; 4 double res = max( fabs(r) );

Raw pointer arithmetic raw_pointer(const vector<T>&) function returns pointer to vector’s
data inside vector expression. May be used to implement complex access patterns. 1D Laplace operator: 1 auto ptr = vex::raw_pointer(x); 2 auto idx = vex::element_index(); 3 4 auto left = if_else(idx > 0, i - 1, i); 5 auto right = if_else(idx + 1 < n, i + 1, i); 6 7 y = 2 * ptr[idx] - ptr[left] - ptr[right];

N-body problem with raw pointers yi = j=i e−|xi −xj
| 1 VEX_FUNCTION(double, nbody, (size_t, n)(size_t, i)(double*, x), 2 double sum = 0, myval = x[i]; 3 for(size_t j = 0; j < n; ++j) 4 if (j != i) sum += exp(-fabs(x[j] - myval)); 5 return sum; 6 ); 7 8 y = nbody(x.size(), vex::element_index(), raw_pointer(x));

Usage examples

Monte Carlo π Compute approximate value of π: area of
circle area of square = πr2 (2r)2 = π 4 , π = 4 area of circle area of square ≈ 4 count(points in circle) count(all points) 1 vex::Random<cl_double2> rnd; // Generates 2D points in [0, 1] × [0, 1] 2 vex::Reductor<size_t, vex::SUM> sum(ctx); 3 4 double pi = 4.0 * sum( length( rnd(vex::element_index(0, n), seed) ) < 1 ) / n;

Monte Carlo π: the generated kernel #if defined(cl_khr_fp64) #pragma OPENCL
EXTENSION cl_khr_fp64: enable #elif defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64: enable #endif void philox_uint_4_10 ( uint * ctr, uint * key ) { uint m[4]; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; } double2 random_double2_philox ( ulong prm1, ulong prm2 ) { union { uint ctr[4]; ulong res_i[2]; double res_f[2]; double2 res; } ctr; uint key[2]; ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; ctr.ctr[2] = prm1; ctr.ctr[3] = prm2; key[0] = 0x12345678; key[1] = 0x12345678; philox_uint_4_10(ctr.ctr, key); ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0; ctr.res_f[1] = ctr.res_i[1] / 18446744073709551615.0; return ctr.res; } ulong SUM_ulong ( ulong prm1, ulong prm2 ) { return prm1 + prm2; } kernel void vexcl_reductor_kernel ( ulong n, ulong prm_1, ulong prm_2, double prm_3, global ulong * g_odata, local ulong * smem ) { ulong mySum = (ulong)0; for(ulong idx = get_global_id(0); idx < n; idx += get_global_size(0)) { mySum = SUM_ulong(mySum, ( length( random_double2_philox( (prm_1 + idx), prm_2 ) ) < prm_3 )); } local ulong * sdata = smem; size_t tid = get_local_id(0); size_t block_size = get_local_size(0); sdata[tid] = mySum; barrier(CLK_LOCAL_MEM_FENCE); if (block_size >= 1024) { if (tid < 512) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 512]); } barrier(CLK_LOCAL_MEM_FENCE); } if (block_size >= 512) { if (tid < 256) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 256]); } barrier(CLK_LOCAL_MEM_FENCE); } if (block_size >= 256) { if (tid < 128) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 128]); } barrier(CLK_LOCAL_MEM_FENCE); } if (block_size >= 128) { if (tid < 64) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 64]); } barrier(CLK_LOCAL_MEM_FENCE); } if (tid < 32) { volatile local ulong * smem = sdata; if (block_size >= 64) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 32]); } if (block_size >= 32) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 16]); } if (block_size >= 16) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 8]); } if (block_size >= 8) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 4]); } if (block_size >= 4) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 2]); } if (block_size >= 2) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 1]); } } if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; }

Solving ODEs on GPUs Lorenz attractor system ˙ x =
−σ (x − y) , ˙ y = Rx − y − xz, ˙ z = −bz + xy. 20 0 20 20 0 20 40 20 40 60 80 Lorenz attractor trajectory Parameter study for the Lorenz attractor system: Solve large number of Lorenz systems, each for a diﬀerent value of R. Let’s use Boost.odeint together with VexCL for that. [1] K. Ahnert, D. Demidov, and M. Mulansky. Solving ordinary diﬀerential equations on GPUs. In Numerical Computations with GPUs (pp. 125-157). Springer, 2014. doi:10.1007/978-3-319-06548-9 7

Using Boost.odeint Generic ODE: dx dt = ˙ x =
f (x, t), x(0) = x0. Using Boost.odeint: Deﬁne state type (what is x?) Choose integration method Provide system function (deﬁne f (x, t)) Integrate over time

Implementation #include <iostream> #include <vector> #include <vexcl/vexcl.hpp> #include <boost/numeric/odeint.hpp> #include
<boost/numeric/odeint/external/vexcl/vexcl.hpp> namespace odeint = boost::numeric::odeint; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector<double> &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector<double> R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; }

<boost/numeric/odeint/external/vexcl/vexcl.hpp> namespace odeint = boost::numeric::odeint; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector<double> &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector<double> R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; } 1. State type and stepper type 9 typedef vex::multivector<double, 3> state_type; 10 11 typedef odeint::runge_kutta4_classic< 12 state_type, double, state_type, double, 13 odeint::vector_space_algebra, odeint::default_operations 14 > Stepper;

<boost/numeric/odeint/external/vexcl/vexcl.hpp> namespace odeint = boost::numeric::odeint; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector<double> &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector<double> R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; } 2. System function 10 struct lorenz_system { 11 const vex::vector<double> &R; 12 double sigma, b; 13 14 void operator()(const state_type &x, state_type &dxdt, double t) const { 15 dxdt = std::make_tuple( 16 sigma * (x(1) - x(0)), 17 R * x(0) - x(1) - x(0) * x(2), 18 x(0) * x(1) - b * x(2)); 19 } 20 };

<boost/numeric/odeint/external/vexcl/vexcl.hpp> namespace odeint = boost::numeric::odeint; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector<double> &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector<double> R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; } 3. Integrate 27 int main(int argc, char *argv[]) { 28 const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; 29 const double tmax = 1.0, dt = 0.01; 30 const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); 31 32 vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); 33 34 state_type X(ctx, n); 35 vex::vector<double> R(ctx, n); 36 37 X = 10.0; 38 R = Rmin + dR * vex::element_index(); 39 40 lorenz_system sys{R, 10.0, 8.0/3}; 41 odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); 42 43 std::cout << "x = " << X << std::endl; 44 }

Performance NVIDIA Tesla K40c, Intel Core i7 920 103 104
105 106 N 10 1 100 101 102 103 104 T(sec) OpenMP VexCL 103 104 105 106 N 10 1 100 101 T(OpenMP) / T

Summary VexCL allows to write compact and readable code. It’s
great for fast prototyping of scientiﬁc GPGPU applications. The performance is often on-par with manually written kernels. Using custom kernels or third-party functionality is still possible.

Summary VexCL allows to write compact and readable code. It’s
great for fast prototyping of scientiﬁc GPGPU applications. The performance is often on-par with manually written kernels. Using custom kernels or third-party functionality is still possible. It stresses the compiler greatly. Compilation takes a lot of RAM and a lot of time. But, it’s easy to combine C++ with Python: http://pybind11.readthedocs.io https://xkcd.com/303/

Creating python bindings for C++ with pybind11 #include <iostream> #include
<vector> #include <vexcl/vexcl.hpp> #include <boost/numeric/odeint.hpp> #include <boost/numeric/odeint/external/vexcl/vexcl.hpp> #include <pybind11/pybind11.h> #include <pybind11/numpy.h> namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector<double> R; state_type X; lorenz_system(double sigma, double b, py::array_t<double> R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t<double> advance(py::array_t<double> x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t<double> x_out(std::array<size_t,2>{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_<lorenz_system>(m, "Stepper") .def(py::init<double, double, py::array_t<double>>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); }

<vector> #include <vexcl/vexcl.hpp> #include <boost/numeric/odeint.hpp> #include <boost/numeric/odeint/external/vexcl/vexcl.hpp> #include <pybind11/pybind11.h> #include <pybind11/numpy.h> namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector<double> R; state_type X; lorenz_system(double sigma, double b, py::array_t<double> R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t<double> advance(py::array_t<double> x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t<double> x_out(std::array<size_t,2>{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_<lorenz_system>(m, "Stepper") .def(py::init<double, double, py::array_t<double>>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 1. State type and stepper type 12 typedef vex::multivector<double, 3> state_type; 13 14 typedef odeint::runge_kutta4_classic< 15 state_type, double, state_type, double, 16 odeint::vector_space_algebra, odeint::default_operations 17 > Stepper;

<vector> #include <vexcl/vexcl.hpp> #include <boost/numeric/odeint.hpp> #include <boost/numeric/odeint/external/vexcl/vexcl.hpp> #include <pybind11/pybind11.h> #include <pybind11/numpy.h> namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector<double> R; state_type X; lorenz_system(double sigma, double b, py::array_t<double> R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t<double> advance(py::array_t<double> x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t<double> x_out(std::array<size_t,2>{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_<lorenz_system>(m, "Stepper") .def(py::init<double, double, py::array_t<double>>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 2. Context initialization 19 vex::Context& ctx(std::string name = "") { 20 static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); 21 return c; 22 }

<vector> #include <vexcl/vexcl.hpp> #include <boost/numeric/odeint.hpp> #include <boost/numeric/odeint/external/vexcl/vexcl.hpp> #include <pybind11/pybind11.h> #include <pybind11/numpy.h> namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector<double> R; state_type X; lorenz_system(double sigma, double b, py::array_t<double> R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t<double> advance(py::array_t<double> x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t<double> x_out(std::array<size_t,2>{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_<lorenz_system>(m, "Stepper") .def(py::init<double, double, py::array_t<double>>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 3. System function 24 struct lorenz_system { 25 int n; 26 double sigma, b; 27 vex::vector<double> R; 28 state_type X; 29 30 lorenz_system(double sigma, double b, py::array_t<double> R) 31 : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} 32 33 void operator()(const state_type &x, state_type &dxdt, double t) const { 34 dxdt = std::make_tuple( 35 sigma * (x(1) - x(0)), 36 R * x(0) - x(1) - x(0) * x(2), 37 x(0) * x(1) - b * x(2)); 38 } 39 40 py::array_t<double> advance(py::array_t<double> x_in, int steps, double dt) { 41 for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) 42 vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); 43 44 odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); 45 46 py::array_t<double> x_out(std::array<size_t,2>{x_in.shape(0), x_in.shape(1)}); 47 for(int i=0, b=0; i<3; ++i, b+=n) 48 vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); 49 50 return x_out; 51 } 52 };

<vector> #include <vexcl/vexcl.hpp> #include <boost/numeric/odeint.hpp> #include <boost/numeric/odeint/external/vexcl/vexcl.hpp> #include <pybind11/pybind11.h> #include <pybind11/numpy.h> namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector<double, 3> state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector<double> R; state_type X; lorenz_system(double sigma, double b, py::array_t<double> R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t<double> advance(py::array_t<double> x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t<double> x_out(std::array<size_t,2>{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_<lorenz_system>(m, "Stepper") .def(py::init<double, double, py::array_t<double>>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 4. Python bindings 54 PYBIND11_PLUGIN(pylorenz) { 55 py::module m("pylorenz"); 56 57 m.def("context", [](std::string name) { 58 std::ostringstream s; s << ctx(name); py::print(s.str()); 59 }, py::arg("name") = std::string("")); 60 61 py::class_<lorenz_system>(m, "Stepper") 62 .def(py::init<double, double, py::array_t<double>>()) 63 .def("advance", &lorenz_system::advance) 64 ; 65 66 return m.ptr(); 67 }

Calling C++ code from Python 1 import pylorenz as lorenz
2 3 n = 32 # Number of parameters 4 m = 500 # Length of trajectory 5 6 # Parameters 7 R = linspace(0.1, 10, n) 8 sigma = 10.0 9 b = 8/3 10 11 # Initial solution 12 S = empty((m, 3, n)) 13 S[0,:,:] = 10.0 14 15 # Integration 16 ode = lorenz.Stepper(sigma, b, R) 17 for i in range(1,m): 18 S[i] = ode.advance(S[i-1], 1, 0.01)

Projects using VexCL AMGCL — an efficient, flexible, and extensible
Algebraic Multigrid implementation: https://github.com/ddemidov/amgcl Boost.odeint — numerical solution of Ordinary Differential Equations: https://github.com/boostorg/odeint Antioch — A New Templated Implementation Of Chemistry for Hydrodynamics (The University of Texas at Austin): https://github.com/libantioch/antioch FDBB — Fluid Dynamics Building Blocks ( ): https://gitlab.com/mmoelle1/FDBB

Source code: https://github.com/ddemidov/vexcl Documentation: http://vexcl.readthedocs.io Slides: https://speakerdeck.com/ddemidov Example codes: https://github.com/ddemidov/vexcl-talks

vexcl-2017-delft

vexcl-2017-delft

More Decks by Denis Demidov

Other Decks in Programming

Featured

Transcript