Slide 1

Slide 1 text

VexCL Rapid development of scientific code for GPUs Denis Demidov Institute of System Analysis / Russian Academy of Sciences, Kazan Federal University , June 2017

Slide 2

Slide 2 text

VexCL — a vector expression template library for OpenCL/CUDA Fork m e on G itH ub Created for ease of C++ based GPGPU development: Convenient notation for vector expressions OpenCL/CUDA JIT code generation Easily combined with existing libraries/code Header-only Supported backends: OpenCL (Khronos C++ bindings, Boost.Compute) NVIDIA CUDA OpenMP Maxeler FPGAs (in progress) The source code is available under the MIT license: https://github.com/ddemidov/vexcl

Slide 3

Slide 3 text

Hello OpenCL Compute sum of two vectors in parallel: A and B are large vectors. Compute A = A + B. Overview of (any) OpenCL solution: 1 Initialize OpenCL context 2 Allocate memory 3 Transfer input data 4 Run computations 5 Get the results

Slide 4

Slide 4 text

Hello OpenCL #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #include int main() { std::vector platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; }

Slide 5

Slide 5 text

Hello OpenCL #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #include int main() { std::vector platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 1. Query platforms 10 std::vector platform; 11 cl::Platform::get(&platform); 12 13 if (platform.empty()) 14 throw std::runtime_error("No OpenCL platforms");

Slide 6

Slide 6 text

Hello OpenCL #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #include int main() { std::vector platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 2. Get the first available GPU 16 cl::Context context; 17 std::vector device; 18 for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { 19 std::vector dev; 20 p->getDevices(CL_DEVICE_TYPE_GPU, &dev); 21 for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { 22 if (!d->getInfo()) continue; 23 device.push_back(*d); 24 try { 25 context = cl::Context(device); 26 } catch(...) { 27 device.clear(); 28 } 29 } 30 } 31 if (device.empty()) throw std::runtime_error("No GPUs"); 32 33 cl::CommandQueue queue(context, device[0]);

Slide 7

Slide 7 text

Hello OpenCL #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #include int main() { std::vector platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 3. Prepare and copy input data 35 const size_t n = 1024 * 1024; 36 std::vector a(n, 1.5), b(n, 2.7); 37 38 cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 39 a.size() * sizeof(a[0]), a.data()); 40 41 cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 42 b.size() * sizeof(b[0]), b.data());

Slide 8

Slide 8 text

Hello OpenCL #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #include int main() { std::vector platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 4. Write and compile the compute kernel 44 std::string source = R"( 45 kernel void add(ulong n, global const double *a, global double *b) { 46 ulong i = get_global_id(0); 47 if (i < n) b[i] += a[i]; 48 } 49 )"; 50 51 cl::Program program(context, source); 52 53 try { 54 program.build(device); 55 } catch (const cl::Error&) { 56 std::cerr 57 << "OpenCL compilation error" << std::endl 58 << program.getBuildInfo(device[0]) 59 << std::endl; 60 throw std::runtime_error("OpenCL build error"); 61 } 62 cl::Kernel add(program, "add");

Slide 9

Slide 9 text

Hello OpenCL #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #include int main() { std::vector platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 5. Set kernel arguments and launch the computation 64 add.setArg(0, static_cast(n)); 65 add.setArg(1, A); 66 add.setArg(2, B); 67 68 queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange);

Slide 10

Slide 10 text

Hello OpenCL #include #include #include #include #define __CL_ENABLE_EXCEPTIONS #include int main() { std::vector platform; cl::Platform::get(&platform); if (platform.empty()) throw std::runtime_error("No OpenCL platforms"); cl::Context context; std::vector device; for(auto p = platform.begin(); device.empty() && p != platform.end(); p++) { std::vector dev; p->getDevices(CL_DEVICE_TYPE_GPU, &dev); for(auto d = dev.begin(); device.empty() && d != dev.end(); d++) { if (!d->getInfo()) continue; device.push_back(*d); try { context = cl::Context(device); } catch(...) { device.clear(); } } } if (device.empty()) throw std::runtime_error("No GPUs"); cl::CommandQueue queue(context, device[0]); const size_t n = 1024 * 1024; std::vector a(n, 1.5), b(n, 2.7); cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a.size() * sizeof(a[0]), a.data()); cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, b.size() * sizeof(b[0]), b.data()); std::string source = R"( kernel void add(ulong n, global const double *a, global double *b) { ulong i = get_global_id(0); if (i < n) b[i] += a[i]; } )"; cl::Program program(context, source); try { program.build(device); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo(device[0]) << std::endl; throw std::runtime_error("OpenCL build error"); } cl::Kernel add(program, "add"); add.setArg(0, static_cast(n)); add.setArg(1, A); add.setArg(2, B); queue.enqueueNDRangeKernel(add, cl::NullRange, n, cl::NullRange); queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); std::cout << b[42] << std::endl; } 6. Get the results from the compute device 70 queue.enqueueReadBuffer(B, CL_TRUE, 0, b.size() * sizeof(b[0]), b.data()); 71 std::cout << b[42] << std::endl;

Slide 11

Slide 11 text

Hello VexCL hello.cpp 1 #include 2 #include 3 #include 4 5 int main() { 6 vex::Context ctx( vex::Filter::GPU ); 7 if (!ctx) throw std::runtime_error("No GPUs"); 8 9 size_t n = 1024 * 1024; 10 std::vector a(n, 1.5), b(n, 2.7); 11 12 vex::vector A(ctx, a); 13 vex::vector B(ctx, b); 14 15 B += A; 16 17 vex::copy(B, b); 18 std::cout << b[42] << std::endl; 19 } CMakeLists.txt 1 cmake_minimum_required(VERSION 3.1) 2 project(hello) 3 4 # When VexCL is installed system wide: 5 find_package(VexCL) 6 7 # When VexCL is copied into a subfolder: 8 # add_subdirectory(vexcl) 9 10 add_executable(hello hello.cpp) 11 target_link_libraries(hello VexCL::OpenCL)

Slide 12

Slide 12 text

Interface

Slide 13

Slide 13 text

Initialization Multi-device and multi-platform computations are supported. VexCL context is initialized from combination of device filters. Initialize VexCL context on selected devices 1 vex::Context ctx( vex::Filter::Any );

Slide 14

Slide 14 text

Initialization Multi-device and multi-platform computations are supported. VexCL context is initialized from combination of device filters. Initialize VexCL context on selected devices 1 vex::Context ctx( vex::Filter::GPU );

Slide 15

Slide 15 text

Initialization Multi-device and multi-platform computations are supported. VexCL context is initialized from combination of device filters. Initialize VexCL context on selected devices 1 vex::Context ctx(vex::Filter::Name("Intel"));

Slide 16

Slide 16 text

Initialization Multi-device and multi-platform computations are supported. VexCL context is initialized from combination of device filters. Initialize VexCL context on selected devices 1 vex::Context ctx(vex::Filter::GPU && vex::Filter::Platform("AMD"));

Slide 17

Slide 17 text

Memory and work splitting 1 vex::Context ctx( vex::Filter::Count(1) ); 2 3 vex::vector x(ctx, N); 4 vex::vector y(ctx, N); 5 6 x = vex::element_index() * (1.0 / N); 7 y = sin(2 * x) + sqrt(1 - x * x); x y

Slide 18

Slide 18 text

Memory and work splitting 1 vex::Context ctx( vex::Filter::GPU ); 2 3 vex::vector x(ctx, N); 4 vex::vector y(ctx, N); 5 6 x = vex::element_index() * (1.0 / N); 7 y = sin(2 * x) + sqrt(1 - x * x); x y

Slide 19

Slide 19 text

Memory and work splitting 1 vex::Context ctx( vex::Filter::DoublePrecision ); 2 3 vex::vector x(ctx, N); 4 vex::vector y(ctx, N); 5 6 x = vex::element_index() * (1.0 / N); 7 y = sin(2 * x) + sqrt(1 - x * x); x y

Slide 20

Slide 20 text

Copies between host and device memory 1 vex::vector d(ctx, n); 2 std::vector h(n); 3 double a[100]; STL-like range copies 1 vex::copy(d.begin(), d.end(), h.begin()); 2 vex::copy(d.begin(), d.begin() + 100, a); Simple copies 1 vex::copy(d, h); 2 vex::copy(h, d); Map OpenCL buffer to host pointer 1 auto p = d.map(devnum); 2 std::sort(&p[0], &p[d.part_size(devnum)]); Access single element (slow) 1 double v = d[42]; 2 d[0] = 0;

Slide 21

Slide 21 text

What vector expressions are supported? All vectors in an expression have to be compatible: Have same size Located on same devices What may be used: Vectors, scalars, constants Arithmetic, logical operators Built-in functions User-defined functions Random number generators Slicing and permutations Reduce to a scalar (sum, min, max) Reduce across chosen dimensions Stencil operations Sparse matrix – vector products Fast Fourier Transform Sort, scan, reduce by key

Slide 22

Slide 22 text

Builtin operations and functions This expression: 1 x = 2 * y - sin(z); export VEXCL_SHOW_KERNELS=1 to see the generated code. . . . results in this kernel: 1 kernel void vexcl_vector_kernel( 2 ulong n, 3 global double * prm_1, 4 int prm_2, 5 global double * prm_3, 6 global double * prm_4 7 ) 8 { 9 for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) { 10 prm_1[idx] = ( ( prm_2 * prm_3[idx] ) - sin( prm_4[idx] ) ); 11 } 12 } − ∗ sin 2 y z

Slide 23

Slide 23 text

User-defined functions Defining a function: 1 VEX_FUNCTION( double, sqr, (double, x)(double, y), 2 return x * x + y * y; 3 ); Using the function: 1 Z = sqrt( sqr(X, Y) );

Slide 24

Slide 24 text

User functions are translated to OpenCL functions 1 Z = sqrt( sqr(X, Y) ); . . . gets translated to: 1 double sqr(double x, double y) { 2 return x * x + y * y; 3 } 4 5 kernel void vexcl_vector_kernel( 6 ulong n, 7 global double * prm_1, 8 global double * prm_2, 9 global double * prm_3 10 ) 11 { 12 for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) { 13 prm_1[idx] = sqrt( sqr( prm_2[idx], prm_3[idx] ) ); 14 } 15 } sqrt sqr x y

Slide 25

Slide 25 text

Random number generation VexCL provides1 counter-based random number generators from Random1232 suite. The generators are stateless; mixing functions are applied to element indices. Implemented families: threefry and philox. Both pass TestU01/BigCrush; up to 264 independent streams with a period of 2128. Performance: ≈ 1010 Samples/sec (Tesla K20c). vex::Random — uniform distribution. vex::RandomNormal — normal distribution. 1 vex::Random rnd; 2 vex::vector x(ctx, n); 3 4 x = rnd(vex::element_index(), std::rand()); 1Contributed by Pascal Germroth [email protected] 2D. E. Shaw Research, http://www.deshawresearch.com/resources random123.html

Slide 26

Slide 26 text

Reductions Class vex::Reductor allows to reduce arbitrary vector expression to a single value of type T. Supported reduction kinds: SUM, SUM_Kahan, MIN, MAX Inner product 1 vex::Reductor sum(ctx); 2 double s = sum(x * y); Number of elements in x between 0 and 1 1 vex::Reductor sum(ctx); 2 size_t n = sum( (x > 0) && (x < 1) ); Maximum distance from origin 1 vex::Reductor max(ctx); 2 double d = max( sqrt(x * x + y * y) );

Slide 27

Slide 27 text

Sparse matrix – vector products Class vex::sparse::matrix holds representation of a sparse matrix on compute devices. Constructor accepts matrix in common CRS format: row indices, columns and values of nonzero entries. Construct matrix 1 vex::sparse::matrix A(ctx, n, n, ptr, col, val); Compute residual value 2 // vex::vector u, f, r; 3 r = f - A * u; 4 double res = max( fabs(r) );

Slide 28

Slide 28 text

Raw pointer arithmetic raw_pointer(const vector&) function returns pointer to vector’s data inside vector expression. May be used to implement complex access patterns. 1D Laplace operator: 1 auto ptr = vex::raw_pointer(x); 2 auto idx = vex::element_index(); 3 4 auto left = if_else(idx > 0, i - 1, i); 5 auto right = if_else(idx + 1 < n, i + 1, i); 6 7 y = 2 * ptr[idx] - ptr[left] - ptr[right];

Slide 29

Slide 29 text

N-body problem with raw pointers yi = j=i e−|xi −xj | 1 VEX_FUNCTION(double, nbody, (size_t, n)(size_t, i)(double*, x), 2 double sum = 0, myval = x[i]; 3 for(size_t j = 0; j < n; ++j) 4 if (j != i) sum += exp(-fabs(x[j] - myval)); 5 return sum; 6 ); 7 8 y = nbody(x.size(), vex::element_index(), raw_pointer(x));

Slide 30

Slide 30 text

Usage examples

Slide 31

Slide 31 text

Monte Carlo π Compute approximate value of π: area of circle area of square = πr2 (2r)2 = π 4 , π = 4 area of circle area of square ≈ 4 count(points in circle) count(all points)      1 vex::Random rnd; // Generates 2D points in [0, 1] × [0, 1] 2 vex::Reductor sum(ctx); 3 4 double pi = 4.0 * sum( length( rnd(vex::element_index(0, n), seed) ) < 1 ) / n;

Slide 32

Slide 32 text

Monte Carlo π: the generated kernel #if defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64: enable #elif defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64: enable #endif void philox_uint_4_10 ( uint * ctr, uint * key ) { uint m[4]; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; key[0] += 0x9E3779B9; key[1] += 0xBB67AE85; m[0] = mul_hi(0xD2511F53, ctr[0]); m[1] = 0xD2511F53 * ctr[0]; m[2] = mul_hi(0xCD9E8D57, ctr[2]); m[3] = 0xCD9E8D57 * ctr[2]; ctr[0] = m[2] ˆ ctr[1] ˆ key[0]; ctr[1] = m[3]; ctr[2] = m[0] ˆ ctr[3] ˆ key[1]; ctr[3] = m[1]; } double2 random_double2_philox ( ulong prm1, ulong prm2 ) { union { uint ctr[4]; ulong res_i[2]; double res_f[2]; double2 res; } ctr; uint key[2]; ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; ctr.ctr[2] = prm1; ctr.ctr[3] = prm2; key[0] = 0x12345678; key[1] = 0x12345678; philox_uint_4_10(ctr.ctr, key); ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0; ctr.res_f[1] = ctr.res_i[1] / 18446744073709551615.0; return ctr.res; } ulong SUM_ulong ( ulong prm1, ulong prm2 ) { return prm1 + prm2; } kernel void vexcl_reductor_kernel ( ulong n, ulong prm_1, ulong prm_2, double prm_3, global ulong * g_odata, local ulong * smem ) { ulong mySum = (ulong)0; for(ulong idx = get_global_id(0); idx < n; idx += get_global_size(0)) { mySum = SUM_ulong(mySum, ( length( random_double2_philox( (prm_1 + idx), prm_2 ) ) < prm_3 )); } local ulong * sdata = smem; size_t tid = get_local_id(0); size_t block_size = get_local_size(0); sdata[tid] = mySum; barrier(CLK_LOCAL_MEM_FENCE); if (block_size >= 1024) { if (tid < 512) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 512]); } barrier(CLK_LOCAL_MEM_FENCE); } if (block_size >= 512) { if (tid < 256) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 256]); } barrier(CLK_LOCAL_MEM_FENCE); } if (block_size >= 256) { if (tid < 128) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 128]); } barrier(CLK_LOCAL_MEM_FENCE); } if (block_size >= 128) { if (tid < 64) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 64]); } barrier(CLK_LOCAL_MEM_FENCE); } if (tid < 32) { volatile local ulong * smem = sdata; if (block_size >= 64) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 32]); } if (block_size >= 32) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 16]); } if (block_size >= 16) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 8]); } if (block_size >= 8) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 4]); } if (block_size >= 4) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 2]); } if (block_size >= 2) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 1]); } } if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; }

Slide 33

Slide 33 text

Solving ODEs on GPUs Lorenz attractor system ˙ x = −σ (x − y) , ˙ y = Rx − y − xz, ˙ z = −bz + xy. 20 0 20 20 0 20 40 20 40 60 80 Lorenz attractor trajectory Parameter study for the Lorenz attractor system: Solve large number of Lorenz systems, each for a different value of R. Let’s use Boost.odeint together with VexCL for that. [1] K. Ahnert, D. Demidov, and M. Mulansky. Solving ordinary differential equations on GPUs. In Numerical Computations with GPUs (pp. 125-157). Springer, 2014. doi:10.1007/978-3-319-06548-9 7

Slide 34

Slide 34 text

Using Boost.odeint Generic ODE: dx dt = ˙ x = f (x, t), x(0) = x0. Using Boost.odeint: Define state type (what is x?) Choose integration method Provide system function (define f (x, t)) Integrate over time

Slide 35

Slide 35 text

Implementation #include #include #include #include #include namespace odeint = boost::numeric::odeint; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; }

Slide 36

Slide 36 text

Implementation #include #include #include #include #include namespace odeint = boost::numeric::odeint; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; } 1. State type and stepper type 9 typedef vex::multivector state_type; 10 11 typedef odeint::runge_kutta4_classic< 12 state_type, double, state_type, double, 13 odeint::vector_space_algebra, odeint::default_operations 14 > Stepper;

Slide 37

Slide 37 text

Implementation #include #include #include #include #include namespace odeint = boost::numeric::odeint; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; } 2. System function 10 struct lorenz_system { 11 const vex::vector &R; 12 double sigma, b; 13 14 void operator()(const state_type &x, state_type &dxdt, double t) const { 15 dxdt = std::make_tuple( 16 sigma * (x(1) - x(0)), 17 R * x(0) - x(1) - x(0) * x(2), 18 x(0) * x(1) - b * x(2)); 19 } 20 };

Slide 38

Slide 38 text

Implementation #include #include #include #include #include namespace odeint = boost::numeric::odeint; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; struct lorenz_system { const vex::vector &R; double sigma, b; void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } }; int main(int argc, char *argv[]) { const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; const double tmax = 1.0, dt = 0.01; const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); state_type X(ctx, n); vex::vector R(ctx, n); X = 10.0; R = Rmin + dR * vex::element_index(); lorenz_system sys{R, 10.0, 8.0/3}; odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); std::cout << "x = " << X << std::endl; } 3. Integrate 27 int main(int argc, char *argv[]) { 28 const size_t n = (argc > 1) ? std::stoi(argv[1]) : 32; 29 const double tmax = 1.0, dt = 0.01; 30 const double Rmin = 0.1, Rmax = 50.0, dR = (Rmax - Rmin) / (n - 1); 31 32 vex::Context ctx(vex::Filter::Env && vex::Filter::DoublePrecision); 33 34 state_type X(ctx, n); 35 vex::vector R(ctx, n); 36 37 X = 10.0; 38 R = Rmin + dR * vex::element_index(); 39 40 lorenz_system sys{R, 10.0, 8.0/3}; 41 odeint::integrate_const(Stepper(), sys, X, 0.0, tmax, dt); 42 43 std::cout << "x = " << X << std::endl; 44 }

Slide 39

Slide 39 text

Performance NVIDIA Tesla K40c, Intel Core i7 920 103 104 105 106 N 10 1 100 101 102 103 104 T(sec) OpenMP VexCL 103 104 105 106 N 10 1 100 101 T(OpenMP) / T

Slide 40

Slide 40 text

Summary VexCL allows to write compact and readable code. It’s great for fast prototyping of scientific GPGPU applications. The performance is often on-par with manually written kernels. Using custom kernels or third-party functionality is still possible.

Slide 41

Slide 41 text

Summary VexCL allows to write compact and readable code. It’s great for fast prototyping of scientific GPGPU applications. The performance is often on-par with manually written kernels. Using custom kernels or third-party functionality is still possible. It stresses the compiler greatly. Compilation takes a lot of RAM and a lot of time. But, it’s easy to combine C++ with Python: http://pybind11.readthedocs.io https://xkcd.com/303/

Slide 42

Slide 42 text

Creating python bindings for C++ with pybind11 #include #include #include #include #include #include #include namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector R; state_type X; lorenz_system(double sigma, double b, py::array_t R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t advance(py::array_t x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t x_out(std::array{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_(m, "Stepper") .def(py::init>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); }

Slide 43

Slide 43 text

Creating python bindings for C++ with pybind11 #include #include #include #include #include #include #include namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector R; state_type X; lorenz_system(double sigma, double b, py::array_t R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t advance(py::array_t x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t x_out(std::array{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_(m, "Stepper") .def(py::init>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 1. State type and stepper type 12 typedef vex::multivector state_type; 13 14 typedef odeint::runge_kutta4_classic< 15 state_type, double, state_type, double, 16 odeint::vector_space_algebra, odeint::default_operations 17 > Stepper;

Slide 44

Slide 44 text

Creating python bindings for C++ with pybind11 #include #include #include #include #include #include #include namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector R; state_type X; lorenz_system(double sigma, double b, py::array_t R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t advance(py::array_t x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t x_out(std::array{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_(m, "Stepper") .def(py::init>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 2. Context initialization 19 vex::Context& ctx(std::string name = "") { 20 static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); 21 return c; 22 }

Slide 45

Slide 45 text

Creating python bindings for C++ with pybind11 #include #include #include #include #include #include #include namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector R; state_type X; lorenz_system(double sigma, double b, py::array_t R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t advance(py::array_t x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t x_out(std::array{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_(m, "Stepper") .def(py::init>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 3. System function 24 struct lorenz_system { 25 int n; 26 double sigma, b; 27 vex::vector R; 28 state_type X; 29 30 lorenz_system(double sigma, double b, py::array_t R) 31 : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} 32 33 void operator()(const state_type &x, state_type &dxdt, double t) const { 34 dxdt = std::make_tuple( 35 sigma * (x(1) - x(0)), 36 R * x(0) - x(1) - x(0) * x(2), 37 x(0) * x(1) - b * x(2)); 38 } 39 40 py::array_t advance(py::array_t x_in, int steps, double dt) { 41 for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) 42 vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); 43 44 odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); 45 46 py::array_t x_out(std::array{x_in.shape(0), x_in.shape(1)}); 47 for(int i=0, b=0; i<3; ++i, b+=n) 48 vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); 49 50 return x_out; 51 } 52 };

Slide 46

Slide 46 text

Creating python bindings for C++ with pybind11 #include #include #include #include #include #include #include namespace odeint = boost::numeric::odeint; namespace py = pybind11; typedef vex::multivector state_type; typedef odeint::runge_kutta4_classic< state_type, double, state_type, double, odeint::vector_space_algebra, odeint::default_operations > Stepper; vex::Context& ctx(std::string name = "") { static vex::Context c(vex::Filter::Env && vex::Filter::Name(name)); return c; } struct lorenz_system { int n; double sigma, b; vex::vector R; state_type X; lorenz_system(double sigma, double b, py::array_t R) : n(R.size()), sigma(sigma), b(b), R(ctx(), n, R.data()), X(ctx(), n) {} void operator()(const state_type &x, state_type &dxdt, double t) const { dxdt = std::make_tuple( sigma * (x(1) - x(0)), R * x(0) - x(1) - x(0) * x(2), x(0) * x(1) - b * x(2)); } py::array_t advance(py::array_t x_in, int steps, double dt) { for(int i=0, b=0, e=n; i<3; ++i, b+=n, e+=n) vex::copy(x_in.data()+b, x_in.data()+e, X(i).begin()); odeint::integrate_n_steps(Stepper(), std::ref(*this), X, 0.0, dt, steps); py::array_t x_out(std::array{x_in.shape(0), x_in.shape(1)}); for(int i=0, b=0; i<3; ++i, b+=n) vex::copy(X(i).begin(), X(i).end(), x_out.mutable_data()+b); return x_out; } }; PYBIND11_PLUGIN(pylorenz) { py::module m("pylorenz"); m.def("context", [](std::string name) { std::ostringstream s; s << ctx(name); py::print(s.str()); }, py::arg("name") = std::string("")); py::class_(m, "Stepper") .def(py::init>()) .def("advance", &lorenz_system::advance) ; return m.ptr(); } 4. Python bindings 54 PYBIND11_PLUGIN(pylorenz) { 55 py::module m("pylorenz"); 56 57 m.def("context", [](std::string name) { 58 std::ostringstream s; s << ctx(name); py::print(s.str()); 59 }, py::arg("name") = std::string("")); 60 61 py::class_(m, "Stepper") 62 .def(py::init>()) 63 .def("advance", &lorenz_system::advance) 64 ; 65 66 return m.ptr(); 67 }

Slide 47

Slide 47 text

Calling C++ code from Python 1 import pylorenz as lorenz 2 3 n = 32 # Number of parameters 4 m = 500 # Length of trajectory 5 6 # Parameters 7 R = linspace(0.1, 10, n) 8 sigma = 10.0 9 b = 8/3 10 11 # Initial solution 12 S = empty((m, 3, n)) 13 S[0,:,:] = 10.0 14 15 # Integration 16 ode = lorenz.Stepper(sigma, b, R) 17 for i in range(1,m): 18 S[i] = ode.advance(S[i-1], 1, 0.01)

Slide 48

Slide 48 text

Projects using VexCL AMGCL — an efficient, flexible, and extensible Algebraic Multigrid implementation: https://github.com/ddemidov/amgcl Boost.odeint — numerical solution of Ordinary Differential Equations: https://github.com/boostorg/odeint Antioch — A New Templated Implementation Of Chemistry for Hydrodynamics (The University of Texas at Austin): https://github.com/libantioch/antioch FDBB — Fluid Dynamics Building Blocks ( ): https://gitlab.com/mmoelle1/FDBB

Slide 49

Slide 49 text

Source code: https://github.com/ddemidov/vexcl Documentation: http://vexcl.readthedocs.io Slides: https://speakerdeck.com/ddemidov Example codes: https://github.com/ddemidov/vexcl-talks