Machine Learning & Compilers

Optimisation heuristics are too coarse. (a.k.a. it doesn’t take much
to do better than -O3)

Better than -O3 #!/bin/sh while true; do sort --random-sort <
"cflags.txt" | head -n 20 | xargs gcc -O1 app.c time ./a.out done after 200 attempts, ~5% improvement

*actual homework

200 attempts sounds like a lot… (200 x 10 x
30 s ≈ 16 hrs)

Drop in the ocean: 250 GCC flags ≥ $2^{250}$ options
≈ $10^{75}$. Atoms in the universe ≈ $10^{80}$.

Exhaustive search is not really* practical. *in this universe

Machine Learning Estimate y = f(x)

Machine Learning Estimate y = f(x) Optimisations Features Cflags #
instructions Workgroup size Arithmetic density CPU or GPU Dataset size

Use a GPU Machine Learning Estimate y = f(x) Use
a CPU The idea.

Use a GPU Machine Learning Estimate y = f(x) Use
a CPU The reality. (same data, wrong conclusions)

Synthesizing Benchmarks for Predictive Modeling

Why?  There aren’t enough benchmarks

Why?  More benchmarks = better models

Why?  No adequate solution

How? Teach an AI to program from GitHub

Implementation

CLgen CLdrive Language Corpus GitHub Software Repositories clsmith clsmith Content
Files Rejection Filter Search engine Source Normalizer Training parameters Rejection Filter LSTM network Synthesizer Synthesis parameters Argument Extractor Benchmark parameters clsmith clsmith Synthesized Benchmarks Benchmark Driver clsmith clsmith Synthesized Payloads clsmith clsmith Performance Results Dynamic Checker

We teach an AI to code by showing it lots
of code. Huge repository of public knowledge: And they have an API :-) $ curl https://api.github.com/search/repositories\? q\=opencl\&sort\=stars\&order\=desc { "total_count": 3155, "incomplete_results": false, "items": [ { "id": 7296244, "name": "lwjgl3", "full_name": "LWJGL/lwjgl3",

OpenCL is not a first-class language. Search repositories using loose
keyword terms. e.g. opencl, nvidia, gpu, cl, amd. Recursively iterate over git trees to get .cl files. /src/guassian.cl #include <common.h> /include/common.h #include “detail/math.cl” Foo:MyOpenCLRepo (0.6% miss rate)

/* Copyright (C) 2004 Joe Bloggs <[email protected]> */ // //
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE // TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION // // 0. You just DO WHAT THE FUCK YOU WANT TO. #define CLAMPING #define THRESHOLD_MIN 1.0f #define THRESHOLD_MAX 1.0f float myclamp(float in) { #ifdef CLAMPING return in > THRESHOLD_MAX ? THRESHOLD_MAX : in < THRESHOLD_MIN ? THRESHOLD_MIN : in; #else return in; #endif // CLAMPING } // Do something really flipping cool __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { // // int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } x 8078 files 2.8 million lines

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE // TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION // // 0. You just DO WHAT THE FUCK YOU WANT TO. #define CLAMPING #define THRESHOLD_MIN 1.0f #define THRESHOLD_MAX 1.0f float myclamp(float in) { #ifdef CLAMPING return in > THRESHOLD_MAX ? THRESHOLD_MAX : in < THRESHOLD_MIN ? THRESHOLD_MIN : in; #else return in; #endif // CLAMPING } // Do something really flipping cool __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { // // int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } }

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE // TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION // // 0. You just DO WHAT THE FUCK YOU WANT TO. #define CLAMPING #define THRESHOLD_MIN 1.0f #define THRESHOLD_MAX 1.0f float myclamp(float in) { #ifdef CLAMPING return in > THRESHOLD_MAX ? THRESHOLD_MAX : in < THRESHOLD_MIN ? THRESHOLD_MIN : in; #else return in; #endif // CLAMPING } // Do something really flipping cool __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { // // int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Is this real, valid OpenCL? Can we minimise non-functional variance?

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE // TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION // // 0. You just DO WHAT THE FUCK YOU WANT TO. #define CLAMPING #define THRESHOLD_MIN 1.0f #define THRESHOLD_MAX 1.0f float myclamp(float in) { #ifdef CLAMPING return in > THRESHOLD_MAX ? THRESHOLD_MAX : in < THRESHOLD_MIN ? THRESHOLD_MIN : in; #else return in; #endif // CLAMPING } // Do something really flipping cool __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { // // int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Is this real, valid OpenCL? Can we minimise non-functional variance? Strip comments

#define CLAMPING #define THRESHOLD_MIN 1.0f #define THRESHOLD_MAX 1.0f float myclamp(float
in) { #ifdef CLAMPING return in > THRESHOLD_MAX ? THRESHOLD_MAX : in < THRESHOLD_MIN ? THRESHOLD_MIN : in; #else return in; #endif } __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Strip comments

#define CLAMPING #define THRESHOLD_MIN 1.0f #define THRESHOLD_MAX 1.0f float myclamp(float
in) { #ifdef CLAMPING return in > THRESHOLD_MAX ? THRESHOLD_MAX : in < THRESHOLD_MIN ? THRESHOLD_MIN : in; #else return in; #endif } __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Strip comments Preprocess

Strip comments float myclamp(float in) { return in > 1.0f
? 1.0f : in < 0.0f ? 0.0f : in; } __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Preprocess

? 1.0f : in < 0.0f ? 0.0f : in; } __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Preprocess Does it compile? Does it contain instructions?

? 1.0f : in < 0.0f ? 0.0f : in; } __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Preprocess Does it compile? Does it contain instructions? Does it compile? Does it contain instructions?

? 1.0f : in < 0.0f ? 0.0f : in; } __kernel void findAllNodesMergedAabb(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = myclamp(in[id]); } } Preprocess Does it compile? Does it contain instructions? Rewrite function names

float A(float in) { return in > 1.0f ? 1.0f
: in < 0.0f ? 0.0f : in; } __kernel void B(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = A(in[id]); } } Rewrite function names Strip comments Preprocess Does it compile? Does it contain instructions?

float A(float in) { return in > 1.0f ? 1.0f
: in < 0.0f ? 0.0f : in; } __kernel void B(__global float* in, __global float* out, int num_elems) { int id = get_global_id(0); if (id < num_elems) { out[id] = A(in[id]); } } Rewrite function names Strip comments Preprocess Does it compile? Does it contain instructions? Rewrite variable names

float A(float a) { return a > 1.0f ? 1.0f
: a < 0.0f ? 0.0f : a; } __kernel void B(__global float* a, __global float* b, int c) { int d = get_global_id(0); if (d < c) { b[d] = A(a[d]); } } Rewrite variable names Rewrite function names Strip comments Preprocess Does it compile? Does it contain instructions?

: a < 0.0f ? 0.0f : a; } __kernel void B(__global float* a, __global float* b, int c) { int d = get_global_id(0); if (d < c) { b[d] = A(a[d]); } } Rewrite variable names Rewrite function names Strip comments Preprocess Does it compile? Does it contain instructions? Enforce code style

: a < 0.0f ? 0.0f : a; } __kernel void B(__global float* a, __global float* b, int c) { int d = get_global_id(0); if (d < c) { b[d] = A(a[d]); } } Rewrite variable names Rewrite function names Strip comments Preprocess Enforce code style Does it compile? Does it contain instructions?

Forward Pass Backward Pass Update Weights Network Design 2048 nodes,
3 layers Stochastic Gradient Descent Initial learning rate 0.005 Train for 50 epochs Learning rate decay every 5 epochs

Estimate y = f(x) Distribution of characters 1278595 lines of
OpenCL Machine Learning

S = ‘__kernel void A(__global float* a) {’ depth =
1 while depth > 0: c = predict_next_character(S) if c == ‘{’: depth += 1 if c == ‘}’: depth -= 1 S += c return S Kernel Synthesis

Demo (you had to be there)

__kernel void A(__global float* a, __global float* b, __global float*
c, const int d) { int e = get_global_id(0); float f = 0.0; for (int g = 0; g < d; g++) { c[g] = 0.0f; } barrier(1); a[get_global_id(0)] = 2*b[get_global_id(0)]; }

c, const int d) { int e = get_global_id(0); if (e >= d) { return; } c[e] = a[e] + b[e] + 2 * a[e] + b[e] + 4; }

c, const int d) { unsigned int e = get_global_id(0); float16 f = (float16)(0.0); for (unsigned int g = 0; g < d; g++) { float16 h = a[g]; f.s0 += h.s0; f.s1 += h.s1; /* snip ... */ f.sE += h.sE; f.sF += h.sF; } b[e] = f.s0 + f.s1 + f.s2 + f.s3 + f.s4 + f.s5 + f.s6 + f.s7 + f.s8 + f.s9 + f.sA + f.sB + f.sC + f.sD + f.sE + f.sF; }

c, const int d) { int e = get_global_id(0); if (e >= d) { return; } c[e] = a[e] + b[e] + 2 * a[e] + b[e] + 4; }

_A = random_payload(_A) # generate inputs _B = random_payload(_B) _C
= copy(_A) _D = copy(_B) A_ = k(_A) # compute outputs B_ = k(_B) C_ = k(_C) D_ = k(_D) # differential test assert (A_ != _A || B_ != _B) else NO_OUTPUTS assert (A_ != B_ || C_ != D_) else INPUT_INSENSITIVE assert (A_ == C_ && B_ == D_) else NON_DETERMINISTIC

Does it work? (yes)

7 programs, 1,000 synthetic benchmarks. 1.27x faster

71 programs, 1,000 synthetic benchmarks. 2.66x faster

Good Things Bad Things Basically* language agnostic. 35 million repos
on GitHub. We’re using 0.00004%. Generates 2000 OpenCL benchmarks per machine per day. No support for things declared outside of kernel scope. Undirected almost to a fault. AMD rage.

Thanks! More benchmarks = Better models No general way of
creating benchmarks I taught a Neural Network to program from GitHub Improves state of the art by 3.38x

Machine Learning & Compilers

Machine Learning & Compilers

More Decks by Chris Cummins

Other Decks in Science

Featured

Transcript