Upgrade to Pro — share decks privately, control downloads, hide ads and more …

waifu2xをMetalで書いてみた

 waifu2xをMetalで書いてみた

Mac OS X 10.11でのMetalの使いかたと、waifu2xのアルゴリズムについて簡単に紹介します。

matuyuji

July 11, 2015
Tweet

More Decks by matuyuji

Other Decks in Programming

Transcript

  1. –Metal Programming Guide “A primary goal of Metal is to

    minimize the CPU overhead incurred by executing GPU workloads.”
  2. Rec. 709 luma For each pixel: y = 0.2126 r

    + 0.7152 g + 0.0722 b, i.e. y = (0.2126, 0.7152, 0.0722) · rgb
  3. float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722); void grayscale( texture2d in,

    texture2d out, uint2 gid /* Pixel Coordinates */) { float4 inColor = in.read(gid); float gray = dot(inColor.rgb, kRec709Luma); float4 outColor = float4(gray, gray, gray, inColor.a); out.write(outColor, gid); }
  4. Metal Shading Language C++11 base Restrictions: lambda, recursive functions, goto,

    etc. Add Qualifiers Standard Library: <metal_stdlib>
  5. grayscale.metal #include <metal_stdlib> using namespace metal; constant float3 kRec709Luma =

    float3(0.2126, 0.7152, 0.0722); kernel void grayscale( texture2d<float, access::read> in [[texture(0)]], texture2d<float, access::write> out [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { float4 inColor = in.read(gid); float gray = dot(inColor.rgb, kRec709Luma); float4 outColor = float4(gray, gray, gray, inColor.a); out.write(outColor, gid); }
  6. Execution Model Command Queue Device Command Buffer Compute Command Encoder

    Texture Texture Command Buffer Function Library (Compute) grayscale.metal
  7. MTLDevice Use MTLCopyAllDevices or MTLCreateSystemDefaultDevice for i in MTLCopyAllDevices() {

    print("\(i.name) \(i.headless) \(i.lowPower) \(i.maxThreadsPerThreadgroup)") } Optional("NVIDIA GeForce GT 650M") true false C.MTLSize(width: 1024, height: 1024, depth: 64) Optional("Intel HD Graphics 4000") true true C.MTLSize(width: 512, height: 512, depth: 512) (MBP Mid 2012)
  8. MTLComputeCommandEncoder let pipelineState = try! device.newComputePipelineStateWithFunction(function) let encoder = commandBuf.computeCommandEncoder()

    encoder.setComputePipelineState(pipelineState) encoder.setTexture(inTexture, atIndex: 0) encoder.setTexture(outTexture, atIndex: 1) kernel void grayscale( texture2d<float, access::read> in [[texture(0)]], texture2d<float, access::write> out [[texture(1)]], uint2 gid [[thread_position_in_grid]])
  9. dispatchThreadgroups let width = inTexture.width let height = inTexture.height let

    threadsPerThreadgroup = MTLSizeMake(32, 16, 1) let numGroups = MTLSizeMake( 1 + width / threadsPerThreadgroup.width, 1 + height / threadsPerThreadgroup.height, 1) encoder.dispatchThreadgroups(numGroups, threadsPerThreadgroup: threadsPerThreadgroup)
  10. Execution Model Command Queue Device Command Buffer Compute Command Encoder

    Texture Texture Command Buffer Function Library (Compute) grayscale.metal
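  11. Resource Storage Modes

    | Mode    | iOS texture | iOS buffer | OS X texture | OS X buffer |
    |---------|-------------|------------|--------------|-------------|
    | Shared  | ◦           | ◦          | ×            | ◦           |
    | Managed | ×           | ×          | ◦            |             |
    | Private |             |            |              |             |
    ◦ default × not available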
  12. MTLTexture.getBytes var buf = Array<UInt8>(count: width * height * 4,

    repeatedValue: 0) let region = MTLRegionMake2D(0, 0, width, height) texture.getBytes(&buf, bytesPerRow: rowBytes, fromRegion: region, mipmapLevel: 0) let colorSpace = CGColorSpaceCreateDeviceRGB() let context = CGBitmapContextCreate(&buf, width, height, 8, rowBytes, colorSpace, CGImageAlphaInfo.PremultipliedLast.rawValue)
  13. MetalKit MTKTextureLoader can load images from common file formats such

    as PNG, JPEG, and TIFF. let loader = MTKTextureLoader(device: device) let url = NSURL(fileURLWithPath: path) let inTexture = try! loader.textureWithContentsOfURL (url, options: nil)
  14. Texture Execution Model Command Queue Device Command Buffer Compute Command

    Encoder Texture Buffer Texture Texture Blit Command Encoder Texture Buffer Texture Command Buffer Function Library (Compute & Blit) *.metal
  15. × × × × × × × × × +

    element-wise multiplication
  16. blur (w/ norm.): [1 1 1; 1 1 1; 1 1 1]

    edge detect: [0 1 0; 1 -4 1; 0 1 0] sharpen: [0 -1 0; -1 5 -1; 0 -1 0] emboss: [-2 -1 0; -1 1 1; 0 1 2]
  17. Accelerate Framework func vImageConvolve_ARGBFFFF( _ src: UnsafePointer<vImage_Buffer>, _ dest: UnsafePointer<vImage_Buffer>,

    _ tempBuffer: UnsafeMutablePointer<Void>, _ srcOffsetToROI_X: vImagePixelCount, _ srcOffsetToROI_Y: vImagePixelCount, _ kernel: UnsafePointer<Float>, _ kernel_height: UInt32, _ kernel_width: UInt32, _ backgroundColor: UnsafeMutablePointer<Float>, _ flags: vImage_Flags) -> vImage_Error
  18. convolute.metal kernel void convolve(texture2d<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]], constant

    float3x3& weight[[buffer(0)]], uint2 gid[[thread_position_in_grid]]) { if (gid.x >= in.get_width() || gid.y >= in.get_height()) return; float4 in00 = in.read(gid + uint2(-1, -1)); float4 in10 = in.read(gid + uint2( 0, -1)); float4 in20 = in.read(gid + uint2(+1, -1)); float4 in01 = in.read(gid + uint2(-1, 0)); float4 in11 = in.read(gid + uint2( 0, 0)); float4 in21 = in.read(gid + uint2(+1, 0)); float4 in02 = in.read(gid + uint2(-1, +1)); float4 in12 = in.read(gid + uint2( 0, +1)); float4 in22 = in.read(gid + uint2(+1, +1)); float4 outColor = ( in00 * weight[0][0] + in10 * weight[1][0] + in20 * weight[2][0] + in01 * weight[0][1] + in11 * weight[1][1] + in21 * weight[2][1] + in02 * weight[0][2] + in12 * weight[1][2] + in22 * weight[2][2]); out.write(outColor, gid); }
  19. https://marcan.st/transf/waifu2x.py import json, sys, numpy as np from scipy import

    misc, signal from PIL import Image infile, outfile, modelpath = sys.argv[1:] model = json.load(open(modelpath)) im = Image.open(infile).convert("YCbCr") im = misc.fromimage(im.resize((2*im.size[0], 2*im.size[1]), resample=Image.NEAREST)).astype("float32") planes = [np.pad(im[:,:,0], len(model), "edge") / 255.0] for step in model: o_planes = [] for bias, weights in zip(step["bias"], step["weight"]): partial = None for ip, kernel in zip(planes, weights): p = signal.convolve2d(ip, np.float32(kernel), "valid") if partial is None: partial = p else: partial += p partial += np.float32(bias) o_planes.append(partial) planes = [np.maximum(p, 0) + 0.1 * np.minimum(p, 0) for p in o_planes] im[:,:,0] = np.clip(planes[0], 0, 1) * 255 misc.toimage(im, mode="YCbCr").convert("RGB").save(outfile)
  20. https://marcan.st/transf/waifu2x.py import json, sys, numpy as np from scipy import

    misc, signal from PIL import Image infile, outfile, modelpath = sys.argv[1:] model = json.load(open(modelpath)) im = Image.open(infile).convert("YCbCr") im = misc.fromimage(im.resize((2*im.size[0], 2*im.size[1]), resample=Image.NEAREST)).astype("float32") planes = [np.pad(im[:,:,0], len(model), "edge") / 255.0] for step in model: o_planes = [] for bias, weights in zip(step["bias"], step["weight"]): partial = None for ip, kernel in zip(planes, weights): p = signal.convolve2d(ip, np.float32(kernel), "valid") if partial is None: partial = p else: partial += p partial += np.float32(bias) o_planes.append(partial) planes = [np.maximum(p, 0) + 0.1 * np.minimum(p, 0) for p in o_planes] im[:,:,0] = np.clip(planes[0], 0, 1) * 255 misc.toimage(im, mode="YCbCr").convert("RGB").save(outfile)
  21. split kernel void splitToRGBChannels( texture2d<float, access::read> in[[texture(0)]], texture2d_array<float, access::write> out[[texture(1)]],

    uint2 gid[[thread_position_in_grid]]) { if (gid.x < in.get_width() && gid.y < in.get_height()) { out.write(float4(in.read(gid).r, 0.0f, 0.0f, 0.0f), gid, 0); out.write(float4(in.read(gid).g, 0.0f, 0.0f, 0.0f), gid, 1); out.write(float4(in.read(gid).b, 0.0f, 0.0f, 0.0f), gid, 2); } } let desc = MTLTextureDescriptor.texture2DDescriptorWithPixelFormat( format, width: width, height: height, mipmapped: false) desc.textureType = .Type2DArray desc.arrayLength = 3 let outTexture = device.newTextureWithDescriptor(desc)
  22. combine kernel void combineRGBChannels( texture2d_array<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]],

    uint2 gid[[thread_position_in_grid]]) { if (gid.x < in.get_width() && gid.y < in.get_height()) { float4 outColor(in.read(gid, 0).r, in.read(gid, 1).r, in.read(gid, 2).r, 1.0f); out.write(outColor, gid); } }
  23. kernel void waifu2x(texture2d_array<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]], constant float3x3*

    weights[[buffer(0)]], constant float& bias[[buffer(1)]], uint2 gid[[thread_position_in_grid]]) { if (gid.x >= in.get_width() || gid.y >= in.get_height()) return; float partial = bias; for (uint i = 0; i < in.get_array_size(); ++i) { float3 in0 = float3(in.read(gid + uint2(-1, -1), i).r, in.read(gid + uint2( 0, -1), i).r, in.read(gid + uint2(+1, -1), i).r); float3 in1 = float3(in.read(gid + uint2(-1, 0), i).r, in.read(gid + uint2( 0, 0), i).r, in.read(gid + uint2(+1, 0), i).r); float3 in2 = float3(in.read(gid + uint2(-1, +1), i).r, in.read(gid + uint2( 0, +1), i).r, in.read(gid + uint2(+1, +1), i).r); float3x3 weight = weights[i]; partial += dot(in0, weight[0]) + dot(in1, weight[1]) + dot(in2, weight[2]); } float p = fmax(partial, 0) + 0.1 * fmin(partial, 0); float4 outColor(p, 0, 0, 0); out.write(outColor, gid); }