Upgrade to Pro — share decks privately, control downloads, hide ads and more …

waifu2xをMetalで書いてみた

 waifu2xをMetalで書いてみた

Mac OS X 10.11でのMetalの使いかたと、waifu2xのアルゴリズムについて簡単に紹介します。

matuyuji

July 11, 2015
Tweet

More Decks by matuyuji

Other Decks in Programming

Transcript

  1. waifu2xをMetalで書いてみた
    第62回 Cocoa勉強会関西 (2015.7.11)

    View Slide

  2. @matuyuji
    safx-dev.blogspot.jp

    View Slide

  3. waifu2x

    View Slide

  4. Overview
    Metal
    Metal Programming
    Image Convolution
    waifu2x Algorithm
    waifu2x in Metal

    View Slide

  5. –Metal Programming Guide
    “A primary goal of Metal is to minimize the CPU
    overhead incurred by executing GPU workloads.”

    View Slide

  6. Metal
    Metal Shading Language
    Metal Framework
    MetalKit Framework

    View Slide

  7. Overview
    Metal
    Metal Programming
    Image Convolution
    waifu2x Algorithm
    waifu2x in Metal

    View Slide

  8. Text
    Metal Programming
    grayscale

    View Slide

  9. Rec. 709 luma
    For each pixel:
    y = 0.2126 r + 0.7152 g + 0.0722 b
    y = (0.2126, 0.7152, 0.0722) · rgb

    View Slide

  10. Metal Shading Language
    float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722);
    float gray = dot(inColor.rgb, kRec709Luma);

    View Slide

  11. float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722);
    void grayscale(
    texture2d<float> in,
    texture2d<float> out,
    uint2 gid) // Pixel Coordinates
    {
    float4 inColor = in.read(gid);
    float gray = dot(inColor.rgb, kRec709Luma);
    float4 outColor = float4(gray, gray, gray, inColor.a);
    out.write(outColor, gid);
    }

    View Slide

  12. Metal Shading Language
    C++11 base
    Restrictions: lambda, recursive functions, goto, etc.
    Add Qualifiers
    Standard Library: <metal_stdlib>

    View Slide

  13. grayscale.metal
    #include <metal_stdlib>
    using namespace metal;
    constant float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722);
    kernel void grayscale(
    texture2d<float, access::read> in [[texture(0)]],
    texture2d<float, access::write> out [[texture(1)]],
    uint2 gid [[thread_position_in_grid]])
    {
    float4 inColor = in.read(gid);
    float gray = dot(inColor.rgb, kRec709Luma);
    float4 outColor = float4(gray, gray, gray, inColor.a);
    out.write(outColor, gid);
    }

    View Slide

  14. Execution Model
    Command Queue
    Device
    Command Buffer
    Compute
    Command
    Encoder
    Texture
    Texture
    Command Buffer
    Function
    Library
    (Compute)
    grayscale.metal

    View Slide

  15. MTLDevice
    Use MTLCopyAllDevices or
    MTLCreateSystemDefaultDevice
    for i in MTLCopyAllDevices() {
    print("\(i.name) \(i.headless) \(i.lowPower)
    \(i.maxThreadsPerThreadgroup)")
    }
    Optional("NVIDIA GeForce GT 650M") true false
    C.MTLSize(width: 1024, height: 1024, depth: 64)
    Optional("Intel HD Graphics 4000") true true
    C.MTLSize(width: 512, height: 512, depth: 512)
    (MBP Mid 2012)

    View Slide

  16. MTLLibrary & MTLFunction
    let library = device.newDefaultLibrary()!
    let function = library.newFunctionWithName("grayscale")!

    View Slide

  17. MTLQueue & MTLBuffer
    let queue = device.newCommandQueue()
    let commandBuf = queue.commandBuffer()

    View Slide

  18. Command Queue
    Device
    Command Buffer
    Compute
    Command
    Encoder
    Texture
    Texture
    Function
    Library
    grayscale.metal

    View Slide

  19. MTLComputeCommandEncoder
    let pipelineState = try!
    device.newComputePipelineStateWithFunction(function)
    let encoder = commandBuf.computeCommandEncoder()
    encoder.setComputePipelineState(pipelineState)
    encoder.setTexture(inTexture, atIndex: 0)
    encoder.setTexture(outTexture, atIndex: 1)
    kernel void grayscale(
    texture2d<float, access::read> in [[texture(0)]],
    texture2d<float, access::write> out [[texture(1)]],
    uint2 gid [[thread_position_in_grid]])

    View Slide

  20. dispatchThreadgroups
    let width = inTexture.width
    let height = inTexture.height
    let threadsPerThreadgroup = MTLSizeMake(32, 16, 1)
    let numGroups = MTLSizeMake(
    1 + width / threadsPerThreadgroup.width,
    1 + height / threadsPerThreadgroup.height,
    1)
    encoder.dispatchThreadgroups(numGroups,
    threadsPerThreadgroup: threadsPerThreadgroup)

    View Slide

  21. threadsPerThreadgroup
    32px
    16px

    View Slide

  22. threadsPerThreadgroup

    View Slide

  23. endEncoding & commit
    encoder.endEncoding()
    commandBuf.commit()
    commandBuf.waitUntilCompleted()

    View Slide

  24. Execution Model
    Command Queue
    Device
    Command Buffer
    Compute
    Command
    Encoder
    Texture
    Texture
    Command Buffer
    Function
    Library
    (Compute)
    grayscale.metal

    View Slide

  25. Managed Resource
    CPU
    GPU

    View Slide

  26. Resource Storage Modes
    texture buffer texture buffer
    Shared ○ ○ × ○
    Managed × × ○
    Private
    iOS OS X
    ○ default × not available

    View Slide

  27. synchronizeResource
    let encoder = commandBuf.blitCommandEncoder()
    encoder.synchronizeResource(texture)
    CPU
    GPU

    View Slide

  28. MTLTexture.getBytes
    var buf = Array<UInt8>(count: width * height * 4,
    repeatedValue: 0)
    let region = MTLRegionMake2D(0, 0, width, height)
    texture.getBytes(&buf, bytesPerRow: rowBytes,
    fromRegion: region, mipmapLevel: 0)
    let colorSpace = CGColorSpaceCreateDeviceRGB()
    let context = CGBitmapContextCreate(&buf,
    width, height, 8, rowBytes, colorSpace,
    CGImageAlphaInfo.PremultipliedLast.rawValue)

    View Slide

  29. MetalKit
    MTKTextureLoader can load images from common
    file formats such as PNG, JPEG, and TIFF.
    let loader = MTKTextureLoader(device: device)
    let url = NSURL(fileURLWithPath: path)
    let inTexture = try! loader.textureWithContentsOfURL
    (url, options: nil)

    View Slide

  30. safx/
    Metal-CommandLine-Sample-Swift

    View Slide

  31. Texture
    Execution Model
    Command Queue
    Device
    Command Buffer
    Compute
    Command
    Encoder
    Texture
    Buffer
    Texture
    Texture
    Blit
    Command
    Encoder
    Texture
    Buffer
    Texture
    Command Buffer
    Function
    Library
    (Compute & Blit)
    *.metal

    View Slide

  32. Overview
    Metal
    Metal Programming
    Image Convolution
    waifu2x Algorithm
    waifu2x in Metal

    View Slide

  33. Text
    Image Convolution

    View Slide

  34. ꒅ ꒅ ꒅ
    ꒅ ꒅ ꒅ
    ꒅ ꒅ ꒅ
    +
    element-wise
    multiplication

    View Slide

  35. 1 1 1
    1 1 1
    1 1 1
    0 1 0
    1 -4 1
    0 1 0
    -2 -1 0
    -1 1 1
    0 1 2
    0 -1 0
    -1 5 -1
    0 -1 0
    blur
    edge detect
    sharpen
    emboss
    w/ norm.

    View Slide

  36. Gimp
    Filters → Generic →
    Convolution Matrix

    View Slide

  37. Accelerate Framework
    func vImageConvolve_ARGBFFFF(
    _ src: UnsafePointer<vImage_Buffer>,
    _ dest: UnsafePointer<vImage_Buffer>,
    _ tempBuffer: UnsafeMutablePointer<Void>,
    _ srcOffsetToROI_X: vImagePixelCount,
    _ srcOffsetToROI_Y: vImagePixelCount,
    _ kernel: UnsafePointer<Float>,
    _ kernel_height: UInt32,
    _ kernel_width: UInt32,
    _ backgroundColor: UnsafeMutablePointer<Float>,
    _ flags: vImage_Flags) -> vImage_Error

    View Slide

  38. convolute.metal
    kernel void convolve(texture2d<float, access::read> in[[texture(0)]],
    texture2d<float, access::write> out[[texture(1)]],
    constant float3x3& weight[[buffer(0)]],
    uint2 gid[[thread_position_in_grid]])
    {
    if (gid.x >= in.get_width() || gid.y >= in.get_height()) return;
    float4 in00 = in.read(gid + uint2(-1, -1));
    float4 in10 = in.read(gid + uint2( 0, -1));
    float4 in20 = in.read(gid + uint2(+1, -1));
    float4 in01 = in.read(gid + uint2(-1, 0));
    float4 in11 = in.read(gid + uint2( 0, 0));
    float4 in21 = in.read(gid + uint2(+1, 0));
    float4 in02 = in.read(gid + uint2(-1, +1));
    float4 in12 = in.read(gid + uint2( 0, +1));
    float4 in22 = in.read(gid + uint2(+1, +1));
    float4 outColor =
    ( in00 * weight[0][0] + in10 * weight[1][0] + in20 * weight[2][0]
    + in01 * weight[0][1] + in11 * weight[1][1] + in21 * weight[2][1]
    + in02 * weight[0][2] + in12 * weight[1][2] + in22 * weight[2][2]);
    out.write(outColor, gid);
    }

    View Slide

  39. Overview
    Metal
    Metal Programming
    Image Convolution
    waifu2x Algorithm
    waifu2x in Metal

    View Slide

  40. waifu2x
    YCKHWZ

    View Slide

  41. waifu2x algorithm
    waifu2x
    Convolutional
    Neural Network
    2x (nearest
    neighbour)
    scale2.0x_model.json

    View Slide

  42. waifu2x algorithm
    waifu2x
    Convolutional
    Neural Network
    noise2_model.json

    View Slide

  43. https://marcan.st/transf/waifu2x.py
    import json, sys, numpy as np
    from scipy import misc, signal
    from PIL import Image
    infile, outfile, modelpath = sys.argv[1:]
    model = json.load(open(modelpath))
    im = Image.open(infile).convert("YCbCr")
    im = misc.fromimage(im.resize((2*im.size[0], 2*im.size[1]),
    resample=Image.NEAREST)).astype("float32")
    planes = [np.pad(im[:,:,0], len(model), "edge") / 255.0]
    for step in model:
    o_planes = []
    for bias, weights in zip(step["bias"], step["weight"]):
    partial = None
    for ip, kernel in zip(planes, weights):
    p = signal.convolve2d(ip, np.float32(kernel), "valid")
    if partial is None:
    partial = p
    else:
    partial += p
    partial += np.float32(bias)
    o_planes.append(partial)
    planes = [np.maximum(p, 0) + 0.1 * np.minimum(p, 0) for p in o_planes]
    im[:,:,0] = np.clip(planes[0], 0, 1) * 255
    misc.toimage(im, mode="YCbCr").convert("RGB").save(outfile)

    View Slide


  44. Convolutional Neural Network

    View Slide

  45. Convolutional Neural Network
    W3
    W1
    W2
    b

    View Slide

  46. Leaky ReLUs
    (rectified linear unit)
    vin
    O
    vout

    View Slide

  47. Convolutional Neural Network
    weight: 3 × 5
    bias: 5

    View Slide

  48. waifu2x’s model


    32

    32

    64

    64

    128 128
    3 3

    View Slide

  49. waifu2x’s model


    32

    32

    64

    64

    128 128

    3 3

    View Slide

  50. https://marcan.st/transf/waifu2x.py
    import json, sys, numpy as np
    from scipy import misc, signal
    from PIL import Image
    infile, outfile, modelpath = sys.argv[1:]
    model = json.load(open(modelpath))
    im = Image.open(infile).convert("YCbCr")
    im = misc.fromimage(im.resize((2*im.size[0], 2*im.size[1]),
    resample=Image.NEAREST)).astype("float32")
    planes = [np.pad(im[:,:,0], len(model), "edge") / 255.0]
    for step in model:
    o_planes = []
    for bias, weights in zip(step["bias"], step["weight"]):
    partial = None
    for ip, kernel in zip(planes, weights):
    p = signal.convolve2d(ip, np.float32(kernel), "valid")
    if partial is None:
    partial = p
    else:
    partial += p
    partial += np.float32(bias)
    o_planes.append(partial)
    planes = [np.maximum(p, 0) + 0.1 * np.minimum(p, 0) for p in o_planes]
    im[:,:,0] = np.clip(planes[0], 0, 1) * 255
    misc.toimage(im, mode="YCbCr").convert("RGB").save(outfile)

    View Slide

  51. model.json
    Level 1 Noise Reduction
    Level 2 Noise Reduction
    Resize to 2x
    Resize to 2x (photo)

    View Slide

  52. Overview
    Metal
    Metal Programming
    Image Convolution
    waifu2x Algorithm
    waifu2x in Metal

    View Slide

  53. safx/waifu2x-metal
    Slow
    2x only
    can’t enlarge for large image
    invalid result on Intel GPU

    View Slide

  54. split
    kernel void splitToRGBChannels(
    texture2d<float, access::read> in[[texture(0)]],
    texture2d_array<float, access::write> out[[texture(1)]],
    uint2 gid[[thread_position_in_grid]])
    {
    if (gid.x < in.get_width() && gid.y < in.get_height()) {
    out.write(float4(in.read(gid).r, 0.0f, 0.0f, 0.0f), gid, 0);
    out.write(float4(in.read(gid).g, 0.0f, 0.0f, 0.0f), gid, 1);
    out.write(float4(in.read(gid).b, 0.0f, 0.0f, 0.0f), gid, 2);
    }
    }
    let desc = MTLTextureDescriptor.texture2DDescriptorWithPixelFormat(
    format, width: width, height: height, mipmapped: false)
    desc.textureType = .Type2DArray
    desc.arrayLength = 3
    let outTexture = device.newTextureWithDescriptor(desc)

    View Slide

  55. combine
    kernel void combineRGBChannels(
    texture2d_array<float, access::read> in[[texture(0)]],
    texture2d<float, access::write> out[[texture(1)]],
    uint2 gid[[thread_position_in_grid]])
    {
    if (gid.x < in.get_width() && gid.y < in.get_height()) {
    float4 outColor(in.read(gid, 0).r,
    in.read(gid, 1).r,
    in.read(gid, 2).r, 1.0f);
    out.write(outColor, gid);
    }
    }

    View Slide

  56. kernel void waifu2x(texture2d_array<float, access::read> in[[texture(0)]],
    texture2d<float, access::write> out[[texture(1)]],
    constant float3x3* weights[[buffer(0)]],
    constant float& bias[[buffer(1)]],
    uint2 gid[[thread_position_in_grid]])
    {
    if (gid.x >= in.get_width() || gid.y >= in.get_height()) return;
    float partial = bias;
    for (uint i = 0; i < in.get_array_size(); ++i) {
    float3 in0 = float3(in.read(gid + uint2(-1, -1), i).r,
    in.read(gid + uint2( 0, -1), i).r,
    in.read(gid + uint2(+1, -1), i).r);
    float3 in1 = float3(in.read(gid + uint2(-1, 0), i).r,
    in.read(gid + uint2( 0, 0), i).r,
    in.read(gid + uint2(+1, 0), i).r);
    float3 in2 = float3(in.read(gid + uint2(-1, +1), i).r,
    in.read(gid + uint2( 0, +1), i).r,
    in.read(gid + uint2(+1, +1), i).r);
    float3x3 weight = weights[i];
    partial += dot(in0, weight[0])
    + dot(in1, weight[1])
    + dot(in2, weight[2]);
    }
    float p = fmax(partial, 0) + 0.1 * fmin(partial, 0);
    float4 outColor(p, 0, 0, 0);
    out.write(outColor, gid);
    }

    View Slide

  57. References
    nagadomi/waifu2x
    waifu2xとその派生ソフト一覧
    http://kourindrug.sakura.ne.jp/waifu2x.html

    View Slide