matuyuji
July 11, 2015
3.4k

# waifu2xをMetalで書いてみた

Mac OS X 10.11でのMetalの使いかたと、waifu2xのアルゴリズムについて簡単に紹介します。

July 11, 2015

## Transcript

Metal
5. ### –Metal Programming Guide “A primary goal of Metal is to

minimize the CPU overhead incurred by executing GPU workloads.”

Metal

9. ### Rec. 709 luma

For each pixel: $y = 0.2126\,r + 0.7152\,g + 0.0722\,b$, i.e. $y = (0.2126,\ 0.7152,\ 0.0722) \cdot \mathit{rgb}$
10. ### Metal Shading Language float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722); float

gray = dot(inColor.rgb, kRec709Luma);
11. ### float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722); void grayscale( texture2d in,

texture2d out, uint2 gid // Pixel Coordinates { float4 inColor = in.read(gid); float gray = dot(inColor.rgb, kRec709Luma); float4 outColor = float4(gray, gray, gray, inColor.a); out.write(outColor, gid); }
12. ### Metal Shading Language C++11 base Restrictions: lambda, recursive functions, goto,

etc. Add Qualifiers Standard Library: <metal_stdlib>
13. ### grayscale.metal #include <metal_stdlib> using namespace metal; constant float3 kRec709Luma =

float3(0.2126, 0.7152, 0.0722); kernel void grayscale( texture2d<float, access::read> in [[texture(0)]], texture2d<float, access::write> out [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { float4 inColor = in.read(gid); float gray = dot(inColor.rgb, kRec709Luma); float4 outColor = float4(gray, gray, gray, inColor.a); out.write(outColor, gid); }
14. ### Execution Model Command Queue Device Command Buffer Compute Command Encoder

Texture Texture Command Buffer Function Library (Compute) grayscale.metal
15. ### MTLDevice Use MTLCopyAllDevices or MTLCreateSystemDefaultDevice for i in MTLCopyAllDevices() {

print("\(i.name) \(i.headless) \(i.lowPower) \(i.maxThreadsPerThreadgroup)") } Optional("NVIDIA GeForce GT 650M") true false C.MTLSize(width: 1024, height: 1024, depth: 64) Optional("Intel HD Graphics 4000") true true C.MTLSize(width: 512, height: 512, depth: 512) (MBP Mid 2012)
16. ### MTLLibrary & MTLFunction let library = device.newDefaultLibrary()! let function =

library.newFunctionWithName("grayscale")!
17. ### MTLQueue & MTLBuffer let queue = device.newCommandQueue() let commandBuf =

queue.commandBuffer()
18. ### Command Queue Device Command Buffer Compute Command Encoder Texture Texture

Function Library grayscale.metal
19. ### MTLComputeCommandEncoder let pipelineState = try! device.newComputePipelineStateWithFunction(function) let encoder = commandBuf.computeCommandEncoder()

encoder.setComputePipelineState(pipelineState) encoder.setTexture(inTexture, atIndex: 0) encoder.setTexture(outTexture, atIndex: 1) kernel void grayscale( texture2d<float, access::read> in [[texture(0)]], texture2d<float, access::write> out [[texture(1)]], uint2 gid [[thread_position_in_grid]])

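26. ### Resource Storage Modes

| Mode    | iOS texture | iOS buffer | OS X texture | OS X buffer |
|---------|-------------|------------|--------------|-------------|
| Shared  | ◦           | ◦          | ×            | ◦           |
| Managed | ×           | ×          | ◦            |             |
| Private |             |            |              |             |

◦ default × not available (blank cells: presumably available but not the default)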

28. ### MTLTexture.getBytes var buf = Array<UInt8>(count: width * height * 4,

repeatedValue: 0) let region = MTLRegionMake2D(0, 0, width, height) texture.getBytes(&buf, bytesPerRow: rowBytes, fromRegion: region, mipmapLevel: 0) let colorSpace = CGColorSpaceCreateDeviceRGB() let context = CGBitmapContextCreate(&buf, width, height, 8, rowBytes, colorSpace, CGImageAlphaInfo.PremultipliedLast.rawValue)
29. ### MetalKit MTKTextureLoader can load images from common file formats such

as PNG, JPEG, and TIFF. let loader = MTKTextureLoader(device: device) let url = NSURL(fileURLWithPath: path) let inTexture = try! loader.textureWithContentsOfURL (url, options: nil)

Metal

34. ### × × × × × × × × × +

element-wise multiplication
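35. ### Example 3×3 kernels

$$
\text{blur (w/ norm.)}: \begin{pmatrix} 1 & 1 & 1 \\ 1 & 1 & 1 \\ 1 & 1 & 1 \end{pmatrix}
\qquad
\text{edge detect}: \begin{pmatrix} 0 & 1 & 0 \\ 1 & -4 & 1 \\ 0 & 1 & 0 \end{pmatrix}
$$

$$
\text{sharpen}: \begin{pmatrix} 0 & -1 & 0 \\ -1 & 5 & -1 \\ 0 & -1 & 0 \end{pmatrix}
\qquad
\text{emboss}: \begin{pmatrix} -2 & -1 & 0 \\ -1 & 1 & 1 \\ 0 & 1 & 2 \end{pmatrix}
$$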

37. ### Accelerate Framework func vImageConvolve_ARGBFFFF( _ src: UnsafePointer<vImage_Buffer>, _ dest: UnsafePointer<vImage_Buffer>,

_ tempBuffer: UnsafeMutablePointer<Void>, _ srcOffsetToROI_X: vImagePixelCount, _ srcOffsetToROI_Y: vImagePixelCount, _ kernel: UnsafePointer<Float>, _ kernel_height: UInt32, _ kernel_width: UInt32, _ backgroundColor: UnsafeMutablePointer<Float>, _ flags: vImage_Flags) -> vImage_Error
38. ### convolute.metal kernel void convolve(texture2d<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]], constant

float3x3& weight[[buffer(0)]], uint2 gid[[thread_position_in_grid]]) { if (gid.x >= in.get_width() || gid.y >= in.get_height()) return; float4 in00 = in.read(gid + uint2(-1, -1)); float4 in10 = in.read(gid + uint2( 0, -1)); float4 in20 = in.read(gid + uint2(+1, -1)); float4 in01 = in.read(gid + uint2(-1, 0)); float4 in11 = in.read(gid + uint2( 0, 0)); float4 in21 = in.read(gid + uint2(+1, 0)); float4 in02 = in.read(gid + uint2(-1, +1)); float4 in12 = in.read(gid + uint2( 0, +1)); float4 in22 = in.read(gid + uint2(+1, +1)); float4 outColor = ( in00 * weight[0][0] + in10 * weight[1][0] + in20 * weight[2][0] + in01 * weight[0][1] + in11 * weight[1][1] + in21 * weight[2][1] + in02 * weight[0][2] + in12 * weight[1][2] + in22 * weight[2][2]); out.write(outColor, gid); }

Metal

43. ### https://marcan.st/transf/waifu2x.py import json, sys, numpy as np from scipy import

misc, signal from PIL import Image infile, outfile, modelpath = sys.argv[1:] model = json.load(open(modelpath)) im = Image.open(infile).convert("YCbCr") im = misc.fromimage(im.resize((2*im.size[0], 2*im.size[1]), resample=Image.NEAREST)).astype("float32") planes = [np.pad(im[:,:,0], len(model), "edge") / 255.0] for step in model: o_planes = [] for bias, weights in zip(step["bias"], step["weight"]): partial = None for ip, kernel in zip(planes, weights): p = signal.convolve2d(ip, np.float32(kernel), "valid") if partial is None: partial = p else: partial += p partial += np.float32(bias) o_planes.append(partial) planes = [np.maximum(p, 0) + 0.1 * np.minimum(p, 0) for p in o_planes] im[:,:,0] = np.clip(planes[0], 0, 1) * 255 misc.toimage(im, mode="YCbCr").convert("RGB").save(outfile)

48. ### waifu2x’s model … … 32 … 32 … 64 …

64 … 128 128 3 3
51. ### model.json Level 1 Noise Reduction Level 2 Noise Reduction Resize

to 2x Resize to 2x (photo)

Metal
53. ### safx/waifu2x-metal

- Slow
- 2x only
- can’t enlarge large images
- invalid result on Intel GPU
54. ### split kernel void splitToRGBChannels( texture2d<float, access::read> in[[texture(0)]], texture2d_array<float, access::write> out[[texture(1)]],

uint2 gid[[thread_position_in_grid]]) { if (gid.x < in.get_width() && gid.y < in.get_height()) { out.write(float4(in.read(gid).r, 0.0f, 0.0f, 0.0f), gid, 0); out.write(float4(in.read(gid).g, 0.0f, 0.0f, 0.0f), gid, 1); out.write(float4(in.read(gid).b, 0.0f, 0.0f, 0.0f), gid, 2); } } let desc = MTLTextureDescriptor.texture2DDescriptorWithPixelFormat( format, width: width, height: height, mipmapped: false) desc.textureType = .Type2DArray desc.arrayLength = 3 let outTexture = device.newTextureWithDescriptor(desc)
55. ### combine kernel void combineRGBChannels( texture2d_array<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]],

uint2 gid[[thread_position_in_grid]]) { if (gid.x < in.get_width() && gid.y < in.get_height()) { float4 outColor(in.read(gid, 0).r, in.read(gid, 1).r, in.read(gid, 2).r, 1.0f); out.write(outColor, gid); } }
56. ### kernel void waifu2x(texture2d_array<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]], constant float3x3*

weights[[buffer(0)]], constant float& bias[[buffer(1)]], uint2 gid[[thread_position_in_grid]]) { if (gid.x >= in.get_width() || gid.y >= in.get_height()) return; float partial = bias; for (uint i = 0; i < in.get_array_size(); ++i) { float3 in0 = float3(in.read(gid + uint2(-1, -1), i).r, in.read(gid + uint2( 0, -1), i).r, in.read(gid + uint2(+1, -1), i).r); float3 in1 = float3(in.read(gid + uint2(-1, 0), i).r, in.read(gid + uint2( 0, 0), i).r, in.read(gid + uint2(+1, 0), i).r); float3 in2 = float3(in.read(gid + uint2(-1, +1), i).r, in.read(gid + uint2( 0, +1), i).r, in.read(gid + uint2(+1, +1), i).r); float3x3 weight = weights[i]; partial += dot(in0, weight[0]) + dot(in1, weight[1]) + dot(in2, weight[2]); } float p = fmax(partial, 0) + 0.1 * fmin(partial, 0); float4 outColor(p, 0, 0, 0); out.write(outColor, gid); }