waifu2xをMetalで書いてみた

waifu2xをMetalで書いてみた 第62回 Cocoa勉強会関西 (2015.7.11)

@matuyuji safx-dev.blogspot.jp ⌚

waifu2x

Overview Metal Metal Programming Image Convolution waifu2x Algorithm waifu2x in
Metal

–Metal Programming Guide “A primary goal of Metal is to
minimize the CPU overhead incurred by executing GPU workloads.”

Metal Metal Shading Language Metal Framework MetalKit Framework

Metal

Text Metal Programming grayscale

Rec. 709 luma For each pixel: y = 0.2126 r
+ 0.7152 g + 0.0722 b y = (0.2126, 0.7152, 0.0722) · rgb

Metal Shading Language float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722); float
gray = dot(inColor.rgb, kRec709Luma);

float3 kRec709Luma = float3(0.2126, 0.7152, 0.0722); void grayscale( texture2d in,
texture2d out, uint2 gid /* Pixel Coordinates */) { float4 inColor = in.read(gid); float gray = dot(inColor.rgb, kRec709Luma); float4 outColor = float4(gray, gray, gray, inColor.a); out.write(outColor, gid); }

Metal Shading Language C++11 base Restrictions: lambda, recursive functions, goto,
etc. Add Qualifiers Standard Library: <metal_stdlib>

grayscale.metal #include <metal_stdlib> using namespace metal; constant float3 kRec709Luma =
float3(0.2126, 0.7152, 0.0722); kernel void grayscale( texture2d<float, access::read> in [[texture(0)]], texture2d<float, access::write> out [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { float4 inColor = in.read(gid); float gray = dot(inColor.rgb, kRec709Luma); float4 outColor = float4(gray, gray, gray, inColor.a); out.write(outColor, gid); }

Execution Model Command Queue Device Command Buffer Compute Command Encoder
Texture Texture Command Buffer Function Library (Compute) grayscale.metal

MTLDevice Use MTLCopyAllDevices or MTLCreateSystemDefaultDevice for i in MTLCopyAllDevices() {
print("\(i.name) \(i.headless) \(i.lowPower) \(i.maxThreadsPerThreadgroup)") } Optional("NVIDIA GeForce GT 650M") true false C.MTLSize(width: 1024, height: 1024, depth: 64) Optional("Intel HD Graphics 4000") true true C.MTLSize(width: 512, height: 512, depth: 512) (MBP Mid 2012)

MTLLibrary & MTLFunction let library = device.newDefaultLibrary()! let function =
library.newFunctionWithName("grayscale")!

MTLQueue & MTLBuffer let queue = device.newCommandQueue() let commandBuf =
queue.commandBuffer()

Command Queue Device Command Buffer Compute Command Encoder Texture Texture
Function Library grayscale.metal

MTLComputeCommandEncoder let pipelineState = try! device.newComputePipelineStateWithFunction(function) let encoder = commandBuf.computeCommandEncoder()
encoder.setComputePipelineState(pipelineState) encoder.setTexture(inTexture, atIndex: 0) encoder.setTexture(outTexture, atIndex: 1) kernel void grayscale( texture2d<float, access::read> in [[texture(0)]], texture2d<float, access::write> out [[texture(1)]], uint2 gid [[thread_position_in_grid]])

dispatchThreadgroups let width = inTexture.width let height = inTexture.height let
threadsPerThreadgroup = MTLSizeMake(32, 16, 1) let numGroups = MTLSizeMake( 1 + width / threadsPerThreadgroup.width, 1 + height / threadsPerThreadgroup.height, 1) encoder.dispatchThreadgroups(numGroups, threadsPerThreadgroup: threadsPerThreadgroup)

threadsPerThreadgroup 32px 16px

threadsPerThreadgroup

endEncoding & commit encoder.endEncoding() commandBuf.commit() commandBuf.waitUntilCompleted()

Execution Model Command Queue Device Command Buffer Compute Command Encoder
Texture Texture Command Buffer Function Library (Compute) grayscale.metal

Managed Resource CPU GPU

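Resource Storage Modes (◦ default, × not available)

| Mode    | iOS texture | iOS buffer | OS X texture | OS X buffer |
|---------|-------------|------------|--------------|-------------|
| Shared  | ◦           | ◦          | ×            | ◦           |
| Managed | ×           | ×          | ◦            |             |
| Private |             |            |              |             |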

synchronizeResource let encoder = commandBuf.blitCommandEncoder() encoder.synchronizeResource(texture) CPU GPU

MTLTexture.getBytes var buf = Array<UInt8>(count: width * height * 4,
repeatedValue: 0) let region = MTLRegionMake2D(0, 0, width, height) texture.getBytes(&buf, bytesPerRow: rowBytes, fromRegion: region, mipmapLevel: 0) let colorSpace = CGColorSpaceCreateDeviceRGB() let context = CGBitmapContextCreate(&buf, width, height, 8, rowBytes, colorSpace, CGImageAlphaInfo.PremultipliedLast.rawValue)

MetalKit MTKTextureLoader can load images from common file formats such
as PNG, JPEG, and TIFF. let loader = MTKTextureLoader(device: device) let url = NSURL(fileURLWithPath: path) let inTexture = try! loader.textureWithContentsOfURL (url, options: nil)

safx/ Metal-CommandLine-Sample-Swift

Texture Execution Model Command Queue Device Command Buffer Compute Command
Encoder Texture Buffer Texture Texture Blit Command Encoder Texture Buffer Texture Command Buffer Function Library (Compute & Blit) *.metal

Metal

Text Image Convolution

ꒅ ꒅ ꒅ ꒅ ꒅ ꒅ ꒅ ꒅ ꒅ +
element-wise multiplication

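blur (w/ norm.): $\begin{pmatrix} 1 & 1 & 1 \\ 1 & 1 & 1 \\ 1 & 1 & 1 \end{pmatrix}$
edge detect: $\begin{pmatrix} 0 & 1 & 0 \\ 1 & -4 & 1 \\ 0 & 1 & 0 \end{pmatrix}$
sharpen: $\begin{pmatrix} 0 & -1 & 0 \\ -1 & 5 & -1 \\ 0 & -1 & 0 \end{pmatrix}$
emboss: $\begin{pmatrix} -2 & -1 & 0 \\ -1 & 1 & 1 \\ 0 & 1 & 2 \end{pmatrix}$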

Gimp Filters → Generic → Convolution Matrix

Accelerate Framework func vImageConvolve_ARGBFFFF( _ src: UnsafePointer<vImage_Buffer>, _ dest: UnsafePointer<vImage_Buffer>,
_ tempBuffer: UnsafeMutablePointer<Void>, _ srcOffsetToROI_X: vImagePixelCount, _ srcOffsetToROI_Y: vImagePixelCount, _ kernel: UnsafePointer<Float>, _ kernel_height: UInt32, _ kernel_width: UInt32, _ backgroundColor: UnsafeMutablePointer<Float>, _ flags: vImage_Flags) -> vImage_Error

convolute.metal kernel void convolve(texture2d<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]], constant
float3x3& weight[[buffer(0)]], uint2 gid[[thread_position_in_grid]]) { if (gid.x >= in.get_width() || gid.y >= in.get_height()) return; float4 in00 = in.read(gid + uint2(-1, -1)); float4 in10 = in.read(gid + uint2( 0, -1)); float4 in20 = in.read(gid + uint2(+1, -1)); float4 in01 = in.read(gid + uint2(-1, 0)); float4 in11 = in.read(gid + uint2( 0, 0)); float4 in21 = in.read(gid + uint2(+1, 0)); float4 in02 = in.read(gid + uint2(-1, +1)); float4 in12 = in.read(gid + uint2( 0, +1)); float4 in22 = in.read(gid + uint2(+1, +1)); float4 outColor = ( in00 * weight[0][0] + in10 * weight[1][0] + in20 * weight[2][0] + in01 * weight[0][1] + in11 * weight[1][1] + in21 * weight[2][1] + in02 * weight[0][2] + in12 * weight[1][2] + in22 * weight[2][2]); out.write(outColor, gid); }

Metal

waifu2x waifu2x

waifu2x algorithm waifu2x Convolutional Neural Network 2x (nearest neighbour) scale2.0x_model.json

waifu2x algorithm waifu2x Convolutional Neural Network noise2_model.json

https://marcan.st/transf/waifu2x.py import json, sys, numpy as np from scipy import
misc, signal from PIL import Image infile, outfile, modelpath = sys.argv[1:] model = json.load(open(modelpath)) im = Image.open(infile).convert("YCbCr") im = misc.fromimage(im.resize((2*im.size[0], 2*im.size[1]), resample=Image.NEAREST)).astype("float32") planes = [np.pad(im[:,:,0], len(model), "edge") / 255.0] for step in model: o_planes = [] for bias, weights in zip(step["bias"], step["weight"]): partial = None for ip, kernel in zip(planes, weights): p = signal.convolve2d(ip, np.float32(kernel), "valid") if partial is None: partial = p else: partial += p partial += np.float32(bias) o_planes.append(partial) planes = [np.maximum(p, 0) + 0.1 * np.minimum(p, 0) for p in o_planes] im[:,:,0] = np.clip(planes[0], 0, 1) * 255 misc.toimage(im, mode="YCbCr").convert("RGB").save(outfile)

… Convolutional Neural Network

Convolutional Neural Network W3 W1 W2 b

Leaky ReLUs (rectified linear unit) vin O vout

Convolutional Neural Network weight: 3 × 5 bias: 5

waifu2x’s model … … 32 … 32 … 64 …
64 … 128 128 3 3

https://marcan.st/transf/waifu2x.py import json, sys, numpy as np from scipy import
misc, signal from PIL import Image infile, outfile, modelpath = sys.argv[1:] model = json.load(open(modelpath)) im = Image.open(infile).convert("YCbCr") im = misc.fromimage(im.resize((2*im.size[0], 2*im.size[1]), resample=Image.NEAREST)).astype("float32") planes = [np.pad(im[:,:,0], len(model), "edge") / 255.0] for step in model: o_planes = [] for bias, weights in zip(step["bias"], step["weight"]): partial = None for ip, kernel in zip(planes, weights): p = signal.convolve2d(ip, np.float32(kernel), "valid") if partial is None: partial = p else: partial += p partial += np.float32(bias) o_planes.append(partial) planes = [np.maximum(p, 0) + 0.1 * np.minimum(p, 0) for p in o_planes] im[:,:,0] = np.clip(planes[0], 0, 1) * 255 misc.toimage(im, mode="YCbCr").convert("RGB").save(outfile)

model.json Level 1 Noise Reduction Level 2 Noise Reduction Resize
to 2x Resize to 2x (photo)

Metal

safx/waifu2x-metal: slow; 2x only; can't enlarge large images; invalid
result on Intel GPU

split kernel void splitToRGBChannels( texture2d<float, access::read> in[[texture(0)]], texture2d_array<float, access::write> out[[texture(1)]],
uint2 gid[[thread_position_in_grid]]) { if (gid.x < in.get_width() && gid.y < in.get_height()) { out.write(float4(in.read(gid).r, 0.0f, 0.0f, 0.0f), gid, 0); out.write(float4(in.read(gid).g, 0.0f, 0.0f, 0.0f), gid, 1); out.write(float4(in.read(gid).b, 0.0f, 0.0f, 0.0f), gid, 2); } } let desc = MTLTextureDescriptor.texture2DDescriptorWithPixelFormat( format, width: width, height: height, mipmapped: false) desc.textureType = .Type2DArray desc.arrayLength = 3 let outTexture = device.newTextureWithDescriptor(desc)

combine kernel void combineRGBChannels( texture2d_array<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]],
uint2 gid[[thread_position_in_grid]]) { if (gid.x < in.get_width() && gid.y < in.get_height()) { float4 outColor(in.read(gid, 0).r, in.read(gid, 1).r, in.read(gid, 2).r, 1.0f); out.write(outColor, gid); } }

kernel void waifu2x(texture2d_array<float, access::read> in[[texture(0)]], texture2d<float, access::write> out[[texture(1)]], constant float3x3*
weights[[buffer(0)]], constant float& bias[[buffer(1)]], uint2 gid[[thread_position_in_grid]]) { if (gid.x >= in.get_width() || gid.y >= in.get_height()) return; float partial = bias; for (uint i = 0; i < in.get_array_size(); ++i) { float3 in0 = float3(in.read(gid + uint2(-1, -1), i).r, in.read(gid + uint2( 0, -1), i).r, in.read(gid + uint2(+1, -1), i).r); float3 in1 = float3(in.read(gid + uint2(-1, 0), i).r, in.read(gid + uint2( 0, 0), i).r, in.read(gid + uint2(+1, 0), i).r); float3 in2 = float3(in.read(gid + uint2(-1, +1), i).r, in.read(gid + uint2( 0, +1), i).r, in.read(gid + uint2(+1, +1), i).r); float3x3 weight = weights[i]; partial += dot(in0, weight[0]) + dot(in1, weight[1]) + dot(in2, weight[2]); } float p = fmax(partial, 0) + 0.1 * fmin(partial, 0); float4 outColor(p, 0, 0, 0); out.write(outColor, gid); }

References nagadomi/waifu2x waifu2xとその派生ソフト一覧 http://kourindrug.sakura.ne.jp/waifu2x.html

waifu2xをMetalで書いてみた

waifu2xをMetalで書いてみた

More Decks by matuyuji

Other Decks in Programming

Featured

Transcript