Slide 1

Slide 1 text

C++ͰγΣʔμΛॻ͘ NAOMASA MATSUBAYASHI

Slide 2

Slide 2 text

for( int i = 0; i != 24; ++i ) { c[ i ] = a[ i ] + b[ i ]; } a b c Ϩδελ "-6

Slide 3

Slide 3 text

"-6 "-6 "-6 "-6 "-6 "-6 a b c SIMD Ϩδελʹෳ਺ͷ஋Λࡌͤͯ ୔ࢁͷALUͰҰؾʹܭࢉ͢Ε͹଎͍

Slide 4

Slide 4 text

Tensor Core ϩʔυετΞ σΟεύον໋ྩΩϟογϡ ϨδελόϯΫ ALU 1024bitͷϨδελʹ 32bitුಈখ਺఺਺Λ32ݸฒ΂ͯ 1໋ྩͰ32ݸͷ஋Λܭࢉ͠Α͏

Slide 5

Slide 5 text

͜ͷϓϩηοαΛ େྔʹฒ΂ͨΒ େྔͷσʔλΛߴ଎ʹॲཧͰ͖Δ GPU

Slide 6

Slide 6 text

1໋ྩσίʔυͨ͠Β 32ݸͷԋࢉث͕ ܭࢉΛߦ͏ Tensor Core ϩʔυετΞ σΟεύον໋ྩΩϟογϡ ϨδελόϯΫ

Slide 7

Slide 7 text

add.f32 r2, r1, r0; r0ʹ͋Δ32ݸͷ୯ਫ਼౓ුಈখ਺఺਺ͱ r1ʹ͋Δ32ݸͷ୯ਫ਼౓ුಈখ਺఺਺ͷ ͦΕͧΕͷཁૉΛ଍ͨ݁͠ՌΛr2ʹు͘ r0 r1 r2 0 1024 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

Slide 8

Slide 8 text

ΞηϯϒϦΛ௚઀ॻ͘ͷ͸ਏ͍ͷͰ ߴڃݴޠ͕ཉ͍͠

Slide 9

Slide 9 text

HLSL StructuredBuffer input_buffer : register(t0); RWStructuredBuffer output_buffer : register(u0); [numthreads(1, 1, 1)] void main( uint3 index : SV_DispatchThreadID ) { output_buffer[ index.x ] = input_buffer[ index.x ] * 2.0; } GLSL #version 460 layout( binding=0 ) buffer InputBuffer { float input_buffer[]; }; layout( binding=1 ) buffer OutputBuffer { float output_buffer[]; }; layout( local_size_x=1, local_size_y=1, local_size_z=1 ) in; void main() { output_buffer[ gl_GlobalInvocationID.x ] = input_buffer[ gl_GlobalInvocationID.x ] * 2.0; }

Slide 10

Slide 10 text

CUDA __global__ void find( float *input_buffer, float *output_buffer ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; output_buffer[ index ] = input_buffer[ index ] * 2.0f; } OpenCL C __kernel void func( __global float *input_buffer, __global float *output_buffer ) { unsigned int index = get_global_id( 0 ); output_buffer[ index ] = input_buffer[ index ] * 2.0f; }

Slide 11

Slide 11 text

GPUݻ༗ͷ੍໿ʹ߹ΘͤΔҝ CPUͱ͸ҟͳΔߴڃݴޠ͕༻͍ΒΕΔ

Slide 12

Slide 12 text

GPU޲͚ͷߴڃݴޠͱCPU޲͚ͷߴڃݴޠ͸ҧ͏͚Ͳ GPU޲͚ʹॻ͍ͨίʔυΛCPUͰ࣮ߦ͍ͨ͠ GPUͰ࣮ߦ͢Δؔ਺ʹର͢ΔϢχοτςετΛॻ͖͍ͨ GPU༻ͷ࣮૷ͱCPU༻ͷ࣮૷ͷ ίʔυϕʔεΛڞ௨ʹ͍ͨ͠

Slide 13

Slide 13 text

#version 460 layout(local_size_x = 16, local_size_y = 16 ) in; vec3 diffuse( vec3 surface_pos, vec3 light_pos, vec3 normal, vec3 albedo, float metallicness, vec3 light_energy ) { const vec3 relative_light_pos = light_pos - surface_pos; const vec3 L = normalize( relative_light_pos ); const float light_distance2 = dot( relative_light_pos, relative_light_pos ); const float diffuse = max( dot( L, normal ), 0.0f ) / 3.141592653589793f; const vec3 linear = ( 1 - metallicness ) * diffuse * albedo * light_energy / light_distance2; return linear; } void main() { ... } GLSLͱͯ͠ਖ਼͍͠ίʔυ

Slide 14

Slide 14 text

# define BOOST_TEST_MAIN #include #include #include using namespace glm; vec3 diffuse( vec3 surface_pos, vec3 light_pos, vec3 normal, vec3 albedo, float metallicness, vec3 light_energy ) { const vec3 relative_light_pos = light_pos - surface_pos; const vec3 L = normalize( relative_light_pos ); const float light_distance2 = dot( relative_light_pos, relative_light_pos ); const float diffuse = max( dot( L, normal ), 0.0f ) / 3.141592653589793f; const vec3 linear = ( 1 - metallicness ) * diffuse * albedo * light_energy / light_distance2; return linear; } BOOST_AUTO_TEST_CASE(max_diffuse) { vec3 l = diffuse( vec3( 0.0f, 0.0f, 1.0f ), vec3( 1.0f, 0.0f, 1.0f ), vec3( 1.0f, 0.0f, 0.0f ), vec3( 1.0f, 1.0f, 1.0f ), 0.0f, vec3( 1.0f, 1.0f, 1.0f ) ); BOOST_CHECK_CLOSE( l.x, 1.f/3.141592653589793f, 0.0001f ); } ͦͷ··C++ͱͯ͠ਖ਼͍͠ίʔυ

Slide 15

Slide 15 text

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ܧঝ ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++03ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ GLSL͕஀ੜͨ͠ࠒͷঢ়گ ڞ༻ମ

Slide 16

Slide 16 text

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++11ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর nullptr εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ܧঝ ڞ༻ମ variadic templates

Slide 17

Slide 17 text

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++14ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum generic lambda ໭Γ஋ܕਪ࿦ ू੒ମ ܧঝ ڞ༻ମ nullptr variadic templates

Slide 18

Slide 18 text

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++17ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ if constexpr constexpr lambda ߏ଄Խଋറ ৞ΈࠐΈࣜ ඪ४ଐੑ ܧঝ ڞ༻ମ generic lambda ໭Γ஋ܕਪ࿦ nullptr variadic templates

Slide 19

Slide 19 text

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++20ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ constexpr lambda ߏ଄Խଋറ ඪ४ଐੑ ࡾํൺֱԋࢉࢠ ࢦࣔ෇͖ ॳظԽ consteval ίϧʔνϯ Ϟδϡʔϧ ίϯηϓτ ܧঝ ڞ༻ମ if constexpr ৞ΈࠐΈࣜ generic lambda ໭Γ஋ܕਪ࿦ nullptr variadic templates

Slide 20

Slide 20 text

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++23ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ constexpr lambda ඪ४ଐੑ ࡾํൺֱԋࢉࢠ consteval ίϧʔνϯ Ϟδϡʔϧ ίϯηϓτ ଟ࣍ݩ ఴࣈԋࢉࢠ bfloat16 deducing this decay copy if consteval ςϯϓϨʔτ Ҿ਺ਪ࿦ ͭΒ͍ ܧঝ ڞ༻ମ ࢦࣔ෇͖ ॳظԽ if constexpr ৞ΈࠐΈࣜ generic lambda ໭Γ஋ܕਪ࿦ variadic templates nullptr ߏ଄Խଋറ

Slide 21

Slide 21 text

GPU༻ͷߴڃݴޠ͕΋͏গ͠ ݱ୅ͷC++ʹ͍ۙػೳΛඋ͑ͯཉ͍͠

Slide 22

Slide 22 text

OpenGL 4.5 DirectX 11 DirectX 12 OpenGL 4.6 OpenCL CUDA Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ

Slide 23

Slide 23 text

OpenGL 4.5 DirectX 11 DirectX 12 OpenGL 4.6 OpenCL CUDA Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ void main() { vec3 normal = normalize( inpu t_normal.xyz ); vec3 pos = input_position. xyz; vec3 N = normal; ߴڃݴޠ a b × + 3 AST ࣮ߦՄೳόΠφϦ γϦΞϥΠζ ࣈ۟ղੳ ߏจղੳ SPIR-V ߴڃݴޠͷASTΛ όΠφϦܗࣜͰγϦΞϥΠζͨ͠෺ ߴڃݴޠΑΓߴ଎ʹύʔεͰ͖Δ

Slide 24

Slide 24 text

OpenGL 4.5 DirectX 11 DirectX 12 OpenGL 4.6 OpenCL CUDA Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ Ͱ΋GLSLͰͳͯ͘΋SPIR-VʹͰ͖ΔߴڃݴޠͳΒ Vulkanʹ৯΂ͤ͞Δࣄ͕Ͱ͖Δ GLSLΛSPIR-Vʹ͢ΔϦϑΝϨϯείϯύΠϥ glslc͕ଘࡏ͢Δ

Slide 25

Slide 25 text

DXC https://github.com/microsoft/DirectXShaderCompiler HLSLΛDXILʹ͢Δ ίϯύΠϥ HLSLΛSPIR-Vʹ ͢Δࣄ΋Ͱ͖Δ clang-3.7Λforkͯ͠ ࡞ΒΕ͍ͯΔ

Slide 26

Slide 26 text

https://devblogs.microsoft.com/directx/announcing-hlsl-2021/ HLSL 2021 2021೥ʹϦϦʔε͞Εͨ HLSLͷ৽͍͠όʔδϣϯ DXC 1.8.2405Ҏ߱Ͱ ίϯύΠϧͰ͖Δ

Slide 27

Slide 27 text

HLSL 2021ͷมߋ఺ • C++ͷΑ͏ͳςϯϓϨʔτؔ਺ͱςϯϓϨʔτΫϥε • C++ͷΑ͏ͳԋࢉࢠΦʔόʔϩʔυ • C++ͷΑ͏ͳϏοτϑΟʔϧυ • Ϣʔβఆٛܕͷ҉໧ͷΩϟετͷৼΔ෣͍ΛC++ʹ߹Θͤͨ • ࿦ཧԋࢉͷ୹བྷධՁͷৼΔ෣͍ΛC++ʹ߹Θͤͨ • forϧʔϓ಺Ͱએݴ͞Εͨม਺ͷείʔϓΛC++ʹ߹Θͤͨ HLSL͕C++ʹد͖ͤͯͨ

Slide 28

Slide 28 text

কདྷͷHLSLʹ޲͚ͯఏҊ͞Ε͍ͯΔػೳ • C++ͷΑ͏ͳඪ४ଐੑ • C++ͷΑ͏ͳnumeric_limits • C++ͷΑ͏ͳڞ༻ମ • C++ͷΑ͏ͳॳظԽࢠϦετ • C++ͷΑ͏ͳࢀরܕ • C++ͷΑ͏ͳconstϝϯόؔ਺ • C++ͷΑ͏ͳඇϝϯόԋࢉࢠΦʔόʔϩʔυ • C++ͷΑ͏ͳautoɺdecltypeɺconstexprɺ scoped enumɺvariadic templatesɺϢʔβ ఆٛϦςϥϧɺϥϜμࣜɺstatic assertɺ range-based for https://github.com/microsoft/hlsl-specs/tree/main/proposals

Slide 29

Slide 29 text

RWStructuredBuffer a; RWStructuredBuffer b; template T add( T x, T y ) { return x + y; } [numthreads(256, 1, 1)] void main( uint2 id : SV_DispatchThreadID ) { uint index = id.x; a[index] = add(b[index], b[index + 1]); } GPUͷϝϞϦ্ʹ֬อͨ͠ όοϑΝ͕͜͜ʹϦϯΫ͞ΕΔ ςϯϓϨʔτؔ਺ 256x1x1εϨουΛ local workgroupͱ͢Δ T=floatͰςϯϓϨʔτؔ਺Λݺͼग़͠

Slide 30

Slide 30 text

function(add_shader TARGET SHADER) set(current-shader-path ${CMAKE_CURRENT_SOURCE_DIR}/${SHADER}) set(current-output-path ${CMAKE_CURRENT_BINARY_DIR}/${SHADER}.spv) get_filename_component(current-output-dir ${current-output-path} DIRECTORY) file(MAKE_DIRECTORY ${current-output-dir}) if( "${SHADER}" MATCHES "\.comp\.hlsl$" ) string( REPLACE ".comp.hlsl.spv" ".comp.spv" current-output-path "${current-output-path}" ) add_custom_command( OUTPUT ${current-output-path} COMMAND ${DXC} -spirv -fspv-target-env=vulkan1.3 ${current-shader-path} -T cs_6_8 -Fo ${current-output-path} DEPENDS ${current-shader-path} IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) elseif( "${SHADER}" MATCHES "\.vert\.hlsl$" ) ϑΝΠϧ໊͕.hlslͰऴΘ͍ͬͯͨΒ glslcͰ͸ͳ͘dxcͰίϯύΠϧ͢Δ ϧʔϧΛCMakeʹ௥Ճ DXCʹVulkan 1.3༻ͷSPIR-VΛ ग़ྗ͢ΔΑ͏ཁٻ

Slide 31

Slide 31 text

$ spirv-reflect ./src/example/hlsl/test.comp.spv generator : Google spiregg source lang : HLSL source lang ver : 680 source file : entry point : main (stage=CS) local size : (256, 1, 1) Input variables: 1 0: spirv id : 2 location : (built-in) GlobalInvocationId type : uint3 semantic : name : qualifier : Descriptor bindings: 2 Binding 0.0 spirv id : 3 set : 0 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ

Slide 32

Slide 32 text

Descriptor bindings: 2 Binding 0.0 spirv id : 3 set : 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : a (type.RWStructuredBuffer.float) // size = 0, padded size = 0 struct type.RWStructuredBuffer.float { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } a; Binding 0.1 spirv id : 4 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : b (type.RWStructuredBuffer.float) // size = 0, padded size = 0 γΣʔμ͸"a"ͱ͍͏໊લͷ֎෦ͷfloatܕͷ഑ྻΛಡΈॻ͖͢Δ ࣮ߦ͢Δࡍʹ͸GPU্ʹϝϞϦΛ֬อͯ͠ "a"ʹ݁ͼ͚ͭΔඞཁ͕͋Δ RWStructuredBuffer a; RWStructuredBuffer b;

Slide 33

Slide 33 text

} a; Binding 0.1 spirv id : 4 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : b (type.RWStructuredBuffer.float) // size = 0, padded size = 0 struct type.RWStructuredBuffer.float { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } b; γΣʔμ͸"b"ͱ͍͏໊લͷ֎෦ͷfloatܕͷ഑ྻΛಡΈॻ͖͢Δ ࣮ߦ͢Δࡍʹ͸GPU্ʹϝϞϦΛ֬อͯ͠ "b"ʹ݁ͼ͚ͭΔඞཁ͕͋Δ RWStructuredBuffer a; RWStructuredBuffer b;

Slide 34

Slide 34 text

const auto input_data = res.allocator->create_mappable_buffer( sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); const auto output_data = res.allocator->create_mappable_buffer( sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; GPUͷϝϞϦʹόοϑΝinput_dataͱoutput_dataΛ֬อ

Slide 35

Slide 35 text

sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); const auto output_data = res.allocator->create_mappable_buffer( sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); ໊લ͕"a"ͷσεΫϦϓλʹoutput_dataΛ݁ͼ͚ͭΔ ໊લ͕"b"ͷσεΫϦϓλʹinput_dataΛ݁ͼ͚ͭΔ ઌ΄ͲͷHLSL͔Β࡞ͬͨSPIR-VΛ࣮ߦ͢Δҝͷ ίϯϐϡʔτύΠϓϥΠϯΛ࡞Δ

Slide 36

Slide 36 text

); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); comp( rec, 0, 256, 1, 1 ); CPU্ͷεςʔδϯάόοϑΝʹinput_dataʹॻ͘஋Λฒ΂Δ

Slide 37

Slide 37 text

mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now( CPU্ͷεςʔδϯάόοϑΝͷ಺༰ΛGPU্ͷinput_dataʹసૹ ίϯϐϡʔτύΠϓϥΠϯΛ256εϨουͰ࣮ߦ GPU্ͷoutput_dataͷ಺༰ΛCPU্ͷεςʔδϯάόοϑΝʹసૹ

Slide 38

Slide 38 text

$ ./src/example/hlsl/hlsl 1 3 5 7 ... 507 509 511 0 0 0 0 RWStructuredBuffer a; RWStructuredBuffer b; template T add( T x, T y ) { return x + y; } [numthreads(256, 1, 1)] void main( uint2 id : SV_DispatchThreadID ) { uint index = id.x; a[index] = add(b[index], b[index + 1]); } HLSL 2021Λ࢖͏ͱGLSLΑΓগ͚ͩ͠ C++ͱڞ௨ͷίʔυΛॻ͖΍͘͢ͳΔ

Slide 39

Slide 39 text

https://github.com/google/clspv clspv OpenCL CΛ SPIR-Vʹ ίϯύΠϧ͢Δ ຊՈͷclangʹ࣮૷ͨ͠ OpenCL Cͷύʔαͱ ຊՈͷllvmʹ࣮૷ͨ͠ SPIR-VόοΫΤϯυΛ ૊Έ߹Θͤͯಈ͘

Slide 40

Slide 40 text

https://www.khronos.org/opencl/assets/CXX_for_OpenCL.html C++ for OpenCL C++ for OpenCL 1.0 OpenCL C 2.0ʹC++03ͷػೳͷ͏ͪ ༰қʹ௥ՃͰ͖Δ෺Λ௥Ճͨ͠෺ C++ for OpenCL 2021 OpenCL C 3.0ʹC++17ͷػೳͷ͏ͪ ༰қʹ௥ՃͰ͖Δ෺Λ௥Ճͨ͠෺

Slide 41

Slide 41 text

__kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } Generic Lambda [ C++14 ]

Slide 42

Slide 42 text

IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) elseif( "${SHADER}" MATCHES "\.cl$" ) string( REPLACE ".cl.spv" ".comp.spv" current-output-path "${current-output-path}" ) add_custom_command( OUTPUT ${current-output-path} COMMAND ${CLSPV} --cl-std=CLC++2021 -inline-entry-points -- spv-version=1.6 ${current-shader-path} -o ${current-output-path} DEPENDS ${current-shader-path} IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) else() add_custom_command( OUTPUT ${current-output-path} COMMAND ${GLSLC} -o ${current-output-path} ${current-shader- ϑΝΠϧ໊͕.clͰऴΘ͍ͬͯͨΒ glslcͰ͸ͳ͘clspvͰίϯύΠϧ͢Δ ϧʔϧΛCMakeʹ௥Ճ clspvʹ C++ for OpenCL 2021Λ Vulkan 1.3༻ͷSPIR-Vʹ ͢ΔΑ͏ཁٻ

Slide 43

Slide 43 text

$ spirv-reflect ./src/example/cl/test.comp.spv generator : ??? source lang : OpenCL_CPP source lang ver : 202100 source file : entry point : test (stage=CS) local size : (0, 0, 0) Input variables: 1 0: spirv id : 7 location : (built-in) GlobalInvocationId type : uint3 semantic : name : qualifier : Push constant blocks: 1 0: spirv id : 5 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ

Slide 44

Slide 44 text

Push constant blocks: 1 0: spirv id : 5 name : // size = 16, padded size = 16 struct { uint3 ; // abs offset = 0, rel offset = 0, size = 12, padded size = 16 } ; Descriptor bindings: 2 Binding 0.0 spirv id : 18 set : 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : // size = 0, padded size = 0 struct { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 push constantͰූ߸ͳ͠੔਺3ཁૉͷϕΫτϧΛ౉͢ඞཁ͕͋Δ OpenCLͷίʔυଆͰget_global_idΛݺͿͱ ίϯϐϡʔτύΠϓϥΠϯ͔Β౉ͬͯ͘ΔεϨουIDʹpush constantͷ஋Λ ଍ͨ͠஋͕ฦͬͯ͘Δ Ұ౓ʹ࣮ߦͰ͖ͳ͍͘Β͍େྔͷεϨουΛ ෳ਺ճʹ෼ׂ࣮ͯ͠ߦ͢Δͷʹศར

Slide 45

Slide 45 text

Descriptor bindings: 2 Binding 0.0 spirv id : 18 set : 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : // size = 0, padded size = 0 struct { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } ; Binding 0.1 spirv id : 19 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : // size = 0, padded size = 0 __kernel void test(__global float *a, __global float *b) { __kernelͳؔ਺ͷҾ਺͕σεΫϦϓλʹͳΔ 0൪໨ͷҾ਺͕binding=0ʹͳΔ ม਺໊ΛϦϑϨΫγϣϯ৘ใʹೖΕͯ͘Εͳ͍ͷͰ binding൪߸Ͱ݁ͼ͚ͭΔඞཁ͕͋Δ

Slide 46

Slide 46 text

} ; Binding 0.1 spirv id : 19 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : // size = 0, padded size = 0 struct { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } ; __kernel void test(__global float *a, __global float *b) { 1൪໨ͷҾ਺΋ಉ༷

Slide 47

Slide 47 text

$ ./src/dump_spirv -i src/example/cl/test.comp.spv { "descriptor_binding_count": 2, "descriptor_bindings": [ ... "spec_constants": [ { "constant_id": 0, "spirv_id": 8 }, { "constant_id": 1, "spirv_id": 9 }, { "constant_id": 2, "spirv_id": 10 } ], "spirv_execution_model": "GLCompute" } ಛघԽఆ਺͕3ؚͭ·Ε͍ͯΔ ಛघԽఆ਺ͱ͸: γΣʔμͷίϯύΠϧ࣌ʹ஋ΛܾΊΒΕΔఆ਺ ίʔυʹఆ਺͕௚ॻ͖͞Ε͍ͯΔͷͱ ಉ౳ͷ࠷దԽ͕ߦΘΕΔ OpenCLͰ͸LocalWorkgroupͷαΠζΛ ίʔυ্ʹॻ͔ͳ͍ͷͰ ಛघԽఆ਺Ͱࢦఆ͢Δ

Slide 48

Slide 48 text

gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { 0, output_data } ) .add_resource( { 1, input_data } ) .add_spec( 0, 256 ) .add_spec( 1, 1 ) .add_spec( 2, 1 ) .set_dim( 256, 1, 1 ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); binding=0ͷσεΫϦϓλʹoutput_dataΛ݁ͼ͚ͭΔ binding=1ͷσεΫϦϓλʹinput_dataΛ݁ͼ͚ͭΔ ಛघԽఆ਺ʹ(256,1,1)Ληοτ͢Δ ϦϑϨΫγϣϯ͔ΒLocalWorkgroupͷαΠζ͕औΕͳ͍ͷͰ खಈͰ(256,1,1)Ληοτ͢Δ

Slide 49

Slide 49 text

} auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); glm::ivec3 offset( 0, 0, 0 ); rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( glm::ivec3 ), &offset ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); push constantʹεϨουIDͷΦϑηοτ(0,0,0)Ληοτ͢Δ

Slide 50

Slide 50 text

auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); glm::ivec3 offset( 0, 0, 0 ); rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( glm::ivec3 ), &offset ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now(); ࣮ߦ

Slide 51

Slide 51 text

$ ./src/example/cl/cl 1 3 5 7 ... 507 509 511 0 0 0 0 __kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } C++ for OpenCL 2021Λ࢖͏ͱ HLSL 2021ΑΓଟ͘ͷC++ͷػೳ͕࢖͑Δ ͨͩ͠ίϯϐϡʔτύΠϓϥΠϯઐ༻

Slide 52

Slide 52 text

https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_NV_cuda_kernel_launch.html VK_NV_cuda_kernel_launch ͜ͷ֦ு͕αϙʔτ͞Ε͍ͯΔ৔߹ nvcc -ptx Ͱ࡞ͬͨPTX͔Β VkCudaModuleNVΛ࡞Γ Vulkan͔Β CUDAͷ__global__ͳؔ਺Λ ࣮ߦͰ͖Δ NVIDIA͔͠αϙʔτ͍ͯ͠ͳ͍

Slide 53

Slide 53 text

__global__ void test( float *a, float *b ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; const auto add = []( auto x, auto y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } __kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } CUDA ੍ݶ෇͖C++17 C++ for OpenCL 2021 ੍ݶ෇͖C++17

Slide 54

Slide 54 text

ಉ͡C++17ͷαϒηοτͰ΋੍ݶ͕ҧ͏ CUDA C++ for OpenCL 2021 ྫ֎ Ծ૝ؔ਺ ؔ਺ ϙΠϯλ σʔλ ϙΠϯλ ࢀর =࢖͑Δ

Slide 55

Slide 55 text

͜ͷϢχοτ1ݸʹ͖ͭϓϩάϥϜΧ΢ϯλ͕1ݸ

Slide 56

Slide 56 text

͜ͷϢχοτ1ݸʹ͖ͭϓϩάϥϜΧ΢ϯλ͕1ݸ __kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); if( index % 2 == 1 ) { a[ index ] = b[ index ]; } else { a[ index ] = 0.0f; } } ෼ذͨ͘͠ͳͬͨΒͲ͏͢Δ? μΠόʔδΣϯε

Slide 57

Slide 57 text

__kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); if( index % 2 == 1 ) { a[ index ] = b[ index ]; } else { a[ index ] = 0.0f; } } ⋯ index ⋯ index % 2 a[ index ] = b[ index ] B<> B<> B<> ⋯ a[ index ] = 0.0f ⋯ B<> B<> B<> ෼ذ৚݅ͰϚεΫΛ࡞Δ ྆ํͷϒϩοΫΛ࣮ߦ͢Δ ϚεΫʹج͍ͮͯͲͪΒͷ݁ՌΛ࠾༻͢Δ͔ܾΊΔ a[ index ]

Slide 58

Slide 58 text

ؒ઀෼ذͩͬͨΒ? __device__ float func1() { return 3.0f; } __device__ float func2() { return 5.0f; } __global__ void find( float *input_buffer, float *output_buffer ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; const auto func = ( index % 2 ) ? &func1 : func2; output_buffer[ index ] = func(); }

Slide 59

Slide 59 text

__device__ float func1() { return 3.0f; } __device__ float func2() { return 5.0f; } __global__ void find( float *input_buffer, float *output_buffer ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; const auto func = ( index % 2 ) ? &func1 : func2; output_buffer[ index ] = func(); } GVOD GVOD GVOD GVOD GVOD GVOD GVOD ⋯ func = ඈͼઌͷΞυϨε͔ΒϚεΫΛ࡞Γɺશͯͷؔ਺ΛݺͿ func1()ΛݺͿ func2()ΛݺͿ ⋯ ⋯ ⋯ ϚεΫͰ݁ՌΛબͿ

Slide 60

Slide 60 text

https://patents.google.com/patent/US7761697 ถࠃಛڐ 7761697B1 SIMDͰؒ઀෼ذ͢Δํ๏ ಛڐݖऀ NVIDIA 2026೥11݄21೔·Ͱ

Slide 61

Slide 61 text

Ծ૝ؔ਺͸vtableͷΞυϨεΛ࢖ͬͨ ؒ઀෼ذͳͷͰαϙʔτͰ͖ͳ͍

Slide 62

Slide 62 text

https://shady-gang.github.io/vcc/ Vulkan Clang Compiler C++ΛSPIR-Vʹ ίϯύΠϧ͢ΔίϯύΠϥ ྫ֎ͱ࣮ߦ࣌ܕ৘ใΛআ͘ શͯͷݴޠػೳΛαϙʔτ gotoΛؚΉ ͋ΒΏΔίʔυϑϩʔʹରԠ

Slide 63

Slide 63 text

#include using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; } #define vertex_shader __attribute__((annotate("shady::entry_point::Vertex"))) #define fragment_shader __attribute__((annotate("shady::entry_point::Fragment"))) #define compute_shader __attribute__((annotate("shady::entry_point::Compute"))) #define location(i) __attribute__((annotate("shady::location::"#i))) #define descriptor_set(i) __attribute__((annotate("shady::descriptor_set::"#i))) #define descriptor_binding(i) __attribute__((annotate("shady::descriptor_binding::"#i))) #define local_size(x, y, z) __attribute__((annotate("shady::workgroup_size::"#x"::"#y"::"#z))) ϚΫϩ͕ల։͞Εͯ __attribute__ʹͳΓ ίϯύΠϥʹSPIR-VΛుͨ͘Ίʹ ඞཁͳ৘ใ͕ૹΒΕΔ

Slide 64

Slide 64 text

$ spirv-reflect test.comp.spv generator : ??? source lang : Unknown source lang ver : 0 source file : entry point : main (stage=CS) local size : (256, 1, 1) Input variables: 1 0: spirv id : 25 location : (built-in) GlobalInvocationId type : uint3 semantic : name : _ZN3vcc21gl_GlobalInvocationIDE qualifier : Push constant blocks: 1 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ

Slide 65

Slide 65 text

Push constant blocks: 1 0: spirv id : 5 name : __main_args // size = 16, padded size = 16 struct { uint ; // abs offset = 0, rel offset = 0, size = 8, padded size = 8 uint ; // abs offset = 8, rel offset = 8, size = 8, padded size = 8 } __main_args; mainؔ਺ͷҾ਺͕push constantʹͳ͍ͬͯΔ 64bitͷϙΠϯλ2ͭͳͷͰ όοϑΝͷGPUଆ͔ΒݟͨΞυϨεΛͱͬͯ push constantʹಥͬࠐΊ͹ྑ͍

Slide 66

Slide 66 text

gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } const auto input_data_addr = input_data->get_buffer()->get_address(); const auto output_data_addr = output_data->get_buffer()->get_address(); auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); σεΫϦϓλͳ͠ͰίϯϐϡʔτύΠϓϥΠϯΛ࡞Δ

Slide 67

Slide 67 text

gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } const auto input_data_addr = input_data->get_buffer()->get_address(); const auto output_data_addr = output_data->get_buffer()->get_address(); auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); ೖྗ஋༻ͷόοϑΝͱग़ྗ஋༻ͷόοϑΝͷ GPU͔Βݟͨઌ಄ΞυϨεΛऔಘ͢Δ

Slide 68

Slide 68 text

{ auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); std::array< vk::DeviceAddress, 2u > addrs{ *output_data_addr, *input_data_addr }; rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( vk::DeviceAddress ) * addrs.size(), addrs.data() ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } push constantʹόοϑΝͷΞυϨεΛॻ͘

Slide 69

Slide 69 text

rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); std::array< vk::DeviceAddress, 2u > addrs{ *output_data_addr, *input_data_addr }; rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( vk::DeviceAddress ) * addrs.size(), addrs.data() ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now(); ࣮ߦ

Slide 70

Slide 70 text

$ ./src/example/vcc/vcc 1 3 5 7 ... 507 509 511 0 0 0 0 #include using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; }

Slide 71

Slide 71 text

vcc͸C++ΛLLVM IRʹͨ͠ޙCPSม׵Λߦ͏ int f( int a ) { a += 1; if( a == 3 ) { a = 0; } else { a = g( a ); } return a; } f0 f1 f2 f3 ؔ਺Λbasic blockຖʹผͷؔ਺ʹ͢Δ ֤ؔ਺͸࣍ʹ࣮ߦ͍ͨؔ͠਺Λ ΩϡʔʹੵΉ ϩʔΧϧม਺ ͸ڞ༗ϝϞϦͰ࣍ͷؔ਺ʹ౉͞ΕΔ ͸ ͱ ΛΩϡʔʹੵΉ Ωϡʔʹੵ·Εͨؔ਺͸શ෦࣮ߦ͢Δ f0 f1 f2 ͱ ͸ڞʹ ΛΩϡʔʹੵΉͷͰ ͜͜Ͱ྆ऀ͔Βདྷͨม਺Λ౷߹͢Δ f1 f2 f3

Slide 72

Slide 72 text

vcc͸C++ΛLLVM IRʹͨ͠ޙCPSม׵Λߦ͏ int f( int a ) { a += 1; if( a == 3 ) { a = 0; } else { a = g( a ); } return a; } f0 f1 f2 f3 ؔ਺Λbasic blockຖʹผͷؔ਺ʹ͢Δ ֤ؔ਺͸࣍ʹ࣮ߦ͍ͨؔ͠਺Λ ΩϡʔʹੵΉ ϩʔΧϧม਺ ͸ڞ༗ϝϞϦͰ࣍ͷؔ਺ʹ౉͞ΕΔ ͸ ͱ ΛΩϡʔʹੵΉ Ωϡʔʹੵ·Εͨؔ਺͸શ෦࣮ߦ͢Δ f0 f1 f2 ͱ ͸ڞʹ ΛΩϡʔʹੵΉͷͰ ͜͜Ͱ྆ऀ͔Βདྷͨม਺Λ݁߹͢Δ f1 f2 f3 ར఺1: NVIDIAͷಛڐΛճආͯؒ͠઀෼ذ͕Ͱ͖Δ ར఺2: ͲΜͳʹܹ͘͠μΠόʔδΣϯε͕ੜͯ͡΋ ੑೳΛ٘ਜ਼ʹ࣮ͯ͠ߦͰ͖Δ

Slide 73

Slide 73 text

#include using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { const auto index = gl_GlobalInvocationID.x; const auto add = []< typename T, typename U >( T x, U y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } Familiar template syntax for generic lambdas [ C++20 ] C++ΛLLVM IRʹ͢Δॴ·Ͱ͸ͨͩͷclangͳͷͰ ͜ΜͳίʔυΛॻ͍ͯ΋ίϯύΠϧ͸௨Δ

Slide 74

Slide 74 text

$ ./src/example/vcc/vcc terminate called after throwing an instance of 'vk::DeviceLostError' what(): wait_for_executed failed.: ErrorDeviceLost தࢭ (ίΞμϯϓ) #include using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { const auto index = gl_GlobalInvocationID.x; const auto add = []< typename T, typename U >( T x, U y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } ͳ͔ͥ ແݶϧʔϓʹ ؕΔ

Slide 75

Slide 75 text

#include using namespace vcc; descriptor_set(0) descriptor_binding(1) uniform_constant extern float a[]; descriptor_set(0) descriptor_binding(2) uniform_constant extern float b[]; extern "C" compute_shader local_size(256, 1, 1) void main() { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; } σεΫϦϓλΛ࢖͏ͱ

Slide 76

Slide 76 text

$ spirv-reflect test.comp.spv Segmentation fault (ίΞμϯϓ) ܕ৘ใ͕յΕ͍ͯͯಡΊͳ͍ ϦϑϨΫγϣϯ͕ੜ੒͞ΕΔ ख๏͸໘ന͍͕ ݱঢ়ͷ࣮૷͸͙͢ʹ࢖͑Δ඼࣭Ͱ͸ͳ͍

Slide 77

Slide 77 text

·ͱΊ CPUͱGPUͰಉ͡ίʔυΛ࣮ߦ͍ͨ࣌͠GPUଆͰ࢖͏ݴޠ GLSL HLSL 2021 C++ for OpenCL 2021 CUDA C++ GLSLΑΓϚγ άϥϑΟΫεύΠϓϥΠϯ΋OK ϕϯμʔඇґଘͰίϯϐϡʔτύΠϓϥΠϯ͚͕ͩ ಈ͚͹ྑ͍ͳΒ͜Ε NVIDIAͷGPUͰίϯϐϡʔτύΠϓϥΠϯ͚͕ͩ ಈ͚͹ྑ͍ͳΒ͜Ε ͭΒ͍ ਓྨͷເɺάϥϑΟΫεύΠϓϥΠϯ΋OK ͔͠͠ݱঢ়ίϯύΠϥͷಈ͖͕ո͍͠

Slide 78

Slide 78 text

https://fadis.booth.pm/ ٕज़ॻయͰຊग़ͯ͠·͢ /&8