Upgrade to Pro — share decks privately, control downloads, hide ads and more …

極めて速いストレージとの付き合い方

Fadis
September 21, 2018

 極めて速いストレージとの付き合い方

いまどきのツンデレSSDにデレてもらうためにはどうすれば良いかを解説します
これは2018年9月22日に行われた カーネル/VM探検隊@関西 9回目 での発表資料です
サンプルコード: https://github.com/Fadis/spdk_sample

Fadis

September 21, 2018
Tweet

More Decks by Fadis

Other Decks in Programming

Transcript

  1. #include <iostream> #include <string> #include <chrono> #include <fcntl.h> #include <unistd.h>

    #include <sys/stat.h> #include <sys/ioctl.h> #include <linux/fs.h> #include <boost/program_options.hpp> struct io_failure {}; struct file_t; struct context_t { context_t() : block_size( 512 ) {} size_t block_size; std::vector< file_t > files; }; struct file_t { file_t( context_t *c, int f_, size_t size_, size_t offset_, bool
  2. for( size_t i = 0; i != count; ++i )

    ctx.files.emplace_back( &ctx, fd, bs, random ? ( rand() % available ) * bs : i * bs, zero ); const auto begin = std::chrono::high_resolution_clock::now(); if( write ) { for( auto &file: ctx.files ) if( pwrite( file.fd, reinterpret_cast< void* >( file.buffer.get() ), file.size * ctx.block_size, file.offset * ctx.block_size ) < 0 ) throw io_failure(); } else { for( auto &file: ctx.files ) if( pread( file.fd, reinterpret_cast< void* >( file.buffer.get() ), file.size * ctx.block_size, file.offset * ctx.block_size ) < 0 ) throw io_failure(); } const auto end = std::chrono::high_resolution_clock::now(); const size_t elapsed = std::chrono::duration_cast< ブロックデバイスのランダムな位置にreadまたはwrite 同期I/Oで繰り返し行う
  3. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア メモリ DDR4-2400

    8GBx2 ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd 256MBの読み書きに要する時間を計測する ブロックサイズを512バイトから256MBまで変化させる 各ブロックの読み書きを行う位置はランダム O_DIRECT|O_SYNCを使う 同じブロックサイズについて10回の計測を行う
  4. #include <thread> #include <iostream> #include <string> #include <chrono> #include <new>

    #include <fcntl.h> #include <unistd.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <linux/fs.h> #include <libaio.h> #include <boost/program_options.hpp> struct io_failure {}; struct file_t; struct context_t { context_t( size_t bulk_ ) : bulk( bulk_ ), block_size( 512 ), completed_count( 0 ), completed( false ) { io_queue_init( 1, &io_context ); Linux AIOを使って前のI/Oの完了を待たずに次のI/O要求を投げる Linux Asynchronous I/O
  5. for( auto &file: ctx.files ) { while( 1 ) {

    auto r = io_submit( ctx.io_context, 1, &file.pcb ); if( r >= 0 ) break; if( r != -EAGAIN && r < 0 ) throw io_failure(); } } } poller.join(); if( flush ) fdatasync( fd ); const auto end = std::chrono::high_resolution_clock::now(); const size_t elapsed = std::chrono::duration_cast< std::chrono::nanoseconds >( end - begin ).count(); const size_t transfered = bs * count * ctx.block_size; close( fd ); std::cout << bs * ctx.block_size << "\t" << transfered << "\t" << elapsed << "\t" << ( double( transfered ) / 1000 / 1000 ) / ( double( elapsed ) / 1000 / 1000 / 1000 ) << "MB/s " << 要求側スレッドはio_submitでカーネルのキューにI/O要求を全て積んで 結果を受け取るスレッドの完了を待つ カーネルのキューが一杯でI/O要求を積めないときは ポーリングで再試行する
  6. std::thread poller( [&ctx]() { std::vector< io_event > events( ctx.bulk );

    while( !ctx.completed.load() ) { auto ret = io_getevents( ctx.io_context, 0, events.size(), events.data(), nullptr ); if( ret > 0 ) { auto events_end = std::next( events.begin(), ret ); std::for_each( events.begin(), events_end, []( const auto &event ) { auto file = reinterpret_cast< file_t* >( reinterpret_cast< void* >( event.data ) ); if( event.res >= 0 ) { if( int( event.res ) < 0 ) throw io_failure(); if( ++file->context->completed_count == file->context- >files.size() ) { file->context->completed = true; } 結果を受け取るスレッドは io_geteventsをポーリングして完了したI/Oの情報を取得する 受け取った数が投げたI/O要求の数に達したらスレッドを終了させる
  7. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア メモリ DDR4-2400

    8GBx2 ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd 256MBの読み書きに要する時間を計測する ブロックサイズを512バイトから256MBまで変化させる 各ブロックの読み書きを行う位置はランダム 同期I/OはO_DIRECT|O_SYNC、非同期I/OはO_DIRECTで最後に1回fsync 同じブロックサイズについて10回の計測を行う
  8. DPDK パケットきた パケットきた 普通のネットワークドライバ 割り込み パケットある ある ある ある あった

    PMD 割り込みのオーバーヘッドが大きいので CPUをビジーループさせてデバイスを監視する
  9. ドライバ NVMeドライバ AIOドライバ ramdiskドライバ VirtIOドライバ pmemドライバ bdev blobstore DPDK EAL

    blobfs SCSI NVMe iSCSI ターゲット vhost-scsi ターゲット NVMe-oF ターゲット SPDKを構成するコンポーネント … DPDK uioドライバ アプリケーション
  10. ドライバ NVMeドライバ AIOドライバ ramdiskドライバ VirtIOドライバ pmemドライバ bdev blobstore DPDK EAL

    blobfs SCSI NVMe iSCSI ターゲット vhost-scsi ターゲット NVMe-oF ターゲット … DPDK uioドライバ アプリケーション NVMe等のストレージをカーネルの管理から外す為に DPDKのuioドライバを使う ReactorやHugepageを使うメモリアロケータは DPDKの実装をそのまま使う 㱤SPDKを動かすにはDPDKが必要
  11. ドライバ NVMeドライバ AIOドライバ ramdiskドライバ VirtIOドライバ pmemドライバ bdev blobstore DPDK EAL

    blobfs SCSI NVMe iSCSI ターゲット vhost-scsi ターゲット NVMe-oF ターゲット … DPDK uioドライバ アプリケーション bdev ストレージデバイスの種類による操作方法の違いを吸収する
  12. struct spdk_app_opts opts = {}; SPDK_NOTICELOG("entry\n"); spdk_app_opts_init(&opts); opts.name = "bdev";

    const std::string config_file = opt_var[ "config" ].as< std::string >(); opts.config_file = config_file.c_str(); context_t ctx( opt_var[ "concurrency" ].as< size_t >(), opt_var[ "block_size" ].as< size_t >(), opt_var[ "count" ].as< size_t >(), opt_var[ "write" ].as< bool >(), opt_var[ "random" ].as< bool >(), opt_var[ "zero" ].as< bool >(), opt_var[ "flush" ].as< bool >() ); spdk_app_start(&opts, run, &ctx, nullptr ); } spdk_app_startで DPDK EALの初期化 スレッドプールの開始 利用可能なデバイス探し が行われる この関数はspdk_app_stopされるまで返らない
  13. void run( void *ctx_, void * ) { auto ctx

    = reinterpret_cast< context_t* >( ctx_ ); ctx->bdev = spdk_bdev_get_by_name("Nvme0n1"); if( !ctx->bdev ) ABORT( "σόΠε͕ݟ͔ͭΒͳ͍" ); ctx->page_size = spdk_bdev_get_block_size( ctx->bdev ); const auto max_page_count = spdk_bdev_get_num_blocks( ctx- >bdev ); const size_t count = std::min( ctx->count, max_page_count ); const size_t max_concurrency = ( ctx->max_concurrency ? std::min( ctx->max_concurrency, count ) : count ); const size_t buf_size = ctx->page_size * ctx->block_size * ( ctx->write ? 1u : max_concurrency ); ctx->buffer.reset( reinterpret_cast< uint8_t* >( spdk_dma_zmalloc( buf_size, ctx- >page_size, nullptr ) ), []( uint8_t *p ) { spdk_dma_free( reinterpret_cast< void* >( p ) ); } ); spdk_bdev_get_by_nameͰσόΠεΛऔಘ͢Δ
  14. ctx->page_size = spdk_bdev_get_block_size( ctx->bdev ); const auto max_page_count = spdk_bdev_get_num_blocks(

    ctx- >bdev ); const size_t count = std::min( ctx->count, max_page_count ); const size_t max_concurrency = ( ctx->max_concurrency ? std::min( ctx->max_concurrency, count ) : count ); const size_t buf_size = ctx->page_size * ctx->block_size * ( ctx->write ? 1u : max_concurrency ); ctx->buffer.reset( reinterpret_cast< uint8_t* >( spdk_dma_zmalloc( buf_size, ctx- >page_size, nullptr ) ), []( uint8_t *p ) { spdk_dma_free( reinterpret_cast< void* >( p ) ); } ); if( !ctx->buffer ) ABORT( "όοϑΝΛ֬อ͢Δࣄ͕Ͱ͖ͳ͍" ) std::fill( ctx->buffer.get(), std::next( ctx->buffer.get(), buf_size ), ctx->zero ? 0 : 1 ); const size_t cores = ctx->write ? 1 : rte_lcore_count(); spdk_dma_zmallocͰόοϑΝΛ֬อ͢Δ
  15. const size_t cores = ctx->write ? 1 : rte_lcore_count(); ctx->channels.resize(

    cores ); ctx->files.reserve( count ); if( spdk_bdev_open( ctx->bdev, true, nullptr, nullptr, &ctx- >desc ) < 0 ) ABORT( "σόΠεΛ։͘͜ͱ͕Ͱ͖ͳ͍" ); for( size_t i = 0; i != count; ++i ) { ctx->files.emplace_back( ctx, i % cores, ctx->block_size, ctx- >random ? ( ( rand() % max_page_count ) / ctx->block_size ) * ctx- >block_size : i * ctx->block_size ); ++ctx->global_left_count; } ctx->head = max_concurrency; for( size_t i = 0; i != cores; ++i ) { spdk_event *event = spdk_event_allocate( i, ctx->write ? write_file : read_file, ctx_, nullptr ); spdk_event_call( event ); } } શͯͷϓϩηοα্Ͱ
  16. void write_file( void *context_, void* ) { auto context =

    reinterpret_cast< context_t* >( context_ ); const size_t lcore = rte_lcore_id(); const size_t cores = 1; context->channels[ lcore ].reset( spdk_bdev_get_io_channel( context->desc ), []( spdk_io_channel *p ) { if( p ) spdk_put_io_channel( p ); } ); if( !context->channels[ lcore ] ) ABORT( "νϟωϧΛ֬อͰ͖ͳ͍" ); const auto max_page_count = spdk_bdev_get_num_blocks( context- >bdev ); const size_t count = std::min( context->count, max_page_count ); const size_t max_concurrency = ( context->max_concurrency ? std::min( context->max_concurrency, count ) : count ); context->begin = std::chrono::high_resolution_clock::now(); for( size_t i = lcore; i < max_concurrency; i += cores ) { context->files[ i ].lcore = lcore; write_file_cont( &context->files[ i ] ); spdk_bdev_get_io_channelͰνϟωϧΛ࡞੒
  17. チャネル コマンド コマンド コマンド そこでNVMe等の ナウいストレージは キューを 複数作れるようになっている コマンド コマンド

    コマンド コマンド コマンド コマンド コマンド コマンド コマンド このキューをSPDKでは チャネルと呼ぶ チャネルはチャネルを作成したスレッド以外からは使用できない チャネルを引数にとる関数は同時に複数のスレッドから呼べる
  18. } void write_file_cont( file_t *file ) { if( spdk_bdev_write_blocks( file->context->desc,

    file->context->channels[ file->lcore ].get(), file->context->buffer.get(), file->offset, file->size, []( struct spdk_bdev_io *bdev_io, bool success, void *file_ ) { if( !success ) ABORT( "ファイルを書くことができない" ); spdk_bdev_free_io( bdev_io ); write_flush( file_ ); } , reinterpret_cast< void* >( file ) ) < 0 ) ABORT( "ファイルを書くことができない" ); } デバイスから書き込みの結果が返ってくると 指定したコールバックが呼び出される 指定した位置へのデータの書き込み
  19. void read_file_cont( file_t *file ) { size_t buffer_index; while( !file->context->buffer_index->pop(

    buffer_index ) ); file->buffer_index = buffer_index; if( spdk_bdev_read_blocks( file->context->desc, file->context->channels[ file->lcore ].get(), std::next( file->context->buffer.get(), file->buffer_index * file->context->page_size * file->context->block_size ), file->offset, file->size, []( struct spdk_bdev_io *bdev_io, bool success, void *file_ ) { auto file = reinterpret_cast< file_t* >( file_ ); while( !file->context->buffer_index->push( file- >buffer_index ) ); if( !success ) ABORT( "ファイルを読むことができない" ); デバイスから読み出しの結果が返ってくると 指定したコールバックが呼び出される 読んだ内容は引数で渡したバッファに入っている 指定した位置からのデータの読み出し
  20. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア メモリ DDR4-2400

    8GBx2 ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd 256MBの読み書きに要する時間を計測する ブロックサイズを512バイトから128MBまで変化させる 各ブロックの読み書きを行う位置はランダム 前のreadが完了するまで次のreadを投げない writeの完了後にflushを行い、flushが完了するまで次のwriteを投げない 同じブロックサイズについて10回の計測を行う
  21. void write_flush( void *file_ ) { auto file = reinterpret_cast<

    file_t* >( file_ ); if( --file->context->global_left_count == 0 ) { if( file->context->flush ) { if( spdk_bdev_flush_blocks( file->context->desc, file->context->channel.get(), file->offset, file->size, []( struct spdk_bdev_io *bdev_io, bool success, void *file_ ) { auto end = std::chrono::high_resolution_clock::now(); auto file = reinterpret_cast< file_t* >( file_ ); spdk_bdev_free_io( bdev_io ); if( !success ) ABORT( "ファイルを書くことができない" ); close( file, end ); }, file_ ) < 0 ) ABORT( "ファイルを書くことができない" ); bdevにデバイスのflushを要求する spdk_bdev_flush_blocksは呼んでいたが
  22. return 0; } static int bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io

    *bio, uint64_t offset, uint64_t nbytes) { spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); return 0; } static void _bdev_nvme_reset_done(struct spdk_io_channel_iter *i, int status) { void *ctx = spdk_io_channel_iter_get_ctx(i); Spdk-18.07/lib/bdev/nvme/bdev_nvme.c 227行目 bdevからNVMeデバイスに対してflushを要求されたら 何もしないで成功を返す
  23. void write_flush( void *file_ ) { auto file = reinterpret_cast<

    file_t* >( file_ ); if( --file->context->global_left_count == 0 ) { if( file->context->flush ) { if( spdk_bdev_nvme_io_passthru( file->context->desc, file->context->channels[ file->lcore ].get(), &file->context->flush_command, nullptr, 0, []( struct spdk_bdev_io *bdev_io, bool success, void *file_ ) { auto end = std::chrono::high_resolution_clock::now(); auto file = reinterpret_cast< file_t* >( file_ ); spdk_bdev_free_io( bdev_io ); if( !success ) ABORT( "ファイルを書くことができない" ); close( file, end ); }, file_ bdevにはデバイスがNVMeだった時だけ使える spdk_bdev_nvme_io_passthru (NVMeコマンドを自分で作って投げる) が用意されているので、これを使ってFLUSHを投げる
  24. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア メモリ DDR4-2400

    8GBx2 ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd 256MBの読み書きに要する時間を計測する ブロックサイズを512バイトから128MBまで変化させる 各ブロックの読み書きを行う位置はランダム 前のreadの完了を待たずに次のreadを投げる 前のwriteの完了を待たずに次のwriteを投げ、最後に1回だけflushする 同じブロックサイズについて10回の計測を行う
  25. blobstore 8KiB 12KiB 4KiB 4KiB 4KiBの整数倍の大きさのblobをストレージ上に確保する 確保したblobは後からリサイズすることができる 確保したblobは4KiB単位でread/writeできる 8KiB ID

    ID ID ID ID ファイルシステムとしては様々な機能が足りていないが 最小限のオーバーヘッドで複数のデータをストレージに置ける
  26. void init_storage( void *context_, void * ) { auto context

    = reinterpret_cast< context_t* >( context_ ); context->bdev = spdk_bdev_get_by_name("Nvme0n1"); if( !context->bdev ) ABORT( "σόΠε͕ݟ͔ͭΒͳ͍" ); context->bs = spdk_bdev_create_bs_dev( context->bdev, NULL, NULL ); if( !context->bs ) ABORT( "blobstoreσόΠεΛ࡞੒Ͱ͖ͳ͍" ); struct spdk_bs_opts opts; spdk_bs_opts_init( &opts ); opts.max_channel_ops = 8000; spdk_bs_init( context->bs, &opts, []( void *context_, struct spdk_blob_store *blobstore, int bserrno ) { auto context = reinterpret_cast< context_t* >( context_ ); if( bserrno ) ABORT( "blobstoreΛॳظԽͰ͖ͳ͍" ); context->blobstore = blobstore; context->page_size = spdk_bs_get_page_size( context->blobstore ); spdk_bs_initͰεʔύʔϒϩοΫΛ࡞੒
  27. spdk_bs_create_blob( file->context->blobstore, []( void *file_, spdk_blob_id id, int bserrno )

    { if( bserrno ) ABORT( "blobΛ࡞੒Ͱ͖ͳ͍" ) auto file = reinterpret_cast< file_t* >( file_ ); file->id = id; spdk_bs_open_blob( file->context->blobstore, id, []( void *file_, struct spdk_blob *fd, int bserrno ) { if( bserrno ) ABORT( "blobΛ։͘ࣄ͕Ͱ͖ͳ͍" ) auto file = reinterpret_cast< file_t* >( file_ ); file->fd = fd; file->page_count = file->context->headers[ file- >index ].size / file->context->page_size + ( ( file->size % file- >context->page_size ) ? 1u : 0u ); spdk_blob_resize( file->fd, file->page_count, []( void *file_, int bserrno ) { auto file = reinterpret_cast< file_t* >( file_ ); --file->context->meta_count; if( bserrno ) ABORT( "blobΛϦαΠζͰ͖ͳ͍" ) spdk_bs_create_blobͰblobΛ࡞Γ spdk_bs_open_blobͰblobΛ։͖
  28. if( bserrno ) ABORT( "blobΛ։͘ࣄ͕Ͱ͖ͳ͍" ) auto file = reinterpret_cast<

    file_t* >( file_ ); file->fd = fd; file->page_count = file->context->headers[ file- >index ].size / file->context->page_size + ( ( file->size % file- >context->page_size ) ? 1u : 0u ); spdk_blob_resize( file->fd, file->page_count, []( void *file_, int bserrno ) { auto file = reinterpret_cast< file_t* >( file_ ); --file->context->meta_count; if( bserrno ) ABORT( "blobΛϦαΠζͰ͖ͳ͍" ) spdk_event *event = spdk_event_allocate( file->lcore, write_file, file_, nullptr ); spdk_event_call( event ); }, file_ ); }, file_ ); }, file_ ); spdk_blob_resizeͰඞཁͳαΠζʹϦαΠζ͢Δ
  29. void write_file( void *file_, void* ) { auto file =

    reinterpret_cast< file_t* >( file_ ); auto head = std::next( file->context->buffer.get(), file- >context->headers[ file->index ].offset ); const size_t lcore = rte_lcore_id(); file->lcore = lcore; file->begin = std::chrono::high_resolution_clock::now(); spdk_blob_io_write( file->fd, file->context->channels[ lcore ].get(), head, 0, file->page_count, []( void *file_, int bserrno ) { auto end = std::chrono::high_resolution_clock::now(); auto file = reinterpret_cast< file_t* >( file_ ); if( bserrno ) ABORT( "blobʹॻ͖ࠐΉࣄ͕Ͱ͖ͳ͍" ) tar_benchmark::record_elapsed_time( *file->context->results, file->context->headers[ file->index ].size, std::chrono::duration_cast< std::chrono::nanoseconds spdk_blob_io_writeͰblobʹॻ͖ࠐΈ νϟωϧΛҾ਺ʹͱΔؔ਺͸೚ҙͷεϨου͔Βݺͼग़ͤΔ ͦ͏Ͱͳ͍ؔ਺(open౳)͸reactor_0͔Βݺͼग़͢
  30. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア メモリ DDR4-2400

    8GBx2 ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd Write git-2.19.0のソースtarボールを非圧縮の状態からストレージに展開する 実行前にまっさらなファイルシステムを作る O_DIRECTかつfsyncなしでファイル毎のwriteに要した時間を計測する Read git-2.19.0のソースコードが展開されている状態から全てのファイルを読む 実行前にカーネルのキャッシュを破棄する ファイル毎のreadに要した時間を計測する
  31. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア メモリ DDR4-2400

    8GBx2 ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd Write gcc-8.2のソースtarボールを非圧縮の状態からストレージに展開する 実行前にまっさらなファイルシステムを作る O_DIRECTかつfsyncなしでファイル毎のwriteに要した時間を計測する Read gcc-8.2のソースコードが展開されている状態から全てのファイルを読む 実行前にカーネルのキャッシュを破棄する ファイル毎のreadに要した時間を計測する
  32. blobfs 8KiB 13KiB 10 bytes 1KiB bdev上で動くように作られたファイルシステム 確保したファイルは後からリサイズすることができる 確保したファイルはバイト単位でread/writeできる 7KiB

    hoge fuga piyo foo bar 名前で識別される任意のサイズのファイルを作る事ができる attr attr attr attr attr ファイルのデータとは別にxattrを保存できる blobstoreよりファイルシステムに近い仕様
  33. spdk_fs_set_cache_size( 512 ); spdk_fs_init( context->bs, nullptr, []( fs_request_fn f, void

    *arg ) { spdk_event *event = spdk_event_allocate( 0, []( void *arg1, void *arg2 ) { reinterpret_cast< fs_request_fn >( arg1 )( arg2 ); }, (void *)f, arg ); spdk_event_call( event ); }, []( void *context_, struct spdk_filesystem *fs, int fserrno ) { auto context = reinterpret_cast< context_t* >( context_ ); if( fserrno ) ABORT( "ϑΝΠϧγεςϜΛ࡞੒Ͱ͖ͳ͍" ) context->fs = fs; auto [mapped_tar,tar_size] = tar_benchmark::load_tar( context->input ); context->tar_size = tar_size; context->buffer.reset( spdk_fs_initͰεʔόʔϒϩοΫΛ࡞੒ ୈࡾҾ਺͸ϑΝΠϧૢ࡞͕͋ͬͨ৔߹ʹݺͼग़͞ΕΔؔ਺ blobfsͷϑΝΠϧૢ࡞͸શͯಉҰͷεϨουͰॲཧ͞ΕΔඞཁ͕͋Δҝ શͯͷϦΫΤετΛreactor_0ʹ౤͍͛ͯΔ
  34. void create_file( void *file_, void* ) { auto file =

    reinterpret_cast< file_t* >( file_ ); if( spdk_fs_create_file( file->context->fs, file->context- >channel.get(), file->name.c_str() ) < 0 ) ABORT( "ϑΝΠϧΛ࡞੒͢Δࣄ͕Ͱ͖ͳ͍" ); if( spdk_fs_open_file( file->context->fs, file->context- >channel.get(), file->name.c_str(), 0, &file->fd ) < 0 ) ABORT( "ϑΝΠϧΛ։͘ࣄ͕Ͱ͖ͳ͍" ); auto head = std::next( file->context->buffer.get(), file- >header->offset ); auto begin = std::chrono::high_resolution_clock::now(); if( spdk_file_write( file->fd, file->context->channel.get(), head, 0, file->header->size ) < 0 ) ABORT( "ϑΝΠϧΛॻ͖ࠐΉࣄ͕Ͱ͖ͳ͍" ); if( file->context->flush ) { if( spdk_file_sync( file->fd, file->context->channel.get() ) < 0 ) spdk_fs_create_fileͰϑΝΠϧΛ࡞Γ spdk_fs_open_fileͰϑΝΠϧΛ։͖
  35. if( spdk_file_write( file->fd, file->context->channel.get(), head, 0, file->header->size ) < 0

    ) ABORT( "ϑΝΠϧΛॻ͖ࠐΉࣄ͕Ͱ͖ͳ͍" ); if( file->context->flush ) { if( spdk_file_sync( file->fd, file->context->channel.get() ) < 0 ) ABORT( "ϑΝΠϧΩϟογϡΛಉظ͢Δࣄ͕Ͱ͖ͳ͍" ); } auto end = std::chrono::high_resolution_clock::now(); tar_benchmark::record_elapsed_time( *file->context->results, file->header->size, std::chrono::duration_cast< std::chrono::nanoseconds >( end - begin ).count() ); if( spdk_file_close( file->fd, file->context->channel.get() ) < 0 ) ABORT( "ϑΝΠϧΛด͡Δࣄ͕Ͱ͖ͳ͍" ); spdk_file_writeͰϑΝΠϧʹσʔλΛॻ͍ͯ spdk_file_syncͰblobfsͷΩϟογϡ͕ࡹ͚ΔͷΛ଴ͬͯ
  36. ); if( spdk_file_close( file->fd, file->context->channel.get() ) < 0 ) ABORT(

    "ファイルを閉じる事ができない" ); } 最後にspdk_file_close POSIXのopen/read/write/fsync/closeと 簡単に置き換える事ができる 既存のアプリケーションをSPDKに移植する時に便利
  37. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア メモリ DDR4-2400

    8GBx2 ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd Write gcc-8.2のソースtarボールを非圧縮の状態からストレージに展開する 実行前にまっさらなファイルシステムを作る O_DIRECTかつfsyncなしでファイル毎のwriteに要した時間を計測する Read gcc-8.2のソースコードが展開されている状態から全てのファイルを読む 実行前にカーネルのキャッシュを破棄する ファイル毎のreadに要した時間を計測する
  38. spdk_nvme_probe( nullptr, reinterpret_cast< void* >( ctx.get() ), probe_cb, attach_cb, nullptr

    ); if( !ctx->ctrlr ) { std::cout << "NVMeσόΠε͸ݟ͔ͭΒͳ͔ͬͨ" << std::endl; return -1; } std::shared_ptr< uint8_t > buf( reinterpret_cast< uint8_t* >( spdk_nvme_ctrlr_alloc_cmb_io_buffer( ctx->ctrlr, 0x1000 ) ), [ctx]( uint8_t *p ) { spdk_nvme_ctrlr_free_cmb_io_buffer( ctx->ctrlr, reinterpret_cast< void* >( p ), 0x1000 ); } ); if( !buf ) std::cout << "CMBΛ֬อͰ͖ͳ͍" << std::endl; else std::cout << "CMBΛ֬อͰ͖ͨ" << std::endl;
  39. $ ./src/cmb Starting SPDK v18.07 / DPDK 18.05.0 initialization... [

    DPDK EAL parameters: cmb -c 0x1 --legacy-mem --file-prefix=spdk0 --base- virtaddr=0x200000000000 --proc-type=auto ] EAL: Detected 4 lcore(s) EAL: Detected 1 NUMA nodes EAL: Auto-detected process type: PRIMARY EAL: Multi-process socket /var/run/dpdk/spdk0/mp_socket EAL: Probing VFIO support... EAL: WARNING! Base virtual address hint (0x20080002f000 != 0x7fc326fc6000) not respected! EAL: This may cause issues with mapping memory into secondary processes EAL: WARNING! Base virtual address hint (0x201000030000 != 0x7fc326fc5000) not respected! EAL: This may cause issues with mapping memory into secondary processes EAL: WARNING! Base virtual address hint (0x201800031000 != 0x7fc326fc4000) not respected! EAL: This may cause issues with mapping memory into secondary processes EAL: PCI device 0000:03:00.0 on NUMA socket 0 EAL: probe driver: 15b7:5002 spdk_nvme 0000:03:00.0 に接続します 0000:03:00.0 に接続しました CMBを確保できない Sandisk ExtremeProにはCMBは無かった