Upgrade to Pro — share decks privately, control downloads, hide ads and more …

極めて速いストレージとの付き合い方

Fadis
September 21, 2018

 極めて速いストレージとの付き合い方

いまどきのツンデレSSDにデレてもらうためにはどうすれば良いかを解説します
これは2018年9月22日に行われた カーネル/VM探検隊@関西 9回目 での発表資料です
サンプルコード: https://github.com/Fadis/spdk_sample

Fadis

September 21, 2018
Tweet

More Decks by Fadis

Other Decks in Programming

Transcript

  1. 極めて速いストレージとの付き合い方
    NAOMASA MATSUBAYASHI
    https://github.com/Fadis/spdk_sample
    この発表に登場するサンプルコード

    View Slide

  2. SATAの帯域の限界
    NVMe
    PCI Express

    PCI Express Gen3 x4の帯域の限界
    13社の10万円以下の内蔵SSDの価格.com登録日と
    メーカーが提示するシーケンシャルリードの帯域の推移

    View Slide

  3. 13社の10万円以下の内蔵SSDの価格.com登録日と
    メーカーが提示するシーケンシャルリードの帯域の推移
    ハードディスクがこのへん

    View Slide

  4. かつて
    ものすごく速いストレージの代名詞だった
    ioDriveの上位モデルがこのへん
    13社の10万円以下の内蔵SSDの価格.com登録日と
    メーカーが提示するシーケンシャルリードの帯域の推移

    View Slide

  5. 基本的なストレージデバイスの操作
    このデータをください
    どうぞ
    デバイスがページを
    読み出すのにかかる時間
    カーネル ストレージ

    View Slide

  6. このデータをください
    どうぞ
    イマドキの高速な
    NANDフラッシュメモリでも
    4KBページを読み書きするのに
    10µ秒程度
    の時間がかかる

    View Slide

  7. 4KBページを読む操作を繰り返して
    500MB/sを達成しようとする場合
    要求から応答までは
    7.8µ秒
    しかかけられない
    …
    これは達成できない

    View Slide

  8. 広帯域を実現する方法
    …
    物理的にヘッドが1本しかない
    ハードディスクと異なり
    SSDは重複しないページに対する
    複数のコマンドを同時に処理できる
    10µ秒以上

    View Slide

  9. 今日のストレージは
    高スループットの割に
    高レイテンシである

    View Slide

  10. 実際のデバイスで計測してみよう

    View Slide

  11. #include
    #include
    #include
    #include
    #include
    #include
    #include
    #include
    #include
    struct io_failure {};
    struct file_t;
    struct context_t {
    context_t() : block_size( 512 ) {}
    size_t block_size;
    std::vector< file_t > files;
    };
    struct file_t {
    file_t( context_t *c, int f_, size_t size_, size_t offset_, bool

    View Slide

  12. for( size_t i = 0; i != count; ++i )
    ctx.files.emplace_back( &ctx, fd, bs, random ? ( rand() %
    available ) * bs : i * bs, zero );
    const auto begin = std::chrono::high_resolution_clock::now();
    if( write ) {
    for( auto &file: ctx.files )
    if( pwrite( file.fd, reinterpret_cast< void*
    >( file.buffer.get() ), file.size * ctx.block_size, file.offset *
    ctx.block_size ) < 0 ) throw io_failure();
    }
    else {
    for( auto &file: ctx.files )
    if( pread( file.fd, reinterpret_cast< void*
    >( file.buffer.get() ), file.size * ctx.block_size, file.offset *
    ctx.block_size ) < 0 ) throw io_failure();
    }
    const auto end = std::chrono::high_resolution_clock::now();
    const size_t elapsed = std::chrono::duration_cast<
    ブロックデバイスのランダムな位置にreadまたはwrite
    同期I/Oで繰り返し行う

    View Slide

  13. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア
    メモリ DDR4-2400 8GBx2
    ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB
    OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd
    256MBの読み書きに要する時間を計測する
    ブロックサイズを512バイトから256MBまで変化させる
    各ブロックの読み書きを行う位置はランダム
    O_DIRECT|O_SYNCを使う
    同じブロックサイズについて10回の計測を行う

    View Slide

  14. View Slide

  15. ブロックサイズをかなり大きくしないと同期I/Oでは性能が出ない
    4MiB

    View Slide

  16. View Slide

  17. Read: 57.8µ秒
    Write: 339µ秒

    View Slide

  18. #include
    #include
    #include
    #include
    #include
    #include
    #include
    #include
    #include
    #include
    #include
    #include
    struct io_failure {};
    struct file_t;
    struct context_t {
    context_t( size_t bulk_ ) : bulk( bulk_ ), block_size( 512 ),
    completed_count( 0 ), completed( false ) {
    io_queue_init( 1, &io_context );
    Linux AIOを使って前のI/Oの完了を待たずに次のI/O要求を投げる
    Linux Asynchronous I/O

    View Slide

  19. for( auto &file: ctx.files ) {
    while( 1 ) {
    auto r = io_submit( ctx.io_context, 1, &file.pcb );
    if( r >= 0 ) break;
    if( r != -EAGAIN && r < 0 ) throw io_failure();
    }
    }
    }
    poller.join();
    if( flush ) fdatasync( fd );
    const auto end = std::chrono::high_resolution_clock::now();
    const size_t elapsed = std::chrono::duration_cast<
    std::chrono::nanoseconds >( end - begin ).count();
    const size_t transfered = bs * count * ctx.block_size;
    close( fd );
    std::cout << bs * ctx.block_size << "\t" << transfered << "\t"
    << elapsed << "\t" << ( double( transfered ) / 1000 / 1000 ) /
    ( double( elapsed ) / 1000 / 1000 / 1000 ) << "MB/s " <<
    要求側スレッドはio_submitでカーネルのキューにI/O要求を全て積んで
    結果を受け取るスレッドの完了を待つ
    カーネルのキューが一杯でI/O要求を積めないときは
    ポーリングで再試行する

    View Slide

  20. std::thread poller( [&ctx]() {
    std::vector< io_event > events( ctx.bulk );
    while( !ctx.completed.load() ) {
    auto ret = io_getevents( ctx.io_context, 0, events.size(),
    events.data(), nullptr );
    if( ret > 0 ) {
    auto events_end = std::next( events.begin(), ret );
    std::for_each( events.begin(), events_end, []( const auto
    &event ) {
    auto file = reinterpret_cast< file_t* >(
    reinterpret_cast< void* >( event.data )
    );
    if( event.res >= 0 ) {
    if( int( event.res ) < 0 ) throw io_failure();
    if( ++file->context->completed_count == file->context-
    >files.size() ) {
    file->context->completed = true;
    }
    結果を受け取るスレッドは
    io_geteventsをポーリングして完了したI/Oの情報を取得する
    受け取った数が投げたI/O要求の数に達したらスレッドを終了させる

    View Slide

  21. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア
    メモリ DDR4-2400 8GBx2
    ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB
    OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd
    256MBの読み書きに要する時間を計測する
    ブロックサイズを512バイトから256MBまで変化させる
    各ブロックの読み書きを行う位置はランダム
    同期I/OはO_DIRECT|O_SYNC、非同期I/OはO_DIRECTで最後に1回fsync
    同じブロックサイズについて10回の計測を行う

    View Slide

  22. View Slide

  23. ブロックサイズが小さい場合のスループットが大幅に向上
    デバイスが大量のI/O要求を一度に処理していることがわかる

    View Slide

  24. プロセスは
    常に大きなブロックで大量のI/O要求をできる状態とは限らない
    このディレクトリエントリを
    読むことで
    次に読むべき位置が確定する
    このディレクトリエントリを
    読むことで
    次に読むべき位置が確定する
    読みたいファイルにたどり着く
    デバイスが速くなっても
    このような操作がアプリケーションが速くならない原因となる

    View Slide

  25. このへんの辛いI/Oのレイテンシを
    可能な限り小さくしたい
    Read: 57.8µ秒
    Write: 339µ秒

    View Slide

  26. デバイスの応答時間に
    システムコール、FSの処理、割り込み等のオーバーヘッドが乗る
    オーバーヘッド
    オーバーヘッド

    View Slide

  27. オーバーヘッド
    オーバーヘッド
    ここを
    ソフトウェアレベルで
    縮めることはできないが

    View Slide

  28. オーバーヘッド
    オーバーヘッド
    ここは
    ソフトウェアレベルで
    縮められる可能性がある

    View Slide

  29. 小さいブロックのI/Oで
    相対的に大きくなる
    オーバーヘッドを減らしたい
    小さいパケットのI/Oで
    相対的に大きくなる
    オーバーヘッドを減らしたい

    View Slide

  30. 小さいパケットのI/Oで
    相対的に大きくなる
    オーバーヘッドを減らしたい
    Data
    Plane
    Development
    Kit
    小さいブロックのI/Oで
    相対的に大きくなる
    オーバーヘッドを減らしたい
    Storage
    Performance
    Development
    Kit
    http://spdk.io/
    https://www.dpdk.org/

    View Slide

  31. DPDK
    ユーザ空間プロセスがNICを占有することで
    コンテキストスイッチを回避
    送受信の過程で発生するデータのコピーを最小化

    View Slide

  32. DPDK
    パケットきた
    パケットきた
    普通のネットワークドライバ
    割り込み
    パケットある
    ある
    ある
    ある
    あった
    PMD
    割り込みのオーバーヘッドが大きいので
    CPUをビジーループさせてデバイスを監視する

    View Slide

  33. SPDK
    SPDKはDPDKと同じように
    ユーザ空間のプロセスが直接デバイスを操作する事で
    I/Oにかかる時間のうちソフトウェア由来のものを
    最小にすることを目指す

    View Slide

  34. ドライバ
    NVMeドライバ
    AIOドライバ
    ramdiskドライバ
    VirtIOドライバ pmemドライバ
    bdev
    blobstore
    DPDK EAL
    blobfs SCSI NVMe
    iSCSI
    ターゲット
    vhost-scsi
    ターゲット
    NVMe-oF
    ターゲット
    SPDKを構成するコンポーネント
    …
    DPDK uioドライバ
    アプリケーション

    View Slide

  35. ドライバ
    NVMeドライバ
    AIOドライバ
    ramdiskドライバ
    VirtIOドライバ pmemドライバ
    bdev
    blobstore
    DPDK EAL
    blobfs SCSI NVMe
    iSCSI
    ターゲット
    vhost-scsi
    ターゲット
    NVMe-oF
    ターゲット
    …
    DPDK uioドライバ
    アプリケーション
    NVMe等のストレージをカーネルの管理から外す為に
    DPDKのuioドライバを使う
    ReactorやHugepageを使うメモリアロケータは
    DPDKの実装をそのまま使う
    㱤SPDKを動かすにはDPDKが必要

    View Slide

  36. ドライバ
    NVMeドライバ
    AIOドライバ
    ramdiskドライバ
    VirtIOドライバ pmemドライバ
    bdev
    blobstore
    DPDK EAL
    blobfs SCSI NVMe
    iSCSI
    ターゲット
    vhost-scsi
    ターゲット
    NVMe-oF
    ターゲット
    …
    DPDK uioドライバ
    アプリケーション
    bdev
    ストレージデバイスの種類による操作方法の違いを吸収する

    View Slide

  37. 先ほどのベンチマークを
    bdevの上で動くようにしよう

    View Slide

  38. [Global]
    ReactorMask 0x000F
    [Nvme]
    TransportID "trtype:PCIe traddr:0000:03:00.0" Nvme0
    まずSPDKの設定ファイルを用意する
    4番目のプロセッサまでスレッドプールに使って良い
    PCI-Expressの0000:03:00.0に繋がったデバイスを
    Nvme0と呼ぶことにする

    View Slide

  39. struct spdk_app_opts opts = {};
    SPDK_NOTICELOG("entry\n");
    spdk_app_opts_init(&opts);
    opts.name = "bdev";
    const std::string config_file = opt_var[ "config" ].as<
    std::string >();
    opts.config_file = config_file.c_str();
    context_t ctx(
    opt_var[ "concurrency" ].as< size_t >(),
    opt_var[ "block_size" ].as< size_t >(),
    opt_var[ "count" ].as< size_t >(),
    opt_var[ "write" ].as< bool >(),
    opt_var[ "random" ].as< bool >(),
    opt_var[ "zero" ].as< bool >(),
    opt_var[ "flush" ].as< bool >()
    );
    spdk_app_start(&opts, run, &ctx, nullptr );
    }
    spdk_app_startで
    DPDK EALの初期化
    スレッドプールの開始
    利用可能なデバイス探し
    が行われる
    この関数はspdk_app_stopされるまで返らない

    View Slide

  40. void run( void *ctx_, void * ) {
    auto ctx = reinterpret_cast< context_t* >( ctx_ );
    ctx->bdev = spdk_bdev_get_by_name("Nvme0n1");
    if( !ctx->bdev ) ABORT( "σόΠε͕ݟ͔ͭΒͳ͍" );
    ctx->page_size = spdk_bdev_get_block_size( ctx->bdev );
    const auto max_page_count = spdk_bdev_get_num_blocks( ctx-
    >bdev );
    const size_t count = std::min( ctx->count, max_page_count );
    const size_t max_concurrency = ( ctx->max_concurrency ?
    std::min( ctx->max_concurrency, count ) : count );
    const size_t buf_size = ctx->page_size * ctx->block_size *
    ( ctx->write ? 1u : max_concurrency );
    ctx->buffer.reset(
    reinterpret_cast< uint8_t* >( spdk_dma_zmalloc( buf_size, ctx-
    >page_size, nullptr ) ),
    []( uint8_t *p ) { spdk_dma_free( reinterpret_cast< void* >( p
    ) ); }
    );
    spdk_bdev_get_by_nameでデバイスを取得する

    View Slide

  41. ctx->page_size = spdk_bdev_get_block_size( ctx->bdev );
    const auto max_page_count = spdk_bdev_get_num_blocks( ctx-
    >bdev );
    const size_t count = std::min( ctx->count, max_page_count );
    const size_t max_concurrency = ( ctx->max_concurrency ?
    std::min( ctx->max_concurrency, count ) : count );
    const size_t buf_size = ctx->page_size * ctx->block_size *
    ( ctx->write ? 1u : max_concurrency );
    ctx->buffer.reset(
    reinterpret_cast< uint8_t* >( spdk_dma_zmalloc( buf_size, ctx-
    >page_size, nullptr ) ),
    []( uint8_t *p ) { spdk_dma_free( reinterpret_cast< void* >( p
    ) ); }
    );
    if( !ctx->buffer ) ABORT( "όοϑΝΛ֬อ͢Δࣄ͕Ͱ͖ͳ͍" )
    std::fill( ctx->buffer.get(), std::next( ctx->buffer.get(),
    buf_size ), ctx->zero ? 0 : 1 );
    const size_t cores = ctx->write ? 1 : rte_lcore_count();
    spdk_dma_zmallocでバッファを確保する

    View Slide

  42. CPUとPCI-Expressのデバイスは
    異なるMMUを介してメモリにアクセスしている
    IOMMUからも
    連続に見えるように
    バッファを確保しないと
    データをDMA転送できない
    MMU IOMMU
    spdk_dma_malloc系の関数は
    デバイス側からもリニアにアクセスできるようにメモリ確保を行う

    View Slide

  43. const size_t cores = ctx->write ? 1 : rte_lcore_count();
    ctx->channels.resize( cores );
    ctx->files.reserve( count );
    if( spdk_bdev_open( ctx->bdev, true, nullptr, nullptr, &ctx-
    >desc ) < 0 ) ABORT( "σόΠεΛ։͘͜ͱ͕Ͱ͖ͳ͍" );
    for( size_t i = 0; i != count; ++i ) {
    ctx->files.emplace_back( ctx, i % cores, ctx->block_size, ctx-
    >random ? ( ( rand() % max_page_count ) / ctx->block_size ) * ctx-
    >block_size : i * ctx->block_size );
    ++ctx->global_left_count;
    }
    ctx->head = max_concurrency;
    for( size_t i = 0; i != cores; ++i ) {
    spdk_event *event = spdk_event_allocate( i, ctx->write ?
    write_file : read_file, ctx_, nullptr );
    spdk_event_call( event );
    }
    } શͯͷϓϩηοα্Ͱ

    View Slide

  44. void write_file( void *context_, void* ) {
    auto context = reinterpret_cast< context_t* >( context_ );
    const size_t lcore = rte_lcore_id();
    const size_t cores = 1;
    context->channels[ lcore ].reset(
    spdk_bdev_get_io_channel( context->desc ),
    []( spdk_io_channel *p ) { if( p ) spdk_put_io_channel( p ); }
    );
    if( !context->channels[ lcore ] ) ABORT( "νϟωϧΛ֬อͰ͖ͳ͍" );
    const auto max_page_count = spdk_bdev_get_num_blocks( context-
    >bdev );
    const size_t count = std::min( context->count, max_page_count );
    const size_t max_concurrency = ( context->max_concurrency ?
    std::min( context->max_concurrency, count ) : count );
    context->begin = std::chrono::high_resolution_clock::now();
    for( size_t i = lcore; i < max_concurrency; i += cores ) {
    context->files[ i ].lcore = lcore;
    write_file_cont( &context->files[ i ] );
    spdk_bdev_get_io_channelでチャネルを作成

    View Slide

  45. チャネル
    コマンド
    コマンド
    コマンド
    デバイスにコマンドを投げるキューが
    1つしかないと
    キューにコマンドを積むために
    プロセッサはロックを奪い合う事になる
    ここが
    渋滞する
    コマンド コマンド コマンド コマンド

    View Slide

  46. チャネル
    コマンド
    コマンド
    コマンド
    そこでNVMe等の
    ナウいストレージは
    キューを
    複数作れるようになっている
    コマンド
    コマンド
    コマンド
    コマンド
    コマンド
    コマンド
    コマンド
    コマンド
    コマンド
    このキューをSPDKでは
    チャネルと呼ぶ
    チャネルはチャネルを作成したスレッド以外からは使用できない
    チャネルを引数にとる関数は同時に複数のスレッドから呼べる

    View Slide

  47. }
    void write_file_cont( file_t *file ) {
    if( spdk_bdev_write_blocks(
    file->context->desc,
    file->context->channels[ file->lcore ].get(),
    file->context->buffer.get(),
    file->offset,
    file->size,
    []( struct spdk_bdev_io *bdev_io, bool success, void *file_ )
    {
    if( !success ) ABORT( "ϑΝΠϧΛॻ͘͜ͱ͕Ͱ͖ͳ͍" );
    spdk_bdev_free_io( bdev_io );
    write_flush( file_ );
    }
    , reinterpret_cast< void* >( file )
    ) < 0 ) ABORT( "ϑΝΠϧΛॻ͘͜ͱ͕Ͱ͖ͳ͍" );
    }
    デバイスから書き込みの結果が返ってくると
    指定したコールバックが呼び出される
    指定した位置へのデータの書き込み

    View Slide

  48. void read_file_cont( file_t *file ) {
    size_t buffer_index;
    while( !file->context->buffer_index->pop( buffer_index ) );
    file->buffer_index = buffer_index;
    if( spdk_bdev_read_blocks(
    file->context->desc,
    file->context->channels[ file->lcore ].get(),
    std::next( file->context->buffer.get(), file->buffer_index *
    file->context->page_size * file->context->block_size ),
    file->offset,
    file->size,
    []( struct spdk_bdev_io *bdev_io, bool success, void *file_ )
    {
    auto file = reinterpret_cast< file_t* >( file_ );
    while( !file->context->buffer_index->push( file-
    >buffer_index ) );
    if( !success ) ABORT( "ϑΝΠϧΛಡΉ͜ͱ͕Ͱ͖ͳ͍" );
    デバイスから読み出しの結果が返ってくると
    指定したコールバックが呼び出される
    読んだ内容は引数で渡したバッファに入っている
    指定した位置からのデータの読み出し

    View Slide

  49. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア
    メモリ DDR4-2400 8GBx2
    ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB
    OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd
    256MBの読み書きに要する時間を計測する
    ブロックサイズを512バイトから128MBまで変化させる
    各ブロックの読み書きを行う位置はランダム
    前のreadが完了するまで次のreadを投げない
    writeの完了後にflushを行い、flushが完了するまで次のwriteを投げない
    同じブロックサイズについて10回の計測を行う

    View Slide

  50. View Slide

  51. Linuxの同期読み込みより
    SPDKの同期読み込みが
    少し速い

    View Slide

  52. Linuxの同期書き込みと比べて
    SPDKの同期書き込みは
    どうしてこうなった

    View Slide

  53. SPDK: 8.02µ秒
    応答が早すぎる
    NANDまで書いてなさそう
    Linux: 339µ秒

    View Slide

  54. Volatile Write Cache
    これ書いて
    書けた
    これ書いて
    書けた
    デバイスがRAMを持っていて
    RAMにデータを転送した時点で
    書き込み成功を返す

    View Slide

  55. Volatile Write Cache
    これ書いて
    書けた
    本当に?
    実は今書けた
    確実に永続化させる為にFLUSHコマンドが用意されている
    これ書いて
    書けた

    View Slide

  56. void write_flush( void *file_ ) {
    auto file = reinterpret_cast< file_t* >( file_ );
    if( --file->context->global_left_count == 0 ) {
    if( file->context->flush ) {
    if( spdk_bdev_flush_blocks(
    file->context->desc,
    file->context->channel.get(),
    file->offset,
    file->size,
    []( struct spdk_bdev_io *bdev_io, bool success, void
    *file_ ) {
    auto end = std::chrono::high_resolution_clock::now();
    auto file = reinterpret_cast< file_t* >( file_ );
    spdk_bdev_free_io( bdev_io );
    if( !success ) ABORT( "ϑΝΠϧΛॻ͘͜ͱ͕Ͱ͖ͳ͍" );
    close( file, end );
    }, file_
    ) < 0 ) ABORT( "ϑΝΠϧΛॻ͘͜ͱ͕Ͱ͖ͳ͍" );
    bdevにデバイスのflushを要求する
    spdk_bdev_flush_blocksは呼んでいたが

    View Slide

  57. return 0;
    }
    static int
    bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio,
    uint64_t offset, uint64_t nbytes)
    {
    spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio),
    SPDK_BDEV_IO_STATUS_SUCCESS);
    return 0;
    }
    static void
    _bdev_nvme_reset_done(struct spdk_io_channel_iter *i, int status)
    {
    void *ctx = spdk_io_channel_iter_get_ctx(i);
    Spdk-18.07/lib/bdev/nvme/bdev_nvme.c 227行目
    bdevからNVMeデバイスに対してflushを要求されたら
    何もしないで成功を返す

    View Slide

  58. void write_flush( void *file_ ) {
    auto file = reinterpret_cast< file_t* >( file_ );
    if( --file->context->global_left_count == 0 ) {
    if( file->context->flush ) {
    if( spdk_bdev_nvme_io_passthru(
    file->context->desc,
    file->context->channels[ file->lcore ].get(),
    &file->context->flush_command,
    nullptr,
    0,
    []( struct spdk_bdev_io *bdev_io, bool success, void
    *file_ ) {
    auto end = std::chrono::high_resolution_clock::now();
    auto file = reinterpret_cast< file_t* >( file_ );
    spdk_bdev_free_io( bdev_io );
    if( !success ) ABORT( "ϑΝΠϧΛॻ͘͜ͱ͕Ͱ͖ͳ͍" );
    close( file, end );
    }, file_
    bdevにはデバイスがNVMeだった時だけ使える
    spdk_bdev_nvme_io_passthru
    (NVMeコマンドを自分で作って投げる)
    が用意されているので、これを使ってFLUSHを投げる

    View Slide

  59. View Slide

  60. read write共に
    SPDKの方がちょっとだけ速い
    SPDK: 14.4MB/s
    Linux: 12.0MB/s
    SPDK: 85.0MB/s
    Linux: 70.8MB/s

    View Slide

  61. read write共に
    SPDKの方がちょっとだけ速い
    SPDK: 284µ秒
    Linux: 339µ秒
    SPDK: 48.1µ秒
    Linux: 57.8µ秒

    View Slide

  62. デバイスにFLUSHを投げない場合
    デバイスのレイテンシがほぼ無くなる為
    LinuxとSPDKの差は顕著
    Linux: 49.6µ秒
    SPDK: 8.02µ秒

    View Slide

  63. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア
    メモリ DDR4-2400 8GBx2
    ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB
    OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd
    256MBの読み書きに要する時間を計測する
    ブロックサイズを512バイトから128MBまで変化させる
    各ブロックの読み書きを行う位置はランダム
    前のreadの完了を待たずに次のreadを投げる
    前のwriteの完了を待たずに次のwriteを投げ、最後に1回だけflushする
    同じブロックサイズについて10回の計測を行う

    View Slide

  64. View Slide

  65. Linux: 508MB/s
    SPDK: 517MB/s
    Linux: 1580MB/s
    SPDK: 1600MB/s

    View Slide

  66. SPDKの方がオーバーヘッドは小さいが
    NVMe SSD 1枚程度では
    CPUがボトルネックになる事はなく
    Linux AIOと大きな差がつく事はなかった
    41%,։ൃݩ͸/7.Fຕͷ؀ڥͰ
    ベンチマークを行なっていた

    View Slide

  67. 多くのユーザ空間アプリケーションは
    ブロックデバイスを直接渡されても困る

    View Slide

  68. blobstore
    8KiB 12KiB 4KiB 4KiB
    4KiBの整数倍の大きさのblobをストレージ上に確保する
    確保したblobは後からリサイズすることができる
    確保したblobは4KiB単位でread/writeできる
    8KiB
    ID ID ID ID ID
    ファイルシステムとしては様々な機能が足りていないが
    最小限のオーバーヘッドで複数のデータをストレージに置ける

    View Slide

  69. void init_storage( void *context_, void * ) {
    auto context = reinterpret_cast< context_t* >( context_ );
    context->bdev = spdk_bdev_get_by_name("Nvme0n1");
    if( !context->bdev ) ABORT( "σόΠε͕ݟ͔ͭΒͳ͍" );
    context->bs = spdk_bdev_create_bs_dev( context->bdev, NULL, NULL
    );
    if( !context->bs ) ABORT( "blobstoreσόΠεΛ࡞੒Ͱ͖ͳ͍" );
    struct spdk_bs_opts opts;
    spdk_bs_opts_init( &opts );
    opts.max_channel_ops = 8000;
    spdk_bs_init( context->bs, &opts, []( void *context_, struct
    spdk_blob_store *blobstore, int bserrno ) {
    auto context = reinterpret_cast< context_t* >( context_ );
    if( bserrno ) ABORT( "blobstoreΛॳظԽͰ͖ͳ͍" );
    context->blobstore = blobstore;
    context->page_size = spdk_bs_get_page_size( context->blobstore
    );
    spdk_bs_initでスーパーブロックを作成

    View Slide

  70. spdk_bs_create_blob( file->context->blobstore, []( void *file_,
    spdk_blob_id id, int bserrno ) {
    if( bserrno ) ABORT( "blobΛ࡞੒Ͱ͖ͳ͍" )
    auto file = reinterpret_cast< file_t* >( file_ );
    file->id = id;
    spdk_bs_open_blob( file->context->blobstore, id, []( void
    *file_, struct spdk_blob *fd, int bserrno ) {
    if( bserrno ) ABORT( "blobΛ։͘ࣄ͕Ͱ͖ͳ͍" )
    auto file = reinterpret_cast< file_t* >( file_ );
    file->fd = fd;
    file->page_count = file->context->headers[ file-
    >index ].size / file->context->page_size + ( ( file->size % file-
    >context->page_size ) ? 1u : 0u );
    spdk_blob_resize( file->fd, file->page_count, []( void
    *file_, int bserrno ) {
    auto file = reinterpret_cast< file_t* >( file_ );
    --file->context->meta_count;
    if( bserrno ) ABORT( "blobΛϦαΠζͰ͖ͳ͍" )
    spdk_bs_create_blobでblobを作り
    spdk_bs_open_blobでblobを開き

    View Slide

  71. if( bserrno ) ABORT( "blobΛ։͘ࣄ͕Ͱ͖ͳ͍" )
    auto file = reinterpret_cast< file_t* >( file_ );
    file->fd = fd;
    file->page_count = file->context->headers[ file-
    >index ].size / file->context->page_size + ( ( file->size % file-
    >context->page_size ) ? 1u : 0u );
    spdk_blob_resize( file->fd, file->page_count, []( void
    *file_, int bserrno ) {
    auto file = reinterpret_cast< file_t* >( file_ );
    --file->context->meta_count;
    if( bserrno ) ABORT( "blobΛϦαΠζͰ͖ͳ͍" )
    spdk_event *event = spdk_event_allocate( file->lcore,
    write_file, file_, nullptr );
    spdk_event_call( event );
    }, file_ );
    }, file_ );
    }, file_ );
    spdk_blob_resizeで必要なサイズにリサイズする

    View Slide

  72. void write_file( void *file_, void* ) {
    auto file = reinterpret_cast< file_t* >( file_ );
    auto head = std::next( file->context->buffer.get(), file-
    >context->headers[ file->index ].offset );
    const size_t lcore = rte_lcore_id();
    file->lcore = lcore;
    file->begin = std::chrono::high_resolution_clock::now();
    spdk_blob_io_write(
    file->fd, file->context->channels[ lcore ].get(), head, 0,
    file->page_count,
    []( void *file_, int bserrno ) {
    auto end = std::chrono::high_resolution_clock::now();
    auto file = reinterpret_cast< file_t* >( file_ );
    if( bserrno ) ABORT( "blobʹॻ͖ࠐΉࣄ͕Ͱ͖ͳ͍" )
    tar_benchmark::record_elapsed_time(
    *file->context->results,
    file->context->headers[ file->index ].size,
    std::chrono::duration_cast< std::chrono::nanoseconds
    spdk_blob_io_writeでblobに書き込み
    チャネルを引数にとる関数は任意のスレッドから呼び出せる
    そうでない関数(open等)はreactor_0から呼び出す

    View Slide

  73. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア
    メモリ DDR4-2400 8GBx2
    ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB
    OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd
    Write
    git-2.19.0のソースtarボールを非圧縮の状態からストレージに展開する
    実行前にまっさらなファイルシステムを作る
    O_DIRECTかつfsyncなしでファイル毎のwriteに要した時間を計測する
    Read
    git-2.19.0のソースコードが展開されている状態から全てのファイルを読む
    実行前にカーネルのキャッシュを破棄する
    ファイル毎のreadに要した時間を計測する

    View Slide

  74. View Slide

  75. View Slide

  76. View Slide

  77. View Slide

  78. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア
    メモリ DDR4-2400 8GBx2
    ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB
    OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd
    Write
    gcc-8.2のソースtarボールを非圧縮の状態からストレージに展開する
    実行前にまっさらなファイルシステムを作る
    O_DIRECTかつfsyncなしでファイル毎のwriteに要した時間を計測する
    Read
    gcc-8.2のソースコードが展開されている状態から全てのファイルを読む
    実行前にカーネルのキャッシュを破棄する
    ファイル毎のreadに要した時間を計測する

    View Slide

  79. View Slide

  80. View Slide

  81. View Slide

  82. View Slide

  83. View Slide

  84. 一通りの機能を備えたファイルシステムを前提に作られた
    既存のアプリケーションを
    blobstoreに移植するのが辛い

    View Slide

  85. blobfs
    8KiB 13KiB 10
    bytes
    1KiB
    bdev上で動くように作られたファイルシステム
    確保したファイルは後からリサイズすることができる
    確保したファイルはバイト単位でread/writeできる
    7KiB
    hoge fuga piyo foo bar
    名前で識別される任意のサイズのファイルを作る事ができる
    attr attr attr attr attr
    ファイルのデータとは別にxattrを保存できる
    blobstoreよりファイルシステムに近い仕様

    View Slide

  86. spdk_fs_set_cache_size( 512 );
    spdk_fs_init( context->bs, nullptr,
    []( fs_request_fn f, void *arg ) {
    spdk_event *event = spdk_event_allocate( 0, []( void *arg1,
    void *arg2 ) {
    reinterpret_cast< fs_request_fn >( arg1 )( arg2 );
    }, (void *)f, arg );
    spdk_event_call( event );
    },
    []( void *context_, struct spdk_filesystem *fs, int fserrno )
    {
    auto context = reinterpret_cast< context_t* >( context_ );
    if( fserrno ) ABORT( "ϑΝΠϧγεςϜΛ࡞੒Ͱ͖ͳ͍" )
    context->fs = fs;
    auto [mapped_tar,tar_size] =
    tar_benchmark::load_tar( context->input );
    context->tar_size = tar_size;
    context->buffer.reset(
    spdk_fs_initでスーパーブロックを作成
    第三引数はファイル操作があった場合に呼び出される関数
    blobfsのファイル操作は全て同一のスレッドで処理される必要がある為
    全てのリクエストをreactor_0に投げている

    View Slide

  87. void create_file( void *file_, void* ) {
    auto file = reinterpret_cast< file_t* >( file_ );
    if( spdk_fs_create_file( file->context->fs, file->context-
    >channel.get(), file->name.c_str() ) < 0 )
    ABORT( "ϑΝΠϧΛ࡞੒͢Δࣄ͕Ͱ͖ͳ͍" );
    if( spdk_fs_open_file( file->context->fs, file->context-
    >channel.get(), file->name.c_str(), 0, &file->fd ) < 0 )
    ABORT( "ϑΝΠϧΛ։͘ࣄ͕Ͱ͖ͳ͍" );
    auto head = std::next( file->context->buffer.get(), file-
    >header->offset );
    auto begin = std::chrono::high_resolution_clock::now();
    if( spdk_file_write( file->fd, file->context->channel.get(),
    head, 0, file->header->size ) < 0 )
    ABORT( "ϑΝΠϧΛॻ͖ࠐΉࣄ͕Ͱ͖ͳ͍" );
    if( file->context->flush ) {
    if( spdk_file_sync( file->fd, file->context->channel.get() ) <
    0 )
    spdk_fs_create_fileでファイルを作り
    spdk_fs_open_fileでファイルを開き

    View Slide

  88. if( spdk_file_write( file->fd, file->context->channel.get(),
    head, 0, file->header->size ) < 0 )
    ABORT( "ϑΝΠϧΛॻ͖ࠐΉࣄ͕Ͱ͖ͳ͍" );
    if( file->context->flush ) {
    if( spdk_file_sync( file->fd, file->context->channel.get() ) <
    0 )
    ABORT( "ϑΝΠϧΩϟογϡΛಉظ͢Δࣄ͕Ͱ͖ͳ͍" );
    }
    auto end = std::chrono::high_resolution_clock::now();
    tar_benchmark::record_elapsed_time(
    *file->context->results,
    file->header->size,
    std::chrono::duration_cast< std::chrono::nanoseconds >( end -
    begin ).count()
    );
    if( spdk_file_close( file->fd, file->context->channel.get() ) <
    0 )
    ABORT( "ϑΝΠϧΛด͡Δࣄ͕Ͱ͖ͳ͍" );
    spdk_file_writeでファイルにデータを書いて
    spdk_file_syncでblobfsのキャッシュが捌けるのを待って

    View Slide

  89. );
    if( spdk_file_close( file->fd, file->context->channel.get() ) <
    0 )
    ABORT( "ϑΝΠϧΛด͡Δࣄ͕Ͱ͖ͳ͍" );
    }
    最後にspdk_file_close
    POSIXのopen/read/write/fsync/closeと
    簡単に置き換える事ができる
    既存のアプリケーションをSPDKに移植する時に便利

    View Slide

  90. CPU Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz 4コア
    メモリ DDR4-2400 8GBx2
    ストレージ Sandisk ExtremePro M.2 NVMe 3D SSD 500GB
    OS Gentoo Linux default/linux/amd64/17.0/desktop/plasma/systemd
    Write
    gcc-8.2のソースtarボールを非圧縮の状態からストレージに展開する
    実行前にまっさらなファイルシステムを作る
    O_DIRECTかつfsyncなしでファイル毎のwriteに要した時間を計測する
    Read
    gcc-8.2のソースコードが展開されている状態から全てのファイルを読む
    実行前にカーネルのキャッシュを破棄する
    ファイル毎のreadに要した時間を計測する

    View Slide

  91. View Slide

  92. View Slide

  93. View Slide

  94. BlobfsのWrite Cacheが
    効いているように見える

    View Slide

  95. Blobfsを1ファイル書く毎にflush

    View Slide

  96. Blobfsを1ファイル書く毎にflush

    View Slide

  97. 今の所同期APIしかサポートされていません。非同期APIも作ってはいるのですが、
    十分なテストが行われていない為公開APIにはまだ含まれていません。これは将来の
    リリースで追加される予定です。
    blobfsは(今の所)同期I/O専用
    http://spdk.io/doc/blobfs.html

    View Slide

  98. ※低いほど良い

    View Slide

  99. Controller Memory Buffer
    read
    データをコピーする場合
    直前のreadの結果はデバイス側のRAMに乗っている為
    赤で示したデータの転送が完全に無駄
    これをwrite

    View Slide

  100. Controller Memory Buffer
    read
    デバイス側のRAMをホストのアドレス空間にマップし
    read/writeで使うバッファにそのアドレスを指定する事で
    データをデバイス側留めにする
    これをwrite

    View Slide

  101. spdk_dma_mallocの替わりに
    spdk_nvme_ctrlr_alloc_cmb_io_bufferでバッファを確保
    デバイス側のRAMにバッファを確保できる
    http://spdk.io/doc/nvme_8h.html#ad63b25defbb0f1ccd52538e3b9a748df

    View Slide

  102. spdk_nvme_probe(
    nullptr, reinterpret_cast< void* >( ctx.get() ), probe_cb,
    attach_cb, nullptr
    );
    if( !ctx->ctrlr ) {
    std::cout << "NVMeσόΠε͸ݟ͔ͭΒͳ͔ͬͨ" << std::endl;
    return -1;
    }
    std::shared_ptr< uint8_t > buf(
    reinterpret_cast< uint8_t* >(
    spdk_nvme_ctrlr_alloc_cmb_io_buffer( ctx->ctrlr, 0x1000 )
    ),
    [ctx]( uint8_t *p ) { spdk_nvme_ctrlr_free_cmb_io_buffer(
    ctx->ctrlr, reinterpret_cast< void* >( p ), 0x1000
    ); }
    );
    if( !buf ) std::cout << "CMBΛ֬อͰ͖ͳ͍" << std::endl;
    else std::cout << "CMBΛ֬อͰ͖ͨ" << std::endl;

    View Slide

  103. $ ./src/cmb
    Starting SPDK v18.07 / DPDK 18.05.0 initialization...
    [ DPDK EAL parameters: cmb -c 0x1 --legacy-mem --file-prefix=spdk0 --base-
    virtaddr=0x200000000000 --proc-type=auto ]
    EAL: Detected 4 lcore(s)
    EAL: Detected 1 NUMA nodes
    EAL: Auto-detected process type: PRIMARY
    EAL: Multi-process socket /var/run/dpdk/spdk0/mp_socket
    EAL: Probing VFIO support...
    EAL: WARNING! Base virtual address hint (0x20080002f000 != 0x7fc326fc6000) not
    respected!
    EAL: This may cause issues with mapping memory into secondary processes
    EAL: WARNING! Base virtual address hint (0x201000030000 != 0x7fc326fc5000) not
    respected!
    EAL: This may cause issues with mapping memory into secondary processes
    EAL: WARNING! Base virtual address hint (0x201800031000 != 0x7fc326fc4000) not
    respected!
    EAL: This may cause issues with mapping memory into secondary processes
    EAL: PCI device 0000:03:00.0 on NUMA socket 0
    EAL: probe driver: 15b7:5002 spdk_nvme
    0000:03:00.0 に接続します
    0000:03:00.0 に接続しました
    CMBを確保できない
    Sandisk ExtremeProにはCMBは無かった

    View Slide

  104. まとめ
    いまどきのフラッシュメモリを使うストレージは
    高スループットかつ高レイテンシ
    その性能を活かすには
    非同期I/O
    少しでもレイテンシを抑えるために
    SPDK

    View Slide