Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Probabilistic Data Structures

Probabilistic Data Structures

My talk at rannts #18 (11.11.2017)

Sergey Arkhipov

November 11, 2017
Tweet

More Decks by Sergey Arkhipov

Other Decks in Programming

Transcript

  1. Вероятностные
    структуры данных
    Сергей Архипов, 2017

    View full-size slide

  2. curl http://site.com

    View full-size slide

  3. curl -x myproxy.ru:3128 http://site.com

    View full-size slide

  4. curl -x proxy.crawlera.com:8010
    http://site.com

    View full-size slide

  5. evt evt evt evt evt

    View full-size slide

  6. {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "status": "ok",
    "status_description": ""
    }

    View full-size slide

  7. {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "status": "ok",
    "status_description": ""
    }

    View full-size slide

  8. collector
    collector
    collector

    View full-size slide

  9. {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "status": "ok",
    "status_description": ""
    }
    {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "status": "ok",
    "status_description": ""
    }
    {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "status": "ok",
    "status_description": ""
    }
    {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "ok": 234,
    "banned": 12,
    "errors": 3,
    }

    View full-size slide

  10. Consumer 1
    {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "ok": 234,
    "banned": 12,
    "errors": 3,
    }
    Consumer 2
    {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "ok": 250,
    "banned": 3,
    "errors": 0,
    }
    Consumer 3
    {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "ok": 0,
    "banned": 124,
    "errors": 84,
    }

    View full-size slide

  11. INSERT INTO stats (
    date,
    user,
    hostname,
    ok,
    ban,
    error
    )
    VALUES (
    :date,
    :user,
    :hostname,
    :ok,
    :ban,
    :error
    )
    ON DUPLICATE KEY UPDATE
    ok = ok + VALUES(ok),
    ban = ban + VALUES(ban),
    error = error + VALUES(error);

    View full-size slide

  12. {
    "user": "sarkhipov",
    "hostname": "rannts.ru",
    "status": "ok",
    "status_description": "",
    "response_time": 2861,
    }

    View full-size slide

  13. (20 + 10) + 11 = (20 + 11) + 10

    View full-size slide

  14. $F(x) = P(X \le x)$
    $\begin{cases} P(X \le x_\alpha) \ge \alpha \\ P(X \ge x_\alpha) \ge 1 - \alpha \end{cases}$

    View full-size slide

  15. collector
    collector
    collector
    pworker
    pworker
    pworker

    View full-size slide

  16. var memCount = 75604275;
    var memPerSec = 1.38176367782;
    function updateCount() {
    next = -(1000 / memPerSec) *
    Math.log(Math.random());
    memCountString = ''+memCount;
    len = memCountString.length;
    memCountString = memCountString.substr(0, len
    - 6) + ’ < span style = ”font - size: 8 px” > <
    /span>’+memCountString.substr(len-6,3)+‘ < span
    style = ”font - size: 8 px” > <
    /span>’+memCountString.substr(len-3,3);
    ge(‘memCount’).innerHTML = memCountString;
    memCount = memCount + 1;
    setTimeout(updateCount, next);
    }
    addEvent(window, ‘load’, updateCount);

    View full-size slide

  17. 3500
    3671
    3400
    3502
    3463
    3371
    3607
    6012
    6168
    6211
    6017

    View full-size slide

  18. 3500 3507
    3671 3667
    3400 3410
    3502 3502
    3463 3466
    3371 3330
    3607 3599
    6012 6009
    6168 6152
    6211 6215
    6017 6016

    View full-size slide

  19. Count-Min Sketch
    0 0 0 0 0
    0 0 0 0 0
    0 0 0 0 0
    0 0 0 0 0
    0 0 0 0 0

    View full-size slide

  20. Count-Min Sketch
    0 0 0 0 0
    0 0 0 0 0
    0 0 0 0 0
    0 0 0 0 0
    0 0 0 0 0

    View full-size slide

  21. Count-Min Sketch
    0 0 1 0 0
    0 0 0 1 0
    0 0 0 1 0
    0 1 0 0 0
    0 0 0 0 1

    View full-size slide

  22. Count-Min Sketch
    32 11 1 18 200
    126 184 78 1 0
    91 59 30 24 8
    82 76 34 48 72
    11 200 129 136 14

    View full-size slide

  23. Count-Min Sketch
    32 11 1 18 200
    126 184 78 1 0
    91 59 30 24 8
    82 76 34 48 72
    11 200 129 136 14

    View full-size slide

  24. MinHash
    $J(A, B) = \frac{|A \cap B|}{|A \cup B|}$
    $k = \left[ \frac{1}{\varepsilon^2} \right]$

    View full-size slide

  25. HyperLogLog
    $010010000110010101101100011011000110111100100001_b$
    $2^6 = 64$
    $1001_b = 9$
    $100001_b = 33$
    $\sigma = \frac{1.04}{\sqrt{2^k}}$
    $E = \frac{\alpha(k)\, 4^k}{\sum_j 2^{-M_j}}$

    View full-size slide

  26. t-digest
    $X = x_1, x_2, \ldots, x_n$
    $X = \{s_1, s_2, \ldots, s_m\}$
    $s_i = \{x_{\mathrm{left}(i)}, \ldots, x_{\mathrm{right}(i)}\}$

    View full-size slide

  27. t-digest
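    $k(q, \delta) \overset{\mathrm{def}}{=} \delta \left( \frac{\sin^{-1}(2q - 1)}{\pi} + \frac{1}{2} \right)$
    $K(i) \overset{\mathrm{def}}{=} k\left( \frac{\mathrm{right}(i)}{n}, \delta \right) - k\left( \frac{\mathrm{left}(i) - 1}{n}, \delta \right)$
    $K(i) \le 1$
    $K(i) + K(i + 1) > 1$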

    View full-size slide

  28. collector
    collector
    collector
    pworker
    pworker
    pworker

    View full-size slide