Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Building A Firehose - PHPNW

Ian Barber
October 06, 2012

Building A Firehose - PHPNW

#phpnw12 version of my talk on building firehose-style streaming data systems

Ian Barber

October 06, 2012
Tweet

More Decks by Ian Barber

Other Decks in Technology

Transcript

  1. +Ian Barber - [email protected] - @ianbarber
    https://github.com/ianbarber/Firehose-PHP-Talk
    BUILDING A FIREHOSE

    View full-size slide

  2. FILTERABLE
    REAL TIME
    STREAMING
    DATA

    View full-size slide

  3. SELLING
    DATA
    ANALYSIS
    & DECISIONS
    USER
    TOOLS
    $£¥ ☑☒

    View full-size slide

  4. DATA SOURCES
    COMPOSE
    latency
    AUGMENT
    STORE
    FILTER
    STREAM

    View full-size slide

  5. EVENT
    SAMPLE
    order
    tweet
    temperature
    snapshot

    View full-size slide

  6. Data Source Data Source Data Source
    Output

    View full-size slide

  7. Data Source Data Source Data Source
    Output Output

    View full-size slide

  8. Data Source Data Source Data Source
    Output
    Messaging
    Batch HTTP Logs
    HTTP Chunked Websockets
    Batched POST

    View full-size slide

  9. APACHE
    PHP
    APACHE
    PHP
    NODE.JS
    PUSH
    ZEROMQ
    PULL
    HTTP POST
    WEBSOCKETS

    View full-size slide

  10. APACHE
    PHP
    APACHE
    PHP
    HTTP POST
    // Report the browser's current position to the firehose once a minute.
    function sendPos() {
        // POST the coordinates handed over by a successful geolocation lookup.
        var postPosition = function (pos) {
            var coords = {
                lat: pos.coords.latitude,
                lon: pos.coords.longitude
            };
            $.ajax({
                type: 'POST',
                url: 'http://firehose.com/input.php',
                data: coords
            });
        };
        navigator.geolocation.getCurrentPosition(postPosition);
        // Re-arm the timer so the next report is attempted in 60 seconds.
        setTimeout(sendPos, 60000);
    }
    sendPos();
    location.php

    View full-size slide

  11. APACHE
    PHP
    APACHE
    PHP
    PUSH
    ZEROMQ
    WEBSOCKETS
    // Forward the posted position to the collector listening on port 5566
    // through a ZeroMQ PUSH socket.
    $context = new ZMQContext();
    $push = $context->getSocket(ZMQ::SOCKET_PUSH);
    $push->connect("tcp://localhost:5566");
    // The event carries a fresh message id, the user id cookie and the
    // coordinates from the POST body.
    $event = array();
    $event['id'] = get_next_msg_id();
    $event['uid'] = $_COOKIE['uid'];
    $event['lat'] = $_POST['lat'];
    $event['lon'] = $_POST['lon'];
    $push->send(json_encode($event));
    input.php

    View full-size slide

  12. APACHE
    PHP
    APACHE
    PHP
    NODE.JS
    ZEROMQ PULL
    WEBSOCKETS
    // HTTP server with socket.io attached, plus a ZeroMQ PULL socket for
    // incoming messages. `handler` (the HTTP request callback) is defined
    // outside this snippet.
    var app = require('http').createServer(handler),
        io = require('socket.io').listen(app),
        zmq = require('zmq'),
        sock = zmq.socket('pull');
    app.listen(8080);
    sock.bind('tcp://*:5566');
    sock.on('message', function (msg) {
        var data = JSON.parse(msg);
        // Send the decoded message to all clients. The original emitted an
        // undefined `event` variable here instead of the parsed data.
        io.sockets.emit("position", data);
    });
    output.js

    View full-size slide

  13. PHP
    DAEMON
    PHP
    DAEMON
    NODE.JS
    PUSH
    ZEROMQ
    PULL
    HTTP STREAM
    WEBSOCKETS
    // Open the Twitter streaming filter endpoint with the credentials embedded
    // in the URL. The URL is assembled by concatenation so that no line break
    // ends up inside the string literal.
    // NOTE(review): $search is inserted as-is; confirm the caller URL-encodes it.
    $url = "https://" . $user . ":" . $pass
         . "@stream.twitter.com/1/statuses/filter.json?track=" . $search;
    $fh = fopen($url, "r");
    if ($fh === false) {
        // Without this guard feof() never reports EOF and the loop spins.
        die("unable to open twitter stream\n");
    }
    while (!feof($fh)) {
        $d = fgets($fh);
        // Forward only lines longer than 4 bytes (presumably this skips
        // blank keep-alive lines); a failed read is skipped as well.
        if ($d !== false && strlen($d) > 4) {
            $sock->send($d);
        }
    }
    fclose($fh);
    twitter.php

    View full-size slide

  14. Data Source Data Source
    Output
    Assemble
    Process Process

    View full-size slide

  15. SOURCE
    ASSEMBLE
    PHP PHP
    ZEROMQ PUB
    SUB
    SUB SUB
    REDIS
    ZEROMQ
    PUSH

    View full-size slide

  16. SOURCE PHP
    ZEROMQ
    PUB SUB
    REDIS
    // Subscribe to everything published on port 5577.
    $ctx = new ZMQContext();
    $sub = $ctx->getSocket(ZMQ::SOCKET_SUB);
    $sub->setSockOpt(ZMQ::SOCKOPT_SUBSCRIBE, "");   // empty prefix: no filtering
    $sub->connect("tcp://localhost:5577");
    while ($dat = $sub->recv()) {
        // $dat is the raw JSON text, so decode it once and take the id from
        // the decoded array; indexing the string with 'id' cannot work.
        $msg = json_decode($dat, true);
        if (!is_array($msg) || !isset($msg['id'])) {
            continue;   // nothing to key the result on
        }
        $aug = augment($msg, $obj);
        // Push the augmentation onto the Redis list named after the message id.
        $redis->lpush($msg['id'], json_encode($aug));
    }
    augmentor.php

    View full-size slide

  17. $mongo = new Mongo();
    // Read from the $mongo connection created on the previous line; the
    // original referenced an undefined $m.
    $collection = $mongo->starbucks->locations;
    /**
     * Look up the stored location nearest to a message's coordinates.
     *
     * @param array $data       message with 'lon' and 'lat' entries
     * @param mixed $collection object queried through findOne()
     * @return array            array('name' => 'starbucks', 'val' => street),
     *                          with 'val' null when nothing was found
     */
    function augment($data, $collection) {
        // The point is passed as (longitude, latitude).
        $loc = array((float) $data['lon'], (float) $data['lat']);
        $res = $collection->findOne(array('loc' => array('$near' => $loc)));
        // findOne() may come back empty; avoid indexing a null result.
        $street = isset($res['street']) ? $res['street'] : null;
        return array('name' => 'starbucks', 'val' => $street);
    }
    SOURCE PHP REDIS
    starbucks.php
    DB

    View full-size slide

  18. $ld = new Text_LanguageDetect();
    $ld->setNameMode(2);
    /**
     * Tag a message with the language detected for its text.
     *
     * @param array $data message with a 'text' entry
     * @param mixed $ld   detector object exposing detect($text, $limit)
     * @return array      array('name' => 'lang', 'val' => key of the first result)
     */
    function augment($data, $ld) {
        // Sample detect() result: ["en"]=> float(0.24702222222222)
        $detected = $ld->detect($data['text'], 1);
        $lang = key($detected);
        return array('name' => 'lang', 'val' => $lang);
    }
    SOURCE PHP REDIS
    langdetect.php

    View full-size slide

  19. $zk = new Zookeeper();
    $zk->connect("localhost:2181");
    SOURCE
    ASSEMBLE PHP
    PHP
    REDIS PHP
    ZOOKEEPER
    COUNT OF SERVICES

    View full-size slide

  20. $zk->create(
    $path . "/" . uniqid(), NULL,
    array( array(
    "perms" => Zookeeper::PERM_ALL,
    "scheme" => "world",
    "id" => "anyone")),
    Zookeeper::EPHEMERAL);
    PHP ZOOKEEPER
    augmentor.php

    View full-size slide

  21. REASSEMBLE
    SOURCE
    REDIS
    ZOOKEEPER
    define("TIMEOUT", 5);
    $ch = $zk->getChildren("/services");
    $servs = count($ch);
    COUNT

    View full-size slide

  22. REASSEMBLE
    // For every source message, collect augmentations from Redis until all
    // $servs services have answered or the wait budget is used up, then
    // forward the combined message.
    while ($raw = $sub->recv()) {
        // The socket delivers JSON text; decode it so 'id' and 'aug' are
        // real array entries.
        $dat = json_decode($raw, true);
        if (!is_array($dat) || !isset($dat['id'])) {
            continue;
        }
        if (!isset($dat['aug'])) {
            $dat['aug'] = array();
        }
        // Fresh budget (in seconds) for each message; TIMEOUT comes from the
        // script's define("TIMEOUT", 5).
        $time = TIMEOUT;
        do {
            $start = microtime(true);
            // BRPOP treats a timeout of 0 as "block forever", so round the
            // remaining budget up to a whole second.
            $aug = $redis->brpop($dat['id'], (int) ceil($time));
            if (!empty($aug)) $dat['aug'][] = $aug;
            $time -= microtime(true) - $start;
        } while ($time > 0 &&
                 count($dat['aug']) != $servs);
        $out->send(json_encode($dat)); //forward
    }
    COUNT
    reassemble.php

    View full-size slide

  23. Data Source Data Source
    Output
    Assemble
    Process Process
    Filter Filter Filter

    View full-size slide

  24. FILTER
    ELASTIC SEARCH
    QUERY - NAME
    QUERY - NAME
    MSG
    ?
    ?
    MSG
    ZEROMQ SUB
    ZEROMQ PUB
    TOPIC
    MSG
    TOPIC
    MSG
    TOPIC
    MSG
    ZEROMQ PULL
    QUERY - NAME
    MSG
    HTTP / REST

    View full-size slide

  25. ELASTIC SEARCH
    QUERY - NAME
    QUERY - NAME
    MSG
    MSG
    HTTP / REST
    /**
     * Issue an HTTP request to the search server and decode the JSON reply.
     *
     * @param string $server base URL of the server
     * @param string $path   request path, joined to $server with a single '/'
     * @param array  $param  options for the 'http' stream context (the visible
     *                       caller passes a 'content' entry)
     * @return mixed         json_decode() of the response body, or null when
     *                       the request fails
     */
    function escall($server, $path, $param) {
        // The original used undefined $http and $serv; the options come from
        // $param and the host from $server.
        $context = stream_context_create(array('http' => $param));
        // Join with exactly one slash even when $path already starts with one.
        $url = rtrim($server, '/') . '/' . ltrim($path, '/');
        $result = file_get_contents($url, false, $context);
        if ($result === false) {
            return null;
        }
        return json_decode($result);
    }
    elasticsearch.php

    View full-size slide

  26. ELASTIC SEARCH
    QUERY - NAME
    QUERY - NAME
    MSG
    MSG
    HTTP / REST
    /**
     * Run a tweet through the percolator and return the reply's matches.
     *
     * @param string       $host  server handed to escall()
     * @param string       $path  ignored; the percolate endpoint is fixed below
     * @param array|object $tweet decoded tweet exposing a 'text' field
     * @return array              the 'matches' entry of the reply, or an empty
     *                            array when the reply has none
     */
    function percolate($host, $path, $tweet) {
        $path = "/twitter/tweet/_percolate";
        // Accept either json_decode() flavour (array or object) for the tweet.
        $text = is_array($tweet) ? $tweet['text'] : $tweet->text;
        $body = array('doc' => array('tweet' => $text));
        $match = escall($host, $path,
            array('content' => json_encode($body)));
        // escall() decodes without the assoc flag, so the reply is an object
        // and array access on it fails; read it as an object, but still
        // tolerate an array reply.
        if (is_array($match)) {
            return isset($match['matches']) ? $match['matches'] : array();
        }
        return isset($match->matches) ? $match->matches : array();
    }
    elasticsearch.php

    View full-size slide

  27. // snip... creating in, ctl, out ZMQ socks
    $poll = new ZMQPoll();
    $poll->add($in, ZMQ::POLL_IN);
    $poll->add($ctl, ZMQ::POLL_IN);
    $read = $write = array();
    FILTER
    MSG
    ZEROMQ PULL
    QUERY - NAME
    ZEROMQ SUB
    elasticsearch.php

    View full-size slide

  28. while(true) {
    $ev = $poll->poll($read, $write, -1);
    // Handle every socket reported readable, not only the first entry.
    foreach ($read as $ready) {
        if ($ready === $in) {
            $raw = $in->recv();
            // Decode to an array for percolate(), but keep the raw text:
            // it is what gets forwarded as the second message part.
            $msg = json_decode($raw, true);
            // percolate() takes ($host, $path, $tweet); the path argument
            // is overwritten inside it, so NULL is passed.
            $matches = percolate($host, NULL, $msg);
            foreach ((array) $matches as $match) {
                $out->sendMulti(array($match, $raw));
            }
        } else if ($ready === $ctl) {
            // Decode as an array so the key lookups below work.
            $q = json_decode($ctl->recv(), true);
            $name = $q['name'];
            $query = $q['query'];
            add_query($host, $name, $query);
        }
    }
    }
    elasticsearch.php

    View full-size slide

  29. Data Source
    Output
    Queue Process
    Filter
    Data
    Store
    Data
    Store

    View full-size slide

  30. STORE
    PHP
    KAFKA
    TOPIC
    TOPIC
    1 2 3 4
    1 2 3 4
    SUB
    APACHE
    PHP
    CLIENT
    HTTP GET
    TOPIC - OFFSET

    View full-size slide

  31. PHP
    KAFKA
    TOPIC
    TOPIC
    1 2 3 4
    1 2 3 4
    SUB
    // Hand each (topic, message) pair received on $in to the Kafka producer
    // connected to localhost:9092.
    $k = new Kafka_Producer("localhost", 9092);
    while ($data = $in->recvMulti()) {
        // First part is the topic, second part the message body.
        list($topic, $msg) = $data;
        $bytes = $k->send(array($msg), $topic);
    }
    kafkastore.php

    View full-size slide

  32. $consumer = new Kafka_SimpleConsumer(
    'localhost', 9092, 1, $max);
    // Start at the requested offset and advance the same variable that each
    // fetch request uses. The original fetched at $os every time while
    // incrementing a separate $offset, so it re-read the same data forever.
    $offset = $os;
    do {
        $msgs = $consumer->fetch(
            new Kafka_FetchRequest($top, 0, $offset, $max)
        );
        foreach ($msgs as $msg) {
            echo $msg->payload(), "\n";
        }
        $offset += $msgs->validBytes();
    } while ($msgs->validBytes() > 0);
    // Emit the final offset after the payload lines.
    echo json_encode(array("offset" => $offset));
    kafkaconsume.php
    KAFKA
    TOPIC
    TOPIC
    1 2 3 4
    1 2 3 4
    APACHE
    PHP
    CLIENT
    GET

    View full-size slide

  33. JSON & MSGPACK
    $data = array('id'=>1,'a'=>'a','b'=>'xyz',
    'c' => array(1, 2, "abcdefg",
    array(5, 7, 8)));
    $enc = json_encode($data);
    var_dump( json_decode($enc) );
    $enc = msgpack_pack($data);
    var_dump( msgpack_unpack($enc) );
    JSON
    MSGPACK
    MSGPACK
    JSON

    View full-size slide

  34. Data Source
    Output
    Queue Process
    Filter
    Data
    Store
    Tap
    Trace
    Trace
    Trace

    View full-size slide

  35. Data Source
    Output
    See Also: http://slidesha.re/JaWE78

    View full-size slide

  36. +Ian Barber - [email protected] - @ianbarber
    https://github.com/ianbarber/Firehose-PHP-Talk
    THANKS!

    View full-size slide