Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Building a Firehose

Building a Firehose

My talk about dealing with streaming data in PHP, from PHP Tek 12.

Ian Barber

May 24, 2012
Tweet

More Decks by Ian Barber

Other Decks in Technology

Transcript

  1. ian barber - [email protected] - @ianbarber
    https://github.com/ianbarber/Firehose-PHP-Talk
    BUILDING A FIREHOSE

    View full-size slide

  2. FILTERABLE
    REAL TIME
    STREAMING
    DATA

    View full-size slide

  3. SELLING
    DATA
    ANALYSIS
    & DECISIONS
    USER
    TOOLS
    $£¥ ☑☒

    View full-size slide

  4. DATA SOURCES
    COMPOSE
    latency
    AUGMENT
    STORE
    FILTER
    STREAM

    View full-size slide

  5. EVENT
    SAMPLE
    order
    tweet
    temperature
    snapshot

    View full-size slide

  6. Data Source Data Source Data Source
    Output

    View full-size slide

  7. Data Source Data Source Data Source
    Output Output

    View full-size slide

  8. Data Source Data Source Data Source
    Output
    Messaging
    Batch HTTP Logs
    HTTP Chunked Websockets
    Batched POST

    View full-size slide

  9. APACHE
    PHP
    APACHE
    PHP
    NODE.JS
    PUSH
    ZEROMQ
    PULL
    HTTP POST
    WEBSOCKETS

    View full-size slide

  10. APACHE
    PHP
    APACHE
    PHP
    HTTP POST
    // Report the browser's current position to the firehose, then re-arm
    // a timer so the report repeats every 60 seconds.
    function sendPos() {
      // Invoked by the geolocation API once a position is available.
      var postPosition = function (pos) {
        var payload = {
          lat: pos.coords.latitude,
          lon: pos.coords.longitude
        };
        $.ajax({
          type: 'POST',
          url: 'http://firehose.com/input.php',
          data: payload
        });
      };
      navigator.geolocation.getCurrentPosition(postPosition);
      setTimeout(sendPos, 60000);
    }
    // Kick off the first report immediately.
    sendPos();
    location.php

    View full-size slide

  11. APACHE
    PHP
    APACHE
    PHP
    PUSH
    ZEROMQ
    WEBSOCKETS
    // Forward the posted position onto a ZeroMQ PUSH socket as one JSON message.
    $ctx = new ZMQContext();
    $sock = $ctx->getSocket(ZMQ::SOCKET_PUSH);
    $sock->connect("tcp://localhost:5566");

    // Build the message field by field: id, uid, lat, lon.
    $data = array();
    $data['id'] = get_next_msg_id();
    $data['uid'] = $_COOKIE['uid'];
    $data['lat'] = $_POST['lat'];
    $data['lon'] = $_POST['lon'];

    $sock->send(json_encode($data));
    input.php

    View full-size slide

  12. APACHE
    PHP
    APACHE
    PHP
    NODE.JS
    ZEROMQ PULL
    WEBSOCKETS
    // Bridge ZeroMQ to the browser: pull messages off the ZeroMQ socket and
    // broadcast each one to every connected socket.io client.
    // NOTE(review): `handler` (the HTTP request callback) is not visible in
    // this snippet; it has to be defined elsewhere in output.js.
    var app = require('http').createServer(handler),
        io = require('socket.io').listen(app),
        zmq = require('zmq'),
        sock = zmq.socket('pull');

    app.listen(8080);
    sock.bind('tcp://*:5566');

    sock.on('message', function (msg) {
      var data = JSON.parse(msg);
      // send to all clients: emit the parsed message, the only payload in scope
      io.sockets.emit("position", data);
    });
    output.js

    View full-size slide

  13. PHP
    DAEMON
    PHP
    DAEMON
    NODE.JS
    PUSH
    ZEROMQ
    PULL
    HTTP STREAM
    WEBSOCKETS
    // Read the Twitter filter stream line by line and forward every
    // non-trivial line to the ZeroMQ socket.
    // The URL is assembled on one logical line: a line break inside the
    // quoted string would put a newline and spaces into the request path.
    $url = "https://".$user.":".$pass
        ."@stream.twitter.com/1/statuses/filter.json?track=".$search;
    $fh = fopen($url, "r");
    if ($fh === false) {
        // feof() on a failed handle never reports end-of-file, so bail out
        // here instead of spinning in the loop below.
        exit(1);
    }
    while(!feof($fh)) {
        $d = fgets($fh);
        // Skip read failures and lines of 4 bytes or fewer
        // (presumably keep-alive newlines; confirm against the stream format).
        if($d !== false && strlen($d) > 4) {
            $sock->send($d);
        }
    }
    fclose($fh);
    twitter.php

    View full-size slide

  14. Data Source Data Source
    Output
    Assemble
    Process Process

    View full-size slide

  15. SOURCE
    ASSEMBLE
    PHP PHP
    ZEROMQ PUB
    SUB
    SUB SUB
    REDIS
    ZEROMQ
    PUSH

    View full-size slide

  16. SOURCE PHP
    ZEROMQ
    PUB SUB
    REDIS
    // Subscribe to the source feed, run every message through augment(), and
    // push the result onto the Redis list keyed by the message id.
    $ctx = new ZMQContext();
    $sub = $ctx->getSocket(ZMQ::SOCKET_SUB);
    // An empty subscription prefix matches every message.
    $sub->setSockOpt(ZMQ::SOCKOPT_SUBSCRIBE,"");
    $sub->connect("tcp://localhost:5577");
    while( $dat = $sub->recv() ) {
        // Decode once: the id lives inside the JSON payload, not in the raw string.
        $msg = json_decode($dat, true);
        if (!is_array($msg) || !isset($msg['id'])) {
            continue; // not a usable message; nothing to key the result on
        }
        $aug = augment($msg, $obj);
        $redis->lpush($msg['id'], json_encode($aug));
    }
    augmentor.php

    View full-size slide

  17. $mongo = new Mongo();
    // Locations collection, taken from the $mongo connection created above.
    $collection = $mongo->starbucks->locations;

    /**
     * Augmentor: look up a Starbucks location near the message's position.
     *
     * @param array  $data       message with 'lon' and 'lat' entries
     * @param object $collection collection queried with a '$near' filter on 'loc'
     * @return array array('name' => 'starbucks', 'val' => street of the match,
     *               or null when nothing was found)
     */
    function augment($data, $collection) {
        // The query point is given as (lon, lat), both cast to float.
        $loc = array((float) $data['lon'],
            (float) $data['lat']);
        $res = $collection->findOne(array(
            'loc' => array('$near' => $loc)));
        return array('name' => 'starbucks',
            'val' => isset($res['street']) ? $res['street'] : null);
    }
    SOURCE PHP REDIS
    starbucks.php
    DB

    View full-size slide

  18. $ld = new Text_LanguageDetect();
    $ld->setNameMode(2);
    /**
     * Augmentor: label a message with its detected language.
     *
     * @param array  $data message; its 'text' entry is handed to the detector
     * @param object $ld   detector exposing detect($text, $limit)
     * @return array array('name' => 'lang', 'val' => first key of the detect() result)
     */
    function augment($data, $ld) {
        /* detect() result looks like ["en"]=> float(0.24702222222222) */
        $candidates = $ld->detect($data['text'], 1);
        $language = key($candidates);

        $result = array();
        $result['name'] = 'lang';
        $result['val'] = $language;
        return $result;
    }
    SOURCE PHP REDIS
    langdetect.php

    View full-size slide

  19. $zk = new Zookeeper();
    $zk->connect("localhost:2181");
    SOURCE
    ASSEMBLE PHP
    PHP
    REDIS PHP
    ZOOKEEPER
    COUNT OF SERVICES

    View full-size slide

  20. $zk->create(
    $path . "/" . uniqid(), NULL,
    array( array(
    "perms" => Zookeeper::PERM_ALL,
    "scheme" => "world",
    "id" => "anyone")),
    Zookeeper::EPHEMERAL);
    PHP ZOOKEEPER
    augmentor.php

    View full-size slide

  21. REASSEMBLE
    SOURCE
    REDIS
    ZOOKEEPER
    // NOTE(review): presumably the per-message wait budget in seconds; confirm
    // where the reassemble loop reads it.
    define("TIMEOUT", 5);
    // The number of child nodes under /services is used as the service count.
    $ch = $zk->getChildren("/services");
    $servs = count($ch);
    COUNT

    View full-size slide

  22. REASSEMBLE
    // For each source message, collect the entries pushed onto the Redis list
    // keyed by the message id, then forward the combined message. Collection
    // stops once $servs entries have arrived or TIMEOUT seconds have elapsed.
    while($raw = $sub->recv()){
        // The id lives inside the JSON payload, so decode before using it.
        $dat = json_decode($raw, true);
        if (!is_array($dat) || !isset($dat['id'])) {
            continue;
        }
        $dat['aug'] = array();
        $time = TIMEOUT; // fresh time budget for every message
        do {
            $start = microtime(true);
            // Round up: a timeout of 0 tells BRPOP to block indefinitely.
            $aug = $redis->brpop($dat['id'], (int) ceil($time));
            if(!empty($aug)) $dat['aug'][] = $aug;
            $time -= microtime(true) - $start;
        } while($time > 0 &&
            count($dat['aug']) != $servs);
        $out->send(json_encode($dat)); //forward
    }
    COUNT
    reassemble.php

    View full-size slide

  23. Data Source Data Source
    Output
    Assemble
    Process Process
    Filter Filter Filter

    View full-size slide

  24. FILTER
    ELASTIC SEARCH
    QUERY - NAME
    QUERY - NAME
    MSG
    ?
    ?
    MSG
    ZEROMQ SUB
    ZEROMQ PUB
    TOPIC
    MSG
    TOPIC
    MSG
    TOPIC
    MSG
    ZEROMQ PULL
    QUERY - NAME
    MSG
    HTTP / REST

    View full-size slide

  25. ELASTIC SEARCH
    QUERY - NAME
    QUERY - NAME
    MSG
    MSG
    HTTP / REST
    /**
     * Issue an HTTP request to the search server and decode the JSON reply.
     *
     * @param string $server base URL of the server
     * @param string $path   request path; joined to $server with a single '/'
     * @param array  $param  options for the 'http' stream context
     *                       (for example 'content')
     * @return mixed decoded reply in json_decode()'s default object form,
     *               or null when the request failed
     */
    function escall($server, $path, $param) {
        // $param carries the http context options for this request.
        $context = stream_context_create(
            array('http' => $param));
        // ltrim keeps a leading '/' in $path from producing '//' in the URL.
        $result = file_get_contents(
            $server.'/'.ltrim($path, '/'), NULL, $context);
        if ($result === false) {
            return null;
        }
        return json_decode( $result );
    }
    elasticsearch.php

    View full-size slide

  26. ELASTIC SEARCH
    QUERY - NAME
    QUERY - NAME
    MSG
    MSG
    HTTP / REST
    /**
     * Run a tweet through the percolator and return the names of the
     * registered queries it matches.
     *
     * @param string       $host  server passed through to escall()
     * @param string       $path  percolate endpoint; falls back to
     *                            "/twitter/tweet/_percolate" when not a
     *                            non-empty string
     * @param array|object $tweet decoded tweet with a 'text' field
     * @return array the reply's 'matches' entry, or an empty array when the
     *               reply has none
     */
    function percolate($host, $path, $tweet) {
        if (!is_string($path) || $path === '') {
            $path = "/twitter/tweet/_percolate";
        }
        // Accept either json_decode() form (array or object) for the tweet.
        $text = is_object($tweet) ? $tweet->text : $tweet['text'];
        $doc = array('doc' => array(
            'tweet' => $text));
        $match = escall($host, $path,
            array('content' =>
                json_encode($doc)));
        // escall() hands back json_decode()'s result; read 'matches' from
        // whichever form it takes and treat a failed call as "no matches".
        if (is_object($match) && isset($match->matches)) {
            return $match->matches;
        }
        if (is_array($match) && isset($match['matches'])) {
            return $match['matches'];
        }
        return array();
    }
    elasticsearch.php

    View full-size slide

  27. // snip... creating in, ctl, out ZMQ socks
    // Poll set watching both the message socket and the control socket
    // for readability.
    $poll = new ZMQPoll();
    $poll->add($in, ZMQ::POLL_IN);
    $poll->add($ctl, ZMQ::POLL_IN);
    // Arrays later handed to $poll->poll() to receive the ready sockets.
    $read = $write = array();
    FILTER
    MSG
    ZEROMQ PULL
    QUERY - NAME
    ZEROMQ SUB
    elasticsearch.php

    View full-size slide

  28. while(true) {
    // Block (timeout -1) until at least one socket is readable.
    $ev = $poll->poll($read, $write, -1);
    // Service every readable socket, not only the first one reported.
    foreach($read as $ready) {
        if($ready === $in) {
            // New message: find the matching queries and publish the
            // message once per match, with the query name as first frame.
            $raw = $in->recv();
            $msg = json_decode($raw, true); // array form, as percolate() indexes it
            $matches = percolate($host, "/twitter/tweet/_percolate", $msg);
            foreach($matches as $match) {
                // Forward the original JSON text; frames must be strings.
                $out->sendMulti(array($match, $raw));
            }
        } else if($ready === $ctl) {
            // Control message: register a new named query.
            $q = json_decode($ctl->recv(), true);
            $name = $q['name'];
            $query = $q['query'];
            add_query($host, $name,$query);
        }
    }
    }
    elasticsearch.php

    View full-size slide

  29. Data Source
    Output
    Queue Process
    Filter
    Data
    Store
    Data
    Store

    View full-size slide

  30. STORE
    PHP
    KAFKA
    TOPIC
    TOPIC
    1 2 3 4
    1 2 3 4
    SUB
    APACHE
    PHP
    CLIENT
    HTTP GET
    TOPIC - OFFSET

    View full-size slide

  31. PHP
    KAFKA
    TOPIC
    TOPIC
    1 2 3 4
    1 2 3 4
    SUB
    // Store every (topic, message) pair received on $in in Kafka.
    $k = new Kafka_Producer("localhost", 9092);
    while ($data = $in->recvMulti()) {
        // Frame 0 is the topic, frame 1 the message body.
        list($topic, $msg) = $data;
        $bytes = $k->send(array($msg), $topic);
    }
    kafkastore.php

    View full-size slide

  32. $consumer = new Kafka_SimpleConsumer(
    'localhost', 9092, 1, $max);
    // Starting at the requested offset ($os), print every payload until a
    // fetch returns no more bytes, then report the offset to resume from.
    $offset = $os;
    do {
        // Fetch from the advancing offset so each pass returns new messages
        // rather than the same batch again.
        $msgs = $consumer->fetch(
            new Kafka_FetchRequest($top,0,$offset,$max)
        );
        foreach($msgs as $msg)
            echo $msg->payload(), "\n";
        $offset += $msgs->validBytes();
    } while($msgs->validBytes() > 0);
    echo json_encode(array("offset"=>$offset));
    kafkaconsume.php
    KAFKA
    TOPIC
    TOPIC
    1 2 3 4
    1 2 3 4
    APACHE
    PHP
    CLIENT
    GET

    View full-size slide

  33. JSON & MSGPACK
    // Sample structure mixing scalars with a nested array.
    $data = array('id'=>1,'a'=>'a','b'=>'xyz',
    'c' => array(1, 2, "abcdefg",
    array(5, 7, 8)));
    // Round-trip through JSON and dump the decoded value.
    $enc = json_encode($data);
    var_dump( json_decode($enc) );
    // Same round-trip through MsgPack for comparison.
    $enc = msgpack_pack($data);
    var_dump( msgpack_unpack($enc) );
    JSON
    MSGPACK
    MSGPACK
    JSON

    View full-size slide

  34. Data Source
    Output
    Queue Process
    Filter
    Data
    Store
    Tap
    Trace
    Trace
    Trace

    View full-size slide

  35. Data Source
    Output
    See Also: http://slidesha.re/JaWE78

    View full-size slide

  36. ian barber - [email protected] - @ianbarber
    https://github.com/ianbarber/Firehose-PHP-Talk
    THANKS!

    View full-size slide