Building A Firehose - PHPNW

Ian Barber
October 06, 2012

Building A Firehose - PHPNW

The #phpnw12 version of my talk on building firehose-style streaming data systems.

Ian Barber

October 06, 2012
Tweet

Transcript

  1. +Ian Barber - ianb@php.net - @ianbarber https://github.com/ianbarber/Firehose-PHP-Talk BUILDING A FIREHOSE

  2. FILTERABLE REAL TIME STREAMING DATA

  3. SELLING DATA ANALYSIS & DECISIONS USER TOOLS $£¥ ☑☒

  4. DATA SOURCES COMPOSE latency AUGMENT STORE FILTER STREAM

  5. EVENT SAMPLE order tweet temperature snapshot

  6. Data Source Data Source Data Source Output

  7. Data Source Data Source Data Source Output Output

  8. Data Source Data Source Data Source Output Messaging Batch HTTP

    Logs HTTP Chunked Websockets Batched POST
  9. APACHE PHP APACHE PHP NODE.JS PUSH ZEROMQ PULL HTTP POST

    WEBSOCKETS
  10. APACHE PHP APACHE PHP HTTP POST function sendPos() { navigator.geolocation.getCurrentPosition(

    function(pos) { $.ajax({ type: 'POST', url:'http://firehose.com/input.php', data: {lat: pos.coords.latitude, lon: pos.coords.longitude}}); }); setTimeout(sendPos, 60000); } sendPos(); location.php
  11. APACHE PHP APACHE PHP PUSH ZEROMQ WEBSOCKETS $ctx = new

    ZMQContext(); $sock = $ctx->getSocket(ZMQ::SOCKET_PUSH); $sock->connect("tcp://localhost:5566"); $data = array( 'id' => get_next_msg_id(), 'uid' => $_COOKIE['uid'], 'lat' => $_POST['lat'], 'lon' => $_POST['lon'] ); $sock->send(json_encode($data)); input.php
  12. APACHE PHP APACHE PHP NODE.JS ZEROMQ PULL WEBSOCKETS app=require('http').createServer(handler), io

    = require('socket.io').listen(app), zmq = require('zmq'), sock = zmq.socket('pull'); app.listen(8080); sock.bind('tcp://*:5566'); sock.on('message', function (msg) { var data = JSON.parse(msg); // send to all clients io.sockets.emit("position", data); }); output.js
  13. PHP DAEMON PHP DAEMON NODE.JS PUSH ZEROMQ PULL HTTP STREAM

    WEBSOCKETS $fh = fopen("https://".$user.":". $pass."@stream.twitter.com/1/statuses/filter.json?track=".$search, "r"); while(!feof($fh)) { $d = fgets($fh); if(strlen($d) > 4) { $sock->send($d); } } twitter.php
  14. Data Source Data Source Output Assemble Process Process

  15. SOURCE ASSEMBLE PHP PHP ZEROMQ PUB SUB SUB SUB REDIS

    ZEROMQ PUSH
  16. SOURCE PHP ZEROMQ PUB SUB REDIS $ctx = new ZMQContext();

    $sub = $ctx->getSocket(ZMQ::SOCKET_SUB); $sub->setSockOpt(ZMQ::SOCKOPT_SUBSCRIBE,""); $sub->connect("tcp://localhost:5577"); while( $dat = $sub->recv() ) { $aug = augment(json_decode($dat,true),$obj); $redis->lpush($dat['id'],json_encode($aug)); } augmentor.php
  17. $mongo = new Mongo(); $collection = $mongo->starbucks->locations; function augment($data, $collection)

    { $loc = array((float) $data['lon'], (float) $data['lat']); $res = $collection->findOne(array( 'loc' => array('$near' => $loc))); return array('name' => 'starbucks', 'val' => $res['street']); } SOURCE PHP REDIS starbucks.php DB
  18. $ld = new Text_LanguageDetect(); $ld->setNameMode(2); function augment($data, $ld) { /*

    ["en"]=> float(0.24702222222222) */ $names = $ld->detect($data['text'], 1); return array('name' => 'lang', 'val' => key($names)); } SOURCE PHP REDIS langdetect.php
  19. $zk = new Zookeeper(); $zk->connect("localhost:2181"); SOURCE ASSEMBLE PHP PHP REDIS

    PHP ZOOKEEPER COUNT OF SERVICES
  20. $zk->create( $path . "/" . uniqid(), NULL, array( array( "perms"

    => Zookeeper::PERM_ALL, "scheme" => "world", "id" => "anyone")), Zookeeper::EPHEMERAL); PHP ZOOKEEPER augmentor.php
  21. REASSEMBLE SOURCE REDIS ZOOKEEPER define("TIMEOUT", 5); $ch = $zk->getChildren("/services"); $servs

    = count($ch); COUNT
  22. REASSEMBLE while($dat = $sub->recv()){ do { $start = microtime(true); $aug

    = $redis->brpop($dat['id'],$time); if(count($aug)) $dat['aug'][] = $aug; $time -= microtime(true) - $start; } while($time > 0 && count($dat['aug']) != $servs); $out->send(json_encode($dat)); //forward } COUNT reassemble.php
  23. Data Source Data Source Output Assemble Process Process Filter Filter

    Filter
  24. FILTER ELASTIC SEARCH QUERY - NAME QUERY - NAME MSG

    ? ? MSG ZEROMQ SUB ZEROMQ PUB TOPIC MSG TOPIC MSG TOPIC MSG ZEROMQ PULL QUERY - NAME MSG HTTP / REST
  25. ELASTIC SEARCH QUERY - NAME QUERY - NAME MSG MSG

    HTTP / REST function escall($server, $path, $param) { $context = stream_context_create( array('http' => $http)); $result = file_get_contents( $server.'/'.$path, NULL, $context); return json_decode( $result ); } elasticsearch.php
  26. ELASTIC SEARCH QUERY - NAME QUERY - NAME MSG MSG

    HTTP / REST function percolate($host, $path, $tweet) { $path = "/twitter/tweet/_percolate"; $tweet = array('doc' => array( 'tweet' => $tweet['text'])); $match = escall($host, $path, array('content' => json_encode($tweet))); return $match['matches']; } elasticsearch.php
  27. // snip... creating in, ctl, out ZMQ socks $poll =

    new ZMQPoll(); $poll->add($in, ZMQ::POLL_IN); $poll->add($ctl, ZMQ::POLL_IN); $read = $write = array(); FILTER MSG ZEROMQ PULL QUERY - NAME ZEROMQ SUB elasticsearch.php
  28. while(true) { $ev = $poll->poll($read, $write, -1); if($read[0] === $in)

    { $msg = json_decode( $in->recv() ); $matches = percolate($host, $msg); foreach($matches as $match) { $out->sendMulti(array($match, $msg)); } } else if($read[0] === $ctl) { $q = json_decode($ctl->recv()); $name = $q['name']; $query = $q['query']; add_query($host, $name,$query); } } elasticsearch.php
  29. Data Source Output Queue Process Filter Data Store Data Store

  30. STORE PHP KAFKA TOPIC TOPIC 1 2 3 4 1

    2 3 4 SUB APACHE PHP CLIENT HTTP GET TOPIC - OFFSET
  31. PHP KAFKA TOPIC TOPIC 1 2 3 4 1 2

    3 4 SUB $k = new Kafka_Producer("localhost", 9092); while ($data = $in->recvMulti()) { $topic = $data[0]; $msg = $data[1]; $bytes = $k->send(array($msg), $topic); } kafkastore.php
  32. $consumer = new Kafka_SimpleConsumer( 'localhost', 9092, 1, $max); do {

    $msgs = $consumer->fetch( new Kafka_FetchRequest($top,0,$os,$max) ); foreach($msgs as $msg) echo $msg->payload(), "\n"; $offset += $msgs->validBytes(); } while($msgs->validBytes() > 0); echo json_encode(array("offset"=>$offset)); kafkaconsume.php KAFKA TOPIC TOPIC 1 2 3 4 1 2 3 4 APACHE PHP CLIENT GET
  33. OPS

  34. JSON & MSGPACK $data = array('id'=>1,'a'=>'a','b'=>'xyz', 'c' => array(1, 2,

    "abcdefg", array(5, 7, 8))); $enc = json_encode($data); var_dump( json_decode($enc) ); $enc = msgpack_pack($data); var_dump( msgpack_unpack($enc) ); JSON MSGPACK MSGPACK JSON
  35. Data Source Output Queue Process Filter Data Store Tap Trace

    Trace Trace
  36. Data Source Output See Also: http://slidesha.re/JaWE78

  37. +Ian Barber - ianb@php.net - @ianbarber https://github.com/ianbarber/Firehose-PHP-Talk THANKS!