#phpnw12 version of my talk on building firehose style streaming data systems
+Ian Barber - [email protected] - @ianbarber
https://github.com/ianbarber/Firehose-PHP-Talk
BUILDING A FIREHOSE
View Slide
FILTERABLE REAL TIME STREAMING DATA
SELLING DATA ANALYSIS & DECISIONS USER TOOLS $£¥ ☑☒
DATA SOURCES COMPOSE latency AUGMENT STORE FILTER STREAM
EVENT SAMPLE order tweet temperature snapshot
Data Source Data Source Data Source Output
Data Source Data Source Data Source Output Output
Data Source Data Source Data Source Output Messaging Batch HTTP Logs HTTP Chunked Websockets Batched POST
APACHE PHP APACHE PHP NODE.JS PUSH ZEROMQ PULL HTTP POST WEBSOCKETS
APACHE PHP APACHE PHP HTTP POST
function sendPos() {
  navigator.geolocation.getCurrentPosition(function(pos) {
    $.ajax({ type: 'POST',
      url:'http://firehose.com/input.php',
      data: {lat: pos.coords.latitude,
        lon: pos.coords.longitude}
    });
  });
  setTimeout(sendPos, 60000);
}
sendPos();
location.php
APACHE PHP APACHE PHP PUSH ZEROMQ WEBSOCKETS
$ctx = new ZMQContext();
$sock = $ctx->getSocket(ZMQ::SOCKET_PUSH);
$sock->connect("tcp://localhost:5566");
$data = array('id' => get_next_msg_id(),
  'uid' => $_COOKIE['uid'],
  'lat' => $_POST['lat'],
  'lon' => $_POST['lon']);
$sock->send(json_encode($data));
input.php
APACHE PHP APACHE PHP NODE.JS ZEROMQ PULL WEBSOCKETS
app=require('http').createServer(handler),
io = require('socket.io').listen(app),
zmq = require('zmq'),
sock = zmq.socket('pull');
app.listen(8080);
sock.bind('tcp://*:5566');
sock.on('message', function (msg) {
  var data = JSON.parse(msg);
  // send to all clients
  io.sockets.emit("position", event);
});
output.js
PHP DAEMON PHP DAEMON NODE.JS PUSH ZEROMQ PULL HTTP STREAM WEBSOCKETS
$fh = fopen("https://".$user.":".$pass."@stream.twitter.com/1/statuses/filter.json?track=".$search, "r");
while(!feof($fh)) {
  $d = fgets($fh);
  if(strlen($d) > 4) {
    $sock->send($d);
  }
}
twitter.php
Data Source Data Source Output Assemble Process Process
SOURCE ASSEMBLE PHP PHP ZEROMQ PUB SUB SUB SUB REDIS ZEROMQ PUSH
SOURCE PHP ZEROMQ PUB SUB REDIS
$ctx = new ZMQContext();
$sub = $ctx->getSocket(ZMQ::SOCKET_SUB);
$sub->setSockOpt(ZMQ::SOCKOPT_SUBSCRIBE,"");
$sub->connect("tcp://localhost:5577");
while( $dat = $sub->recv() ) {
  $aug = augment(json_decode($dat,true),$obj);
  $redis->lpush($dat['id'],json_encode($aug));
}
augmentor.php
$mongo = new Mongo();
$collection = $m->starbucks->locations;
function augment($data, $collection) {
  $loc = array((float) $data['lon'],(float) $data['lat']);
  $res = $collection->findOne(array('loc' => array('$near' => $loc)));
  return array('name' => 'starbucks','val' => $res['street']);
}
SOURCE PHP REDIS starbucks.php DB
$ld = new Text_LanguageDetect();
$ld->setNameMode(2);
function augment($data, $ld) {
  /* ["en"]=> float(0.24702222222222) */
  $names = $ld->detect($data['text'], 1);
  return array('name' => 'lang','val' => key($names));
}
SOURCE PHP REDIS langdetect.php
$zk = new Zookeeper();
$zk->connect("localhost:2181");
SOURCE ASSEMBLE PHP PHP REDIS PHP ZOOKEEPER COUNT OF SERVICES
$zk->create($path . "/" . uniqid(), NULL,
  array( array("perms" => Zookeeper::PERM_ALL,
    "scheme" => "world","id" => "anyone")),
  Zookeeper::EPHEMERAL);
PHP ZOOKEEPER augmentor.php
REASSEMBLE SOURCE REDIS ZOOKEEPER
define("TIMEOUT", 5);
$ch = $zk->getChildren("/services");
$servs = count($ch);
COUNT
REASSEMBLE
while($dat = $sub->recv()){
  do {
    $start = microtime(true);
    $aug = $redis->brpop($dat['id'],$time));
    if(count($aug)) $dat['aug'][] = $aug;
    $time -= microtime(true) - $start;
  } while($time > 0 &&
    count($dat['aug']) != $servs);
  $out->send(json_encode($dat)); //forward
}
COUNT reassemble.php
Data Source Data Source Output Assemble Process Process Filter Filter Filter
FILTER ELASTIC SEARCH QUERY - NAME QUERY - NAME MSG ?? MSG ZEROMQ SUB ZEROMQ PUB TOPIC MSG TOPIC MSG TOPIC MSG ZEROMQ PULL QUERY - NAME MSG HTTP / REST
ELASTIC SEARCH QUERY - NAME QUERY - NAME MSG MSG HTTP / REST
function escall($server, $path, $param) {
  $context = stream_context_create(array('http' => $http));
  $result = file_get_contents($serv.'/'.$path, NULL, $context);
  return json_decode( $result );
}
elasticsearch.php
ELASTIC SEARCH QUERY - NAME QUERY - NAME MSG MSG HTTP / REST
function percolate($host, $path, $tweet) {
  $path = "/twitter/tweet/_percolate";
  $tweet = array('doc' => array('tweet' => $tweet['text']));
  $match = escall($host, $path,array('content' =>json_encode($tweet)));
  return $match['matches'];
}
elasticsearch.php
// snip... creating in, ctl, out ZMQ socks
$poll = new ZMQPoll();
$poll->add($in, ZMQ::POLL_IN);
$poll->add($ctl, ZMQ::POLL_IN);
$read = $write = array();
FILTER MSG ZEROMQ PULL QUERY - NAME ZEROMQ SUB elasticsearch.php
while(true) {
  $ev = $poll->poll($read, $write, -1);
  if($read[0] === $in) {
    $msg = json_decode( $in->recv() );
    $matches = percolate($host, $msg);
    foreach($matches as $match) {
      $out->sendMulti(array($match, $msg));
    }
  } else if($read[0] === $ctl) {
    $q = json_decode($ctl->recv());
    $name = $q['name'];
    $query = $q['query'];
    add_query($host, $name,$query);
  }
}
elasticsearch.php
Data Source Output Queue Process Filter Data Store Data Store
STORE PHP KAFKA TOPIC TOPIC 1 2 3 4 1 2 3 4 SUB APACHE PHP CLIENT HTTP GET TOPIC - OFFSET
PHP KAFKA TOPIC TOPIC 1 2 3 4 1 2 3 4 SUB
$k = new Kafka_Producer("localhost", 9092);
while ($data = $in->recvMulti()) {
  $topic = $data[0];
  $msg = $data[1];
  $bytes = $k->send(array($msg), $topic);
}
kafkastore.php
$consumer = new Kafka_SimpleConsumer('localhost', 9092, 1, $max);
do {
  $msgs = $consumer->fetch(new Kafka_FetchRequest($top,0,$os,$max));
  foreach($msgs as $msg)
    echo $msg->payload(), "\n";
  $offset += $msgs->validBytes();
} while($msgs->validBytes() > 0);
echo json_encode(array("offset"=>$offset));
kafkaconsume.php
KAFKA TOPIC TOPIC 1 2 3 4 1 2 3 4 APACHE PHP CLIENT GET
OPS
JSON & MSGPACK
$data = array('id'=>1,'a'=>'a','b'=>'xyz','c' => array(1, 2, "abcdefg",array(5, 7, 8)));
$enc = json_encode($data);
var_dump( json_decode($enc) );
$enc = msgpack_pack($data);
var_dump( msgpack_unpack($enc) );
JSON MSGPACK MSGPACK JSON
Data Source Output Queue Process Filter Data Store Tap Trace Trace Trace
Data Source Output
See Also: http://slidesha.re/JaWE78
+Ian Barber - [email protected] - @ianbarber
https://github.com/ianbarber/Firehose-PHP-Talk
THANKS!