Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Data streams processing with PHP and STORM

Data streams processing with PHP and STORM

Mariusz Gil

April 20, 2013
Tweet

More Decks by Mariusz Gil

Other Decks in Programming

Transcript

  1. PROCESSING
    the php way of...
    STORM
    DATA STREAMS
    Mariusz Gil

    View Slide

  2. about me

    View Slide

  3. #php #scalability #nosql #performance
    #hadoop #hive #pig #bigdata
    #mahout #datamining
    #storm
    https://music.twitter.com/_login/background.jpg

    View Slide

  4. batch #1 batch #2 batch #3
    the Problem

    View Slide

  5. the Story

    View Slide

  6. STORM
    DISTRIBUTED REALTIME COMPUTATION SYSTEM

    View Slide

  7. scalable
    no data lost
    fault tolerant
    extremely robust
    language agnostic
    efficient messaging
    local or distributed

    View Slide

  8. terms and architecture

    View Slide

  9. Spouts Bolts
    Stream Topologies
    (val1, val2)
    (val3, val4)
    (val5, val6)
    unbounded sequence of tuples
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple

    View Slide

  10. Spouts Bolts
    Stream Topologies
    (val1, val2)
    (val3, val4)
    (val5, val6)
    source of streams
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple

    View Slide

  11. Spouts Bolts
    Stream Topologies
    (val1, val2)
    (val3, val4)
    (val5, val6)
    process input streams and produce new streams
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple
    tuple

    View Slide

  12. Spouts Bolts
    Stream Topologies
    (val1, val2)
    (val3, val4)
    (val5, val6)
    network of spouts and bolts
    TextSpout SplitSentenceBolt WordCountBolt
    [sentence] [word] [word, count]

    View Slide

  13. View Slide

  14. storm-kestrel
    storm-kafka
    storm-amqp-spout
    storm-jms
    storm-pubsub
    storm-beanstalkd
    mapr-spout

    View Slide

  15. shuffle grouping
    fields grouping
    all grouping
    global grouping
    direct grouping
    local or shuffle grouping

    View Slide

  16. ZooKeepers Supervisors
    Nimbus

    View Slide

  17. fast
    CLUSTER STATE IS STORED
    LOCALLY OR IN ZOOKEEPERS
    fail

    View Slide

  18. code examples

    View Slide

  19. https://github.com/nathanmarz/storm

    View Slide

  20. https://github.com/maltoe/storm-install

    View Slide

  21. https://github.com/nathanmarz/storm-starter/

    View Slide

  22. https://github.com/lazyshot/storm-php

    View Slide

  23. public class DoubleAndTripleBolt extends BaseRichBolt {
    private OutputCollectorBase _collector;
    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollectorBase collector) {
    _collector = collector;
    }
    @Override
    public void execute(Tuple input) {
    int val = input.getInteger(0);
    _collector.emit(input, new Values(val*2, val*3));
    _collector.ack(input);
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    declarer.declare(new Fields("double", "triple"));
    }
    }
    Java example / bolt

    View Slide

  24. public static class ExclamationBolt implements IRichBolt {
    OutputCollector _collector;
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
    _collector = collector;
    }
    public void execute(Tuple tuple) {
    _collector.emit(tuple, new Values(tuple.getString(0) + "!!!"));
    _collector.ack(tuple);
    }
    public void cleanup() {
    }
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    declarer.declare(new Fields("word"));
    }
    public Map getComponentConfiguration() {
    return null;
    }
    }
    Java example / bolt

    View Slide

  25. TopologyBuilder builder = new TopologyBuilder();
    builder.setSpout("words", new TestWordSpout(), 10);
    builder.setBolt("exclaim1", new ExclamationBolt(), 3)
    .shuffleGrouping("words");
    builder.setBolt("exclaim2", new ExclamationBolt(), 2)
    .shuffleGrouping("exclaim1");
    Java example / topology
    ...
    words exclaim1 exclaim2

    View Slide

  26. zkServer.sh start
    bin/storm nimbus
    bin/storm supervisor
    bin/storm ui #optional
    storm jar all-my-code.jar backtype.storm.MyTopology arg1 arg2
    Java example / run

    View Slide

  27. PHP example / spout
    require_once('storm.php');
    class RandomSentenceSpout extends ShellSpout
    {
        protected $sentences = array(
            "the cow jumped over the moon",
            "an apple a day keeps the doctor away",
            "four score and seven years ago",
            "snow white and the seven dwarfs",
        );
        protected function nextTuple()
        {
            sleep(.1);
            $sentence = $this->sentences[ rand(0, count($this->sentences) -1)];
            $this->emit(array($sentence));
        }
        protected function ack($tuple_id)
        {
            return;
        }
        protected function fail($tuple_id)
        {
            return;
        }
    }
    $SentenceSpout = new RandomSentenceSpout();
    $SentenceSpout->run();

    View Slide

  28. PHP example / bolt
    require_once('storm.php');
    class SplitSentenceBolt extends BasicBolt
    {
        public function process(Tuple $tuple)
        {
            $words = explode(" ", $tuple->values[0]);
            foreach($words as $word)
            {
                $this->emit(array($word));
            }
        }
    }
    $splitsentence = new SplitSentenceBolt();
    $splitsentence->run();

    View Slide

  29. /**
    * This topology demonstrates Storm's stream groupings and multilang capabilities.
    */
    public class WordCountPHPTopology {
    public static class SplitSentence extends ShellBolt implements IRichBolt {
    public SplitSentence() {
    super("php", "splitsentence.php");
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    declarer.declare(new Fields("word"));
    }
    @Override
    public Map getComponentConfiguration() {
    return null;
    }
    }
    // ...
    }
    MultiLang example / Topology, Bolt

    View Slide

  30. {"command": "next"}
    {"command": "ack", "id": "1231231"}
    {"command": "fail", "id": "1231231"}
    NonJVMSpout NonJVMBolt
    {"command": "sync"}
    {
        "command": "emit",
        "id": "1231231",
        "stream": "1",
        "task": 9,
        "tuple": ["field1", 2, 3]
    }
    {
        "id": "-6955786537413359385",
        "comp": "1",
        "stream": "1",
        "task": 9,
        "tuple": ["snow white and dwarfs", "field2", 3]
    }
    {
        "command": "emit",
        "anchors": ["1231231", "-234234234"],
        "stream": "1",
        "task": 9,
        "tuple": ["field1", 2, 3]
    }
    https://github.com/nathanmarz/storm/wiki/Multilang-protocol

    View Slide

  31. demo

    View Slide

  32. use cases

    View Slide

  33. stream
    processing

    View Slide

  34. continuous
    query computation

    View Slide

  35. RPC
    distributed
    arguments
    results
    [request-id, arguments]
    [request-id, results]

    View Slide

  36. realtime analytics
    personalization
    search
    revenue
    optimization
    monitoring

    View Slide

  37. content search
    realtime analytics
    generating feeds
    integrated with
    elastic search,
    HBase, Hadoop
    and hdfs

    View Slide

  38. realtime scoring
    moments generation
    integration with
    kafka queues and
    hdfs storage

    View Slide

  39. thanks!
    feel free to contact me
    email: [email protected]
    twitter: @mariuszgil

    View Slide