Distributed File System (HDFS) • Name Node stores file metadata • files split into 64 MB blocks • blocks replicated across 3 Data Nodes Name Node blocks 3 Data Nodes
failures • tasks are assigned based on data locality • Task Trackers can execute multiple tasks MapReduce (Job Scheduling / Execution System) Hadoop Distributed File System (HDFS) Name Node Data Node Job Tracker data locality Task Trackers
var words = value.split(/[^a-zA-Z]/); for (var i = 0; i < words.length; i++) { if (words[i] !== "") { context.write(words[i].toLowerCase(), 1); } } }; var reduce = function (key, values, context) { var sum = 0; while (values.hasNext()) { sum += parseInt(values.next()); } context.write(key, sum); };
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE LOCATION "/example/count"; SELECT * FROM WordCount ORDER BY count DESC LIMIT 10;