Upgrade to Pro — share decks privately, control downloads, hide ads and more …

JUC - Procesando datos con Hadoop: MapReduce y ...

CETA-Ciemat
February 11, 2015

JUC - Procesando datos con Hadoop: MapReduce y Yarn

I Jornadas Técnicas UEx - CIEMAT. Procesando grandes volúmenes de datos con Hadoop

CETA-Ciemat

February 11, 2015
Tweet

More Decks by CETA-Ciemat

Other Decks in Education

Transcript

  1. Jornadas Técnicas UEx-CIEMAT // 10-12 Febrero 2015 Procesando grandes volúmenes

    de datos con HADOOP César Suárez Ortega [email protected] Procesando datos con Hadoop: MapReduce y YARN
  2. Soluciones 1. Script  2. Paralelización por año  3.

    Paralelización por partes iguales  4. MapReduce 
  3. MapReduce 101   <Key, Value>  Hay que definir

    una función para cada etapa. MAP REDUCE
  4. Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay 1987,10,14,3,741,730,912,849,PS,1451,NA,91,79,NA,23,11,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,15,4,729,730,903,849,PS,1451,NA,94,79,NA,14,-1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,17,6,741,730,918,849,PS,1451,NA,97,79,NA,29,11,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,18,7,729,730,847,849,PS,1451,NA,78,79,NA,-2,-1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,19,1,749,730,922,849,PS,1451,NA,93,79,NA,33,19,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,21,3,728,730,848,849,PS,1451,NA,80,79,NA,-1,-2,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,22,4,728,730,852,849,PS,1451,NA,84,79,NA,3,-2,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA

    1987,10,23,5,731,730,902,849,PS,1451,NA,91,79,NA,13,1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,24,6,744,730,908,849,PS,1451,NA,84,79,NA,19,14,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,25,7,729,730,851,849,PS,1451,NA,82,79,NA,2,-1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,26,1,735,730,904,849,PS,1451,NA,89,79,NA,15,5,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,28,3,741,725,919,855,PS,1451,NA,98,90,NA,24,16,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,29,4,742,725,906,855,PS,1451,NA,84,90,NA,11,17,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,31,6,726,725,848,855,PS,1451,NA,82,90,NA,-7,1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,1,4,936,915,1035,1001,PS,1451,NA,59,46,NA,34,21,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,2,5,918,915,1017,1001,PS,1451,NA,59,46,NA,16,3,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,3,6,928,915,1037,1001,PS,1451,NA,69,46,NA,36,13,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,4,7,914,915,1003,1001,PS,1451,NA,49,46,NA,2,-1,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,5,1,1042,915,1129,1001,PS,1451,NA,47,46,NA,88,87,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,6,2,934,915,1024,1001,PS,1451,NA,50,46,NA,23,19,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,7,3,946,915,1037,1001,PS,1451,NA,51,46,NA,36,31,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,8,4,932,915,1033,1001,PS,1451,NA,61,46,NA,32,17,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,9,5,947,915,1036,1001,PS,1451,NA,49,46,NA,35,32,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,10,6,915,915,1022,1001,PS,1451,NA,67,46,NA,21,0,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,11,7,916,915,1006,1001,PS,1451,NA,50,46,NA,5,1,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,12,1,944,915,1027,1001,PS,1451,NA,43,46,NA,26,29,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,13,2,941,915,1036,1001,PS,1451,NA,55,46,NA,35,26,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 
1987,10,14,3,930,915,1029,1001,PS,1451,NA,59,46,NA,28,15,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,15,4,920,915,1023,1001,PS,1451,NA,63,46,NA,22,5,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,17,6,1009,915,1104,1001,PS,1451,NA,55,46,NA,63,54,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,18,7,915,915,1008,1001,PS,1451,NA,53,46,NA,7,0,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,19,1,940,915,1032,1001,PS,1451,NA,52,46,NA,31,25,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,21,3,913,915,1003,1001,PS,1451,NA,50,46,NA,2,-2,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA 1987,10,22,4,915,915,1017,1001,PS,1451,NA,62,46,NA,16,0,SFO,RNO,192,NA,NA,0,NA,0,NA,NA,NA,NA,NA
  5. public class FlightsByCarrier { public static void main (String[] args)

    throws Exception { Job job = new Job(); job.setJarByClass(FlightsByCarrier.class); job.setJobName("FlightsByCarrier"); TextInputFormat.addInputPath(job, new Path(args[0])); TextOutputFormat.setOutputPath(job, new Path(args[1])); job.setInputFormatClass(TextInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(FlightsByCarrierMapper.class); job.setReducerClass(FlightsByCarrierReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.addFileToClassPath(new Path("/user/root/opencsv-2.3.jar")); job.waitForCompletion(true); } }
  6. //Mapper<KeyIn, ValueIn, KeyOut, ValueOut> public class FlightsByCarrierMapper extends Mapper<LongWritable, Text,

    Text, IntWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { if (key.get() > 0) { //Ignora la primera linea String[] lines = new CSVParser().parseLine(value.toString()); context.write(new Text(lines[8]), new IntWritable(1)); } } }
  7. public class FlightsByCarrierReducer extends Reducer<Text, IntWritable, Text, IntWritable> { @Override

    protected void reduce(Text token, Iterable<IntWritable> counts, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable count : counts) { sum += count.get(); } context.write(token, new IntWritable(sum)); } }
  8. $ git clone https://github.com/csuarez/seminario-mapreduce.git [...] $ tar xvzf 1987.tar.gz $

    hdfs dfs -copyFromLocal lib/opencsv-2.3.jar /user/root $ hdfs dfs -copyFromLocal 1987.csv /user/root $ sh build.sh $ hadoop jar FlightsByCarrier.jar FlightsByCarrier /user/root/1987.csv /user/root/output/flightsCount $ hdfs dfs -cat /user/root/output/flightsCount/part-r-00000 Ejecución
  9. FlightsByCarrier output INFO mapreduce.JobSubmitter: number of splits:2 ... INFO mapreduce.Job:

    map 0% reduce 0% INFO mapreduce.Job: map 22% reduce 0% INFO mapreduce.Job: map 41% reduce 0% INFO mapreduce.Job: map 83% reduce 0% INFO mapreduce.Job: map 100% reduce 0% INFO mapreduce.Job: map 100% reduce 100% ... Job Counters Launched map tasks=2 Launched reduce tasks=1 Rack-local map tasks=2 Total time spent by all maps in occupied slots (ms)=42442 Total time spent by all reduces in occupied slots (ms)=13465
  10. vs. MapReduce v1: Actores 1. 2. 3. 4. 1. YARN

    Resource Manager 2. YARN Node Manager 3. Application Master
  11. $ hadoop jar $HADOOP_HOME/hadoop-streaming.jar \ -input myInputDirs \ -output myOutputDir

    \ -mapper myPythonScript.py \ -reducer /bin/wc \ -file myPythonScript.py
  12. private final static String[] gods = { "Zeus", "Hera", "Poseidón",

    "Dioniso", "Apolo", "Artemisa", "Hermes", "Atenea", "Ares", "Afrodita", "Hefesto", "Deméter" };
  13. //Initializing the initial structure for (String god : gods) {

    godMap.put(god, 0); } try { //Reading input br = new BufferedReader(new FileReader(args[0])); String line = br.readLine(); while (line != null) { StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (godMap.containsKey(token)) { godMap.put(token, godMap.get(token) + 1); } } line = br.readLine(); } //Writing output Writer writer = new BufferedWriter(new FileWriter("gods.txt")); for (Entry<String, Integer> entry : godMap.entrySet()) { writer.write(entry.getKey() + " = " + entry.getValue()); writer.write(System.lineSeparator()); } writer.close(); }
  14. import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.*; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

    import org.apache.hadoop.filecache.DistributedCache; public class GreekGodCounterMapReduce { public static void main (String[] args) throws Exception { Job job = new Job(); job.setJarByClass(GreekGodCounterMapReduce.class); job.setJobName("GreekGodCounterMapReduce"); TextInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(GreekGodCounterMapReduceMapper.class); job.setReducerClass(GreekGodCounterMapReduceReducer.class); TextOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.waitForCompletion(true); } }
  15. import java.io.IOException; import org.apache.hadoop.io.*; import org.apache.hadoop.mapreduce.Mapper; import java.util.HashMap; import java.util.Map.Entry;

    import java.util.StringTokenizer; public class GreekGodCounterMapReduceMapper extends Mapper<LongWritable, Text, Text, IntWritable> { private final static String[] gods = { "Zeus", "Hera", "Poseidón", "Dioniso", "Apolo", "Artemisa", "Hermes", "Atenea", "Ares", "Afrodita", "Hefesto", "Deméter" }; @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { } }
  16. import java.io.IOException; import org.apache.hadoop.io.*; import org.apache.hadoop.mapreduce.Reducer; public class GreekGodCounterMapReduceReducer extends

    Reducer<Text, IntWritable, Text, IntWritable> { @Override protected void reduce (Text token, Iterable<IntWritable> counts, Context context) throws IOException, InterruptedException { } }