Slide 1

Slide 1 text

MapReduce: Which Way To Go? Ozren Gulan, IT Consultant @ codecentric

Slide 2

Slide 2 text

Data?

Slide 3

Slide 3 text

Facebook 60 TB YouTube 300 h/m Instagram 140k p/m Twitter 350k t/m Google web index 10+ PB Logs * Requests * Large Hadron Collider ~ 1 PB/d

Slide 4

Slide 4 text

BigData?

Slide 5

Slide 5 text

No content

Slide 6

Slide 6 text

No content

Slide 7

Slide 7 text

Now…

Slide 8

Slide 8 text

No content

Slide 9

Slide 9 text

No content

Slide 10

Slide 10 text

No content

Slide 11

Slide 11 text

No content

Slide 12

Slide 12 text

MapReduce

Slide 13

Slide 13 text

No content

Slide 14

Slide 14 text

Batch Processing

Slide 15

Slide 15 text

Apache Pig vs Java MapReduce

Slide 16

Slide 16 text

$ rpm -qpR pig-vs-java-0.0.1-snapshot.x86.rpm $ hadoop version Hadoop 2.3.0-cdh5.1.0 $ pig --version Apache Pig version 0.12.0-cdh5.1.0

Slide 17

Slide 17 text

$ showcase – Word Count

Slide 18

Slide 18 text

$ showcase - for each customer group - top 5 products bought - average number of views per visit - average number of purchases - average purchase

Slide 19

Slide 19 text

$ cat input_record.json { "sessionId": 1, "customerCategoryId": 5, "customerCategoryDescription": "desc", "products": [ { "id": 1222, "name": "product", "category": "product category", "bought": true, "price": 57990.0 }, ... ] }

Slide 20

Slide 20 text

products = LOAD '/example/products/customer_records_map_reduce_input.json’ USING JsonLoader('...'); categories = LOAD '/example/dimension/customer_categories.db' AS (categoryId:int,age:chararray,gender:chararray); joinedRecords = JOIN categories BY categoryId, products BY customerCategoryId; --for each group of users, show top five selling products flattenedProducts = FOREACH joinedRecords GENERATE sessionId AS sessionId, categories::categoryId AS categoryId, categories::age AS age, categories::gender AS gender, FLATTEN(products.(id, name, category, bought, price)) AS (id, name, category, bought, price); boughtProducts = FILTER flattenedProducts BY bought == true; groupedProducts = GROUP boughtProducts BY (categoryId, age, gender, id, name); countedProducts = FOREACH groupedProducts GENERATE FLATTEN(group), COUNT(boughtProducts) AS counter; groupTopFiveProducts = GROUP countedProducts BY (categoryId, age, gender);

Slide 21

Slide 21 text

resultTopFiveProducts = FOREACH groupTopFiveProducts { sorted = ORDER countedProducts BY counter DESC; topProducts = LIMIT sorted 5; GENERATE FLATTEN(topProducts); }; STORE resultTopFiveProducts INTO '/example/results/topTenProducts' USING JsonStorage(); --average number of seen products averageSeenProducts = FOREACH joinedRecords GENERATE categories::categoryId AS categoryId, categories::age AS age, categories::gender AS gender, COUNT(products) AS counter; grpAverageSeenProducts = GROUP averageSeenProducts BY (categoryId, age, gender); averageCountedProducts = FOREACH grpAverageSeenProducts GENERATE FLATTEN(group), AVG(averageSeenProducts.counter) AS averageSeen; --average number of bought products per visit groupedBySession = GROUP boughtProducts BY (sessionId, categoryId, age, gender);

Slide 22

Slide 22 text

averageBoughtProducts = FOREACH groupedBySession GENERATE FLATTEN(group), COUNT(boughtProducts.name) AS counter; groupedAverageBoughtProducts = GROUP averageBoughtProducts BY (categoryId, age, gender); resultAverageBoughtProducts = FOREACH groupedAverageBoughtProducts GENERATE FLATTEN(group), AVG(averageBoughtProducts.counter) AS averageBought; --average purchase amount groupedAveragePrice = GROUP boughtProducts BY (categoryId, age, gender); averagePrice = FOREACH groupedAveragePrice GENERATE FLATTEN(group), AVG(boughtProducts.price) AS averagePaid; joinedFinal = JOIN averageCountedProducts BY (categoryId, age, gender), resultAverageBoughtProducts BY (categoryId, age, gender), averagePrice BY (categoryId, age, gender); finalResult = FOREACH joinedFinal GENERATE averageCountedProducts::categoryId AS categoryId, averageCountedProducts::age AS age, averageCountedProducts::gender AS gender, averageCountedProducts::averageSeen AS averageSeen, resultAverageBoughtProducts::averageBought AS averageBought, averagePrice::averagePaid AS averagePaid; STORE finalResult INTO '/example/results/productsStatistic' USING JsonStorage();

Slide 23

Slide 23 text

$ analysis:readability_maintainability –java package com.codingserbia.dto; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import com.codingserbia.writable.ProductWritable; public class CustomerCategoryProductBag { public LongWritable customerCategoryId; public Text customerCategoryDescription; private Map products; private Map purchasesByProduct; private Map viewsByProduct; private int numberOfViews = 0; private int numberOfSessions = 0;

Slide 24

Slide 24 text

private int numberOfPurchases = 0; public CustomerCategoryProductBag() { customerCategoryId = new LongWritable(0L); customerCategoryDescription = new Text(); products = new HashMap(); purchasesByProduct = new HashMap(); viewsByProduct = new HashMap(); } public ProductWritable getProductWritable(LongWritable id) { return products.get(id); } public boolean contains(LongWritable productId) { return getProductWritable(productId) != null; } public void add(ProductWritable product) { products.put(product.id, product); viewsByProduct.put(product.id, 1L); numberOfViews++; if (product.bought.get()) { purchasesByProduct.put(product.id, 1L); numberOfPurchases++; } } public void processOccurance(ProductWritable product) { if (product.bought.get()) { Long productNumberOfPurchases = purchasesByProduct.get(product.id); if (productNumberOfPurchases == null) { productNumberOfPurchases = 1L; } else { productNumberOfPurchases++; } purchasesByProduct.put(product.id, productNumberOfPurchases);

Slide 25

Slide 25 text

numberOfPurchases++; } Long productNumberOfViews = viewsByProduct.get(product.id); productNumberOfViews++; viewsByProduct.put(product.id, productNumberOfViews); numberOfViews++; } public List getTopProductsBought(int numberOfProducts) { List topProducts = new ArrayList(); Set> entrySet = purchasesByProduct.entrySet(); List> entries = new ArrayList>(); for (Iterator> iterator = entrySet.iterator(); iterator.hasNext();) { entries.add(iterator.next()); } Collections.sort(entries, new Comparator>() { @Override public int compare(Entry entry1, Entry entry2) { return entry2.getValue().intValue() - entry1.getValue().intValue(); } }); int resultSize = numberOfProducts; if (resultSize > entries.size()) { resultSize = entries.size(); } for (Entry e : entries.subList(0, resultSize)) { topProducts.add(products.get(e.getKey())); } return topProducts; }

Slide 26

Slide 26 text

public void increaseNumberOfSessions() { numberOfSessions++; } public float calculateAverageNumberOfViews() { if (numberOfSessions == 0) { return 0f; } return (float) numberOfViews / (float) numberOfSessions; } public float calculateAverageNumberOfPurchases() { if (numberOfSessions == 0) { return 0f; } return (float) numberOfPurchases / (float) numberOfSessions; } public float calculateAveragePurchase() { float amountInTotal = 0f; for (Iterator iterator = purchasesByProduct.keySet().iterator(); iterator.hasNext();) { LongWritable key = iterator.next(); amountInTotal += products.get(key).price.get() * purchasesByProduct.get(key); } return numberOfPurchases != 0 ? amountInTotal / numberOfPurchases : 0f; } } package com.codingserbia.dto; import java.util.ArrayList; import java.util.List; import com.fasterxml.jackson.annotation.JsonProperty;

Slide 27

Slide 27 text

public class CustomerSession { @JsonProperty public long sessionId; @JsonProperty public long customerCategoryId; @JsonProperty(required = false) public String customerCategoryDescription; @JsonProperty public List products; public CustomerSession() { products = new ArrayList(); } } package com.codingserbia.dto; import java.util.ArrayList; import java.util.List; import com.fasterxml.jackson.annotation.JsonProperty; public class CustomerSessionOutput { @JsonProperty public long customerCategoryId; @JsonProperty public String customerCategoryDescription; @JsonProperty public List products; @JsonProperty public float averageNumberOfViews;

Slide 28

Slide 28 text

@JsonProperty public float averageNumberOfPurchases; @JsonProperty public float averagePurchase; public CustomerSessionOutput() { products = new ArrayList(); } } package com.codingserbia.dto; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; public class Product { @JsonProperty public long id; @JsonProperty public String name; @JsonProperty public String category; @JsonProperty public boolean bought; @JsonProperty public double price; @JsonIgnore public int numberOfPurschases; public Product() { }

Slide 29

Slide 29 text

public Product(long id, String name, String category, boolean bought, double price) { super(); this.id = id; this.name = name; this.category = category; this.bought = bought; this.price = price; } public Product(Product aProduct) { super(); this.id = aProduct.id; this.name = aProduct.name; this.category = aProduct.category; this.bought = aProduct.bought; this.price = aProduct.price; } } package com.codingserbia.dto; import com.fasterxml.jackson.annotation.JsonProperty; public class ProductOutput { @JsonProperty public long id; @JsonProperty public String name; public ProductOutput() { name = ""; } public ProductOutput(long id, String name) { super(); this.id = id; this.name = name; } }

Slide 30

Slide 30 text

package com.codingserbia.writable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; public class CustomerCategoryWritable implements Writable { public LongWritable categoryId; public Text description; public Text gender; public CustomerCategoryWritable() { super(); categoryId = new LongWritable(); description = new Text(); gender = new Text(); } public CustomerCategoryWritable(long id, String description, String gender) { super(); categoryId = new LongWritable(id); this.description = new Text(description); this.gender = new Text(gender); } @Override public void readFields(DataInput input) throws IOException { categoryId.readFields(input); description.readFields(input); gender.readFields(input); } @Override public void write(DataOutput output) throws IOException { categoryId.write(output); description.write(output);

Slide 31

Slide 31 text

gender.write(output); } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((categoryId == null) ? 0 : categoryId.hashCode()); result = prime * result + ((description == null) ? 0 : description.hashCode()); result = prime * result + ((gender == null) ? 0 : gender.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } CustomerCategoryWritable other = (CustomerCategoryWritable) obj; if (categoryId == null) { if (other.categoryId != null) { return false; } } else if (!categoryId.equals(other.categoryId)) { return false; } if (description == null) { if (other.description != null) { return false; } } else if (!description.equals(other.description)) { return false; }

Slide 32

Slide 32 text

if (gender == null) { if (other.gender != null) { return false; } } else if (!gender.equals(other.gender)) { return false; } return true; } @Override public String toString() { return "CustomerCategoryWritable [categoryId=" + categoryId + ", description=" + description + ", gender=" + gender + "]"; } } package com.codingserbia.writable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import com.codingserbia.dto.CustomerSession; public class CustomerSessionWritable implements Writable { public LongWritable categoryId; public Text categoryDescription; public ProductArrayWritable products; public CustomerSessionWritable() { super();

Slide 33

Slide 33 text

categoryDescription = new Text(); products = new ProductArrayWritable(ProductWritable.class); } public CustomerSessionWritable(String categoryDesc, CustomerSession json) { super(); categoryId = new LongWritable(json.customerCategoryId); categoryDescription = new Text(categoryDesc); products = new ProductArrayWritable(ProductWritable.class); ProductWritable[] pwArray = new ProductWritable[json.products.size()]; for (int i = 0; i < json.products.size(); i++) { ProductWritable pw = new ProductWritable(json.products.get(i)); pwArray[i] = pw; } products.set(pwArray); } @Override public void readFields(DataInput input) throws IOException { categoryId.readFields(input); categoryDescription.readFields(input); products.readFields(input); } @Override public void write(DataOutput ouput) throws IOException { categoryId.write(ouput); categoryDescription.write(ouput); products.write(ouput); } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((categoryDescription == null) ? 0 : categoryDescription.hashCode()); result = prime * result + ((categoryId == null) ? 0 : categoryId.hashCode()); result = prime * result + ((products == null) ? 0 : products.hashCode()); return result; }

Slide 34

Slide 34 text

@Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } CustomerSessionWritable other = (CustomerSessionWritable) obj; if (categoryDescription == null) { if (other.categoryDescription != null) { return false; } } else if (!categoryDescription.equals(other.categoryDescription)) { return false; } if (categoryId == null) { if (other.categoryId != null) { return false; } } else if (!categoryId.equals(other.categoryId)) { return false; } if (products == null) { if (other.products != null) { return false; } } else if (!products.equals(other.products)) { return false; } return true; } @Override public String toString() { return "CustomerSessionWritable [categoryId=" + categoryId + ", categoryDescription=" + categoryDescription.toString() + ", products=[" + products.toString() + "]]"; } }

Slide 35

Slide 35 text

package com.codingserbia.writable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.MapWritable; import org.apache.hadoop.io.Writable; public class CustomerSessionWritablesGroupedByCustomerCategoryId implements Writable { public LongWritable customerCategoryId; public MapWritable sessions; public CustomerSessionWritablesGroupedByCustomerCategoryId() { super(); customerCategoryId = new LongWritable(); sessions = new MapWritable(); } public CustomerSessionWritablesGroupedByCustomerCategoryId(Long categoryId) { super(); customerCategoryId = new LongWritable(categoryId); sessions = new MapWritable(); } @Override public void readFields(DataInput input) throws IOException { customerCategoryId.readFields(input); sessions.readFields(input); } @Override public void write(DataOutput output) throws IOException { customerCategoryId.write(output); sessions.write(output); }

Slide 36

Slide 36 text

@Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((customerCategoryId == null) ? 0 : customerCategoryId.hashCode()); result = prime * result + ((sessions == null) ? 0 : sessions.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } CustomerSessionWritablesGroupedByCustomerCategoryId other = (CustomerSessionWritablesGroupedByCustomerCategoryId) obj; if (customerCategoryId == null) { if (other.customerCategoryId != null) { return false; } } else if (!customerCategoryId.equals(other.customerCategoryId)) { return false; } if (sessions == null) { if (other.sessions != null) { return false; } } else if (!sessions.equals(other.sessions)) { return false; } return true; } }

Slide 37

Slide 37 text

package com.codingserbia.writable; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.Writable; public class ProductArrayWritable extends ArrayWritable { public ProductArrayWritable(Class valueClass) { super(valueClass); } @Override public String toString() { String value = "ProductArrayWritable ["; Writable[] pwArray = get(); for (Writable pw : pwArray) { value += pw.toString(); } value += "]"; return value; } } package com.codingserbia.writable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import com.codingserbia.dto.Product; public class ProductWritable implements Writable { public LongWritable id;

Slide 38

Slide 38 text

public Text name; public Text category; public BooleanWritable bought; public DoubleWritable price; public ProductWritable() { super(); id = new LongWritable(); name = new Text(); category = new Text(); bought = new BooleanWritable(); price = new DoubleWritable(); } public ProductWritable(Product json) { super(); id = new LongWritable(json.id); name = new Text(json.name); category = new Text(json.category); bought = new BooleanWritable(json.bought); price = new DoubleWritable(json.price); } @Override public void readFields(DataInput input) throws IOException { id.readFields(input); name.readFields(input); category.readFields(input); bought.readFields(input); price.readFields(input); } @Override public void write(DataOutput output) throws IOException { id.write(output); name.write(output); category.write(output); bought.write(output); price.write(output); }

Slide 39

Slide 39 text

@Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((bought == null) ? 0 : bought.hashCode()); result = prime * result + ((category == null) ? 0 : category.hashCode()); result = prime * result + ((id == null) ? 0 : id.hashCode()); result = prime * result + ((name == null) ? 0 : name.hashCode()); result = prime * result + ((price == null) ? 0 : price.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } ProductWritable other = (ProductWritable) obj; if (bought == null) { if (other.bought != null) { return false; } } else if (!bought.equals(other.bought)) { return false; } if (category == null) { if (other.category != null) { return false; } } else if (!category.equals(other.category)) { return false; }

Slide 40

Slide 40 text

if (id == null) { if (other.id != null) { return false; } } else if (!id.equals(other.id)) { return false; } if (name == null) { if (other.name != null) { return false; } } else if (!name.equals(other.name)) { return false; } if (price == null) { if (other.price != null) { return false; } } else if (!price.equals(other.price)) { return false; } return true; } @Override public String toString() { return "ProductWritable [id=" + id + ", name=" + name + ", category=" + category + ", bought=" + bought + ", price=" + price + "]"; } }

Slide 41

Slide 41 text

package com.codingserbia; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.codingserbia.writable.CustomerSessionWritable; public class CodingSerbiaMapReduce extends Configured implements Tool { private static final Logger LOGGER = LoggerFactory.getLogger(CodingSerbiaMapReduce.class); protected String customerCategoriesFilePath = ""; protected String inputPath = ""; protected String outputPath = ""; public CodingSerbiaMapReduce(Configuration config) { super(); setConf(config); } public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "C:/work/tools/hadoop-common-2.2.0-bin-master"); Configuration config = new Configuration(); CodingSerbiaMapReduce mr = new CodingSerbiaMapReduce(config); ToolRunner.run(config, mr, args); }

Slide 42

Slide 42 text

protected boolean validateAndParseInput(String[] args) { if (args == null || args.length < 3) { LOGGER.error("Three arguments are required: path to customer categories file, path to input data and path to desired output directory."); return false; } if (args.length > 3) { LOGGER.error("Too many arguments. Only three arguments are required: path to customer categories file, path to input data and path to desired output directory."); return false; } customerCategoriesFilePath = args[0]; LOGGER.info("Customer categories file path: " + customerCategoriesFilePath); getConf().set("customer.categories.file.path", customerCategoriesFilePath); inputPath = args[1]; LOGGER.info("Input path: " + inputPath); outputPath = args[2]; LOGGER.info("Output path: " + outputPath); LOGGER.info("Input validation succeeded"); return true; } @Override public int run(String[] args) throws Exception { if (!validateAndParseInput(args)) { throw new RuntimeException("Input validation failed."); } Job job = Job.getInstance(getConf()); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(CustomerSessionWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setMapperClass(CustomerRecordsMapper.class); job.setReducerClass(CustomerRecordsReducer.class);

Slide 43

Slide 43 text

job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setJarByClass(CodingSerbiaMapReduce.class); return job.waitForCompletion(true) ? 0 : 1; } } package com.codingserbia; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.codingserbia.dto.CustomerSession; import com.codingserbia.writable.CustomerCategoryWritable; import com.codingserbia.writable.CustomerSessionWritable; import com.fasterxml.jackson.databind.ObjectMapper; public class CustomerRecordsMapper extends Mapper { private static Logger LOGGER = LoggerFactory.getLogger(CustomerRecordsMapper.class); private Map groupedCategories; private ObjectMapper jsonMapper;

Slide 44

Slide 44 text

public CustomerRecordsMapper() { super(); groupedCategories = new HashMap(); jsonMapper = new ObjectMapper(); } @SuppressWarnings({ "rawtypes", "unchecked" }) @Override protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException { super.setup(context); String customerCategoriesPath = context.getConfiguration().get("customer.categories.file.path"); loadCustomerCategories(customerCategoriesPath, context); } @SuppressWarnings("unused") @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { try { CustomerSession jsonObj = jsonMapper.readValue(value.toString(), CustomerSession.class); LongWritable categoryId = new LongWritable(jsonObj.customerCategoryId); CustomerCategoryWritable category = groupedCategories.get(categoryId); if (category != null) { CustomerSessionWritable session = new CustomerSessionWritable(category.description.toString(), jsonObj); context.write(categoryId, session); } } catch (Exception e) { LOGGER.error(e.getMessage(), e); } } private void loadCustomerCategories(String filePath, Context context) throws IOException { Path path = new Path(filePath); FileSystem fs = path.getFileSystem(context.getConfiguration()); BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line; while ((line = br.readLine()) != null) { String[] columns = line.split("\t"); long categoryId = Long.valueOf(columns[0]);

Slide 45

Slide 45 text

String description = columns[1] + " " + columns[2]; String gender = columns[2]; CustomerCategoryWritable writable = new CustomerCategoryWritable(categoryId, description, gender); groupedCategories.put(writable.categoryId, writable); } br.close(); } } package com.codingserbia; import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Reducer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.codingserbia.dto.CustomerCategoryProductBag; import com.codingserbia.dto.CustomerSessionOutput; import com.codingserbia.dto.ProductOutput; import com.codingserbia.writable.CustomerSessionWritable; import com.codingserbia.writable.ProductWritable; import com.fasterxml.jackson.databind.ObjectMapper; public class CustomerRecordsReducer extends Reducer { private static Logger LOGGER = LoggerFactory.getLogger(CustomerRecordsReducer.class); private Map categoryMap; private ObjectMapper jsonMapper;

Slide 46

Slide 46 text

public CustomerRecordsReducer() { super(); categoryMap = new HashMap(); jsonMapper = new ObjectMapper(); } @Override protected void reduce(LongWritable key, Iterable values, Context context) throws IOException, InterruptedException { CustomerCategoryProductBag aBag = categoryMap.get(key); if (aBag == null) { aBag = new CustomerCategoryProductBag(); aBag.customerCategoryId = key; } for (CustomerSessionWritable value : values) { aBag.increaseNumberOfSessions(); if (aBag.customerCategoryDescription.getLength() == 0) { aBag.customerCategoryDescription = value.categoryDescription; } Writable[] productWritables = value.products.get(); for (Writable writable : productWritables) { ProductWritable product = (ProductWritable) writable; if (!aBag.contains(product.id)) { aBag.add(product); } else { aBag.processOccurance(product); } } } categoryMap.put(key, aBag); int numberOfTopBoughtProducts = 5; List topProducts = aBag.getTopProductsBought(numberOfTopBoughtProducts); CustomerSessionOutput outputJsonObj = new CustomerSessionOutput();

Slide 47

Slide 47 text

outputJsonObj.customerCategoryId = key.get(); outputJsonObj.customerCategoryDescription = aBag.customerCategoryDescription.toString(); outputJsonObj.averageNumberOfViews = aBag.calculateAverageNumberOfViews(); outputJsonObj.averageNumberOfPurchases = aBag.calculateAverageNumberOfPurchases(); outputJsonObj.averagePurchase = aBag.calculateAveragePurchase(); for (ProductWritable pw : topProducts) { outputJsonObj.products.add(new ProductOutput(pw.id.get(), pw.name.toString())); } context.write(NullWritable.get(), new Text(jsonMapper.writeValueAsString(outputJsonObj))); LOGGER.info(jsonMapper.writeValueAsString(outputJsonObj)); } }

Slide 48

Slide 48 text

Total lines of code: ~ 1K

Slide 49

Slide 49 text

public class CodingSerbiaMapReduce extends Configured implements Tool { ... Configuration config = new Configuration(); CodingSerbiaMapReduce mr = new CodingSerbiaMapReduce(config); ToolRunner.run(config, mr, args); ... Job job = Job.getInstance(getConf()); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(CustomerSessionWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setMapperClass(CustomerRecordsMapper.class); job.setReducerClass(CustomerRecordsReducer.class); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }

Slide 50

Slide 50 text

public class CustomerRecordsMapper extends Mapper { protected void map(LongWritable key, Text value, Context context) ... { ... CustomerSession jsonObj = jsonMapper.readValue(value.toString(), CustomerSession.class); LongWritable categoryId = new LongWritable(jsonObj.customerCategoryId); CustomerCategoryWritable category = categories.get(categoryId); if (category != null) { CustomerSessionWritable session = new CustomerSessionWritable(..., jsonObj); context.write(categoryId, session); }

Slide 51

Slide 51 text

public class CustomerRecordsReducer extends Reducer { protected void reduce(LongWritable key, Iterable values, Context context)…{ for (CustomerSessionWritable value : values) { // increase number of customer visits for (Writable writable : value.products.get()) { // process an occurrence of a product // track if it is bought or viewed, etc... } } // calculate average values we need // order bought/viewed products based on number of purchases/views context.write(NullWritable.get(), new Text(jsonMapper.writeValueAsString(outputJsonObj)));

Slide 52

Slide 52 text

$ cat output_record.json { "customerCategoryId": 4, "customerCategoryDescription": "30-40 male", "products": [ { "id": 1229, "name": "Candy ugradna rerna FS 635 AQUA" }, ... ], "averageNumberOfViews": 2.3333333, "averageNumberOfPurchases": 1.3333334, "averagePurchase": 44750.0 }

Slide 53

Slide 53 text

...

Slide 54

Slide 54 text

No content

Slide 55

Slide 55 text

$ analysis:performance cloudera-quickstart-vm-5.1.0 64-bit Intel i5 CPU @ 2.60GHz 16 GB RAM (12 GB RAM for VM) Small: 150000 json records ~ 103 MB Medium: 750000 json records ~ 517 MB Large: 1500000 json records ~ 1 GB X-large: 2250000 json records ~ 1.5 GB

Slide 56

Slide 56 text

$ analysis:performance -mode single-node 4m 05s 7m 40s 10m 30s 13m 30s 40s 1m 20s 2m 30s 3m 30s 0 100 200 300 400 500 600 700 800 900 small (150000/103MB) medium (750000/517MB) large (1500000/1GB) x-large (2250000/1.5GB)

Slide 57

Slide 57 text

$ analysis:performance –mode single-node 8 13 19 26 5 5 6 6 0 5 10 15 20 25 30 small (150000/103MB) medium (750000/517MB) large (1500000/1GB) x-large (2250000/1.5GB)

Slide 58

Slide 58 text

$ analysis:performance –mode cluster 2m 40s 3m 5s 3m 34s 4m 6s 35s 38s 46s 51s 0 50 100 150 200 250 300 small(150000/103MB) medium(750000/517MB) large(1500000/1GB) x-large(2250000/1.5GB) pig_2 java

Slide 59

Slide 59 text

$ analysis:performance –mode compare 4m 5s 7m 40s 10m 30s 13m 30s 40s 1m 20s 2m 30s 3m 30s 2m 40s 3m 5s 3m 34s 4m 6s 35s 38s 46s 51s 0 100 200 300 400 500 600 700 800 900 small(150000/103MB) medium(750000/517MB) large(1500000/1GB) x-large(2250000/1.5GB)

Slide 60

Slide 60 text

$ analysis:performance - PigCompiler translating Pig Latin into Java code - PigOptimizer - Graphviz, execution plan

Slide 61

Slide 61 text

$ analysis:language_support Pig - UDF (Java, Python, Jython, Groovy, Ruby, JavaScript) REGISTER myUDFs.jar DEFINE ShinyUDF some.shiny.udf.DoSomething();

Slide 62

Slide 62 text

$ analysis:dev_tools:pig - Currently plugins for IDE - Plugins for text editors - Diagnostic operators: Describe, Dump, Explain and Illustrate - PigUnit $ analysis:dev_tools:java - MRUnit

Slide 63

Slide 63 text

$ conclusion:pig + high abstraction level + quick development + maintenance + extensions (UDF, PiggyBank) - performance - restrictions of Pig Latin $ conclusion:java + speeeeed, control + tools - complexity, maintenance, control

Slide 64

Slide 64 text

Future?

Slide 65

Slide 65 text

Future is now!

Slide 66

Slide 66 text

Streaming!

Slide 67

Slide 67 text

No content

Slide 68

Slide 68 text

- 10 to 100 times faster than MapReduce! - Advanced DAG (Directed Acyclic Graph) execution engine - Java, Scala, Python, R - In Memory or Disk - Berkeley Paper https://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf - Initially started by Matei Zaharia at UC Berkeley in 2009 - 2013 donated to the Apache Software Foundation - 2014 set a new world record in large-scale sorting

Slide 69

Slide 69 text

Forecasting

Slide 70

Slide 70 text

Forecasting

Slide 71

Slide 71 text

Nowcasting!

Slide 72

Slide 72 text

No content

Slide 73

Slide 73 text

Questions?