Slide 1

Slide 1 text

TEACHING YOUR MACHINE TO FIND FRAUDSTERS Ian Barber ianb@php.net phpir.com twitter.com/ianbarber

Slide 2

Slide 2 text

https://github.com/ianbarber/FindingFraudsters-Talk https://joind.in/4949

Slide 3

Slide 3 text

5% 3% .1% 8% SOME SMALL NUMBERS

Slide 4

Slide 4 text

TESTED AGAINST NORMAL TESTED AGAINST MODEL STATISTICAL Anomaly Detection Supervised Classifiers SIGNATURE Complex Event Processing Rule Based Detection

Slide 5

Slide 5 text

99% ACCURACY

Slide 6

Slide 6 text

| | REALLY LEGITIMATE | REALLY FRAUD |
|---|---|---|
| EVALUATED LEGITIMATE | 989 | 0 |
| EVALUATED FRAUD | 10 | 1 |

Slide 7

Slide 7 text

| | REALLY LEGITIMATE | REALLY FRAUD |
|---|---|---|
| EVALUATED LEGITIMATE | 989 | 0 |
| EVALUATED FRAUD | 10 | 1 |

90% WRONG

Slide 8

Slide 8 text

ANOMALY DETECTION

Slide 9

Slide 9 text

0 7.5 15 22.5 30 Clicks Date

Slide 10

Slide 10 text

Detector User Clicks Ad Alarm No Alarm Landing Page SOFTWARE ARCHITECTURE Buffer

Slide 11

Slide 11 text

Threshold Expected Clicks Alarm Sensitivity Data Buffer statistics DETECTOR

Slide 12

Slide 12 text

/**
 * Moving-average click anomaly detector (captioned "average.php" in the deck).
 *
 * The listing was split across slides 12 and 14; slide 13 between them held
 * only a chart (axis ticks 0-0.2 and 1-20).
 *
 * Reads fraudclicks.csv from the current directory; column 1 of each row is
 * used as that row's click count. Once the buffer holds more than 7 counts the
 * oldest is dropped, and the mean and population standard deviation of the
 * remaining 7 (the counts immediately preceding the current row) are computed.
 * The row raises an alarm when its count exceeds mean + $sen * stddev.
 *
 * Rows are numbered from 1. An alarm on a row numbered above 201 ends the
 * scan; every non-alarming evaluated row above 201 adds one to the
 * days-to-detect counter.
 * NOTE(review): 201 is presumably where the fraudulent traffic starts in the
 * sample CSV - confirm against the data file.
 *
 * @param float $sen Sensitivity: number of standard deviations above the mean
 *                   a count must exceed to raise an alarm.
 * @return array array(alarm count minus one, days-to-detect). The deck's
 *               result slides label these "False Alarms" and "Days To Detect".
 */
function detect($sen) {
    $window = array();
    $i = 0;
    $alarmCount = 0;
    $dtd = 0;
    $avg = $stddev = 0;

    $fraud = fopen("fraudclicks.csv", 'r');
    while ($d = fgetcsv($fraud)) {
        $i++;
        if (count($window) > 7) {
            array_shift($window);
            $avg = array_sum($window) / 7;

            // Accumulate squared deviations in a fresh variable each row:
            // reusing $stddev as the accumulator would carry the previous
            // row's result into this one.
            $sumSquares = 0;
            foreach ($window as $val) {
                $sumSquares += pow($val - $avg, 2);
            }
            $stddev = sqrt($sumSquares / 7);

            if ($d[1] > ($avg + ($sen * $stddev))) {
                $alarmCount++;
                if ($i > 201) {
                    break;
                }
            } else {
                if ($i > 201) {
                    $dtd++;
                }
            }
        }
        // The current count only joins the buffer after it has been judged.
        array_push($window, $d[1]);
    }
    fclose($fraud);

    return array($alarmCount - 1, $dtd);
}

Slide 15

Slide 15 text

0 7.5 15 22.5 30 Clicks Date 18 False Alarms 1 Day To Detect 1.6 SENSITIVITY

Slide 16

Slide 16 text

0 7.5 15 22.5 30 Clicks Date 1 False Alarm 18 Days To Detect 2.7 SENSITIVITY

Slide 17

Slide 17 text

SICKNESS AVAILABILITY

Slide 18

Slide 18 text

/**
 * "Sickness / availability" click anomaly detector (captioned "sickavail.php"
 * in the deck; the listing spans slides 18 to 20).
 *
 * Reads fraudclicks.csv from the current directory. Column 0 is parsed with
 * strtotime() to get the day of week; column 1 is used as the click count.
 *
 * State kept while scanning:
 *  - $avail[dow]  : up to the last 7 counts seen for each day of week
 *  - $window[dow] : the most recent count seen for each day of week
 *
 * Once all 7 days of week have been seen, each row is evaluated:
 *  - "sickness" is the mean over $window of (latest count / that weekday's
 *    average), i.e. how the recent week compares to the norm;
 *  - "availability" is the average count for the current weekday;
 *  - the estimate is sickness * availability, and $p is the Poisson
 *    probability of the observed count given that estimate.
 * A row alarms when $p < $sens and the count is above the estimate.
 *
 * Rows are numbered from 1. An alarm on a row numbered above 201 ends the
 * scan; every non-alarming evaluated row above 201 adds one to the
 * days-to-detect counter.
 * NOTE(review): 201 is presumably where the fraudulent traffic starts in the
 * sample CSV - confirm against the data file.
 * NOTE(review): fac() is not defined in the visible source; presumably a
 * factorial helper - confirm.
 *
 * @param float $sens Probability threshold below which a count is anomalous.
 * @return array array(alarm count minus one, days-to-detect). The deck's
 *               result slide labels these "False Alarm" and "Day To Detect".
 */
function detect($sens) {
    $i = 0;
    $alarms = 0;
    $dtd = 0;
    $window = array();
    $avail = array();

    $fraud = fopen("fraudclicks.csv", 'r');
    while ($dat = fgetcsv($fraud)) {
        // Row counter: without this the "$i > 201" checks below never fire.
        $i++;
        $dow = date("w", strtotime($dat[0]));

        if (count($window) >= 7 && isset($avail[$dow])) {
            $sick = 0;
            foreach ($window as $day => $value) {
                $dowavg = array_sum($avail[$day]) / count($avail[$day]);
                $sick += $value / $dowavg;
            }
            $sick /= count($window);

            $avlblty = array_sum($avail[$dow]) / count($avail[$dow]);
            $est = $sick * $avlblty;

            $fac = fac($dat[1]);
            $p = exp(-$est) * pow($est, $dat[1]) / $fac; // poisson calc

            if ($p < $sens && $dat[1] > $est) {
                $alarms++;
                if ($i > 201) {
                    break;
                }
            } else {
                if ($i > 201) {
                    $dtd++;
                }
            }
        } // end if

        // Fold the current row into the history only after it has been judged.
        if (!isset($avail[$dow])) {
            $avail[$dow] = array();
        }
        $avail[$dow][] = $dat[1];
        if (count($avail[$dow]) > 7) {
            array_shift($avail[$dow]);
        }
        $window[$dow] = $dat[1];
    } // end while
    fclose($fraud);

    return array($alarms - 1, $dtd);
}

Slide 21

Slide 21 text

0 0.05 0.1 0.15 0.2 1 2 3 4 5 6 7 8 9 10

Slide 22

Slide 22 text

0 7.5 15 22.5 30 Clicks Date 1 False Alarm 1 Day To Detect 0.011 SENSITIVITY

Slide 23

Slide 23 text

RULES & SIGNATURES

Slide 24

Slide 24 text

No content

Slide 25

Slide 25 text

No content

Slide 26

Slide 26 text

/**
 * Scores how closely an email address resembles a customer's name
 * (captioned "email.php" in the deck; the listing spans slides 26 and 27).
 *
 * Both inputs are lower-cased. The address is split at "@"; "." and "+" in
 * the user part become spaces, and the domain is cut at its first ".".
 *
 * Scoring, using similar_text() percentages:
 *  - whole name vs user part   > 80%  -> 1.0
 *  - whole name vs domain      > 80%  -> 0.8
 *  - otherwise each space-separated name part is compared with the user part
 *    and the domain; the best percentage above 50 (or 0 if none) is mapped
 *    to 1.7 * (best / 100) - 1, so no partial match gives -1.0 and a perfect
 *    single-part match gives 0.7.
 *
 * @param string $name  Customer name, parts separated by spaces.
 * @param string $email Email address.
 * @return float|int Score between -1 and 1.0.
 */
function compareEmailToName($name, $email) {
    $name = strtolower($name);
    $email = strtolower($email);
    $parts = explode(" ", $name);
    $pcnt = 0;

    // Pad so an address with no "@" yields an empty domain instead of an
    // undefined-offset notice and a null passed to preg_replace().
    list($user, $dom) = array_pad(explode("@", $email), 2, "");
    $user = str_replace(array(".", "+"), " ", $user);
    $dom = preg_replace("/\..*/", "", $dom);

    similar_text($name, $user, $pcnt);
    if ($pcnt > 80) {
        return 1.0;
    }

    similar_text($name, $dom, $pcnt);
    if ($pcnt > 80) {
        return 0.8;
    }

    if (count($parts)) {
        $highest = 0;
        foreach ($parts as $part) {
            similar_text($user, $part, $pcnt);
            if ($pcnt > 50 && $pcnt > $highest) {
                $highest = $pcnt;
            }

            similar_text($dom, $part, $pcnt);
            if ($pcnt > 50 && $pcnt > $highest) {
                $highest = $pcnt;
            }
        }
        return (1.7 * ($highest / 100)) - 1;
    }

    return -1;
}

Slide 28

Slide 28 text

No content

Slide 29

Slide 29 text

No content

Slide 30

Slide 30 text

No content

Slide 31

Slide 31 text

SUPPORT VECTOR MACHINES

Slide 32

Slide 32 text

// One sample transaction record: the features listed on this slide plus the
// 'fraud' label as the final field.
$data = array(
    'purchase_value'             => 20993,
    'geo_country'                => 'DE',
    'previous_orders'            => 1,
    'time'                       => 6,
    'timegap'                    => 146632,
    'product_category'           => 'small_item',
    'delivery_matches_card'      => 0,
    'geo_ip_matches_card'        => 1,
    'difference_from_last_trans' => 8755,
    'free_shipping'              => 0,
    'email_like_name'            => 0,
    'free_email_provider'        => 0,
    'disposable_email_provider'  => 0,
    'quantity'                   => 2,
    'fraud'                      => 0
);

Slide 33

Slide 33 text

0 5 10 15 20 0 5 10 15 20

Slide 34

Slide 34 text

0 5 10 15 20 0 5 10 15 20

Slide 35

Slide 35 text

0 5 10 15 20 0 5 10 15 20

Slide 36

Slide 36 text

0 5 10 15 20 0 5 10 15 20

Slide 37

Slide 37 text

0 5 10 15 20 0 5 10 15 20

Slide 38

Slide 38 text

Learner Training Data Model Model Test Data Classifier Prediction Accuracy EVALUATING THE CLASSIFIER

Slide 39

Slide 39 text

Classifier User Purchase Fraud Not Fraud Transaction Processor Transaction Database Learner classification model SOFTWARE ARCHITECTURE

Slide 40

Slide 40 text

$ apt-get install libsvm-dev $ apt-get install libsvm-tools $ yum install libsvm-devel $ pecl install svm-beta $ echo extension=svm.so > /etc/php.d/svm.ini $ php -r '$s = new svm(); $m = $s->train (array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));' -1

Slide 41

Slide 41 text

// learn.php
//
// Load paydata.csv and turn each row into a training vector for the SVM.
// Index 0 carries the class label; the remaining indices are features scaled
// or mapped to the range around [-1, 1].
$fh = fopen('paydata.csv', 'r');
$output = array();

while ($data = fgetcsv($fh)) {
    $output[] = array(
        // Label: column 14 equal to 1 becomes class -1, anything else class 1.
        0  => $data[14] == 1 ? -1 : 1,
        1  => ($data[0] / 20000.00) - 1.0,          // price
        2  => $data[1] == 'CN' ? 1.0 : -1.0,
        3  => $data[1] == 'US' ? 1.0 : -1.0,
        4  => $data[5] == 'digital' ? 1.0 : -1.0,
        5  => $data[7] == 1 ? 1.0 : -1.0,           // geo
        6  => $data[6] == 1 ? 1.0 : -1.0,           // deliv
        12 => $data[9] == 1 ? 1.0 : -1.0,           // ship
        13 => ($data[13] / 1.5) - 1.0,              // qty
    );
}

Slide 42

Slide 42 text

// Train on the vectors in $output, passing per-class weights for classes -1
// and 1, and persist the resulting model to learn.model.
$svm = new svm();
$model = $svm->train($output, array(-1 => 0.65, 1 => 0.5));
$model->save('learn.model');

// Re-classify the training vectors and tally a confusion matrix.
// "Positive" here means class 1 (label at index 0 greater than zero).
$tp = 0;
$tn = 0;
$fn = 0;
$fp = 0;

foreach ($output as $test) {
    $res = $model->predict($test);

    if ($test[0] > 0 && $res > 0) {
        $tp++;              // labelled positive, predicted positive
    } elseif ($test[0] > 0) {
        $fn++;              // labelled positive, predicted negative
    } elseif ($res > 0) {
        $fp++;              // labelled negative, predicted positive
    } else {
        $tn++;              // labelled negative, predicted negative
    }
}

Slide 43

Slide 43 text

// test.php
//
// ...snip.. loading test data from paytest.csv
// (the snipped code is expected to have filled $output with labelled vectors)

// Load the model previously saved as learn.model and tally a confusion
// matrix over the test vectors. "Positive" means the label at index 0 is
// greater than zero.
$model = new SVMModel('learn.model');

$tp = 0;
$tn = 0;
$fn = 0;
$fp = 0;

foreach ($output as $test) {
    $res = $model->predict($test);

    if ($test[0] > 0 && $res > 0) {
        $tp++;              // labelled positive, predicted positive
    } elseif ($test[0] > 0) {
        $fn++;              // labelled positive, predicted negative
    } elseif ($res > 0) {
        $fp++;              // labelled negative, predicted positive
    } else {
        $tn++;              // labelled negative, predicted negative
    }
}

Slide 44

Slide 44 text

// Print the confusion-matrix counters and the overall accuracy, i.e. the
// share of correct predictions (true positives + true negatives) out of all
// predictions.
var_dump("True Positive {$tp}");
var_dump("True Negative {$tn}");
var_dump("False Positive {$fp}");
var_dump("False Negative {$fn}");
var_dump(
    "Accuracy " . (($tp + $tn) / ($tp + $tn + $fp + $fn))
);

Slide 45

Slide 45 text

$ php learn.php string(18) "True Positive 8316" string(18) "True Negative 1682" string(16) "False Positive 2" string(16) "False Negative 0" string(15) "Accuracy 0.9998" $ php test.php string(17) "True Positive 844" string(17) "True Negative 155" string(16) "False Positive 0" string(16) "False Negative 1" string(14) "Accuracy 0.999"

Slide 46

Slide 46 text

Test Verify Update Automated Manual Manual training data

Slide 47

Slide 47 text

(shogun)

Slide 48

Slide 48 text

THANK YOU https://joind.in/4949 Ian Barber ianb@php.net

Slide 49

Slide 49 text

Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082 Sickness Availability - Chicago Fire Department http://www.flickr.com/photos/mike_miley/3929146730/ Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/ Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/ Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/ Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/ GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/ Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/