Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Teaching Your Machine To Find Fraudsters

Ian Barber
February 23, 2012

Teaching Your Machine To Find Fraudsters

When dealing with money online, fraud is an ongoing problem for both consumers and sellers. Researchers have been developing statistical and machine learning techniques to spot fraudulent purchases on e-commerce systems and catch click fraud on adverts. While there is no silver bullet, you will learn how to use PHP, with a little help from some other technologies, to flag suspicious activity and help protect your site from scammers.

Ian Barber

February 23, 2012
Tweet

More Decks by Ian Barber

Other Decks in Technology

Transcript

  1. TESTED AGAINST NORMAL TESTED AGAINST MODEL STATISTICAL Anomaly Detection Supervised

    Classifiers SIGNATURE Complex Event Processing Rule Based Detection
  2. function detect($sen) { $window = array(); $i = 0; $alarmCount

    = 0; $dtd = 0; $avg = $stddev = 0; $fraud = fopen("fraudclicks.csv", 'r'); while($d = fgetcsv($fraud)) { $i++; if(count($window) > 7) { array_shift($window); $avg = array_sum($window) / 7; foreach($window as $val) { $stddev += pow($val - $avg, 2); } $stddev = sqrt($stddev/7); average.php
  3. 0 0.05 0.1 0.15 0.2 1 2 3 4 5

    6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
  4. if($d[1] > ($avg + ($sen * $stddev))){ $alarmCount++; if($i >

    201) { break; } } else { if($i > 201) { $dtd++; } } } array_push($window, $d[1]); } return array($alarmCount-1, $dtd); }
  5. 0 7.5 15 22.5 30 Clicks Date 18 False Alarms

    1 Day To Detect 1.6 SENSITIVITY
  6. 0 7.5 15 22.5 30 Clicks Date 1 False Alarm

    18 Days To Detect 2.7 SENSITIVITY
  7. function detect($sens) { $i = 0; $alarms = 0; $dtd

    = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", 'r'); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) { $sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window); sickavail.php
  8. $avlblty = array_sum($avail[$dow]) / count($avail[$dow]); $est = $sick * $avlblty;

    $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } // end if
  9. if(!isset($avail[$dow])) { $avail[$dow] = array(); } $avail[$dow][] = $dat[1]; if(count($avail[$dow])

    > 7) { array_shift($avail[$dow]); } $window[$dow] = $dat[1]; } // end while return array($alarms-1, $dtd); }
  10. 0 7.5 15 22.5 30 Clicks Date 1 False Alarm

    1 Day To Detect 0.011 SENSITIVITY
  11. function compareEmailToName($name, $email) { $name = strtolower($name); $email = strtolower($email);

    $parts = explode(" ", $name); $pcnt = 0; list($user, $dom) = explode("@", $email); $user = str_replace( array(".", "+"), " ", $user); $dom = preg_replace("/\..*/", "", $dom); similar_text($name, $user, $pcnt); if($pcnt > 80) { return 1.0; } similar_text($name, $dom, $pcnt); if($pcnt > 80) { return 0.8; } email.php
  12. if(count($parts)) { $highest = 0; foreach($parts as $part) { similar_text($user,

    $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $pcnt; } similar_text($dom, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $pcnt; } } return (1.7 * ($highest/100)) - 1; } return -1; }
  13. $data = array( 'purchase_value' => 20993, 'geo_country' => 'DE', 'previous_orders'

    => 1, 'time' => 6, 'timegap' => 146632, 'product_category' => 'small_item', 'delivery_matches_card' => 0, 'geo_ip_matches_card' => 1, 'difference_from_last_trans' => 8755, 'free_shipping' => 0, 'email_like_name' => 0, 'free_email_provider' => 0, 'disposable_email_provider' => 0, 'quantity' => 2, 'fraud' => 0);
  14. $ apt-get install libsvm-dev $ apt-get install libsvm-tools $ yum

    install libsvm-devel $ pecl install svm-beta $ echo extension=svm.so > /etc/php.d/svm.ini $ php -r '$s = new svm(); $m = $s->train (array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));' -1
  15. $fh = fopen('paydata.csv', 'r'); $output = array(); while($data = fgetcsv($fh))

    { $output[] = array( $data[14] == 1 ? -1 : 1, 1 => ($data[0]/20000.00) - 1.0, // price 2 => $data[1] == 'CN' ? 1.0:-1.0, 3 => $data[1] == 'US' ? 1.0:-1.0, 4 => $data[5] == 'digital' ? 1.0:-1.0, 5 => $data[7] == 1 ? 1.0:-1.0, //geo 6 => $data[6] == 1 ? 1.0:-1.0, // deliv 12 => $data[9] == 1 ? 1.0:-1.0, // ship 13 => ($data[13] / 1.5) - 1.0, // qty ); } learn.php
  16. $svm = new svm(); $model = $svm->train($output, array(-1 => 0.65,

    1 => 0.5)); $model->save('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } }
  17. // ...snip.. loading test data from // paytest.csv $model =

    new SVMModel('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } } test.php
  18. var_dump("True Positive " . $tp); var_dump("True Negative " . $tn);

    var_dump("False Positive " . $fp); var_dump("False Negative " . $fn); var_dump("Accuracy " . (($tp+$tn)/($tp+$tn+$fp+$fn)));
  19. $ php learn.php string(18) "True Positive 8316" string(18) "True Negative

    1682" string(16) "False Positive 2" string(16) "False Negative 0" string(15) "Accuracy 0.9998" $ php test.php string(17) "True Positive 844" string(17) "True Negative 155" string(16) "False Positive 0" string(16) "False Negative 1" string(14) "Accuracy 0.999"
  20. Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082 Sickness Availability - Chicago Fire

    Department http://www.flickr.com/photos/mike_miley/3929146730/ Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/ Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/ Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/ Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/ GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/ Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/