Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Machine Learning Infrastructure at Stripe

Machine Learning Infrastructure at Stripe

Rob Story

July 24, 2017
Tweet

More Decks by Rob Story

Other Decks in Programming

Transcript

  1. YEP, RIGHT NOW

    Let’s ship a model to production
  2. @if_delegate_has_method(delegate='estimator') def fit_transform(self, X, y=None, **fit_params): self.fit(X, y, **fit_params) return

    self.transform(X) @if_delegate_has_method(delegate='estimator') def transform(self, X): return self.estimator.transform(X)
  3. def _fit_serializable(serializable, X, y=None, **fit_params): if not isinstance(X, pd.DataFrame): raise

    ValueError( 'serializable {} requires a pandas.DataFrame' .format(type(serializable.get_estimator()))) init_feature_names = list(X.columns.values) serializable.fit_with_feature_names( None, init_feature_names, X, y, **fit_params) # Allow feature selection to propagate backwards. serializable.set_output_features(None) return serializable
  4. class FillMissing(SerializableEstimator, TransformerMixin): def __init__(self, columns='all', missing_value=-1): self.columns = columns

    self.missing_value = missing_value def serialize(self, name): bytes = json_to_bytes({ "features": list(self.columns_), "value": self.missing_value }) return ApplyFeatureEncoder('fill_missing', name, bytes, 'json')
  5. class RandomForestSerializer(ModelSerializer): """Serializer for RandomForest models.""" def is_serializer_for(self, obj): return

    isinstance(obj, RandomForestRegressor) def serialize_model(self, name, model, feature_names): decision_trees = [] for decision_tree in model.estimators_: decision_trees.append( _tree_to_dict(decision_tree, feature_names)) bonsai_bytes = get_bonsai_bytes(decision_trees) return Model("simple-bonsai-regression-forest", name, bonsai_bytes, "bonsai")
  6. Scala library for transforming arbitrary tree structures into read-only

    versions that take up a fraction of the space. Open Source!
  7. def _tree_to_dict(decision_tree, feature_names, fraudulent_class_idx=1): # This is where the internal

    tree structure lives in an sk DecisionTree tree = decision_tree.tree_ if isinstance(decision_tree, t.DecisionTreeClassifier): # NOTE: This ONLY WORKS with binary classification, where the # second class is the fraudulent class. probs = np.nan_to_num(tree.value[:, 0, fraudulent_class_idx] / (tree.value[:, 0, 0] + tree.value[:, 0, 1])) elif isinstance(decision_tree, t.DecisionTreeRegressor): probs = [v[0][0] for v in tree.value] else: raise ValueError("You can only serialize scikit decision trees!") return { "feature_names": feature_names, "features_used": _features_used(tree, feature_names), "node_features": map(int, tree.feature), "node_thresholds": map(float, tree.threshold), "left_children": map(int, tree.children_left), "right_children": map(int, tree.children_right), "probabilities": [float(p) for p in probs], # Deprecated, moving these to Pipeline "encodings": {} } Brittle to version changes!
  8. In [2]: model_package = estimator.model_package In [3]: model_package.encoder Out[3]: <scripts.ml.lib.diorama.serialize.model_package.ApplyFeatureEncoder…>

    In [4]: model_package.model Out[4]: <scripts.ml.lib.diorama.serialize.model_package.Model…> In [5]: model_package.encoder.encoder_type Out[5]: 'stripe-categorical-encoding' In [6]: model_package.model.model_type Out[6]: 'simple-bonsai-regression-forest'
  9. 'label-encoder.json': {'encodings': {'bird': {'chicken': 0, 'finch': 1, 'raven': 2}, 'food':

    {'cheese': 0, 'hamburger': 1, 'tomato': 2}, 'planet': {'earth': 0, 'mars': 1, 'pluto': 2}}, 'features': ['bird', 'food', 'planet']}
  10. Model Hierarchy: Fan Out

    MODEL: SHA.12345… MODEL: SHA.ABC… MODEL: SHA.XYZ… MODEL: SHA.987…
  11. We implement everything on the Scala side, including the encoders.

    case class StandardCategoryEncoder( features: Set[String], encodings: Map[String, Map[String, Double]] ) extends FeatureEncoder { private[this] val (featureTypes, featureParsers) = StandardCategoryEncoder.makeParsers(features, encodings) def encode(features: Map[String, FeatureValue]): Try[Map[String, FeatureValue]] = Try { features.map { case (key, value) => featureParsers.get(key) match { case Some(parse) => key -> parse(value).get case None => key -> value } } }