Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Machine Learning Infrastructure at Stripe

Machine Learning Infrastructure at Stripe

Avatar for Rob Story

Rob Story

July 24, 2017
Tweet

More Decks by Rob Story

Other Decks in Programming

Transcript

  1. YEP, RIGHT NOW

    Let’s ship a model to production
  2. @if_delegate_has_method(delegate='estimator') def fit_transform(self, X, y=None, **fit_params): self.fit(X, y, **fit_params) return

    self.transform(X) @if_delegate_has_method(delegate='estimator') def transform(self, X): return self.estimator.transform(X)
  3. def _fit_serializable(serializable, X, y=None, **fit_params): if not isinstance(X, pd.DataFrame): raise

    ValueError( 'serializable {} requires a pandas.DataFrame' .format(type(serializable.get_estimator()))) init_feature_names = list(X.columns.values) serializable.fit_with_feature_names( None, init_feature_names, X, y, **fit_params) # Allow feature selection to propagate backwards. serializable.set_output_features(None) return serializable
  4. class FillMissing(SerializableEstimator, TransformerMixin): def __init__(self, columns='all', missing_value=-1): self.columns = columns

    self.missing_value = missing_value def serialize(self, name): bytes = json_to_bytes({ "features": list(self.columns_), "value": self.missing_value }) return ApplyFeatureEncoder('fill_missing', name, bytes, 'json')
  5. class RandomForestSerializer(ModelSerializer): """Serializer for RandomForest models.""" def is_serializer_for(self, obj): return

    isinstance(obj, RandomForestRegressor) def serialize_model(self, name, model, feature_names): decision_trees = [] for decision_tree in model.estimators_: decision_trees.append( _tree_to_dict(decision_tree, feature_names)) bonsai_bytes = get_bonsai_bytes(decision_trees) return Model("simple-bonsai-regression-forest", name, bonsai_bytes, "bonsai")
  6. Scala library for transforming arbitrary tree structures into read-only

    versions that take up a fraction of the space. Open Source!
  7. def _tree_to_dict(decision_tree, feature_names, fraudulent_class_idx=1): # This is where the internal

    tree structure lives in an sk DecisionTree tree = decision_tree.tree_ if isinstance(decision_tree, t.DecisionTreeClassifier): # NOTE: This ONLY WORKS with binary classification, where the # second class is the fraudulent class. probs = np.nan_to_num(tree.value[:, 0, fraudulent_class_idx] / (tree.value[:, 0, 0] + tree.value[:, 0, 1])) elif isinstance(decision_tree, t.DecisionTreeRegressor): probs = [v[0][0] for v in tree.value] else: raise ValueError("You can only serialize scikit decision trees!") return { "feature_names": feature_names, "features_used": _features_used(tree, feature_names), "node_features": map(int, tree.feature), "node_thresholds": map(float, tree.threshold), "left_children": map(int, tree.children_left), "right_children": map(int, tree.children_right), "probabilities": [float(p) for p in probs], # Deprecated, moving these to Pipeline "encodings": {} } Brittle to version changes!
  8. In [2]: model_package = estimator.model_package In [3]: model_package.encoder Out[3]: <scripts.ml.lib.diorama.serialize.model_package.ApplyFeatureEncoder…>

    In [4]: model_package.model Out[4]: <scripts.ml.lib.diorama.serialize.model_package.Model…> In [5]: model_package.encoder.encoder_type Out[5]: 'stripe-categorical-encoding' In [6]: model_package.model.model_type Out[6]: 'simple-bonsai-regression-forest'
  9. 'label-encoder.json': {'encodings': {'bird': {'chicken': 0, 'finch': 1, 'raven': 2}, 'food':

    {'cheese': 0, 'hamburger': 1, 'tomato': 2}, 'planet': {'earth': 0, 'mars': 1, 'pluto': 2}}, 'features': ['bird', 'food', 'planet']}
  10. Model Hierarchy: Fan Out

    MODEL: SHA.12345… MODEL: SHA.ABC… MODEL: SHA.XYZ… MODEL: SHA.987…
  11. We implement everything on the Scala side, including the encoders.

    case class StandardCategoryEncoder( features: Set[String], encodings: Map[String, Map[String, Double]] ) extends FeatureEncoder { private[this] val (featureTypes, featureParsers) = StandardCategoryEncoder.makeParsers(features, encodings) def encode(features: Map[String, FeatureValue]): Try[Map[String, FeatureValue]] = Try { features.map { case (key, value) => featureParsers.get(key) match { case Some(parse) => key -> parse(value).get case None => key -> value } } }