Source code for fusetools.ml_tools

"""
Functions for interacting with Machine Learning Tools.

|pic1|
    .. |pic1| image:: ../images_source/ml_tools/scikit1.png
        :width: 50%

"""

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import numpy as np
from fusetools.analytics_tools import Pandas
import seaborn as sns


class Prep:
    """
    Functions for preparing data for machine learning tasks.
    """

    @classmethod
    def make_model_feature_df(cls, df, cat, model_list):
        """
        Creates a Pandas DataFrame of cumulatively combined feature names, one
        copy per model, stacked in the order the models are listed.

        Row ``k`` of each per-model copy holds the first ``k + 1`` values of the
        ``feature`` column (``feature_list``) and how many there are
        (``feature_count``). The ``auc`` and ``acc`` columns are empty-string
        placeholders. The input ``df`` is not modified.

        :param df: Pandas DataFrame with a ``feature`` column; all other
            columns are carried over unchanged.
        :param cat: Value written to the ``cat`` column of every row
            (estimator name or self-defined estimator type).
        :param model_list: List of model identifiers; each one is written to
            the ``model`` column of its own copy of the table.
        :return: Pandas DataFrame with the per-model tables concatenated. The
            original index is kept, so index labels repeat once per model.
            An empty ``model_list`` yields an empty DataFrame that still has
            all output columns.
        """
        frames = [cls._build_feature_frame(df, cat, model) for model in model_list]
        if not frames:
            # pd.concat rejects an empty list, so hand back a zero-row frame
            # with the same columns a non-empty call would produce.
            return cls._build_feature_frame(df, cat, "").iloc[0:0]
        return pd.concat(frames)

    @classmethod
    def _build_feature_frame(cls, df, cat, model):
        """
        Builds the cumulative-feature table for a single model.

        :param df: Pandas DataFrame with a ``feature`` column.
        :param cat: Value for the ``cat`` column.
        :param model: Value for the ``model`` column.
        :return: A new DataFrame (copy of ``df``) with the ``feature_list``,
            ``feature_count``, ``auc``, ``acc``, ``model`` and ``cat`` columns set.
        """
        frame = df.copy()
        features = frame["feature"].tolist()
        # Slicing creates a fresh list per row, so rows never share a list.
        cumulative = [features[:stop] for stop in range(1, len(features) + 1)]
        # Go through an object Series so pandas stores each list as one cell
        # instead of trying to expand it; .values avoids index alignment.
        frame["feature_list"] = pd.Series(cumulative, dtype=object).values
        frame["feature_count"] = list(range(1, len(features) + 1))
        # Performance placeholders, filled in by later steps.
        frame["auc"] = ""
        frame["acc"] = ""
        frame["model"] = model
        frame["cat"] = cat
        return frame

    @classmethod
    def label_encode_df(cls, df, col, col_new):
        """
        Label-encodes a column into a new float column and restores missing values.

        Missing values in ``col`` are filled with the string ``"NaN"`` before
        encoding; the encoded values are cast to float so that NaN can be put
        back for later imputation.

        Note that ``df`` is modified in place: ``col_new`` is added and ``col``
        is dropped. The same object is returned.

        :param df: Pandas DataFrame with at least one feature to encode.
        :param col: Name of column to encode.
        :param col_new: Name of new, encoded column.
        :return: Pandas DataFrame with new, encoded column.
        """
        lb = LabelEncoder()
        # Label encoder assigns an incremental number to each distinct string.
        df[col_new] = lb.fit_transform(df[col].fillna("NaN")).astype(float)
        # NOTE(review): Pandas.find_na_holder lives outside this module; element
        # [1] of its result is assumed to be the code assigned to the "NaN"
        # placeholder. It is called with the two column names only, not with
        # df -- confirm against the helper's signature.
        na_code = Pandas.find_na_holder(col, col_new)[1]
        # Add missing values back.
        df[col_new] = np.where(df[col_new] == na_code, np.nan, df[col_new])
        # Drop the old column.
        df.drop(col, axis=1, inplace=True)
        return df
class Viz:
    """
    Functions for visualizing results from machine learning tasks.
    """

    @classmethod
    def show_model_results(cls, estimator, y_test, y_pred, y_score):
        """
        Prints the Accuracy, AUC and classification report for a set of test
        predictions, then draws the confusion matrix / ROC curve figure.

        :param estimator: A trained estimator. Not used by this method; kept
            so existing callers do not need to change.
        :param y_test: Output labels for the test dataset.
        :param y_pred: Output predictions labels for the test dataset.
        :param y_score: Scores for the test predictions, passed straight to
            ``roc_auc_score`` and ``roc_curve``.
        :return: None. Results are printed and plotted.
        """
        print("Model Performance Metrics:")
        print(".................................................................")
        print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
        print('Test AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_score))
        print("")
        print("")
        print("Model Diagnostics:")
        print(".................................................................")
        print(metrics.classification_report(y_test, y_pred))
        # show_clf_perf takes (width, height, y_test, y_pred, y_score); the
        # estimator is not part of its signature and must not be passed.
        cls.show_clf_perf(14, 5, y_test, y_pred, y_score)

    @classmethod
    def show_clf_perf(cls, width, height, y_test, y_pred, y_score):
        """
        Draws a two-panel figure for binary classification performance: an
        annotated confusion matrix on the left and the ROC curve on the right.

        :param width: Plot width.
        :param height: Plot height.
        :param y_test: Output labels for the test dataset.
        :param y_pred: Output predictions labels for the test dataset.
        :param y_score: Scores for the test predictions; the ROC curve treats
            label ``1`` as the positive class.
        :return: The matplotlib ``GridSpec`` used to lay out the two panels.
        """
        # gridspec is not imported at module level, so import it locally.
        from matplotlib import gridspec

        fig = plt.figure(figsize=(width, height))
        gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])

        # Left panel: confusion matrix with the count written in each cell.
        ax1 = fig.add_subplot(gs[0])
        confmat = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
        ax1.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
        for i in range(confmat.shape[0]):
            for j in range(confmat.shape[1]):
                ax1.text(x=j, y=i, s=str(confmat[i, j]), va='center', ha='center', size=20)

        test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        ax1.text(.9, .1, 'Test Accuracy=%s' % round(test_accuracy, 5), ha='center', size=20,
                 va='center', transform=ax1.transAxes, color="red")
        ax1.set_title("Confusion Matrix")
        ax1.set_xlabel("Predicted Success")
        ax1.set_ylabel("Actual Success")
        sns.despine(left=True, bottom=True)

        # Right panel: ROC curve with the chance diagonal for reference.
        ax2 = fig.add_subplot(gs[1])
        fpr, tpr, _ = metrics.roc_curve(y_true=y_test, y_score=y_score, pos_label=1)
        auc = roc_auc_score(y_true=y_test, y_score=y_score)
        ax2.plot(fpr, tpr, linewidth=2, label=None)
        ax2.plot([0, 1], [0, 1], 'k--')
        ax2.text(.9, .1, 'Test AUC=%s' % round(auc, 5), size=20, ha='center',
                 va='center', transform=ax2.transAxes, color="red")
        ax2.set_title("ROC Curve")

        plt.show()
        return gs

    @classmethod
    def show_clf_perf_features(cls, width, height, df, title, model_list, df_max_stats,
                               ylim=(0.60, 0.75)):
        """
        Creates a plot of accuracies (Y axis) against the DataFrame index
        (X axis), one line per model, annotated with each model's best accuracy.

        :param width: Plot width.
        :param height: Plot height.
        :param df: Pandas DataFrame with ``model`` and ``acc`` columns, e.g. the
            output of ``Prep.make_model_feature_df`` once ``acc`` is filled in.
        :param title: Plot title.
        :param model_list: List of model identifiers to plot; each must appear
            in the ``model`` column of both ``df`` and ``df_max_stats``.
        :param df_max_stats: Pandas DataFrame with ``model``, ``acc`` and
            ``feature_count`` columns; the first matching row per model gives
            the position and text of the annotation.
        :param ylim: ``(bottom, top)`` limits for the accuracy axis. Pass
            ``None`` to keep matplotlib's automatic limits.
        :return: None. The plot is drawn on the current matplotlib axes.
        """
        # Bind the axes up front so the labelling below also works when
        # model_list is empty (sns.lineplot draws on the current axes anyway).
        ax = plt.gca()
        for position, model in enumerate(model_list):
            df_sub = df[df['model'] == model]
            best = df_max_stats[df_max_stats['model'] == model]
            max_acc = best['acc'].values[0]
            max_acc_ind = best['feature_count'].values[0]
            # Each successive model is drawn slightly more opaque; matplotlib
            # rejects alpha above 1, so cap it for long model lists.
            alpha = min(0.3 + 0.1 * position, 1.0)
            # markers=/dashes= are intentionally not passed: they only have
            # meaning together with a `style` variable, and a Series of model
            # names is not a valid value for either of them.
            sns.lineplot(x=df_sub.index, y=df_sub.acc, data=df_sub, alpha=alpha, ax=ax)
            # Annotate the best accuracy in the colour of the line just drawn.
            color = ax.get_lines()[-1].get_c()
            ax.text(max_acc_ind, max_acc, str(round(max_acc, 4)), color=color)

        ax.set_title(title, size=20)
        ax.set_ylabel("Accuracy", size=12)
        ax.set_xlabel("Feature Count", size=12)
        if ylim is not None:
            ax.set_ylim(*ylim)
        ax.legend(title='Classifier Type', labels=model_list)
        fig = plt.gcf()
        fig.set_size_inches(width, height)
class Train:
    """
    Functions for training machine learning models.
    """

    @classmethod
    def train_model(cls, estimator, X_train, X_test, y_train, predict_method):
        """
        Fits an estimator on the training data and scores the test data.

        :param estimator: ScikitLearn estimator instance; it is fitted in place.
        :param X_train: Input training dataset.
        :param X_test: Input test dataset.
        :param y_train: Target training dataset.
        :param predict_method: ``"prob"`` to score with ``predict_proba``; any
            other value scores with ``decision_function``.
        :return: Tuple of the fitted estimator, the test predictions and the
            test scores, the scores being exactly what the chosen scoring
            method returned.
        """
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        # Pick the scoring method once, then apply it to the test inputs.
        use_probabilities = predict_method == "prob"
        score_fn = estimator.predict_proba if use_probabilities else estimator.decision_function
        return estimator, predictions, score_fn(X_test)