Source code for fusetools.ml_tools

"""
Functions for interacting with Machine Learning Tools.

|pic1|
    .. |pic1| image:: ../images_source/ml_tools/scikit1.png
        :width: 50%

"""

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import numpy as np
from fusetools.analytics_tools import Pandas
import seaborn as sns


[docs]class Prep:
    """
    Functions for preparing data for machine learning tasks.

    """

[docs]    @classmethod
    def make_model_feature_df(cls, df, cat, model_list):
        """
        Creates Pandas DataFrame with cumulatively trained combined feature names, coefficients, absolute coefficients in order of trained together.

        :param df: Pandas DataFrame of a fitted SckitLearn estimator's features, coefficients absolute coefficients.
        :param cat: Name of ScikitLearn estimator or self defines estimator type.
        :param model_list: List of ScikitLearn estimators.
        :return: Pandas DataFrame with cumulatively trained combined feature names, coefficients, absolute coefficients and performance placeholders in order of trained together.
        """
        for idx, m in enumerate(model_list):

            df = df.copy()

            feature_list = []
            df['feature_list'] = ""
            df['feature_count'] = ""
            df["auc"] = ""
            df["acc"] = ""

            for idxx, row in df.iterrows():
                feature_list.append(row['feature'])
                df.at[idxx, 'feature_list'] = feature_list.copy()
                df.at[idxx, 'feature_count'] = len(feature_list)

            df['model'] = m
            df["cat"] = cat

            if idx == 0:
                df_all = df.copy()
            else:
                df_all = pd.concat([df_all, df])

            return df_all

[docs]    @classmethod
    def label_encode_df(cls, df, col, col_new):
        """
        Replaces a column with a float-valued label encoding that keeps missing values as NaN.

        Missing entries of ``col`` are temporarily filled with the string ``"NaN"`` so the ScikitLearn
        ``LabelEncoder`` can encode the column. The resulting codes are cast to float, the code reported by
        ``Pandas.find_na_holder`` is turned back into NaN for later imputation, and the original column is dropped.

        NOTE(review): ``Pandas.find_na_holder`` is called with the two column names only; presumably its second
        return element is the code assigned to the ``"NaN"`` placeholder - confirm against its implementation.

        :param df: Pandas DataFrame with atleast one feature to encode. Modified in place.
        :param col: Name of column to encode; removed from ``df``.
        :param col_new: Name of new, encoded column.
        :return: The same Pandas DataFrame, with the new encoded column and without ``col``.
        """

        encoder = LabelEncoder()

        # Encode with a string stand-in for missing values; float dtype so NaN can be stored afterwards.
        filled_values = df[col].fillna("NaN")
        df[col_new] = encoder.fit_transform(filled_values).astype(float)

        # Put the missing values back wherever the placeholder's code appears.
        na_code = Pandas.find_na_holder(col, col_new)[1]
        is_placeholder = df[col_new] == na_code
        df[col_new] = np.where(is_placeholder, np.nan, df[col_new])

        # The raw column is no longer needed once the encoded one exists.
        df.drop(col, axis=1, inplace=True)

        return df


[docs]class Viz:
    """
    Functions for visualizing results from machine learning tasks.

    """

[docs]    @classmethod
    def show_model_results(cls, estimator, y_test, y_pred, y_score):
        """
        Prints the Accuracy, AUC and Metrics Classification report for an estimator, then plots its performance.

        After printing, the confusion matrix / ROC figure is drawn through ``show_clf_perf`` with a fixed
        14 x 5 figure size.

        :param estimator: A trained estimator. Accepted for interface compatibility; it is not used by this method.
        :param y_test: Output labels for the test dataset.
        :param y_pred: Output predictions labels for the test dataset.
        :param y_score: Probabilities of certainty for output prediction labels, in the form ``roc_auc_score`` accepts.
        :return: None. The metrics are printed and the figure is shown.
        """

        # Compute each metric once and reuse it for printing.
        test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        test_auc = roc_auc_score(y_true=y_test, y_score=y_score)

        print("Model Performance Metrics:")
        print(".................................................................")
        print('Test Accuracy: %.3f' % test_accuracy)
        print('Test AUC: %.3f' % test_auc)

        print("")
        print("")
        print("Model Diagnostics:")
        print(".................................................................")
        print(metrics.classification_report(y_test, y_pred))

        # show_clf_perf takes (width, height, y_test, y_pred, y_score); the estimator is not one of its arguments.
        cls.show_clf_perf(14, 5, y_test, y_pred, y_score)

[docs]    @classmethod
    def show_clf_perf(cls, width, height, y_test, y_pred, y_score):
        """
        Draws a two-panel figure of an estimator's binary classification performance and shows it.

        The left panel is the confusion matrix with each cell count and the test accuracy written on it. The right
        panel is the ROC curve (positive class label ``1``) with the chance diagonal and the test AUC written on it.

        :param width: Plot width.
        :param height: Plot height.
        :param y_test: Output labels for the test dataset.
        :param y_pred: Output predictions labels for the test dataset.
        :param y_score: Probabilities of certainty for output prediction labels.
        :return: The matplotlib ``GridSpec`` used to lay out the two panels.
        """

        import matplotlib.pyplot as plt
        import seaborn as sns
        from matplotlib import gridspec
        from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

        plt.figure(figsize=(width, height))
        layout = gridspec.GridSpec(1, 2, width_ratios=[1, 1])

        # Left panel: confusion matrix with the raw counts written into each cell.
        cm_ax = plt.subplot(layout[0])
        counts = confusion_matrix(y_true=y_test, y_pred=y_pred)
        cm_ax.matshow(counts, cmap=plt.cm.Blues, alpha=0.3)
        for row_idx, row in enumerate(counts):
            for col_idx, count in enumerate(row):
                cm_ax.text(x=col_idx, y=row_idx, s=count,
                           va='center', ha='center', size=20)

        test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        accuracy_label = 'Test Accuracy=%s' % round(test_accuracy, 5)
        cm_ax.text(.9, .1, accuracy_label,
                   ha='center', size=20, va='center', transform=cm_ax.transAxes, color="red")

        cm_ax.set_title("Confusion Matrix")
        cm_ax.set_xlabel("Predicted Success")
        cm_ax.set_ylabel("Actual Success")

        sns.despine(left=True, bottom=True)

        # Right panel: ROC curve against the chance diagonal.
        roc_ax = plt.subplot(layout[1])

        false_pos_rate, true_pos_rate, _ = roc_curve(y_true=y_test, y_score=y_score, pos_label=1)
        test_auc = roc_auc_score(y_true=y_test, y_score=y_score)

        roc_ax.plot(false_pos_rate, true_pos_rate, linewidth=2, label=None)
        roc_ax.plot([0, 1], [0, 1], 'k--')
        auc_label = 'Test AUC=%s' % round(test_auc, 5)
        roc_ax.text(.9, .1, auc_label, size=20,
                    ha='center', va='center', transform=roc_ax.transAxes, color="red")

        roc_ax.set_title("ROC Curve")
        plt.show()

        return layout

[docs]    @classmethod
    def show_clf_perf_features(cls, width, height, df, title, model_list, df_max_stats):
        """
        Creates a plot of given ScikitLearn estimator Accuracies (Y Axis) by number of features in model (X Axis).

        :param width: Plot width.
        :param height: Plot height.
        :param df: Pandas DataFrame with cumulatively trained combined feature names, coefficients, absolute coefficients in order of trained together.
        :param title: Plot title.
        :param model_list: List of ScikitLearn estimators to iterate through and plot cumulative performance for.
        :param df_max_stats: Pandas DataFrame of estimator names, max accuracy achieved and number of features for fitted estimator. Used for plot annotation.
        :return: Plot of given ScikitLearn estimator Accuracies (Y Axis) by number of features in model (X Axis).
        """

        alpha = 0.3
        for idx, m in enumerate(model_list):
            df_sub = df[df['model'] == m]
            max_acc = df_max_stats[df_max_stats['model'] == m]['acc'].values[0]
            max_acc_ind = df_max_stats[df_max_stats['model'] == m]['feature_count'].values[0]

            ax = sns.lineplot(x=df_sub.index, y=df_sub.acc,
                              markers=df_sub.model, dashes=df_sub.model, data=df_sub, alpha=alpha)

            color = ax.get_lines()[-1].get_c()

            ax.text(max_acc_ind, max_acc, str(round(max_acc, 4)), color=color)

            alpha += 0.1

        ax.set_title(title, size=20)
        ax.set_ylabel("Accuracy", size=12)
        ax.set_xlabel("Feature Count", size=12)
        ax.set_ylim(0.60, 0.75)

        ax.legend(title='Classifier Type', labels=model_list)

        fig = plt.gcf()
        fig.set_size_inches(width, height)


[docs]class Train:
    """
    Functions for training machine learning models.

    """

[docs]    @classmethod
    def train_model(cls, estimator, X_train, X_test, y_train, predict_method):
        """
        Takes a ScikitLearn estimator instance and train and test dataframes
        returns: an estimator as well as the predictions and probabilities for the test set.

        :param estimator: ScikitLearn estimator instance
        :param X_train: Input training dataset
        :param X_test: Input test dataset
        :param y_train: Target training dataset
        :param predict_method: Type of prediction to make.
        :return: Fitted SckitLearn estimator, test predictions and test scores for each prediction.
        """

        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        if predict_method == "prob":
            y_score = estimator.predict_proba(X_test)

        else:
            y_score = estimator.decision_function(X_test)

        return estimator, y_pred, y_score