from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
# estimator imports
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
# feature manipulation and preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction import DictVectorizer
# sampling, grid search, and reporting
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
%matplotlib inline
def feature_trends(df, response_col, figsize=(30,30), size=(3, 10), ignore=[]):
    """
    Plot feature values against the response variable.

    Parameters
    ----------
    df : {pandas.DataFrame}
        data set
    response_col : {string}
        response variable in data set
    figsize : {tuple}
        figure size; default value acceptable for a 3x10 grid
    size : {tuple}
        size of plot grid; defaults to 3x10 (i.e. 30 features)
    ignore : {list}
        list of features to ignore when plotting
    """
    fig = plt.figure(figsize=figsize, dpi=400)
    # nb: use the passed-in frame, not the global df_numeric
    cols = df.columns[:size[0]*size[1]]
    # sort for a deterministic plotting order after the set difference
    cols = sorted(set(cols) - set(ignore))
    for idx, col in enumerate(cols):
        axes = fig.add_subplot(size[0], size[1], idx+1)
        axes.set_title(col)
        axes.scatter(df[col], df[response_col])
    plt.tight_layout()
    plt.show()
# obtain switch frequency for a given feature
def switch_frequency(col, resp):
    """
    Obtain the switch frequency (number of response flips across
    consecutive non-NaN feature records) for a given feature.

    Parameters
    ----------
    col : {pandas.Series}
        series of feature values
    resp : {pandas.Series}
        response variable series
    """
    sf = 0
    # drop records where the feature value is NaN
    data = filter(lambda x: not np.isnan(x[0]), zip(col, resp))
    # track the previous response value, not the previous feature value
    prev = data[0][1]
    for val, response in data[1:]:
        if response == prev:
            continue
        sf += 1
        prev = response
    return sf
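As a quick sanity check, a toy pair of series makes the definition concrete: the response sequence 0, 1, 1, 0 flips twice, so the expected switch frequency is 2 (the feature values below are hypothetical).
print switch_frequency(pd.Series([0.1, 0.2, 0.3, 0.4]), pd.Series([0, 1, 1, 0]))  # expect 2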
def nan_evaluation(df, axis=1, method="all"):
    """
    Evaluate all features or rows that have NaN values.

    Parameters
    ----------
    df : {pandas.DataFrame}
        dataframe to evaluate
    axis : {int}
        0 => return rows, 1 => return columns; rows is slow for large dataframes
    method : {string}
        specify whether to return rows/columns with all or some NaN values.
    """
    methods = {
        "all": lambda x: np.all(x),
        "some": lambda x: np.any(x)
    }
    if axis == 1:
        return [col for col in df.columns if methods[method](df[col].isnull())]
    # row labels come from the index, so use .loc rather than .iloc;
    # the [1:-1] slice skips the Id and Response entries
    return [row for row in df.index if methods[method](df.loc[row][1:-1].isnull())]
def nan_transform(df, method="mean"):
    """
    Apply transformation on NaN values in dataframe based on passed
    method.

    Parameters
    ----------
    df : {pandas.DataFrame}
        dataframe to be transformed
    method : {string}
        transformation to apply to NaN values; defaults to mean of
        series

    Method Options
    --------------
    median : use the median of the feature to fill NaNs
    mean : use the mean of the feature to fill NaNs
    feature_average : compute the mean of all features for a particular
        record (series) and use it to fill NaN values
    """
    # lazily computed so only the requested statistic is evaluated
    methods = {
        "mean": lambda: df.mean(),
        "median": lambda: df.median()
    }
    # handle feature_average separately; conduct record iteration
    if method == "feature_average":
        transformed = df.apply(lambda x: x.fillna(x.mean()), axis=1)
    else:
        transformed = df.fillna(methods[method]())
    return transformed
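A minimal demonstration of the three imputation methods on a hypothetical two-column frame: 'mean' and 'median' fill down a column, while 'feature_average' fills across a record.
toy = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [2.0, 4.0, np.nan]})
print nan_transform(toy, method="mean")             # NaNs become column means
print nan_transform(toy, method="median")           # NaNs become column medians
print nan_transform(toy, method="feature_average")  # NaNs become row means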
# helper to iterate over a frame's features and generate a grid of plots
def generate_plots(df, size):
    """
    Helper function to generate a grid of plots for the
    assumed features of a dataframe. Should be used strictly
    for relative-feature inspection (i.e. how do plots for each
    feature compare to one another).

    Parameters
    ----------
    df : {pandas.DataFrame}
        data set consisting of features (columns) and records (row index)
    size : {tuple}
        grid representation; rows and columns of the grid correspond
        to index 0 and 1, respectively
    """
    numerical_ids = [int(id) for id in df.index.tolist()]
    fig = plt.figure(figsize=(20, 20), dpi=400)
    for idx, col in enumerate(df.columns):
        axes = fig.add_subplot(size[0], size[1], idx+1)
        # plot values against the numeric ids, rather than a list of tuples
        axes.plot(numerical_ids, df[col].tolist())
    plt.show()
def plot_corr(df, size_x=30, size_y=30):
    """
    Plot seaborn correlation matrix. Dataframe feature slice should be
    conducted prior to invoking this method.

    Parameters
    ----------
    df : {pandas.DataFrame}
        values for which correlation is built and plotted
    size_x, size_y : {int}
        figure dimensions in inches
    """
    fig, ax = plt.subplots(figsize=(size_x, size_y))
    # truncate correlation values to three decimal places for readability
    corr = df.corr().round(3)
    # seaborn plot
    sb.heatmap(corr, annot=True, fmt="g", cmap="viridis")
    plt.show()
# write MCC evaluation for later use
def mcc_eval(result, test, response_col="Response"):
    """
    Matthews correlation coefficient used for model evaluation.

    Parameters
    ----------
    result : {pandas.DataFrame}
        output of model mapping parts to response variable.
    test : {pandas.DataFrame}
        set with test data not exposed to the model
    response_col : {string}
        name of the response column present in both frames
    """
    # compare raw values so differing indexes do not misalign the frames
    true = result[response_col].values == test[response_col].values
    false = result[response_col].values != test[response_col].values
    positives = result[response_col].values == 1
    negatives = result[response_col].values == 0
    tp = len(result[true & positives])
    print "true positives: %s" % tp
    tn = len(result[true & negatives])
    print "true negatives: %s" % tn
    fp = len(result[false & positives])
    print "false positives: %s" % fp
    fn = len(result[false & negatives])
    print "false negatives: %s" % fn
    # nb: returns nan if any factor in the denominator is zero
    return (tp*tn - fp*fn)/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
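scikit-learn ships its own MCC implementation, which is a convenient cross-check for the hand-rolled version above; a quick sketch on hypothetical label arrays:
from sklearn.metrics import matthews_corrcoef
y_true_toy = np.array([0, 0, 1, 1, 0, 1])
y_pred_toy = np.array([0, 0, 1, 0, 0, 1])
print "sklearn MCC: %s" % matthews_corrcoef(y_true_toy, y_pred_toy)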
def hit_score(y_true, y_pred, hit_class=1):
    """
    Determine the proportion of 'hits' in the predicted set,
    in relation to the number of 'hits' in the true set.

    Parameters
    ----------
    y_true : {numpy.1darray}
        true data points; usually corresponds to test set
    y_pred : {numpy.1darray}
        estimator outcome
    hit_class : {int}
        class value used to calculate number of hits
    """
    # set membership keeps the prediction scan linear
    hit_idx = set(idx for idx, val in enumerate(y_true) if val == hit_class)
    hits = 0
    for pred_idx, pred_val in enumerate(y_pred):
        if pred_idx in hit_idx and pred_val == hit_class:
            hits += 1
    return (hits / len(hit_idx))*100
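Note that hit_score is simply the recall of the hit class expressed as a percentage, so sklearn's recall_score should reproduce it; a sketch on hypothetical label arrays:
from sklearn.metrics import recall_score
y_true_toy = np.array([1, 0, 1, 1, 0])
y_pred_toy = np.array([1, 0, 0, 1, 0])
print hit_score(y_true_toy, y_pred_toy, hit_class=1)           # ~66.67
print recall_score(y_true_toy, y_pred_toy, pos_label=1) * 100  # same value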
def get_feat_imp(estimator, features):
    """
    Obtain dataframe mapping feature to importance.

    Parameters
    ----------
    estimator : {sklearn.ensemble}
        ensemble-based estimator with ability to output feature importance.
    features : {list}
        feature names, in the same order as the training matrix columns
    """
    return pd.DataFrame(zip(features, estimator.feature_importances_), columns=["feature", "importance"])
def scoring_report(y_true, y_pred, estimator_type=""):
    """
    Helper function to output scores for binary classification prediction.

    Parameters
    ----------
    y_true : {numpy.1darray}
        true data points; usually corresponds to test set
    y_pred : {numpy.1darray}
        estimator outcome
    estimator_type : {string}
        type of estimator (used for string formatting)
    """
    print "%s Accuracy Score: %s" % (estimator_type, accuracy_score(y_true=y_true, y_pred=y_pred) * 100)
    print "%s Hit Score: %s" % (estimator_type, hit_score(y_true=y_true, y_pred=y_pred, hit_class=1))
    print "%s Classification Report:" % estimator_type
    print classification_report(y_true=y_true, y_pred=y_pred, digits=3)
The training data (~1.18 million records) is loaded into memory using pandas. The class proportions below show that the positive class (i.e. failed parts) is heavily underrepresented.
df_numeric = pd.read_csv("train_numeric.csv")
print "number of records: %s" % len(df_numeric)
print "number of features: %s" % len(df_numeric.columns)
df_numeric.head()
# all columns that have strictly nan values
print "num empty features: %s" % len(nan_evaluation(df_numeric, axis=1))
# all columns that have some nan values
print "num slightly empty features: %s" % len(nan_evaluation(df_numeric, axis=1, method="some"))
# feature imputation prior to using models or plotting correlation;
# nb: fillna(..., inplace=True) returns None, so assign the result instead
df_numeric_mean = df_numeric.fillna(df_numeric.mean())
# split numerical data into training and test sets - 70/30
features = list(set(df_numeric_mean.columns) - set(["Id", "Response"]))
X = df_numeric_mean[features].values
y = df_numeric_mean["Response"].values
X_train = X[:int(0.7*len(X))]
X_test = X[int(0.7*len(X)):]
y_train = y[:int(0.7*len(y))]
y_test = y[int(0.7*len(y)):]
# proportion of positives (1) to negatives (0) in train and test sets
print "positive proportion in train: {}%".format((len(y_train[y_train==1]) / len(y_train))*100)
print "negative proportion in train: {}%".format((len(y_train[y_train==0]) / len(y_train))*100)
print "positive proportion in test: {}%".format((len(y_test[y_test==1]) / len(y_test))*100)
print "negative proportion in test: {}%".format((len(y_test[y_test==0]) / len(y_test))*100)
One way to reduce the feature set is through detection and removal of redundant features. Correlation matrices are effective at displaying pairwise similarities across features, but beyond roughly 50 features they become impractical to read, simply due to visualization constraints.
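When the matrix is too large to eyeball, the redundancy scan can instead be done programmatically. A minimal sketch, assuming the mean-imputed frame and features list built above; the 0.95 cutoff is an arbitrary choice:
corr = df_numeric_mean[features].corr()
redundant_pairs = []
for i, a in enumerate(corr.columns):
    for b in corr.columns[i+1:]:
        if abs(corr[a][b]) > 0.95:
            redundant_pairs.append((a, b, corr[a][b]))
print "highly correlated pairs: %s" % len(redundant_pairs)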
To reduce the feature space safely, three ensemble estimators (ExtraTrees, RandomForest, GradientBoost) were used in conjunction with the SelectFromModel transformer. Since none of these estimators carries an l1 regularization penalty, SelectFromModel falls back to the mean feature importance as the threshold for discarding features.
The feature subsets produced by each reduction method were then combined (as a union) to create the final set of features to train against.
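For reference, the selection step used below relies on SelectFromModel's default threshold, which for importance-based estimators is the mean importance. Written out explicitly it would look like the following sketch, where fitted_estimator is a stand-in for any of the three fitted ensembles:
sfm = SelectFromModel(fitted_estimator, prefit=True, threshold="mean")  # explicit mean-importance threshold
kept = [feat for feat, keep in zip(features, sfm.get_support()) if keep]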
# use only 10 estimators to keep the feature-selection fits cheap
xt_clf = ExtraTreesClassifier(n_estimators=10, verbose=2)
xt_clf.fit(X_train, y_train)
prediction = xt_clf.predict(X_test)
print "Extra Trees Classifier Accuracy Score: %s" % (accuracy_score(y_true=y_test, y_pred=prediction) * 100)
print "Extra Trees Hit Score: %s" % hit_score(y_true=y_test, y_pred=prediction, hit_class=1)
print "Extra Trees Classification Report:"
print classification_report(y_true=y_test, y_pred=prediction, digits=3)
xt_feat_imp = get_feat_imp(xt_clf, features)
# plot top 20 features
xt_feat_imp.sort_values(by="importance", ascending=False)[:20].plot(y="importance", x="feature", kind="bar")
# use SelectFromModel to determine which features clear the importance threshold
xt_sfm = SelectFromModel(xt_clf, prefit=True)
xt_sfm_support = xt_sfm.get_support()
xt_sfm_features = [feat for feat, keep in zip(features, xt_sfm_support) if keep]
# random forest feature selection
rfc_clf = RandomForestClassifier(n_estimators=10, verbose=2)
rfc_clf.fit(X_train, y_train)
prediction = rfc_clf.predict(X_test)
print "Random Forest Classifier Accuracy Score: %s" % (accuracy_score(y_true=y_test, y_pred=prediction) * 100)
print "Random Forest Hit Score: %s" % hit_score(y_true=y_test, y_pred=prediction, hit_class=1)
print "Random Forest Classification Report:"
print classification_report(y_true=y_test, y_pred=prediction, digits=3)
rf_feat_imp = get_feat_imp(rfc_clf, features)
# plot top 20 features based on random forest results
rf_feat_imp.sort_values(by="importance", ascending=False)[:20].plot(y="importance", x="feature", kind="bar")
rf_sfm = SelectFromModel(rfc_clf, prefit=True)
rf_sfm_support = rf_sfm.get_support()
rf_sfm_features = [feat for feat, keep in zip(features, rf_sfm_support) if keep]
# GBM classifier
gb_clf = GradientBoostingClassifier(n_estimators=10, verbose=2)
gb_clf.fit(X_train, y_train)
prediction = gb_clf.predict(X_test)
print "Gradient Boost Accuracy Score: %s" % (accuracy_score(y_true=y_test, y_pred=prediction) * 100)
print "Gradient Boost Hit Score: %s" % hit_score(y_true=y_test, y_pred=prediction, hit_class=1)
print "Gradient Boost Classification Report:"
print classification_report(y_true=y_test, y_pred=prediction, digits=3)
gb_feat_imp = get_feat_imp(gb_clf, features)
# plot importance for top 20 features as determined by gbm
gb_feat_imp.sort_values(by="importance", ascending=False)[:20].plot(y="importance", x="feature", kind="bar")
gb_sfm = SelectFromModel(gb_clf, prefit=True)
gb_sfm_support = gb_sfm.get_support()
gb_sfm_features = [feat for feat, keep in zip(features, gb_sfm_support) if keep]
features_iter_1 = list(set(xt_sfm_features + rf_sfm_features + gb_sfm_features))
len(features_iter_1)
# output merged features to csv
pd.DataFrame(features_iter_1, columns=["features"]).to_csv("ensemble_sfm_features.csv")
# merged feature importance dataframes and output to csv
merged_feat_imp = xt_feat_imp.merge(rf_feat_imp, on="feature", copy=False, suffixes=("_xt", "_rf"), how="outer")
merged_feat_imp = merged_feat_imp.merge(gb_feat_imp, on="feature", copy=False, how="outer")
merged_feat_imp.columns = ["feature", "importance_xt", "importance_rf", "importance_gb"]
merged_feat_imp.to_csv("ensemble_merged_feat_rank.csv")
With the newly generated feature subset, three models were trained and evaluated. Of the two ensemble estimators (GradientBoost, RandomForest) and the linear SVM (LinearSVC), the Random Forest classifier achieved the highest precision (i.e. the fewest false positives) at 89.2%.
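The positive-class precision quoted above is tp / (tp + fp); scikit-learn's precision_score pulls that single number directly out of a prediction. A quick sketch, assuming the y_test and prediction arrays from the surrounding cells:
from sklearn.metrics import precision_score
print "precision: %s" % (precision_score(y_true=y_test, y_pred=prediction) * 100)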
# use subset of features with ensemble methods and linear svc to test overall performance
gb_clf = GradientBoostingClassifier(n_estimators=100, verbose=2)
X_new = df_numeric_mean[features_iter_1].values
X_new_train = X_new[:int(0.7*len(X_new))]
X_new_test = X_new[int(0.7*len(X_new)):]
gb_clf.fit(X_new_train, y_train)
prediction = gb_clf.predict(X_new_test)
scoring_report(y_test, prediction, estimator_type="Gradient Boost")
# test support vector machine with linear kernel and subset of features
lsvm = LinearSVC(verbose=2)
lsvm.fit(X_new_train, y_train)
prediction = lsvm.predict(X_new_test)
scoring_report(y_test, prediction, estimator_type="LinearSVC")
rf_clf = RandomForestClassifier(n_estimators=100, verbose=2, oob_score=True)
rf_clf.fit(X_new_train, y_train)
prediction = rf_clf.predict(X_new_test)
scoring_report(y_test, prediction, estimator_type="Random Forest")
A secondary test was performed with the best-performing estimator, wherein the training set was resampled so that the over-represented (negative) class accounted for a smaller proportion of the records.
rf_clf = RandomForestClassifier(n_estimators=100, verbose=2)
positives_X = X_new_train[y_train == 1]
positives_y = y_train[y_train == 1]
negatives_X = X_new_train[y_train == 0]
negatives_y = y_train[y_train == 0]
for proportion in [0.01, 0.03, 0.05, 0.1, 0.3, 0.5]:
    print "proportion of over-represented class: %s" % proportion
    # use a stratified shuffle split to subsample the over-represented class
    sss = StratifiedShuffleSplit(negatives_y, n_iter=1, train_size=proportion)
    train_index, test_index = next(iter(sss))
    # positives are split 80/20 between the train and test sets
    X_new_train = np.concatenate((negatives_X[train_index], positives_X[:int(0.8*len(positives_X))]), axis=0)
    y_new_train = np.concatenate((negatives_y[train_index], positives_y[:int(0.8*len(positives_y))]), axis=0)
    print "X train shape: {}".format(X_new_train.shape)
    print "Y train shape: {}".format(y_new_train.shape)
    X_new_test = np.concatenate((negatives_X[test_index], positives_X[int(0.8*len(positives_X)):]), axis=0)
    y_new_test = np.concatenate((negatives_y[test_index], positives_y[int(0.8*len(positives_y)):]), axis=0)
    print "X test shape: {}".format(X_new_test.shape)
    print "Y test shape: {}".format(y_new_test.shape)
    rf_clf.fit(X_new_train, y_new_train)
    prediction = rf_clf.predict(X_new_test)
    scoring_report(y_new_test, prediction, estimator_type="Random Forest")
df_tags = df_numeric[["Id", "Response"]]
len(df_numeric)
# group features into lines and stations (column format: L<line>_S<station>_F<feature>)
lines = {}
for col in df_numeric.columns[1:-1]:
    piece = [pc[1:] for pc in col.split("_")]
    if len(piece) != 3:
        print "Invalid column format: %s" % col
        continue
    if piece[0] not in lines:
        lines[piece[0]] = {}
    if piece[1] not in lines[piece[0]]:
        lines[piece[0]][piece[1]] = []
    lines[piece[0]][piece[1]].append((piece[2], col))
# station and feature counts per production line
for line in sorted(lines):
    print "line %s stations: %s" % (line, len(lines[line]))
    print "line %s features: %s" % (line, sum(len(lines[line][st]) for st in lines[line]))
# analysis of line 2 data
lines['2'].keys()
df_226 = df_numeric[[tup[1] for tup in lines['2']['26']] + ['Id', 'Response']].copy()
# create uid string column and set as index
df_226["uid"] = df_226["Id"].apply(lambda x: str(x))
df_226.drop("Id", inplace=True, axis=1)
df_226.set_index("uid", inplace=True)
# remove Response so that we can correctly check isnull()
df_226.drop("Response", inplace=True, axis=1)
# transpose so records become columns, then drop the all-NaN records
nan_226 = nan_evaluation(df_226.T, axis=1, method="all")
df_226_cut = df_226.T[list(set(df_226.T.columns) - set(nan_226))].T
# only use the first three hundred thousand rows
df_numeric = df_numeric[:300000]
df_numeric.head()
# split data set into failures (Response == 1) and passes (Response == 0)
df_fail = df_numeric[df_numeric["Response"] == 1]
df_pass = df_numeric[df_numeric["Response"] == 0]
# create dataframe with switch frequencies
sw_freq = {}
for col in df_numeric.columns[1:-1]:
    sw_freq[col] = switch_frequency(df_numeric[col], df_numeric.Response)
df_sf = pd.DataFrame(sw_freq.items(), columns=["feature", "switch_freq"])
feature_trends(df_numeric, "Response", figsize=(50,50), size=(60,3), ignore=["Id", "Response"])
df_sf.head()
# bar graph of switch frequencies; xtick labels are rotated vertically below
fig2 = plt.figure(figsize=(100,100))
axes = fig2.add_subplot(1,1,1)
axes.bar(np.arange(len(df_sf["switch_freq"])), df_sf["switch_freq"], 0.35)
plt.ylabel('Switch Frequency')
plt.title("Feature Switch Frequency")
plt.xticks(np.arange(len(df_sf["switch_freq"])) + 0.35/2, df_sf["feature"], rotation='vertical')
plt.yticks(np.arange(0, df_sf["switch_freq"].max(), 100))
plt.show()
# scatter plot of switch frequencies
fig3 = plt.figure()
axes = fig3.add_subplot(1,1,1)
axes.scatter(np.arange(len(df_sf["switch_freq"])), df_sf["switch_freq"])
plt.ylabel("Switch Frequency")
plt.title("Feature Switch Frequency")
plt.show()
# conducting nan analysis and model building from this point
# obtain nan frequencies - recognize that Id, Response will have freq = 0
nan_freq = {col: df_numeric[col].isnull().sum() for col in df_numeric.columns}
fig = plt.figure()
axes = fig.add_subplot(1,1,1)
axes.scatter(np.arange(len(nan_freq)), nan_freq.values())
plt.ylabel('NaN Frequency')
plt.title('Feature Null Frequency')
plt.show()
df_nf = pd.DataFrame(nan_freq.items(), columns=["Feature", "NaN_Freq"])
print "Min null value count: %s" % df_nf.NaN_Freq.min()
print "Max null value count: %s" % df_nf.NaN_Freq.max()
print "# of features with 250,000 <= null value count <= 300,000: %s" % len(df_nf[(df_nf["NaN_Freq"] > 250000) & (df_nf["NaN_Freq"] < 300000)])
# create a copy of df_numeric for nan filling
# every copy of df_numeric must be "deleted" once model is built
df_num_nan = df_numeric.copy()
# check memory usage
import os
import psutil
process = psutil.Process(os.getpid())
print "Total memory ~ %sGB" % (process.memory_info().rss / (10**9))
# baseline model with all features
from sklearn import linear_model
# preserve NaNs in df_numeric for further analysis => fit against a
# mean-imputed copy instead (SGD cannot handle NaNs)
df_num_model = df_numeric.fillna(df_numeric.mean())
x = df_num_model[[col for col in df_num_model.columns if col not in ["Id", "Response"]]].values
y = df_num_model["Response"].values
clf = linear_model.SGDClassifier(loss="log")
clf.fit(x, y)
# simple model to compute all features
def sgd_class(data, target, test, loss="log"):
    """
    Stochastic gradient descent classifier with log loss (i.e. fits a
    logistic regression model).

    Parameters
    ----------
    data : {pandas.DataFrame}
        training data
    target : {pandas.Series}
        response variable
    test : {pandas.DataFrame}
        test data
    loss : {string}
        loss function to follow when classifying; defaults to 'log'
        in order to force logistic regression model
    """
    X = data.values
    y = target.values
    # we tweak params here as we deem appropriate
    clf = linear_model.SGDClassifier(loss=loss)
    clf.fit(X, y)
    return clf.predict(test.values)
# load the numeric test set for reference
df_test_num = pd.read_csv("test_numeric.csv")
df_test_num.info()
# split (already reduced) training data into training and test sets
df_num_train = df_numeric[:240000]
df_num_test = df_numeric[240000:]
df_num_train.info()
df_num_test.info()
# verify that the train/test Id sets are disjoint (expect the empty set)
set(df_num_train["Id"].values) & set(df_num_test["Id"].values)
df_num_train.head()
df_num_test.head()
prediction = sgd_class(nan_transform(df_num_train.drop(["Id", "Response"], axis=1), method="mean"), df_num_train["Response"], nan_transform(df_num_test.drop(["Id", "Response"], axis=1)), loss="log")
result = pd.DataFrame(zip(df_num_test["Id"], prediction), columns=["Id", "Response"])
# run evaluation for mean imputation
print "MCC evaluation: %s" % mcc_eval(result, df_num_test[["Id", "Response"]])
prediction = sgd_class(nan_transform(df_num_train.drop(["Id", "Response"], axis=1), method="median"), df_num_train["Response"], nan_transform(df_num_test.drop(["Id", "Response"], axis=1), method="median"))
# run evaluation for median imputation
print "MCC evaluation: %s" % mcc_eval(pd.DataFrame(zip(df_num_test["Id"], prediction), columns=["Id", "Response"]), df_num_test[["Id", "Response"]])
# run evaluation for feature average imputation
prediction = sgd_class(nan_transform(df_num_train.drop(["Id", "Response"], axis=1), method="feature_average"), df_num_train["Response"], nan_transform(df_num_test.drop(["Id", "Response"], axis=1), method="feature_average"))
print "MCC evaluation: %s" % mcc_eval(pd.DataFrame(zip(df_num_test["Id"], prediction), columns=["Id", "Response"]), df_num_test[["Id", "Response"]])
# should try principal component analysis for dimensionality reduction
# http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
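A minimal PCA sketch along those lines, assuming the mean-imputed matrix X built earlier; n_components=50 is an arbitrary starting point, not a tuned value.
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X_reduced = pca.fit_transform(X)
print "variance explained by 50 components: %s" % pca.explained_variance_ratio_.sum()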