import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
# model imports
from sklearn.linear_model import SGDRegressor, LinearRegression, RidgeCV, LassoLars, ElasticNetCV
from sklearn.isotonic import IsotonicRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
# feature selection and preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
# cross validation and grid search
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
# metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
%matplotlib inline
def numeric_map(iterator):
    """
    Construct a numeric map for a given iterable.

    Every distinct value is assigned the position of its first occurrence,
    and each element is then replaced by the number assigned to its value.

    Parameters
    ----------
    iterator : iterable of hashable values
        values for which the numeric map is to be built; consumed exactly
        once, so one-shot iterators and generators are supported

    Returns
    -------
    list
        one integer per input element, in input order
    """
    # materialize once: the values are walked twice (build the map, then
    # translate) and a one-shot iterator would be empty on the second pass
    values = list(iterator)
    num_map = {}
    for idx, val in enumerate(values):
        # setdefault keeps the index of the first occurrence only
        num_map.setdefault(val, idx)
    return [num_map[val] for val in values]
def plot_corr(df, size_x=30, size_y=30):
    """
    Draw an annotated seaborn heatmap of the pairwise column correlations.
    Any feature slicing of the dataframe must be done before calling this.

    Parameters
    ----------
    df : {pandas.DataFrame}
        values for which correlation is built and plotted
    size_x : number
        figure width passed to matplotlib's ``figsize``
    size_y : number
        figure height passed to matplotlib's ``figsize``

    Returns
    -------
    None
    """
    def three_decimals(value):
        # format/parse round trip keeps the cell annotations short
        return float("%.3f" % value)

    # open a figure of the requested size before seaborn draws
    plt.subplots(figsize=(size_x, size_y))
    corr = df.corr()
    # truncate correlation matrix
    for name in corr.columns:
        corr[name] = corr[name].apply(three_decimals)
    # seaborn plot
    sb.heatmap(corr, annot=True, fmt="g", cmap="viridis")
    plt.show()
def regression_report(y_true, y_pred, estimator=""):
    """
    Print the mean absolute error and mean squared error of a prediction.

    Parameters
    ----------
    y_true : numpy.1darray
        true labels for test data
    y_pred : numpy.1darray
        predicted labels for test data
    estimator : str
        name of estimator, used as the prefix of both output lines

    Returns
    -------
    None
    """
    # print(<single expression>) produces the same output whether print is a
    # statement (Python 2) or a function (Python 3)
    print("%s mean absolute error: %s" % (estimator, mean_absolute_error(y_true, y_pred)))
    print("%s mean squared error: %s" % (estimator, mean_squared_error(y_true, y_pred)))
def ohe_transform(df):
    """
    One hot encode dataframe (categorical variables). It is assumed that the
    dataframe has been sliced (i.e. correct categorical features have been selected).

    Parameters
    ----------
    df : pandas.DataFrame
        structure to be transformed

    Returns
    -------
    pandas.DataFrame
        dense encoded matrix whose columns are the vectorizer's feature
        names; the frame gets a fresh default index (df's index is not kept)
    """
    dv = DictVectorizer(sparse=False)
    # one {column: value} dict per row; to_dict avoids constructing a Series
    # for every row the way iterrows does
    cat_dict = df.to_dict(orient="records")
    return pd.DataFrame(dv.fit_transform(cat_dict), columns=dv.get_feature_names())
def model(estimator, X_train, y_train, X_test, y_test, estimator_name):
    """
    General model method which trains, predicts, and reports.

    Prints a header with the estimator name and its parameters, fits on the
    training data, predicts the test data and prints the error report.

    Parameters
    ----------
    estimator : sklearn.{general}
        scikit-learn estimator with methods: fit, predict, get_params ;
        assumes estimators have been passed with optimal params
    X_train : numpy.ndarray
        training set
    y_train : numpy.1darray
        training label set
    X_test : numpy.ndarray
        test set
    y_test : numpy.1darray
        test label set
    estimator_name : string
        name for reporting purposes

    Returns
    -------
    None
    """
    # single-argument print(...) is valid as both a Python 2 statement and a
    # Python 3 function call, with identical output
    print("========== %s ==========" % estimator_name)
    print(estimator.get_params())
    estimator.fit(X_train, y_train)
    prediction = estimator.predict(X_test)
    regression_report(y_test, prediction, estimator=estimator_name)
    # blank separator between consecutive reports
    print("\n")
def get_feat_importance(estimator, features):
    """
    Retrieve a dataframe of feature importances based on the estimator used.
    Sorts in descending order of importance.

    Parameters
    ----------
    estimator : {sklearn.ensemble}
        fitted estimator exposing ``feature_importances_``
    features : {list}
        data set features, in the same order as ``feature_importances_``

    Returns
    -------
    pandas.DataFrame
        columns "feature" and "importance", most important first
    """
    # materialize the pairs: on Python 3 zip() is a lazy iterator, which not
    # every pandas version accepts as DataFrame input
    pairs = list(zip(features, estimator.feature_importances_))
    importance = pd.DataFrame(pairs, columns=["feature", "importance"])
    return importance.sort_values(by="importance", ascending=False)
Categorical and continuous features are loaded into memory via Pandas.
# load the training set; columns whose name contains "cat" / "cont" are
# treated as the categorical / continuous features
df_train = pd.read_csv("train.csv")
categorical_features = [col for col in df_train.columns if "cat" in col]
continuous_features = [col for col in df_train.columns if "cont" in col]
# explicit copies: columns are assigned on these frames later on, which on
# a plain slice of df_train raises pandas' SettingWithCopyWarning
df_categorical = df_train[["id", "loss"] + categorical_features].copy()
df_continuous = df_train[["id", "loss"] + continuous_features].copy()
# unique identifiers == number of records => we don't have to worry about grouping IDs
len(df_categorical) == len(set(df_categorical.id))
df_categorical.head()
Attempt to remove categorical features by observing how strongly correlated said features are. Str/char categorical features are converted via a numeric map, which is built simply through index-value mappings of a given feature (i.e. dataframe column).
# remove redundant categorical features
# build numeric map for each of the categories and plot correlation
# copy the key columns so the *_num columns are added to an independent frame
# rather than to a slice of df_categorical (avoids SettingWithCopyWarning)
df_categorical_num = df_categorical[["id", "loss"]].copy()
for col in df_categorical.columns:
    if col in ["id", "loss"]:
        continue
    df_categorical_num["%s_num" % col] = numeric_map(df_categorical[col])
# plot correlation matrix between sets of features
# group 'single_char' and 'multi' columns
single_char = []
for col in df_categorical.columns:
    if col in ["id", "loss"]:
        continue
    # a column is 'single char' when every one of its values has length 1
    if all(len(item) == 1 for item in df_categorical[col]):
        single_char.append(col)
# remaining feature columns, kept in dataframe column order so the heatmap
# layout does not depend on set iteration order
multi_cols = [col for col in df_categorical.columns
              if col not in ["id", "loss"] and col not in single_char]
df_categorical_num_single = df_categorical_num[["%s_num" % col for col in single_char]]
df_categorical_num_multi = df_categorical_num[["%s_num" % col for col in multi_cols]]
# no strongly correlated multi-char columns
plot_corr(df_categorical_num_multi, size_x=5, size_y=5)
cat73_num, cat74_num strongly correlated with cat100_num.
cat73_num strongly correlated with cat74_num.
cat6_num strongly correlated with cat50_num.
cat9_num strongly correlated with cat2_num.
Do not discard any of the strongly correlated categorical features, as there is no perfect correlation.
# heatmap restricted to the single-character categorical columns
plot_corr(df_categorical_num_single, 75, 75)
# heatmap over every column of the numeric-mapped categorical frame
plot_corr(df_categorical_num, 75, 75)
Strong correlation between cont11 and cont12 (0.994).
Strong correlation between cont9 and cont1 (0.93).
Strong correlation between cont6 and cont[7-13].
No completely redundant features observed.
# correlation heatmap of the continuous frame
plot_corr(df_continuous, 15, 15)
Estimators to be used require continuous/discrete numeric form for all features. These forms are obtained for the categorical features through one-hot-encoding.
features = [col for col in df_categorical.columns if col not in ["id", "loss"]]
# use a dict vectorizer as we need to retrieve feature names
dv = DictVectorizer(sparse=False)
# one {column: value} dict per record; to_dict avoids building a Series for
# every row the way iterrows does
cat_dict = df_categorical[features].to_dict(orient="records")
ohe = pd.DataFrame(dv.fit_transform(cat_dict), columns=dv.get_feature_names())
# design matrix and target used by the feature-selection experiments
X_cat = ohe.values
y_cat = df_categorical["loss"].values
Tried different regressors and SelectFromModel method to determine best set of categorical features to train final estimators. The SelectFromModel method uses an importance threshold (for keeping/discarding) which is dependent on the penalty parameter of the regressor being used; some regressors (e.g. SGDRegressor) use an 'l1 regularization' parameter, thus resulting in a threshold of ~1e-5.
Selected features are combined at the end to create an aggregate feature subset for training estimators. The first code block keeps all features and re-labels unimportant features as being 'NOISE'.
# ordered 80/20 hold-out split (rows are not shuffled)
split = int(0.8 * X_cat.shape[0])
X_cat_train, X_cat_test = X_cat[:split], X_cat[split:]
y_cat_train, y_cat_test = y_cat[:split], y_cat[split:]
lr_reg = LinearRegression()
lr_reg.fit(X_cat_train, y_cat_train)
loss_prediction = lr_reg.predict(X_cat_test)
# compute mean absolute and squared error; the shared helper prints exactly
# the two "Linear Regression mean ... error" lines
regression_report(y_cat_test, loss_prediction, estimator="Linear Regression")
# keep the one-hot columns SelectFromModel marks as important
lr_sfm = SelectFromModel(lr_reg, prefit=True)
lr_support = lr_sfm.get_support()
lr_feature_selection = ohe.columns[lr_support]
# default parameters for sgd
sgd_reg = SGDRegressor(verbose=2)
sgd_reg.fit(X_cat_train, y_cat_train)
# prediction and error output
loss_prediction = sgd_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="SGD Regressor")
# one-hot columns kept by SelectFromModel for the fitted SGD model
sgd_sfm = SelectFromModel(sgd_reg, prefit=True)
sgd_feature_selection = ohe.columns[sgd_sfm.get_support()]
# number of selected columns; print(...) form runs on Python 2 and 3 alike
print(len(sgd_feature_selection))
# linear svm - regression
lsvr_reg = LinearSVR(verbose=2)  # default params => consider lowering C for greater regularization
lsvr_reg.fit(X_cat_train, y_cat_train)
loss_prediction = lsvr_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Linear SVR")
# one-hot columns kept by SelectFromModel for the fitted linear SVR
lsvr_sfm = SelectFromModel(lsvr_reg, prefit=True)
lsvr_feature_selection = ohe.columns[lsvr_sfm.get_support()]
# number of selected columns; print(...) form runs on Python 2 and 3 alike
print(len(lsvr_feature_selection))
# ridge with cross validation; default params => consider increasing alpha (C^-1)
rcv_reg = RidgeCV()
rcv_reg.fit(X_cat_train, y_cat_train)
loss_prediction = rcv_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="RidgeCV")
# one-hot columns kept by SelectFromModel for the fitted ridge model
rcv_sfm = SelectFromModel(rcv_reg, prefit=True)
rcv_feature_selection = ohe.columns[rcv_sfm.get_support()]
# number of selected columns; print(...) form runs on Python 2 and 3 alike
print(len(rcv_feature_selection))
# lars lasso; default params
ll_reg = LassoLars()
ll_reg.fit(X_cat_train, y_cat_train)
loss_prediction = ll_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Lasso Lars")
# one-hot columns kept by SelectFromModel for the fitted lasso model
ll_sfm = SelectFromModel(ll_reg, prefit=True)
ll_feature_selection = ohe.columns[ll_sfm.get_support()]
# number of selected columns; print(...) form runs on Python 2 and 3 alike
print(len(ll_feature_selection))
# strictly 8 features selected
# lowest mean absolute error with linear svm
# lowest mean squared error with SGD
# mean absolute error is evaluation metric for competition
# combine features of best performing regressor
categorical_features = list(sgd_feature_selection) + list(lsvr_feature_selection)
categorical_features = list(set(categorical_features))
# create structure of individual features: {column: [selected values]}
cat_ft_struct = {}
for feature in categorical_features:
    # one-hot names look like "<column>=<value>"; split on the first "="
    # only, so a value that itself contains "=" stays intact
    ft, val = feature.split("=", 1)
    cat_ft_struct.setdefault(ft, []).append(val)
# we use the newly created structure to filter out noise (irrelevant categorical values)
df_categorical.head()
df_categorical_transform = df_categorical.copy()
# apply transformation to categorical features with newly created structure
# NOTE(review): columns with no selected value at all are absent from
# cat_ft_struct and therefore keep their original values - confirm intended
for col, arr in cat_ft_struct.items():
    if col not in df_categorical_transform.columns:
        continue
    # keep selected values, relabel everything else as "NOISE" (vectorized
    # equivalent of testing each element against arr)
    df_categorical_transform[col] = df_categorical_transform[col].where(
        df_categorical_transform[col].isin(arr), "NOISE")
df_categorical_transform.head()
cat_features = [col for col in df_categorical_transform.columns if col not in ["id", "loss"]]
cat_ohe = ohe_transform(df_categorical_transform[cat_features])
# both frames get the same positional key so ids can be joined back onto the
# encoded rows; the helper column is dropped again afterwards
cat_ohe["df_index"] = cat_ohe.index
df_categorical_transform["df_index"] = range(0, len(df_categorical_transform))
df_categorical_transform = df_categorical_transform[["id", "df_index"]].merge(cat_ohe, on="df_index", how="inner")
df_categorical_transform.drop("df_index", axis=1, inplace=True)
Recreate categorical feature set, but remove unimportant features. The following block should run if it is deemed ok to overwrite the previously created categorical features and discard the noise feature.
categorical_features = list(sgd_feature_selection) + list(lsvr_feature_selection)
# de-duplicate; sorted so the resulting column order is reproducible instead
# of following set iteration order
categorical_features = sorted(set(categorical_features))
# copy: a helper column is added next and must not be written onto a slice
# of ohe (SettingWithCopyWarning)
cat_ohe = ohe[categorical_features].copy()
# both frames get the same positional key so ids can be joined back onto the
# encoded rows; the helper column is dropped from both afterwards
cat_ohe["df_index"] = cat_ohe.index
df_categorical["df_index"] = range(0, len(df_categorical))
df_categorical_transform = df_categorical[["id", "df_index"]].merge(cat_ohe, on="df_index", how="inner")
df_categorical.drop("df_index", inplace=True, axis=1)
df_categorical_transform.drop("df_index", axis=1, inplace=True)
Instead of using the SelectFromModel approach to determine important categorical features, we train ensemble estimators and use the built-in feature ranking methods. An arbitrary number of 'top features' were chosen from each of the estimators.
Random Forest, Extra Trees, and Gradient Boost Regressors were the trained estimators.
# print(...) with one argument runs unchanged on Python 2 and Python 3
print("======== Random Forest Regression ========")
rf_reg = RandomForestRegressor(n_estimators=10, verbose=2)
rf_reg.fit(X_cat_train, y_cat_train)
loss_prediction = rf_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Random Forest Regressor")
# importance table is already sorted in descending order
rf_feature_importance = get_feat_importance(rf_reg, ohe.columns)
# retrieve top 50 features
rf_feature_selection = rf_feature_importance[:50]
# print(...) with one argument runs unchanged on Python 2 and Python 3
print("======== Extra Trees Regression ========")
xt_reg = ExtraTreesRegressor(n_estimators=10, verbose=2)
xt_reg.fit(X_cat_train, y_cat_train)
loss_prediction = xt_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Extra Trees Regressor")
# importance table is already sorted in descending order
xt_feature_importance = get_feat_importance(xt_reg, ohe.columns)
# retrieve top 50 features
xt_feature_selection = xt_feature_importance[:50]
# print(...) with one argument runs unchanged on Python 2 and Python 3
print("======== Gradient Boost Regression ========")
gb_reg = GradientBoostingRegressor(n_estimators=10, verbose=2)
gb_reg.fit(X_cat_train, y_cat_train)
loss_prediction = gb_reg.predict(X_cat_test)
# pass the estimator name like the other ensemble cells do; without it the
# report lines are printed with an empty label
regression_report(y_cat_test, loss_prediction, estimator="Gradient Boost Regressor")
# importance table is already sorted in descending order
gb_feature_importance = get_feat_importance(gb_reg, ohe.columns)
# retrieve top 50 features
gb_feature_selection = gb_feature_importance[:50]
# merge ensemble based features and extract categorical feature names
# (outer joins => union of the three top-50 lists)
ensemble_features = rf_feature_selection.merge(xt_feature_selection, on="feature", how="outer")
ensemble_features = ensemble_features.merge(gb_feature_selection, on="feature", how="outer")
categorical_features = ensemble_features["feature"]
# one hot encoded categorical data set without noise
# copy: a helper column is added next and must not be written onto a slice
# of ohe (SettingWithCopyWarning)
cat_ohe = ohe[categorical_features].copy()
# both frames get the same positional key so ids can be joined back onto the
# encoded rows; the helper column is dropped from both afterwards
cat_ohe["df_index"] = cat_ohe.index
df_categorical["df_index"] = range(0, len(df_categorical))
df_categorical_transform = df_categorical[["id", "df_index"]].merge(cat_ohe, on="df_index", how="inner")
df_categorical.drop("df_index", inplace=True, axis=1)
df_categorical_transform.drop("df_index", inplace=True, axis=1)
# noise in OHE categorical can be discarded (corresponds to unimportant feature values)
# noise discarding should be done prior to running this section
# assumes categorical features are correctly created
# combine categorical and continuous variables
df_combined = df_continuous.merge(df_categorical_transform, on="id", how="inner")
df_combined.head()
combined_features = [col for col in df_combined.columns if col not in ["id", "loss"]]
With the newly combined categorical and continuous features, training and test sets were created. A set of regressors was run sequentially for three different types of feature sets: SelectFromModel + Noise, SelectFromModel with Noise removal, Ensemble-Ranked.
Best performing estimator was observed with SelectFromModel + Noise feature set, with a Linear Support Vector Machine. Wanted to use a non-RF estimator and determine whether the online benchmark (RF w/ entire feature set) can be beaten.
# design matrix / target built from the combined frame
X_combined = df_combined[combined_features]
y_combined = df_combined["loss"]
# ordered 80/20 hold-out split (rows are not shuffled)
split = int(0.8*len(df_combined))
X_combined_train, X_combined_test = X_combined[:split], X_combined[split:]
y_combined_train, y_combined_test = y_combined[:split], y_combined[split:]
# candidate estimators, mostly with library defaults
# note: continuous features are not scaled during preprocessing - they are
# assumed to be centered already
regressors = {
    # default epsilon-insensitive loss, i.e. an L1 loss on the residuals
    "Linear SV Regressor": LinearSVR(verbose=2, C=1.0),
    "RidgeCV": RidgeCV(),
    "Elasticnet CV": ElasticNetCV(),
    "Lasso LARs Regressor": LassoLars(verbose=2, alpha=0.1),
    "Extra Trees Regressor": ExtraTreesRegressor(n_estimators=100),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100),
    "Gradient Boost Regressor": GradientBoostingRegressor(n_estimators=100),
}
# The three loops below are identical on purpose: each one is meant to be run
# after the combined train/test sets have been rebuilt from a different
# categorical feature set. dict.items() is used because it exists on both
# Python 2 and Python 3 (iteritems is Python 2 only).

# model evaluation with SelectFromModel and noise inclusion
for reg_name, reg in regressors.items():
    model(reg, X_combined_train, y_combined_train, X_combined_test, y_combined_test, reg_name)
# model evaluation with SelectFromModel and noise removal
for reg_name, reg in regressors.items():
    model(reg, X_combined_train, y_combined_train, X_combined_test, y_combined_test, reg_name)
# model evaluation with ensemble-ranked feature selection (no noise feature)
for reg_name, reg in regressors.items():
    model(reg, X_combined_train, y_combined_train, X_combined_test, y_combined_test, reg_name)