Allstate Claims Severity Analysis

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# model imports
from sklearn.linear_model import SGDRegressor, LinearRegression, RidgeCV, LassoLars, ElasticNetCV
from sklearn.isotonic import IsotonicRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
# feature selection and preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
# cross validation and grid search
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
# metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

%matplotlib inline
In [2]:
def numeric_map(iterator):
    """
        Construct a numeric map for a given iterator.
        
        Parameters
        ----------
        iterator : sequence
            sequence of hashable values for which the numeric map is built
        
        Returns
        -------
        list of int
            each value replaced by the index of its first occurrence
    """
    num_map = {}
    for idx, val in enumerate(iterator):
        if val in num_map:
            continue
        num_map[val] = idx
    
    return [num_map[x] for x in iterator]
In [3]:
def plot_corr(df, size_x=30, size_y=30):
    """
        Plot seaborn correlation matrix. Dataframe feature slice should be
        conducted prior to invoking this method.
        
        Parameters
        ----------
        df : {pandas.DataFrame}
            values for which the correlation matrix is built and plotted
        
        size_x, size_y : int
            figure width and height in inches
    
        Returns
        -------
        None
    """
    fig, ax = plt.subplots(figsize=(size_x,size_y))
    corr = df.corr()
    # round correlation values to three decimals for readability
    corr = corr.round(3)
    # seaborn plot
    sb.heatmap(corr, annot=True, fmt="g", cmap="viridis")
    plt.show()
In [4]:
def regression_report(y_true, y_pred, estimator=""):
    """
        Helper function for error outputs.
        
        Parameters
        ----------
        y_true : numpy.1darray
            true labels for test data
        
        y_pred : numpy.1darray
            predicted labels for test data
        
        estimator : str
            name of estimator (for output purposes)
        
        Returns
        -------
        None
    """
    print "%s mean absolute error:  %s"% (estimator, mean_absolute_error(y_true, y_pred))
    print "%s mean squared error: %s" % (estimator, mean_squared_error(y_true, y_pred))
In [5]:
def ohe_transform(df):
    """
        One hot encode dataframe (categorical variables). It is assumed that the
        dataframe has been sliced (i.e. correct categorical features have been selected).
        
        Parameters
        ----------
        df : pandas.DataFrame
            structure to be transformed
        
        Returns
        -------
        pandas.DataFrame
    """
    dv = DictVectorizer(sparse=False)
    cat_dict = [dict(row[1]) for row in df.iterrows()]
    return pd.DataFrame(dv.fit_transform(cat_dict), columns=dv.get_feature_names())
In [6]:
def model(estimator, X_train, y_train, X_test, y_test, estimator_name):
    """
        General model method which trains, predicts, and reports.
        
        Parameters
        ----------
        estimator : sklearn.{general}
            scikit-learn estimator with methods: fit, predict ;
            assumes estimators have been passed with optimal params 
        
        X_train : numpy.ndarray
            training set
        
        y_train : numpy.1darray
            training label set
        
        X_test : numpy.ndarray
            test set
        
        y_test : numpy.1darray
            test label set
        
        estimator_name : string
            name for reporting purposes
        
        Returns
        -------
        None
    """
    print "========== %s ==========" % estimator_name
    print estimator.get_params()
    estimator.fit(X_train, y_train)
    prediction = estimator.predict(X_test)
    regression_report(y_test, prediction, estimator=estimator_name)
    print "\n"
In [61]:
def get_feat_importance(estimator, features):
    """
        Retrieve a dataframe of feature importances based on the estimator used.
        Sorts in descending order.
        
        Parameters
        ----------
        estimator : {sklearn.ensemble}
            ensemble method used for training + testing
        
        features : {list}
            data set features
        
        Returns
        -------
        pandas.DataFrame
    """
    feat_imp = pd.DataFrame(zip(features, estimator.feature_importances_),
                            columns=["feature", "importance"])
    return feat_imp.sort_values(by="importance", ascending=False)


Data Loading

Categorical and continuous features are loaded into memory via Pandas.

In [7]:
df_train = pd.read_csv("train.csv")
In [8]:
categorical_features = [col for col in df_train.columns if "cat" in col]
continuous_features = [col for col in df_train.columns if "cont" in col]
In [9]:
df_categorical = df_train[["id", "loss"] + categorical_features].copy()
In [10]:
df_continuous = df_train[["id", "loss"] + continuous_features]
In [11]:
# unique identifiers == number of records => we don't have to worry about grouping IDs
len(df_categorical) == len(set(df_categorical.id))
Out[11]:
True
In [12]:
df_categorical.head()
Out[12]:
id loss cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 ... cat107 cat108 cat109 cat110 cat111 cat112 cat113 cat114 cat115 cat116
0 1 2213.18 A B A B A A A A ... J G BU BC C AS S A O LB
1 2 1283.60 A B A A A A A A ... K K BI CQ A AV BM A O DP
2 5 3005.09 A B A A B A A A ... F A AB DK A C AF A I GK
3 10 939.85 B B A B A A A A ... K K BI CS C N AE A O DJ
4 11 2763.85 A B A B A A A A ... G B H C C Y BM A K CK

5 rows × 118 columns

Feature Set Reduction

Attempt to remove redundant categorical features by examining how strongly correlated they are. The string/char categorical features are converted with a numeric map, built from the index-value mapping of each feature (i.e. dataframe column): every distinct value is replaced by the index of its first occurrence.
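As a minimal sketch of the mapping (using a hypothetical column of letters), each distinct value receives the index of its first occurrence, so the resulting codes are not necessarily contiguous:

# hypothetical example column
sample = ["A", "B", "A", "C", "B"]
print numeric_map(sample)  # [0, 1, 0, 3, 1] -- "C" gets 3, the position where it first appears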

In [14]:
# remove redundant categorical features
In [13]:
# build numeric map for each of the categories and plot correlation
df_categorical_num = df_categorical[["id", "loss"]].copy()

for col in df_categorical.columns:
    if col in ["id", "loss"]:
        continue
    df_categorical_num["%s_num" % df_categorical[col].name] = numeric_map(df_categorical[col])
In [29]:
# plot correlation matrix between sets of features
In [14]:
# group 'single_char' and 'multi' columns
single_char = []

for col in df_categorical.columns:
    if col in ["id", "loss"]:
        continue
    elif all(len(item) == 1 for item in df_categorical[col]):
        single_char.append(col)
In [15]:
multi_cols = list(set(df_categorical.columns) - set(single_char))
multi_cols = [col for col in multi_cols if col not in ["id", "loss"]]
In [16]:
df_categorical_num_single = df_categorical_num[["%s_num" % col for col in single_char]]
df_categorical_num_multi = df_categorical_num[["%s_num" % col for col in multi_cols]]
In [17]:
# no strongly correlated multi-char columns
plot_corr(df_categorical_num_multi, size_x=5, size_y=5)

Single-Character Categorical Feature Correlation

cat73_num and cat74_num are strongly correlated with cat100_num.
cat73_num is strongly correlated with cat74_num.
cat6_num is strongly correlated with cat50_num.
cat9_num is strongly correlated with cat2_num.
None of the strongly correlated categorical features are discarded, since no correlation is perfect. (A numeric spot-check of these pairs follows the correlation plots below.)

In [18]:
plot_corr(df_categorical_num_single, size_x=75, size_y=75)
In [67]:
# plot the correlation between all of the categorical variables
plot_corr(df_categorical_num, size_x=75, size_y=75)
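As a quick numeric spot-check of the pairs flagged above (a small sketch over the df_categorical_num frame built earlier):

# inspect specific correlated pairs directly instead of reading them off the heatmap
print df_categorical_num[["cat73_num", "cat74_num", "cat100_num"]].corr()
print df_categorical_num[["cat2_num", "cat9_num"]].corr()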

Continuous Feature Correlation Analysis

Strong correlation between cont11 and cont12 (0.994).
Strong correlation between cont9 and cont1 (0.93).
Strong correlation between cont6 and cont[7-13].
No completely redundant features observed.

In [54]:
# check correlation of continuous variables
plot_corr(df_continuous, size_x=15, size_y=15)


Categorical Feature Processing and Selection

The estimators used downstream require numeric (continuous or discrete) inputs for all features, so the categorical features are converted via one-hot encoding.
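As a minimal sketch of the encoding (using a small hypothetical frame), DictVectorizer expands every string value into an indicator column named feature=value:

# hypothetical two-row example
toy = pd.DataFrame({"cat1": ["A", "B"], "cat2": ["A", "A"]})
toy_dv = DictVectorizer(sparse=False)
toy_ohe = pd.DataFrame(toy_dv.fit_transform([dict(row[1]) for row in toy.iterrows()]),
                       columns=toy_dv.get_feature_names())
# columns: cat1=A, cat1=B, cat2=A ; row 0 -> [1, 0, 1], row 1 -> [0, 1, 1]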

In [19]:
features = [col for col in df_categorical.columns if col not in ["id", "loss"]]
In [20]:
# use a dict vectorizer as we need to retrieve feature names
dv = DictVectorizer(sparse=False)
In [21]:
cat_dict = [dict(row[1]) for row in df_categorical[features].iterrows()]
In [22]:
ohe = pd.DataFrame(dv.fit_transform(cat_dict), columns=dv.get_feature_names())
In [24]:
X_cat = ohe.values
In [25]:
y_cat = df_categorical["loss"].values

Different regressors were tried with the SelectFromModel method to determine the best set of categorical features for training the final estimators. SelectFromModel keeps or discards features based on an importance threshold whose default depends on the penalty parameter of the regressor being used; for regressors with l1 regularization (e.g. an SGDRegressor with an l1 penalty), the default threshold is ~1e-5.

The selected features are combined at the end to create an aggregate feature subset for training the estimators. The first approach keeps all features and re-labels the unimportant values as 'NOISE'.
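A minimal sketch of the selection step (assuming a fitted linear estimator, e.g. the sgd_reg trained a few cells below); the threshold can also be set explicitly instead of relying on the penalty-dependent default:

# keep only features whose coefficient magnitude exceeds an explicit threshold
sfm = SelectFromModel(sgd_reg, prefit=True, threshold=1e-5)
selected_columns = ohe.columns[sfm.get_support()]
X_cat_selected = sfm.transform(X_cat)  # reduced feature matrix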

In [26]:
split = int(0.8*X_cat.shape[0])
X_cat_train = X_cat[:split]
y_cat_train = y_cat[:split]
X_cat_test = X_cat[split:]
y_cat_test = y_cat[split:]
In [27]:
lr_reg = LinearRegression()
In [28]:
lr_reg.fit(X_cat_train, y_cat_train)
Out[28]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [29]:
loss_prediction = lr_reg.predict(X_cat_test)
In [30]:
# compute mean absolute and squared error
print "Linear Regression mean absolute error: %s" % mean_absolute_error(y_cat_test, loss_prediction)
print "Linear Regression mean squared error: %s" % mean_squared_error(y_cat_test, loss_prediction)
Linear Regression mean absolute error: 283699593900.0
Linear Regression mean squared error: 1.13904159969e+27
In [31]:
lr_sfm = SelectFromModel(lr_reg, prefit=True)
In [32]:
lr_support = lr_sfm.get_support()
In [33]:
lr_feature_selection = ohe.columns[lr_support]
In [34]:
# default parameters for sgd
sgd_reg = SGDRegressor(verbose=2)
In [35]:
sgd_reg.fit(X_cat_train, y_cat_train)
-- Epoch 1
Norm: 4320.13, NNZs: 1122, Bias: 195.153960, T: 150654, Avg. loss: 2278707.489124
Total training time: 0.48 seconds.
-- Epoch 2
Norm: 4948.34, NNZs: 1122, Bias: 210.946737, T: 301308, Avg. loss: 2241795.545410
Total training time: 0.92 seconds.
-- Epoch 3
Norm: 5275.51, NNZs: 1122, Bias: 217.608155, T: 451962, Avg. loss: 2224630.979105
Total training time: 1.35 seconds.
-- Epoch 4
Norm: 5556.14, NNZs: 1122, Bias: 222.634227, T: 602616, Avg. loss: 2213412.376394
Total training time: 1.78 seconds.
-- Epoch 5
Norm: 5778.21, NNZs: 1122, Bias: 232.420007, T: 753270, Avg. loss: 2205385.657833
Total training time: 2.21 seconds.
Out[35]:
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=2, warm_start=False)
In [36]:
# prediction and error output
loss_prediction = sgd_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="SGD Regressor")
SGD Regressor mean absolute error:  1318.12251185
SGD Regressor mean squared error: 4146285.44085
In [37]:
sgd_sfm = SelectFromModel(sgd_reg, prefit=True)
sgd_feature_selection = ohe.columns[sgd_sfm.get_support()]
print len(sgd_feature_selection)
377
In [38]:
# linear svm - regression
lsvr_reg = LinearSVR(verbose=2) # default params => consider lowering C for greater regularization
lsvr_reg.fit(X_cat_train, y_cat_train)
loss_prediction = lsvr_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Linear SVR")
lsvr_sfm = SelectFromModel(lsvr_reg, prefit=True)
lsvr_feature_selection = ohe.columns[lsvr_sfm.get_support()]
print len(lsvr_feature_selection)
[LibLinear]Linear SVR mean absolute error:  1239.71445401
Linear SVR mean squared error: 4545297.97456
316
In [39]:
# ridge with cross validation; default params => consider increasing alpha (C^-1)
rcv_reg = RidgeCV()
rcv_reg.fit(X_cat_train, y_cat_train)
loss_prediction = rcv_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="RidgeCV")
rcv_sfm = SelectFromModel(rcv_reg, prefit=True)
rcv_feature_selection = ohe.columns[rcv_sfm.get_support()]
print len(rcv_feature_selection)
RidgeCV mean absolute error:  1296.06397858
RidgeCV mean squared error: 4098802.77408
239
In [40]:
# lars lasso; default params
ll_reg = LassoLars()
ll_reg.fit(X_cat_train, y_cat_train)
loss_prediction = ll_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Lasso Lars")
ll_sfm = SelectFromModel(ll_reg, prefit=True)
ll_feature_selection = ohe.columns[ll_sfm.get_support()]
print len(ll_feature_selection)
# only 8 features selected
Lasso Lars mean absolute error:  1551.74307047
Lasso Lars mean squared error: 5574987.55179
8
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 5 iterations, i.e. alpha=1.366e+00, with an active set of 5 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 7 iterations, i.e. alpha=1.108e+00, with an active set of 7 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
In [ ]:
# lowest mean absolute error with the linear SVR
# RidgeCV and SGD have the lowest mean squared errors
# mean absolute error is the competition's evaluation metric
In [ ]:
# combine the features selected by the best performing regressors (SGD and linear SVR)
In [90]:
categorical_features = list(sgd_feature_selection) + list(lsvr_feature_selection)
In [91]:
categorical_features = list(set(categorical_features))
In [45]:
# build a structure mapping each categorical feature to its selected (important) values
cat_ft_struct = {}
for feature in categorical_features:
    ft, val = feature.split("=")
    if ft not in cat_ft_struct:
        cat_ft_struct[ft] = []
    cat_ft_struct[ft].append(val)
In [ ]:
# we use the newly created structure to filter out noise (irrelevant categorical values)
In [46]:
df_categorical.head()
Out[46]:
id loss cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 ... cat107 cat108 cat109 cat110 cat111 cat112 cat113 cat114 cat115 cat116
0 1 2213.18 A B A B A A A A ... J G BU BC C AS S A O LB
1 2 1283.60 A B A A A A A A ... K K BI CQ A AV BM A O DP
2 5 3005.09 A B A A B A A A ... F A AB DK A C AF A I GK
3 10 939.85 B B A B A A A A ... K K BI CS C N AE A O DJ
4 11 2763.85 A B A B A A A A ... G B H C C Y BM A K CK

5 rows × 118 columns

In [47]:
df_categorical_transform = df_categorical.copy()
In [48]:
# apply transformation to categorical features with newly created structure
for col, arr in cat_ft_struct.iteritems():
    if col not in df_categorical_transform.columns:
        continue
    df_categorical_transform[col] = df_categorical_transform[col].apply(lambda x: x if x in arr else "NOISE")
In [89]:
df_categorical_transform.head()
Out[89]:
id cat80=B cat57=A cat79=D cat81=D cat12=B cat12=A cat109=BI cat113=G cat57=B ... cat116=KA cat116=K cat116=JY cat116=JX cat116=JW cat116=JV cat116=JT cat116=IY cat116=JR cat116=JQ
0 1 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2 0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 5 1.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 10 0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 11 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 86 columns

In [50]:
cat_features = [col for col in df_categorical_transform.columns if col not in ["id", "loss"]]
In [51]:
cat_ohe = ohe_transform(df_categorical_transform[cat_features])
In [52]:
cat_ohe["df_index"] = cat_ohe.index
df_categorical_transform["df_index"] = range(0, len(df_categorical_transform))
df_categorical_transform = df_categorical_transform[["id", "df_index"]].merge(cat_ohe, on="df_index", how="inner")
df_categorical_transform.drop("df_index", axis=1, inplace=True)

Recreate the categorical feature set, but with the unimportant features removed. The following block should be run only if it is acceptable to overwrite the previously created categorical features and discard the NOISE values.

In [ ]:
categorical_features = list(sgd_feature_selection) + list(lsvr_feature_selection)
categorical_features = list(set(categorical_features))
cat_ohe = ohe[categorical_features].copy()
cat_ohe["df_index"] = cat_ohe.index
df_categorical["df_index"] = range(0, len(df_categorical))
df_categorical_transform = df_categorical[["id", "df_index"]].merge(cat_ohe, on="df_index", how="inner")
df_categorical.drop("df_index", inplace=True, axis=1)
df_categorical_transform.drop("df_index", axis=1, inplace=True)
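Alternatively to the block above (a minimal sketch, assuming the NOISE relabeling and the cat_ohe encoding from the earlier cells), the NOISE indicator columns can simply be dropped from the already-encoded frame:

# drop the one-hot columns that encode the NOISE placeholder value
noise_cols = [col for col in cat_ohe.columns if col.endswith("=NOISE")]
cat_ohe = cat_ohe.drop(noise_cols, axis=1)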


Ensemble-Based Feature Selection

Instead of using the SelectFromModel approach to determine the important categorical features, ensemble estimators are trained and their built-in feature importance rankings are used. An arbitrary number of 'top features' (50) is taken from each estimator.

Random Forest, Extra Trees, and Gradient Boosting regressors were the trained estimators.

In [65]:
print "======== Random Forest Regression ========"
rf_reg = RandomForestRegressor(n_estimators=10, verbose=2)
rf_reg.fit(X_cat_train, y_cat_train)
loss_prediction = rf_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Random Forest Regressor")
rf_feature_importance = get_feat_importance(rf_reg, ohe.columns)
# retrieve top 50 features
rf_feature_selection = rf_feature_importance[:50]
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  5.9min finished
Random Forest Regressor mean absolute error:  1303.31333305
Random Forest Regressor mean squared error: 4294890.1558
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
In [66]:
print "======== Extra Trees Regression ========"
xt_reg = ExtraTreesRegressor(n_estimators=10, verbose=2)
xt_reg.fit(X_cat_train, y_cat_train)
loss_prediction = xt_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Extra Trees Regressor")
xt_feature_importance = get_feat_importance(xt_reg, ohe.columns)
# retrieve top 50 features
xt_feature_selection = xt_feature_importance[:50]
======== Extra Trees Regression ========
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  8.9min finished
Extra Trees Regressor mean absolute error:  1658.33884381
Extra Trees Regressor mean squared error: 6946268.5101
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
In [68]:
print "======== Gradient Boost Regression ========"
gb_reg = GradientBoostingRegressor(n_estimators=10, verbose=2)
gb_reg.fit(X_cat_train, y_cat_train)
loss_prediction = gb_reg.predict(X_cat_test)
regression_report(y_cat_test, loss_prediction, estimator="Gradient Boost Regressor")
gb_feature_importance = get_feat_importance(gb_reg, ohe.columns)
# retrieve top 50 features
gb_feature_selection = gb_feature_importance[:50]
======== Gradient Boost Regression ========
      Iter       Train Loss   Remaining Time 
         1     7961471.9050            1.59m
         2     7519249.8332            1.39m
         3     7153073.8244            1.21m
         4     6853938.5325            1.04m
         5     6588834.0114           52.54s
         6     6368165.6518           42.24s
         7     6174584.2517           31.80s
         8     6023337.9161           21.26s
         9     5878722.6774           10.61s
        10     5756283.9600            0.00s
Gradient Boost Regressor mean absolute error:  1561.95926986
Gradient Boost Regressor mean squared error: 5639724.20721
In [69]:
# merge ensemble based features and extract categorical feature names
ensemble_features = rf_feature_selection.merge(xt_feature_selection, on="feature", how="outer")
ensemble_features = ensemble_features.merge(gb_feature_selection, on="feature", how="outer")
categorical_features = ensemble_features["feature"]
In [74]:
# one hot encoded categorical data set without noise
cat_ohe = ohe[categorical_features].copy()
In [77]:
cat_ohe["df_index"] = cat_ohe.index
df_categorical["df_index"] = range(0, len(df_categorical))
df_categorical_transform = df_categorical[["id", "df_index"]].merge(cat_ohe, on="df_index", how="inner")
df_categorical.drop("df_index", inplace=True, axis=1)
df_categorical_transform.drop("df_index", inplace=True, axis=1)
In [ ]:
# noise in OHE categorical can be discarded (corresponds to unimportant feature values)
# noise discarding should be done prior to running this section
# assumes categorical features are correctly created
In [101]:
# combine categorical and continuous variables
df_combined = df_continuous.merge(df_categorical_transform, on="id", how="inner")
In [102]:
df_combined.head()
Out[102]:
id loss cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 ... cat116=HN cat42=B cat42=A cat41=A cat41=B cat110=EM cat110=EL cat110=EE cat110=EG cat110=EF
0 1 2213.18 0.726300 0.245921 0.187583 0.789639 0.310061 0.718367 0.335060 0.30260 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2 1283.60 0.330514 0.737068 0.592681 0.614134 0.885834 0.438917 0.436585 0.60087 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
2 5 3005.09 0.261841 0.358319 0.484196 0.236924 0.397069 0.289648 0.315545 0.27320 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
3 10 939.85 0.321594 0.555782 0.527991 0.373816 0.422268 0.440945 0.391128 0.31796 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
4 11 2763.85 0.273204 0.159990 0.527991 0.473202 0.704268 0.178193 0.247408 0.24564 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 463 columns

In [103]:
combined_features = [col for col in df_combined.columns if col not in ["id", "loss"]]


Model Training and Evaluation

With the newly combined categorical and continuous features, training and test sets were created. A set of regressors was run sequentially on three different feature sets: SelectFromModel + NOISE, SelectFromModel with NOISE removed, and ensemble-ranked.

The best performance was observed with the SelectFromModel + NOISE feature set, where the Linear Support Vector Regressor was the strongest estimator after the Random Forest. A non-RF estimator was wanted in order to see whether the online benchmark (an RF trained on the entire feature set) can be beaten.

In [104]:
X_combined = df_combined[combined_features]
y_combined = df_combined["loss"]
split = int(0.8*len(df_combined))
X_combined_train = X_combined[:split]
y_combined_train = y_combined[:split]
X_combined_test = X_combined[split:]
y_combined_test = y_combined[split:]
In [105]:
# mostly default parameters for a set of regressors
# note: no scaling applied to the continuous features during preprocessing (values are already on a common 0-1 scale)
regressors = {
    "Linear SV Regressor": LinearSVR(verbose=2, C=1.0), # C is the inverse regularization strength
    "RidgeCV": RidgeCV(),
    "Elasticnet CV": ElasticNetCV(),
    "Lasso LARs Regressor": LassoLars(verbose=2, alpha=0.1),
    "Extra Trees Regressor": ExtraTreesRegressor(n_estimators=100),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100),
    "Gradient Boost Regressor": GradientBoostingRegressor(n_estimators=100)
}
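The regressors above mostly use default hyperparameters. GridSearchCV was imported at the top of the notebook but is not used below; as a minimal sketch (the C grid is illustrative only), the Linear SVR could be tuned against mean absolute error like this:

# small grid over the regularization parameter C for the linear SVR
param_grid = {"C": [0.1, 1.0, 10.0]}
# scorer name depends on the sklearn version: "mean_absolute_error" (older) vs "neg_mean_absolute_error" (newer)
gs = GridSearchCV(LinearSVR(), param_grid, scoring="mean_absolute_error", cv=3)
gs.fit(X_combined_train, y_combined_train)
print gs.best_params_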
In [60]:
# model evaluation with SelectFromModel and noise inclusion
for reg_name, reg in regressors.iteritems():
    model(reg, X_combined_train, y_combined_train, X_combined_test, y_combined_test, reg_name)
========== RidgeCV ==========
{'normalize': False, 'alphas': (0.1, 1.0, 10.0), 'fit_intercept': True, 'gcv_mode': None, 'store_cv_values': False, 'scoring': None, 'cv': None}
RidgeCV mean absolute error:  1298.32763397
RidgeCV mean squared error: 4056819.04109


========== SGD Regressor ==========
{'warm_start': False, 'loss': 'squared_loss', 'eta0': 0.01, 'verbose': 2, 'shuffle': True, 'fit_intercept': True, 'l1_ratio': 0.15, 'average': False, 'n_iter': 5, 'penalty': 'l2', 'power_t': 0.25, 'random_state': None, 'epsilon': 0.1, 'alpha': 0.0001, 'learning_rate': 'optimal'}
-- Epoch 1
Norm: 8579358231330.08, NNZs: 532, Bias: 988692739448.226562, T: 150654, Avg. loss: 1685678144631071920866983936.000000
Total training time: 0.21 seconds.
-- Epoch 2
Norm: 3686357408618.76, NNZs: 532, Bias: 1001833907600.564453, T: 301308, Avg. loss: 845032087909275336515256320.000000
Total training time: 0.44 seconds.
-- Epoch 3
Norm: 2068535631393.62, NNZs: 532, Bias: 996995419168.728271, T: 451962, Avg. loss: 563714432359246172352675840.000000
Total training time: 0.66 seconds.
-- Epoch 4
Norm: 908432740690.00, NNZs: 532, Bias: 999602444819.506104, T: 602616, Avg. loss: 422863517143592568128274432.000000
Total training time: 0.88 seconds.
-- Epoch 5
Norm: 258251122975.87, NNZs: 532, Bias: 992910390401.305542, T: 753270, Avg. loss: 338291837372737203818463232.000000
Total training time: 1.11 seconds.
SGD Regressor mean absolute error:  21938378490.6
SGD Regressor mean squared error: 4.93737694175e+20


========== Extra Trees Regressor ==========
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': False, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'auto', 'max_depth': None}
Extra Trees Regressor mean absolute error:  1269.93137282
Extra Trees Regressor mean squared error: 3974631.87302


========== Gradient Boost Regressor ==========
{'presort': 'auto', 'loss': 'ls', 'verbose': 0, 'subsample': 1.0, 'max_leaf_nodes': None, 'learning_rate': 0.1, 'warm_start': False, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'init': None, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'max_features': None, 'alpha': 0.9, 'max_depth': 3}
Gradient Boost Regressor mean absolute error:  1247.71635284
Gradient Boost Regressor mean squared error: 3745890.22689


========== Lasso LARs Regressor ==========
{'normalize': True, 'fit_path': True, 'fit_intercept': True, 'positive': False, 'max_iter': 500, 'eps': 2.2204460492503131e-16, 'precompute': 'auto', 'copy_X': True, 'alpha': 0.1, 'verbose': 2}
.
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 11 iterations, i.e. alpha=9.379e-01, with an active set of 11 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 11 iterations, i.e. alpha=9.379e-01, with an active set of 11 regressors, and the smallest cholesky pivot element being 2.107e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 14 iterations, i.e. alpha=6.781e-01, with an active set of 14 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 16 iterations, i.e. alpha=5.869e-01, with an active set of 16 regressors, and the smallest cholesky pivot element being 2.107e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 18 iterations, i.e. alpha=5.480e-01, with an active set of 18 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 20 iterations, i.e. alpha=5.225e-01, with an active set of 20 regressors, and the smallest cholesky pivot element being 2.581e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 20 iterations, i.e. alpha=5.225e-01, with an active set of 20 regressors, and the smallest cholesky pivot element being 4.344e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 24 iterations, i.e. alpha=4.763e-01, with an active set of 24 regressors, and the smallest cholesky pivot element being 2.107e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 29 iterations, i.e. alpha=4.522e-01, with an active set of 29 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 29 iterations, i.e. alpha=4.522e-01, with an active set of 29 regressors, and the smallest cholesky pivot element being 1.054e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 31 iterations, i.e. alpha=4.454e-01, with an active set of 31 regressors, and the smallest cholesky pivot element being 1.054e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 31 iterations, i.e. alpha=4.454e-01, with an active set of 31 regressors, and the smallest cholesky pivot element being 2.107e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 31 iterations, i.e. alpha=4.454e-01, with an active set of 31 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 34 iterations, i.e. alpha=4.248e-01, with an active set of 34 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 35 iterations, i.e. alpha=4.085e-01, with an active set of 35 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 36 iterations, i.e. alpha=3.963e-01, with an active set of 36 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 36 iterations, i.e. alpha=3.963e-01, with an active set of 36 regressors, and the smallest cholesky pivot element being 3.942e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:334: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 37 iterations, alpha=3.945e-01, previous alpha=3.940e-01, with an active set of 36 regressors.
  ConvergenceWarning)
Lasso LARs Regressor mean absolute error:  1406.91253175
Lasso LARs Regressor mean squared error: 4768077.1841


========== Elasticnet CV ==========
{'normalize': False, 'alphas': None, 'n_jobs': 1, 'verbose': 0, 'fit_intercept': True, 'selection': 'cyclic', 'l1_ratio': 0.5, 'n_alphas': 100, 'max_iter': 1000, 'eps': 0.001, 'precompute': 'auto', 'random_state': None, 'tol': 0.0001, 'positive': False, 'copy_X': True, 'cv': None}
Elasticnet CV mean absolute error:  1465.1094749
Elasticnet CV mean squared error: 5189465.42734


========== Ada Boost Regressor ==========
{'n_estimators': 100, 'loss': 'linear', 'base_estimator': None, 'random_state': None, 'learning_rate': 1.0}
Ada Boost Regressor mean absolute error:  7860.36032076
Ada Boost Regressor mean squared error: 66266680.0641


========== Random Forest Regressor ==========
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'auto', 'max_depth': None}
Random Forest Regressor mean absolute error:  1235.31381132
Random Forest Regressor mean squared error: 3687788.16427


========== Linear SV Regressor ==========
{'loss': 'epsilon_insensitive', 'C': 1.0, 'intercept_scaling': 1.0, 'fit_intercept': True, 'epsilon': 0.0, 'max_iter': 1000, 'random_state': None, 'dual': True, 'tol': 0.0001, 'verbose': 2}
[LibLinear]Linear SV Regressor mean absolute error:  1237.10046951
Linear SV Regressor mean squared error: 4500341.98056


In [88]:
# model evaluation with ensemble feature selection and noise removal
for reg_name, reg in regressors.iteritems():
    model(reg, X_combined_train, y_combined_train, X_combined_test, y_combined_test, reg_name)
========== Random Forest Regressor ==========
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'auto', 'max_depth': None}
Random Forest Regressor mean absolute error:  1320.43051189
Random Forest Regressor mean squared error: 4032550.55088


========== Gradient Boost Regressor ==========
{'presort': 'auto', 'loss': 'ls', 'verbose': 0, 'subsample': 1.0, 'max_leaf_nodes': None, 'learning_rate': 0.1, 'warm_start': False, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'init': None, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'max_features': None, 'alpha': 0.9, 'max_depth': 3}
Gradient Boost Regressor mean absolute error:  1293.03732003
Gradient Boost Regressor mean squared error: 3915410.35029


========== Lasso LARs Regressor ==========
{'normalize': True, 'fit_path': True, 'fit_intercept': True, 'positive': False, 'max_iter': 500, 'eps': 2.2204460492503131e-16, 'precompute': 'auto', 'copy_X': True, 'alpha': 0.1, 'verbose': 2}
.Lasso LARs Regressor mean absolute error:  1396.62429968
Lasso LARs Regressor mean squared error: 4690829.68601


========== Linear SV Regressor ==========
{'loss': 'epsilon_insensitive', 'C': 1.0, 'intercept_scaling': 1.0, 'fit_intercept': True, 'epsilon': 0.0, 'max_iter': 1000, 'random_state': None, 'dual': True, 'tol': 0.0001, 'verbose': 2}
[LibLinear]
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 6 iterations, i.e. alpha=1.197e+00, with an active set of 6 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 10 iterations, i.e. alpha=9.379e-01, with an active set of 10 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 10 iterations, i.e. alpha=9.379e-01, with an active set of 10 regressors, and the smallest cholesky pivot element being 1.054e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 10 iterations, i.e. alpha=9.258e-01, with an active set of 10 regressors, and the smallest cholesky pivot element being 1.054e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 17 iterations, i.e. alpha=5.366e-01, with an active set of 17 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 20 iterations, i.e. alpha=4.611e-01, with an active set of 20 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 22 iterations, i.e. alpha=4.267e-01, with an active set of 22 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:334: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 33 iterations, alpha=3.119e-01, previous alpha=2.776e-01, with an active set of 32 regressors.
  ConvergenceWarning)
Linear SV Regressor mean absolute error:  1325.83767519
Linear SV Regressor mean squared error: 4948176.8828


========== RidgeCV ==========
{'normalize': False, 'alphas': (0.1, 1.0, 10.0), 'fit_intercept': True, 'gcv_mode': None, 'store_cv_values': False, 'scoring': None, 'cv': None}
RidgeCV mean absolute error:  1379.1189797
RidgeCV mean squared error: 4435512.65041


========== Extra Trees Regressor ==========
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': False, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'auto', 'max_depth': None}
Extra Trees Regressor mean absolute error:  1352.04235686
Extra Trees Regressor mean squared error: 4287780.43327


========== Elasticnet CV ==========
{'normalize': False, 'alphas': None, 'n_jobs': 1, 'verbose': 0, 'fit_intercept': True, 'selection': 'cyclic', 'l1_ratio': 0.5, 'n_alphas': 100, 'max_iter': 1000, 'eps': 0.001, 'precompute': 'auto', 'random_state': None, 'tol': 0.0001, 'positive': False, 'copy_X': True, 'cv': None}
Elasticnet CV mean absolute error:  1563.71336392
Elasticnet CV mean squared error: 5725634.04226


In [107]:
# model evaluation with ensemble selection and noise removal
for reg_name, reg in regressors.iteritems():
    model(reg, X_combined_train, y_combined_train, X_combined_test, y_combined_test, reg_name)
========== Random Forest Regressor ==========
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'auto', 'max_depth': None}
Random Forest Regressor mean absolute error:  1237.95517958
Random Forest Regressor mean squared error: 3710058.89982


========== Gradient Boost Regressor ==========
{'presort': 'auto', 'loss': 'ls', 'verbose': 0, 'subsample': 1.0, 'max_leaf_nodes': None, 'learning_rate': 0.1, 'warm_start': False, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'init': None, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'max_features': None, 'alpha': 0.9, 'max_depth': 3}
Gradient Boost Regressor mean absolute error:  1247.60742176
Gradient Boost Regressor mean squared error: 3739621.62207


========== Lasso LARs Regressor ==========
{'normalize': True, 'fit_path': True, 'fit_intercept': True, 'positive': False, 'max_iter': 500, 'eps': 2.2204460492503131e-16, 'precompute': 'auto', 'copy_X': True, 'alpha': 0.1, 'verbose': 2}
.
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 4 iterations, i.e. alpha=1.836e+00, with an active set of 4 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 9 iterations, i.e. alpha=9.776e-01, with an active set of 9 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 12 iterations, i.e. alpha=8.780e-01, with an active set of 12 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 21 iterations, i.e. alpha=6.352e-01, with an active set of 19 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 21 iterations, i.e. alpha=6.352e-01, with an active set of 19 regressors, and the smallest cholesky pivot element being 1.054e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 21 iterations, i.e. alpha=6.352e-01, with an active set of 19 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 24 iterations, i.e. alpha=5.306e-01, with an active set of 22 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 31 iterations, i.e. alpha=4.679e-01, with an active set of 29 regressors, and the smallest cholesky pivot element being 1.490e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 31 iterations, i.e. alpha=4.679e-01, with an active set of 29 regressors, and the smallest cholesky pivot element being 1.054e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 31 iterations, i.e. alpha=4.679e-01, with an active set of 29 regressors, and the smallest cholesky pivot element being 4.470e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 35 iterations, i.e. alpha=4.258e-01, with an active set of 33 regressors, and the smallest cholesky pivot element being 1.825e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 39 iterations, i.e. alpha=3.951e-01, with an active set of 37 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 39 iterations, i.e. alpha=3.951e-01, with an active set of 37 regressors, and the smallest cholesky pivot element being 1.054e-08
  ConvergenceWarning)
/Users/cvq595/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:334: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 52 iterations, alpha=3.238e-01, previous alpha=3.235e-01, with an active set of 49 regressors.
  ConvergenceWarning)
Lasso LARs Regressor mean absolute error:  1371.22405228
Lasso LARs Regressor mean squared error: 4585568.9694


========== Linear SV Regressor ==========
{'loss': 'epsilon_insensitive', 'C': 1.0, 'intercept_scaling': 1.0, 'fit_intercept': True, 'epsilon': 0.0, 'max_iter': 1000, 'random_state': None, 'dual': True, 'tol': 0.0001, 'verbose': 2}
[LibLinear]Linear SV Regressor mean absolute error:  1237.16855718
Linear SV Regressor mean squared error: 4507605.31531


========== RidgeCV ==========
{'normalize': False, 'alphas': (0.1, 1.0, 10.0), 'fit_intercept': True, 'gcv_mode': None, 'store_cv_values': False, 'scoring': None, 'cv': None}
RidgeCV mean absolute error:  1298.46614578
RidgeCV mean squared error: 4056312.50153


========== Extra Trees Regressor ==========
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': False, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'auto', 'max_depth': None}
Extra Trees Regressor mean absolute error:  1269.27422576
Extra Trees Regressor mean squared error: 3989375.62829


========== Elasticnet CV ==========
{'normalize': False, 'alphas': None, 'n_jobs': 1, 'verbose': 0, 'fit_intercept': True, 'selection': 'cyclic', 'l1_ratio': 0.5, 'n_alphas': 100, 'max_iter': 1000, 'eps': 0.001, 'precompute': 'auto', 'random_state': None, 'tol': 0.0001, 'positive': False, 'copy_X': True, 'cv': None}
Elasticnet CV mean absolute error:  1479.52327971
Elasticnet CV mean squared error: 5266326.03266