This notebook estimates the violent crime rate per population from the Communities and Crime dataset (https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime) by fitting several linear regression models to multivariate data.
It is based on examples from scikit-learn and is licensed under the Apache 2.0 open-source license.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.pipeline
%matplotlib inline
# Communities and Crime Dataset
# https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime
# The data file is read without a header row, and '?' entries become NaN.
df_crime_dataset = pd.read_csv(
    'communities.data.txt',
    header=None,
    na_values='?',
)
# Column names come from a separate space-separated file (the headers taken
# from the community.names file); column 1 of that file supplies the names.
headers_df = pd.read_csv('data_headers.txt', header=None, sep=' ')
column_names = headers_df[1].tolist()
df_crime_dataset.columns = column_names
# Quick look at the table dimensions and summary statistics
# (these bare expressions only render as output inside a notebook cell).
df_crime_dataset.shape
df_crime_dataset.describe()
# Replace missing values with zero, then shuffle all rows and renumber the
# index from 0. No random_state is passed to sample(), so the row order
# (and therefore the train/test split taken from it) differs on every run.
df_crime_dataset = (
    df_crime_dataset
    .fillna(value=0)
    .sample(frac=1)
    .reset_index(drop=True)
)
# Scatter the target column against a few hand-picked features to get a
# visual impression of how each relates to the violent crime rate.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9 and
# the old name is deprecated.
sns.pairplot(
    df_crime_dataset,
    x_vars=['medIncome', 'FemalePctDiv', 'PolicPerPop'],
    y_vars='ViolentCrimesPerPop',
    height=7,
    aspect=0.7,
)
sns.pairplot(
    df_crime_dataset,
    x_vars=['PctHousNoPhone', 'NumInShelters', 'PopDens'],
    y_vars='ViolentCrimesPerPop',
    height=7,
    aspect=0.7,
)
# Create arrays of features X and labels y (crime rate).
# Features are every column from index 5 up to (not including) the last one;
# the last column is the label.
# NOTE(review): presumably columns 0-4 are the non-predictive identifier
# fields of the dataset -- confirm against the attribute list.
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
np_feature_set = df_crime_dataset.iloc[:, 5:-1].values
np_label_set = df_crime_dataset.iloc[:, -1].values

# Split data in train set and test set: the first 90% of the rows are used
# for training and the remaining rows for testing.
n_samples = np_feature_set.shape[0]
cutoff_train = int(n_samples * 0.9)
x_train, y_train = np_feature_set[:cutoff_train], np_label_set[:cutoff_train]
x_test, y_test = np_feature_set[cutoff_train:], np_label_set[cutoff_train:]
# training
# Regularisation strength passed as `alpha` to every model built below.
alpha = 0.0001


def my_trainer(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training split and print its test-set r^2.

    The model is fitted on ``x_train``/``y_train``, used to predict
    ``x_test``, and the r^2 score of those predictions against ``y_test``
    is printed after the model itself. Nothing is returned.
    """
    fitted = model.fit(x_train, y_train)
    predictions = fitted.predict(x_test)
    score = sklearn.metrics.r2_score(y_test, predictions)
    print(model)
    print("r^2 on test data : %f \n" % score)
# Build the three regularised linear models, all with the shared alpha.
lasso = sklearn.linear_model.Lasso(alpha=alpha)
elnet = sklearn.linear_model.ElasticNet(alpha=alpha, l1_ratio=0.7)
lasso_lars = sklearn.linear_model.LassoLars(alpha=alpha)

# Fit and score each one in turn: Lasso, ElasticNet, LassoLars.
for model in (lasso, elnet, lasso_lars):
    my_trainer(model, x_train, y_train, x_test, y_test)

# Polynomial preprocessing with ElasticNet (disabled experiment)
#poly_prep = sklearn.preprocessing.PolynomialFeatures(degree=3)
#poly_elnet_model = sklearn.pipeline.Pipeline([('poly', poly_prep),
# ('elnet', elnet)])
#my_trainer(poly_elnet_model, x_train, y_train, x_test, y_test)
# Overlay the fitted coefficient vectors of the three models on one chart so
# they can be compared feature by feature.
coefficient_curves = (
    (elnet.coef_, 'lightgreen', 'Elastic net coefficients'),
    (lasso.coef_, 'gold', 'Lasso coefficients'),
    (lasso_lars.coef_, 'red', 'Lasso Lars coefficients'),
)
for coefficients, line_color, line_label in coefficient_curves:
    plt.plot(coefficients, color=line_color, linewidth=2, label=line_label)
plt.legend(loc='best')
plt.show()