#Working with Ecommerce Customers CSV File
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the e-commerce customer records into a DataFrame.
customers = pd.read_csv(filepath_or_buffer="Ecommerce Customers")

# First look at the data: leading rows, statistical summary, and
# per-column information (dtypes and non-null counts).
customers.head(n=5)
customers.describe()
customers.info()

# Exploratory data analysis
# Grid of pairwise plots across the columns of the data set.
sns.pairplot(data=customers)
# Scatter of yearly spend against membership length with a fitted
# linear-regression line.
sns.lmplot(
    data=customers,
    x='Length of Membership',
    y='Yearly Amount Spent',
)

# Training and testing data
# Response variable: the quantity the model is trained to predict.
y = customers['Yearly Amount Spent']
# Inspect the available column names to choose the predictors from.
# (`columns` is an attribute of the DataFrame, not a method.)
customers.columns
# Predictors: the feature columns fed to the model.
X = customers[['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Training the model
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Fit the linear model on the training split only.
lm.fit(X_train, y_train)

# Print the fitted coefficients (one per column of X, in column order).
# '\n' is a real line break so the array starts on its own line.
print('Coefficients: \n', lm.coef_)

# Predicting test data
predictions = lm.predict(X_test)
# Start a fresh figure so the scatter is not drawn onto the axes left
# behind by an earlier plot when the file runs as one script/cell.
plt.figure()
# Actual vs. predicted values: points near the diagonal indicate a good fit.
plt.scatter(y_test, predictions)
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')

# Evaluating the model
from sklearn import metrics

# The mean squared error is needed for both MSE and RMSE, so compute it once.
mse = metrics.mean_squared_error(y_test, predictions)

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

# Residuals
# Distribution of the prediction errors (actual minus predicted) on the test
# set. `sns.distplot` is deprecated, so use `sns.histplot` with a KDE overlay
# and density scaling to get the equivalent picture. A fresh figure keeps the
# histogram from being drawn over the previous plot.
plt.figure()
sns.histplot(y_test - predictions, bins=50, kde=True, stat='density');

# Conclusion
# Tabulate the fitted coefficients, one row per predictor column of X.
coeffecients = pd.DataFrame(lm.coef_, index=X.columns, columns=['Coeffecient'])
coeffecients
# Interpretation: holding the other predictors fixed, a one-unit increase in
# a predictor changes the predicted response by that predictor's coefficient.