#Using the UCI Student Performance Dataset
#Trying to predict G3 Score
# Load the student data; the file is semicolon-delimited, so `sep` has to be
# given explicitly.
df <- read.csv("student-mat.csv", sep = ";")

# Quick look at the data: first rows, per-column summary, column types.
head(df)
summary(df)
str(df)

# TRUE if any value in the data frame is missing (NA).
anyNA(df)

#EDA
library(ggplot2)
library(ggthemes)
library(dplyr)

# Correlation ----
# Flag which columns of df are numeric (named logical vector, one entry per
# column).
num.cols <- vapply(df, is.numeric, logical(1))

# Pairwise correlation matrix of the numeric columns only.
cor.data <- cor(df[, num.cols])

# Print the matrix.
cor.data

# Visualize correlations ----
# corrplot() and corrgram() come from add-on packages that must be attached
# before they can be called. If the packages are not installed yet, run once:
#   install.packages(c("corrgram", "corrplot"),
#                    repos = "http://cran.us.r-project.org")
# help(corrplot)
library(corrplot)
library(corrgram)

# Colour-coded plot of the numeric correlation matrix.
corrplot(cor.data, method = "color")

# Correlogram drawn from the data frame itself: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal.
corrgram(df, order = TRUE, lower.panel = panel.shade,
  upper.panel = panel.pie, text.panel = panel.txt)

# Distribution of the G3 score as a 20-bin histogram.
ggplot(data = df, mapping = aes(x = G3)) +
  geom_histogram(fill = "blue", alpha = 0.5, bins = 20) +
  theme_minimal()

# The general form of a linear model call in R looks like this:
# model <- lm(y ~ x1 + x2, data)
# or, to use every other column of the data as a predictor:
# model <- lm(y ~ ., data) # uses all features

# Train and test data ----
# caTools provides sample.split().
library(caTools)

# Fix the random seed so the "random" split is the same as in this notebook.
set.seed(101)

# Logical vector with one entry per row of df; SplitRatio is the share of
# entries set to TRUE.
# NOTE(review): the split is driven by df$age rather than the target G3 --
# confirm this is intended.
sample <- sample.split(df$age, SplitRatio = 0.70)

# Rows flagged TRUE form the training set ...
train <- df[sample, ]

# ... and the remaining rows form the test set.
test <- df[!sample, ]

# Training the model ----
# Regress G3 on every other column of the training set.
model <- lm(formula = G3 ~ ., data = train)

# Coefficient table and fit statistics.
summary(model)

# Visualize model ----
# Residuals of the fitted model, held in a one-column data frame (column
# `res`) so they can be passed to ggplot().
res <- data.frame(res = residuals(model))
head(res)

# Histogram of the residuals.
ggplot(data = res, mapping = aes(x = res)) +
  geom_histogram(alpha = 0.5, fill = "blue")

# Base R diagnostic plots for the fitted lm object.
plot(model)

# Predictions ----
# Predicted G3 for the rows of the test set.
G3.predictions <- predict(model, newdata = test)

# Predicted and observed G3 side by side, one row per test observation.
# `real` is stored as double so both columns share one numeric type.
results <- data.frame(
  pred = G3.predictions,
  real = as.numeric(test$G3)
)

# Dealing with negative predictions ----
# Clamp negative values to zero.
#
# x: a numeric value or vector.
# Returns `x` with every negative element replaced by 0; non-negative
# elements and NA are passed through unchanged.
#
# pmax() works element-wise, so the function accepts whole vectors and does
# not stop on NA, which a scalar `if (x < 0)` test would.
to_zero <- function(x) {
  pmax(x, 0)
}

# Replace negative predictions with zero. pmax() is vectorised, so the whole
# column is clamped in one call and NA values do not raise an error.
results$pred <- pmax(results$pred, 0)

# Mean squared error of the clamped predictions against the observed values.
mse <- mean((results$real - results$pred)^2)
print(mse)

# Root mean squared error.
sqrt(mse)

# Metrics ----
# Sum of squared errors of the predictions.
SSE <- sum((results$pred - results$real)^2)

# Total sum of squares of the observed values around their own mean. The
# baseline mean is taken from the same observations that are being scored,
# not from the full data set.
SST <- sum((results$real - mean(results$real))^2)

# Coefficient of determination for the scored observations.
R2 <- 1 - SSE / SST
R2