#Using the UCI Student Performance Dataset
#Trying to predict G3 Score
# Load the student data; the file is semicolon-delimited, hence `sep`.
df <- read.csv(file = "student-mat.csv", sep = ";")

# First look at the data: leading rows, per-column summary statistics,
# and the type of every column.
head(df)
summary(df)
str(df)

# TRUE when at least one cell of the data frame is missing.
any(is.na(df))
#EDA
library(ggplot2)
library(ggthemes)
library(dplyr)
#Correlation
# Flag the numeric columns. vapply() always yields a logical vector,
# whereas the return type of sapply() depends on its input.
num.cols <- vapply(df, is.numeric, logical(1))
# Pairwise correlations between the numeric columns. drop = FALSE keeps the
# selection a data frame even when only one column is numeric, so cor()
# still receives matrix-like input instead of a bare vector.
cor.data <- cor(df[, num.cols, drop = FALSE])
cor.data
#Visualize correlations
#install.packages('corrgram',repos = 'http://cran.us.r-project.org')
#install.packages('corrplot',repos = 'http://cran.us.r-project.org')
#help(corrplot)
# corrplot() and corrgram() (together with the panel.* helpers used below)
# come from add-on packages, so attach them before the calls; without this
# the script stops with "could not find function". Install the packages
# with install.packages() first if they are missing.
library(corrplot)
library(corrgram)

# Colour-coded plot of the numeric correlation matrix.
corrplot(cor.data, method = "color")

# Correlogram of the data frame: shaded cells in the lower panel, pies in
# the upper panel, panel.txt as the text panel.
corrgram(df, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt)

# Distribution of the target variable G3.
ggplot(df, aes(x = G3)) +
  geom_histogram(bins = 20, alpha = 0.5, fill = "blue") +
  theme_minimal()
#The general form of a linear model call in R looks like the following
# model <- lm(y ~ x1 + x2,data)
# or to use all the features in your data
# model <- lm(y ~. , data) # Uses all features
#Train and Test Data
# Import Library
library(caTools)
# Fix the random seed so the "random" split below is reproducible.
set.seed(101)
# sample.split() returns a logical vector with one entry per row; about
# SplitRatio (70%) of the entries are TRUE. df$age is passed as the label
# vector the split is based on.
sample <- sample.split(df$age, SplitRatio = 0.70)
# Rows flagged TRUE form the training data ...
train <- df[sample, ]
# ... and the remaining rows form the testing data.
test <- df[!sample, ]
# Fit a linear regression of G3 on every other column of the training data,
# then print the coefficient table and fit statistics.
model <- lm(formula = G3 ~ ., data = train)
summary(model)
# Inspect the fitted model through its residuals.
# Store the residuals in a one-column data frame (column `res`), the input
# format ggplot() works with.
res <- data.frame(res = residuals(model))
head(res)
# Histogram of the residuals.
ggplot(res, aes(x = res)) +
  geom_histogram(alpha = 0.5, fill = "blue")
# Base R diagnostic plots for the lm fit.
plot(model)
# Predict G3 for the testing data and pair every prediction (`pred`) with
# the observed value (`real`) in one data frame.
G3.predictions <- predict(model, newdata = test)
results <- as.data.frame(cbind(pred = G3.predictions, real = test$G3))
# Dealing with negative predictions: clamp negative values to zero.
#
# x: numeric vector of any length (a single value works as before).
# Returns `x` with every negative element replaced by 0; non-negative
# elements are returned unchanged and NA stays NA.
#
# pmax() is used instead of a scalar `if (x < 0)` so the function also
# accepts whole vectors and missing values, which the scalar test cannot
# handle.
to_zero <- function(x) {
  pmax(x, 0)
}
# Pass every prediction through to_zero(), one value at a time.
results$pred <- vapply(results$pred, to_zero, numeric(1))
# Mean squared error between observed and predicted values.
mse <- with(results, mean((real - pred)^2))
print(mse)
# Root mean squared error.
mse^0.5
# Metrics: R-squared of the predictions stored in `results`.
# SSE: sum of squared prediction errors.
SSE <- sum((results$pred - results$real)^2)
# SST: total sum of squares of the observed values around their own mean.
# Centring on the mean of the evaluated observations (rather than the mean
# of the whole data set) keeps SST from being inflated, which would
# overstate R2.
SST <- sum((results$real - mean(results$real))^2)
R2 <- 1 - SSE / SST
R2