# Decision tree demo ----
# One-time setup if the package is missing: install.packages("rpart")
library(rpart)

# Inspect the kyphosis example data (available once rpart is loaded).
str(kyphosis)

# Fit a classification tree: Kyphosis explained by every other column.
tree <- rpart(
  Kyphosis ~ .,
  method = "class",
  data = kyphosis
)

# Print the complexity-parameter table of the fitted tree.
printcp(tree)

# Base-graphics rendering: draw the branches first, then label the nodes.
plot(tree, uniform = TRUE, main = "Main Title")
text(tree, use.n = TRUE, all = TRUE)

# rpart.plot draws a cleaner picture of the same tree.
# One-time setup if the package is missing: install.packages("rpart.plot")
library(rpart.plot)
prp(tree)
# Random forest demo ----
# Same kyphosis data, this time modelled with a random forest.
library(randomForest)

model <- randomForest(
  Kyphosis ~ .,
  data = kyphosis
)

# Summary of the fitted forest.
print(model)

# Importance of each predictor.
importance(model)
# GitHub example ----
#Access at <https://github.com/RamVegiraju/KidneyDiseaseMLModel/blob/master/Kidney_Disease_Project.R>
# Objective: compare a single decision tree against a random forest to see
# how accurately kidney disease (ckd) is classified by each model.

# Data originates from the UCI ML Repository; read from the working directory.
df <- read.csv(file = "kidney_disease.csv")

# Check the format of the data.
head(df)
# Author's note: 93, 45, 47 levels for cell volume, white blood cell, and
# red blood cell counts.
str(df)

# Three numeric variables are not read in as numeric; convert each one,
# going through character so the printed values (not level codes) are kept.
df[["pcv"]] <- as.numeric(as.character(df[["pcv"]]))
str(df)
df[["wc"]] <- as.numeric(as.character(df[["wc"]]))
str(df)
df[["rc"]] <- as.numeric(as.character(df[["rc"]]))
str(df) # these three variables are now numeric

# Albumin and sugar are categorical but read as numeric; convert to factors.
df[["al"]] <- as.factor(as.numeric(df[["al"]]))
str(df)
df[["su"]] <- as.factor(as.numeric(df[["su"]]))
str(df)
# Missing values ----
# The conversions above leave NA values that have to be accounted for.
library(Amelia)
missmap(
  df,
  main = "Kidney Disease Missing Values",
  col = c("yellow", "black"),
  legend = FALSE
)
# Author's observation: large amounts of data are missing in the first few
# columns specifically, and about 20% of the other columns.

# Imputation using kNN: the data set is relatively small and not highly
# dimensional.
# Install each package only when it is absent, so re-running the script does
# not reinstall (or hit the network) every time.
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
if (!requireNamespace("missForest", quietly = TRUE)) {
  install.packages("missForest")
}
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(mice)
library(missForest)
library(VIM)

summary(df) # check the number of NA's in each variable

# Impute only the listed columns.
imputed_KNN <- kNN(
  df,
  variable = c(
    "age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo",
    "pcv", "wc", "rc"
  )
)
summary(imputed_KNN) # author's note: no NA's present in summary anymore
missmap(
  imputed_KNN,
  main = "Kidney Disease Missing Values",
  col = c("yellow", "black"),
  legend = FALSE
) # author's note: no missing values on the map anymore

# kNN adds indicator columns suffixed with _imp; keep only the original
# columns (id through classification) to get a clean set before EDA and
# splitting.
# NOTE(review): `id` is kept here, so the later `classification ~ .` formulas
# will use it as a predictor -- confirm that is intended.
train_data <- subset(imputed_KNN, select = id:classification)
summary(train_data)
# Exploratory data analysis ----
library(ggplot2)
library(ggplot2movies)
# Install DataExplorer only when it is absent, so re-running the script does
# not reinstall it every time.
if (!requireNamespace("DataExplorer", quietly = TRUE)) {
  install.packages("DataExplorer")
}
library(DataExplorer)

# DataExplorer overview of what the data looks like at this point.
plot_str(train_data)
plot_missing(train_data) # confirm no missing values

# Quantitative variables using ggplot2.
# Age versus blood pressure, coloured by class.
ggplot(train_data, aes(age, bp)) +
  geom_point(aes(color = classification))
# Age distribution by class (author's note: slightly positive relationship).
ggplot(train_data, aes(age)) +
  geom_histogram(aes(fill = classification), color = "black", bins = 50)

# Blood pressure distribution by class (author's note: strong relationship,
# split in random forest clear).
ggplot(train_data, aes(bp)) +
  geom_histogram(aes(fill = classification), color = "black", bins = 50)

# Specific gravity distribution by class (author's note: lower specific
# gravity leads to ckd / potential kidney disease).
ggplot(train_data, aes(sg)) +
  geom_histogram(aes(fill = classification), color = "black", bins = 50)

# DataExplorer report used for qualitative variables, QQ plots, and a general
# EDA overview.
create_report(train_data)
# Splitting data ----
library(caTools)

# Fix the RNG seed so the 70/30 split (and the accuracy figures derived from
# it) can be reproduced from run to run.
set.seed(123)

# Logical mask over rows: TRUE marks the training rows.
# NOTE: the name `sample` shadows base::sample as a variable; it is kept
# because other code may refer to it.
sample <- sample.split(train_data$classification, SplitRatio = 0.7)

# Index with the mask directly rather than via subset(), so `sample` can never
# be resolved against a data-frame column of the same name.
train <- train_data[sample, , drop = FALSE]
test <- train_data[!sample, , drop = FALSE]
# Single decision tree ----
# Fit one tree first, as the baseline to compare the random forest against.
library(rpart)
decision_tree <- rpart(
  classification ~ .,
  method = "class",
  data = train
)

# Predict on the held-out rows with the single tree.
decision_treepreds <- predict(decision_tree, newdata = test)

# Author's note: this returns three probabilities per row; they have to be
# turned into labels before they can be compared with test$classification in
# a confusion matrix.
head(decision_treepreds)
head(test)

# Make sure the predictions are a data frame before a label column is added.
decision_treepreds <- as.data.frame(decision_treepreds)
#' Turn ckd probabilities into class labels
#'
#' Vectorised: works on a single probability or a whole vector, so it can be
#' called directly on a column as well as through sapply()/vapply().
#'
#' @param x Numeric vector of probabilities for the "ckd" class.
#' @param threshold Cut-off; values strictly greater than it are labelled
#'   "ckd". Defaults to 0.5.
#' @return Character vector the same length as `x`: "ckd" where
#'   `x > threshold`, "notckd" otherwise, and NA where `x` is NA.
joiner <- function(x, threshold = 0.5) {
  ifelse(x > threshold, "ckd", "notckd")
}
# Label every row from its ckd probability; vapply() guarantees a character
# vector whatever the input length.
decision_treepreds$classification <- vapply(
  decision_treepreds$ckd,
  joiner,
  character(1)
)
head(decision_treepreds) # now has a new column identifying ckd or notckd

# Confusion matrix for the single decision tree (rows: predicted label,
# columns: actual label). The column is spelled out in full instead of relying
# on `$` partial matching of `class` to `classification`.
# Author's reported figure: 88% accuracy.
table(decision_treepreds$classification, test$classification)

# Plot the tree model.
library(rpart.plot)
prp(decision_tree)
# Random forest implementation ----
library(randomForest)

# Train the random forest on the training split, keeping importance measures.
rf.model <- randomForest(classification ~ ., data = train, importance = TRUE)

# Confusion matrix stored on the fitted model (author's reported figure: 96%
# accuracy rate).
rf.model$confusion

# Variable importance; author's note: use the mean decrease in Gini to
# identify the strongest factors -- age and sg are highly prevalent.
rf.model$importance

# Predict on the held-out rows and cross-tabulate against the true labels.
# The target column is `classification`; `test$Private` does not exist in this
# data, so the original call could not build the table.
randomf_pred <- predict(rf.model, test)
table(randomf_pred, test$classification)