Monroe Week 9 Homework, R

Monroe Week 9 Homework

### Step 1: Load the data 
#Load your libraries
library("kernlab")
library("ggplot2")
library("e1071")
library("gridExtra")

# Let's go back and analyze the air quality dataset (used previously in the
# visualization lab). The NAs are handled by mean imputation: each missing
# Ozone / Solar.R reading is replaced with that column's mean, rounded to the
# nearest whole number.
myairquality <- airquality
myairquality[c("Ozone", "Solar.R")] <- lapply(
  airquality[c("Ozone", "Solar.R")],
  function(readings) {
    readings[is.na(readings)] <- round(mean(readings, na.rm = TRUE))
    readings
  }
)
myairquality

### Step 2: Create train and test data sets one for training and one for testing.
## The training partition receives 75% of the rows (rounded down);
## every remaining row goes to the test partition.
total_rows <- nrow(myairquality)
smp_size <- floor(0.75 * total_rows)

## Fix the random seed so the partition can be reproduced exactly
set.seed(123)
train_ind <- sample.int(total_rows, size = smp_size)

train <- myairquality[train_ind, ]
test <- myairquality[-train_ind, ]
test

### Step 3: Build a Model using KSVM & visualize the results 
# Build a model (using the ksvm function, trying to predict ozone). You can use all the possible attributes, or select the attributes that you think would be the most helpful.
# The model is fitted on the training partition only, so the held-out `test`
# rows created in Step 2 stay unseen and can give an honest error estimate.
modelKSVM <- ksvm(Ozone ~ ., data = train)
modelKSVM

# Tabulate a model's predictions against the observed Ozone values and report
# the percentage that agree.
#
# a:            fitted model; anything `predict()` accepts.
# myairquality: data frame to predict on; must contain an `Ozone` column.
#
# Prints the predicted-vs-actual table and returns the percent agreement,
# rounded to the nearest whole number. Only the top-left 2x2 corner of the
# table enters the calculation, and the call stops with a subscript error if
# the table has fewer than two rows or columns.
# NOTE(review): a regression model produces (nearly) unique predictions, so the
# table has many rows and the figure is not a meaningful accuracy -- confirm
# that a two-class outcome is what callers intend.
predictOzone <- function(a, myairquality) {
  predictedOzone <- predict(a, myairquality)
  results1 <- table(predictedOzone, myairquality$Ozone)
  print(results1)
  agreeing <- results1[1, 1] + results1[2, 2]
  compared <- agreeing + results1[1, 2] + results1[2, 1]
  percentCorrect1 <- agreeing / compared * 100
  # The rounded value used to be computed and thrown away; return it instead.
  round(percentCorrect1)
}
# Print the predicted-vs-actual table for the KSVM model and display the
# percent-correct figure that predictOzone() returns.
predictOzone(modelKSVM, myairquality)

# 1) Test the model on the testing dataset. 2) Compute the Root Mean Squared Error.
# 3) Plot the results. Use a scatter plot. Have the x-axis represent temperature and the y-axis represent wind; the point size and color represent the error (defined as the actual ozone level minus the predicted ozone level).
# Root mean squared error of a vector of residuals.
#
# error: numeric residuals (actual minus predicted).
#
# Returns the square root of the mean of the squared residuals. Missing values
# are not removed, so any NA in `error` makes the result NA.
root_square <- function(error) {
  squared_error <- error^2
  mean_squared_error <- mean(squared_error)
  sqrt(mean_squared_error)
}

# Score the KSVM model on the held-out test rows, as the instructions above
# ask, rather than on the full data set. The error is the actual ozone level
# minus the predicted one, summarised as the root mean squared error.
modelKSVM.first <- predict(modelKSVM, test)
modelKSVM.error <- (test$Ozone - modelKSVM.first)
root_square(modelKSVM.error)

# Compute models and plot the results for svm (in the e1071 package) and lm. Generate similar charts for each model

# Show all three results (charts) in one window, using the grid.arrange function

### Step 4: Create a goodOzone variable 
# This variable should be either 0 or 1. It should be 0 if the ozone is below the average for all the data observations, and 1 if it is equal to or above the average ozone observed. 
# Computed in one vectorized pass: the mean is evaluated once and every row is
# compared against it, so no row-by-row loop is needed. The new column is
# numeric (0 or 1), exactly as the loop version produced.
myairquality$goodOzone <- ifelse(
  myairquality$Ozone < mean(myairquality$Ozone), 0, 1
)

# Compare a model's predictions with the observed goodOzone flags and report
# the percentage that agree.
#
# m:            fitted model; anything `predict()` accepts.
# myairquality: data frame to predict on; must contain a `goodOzone` column.
#
# Prints the predicted-vs-actual table and returns the percent agreement,
# rounded to the nearest whole number. Only the top-left 2x2 corner of the
# table enters the calculation, and the call stops with a subscript error if
# the table has fewer than two rows or columns (for example when the model
# predicts a single class for every row).
predictGoodozone <- function(m, myairquality) {
  predictedGoodozone <- predict(m, myairquality)
  results1 <- table(predictedGoodozone, myairquality$goodOzone)
  print(results1)
  agreeing <- results1[1, 1] + results1[2, 2]
  compared <- agreeing + results1[1, 2] + results1[2, 1]
  percentCorrect1 <- agreeing / compared * 100
  # The rounded value used to be computed and thrown away; return it instead.
  round(percentCorrect1)
}

# goodOzone is stored as numeric 0/1, and ksvm() defaults to regression
# (eps-svr) when the response is not a factor. Request C-svc explicitly so the
# model predicts the two classes and the 2x2 table built by predictGoodozone()
# is meaningful.
# NOTE(review): `.` also includes Ozone, from which goodOzone is derived, so
# the task is trivially easy -- confirm whether Ozone should be excluded.
modelKSVM1 <- ksvm(goodOzone ~ ., data = myairquality, type = "C-svc")
modelKSVM1

predictGoodozone(modelKSVM1, myairquality)

### Step 5: See if we can do a better job predicting good and bad days 
# Build a model (using the ksvm function, trying to predict goodOzone). You can use all the possible attributes, or select the attributes that you think would be the most helpful.

# Refit the KSVM Ozone regression on the full data set. By this point
# myairquality also carries goodOzone, which is derived from Ozone itself, so
# it is removed from the predictors; leaving it in would leak the target into
# the model.
modelKSVM <- ksvm(Ozone ~ . - goodOzone, data = myairquality)
predictOzone(modelKSVM, myairquality)
# Error = actual ozone minus predicted ozone, summarised as RMSE.
modelKSVM.first <- predict(modelKSVM, myairquality)
modelKSVM.error <- (myairquality$Ozone - modelKSVM.first)
root_square(modelKSVM.error)

# Fit the e1071 SVM regression for Ozone on the full data set. goodOzone is
# derived from Ozone, so it is excluded from the predictors to avoid leaking
# the target into the model.
modelSVM <- svm(Ozone ~ . - goodOzone, data = myairquality)
predictOzone(modelSVM, myairquality)
# Error = actual ozone minus predicted ozone, summarised as RMSE.
modelSVM.first <- predict(modelSVM, myairquality)
modelSVM.error <- (myairquality$Ozone - modelSVM.first)
root_square(modelSVM.error)

# Scatter plot of the KSVM errors: temperature on the x-axis, wind on the
# y-axis, with both point size and color driven by the error (actual minus
# predicted ozone). aes() refers to columns of dfnew rather than to outside
# vectors, so the plot is built entirely from its own data frame.
dfnew <- data.frame(
  Wind = myairquality$Wind,
  Temp = myairquality$Temp,
  Error = as.vector(modelKSVM.error)  # predict() may hand back a 1-column matrix
)
plotdf <- ggplot(data = dfnew, aes(x = Temp, y = Wind)) +
  geom_point(aes(size = Error, color = Error)) +
  ggtitle("KSVM Model")
plotdf

# Scatter plot of the SVM errors: temperature on the x-axis, wind on the
# y-axis, with both point size and color driven by the error (actual minus
# predicted ozone). The chart is built from modelSVM.error; it previously
# reused the KSVM errors, so the "SVM Model" panel duplicated the KSVM one.
dfnew1 <- data.frame(
  Wind = myairquality$Wind,
  Temp = myairquality$Temp,
  Error = as.vector(modelSVM.error)
)
plotdf1 <- ggplot(data = dfnew1, aes(x = Temp, y = Wind)) +
  geom_point(aes(size = Error, color = Error)) +
  ggtitle("SVM Model")
plotdf1

# Fit the linear regression for Ozone on the full data set. goodOzone is
# derived from Ozone, so it is excluded from the predictors to avoid leaking
# the target into the model.
modelLM <- lm(Ozone ~ . - goodOzone, data = myairquality)
modelLM
# Error = actual ozone minus predicted ozone, summarised as RMSE.
modelLM.first <- predict(modelLM, myairquality)
modelLM.error <- (myairquality$Ozone - modelLM.first)
root_square(modelLM.error)

# Scatter plot of the linear-model errors: temperature on the x-axis, wind on
# the y-axis, with both point size and color driven by the error (actual minus
# predicted ozone). All columns come from myairquality (the data the model was
# scored on) instead of a mix of airquality and myairquality.
dfnew2 <- data.frame(
  Wind = myairquality$Wind,
  Temp = myairquality$Temp,
  Error = as.vector(modelLM.error)
)
plotdf2 <- ggplot(data = dfnew2, aes(x = Temp, y = Wind)) +
  geom_point(aes(size = Error, color = Error)) +
  ggtitle("LM Model")
plotdf2

# Show the KSVM, SVM and LM charts together in one window, laid out in two columns.
grid.arrange(plotdf,plotdf1, plotdf2, ncol = 2)

# Test the model on the testing dataset, and compute the percent of goodOzone that was correctly predicted.

# Plot the results. Use a scatter plot. Have the x-axis represent temperature, the y-axis represent wind, the shape representing what was predicted (good or bad day), the color representing the actual value of goodOzone (i.e. if the actual ozone level was good) and the size represent if the prediction was correct (larger symbols should be the observations the model got wrong).

# Compute models and plot the results for svm (in the e1071 package) and nb (Naive Bayes, also in the e1071 package).

# Show all three results (charts) in one window, using the grid.arrange function (have two charts in one row).


run \| edit \| history \| help	0

λ

                                      .NET NoSQL database for rapid development