We use the following packages:
library(class)    # knn()
library(mvtnorm)  # rmvnorm() for multivariate normal data
library(dplyr)    # data manipulation verbs
library(magrittr) # pipes, including %<>%
library(ggplot2)  # plotting
First we create the data:
set.seed(123)
sigma <- matrix(c(1, .5, .5, 1), 2, 2)
sim.data <- rmvnorm(n = 100,
                    mean = c(5, 5),
                    sigma = sigma)
colnames(sim.data) <- c("x1", "x2")
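As a quick sanity check (my own addition, not part of the original practical), the column means and the empirical covariance of the simulated data should be close to the mean vector and sigma we specified:
# compare the sample moments with the simulation parameters
colMeans(sim.data)
cov(sim.data)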
Next, we randomly assign each observation to one of three classes (A, B, C):
sim.data <-
  sim.data %>%
  as_tibble() %>%
  mutate(class = sample(c("A", "B", "C"), size = 100, replace = TRUE))
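Because the labels are assigned completely at random, the three classes should be roughly equally large; a quick check (my own addition):
# frequency of each randomly assigned class label
table(sim.data$class)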
Next, we apply two different adjustments to the data: one that shifts the class means apart by .5 and one that shifts them apart by 2.5:
sim.data.half <-
  sim.data %>%
  mutate(x2 = case_when(class == "A" ~ x2 + .5,
                        class == "B" ~ x2 - .5,
                        class == "C" ~ x2 + .5),
         x1 = case_when(class == "A" ~ x1 - .5,
                        class == "B" ~ x1 - 0,
                        class == "C" ~ x1 + .5))
sim.data.twohalf <-
  sim.data %>%
  mutate(x2 = case_when(class == "A" ~ x2 + 2.5,
                        class == "B" ~ x2 - 2.5,
                        class == "C" ~ x2 + 2.5),
         x1 = case_when(class == "A" ~ x1 - 2.5,
                        class == "B" ~ x1 - 0,
                        class == "C" ~ x1 + 2.5))
We then split each data set into a Train (25%) and a Test (75%) part:
set.seed(123)
sim.data.half %<>%
  mutate(set = sample(c("Train", "Test"), size = 100, prob = c(.25, .75), replace = TRUE))
sim.data.twohalf %<>%
  mutate(set = sample(c("Train", "Test"), size = 100, prob = c(.25, .75), replace = TRUE))
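Note that %<>% is the magrittr compound-assignment pipe: it pipes the object into the chain and assigns the result back to it. Because the split is random, the realised Train/Test counts only roughly follow the 25/75 probabilities; a quick check (my own addition):
# realised split sizes for both data sets
table(sim.data.half$set)
table(sim.data.twohalf$set)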
Next, we fit a K-NN model to both data sets, using k = 3. For the first model, with the .5 adjustment:
# create training and test sets
train.half <- subset(sim.data.half, set == "Train", select = c(x1, x2))
class.half <- subset(sim.data.half, set == "Train", select = class)
test.half <- subset(sim.data.half, set == "Test", select = c(x1, x2))
# run k-nn model
fit.knn.half <- knn(train = train.half,
                    test = test.half,
                    cl = as.matrix(class.half),
                    k = 3)
Then for the model with the 2.5 adjustment:
# create training and test
train.twohalf <- subset(sim.data.twohalf, set == "Train", select = c(x1, x2))
class.twohalf <- subset(sim.data.twohalf, set == "Train", select = class)
test.twohalf <- subset(sim.data.twohalf, set == "Test", select = c(x1, x2))
# run k-nn model
fit.knn.twohalf <- knn(train = train.twohalf,
                       test = test.twohalf,
                       cl = as.matrix(class.twohalf),
                       k = 3)
Now we assess how well each model classifies the test data, first for the .5 adjustment data:
class.test.half <- subset(sim.data.half, set == "Test", select = class) %>%
  as.matrix()
correct.half <- fit.knn.half == class.test.half
mean(correct.half)
## [1] 0.5205479
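A single proportion correct does not show which classes get confused with which. A confusion-table sketch for the .5 adjustment model (my own addition; the predicted/observed labels are just naming choices):
# cross-tabulate predicted against observed test-set classes
table(predicted = fit.knn.half, observed = as.vector(class.test.half))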
Then, for the 2.5 adjustment data:
class.test.twohalf <- subset(sim.data.twohalf, set == "Test", select = class) %>%
  as.matrix()
correct.twohalf <- fit.knn.twohalf == class.test.twohalf
mean(correct.twohalf)
## [1] 0.9875
The model based on the 2.5 adjustment data performs much better. But in that data set the classes are also far more separated.
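One way to make that separation concrete is to compare the class means in both data sets (my own addition, using the dplyr verbs loaded above):
# class centroids under each adjustment; the 2.5 version should be far more spread out
sim.data.half %>% group_by(class) %>% summarise(mean.x1 = mean(x1), mean.x2 = mean(x2))
sim.data.twohalf %>% group_by(class) %>% summarise(mean.x1 = mean(x1), mean.x2 = mean(x2))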
To see where the models go wrong, we plot the correct and incorrect classifications, first for the .5 adjustment data:
cbind(test.half, correct.half) %>%
  ggplot(aes(x1, x2, colour = correct.half)) +
  geom_point() +
  scale_colour_manual(values = c("red", "black")) +
  ggtitle("K-NN classification \n Adjustment = .5")
We can see many mistakes for this model, but then again, there is not much clustering in the data to detect.
For the 2.5 adjustment data this changes:
cbind(test.twohalf, correct.twohalf) %>%
  ggplot(aes(x1, x2, colour = correct.twohalf)) +
  geom_point() +
  scale_colour_manual(values = c("red", "black")) +
  ggtitle("K-NN classification \n Adjustment = 2.5")
The clusters are visibly separated. It is quite difficult to misclassify values based on their three closest neighbors, except for values that lie somewhat in between two (or more) clusters. Now we make only 1 misclassification, given that we have set.seed(123); different seeds may yield different data and, hence, different results. The misclassified value is indeed one of those in-between values.
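To check this, we can pull out the misclassified test observation together with its predicted and true class (a sketch of my own; the exact row depends on the seed and on the random tie-breaking in knn()):
# show the misclassified observation(s) in the 2.5 adjustment test set
cbind(test.twohalf,
      predicted = fit.knn.twohalf,
      observed = as.vector(class.test.twohalf),
      correct = correct.twohalf) %>%
  filter(!correct)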
Next, we write a function that optimizes k with respect to the classification error. The function should return the following: the optimal k (i.e. the lowest k with the most correct predictions), the proportion of correct predictions for that k, and the proportion correct for every k that was evaluated.
Optimize.knn <- function(train.set, test.set, train.class, test.class, min = 1, max = NULL) {
  # by default, evaluate every k up to the number of training observations
  if (is.null(max)) {
    max <- nrow(train.set)
  }
  if (!is.matrix(train.class)) {
    train.class <- as.matrix(train.class)
  }
  output <- list() # object to store the predictions for every k
  for (i in min:max) {
    output[[i - min + 1]] <- knn(train = train.set,
                                 test = test.set,
                                 cl = train.class,
                                 k = i)
  }
  # proportion of correct test-set predictions for one set of predictions
  compare <- function(x) mean(x == test.class)
  correct <- data.frame(k = min:max,
                        p.correct = sapply(output, compare))
  # the lowest k that attains the maximum proportion correct
  optimum <- min(correct$k[correct$p.correct == max(correct$p.correct)])
  result <- list(optimum.k = optimum,
                 max.p.correct = max(correct$p.correct),
                 results = correct)
  return(result)
}
We run the function once for the data set based on the .5 adjustment and once for the data set based on the 2.5 adjustment. Does the previously used k = 3 yield the optimal classification prediction?
Optimize.knn(train.half, test.half, class.half, class.test.half)
## $optimum.k
## [1] 1
##
## $max.p.correct
## [1] 0.5616438
##
## $results
## k p.correct
## 1 1 0.5616438
## 2 2 0.5479452
## 3 3 0.5342466
## 4 4 0.5342466
## 5 5 0.5616438
## 6 6 0.4794521
## 7 7 0.4520548
## 8 8 0.4520548
## 9 9 0.4109589
## 10 10 0.3835616
## 11 11 0.3835616
## 12 12 0.3561644
## 13 13 0.3698630
## 14 14 0.3698630
## 15 15 0.3698630
## 16 16 0.3835616
## 17 17 0.3698630
## 18 18 0.3972603
## 19 19 0.3698630
## 20 20 0.3150685
## 21 21 0.3150685
## 22 22 0.3013699
## 23 23 0.2876712
## 24 24 0.2876712
## 25 25 0.2876712
## 26 26 0.2876712
## 27 27 0.2876712
Optimize.knn(train.twohalf, test.twohalf, class.twohalf, class.test.twohalf)
## $optimum.k
## [1] 2
##
## $max.p.correct
## [1] 1
##
## $results
## k p.correct
## 1 1 0.9875
## 2 2 1.0000
## 3 3 0.9875
## 4 4 0.9750
## 5 5 0.9750
## 6 6 0.9750
## 7 7 0.9750
## 8 8 0.9125
## 9 9 0.8375
## 10 10 0.7250
## 11 11 0.6875
## 12 12 0.6750
## 13 13 0.6500
## 14 14 0.6500
## 15 15 0.6500
## 16 16 0.5875
## 17 17 0.5500
## 18 18 0.4125
## 19 19 0.4125
## 20 20 0.2875
We can also store the output of the function and inspect the results again. Note that this second run yields slightly different proportions than the call above, most likely because knn() breaks ties in the majority vote at random:
the2.5knn <- Optimize.knn(train.twohalf, test.twohalf, class.twohalf, class.test.twohalf)
the2.5knn$results
## k p.correct
## 1 1 0.9875
## 2 2 0.9875
## 3 3 0.9875
## 4 4 0.9875
## 5 5 0.9750
## 6 6 0.9750
## 7 7 0.9750
## 8 8 0.9125
## 9 9 0.8000
## 10 10 0.7125
## 11 11 0.7000
## 12 12 0.6500
## 13 13 0.6500
## 14 14 0.6500
## 15 15 0.6500
## 16 16 0.5875
## 17 17 0.5875
## 18 18 0.3875
## 19 19 0.4875
## 20 20 0.3750
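With the results stored, it is easy to visualise how the proportion correct changes as k grows (my own addition, not part of the original output):
# proportion of correct test-set predictions for each k
the2.5knn$results %>%
  ggplot(aes(x = k, y = p.correct)) +
  geom_line() +
  geom_point() +
  ggtitle("K-NN performance by k \n Adjustment = 2.5")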
End of Practical