In-class worksheet 13

Feb 28, 2017

In this worksheet, we will use the libraries ggplot2, dplyr, and tidyr:

library(ggplot2)
theme_set(theme_bw(base_size=12)) # set default ggplot2 theme
library(dplyr)
library(tidyr)

1. Working with training and test data sets

We continue working with the biopsy data set:

# Load the breast-cancer biopsy data set from the class website.
# Each row is one biopsy; `outcome` is the column we will model.
biopsy <- read.csv("http://wilkelab.org/classes/SDS348/data_sets/biopsy.csv")

We will need this function from the last class, which calculates ROC curves:

# Compute an ROC curve (true-positive rate vs. false-positive rate) from
# predicted probabilities and the known class labels.
#
# probabilities: predicted probabilities of the positive class
# known_truth:   known outcomes; coerced to 0/1 via their factor levels
#                (second factor level is treated as the positive class)
# model.name:    optional label stored in a column, handy when plotting
#                several curves in one ggplot
#
# Returns a data frame with columns true_pos and false_pos (plus model.name
# when supplied), sorted so the curve draws cleanly with geom_line().
calc_ROC <- function(probabilities, known_truth, model.name=NULL)
{
  outcome <- as.numeric(factor(known_truth)) - 1
  pos <- sum(outcome)      # total number of known positives
  neg <- sum(1 - outcome)  # total number of known negatives
  pos_probs <- outcome * probabilities        # probabilities of known positives
  neg_probs <- (1 - outcome) * probabilities  # probabilities of known negatives
  # use every predicted probability in turn as the classification cutoff
  true_pos <- vapply(probabilities,
                     function(cutoff) sum(pos_probs >= cutoff) / pos,
                     numeric(1))  # true-positive rate at each cutoff
  false_pos <- vapply(probabilities,
                      function(cutoff) sum(neg_probs >= cutoff) / neg,
                      numeric(1)) # false-positive rate at each cutoff
  if (is.null(model.name)) {
    result <- data.frame(true_pos, false_pos)
  } else {
    result <- data.frame(true_pos, false_pos, model.name)
  }
  result %>% arrange(false_pos, true_pos)
}

The following code splits the biopsy data set into a random training and test set:

train_fraction <- 0.4 # fraction of the data reserved for training
set.seed(126)         # fix the seed so the random partition is reproducible
# number of observations that go into the training set
train_size <- floor(train_fraction * nrow(biopsy))
# randomly chosen row indices for the training set
train_indices <- sample(seq_len(nrow(biopsy)), size = train_size)

train_data <- biopsy[train_indices, ]  # training data
test_data <- biopsy[-train_indices, ]  # test data (all remaining rows)

Fit a logistic regression model on the training data set, then predict the outcome on the test data set, and plot the resulting ROC curves. Limit the x-axis range from 0 to 0.25 to zoom into the ROC curve.

# model to use: 
# outcome ~ clump_thickness + uniform_cell_size + uniform_cell_shape

# fit the logistic regression model on the training data only
glm.out.train <- glm(outcome ~ clump_thickness + uniform_cell_size + uniform_cell_shape,
                data=train_data,
                family=binomial)
# predicted probabilities for the held-out test data
test_pred <- predict(glm.out.train, test_data, type='response')
# ROC curve for the training data (in-sample fitted probabilities)
ROC.train <- calc_ROC(probabilities=glm.out.train$fitted.values, 
                 known_truth=train_data$outcome,
                 model.name="train")
# ROC curve for the test data (out-of-sample predictions)
ROC.test <- calc_ROC(probabilities=test_pred, 
                 known_truth=test_data$outcome,
                 model.name="test")
# stack both curves so ggplot can color them by model.name
ROCs <- rbind(ROC.train, ROC.test)
# xlim(0, 0.25) zooms into the upper-left corner of the ROC curve;
# points outside this range are dropped (hence the warning below)
ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) +
  geom_line() + xlim(0, 0.25)
## Warning: Removed 347 rows containing missing values (geom_path).

2. Area under the ROC curves

The following code (commented out) calculates the area under multiple ROC curves:

#ROCs %>% group_by(model.name) %>% 
#  mutate(delta=false_pos-lag(false_pos)) %>%
#  summarize(AUC=sum(delta*true_pos, na.rm=T)) %>%
#  arrange(desc(AUC))

Use this code to calculate the areas under the training and test ROC curves for the following model.

# model to use:
# outcome ~ clump_thickness

train_fraction <- 0.2 # fraction of data for training purposes
set.seed(101)  # set the seed to make your partition reproducible
n_obs <- nrow(biopsy) # number of observations in biopsy data set
train_size <- floor(train_fraction * n_obs) # number of observations in training set
train_indices <- sample(seq_len(n_obs), size = train_size)

train_data <- biopsy[train_indices, ] # get training data
test_data <- biopsy[-train_indices, ] # get test data

# fit the model on the training data only
glm.out.train <- glm(outcome ~ clump_thickness, 
                data=train_data,
                family=binomial)
# predicted probabilities on the held-out test data
test_pred <- predict(glm.out.train, test_data, type='response')
# ROC curve for the training data (in-sample fitted probabilities)
ROC.train <- calc_ROC(probabilities=glm.out.train$fitted.values, 
                 known_truth=train_data$outcome,
                 model.name="train")
# ROC curve for the test data (out-of-sample predictions)
ROC.test <- calc_ROC(probabilities=test_pred, 
                 known_truth=test_data$outcome,
                 model.name="test")
ROCs <- rbind(ROC.train, ROC.test)
# AUC per curve: sum true_pos weighted by the step width in false_pos
# (the first delta in each group is NA, hence na.rm = TRUE)
ROCs %>% group_by(model.name) %>% 
  mutate(delta=false_pos-lag(false_pos)) %>%
  summarize(AUC=sum(delta*true_pos, na.rm=TRUE)) %>%
  arrange(desc(AUC))
## # A tibble: 2 × 2
##   model.name       AUC
##       <fctr>     <dbl>
## 1       test 0.9433136
## 2      train 0.9299145

3. If this was easy

Write code that generates an arbitrary number of random subdivisions of the data into training and test sets, fits a given model, calculates the area under the curve for each test data set, and then calculates the average and standard deviation of these values.

# Draw one random training/test split of `data`, fit `model` on the training
# part, and return a one-row data frame with the training and test AUCs.
#
# data:           data frame containing an `outcome` column plus predictors
# model:          model formula passed to glm(family = binomial)
# train_fraction: fraction of rows used for training (0 < train_fraction < 1)
#
# Note: uses the current RNG state; call set.seed() beforehand for
# reproducible partitions.
calc_AUC <- function(data, model, train_fraction)
{
  n_obs <- nrow(data) # number of observations in data set
  train_size <- floor(train_fraction * n_obs) # number of observations in training set
  train_indices <- sample(seq_len(n_obs), size = train_size)

  train_data <- data[train_indices, ] # get training data
  test_data <- data[-train_indices, ] # get test data
  glm.out.train <- glm(model, data=train_data, family=binomial)
  test_pred <- predict(glm.out.train, test_data, type='response')
  # ROC curves for training (in-sample) and test (out-of-sample) predictions;
  # model.name doubles as the eventual column name after spread()
  ROC.train <- calc_ROC(probabilities=glm.out.train$fitted.values, 
                 known_truth=train_data$outcome,
                 model.name="training.AUC")
  ROC.test <- calc_ROC(probabilities=test_pred, 
                 known_truth=test_data$outcome,
                 model.name="test.AUC")
  ROCs <- rbind(ROC.train, ROC.test)
  # calculate AUCs: sum true_pos over the false_pos step widths, then
  # reshape so each AUC becomes its own column in a single row
  ROCs %>% group_by(model.name) %>% 
    mutate(delta=false_pos-lag(false_pos)) %>%
    summarize(AUC=sum(delta*true_pos, na.rm=TRUE)) %>%
    mutate(row=1) %>%               # dummy key so spread() yields one row
    spread(model.name, AUC) %>%
    select(-row)
}

# Repeated random subsampling validation: run calc_AUC() `replicates` times,
# each on a fresh random training/test partition, then report the mean and
# standard deviation of the training and test AUCs along with the settings
# used (so several runs can be compared side by side).
subsample_validate <- function(data, model, train_fraction, replicates)
{
  replicate_ids <- data.frame(rep=1:replicates) # dummy data frame to iterate over
  replicate_ids %>%
    group_by(rep) %>%                             # one group per replicate
    do(calc_AUC(data, model, train_fraction)) %>% # new partition + fit each time
    ungroup() %>%                                 # drop grouping before summarizing
    summarize(mean.train.AUC=mean(training.AUC),  # aggregate across replicates
              sd.train.AUC=sd(training.AUC),
              mean.test.AUC=mean(test.AUC),
              sd.test.AUC=sd(test.AUC)) %>%
    mutate(train_fraction=train_fraction, replicates=replicates) # attach metadata
}

Now that we have these two functions, we can use them to complete the exercise:

# settings for the repeated-subsampling validation run
train_fraction <- 0.2 # fraction of data for training purposes
replicates <- 10 # how many times do we want to randomly sample
set.seed(116) # random seed
model <- outcome ~ clump_thickness + normal_nucleoli # the model we want to fit
subsample_validate(biopsy, model, train_fraction, replicates)
## # A tibble: 1 × 6
##   mean.train.AUC sd.train.AUC mean.test.AUC sd.test.AUC train_fraction
##            <dbl>        <dbl>         <dbl>       <dbl>          <dbl>
## 1      0.9703497   0.01179102     0.9727011 0.002249264            0.2
## # ... with 1 more variables: replicates <dbl>
# redo with a different model (adds marg_adhesion as a predictor)
model2 <- outcome ~ clump_thickness + normal_nucleoli + marg_adhesion
subsample_validate(biopsy, model2, train_fraction, replicates)
## # A tibble: 1 × 6
##   mean.train.AUC sd.train.AUC mean.test.AUC sd.test.AUC train_fraction
##            <dbl>        <dbl>         <dbl>       <dbl>          <dbl>
## 1      0.9848656  0.008365815     0.9842488 0.003611386            0.2
## # ... with 1 more variables: replicates <dbl>