**Feb 27, 2018**

In this worksheet, we will use the libraries tidyverse, plotROC, and ggthemes:

```
library(tidyverse)
theme_set(theme_bw(base_size=12)) # set default ggplot2 theme
library(plotROC)
library(ggthemes)
```

We continue working with the biopsy data set:

`biopsy <- read.csv("http://wilkelab.org/classes/SDS348/data_sets/biopsy.csv")`

The following code splits the biopsy data set into a random training and test set:

```
train_fraction <- 0.4 # fraction of data for training purposes
set.seed(126) # set the seed to make the partition reproductible
train_size <- floor(train_fraction * nrow(biopsy)) # number of observations in training set
train_indices <- sample(1:nrow(biopsy), size = train_size)
train_data <- biopsy[train_indices, ] # get training data
test_data <- biopsy[-train_indices, ] # get test data
```

Fit a logistic regression model on the training data set, then predict the outcome on the test data set, and plot the resulting ROC curves. Limit the x-axis range from 0 to 0.15 to zoom into the ROC curve. (Hint: Do **not** use `coord_fixed()`

.)

```
# model to use:
# outcome ~ clump_thickness + uniform_cell_size + uniform_cell_shape
glm.out <- glm(outcome ~ clump_thickness + uniform_cell_size + uniform_cell_shape,
data=train_data,
family=binomial)
# results data frame for training data
df_train <- data.frame(predictor = predict(glm.out, train_data),
known_truth = train_data$outcome,
data_name = "training")
# results data frame for test data
df_test <- data.frame(predictor = predict(glm.out, test_data),
known_truth = test_data$outcome,
data_name = "test")
df_combined <- rbind(df_train, df_test)
ggplot(df_combined, aes(d = known_truth, m = predictor, color = data_name)) +
geom_roc(n.cuts = 0) + xlim(0, 0.15) + scale_color_colorblind()
```

```
## Warning in verify_d(data$d): D not labeled 0/1, assuming benign = 0 and
## malignant = 1!
```

You can calculate the areas under the ROC curves by running `calc_auc()`

on a plot generated with `geom_roc()`

(see previous worksheet). Use this function to calculate the area under the training and test curve for the model `outcome ~ clump_thickness`

. For this exercise, generate a new set of training and test datasets with a different fraction of training data from before.

```
train_fraction <- 0.2 # fraction of data for training purposes
set.seed(123) # set the seed to make the partition reproductible
train_size <- floor(train_fraction * nrow(biopsy)) # number of observations in training set
train_indices <- sample(1:nrow(biopsy), size = train_size)
train_data <- biopsy[train_indices, ] # get training data
test_data <- biopsy[-train_indices, ] # get test data
glm.out <- glm(outcome ~ clump_thickness,
data=train_data,
family=binomial)
df_train <- data.frame(predictor = predict(glm.out, train_data),
known_truth = train_data$outcome,
data_name = "training")
df_test <- data.frame(predictor = predict(glm.out, test_data),
known_truth = test_data$outcome,
data_name = "test")
df_combined <- rbind(df_train, df_test)
p <- ggplot(df_combined, aes(d = known_truth, m = predictor, color = data_name)) +
geom_roc(n.cuts = 0)
calc_auc(p)
```

```
## Warning in verify_d(data$d): D not labeled 0/1, assuming benign = 0 and
## malignant = 1!
```

```
## PANEL group AUC
## 1 1 1 0.9214427
## 2 1 2 0.9050554
```

Write code that combines the AUC values calculated by `calc_auc()`

with the correct group names and orders the output in descending order of AUC. (Hint: We have seen similar code in the previous worksheet.)

```
data_name <- unique(df_combined$data_name)
data_info <- data.frame(data_name,
group = order(data_name))
left_join(data_info, calc_auc(p)) %>%
select(-group, -PANEL) %>%
arrange(desc(AUC))
```

```
## Warning in verify_d(data$d): D not labeled 0/1, assuming benign = 0 and
## malignant = 1!
```

`## Joining, by = "group"`

```
## data_name AUC
## 1 training 0.9214427
## 2 test 0.9050554
```

Write code that generates an arbitrary number of random subdivisions of the data into training and test sets, fits a given model, calculates the area under the curve for each test data set, and then calculates the average and standard deviation of these values.

```
# function that does the heavy lifting
generate_AUC_values <- function(data, formula, train_fraction)
{
n_obs <- nrow(data) # number of observations in data set
train_size <- floor(train_fraction * nrow(data)) # number of observations in training set
train_indices <- sample(1:n_obs, size = train_size)
train_data <- data[train_indices, ] # get training data
test_data <- data[-train_indices, ] # get test data
glm.out <- glm(formula, data=train_data, family=binomial)
df_train <- data.frame(predictor = predict(glm.out, train_data),
known_truth = train_data$outcome,
data_name = "AUC_train")
df_test <- data.frame(predictor = predict(glm.out, test_data),
known_truth = test_data$outcome,
data_name = "AUC_test")
df_combined <- rbind(df_train, df_test)
p <- ggplot(df_combined, aes(d = known_truth, m = predictor, color = data_name)) +
geom_roc(n.cuts = 0)
data_name <- unique(df_combined$data_name)
data_info <- data.frame(data_name,
group = order(data_name))
left_join(data_info, calc_auc(p)) %>%
select(-group, -PANEL) %>%
spread(data_name, AUC)
}
# example use
generate_AUC_values(biopsy, outcome ~ clump_thickness, 0.2)
```

`## Joining, by = "group"`

```
## AUC_train AUC_test
## 1 0.8968605 0.9121516
```

```
# function that does repeated random subsampling validation
subsample_validate <- function(data, formula, train_fraction, replicates)
{
reps <- data.frame(rep=1:replicates) # dummy data frame to iterate over
reps %>% group_by(rep) %>% # iterate over all replicates
do(generate_AUC_values(data, formula, train_fraction)) %>% # run calc_AUC for each replicate
ungroup() %>% # ungroup so we can summarize
summarize(mean_AUC_train = mean(AUC_train), # summarize
sd_AUC_train = sd(AUC_train),
mean_AUC_test = mean(AUC_test),
sd_AUC_test = sd(AUC_test)) %>%
mutate(train_fraction=train_fraction, replicates=replicates) # add columns containing meta data
}
```

Now that we have these two functions, we can use them to complete the exercise. (We set `message = FALSE`

and `warning = FALSE`

for this R chunk so that we donâ€™t get repeated messages and warnings in the knitted html.)

```
train_fraction <- 0.2 # fraction of data for training purposes
replicates <- 10 # how many times do we want to randomly sample
set.seed(116) # random seed
formula <- outcome ~ clump_thickness + normal_nucleoli # the model we want to fit
subsample_validate(biopsy, formula, train_fraction, replicates)
```

```
## # A tibble: 1 x 6
## mean_AUC_train sd_AUC_train mean_AUC_test sd_AUC_test train_fraction
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.964 0.0129 0.968 0.00224 0.200
## # ... with 1 more variable: replicates <dbl>
```

```
# redo with a different model
formula2 <- outcome ~ clump_thickness + normal_nucleoli + marg_adhesion
subsample_validate(biopsy, formula2, train_fraction, replicates)
```

```
## # A tibble: 1 x 6
## mean_AUC_train sd_AUC_train mean_AUC_test sd_AUC_test train_fraction
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.985 0.00850 0.984 0.00362 0.200
## # ... with 1 more variable: replicates <dbl>
```