--- title: "Trait-based modeling approach" author: "Barbara A. Han, Adrian A. Castellanos, John Paul Schmidt, Ilya R. Fischhoff, John M. Drake" date: "`r format(Sys.time(), "%B %d, %Y")`" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r Packages} packages <- c("tidyverse", "magrittr", "Hmisc", "gghighlight", "caret", "gbm", "Matrix", "pdp", "caTools", "ROCR", "foreach", "dismo", "doSNOW", "parallel", "patchwork", "reshape2", "gtable", "moments") #If you don't have any of these packages installed, installing them now is a good idea sapply(packages, library, character.only = T) setwd("C:/Users/castellanosa/Desktop/CarnivoresGBM/") #set to your appropriate working directory with the CarnivoreTraits22Mar2021.csv file source("https://raw.githubusercontent.com/hanlab-ecol/GBMFunctions/main/gridSearch.R") source("https://raw.githubusercontent.com/hanlab-ecol/GBMFunctions/main/partial_plotR.R") source("https://raw.githubusercontent.com/hanlab-ecol/GBMFunctions/main/boostrapGBM.R") ``` # Trait Data Cleanup We'll first read in our carnivore trait data, add a couple of variables (built out of existing columns), and remove species with no trait data (a casualty of the steady march of taxonomy). ```{r ReadInData} DATA <- read.csv("CarnivoreTraits22Mar2021.csv", header = T) DATA %<>% mutate(HomeRangeScaled_km2_g = HomeRange_km2/BodyMass.Value, dietBreadth_n = apply(DATA[, 21:30], 1, function(x) sum(x > 0))) %>% dplyr::select(1:83, 134:135, 84:133) #reorder columns to make messing with their classes easier DATA[which(apply(DATA[, 3:117], 1, function(x) sum(is.na(x))) == 115), "species"] #which species have NA values for all traits (minus taxonomy) DATA <- DATA[which(apply(DATA[, 3:117], 1, function(x) sum(is.na(x))) != 115), ] #remove those species which only have NA values colnames(DATA)[-nearZeroVar(DATA)] #just checking VARS <- colnames(DATA)[c(-1:-2, -134:-135)] ``` We'll next transform any highly skewed variables. Rather than looking and making a determination from density curves, we'll use a quantitative measure of skewness. We will make some determinations of removing variables that appear skewed thanks to how they are presented (typically percentages). ```{r SkewedVariables} SKEW <- sapply(c(3:85), function(x) skewness(na.omit(DATA[, x]))) #use moments package to determine skewness SKEW PLT <- lapply(VARS, function(i) { DATA %>% ggplot(aes(x = DATA[, i])) + geom_density() + labs(x = i) }) #wrap_plots(PLT) #create density plots for all variables #wrap_plots(PLT[which(SKEW > 1.5 & SKEW < 2)]) #only plot those with a skewness above 2 CLSK <- colnames(DATA)[c(3:85)][which(SKEW > 2)] #add heavily skewed variables to a list CLSK <- CLSK[c(-8:-17)] #remove diet and foraging strategy variables CLSK <- c(CLSK, "DispersalAge_d", "female_maturity_d", "male_maturity_d", "gestation_d", "HumanPopulationDensityMax_n") #add other highly skewed variables for(i in CLSK) { if(i %in% c("HumanPopulationDensityMin_n", "HumanPopulationDensityMax_n", "HumanPopulationDensityMedian_n", "HumanPopulationDensity10perc_n")) { #deal with negative infinity issue using an if statement DATA[, i] <- log(DATA[, i] + 1) } else DATA[, i] <- log(DATA[, i]) } ``` All taxonomic and habitat classification variables are changed to factors. ```{r BinaryVariables} for(i in colnames(DATA)[c(86:133)]) { #change binary variables to factors DATA[, i] <- as.factor(DATA[, i]) } ``` # Modeling ## Hurdle Model - classification of zero and nonzero data Modeling was done with first classification of zoonotic host status (a binary label) with a Bernoulli error distribution. ### Parameterization We evaluated 36 combinations of parameters to determine the set that returned the best deviance curve and highest test evaluation statistics. ```{r Parameterization} VARS <- colnames(DATA)[-nearZeroVar(DATA)][-1:-2] #remove variables with near zero variation VARS <- VARS[c(-50:-52, -56, -110:-111)] #remove longitude variables and other labels cl <- makeCluster(3) GRID <- gridSearch(DF = DATA, label = "label", vars = VARS, eta = c(0.0001, 0.001, 0.01), max_depth = c(2, 3, 4), n.minobsinnode = c(2, 3, 4, 5), k_split = 0.8, distribution = "bernoulli", nrounds = 100000, cl = cl) ``` ```{r ParameterDevianceCurves} GRID[[1]] PLTS <-lapply(1:length(unique(GRID[[2]]$group)), function(i) GRID[[2]] %>% filter(group == unique(GRID[[2]]$group)[i]) %>% ggplot() + geom_line(aes(x = index, y = train), color = "black", size = 1) + geom_line(aes(x = index, y = valid), color = "green", size = 1) + geom_vline(xintercept = GRID[[2]] %>% filter(group == unique(GRID[[2]]$group)[i]) %>% dplyr::select(best.iter) %>% unique %>% as.numeric, color = "blue", linetype = "dashed", size = 1) + #scale_y_continuous(limits = c(-2600, -1900), breaks = c(-2600, -2500, -2400, -2300, -2200, -2100, -2000, -1900)) + labs(x = "Iteration", y = "Bernoulli deviance", title = unique(GRID[[2]]$group)[i]) + theme(panel.background = element_blank(), panel.border = element_rect(fill = "transparent", color = "black", size = 1), panel.grid.major = element_line(color = "grey90"), title = element_text(size = 8))) patchwork::wrap_plots(PLTS) ``` ### Model Evaluation We used a bootstrap approach to evaluate our models, running ten iterations of the best performing parameters. A differently arranged test and training split was used for each iteration (see the bootstrapGBM function code). ```{r Evaluation} BOOT <- bootstrapGBM(DF = DATA, label = "label", vars = VARS, k_split = 0.8, distribution = "bernoulli", eta = 0.001, max_depth = 2, n.minobsinnode = 2, nrounds = 60000, nruns = 10, bootstrap = "observed", cl = cl) NULB <- bootstrapGBM(DF = DATA, label = "label", vars = VARS, k_split = 0.8, distribution = "bernoulli", eta = 0.001, max_depth = 2, n.minobsinnode = 2, nrounds = 60000, nruns = 10, bootstrap = "null", cl = cl) mean(BOOT[[1]]$eval_test) - mean(NULB[[1]]$eval_test) ``` ```{r RelativeImportance} IMP <- apply(BOOT[[1]][, 10:114], 2, mean) CONF <- t(apply(BOOT[[1]][, 10:114], 2, function(x) t.test(x, conf.level = 0.95)$conf.int)) CONF <- cbind.data.frame(var = row.names(CONF), CONF) row.names(CONF) <- NULL colnames(CONF) <- c("var", "LI", "HI") CONF %<>% mutate(imp = IMP) CONF %>% ggplot() + geom_point(aes(x = imp, y = var), size = 2) + geom_linerange(aes(xmin = LI, xmax = HI, y = var), size = 1) + labs(x = "Relative Importance", y = "Variable") + scale_y_discrete(limits = CONF$var[order(CONF$imp, decreasing = F)]) + theme(panel.background = element_rect(fill = "white", color = "grey50"), panel.grid.major = element_line(color = "grey90"), axis.text.y = element_text(vjust = 0.1)) ``` ```{r PartialDependency} VARS <- VARS[apply(BOOT[[1]][, 10:114], 2, mean) >= 1] NAMES <- VARS[order(apply(BOOT[[1]][, VARS], 2, mean), decreasing = T)] BRKS <- lapply(NAMES, function(i) hist(as.numeric(DATA[, i]), plot = F)$breaks) HIST <- lapply(1:length(NAMES), function(i) hist(as.numeric(DATA[, NAMES[i]]), breaks = BRKS[[i]], plot = F)) HIST <- do.call(rbind, lapply(1:length(NAMES), function(h) data.frame(variable.value = HIST[[h]]$mids, value=HIST[[h]]$counts/nrow(DATA), variable.name=NAMES[h], var = "frequency"))) partial_plot(BOOT[[2]], HIST, vars = NAMES[1:6], type = "mean", cleaned_names = CLND) ``` ## Hurdle Model - ```{r HurdleModelStep2} DATA %<>% filter(label == 1) VARS <- colnames(DATA)[-nearZeroVar(DATA)][-1:-2] VARS <- VARS[c(-49:-51, -55, -110:-111)] cl <- makeCluster(3) GRID <- gridSearch(DF = DATA, label = "num_zoonoses", vars = VARS, eta = c(0.0001, 0.001, 0.01), max_depth = c(2, 3, 4), n.minobsinnode = c(2, 3, 4, 5), k_split = 0.8, distribution = "poisson", nrounds = 100000, cl = cl) ``` ```{r ParameterizationElectricBoogaloo} GRID[[1]] PLTS <-lapply(1:length(unique(GRID[[2]]$group)), function(i) GRID[[2]] %>% filter(group == unique(GRID[[2]]$group)[i]) %>% ggplot() + geom_line(aes(x = index, y = train), color = "black", size = 1) + geom_line(aes(x = index, y = valid), color = "green", size = 1) + geom_vline(xintercept = GRID[[2]] %>% filter(group == unique(GRID[[2]]$group)[i]) %>% dplyr::select(best.iter) %>% unique %>% as.numeric, color = "blue", linetype = "dashed", size = 1) + labs(x = "Iteration", y = "Bernoulli deviance", title = unique(GRID[[2]]$group)[i]) + theme(panel.background = element_blank(), panel.border = element_rect(fill = "transparent", color = "black", size = 1), panel.grid.major = element_line(color = "grey90"), title = element_text(size = 8))) patchwork::wrap_plots(PLTS) ``` ```{r EvaluationYetAgain} BOOT <- bootstrapGBM(DF = DATA, label = "num_zoonoses", vars = VARS, k_split = 0.8, distribution = "poisson", eta = 0.001, max_depth = 3, n.minobsinnode = 2, nrounds = 100000, nruns = 10, bootstrap = "observed", cl = cl) NULB <- bootstrapGBM(DF = DATA, label = "num_zoonoses", vars = VARS, k_split = 0.8, distribution = "poisson", eta = 0.001, max_depth = 3, n.minobsinnode = 2, nrounds = 100000, nruns = 10, bootstrap = "null", cl = cl) mean(BOOT[[1]]$eval_test) - mean(NULB[[1]]$eval_test) ``` ```{r AnotherRelativeImportancePlot} IMP <- apply(BOOT[[1]][, 10:114], 2, mean) CONF <- t(apply(BOOT[[1]][, 10:114], 2, function(x) t.test(x, conf.level = 0.95)$conf.int)) CONF <- cbind.data.frame(var = row.names(CONF), CONF) row.names(CONF) <- NULL colnames(CONF) <- c("var", "LI", "HI") CONF %<>% mutate(imp = IMP) CONF %>% ggplot() + geom_point(aes(x = imp, y = var), size = 2) + geom_linerange(aes(xmin = LI, xmax = HI, y = var), size = 1) + labs(x = "Relative Importance", y = "Variable") + scale_y_discrete(limits = CONF$var[order(CONF$imp, decreasing = F)]) + theme(panel.background = element_rect(fill = "white", color = "grey50"), panel.grid.major = element_line(color = "grey90"), axis.text.y = element_text(vjust = 0.1)) ``` ```{r PartialDependencyPlots} VARS <- VARS[apply(BOOT[[1]][, 10:114], 2, mean) >= 1] NAMES <- VARS[order(apply(BOOT[[1]][, VARS], 2, mean), decreasing = T)] BRKS <- lapply(NAMES, function(i) hist(as.numeric(DATA[, i]), plot = F)$breaks) HIST <- lapply(1:length(NAMES), function(i) hist(as.numeric(DATA[, NAMES[i]]), breaks = BRKS[[i]], plot = F)) HIST <- do.call(rbind, lapply(1:length(NAMES), function(h) data.frame(variable.value = HIST[[h]]$mids, value=HIST[[h]]$counts/nrow(DATA), variable.name=NAMES[h], var = "frequency"))) partial_plot(BOOT[[2]], HIST, vars = NAMES, type = "mean") ```