Ecology in silico

Dynamic Occupancy Models in Stan

Occupancy modeling is possible in Stan, as shown here, despite its lack of support for integer parameters. In many Bayesian applications of occupancy modeling, the true occupancy states (0 or 1) are modeled directly, but this can be avoided by marginalizing the true occupancy state out of the likelihood. The Stan manual (p. 96) gives an example of this kind of marginalization for a discrete change-point model.
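Concretely, for a single site and year with $J$ repeat surveys and detection probability $p$, summing over the two possible latent states gives the likelihood of a detection history $y = (y_1, \dots, y_J)$:

$$P(y \mid \psi, p) = \begin{cases} \psi \prod_{j=1}^{J} p^{y_j} (1 - p)^{1 - y_j} & \text{if } \textstyle\sum_j y_j > 0 \\ \psi (1 - p)^J + (1 - \psi) & \text{otherwise} \end{cases}$$

If the species was detected at least once, the site must be occupied; if it was never detected, the site was either occupied and missed on all $J$ surveys, or truly unoccupied. The log of this quantity is exactly what the likelihood section of the model below increments.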

Here’s a Stan implementation of a dynamic (multi-year) occupancy model of the sort described by MacKenzie et al. (2003).

First, the model statement:

dyn_occ.stan
data {
  int<lower=0> nsite;
  int<lower=0> nyear;
  int<lower=0> nrep;
  array[nsite, nyear, nrep] int<lower=0, upper=1> Y;
}
parameters {
  real<lower=0, upper=1> p;      // detection probability
  real<lower=0, upper=1> gamma;  // colonization probability
  real<lower=0, upper=1> phi;    // persistence probability
  real<lower=0, upper=1> psi1;   // occupancy probability in year 1
}
transformed parameters {
  matrix[nsite, nyear] psi;
  for (r in 1:nsite) {
    for (t in 1:nyear) {
      if (t < 2) {
        psi[r, t] = psi1;
      } else {
        psi[r, t] = psi[r, t - 1] * phi + (1 - psi[r, t - 1]) * gamma;
      }
    }
  }
}
model {
  // priors
  psi1 ~ uniform(0, 1);
  gamma ~ uniform(0, 1);
  phi ~ uniform(0, 1);
  p ~ uniform(0, 1);

  // likelihood, with the latent occupancy state marginalized out
  for (r in 1:nsite) {
    for (t in 1:nyear) {
      if (sum(Y[r, t]) > 0) {
        // detected at least once: the site must be occupied
        target += log(psi[r, t]) + bernoulli_lpmf(Y[r, t] | p);
      } else {
        // never detected: either occupied and missed, or unoccupied
        target += log_sum_exp(log(psi[r, t]) + bernoulli_lpmf(Y[r, t] | p),
                              log1m(psi[r, t]));
      }
    }
  }
}

This model can be made faster by storing values for log(psi) and log(1 - psi), as done in Bob Carpenter’s single season example.
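For instance, a minimal sketch of that optimization (only the transformed parameters and model blocks change; the data and parameters blocks are as above):

transformed parameters {
  matrix[nsite, nyear] psi;
  matrix[nsite, nyear] log_psi;    // cached log(psi)
  matrix[nsite, nyear] log1m_psi;  // cached log(1 - psi)
  for (r in 1:nsite) {
    for (t in 1:nyear) {
      if (t < 2) {
        psi[r, t] = psi1;
      } else {
        psi[r, t] = psi[r, t - 1] * phi + (1 - psi[r, t - 1]) * gamma;
      }
    }
  }
  log_psi = log(psi);      // elementwise, computed once per log-probability evaluation
  log1m_psi = log1m(psi);
}
model {
  // priors as before
  psi1 ~ uniform(0, 1);
  gamma ~ uniform(0, 1);
  phi ~ uniform(0, 1);
  p ~ uniform(0, 1);

  for (r in 1:nsite) {
    for (t in 1:nyear) {
      if (sum(Y[r, t]) > 0) {
        target += log_psi[r, t] + bernoulli_lpmf(Y[r, t] | p);
      } else {
        target += log_sum_exp(log_psi[r, t] + bernoulli_lpmf(Y[r, t] | p),
                              log1m_psi[r, t]);
      }
    }
  }
}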

Fitting the model (in parallel):

library(rstan)
library(parallel)

simulate_data <- function(){
  nsite <- 100   # number of sites
  nrep <- 2      # repeat surveys per year
  nyear <- 10    # number of years
  p <- 0.8       # detection probability
  gamma <- 0.2   # colonization probability
  phi <- 0.8     # persistence probability
  psi1 <- 0.8    # first-year occupancy probability
  psi <- array(dim=c(nsite, nyear))
  psi[, 1] <- psi1
  for (t in 2:nyear){
    psi[, t] <- psi[, t - 1] * phi + (1 - psi[, t-1]) * gamma
  }

  Z <- array(dim=c(nsite, nyear))
  for (t in 1:nyear){
    Z[, t] <- rbinom(nsite, 1, psi[, t])
  }

  Y <- array(dim=c(nsite, nyear, nrep))
  for (r in 1:nsite){
    for (t in 1:nyear){
      Y[r, t, ] <- rbinom(nrep, 1, Z[r, t] * p)
    }
  }
  return(list(nsite=nsite, nrep=nrep, nyear=nyear,
              p=p, gamma=gamma, phi=phi,
              psi1=psi1, Y=Y))
}

## parallel run
d <- simulate_data()
# compile the model once, without sampling (chains = 0)
mod <- stan("dyn_occ.stan", data = d, chains = 0)

estimate_params <- function(model, data, iter=2000){
  sflist <- mclapply(1:4, mc.cores = 4,
             function(i) stan(fit = model, data = data,
                              chains = 1, chain_id = i,
                              iter=iter,
                              refresh = -1))
  fit <- sflist2stanfit(sflist)
  return(fit)
}

fit <- estimate_params(model=mod, data=d)

Does it work? Let’s whip up 1000 simulated datasets and their corresponding estimates for colonization and extinction rates.

one_run <- function(initialized_mod, params){
  require(modeest)
  require(reshape2)
  source("HDI.R")
  d <- simulate_data()
  fit <- estimate_params(model=initialized_mod, data=d)
  # store posterior mode and HDI for the requested params
  post <- extract(fit, params)
  mode <- rep(NA, length(params))
  lci <- rep(NA, length(params))
  uci <- rep(NA, length(params))
  for (i in 1:length(params)){
    # calculate posterior mode
    mode[i] <- mlv(post[[i]], method="mfv")$M
    # calculate HDI
    interv <- HDI(post[[i]])
    lci[i] <- interv[1]
    uci[i] <- interv[2]
  }
  vals <- data.frame(parameter = params, mode=mode,
                     lci=lci, uci=uci)
  return(vals)
}

iterations <- 1000
check <- one_run(mod, c("gamma", "phi"))
check$iteration <- rep(1, nrow(check))
for (i in 2:iterations){
  new_d <- one_run(mod, c("gamma", "phi"))
  new_d$iteration <- rep(i, nrow(new_d))
  check <- rbind(check, new_d)
}

true.vals <- data.frame(parameter = c("gamma", "phi"),
                        value = c(d$gamma, d$phi))
post.vals <- data.frame(parameter = c("gamma", "phi"),
                        value = c(mean(check$mode[check$parameter == "gamma"]),
                                  mean(check$mode[check$parameter == "phi"])))

library(ggplot2)

ggplot(check) +
  geom_point(aes(x=mode, y=iteration)) +
  facet_wrap(~parameter) +
  geom_segment(aes(x=lci, xend=uci,
                   y=iteration, yend=iteration)) +
  theme_bw() +
  geom_vline(aes(xintercept=value), true.vals,
             color="blue", linetype="dashed") +
  geom_vline(aes(xintercept=value), post.vals,
             color="red", linetype="dashed") +
  xlab("value")

Here are the results for the probability of colonization $\gamma$ and the probability of persistence $\phi$. The blue dashed line shows the true value, and the red dashed line shows the mean of all 1000 posterior modes. The black lines represent the HPDI for each iteration, and the black points represent the posterior modes. This example uses a uniform prior on both of these parameters, which probably overrepresents prior ignorance in most real systems.

Based on some initial exploration, this approach seems much, much faster than explicitly modeling the latent occurrence states in JAGS, with better chain mixing and considerably less autocorrelation. Extension to multi-species models should be straightforward too - huzzah!

References

MacKenzie DI, Nichols JD, Hines JE, Knutson MG, Franklin AB. 2003. Estimating site occupancy, colonization, and local extinction when a species is detected imperfectly. Ecology 84(8): 2200-2207.

Spatial Data Extraction Around Buffered Points in R

Quantifying spatial data (e.g. land cover) around points can be done in a variety of ways, some of which require considerable amounts of patience, clicking around, and/or cash for a license. Here’s a bit of code that I cobbled together to quickly extract land cover data from the National Land Cover Database for buffered regions around points (e.g. small ponds, point-count locations, etc.), in this case, U.S. capitals.

Here, extract_cover() is a function to do the extraction (with the help of the raster package), and extraction.R makes a parallel call to the function using the doMC package:

extract_cover.R
# Function to extract the data in R instead of Arc
# inputs: year, buffer size (in meters), point data file,
#   cover data directory, and output file directory
# output: csv file with site id, cover type, and proportion of
#   each cover type within the buffer

extract_cover <- function(year, buffer,
                          point_d = "sites.csv",
                          data_dir="NLCD_data",
                          write_dir="extracted"){
  require(raster)
  require(rgdal)

  # load cover data 
  filename <- paste(data_dir, "/nlcd_",
                    year,
                    "_landcover_2011_edition_2014_03_31.img",
                    sep="")
  NLCD <- raster(filename)

  # load site data
  sites <- read.csv(point_d, header=T)
  coords <- sites[, c("longitude", "latitude")]

  #convert lat/lon to appropriate projection
  names(coords) <- c("x", "y")
  coordinates(coords) <- ~x + y
  proj4string(coords) <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84")
  crs_args <- NLCD@crs@projargs
  sites_transformed <- spTransform(coords, CRS(crs_args))

  #extract land cover data for each point, given buffer size
  Landcover <- extract(NLCD, sites_transformed, buffer=buffer)

  # summarize each site's data by proportion of each cover type
  summ <- lapply(Landcover, function(x){
    prop.table(table(x))
  }
  )

  # generate land cover number to name conversions
  num.codes <- unique(unlist(Landcover))
  cover.names <- NLCD@data@attributes[[1]]$Land.Cover.Class[num.codes + 1]
  levels(cover.names)[1] <- NA # first level is ""
  conversions <- data.frame(num.codes, cover.names)
  conversions <- na.omit(conversions)
  conversions <- conversions[order(conversions$num.codes),]

  # convert to data frame (one row per site x cover type)
  mydf <- data.frame(id = rep(sites$id, lengths(summ)),
                     cover = names(unlist(summ)),
                     percent = unlist(summ))

  # create cover name column
  mydf$cover2 <- mydf$cover
  levels(mydf$cover2) <- conversions$cover.names

  # write output
  out_name <- paste(write_dir, "/",
                    year, "_cover_", buffer,
                    "_m_buffer.csv", sep="")
  write.csv(mydf, out_name)
}
extraction.R
# Script to actually extract the data at various buffer distances
# parallelized for speed (one core per year)
library(doMC)

years <- c(2001, 2006, 2011)
nyears <- length(years)
registerDoMC(nyears)

# input vector of distances (in meters)
buffer_distances <- c(1000)

foreach (i=1:nyears) %dopar% {
  for (j in buffer_distances){
    extract_cover(year = years[i], buffer=j)
  }
}


Multilevel Modeling of Community Composition With Imperfect Detection

This is a guest post generously provided by Joe Mihaljevic.

A common goal of community ecology is to understand how and why species composition shifts across space. Techniques commonly used to identify the environmental covariates behind such shifts rely on ordination of community data to reduce its dimensionality; these include redundancy analysis (RDA), canonical correspondence analysis (CCA), and nonmetric multidimensional scaling (NMDS), each paired with permutation tests. However, as brought to light by Jackson et al. (2012: Ecosphere), these ordination techniques do not discern species-level covariate effects, making it difficult to attribute community-level pattern shifts to species-level changes. Jackson et al. (2012) propose a hierarchical modeling framework as an alternative, which we extend in this post to correct for imperfect detection.

Multilevel models can estimate species-level random and fixed covariate effects to determine the relative contribution of environmental covariates to changing composition across space (Jackson et al. 2012). For presence/absence data, such models are often formulated (written here for a single covariate $x_q$) as:

$$y_q \sim \text{Bernoulli}(\psi_q), \qquad \text{logit}(\psi_q) = \alpha_{spp[q]} + \beta_{spp[q]} x_q$$

$$\alpha_{spp} \sim \text{Normal}(\mu_{int}, \sigma_{int}^2), \qquad \beta_{spp} \sim \text{Normal}(\mu_{slope}, \sigma_{slope}^2)$$

Here $y_q$ is a vector of presences/absences of each species at each site ($q = 1, \dots, nm$, where $n$ is the number of species and $m$ the number of sites). This model can be extended to incorporate multiple covariates.

We are interested in whether species respond differently to environmental gradients (e.g. elevation, temperature, precipitation). If this is the case, then we expect community composition to change along such gradients. Concretely, we are interested in whether $\sigma_{slope}^2$ for any covariate differs from zero.

Jackson et al. (2012) provide code for a maximum likelihood implementation of their model, using data from Southern Appalachian understory herbs and the R package lme4. Here we present a simple extension of Jackson and colleagues' work, correcting for detection error with repeat surveys (i.e. multi-species occupancy modeling). Specifically, the above model can be changed slightly to:

$$y_q \sim \text{Binomial}(J, \; p_{spp[q]} \, z_q), \qquad z_q \sim \text{Bernoulli}(\psi_q)$$

Now $y_q$ is the number of times each species is observed at each site over $J$ surveys, $p_{spp[q]}$ represents the species-specific probability of detection when the species is present, and $z_q$ represents the 'true' occurrence of the species, a Bernoulli random variable with probability $\psi_q$.

To demonstrate the method, we simulate data for a 20-species community across 100 sites with 4 repeat surveys. We assume that three site-level environmental covariates were measured, two of which have effects on occurrence probabilities that vary among species (i.e. random effects), and one of which has a consistent effect for all species (i.e. a fixed effect). We also assume that species-specific detection probabilities vary, but are independent of the environmental covariates.

################################################
# Simulate data
################################################

Nsite <- 100
Ncov <- 3
Nspecies <- 20
J <- 4

# species-specific intercepts:
alpha <- rnorm(Nspecies, 0, 1)

# covariate values
Xcov <- matrix(rnorm(Nsite*Ncov, 0, 2),
               nrow=Nsite, ncol=Ncov)

# I'll assume 2 of the 3 covariates have effects that vary among species
Beta <- array(c(rnorm(Nspecies, 0, 2),
                rnorm(Nspecies, -1, 1),
                rep(1, Nspecies)
                ),
              dim=c(Nspecies, Ncov)
              )

# species-specific detection probs
p0 <- plogis(rnorm(Nspecies, 1, 0.5))
p0

#### Occupancy states ####
Yobs <- array(0, dim = c(Nspecies, Nsite)) # Simulated observations

for(n in 1:Nspecies){
  for(k in 1:Nsite){
    lpsi <- alpha[n] + Beta[n, ] %*% Xcov[k, ] # Covariate effects on occurrence 
    psi <- 1/(1+exp(-lpsi)) #anti-logit

    z <- rbinom(1, 1, psi) # True Occupancy
    Yobs[n, k] <- rbinom(1, J, p0[n] * z) # Observed Occupancy
  }
}

################################################
# Format data for model
################################################
# X needs the covariates repeated for each species, in long form
X <- Xcov[rep(1:Nsite, times=Nspecies), ]

# Species
Species <- rep(c(1:Nspecies), each=Nsite)

# Observations/data: stack the rows of Yobs into one long vector
Y <- as.vector(t(Yobs))

# All sites surveyed same # times:
J <- rep(J, times=Nspecies*Nsite)

# Number of total observations
Nobs <- Nspecies*Nsite

We fit the following model in JAGS, using vague priors.

model {
  # Priors
  psi.mean ~ dbeta(1,1)
  p.detect.mean ~ dbeta(1,1)

  sd.psi ~ dunif(0,10)
  psi.tau <- pow(sd.psi, -2)

  sd.p.detect ~ dunif(0,10)
  p.detect.tau <- pow(sd.p.detect, -2)

  for(i in 1:Nspecies){
    alpha[i] ~ dnorm(logit(psi.mean), psi.tau)T(-12,12)
    lp.detect[i] ~ dnorm(logit(p.detect.mean), p.detect.tau)T(-12,12)
    p.detect[i] <- exp(lp.detect[i]) / (1 + exp(lp.detect[i]))
  }

  for(j in 1:Ncov){
    mean.beta[j] ~ dnorm(0, 0.01)
    sd.beta[j] ~ dunif(0, 10)
    tau.beta[j] <- pow(sd.beta[j]+0.001, -2)
    for(i in 1:Nspecies){
      betas[i,j] ~ dnorm(mean.beta[j], tau.beta[j])
    }
  }

  # Likelihood
  for(i in 1:Nobs){
    logit(psi[i]) <- alpha[Species[i]] + inprod(betas[Species[i], ], X[i, ])
    z[i] ~ dbern(psi[i])
    Y[i] ~ dbin(z[i] * p.detect[Species[i]], J[i])  # JAGS binomial is dbin(p, n)
  }
}
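For reference, here is a minimal sketch of fitting this model with the rjags package (the model file name msom.txt and the MCMC settings are placeholders). Note that the latent states z must be initialized consistently with the data, otherwise JAGS can start from a zero-probability state:

library(rjags)

# bundle the formatted data from above
jags_d <- list(Y = Y, X = X, J = J, Species = Species,
               Nspecies = Nspecies, Ncov = Ncov, Nobs = Nobs)

# any observation with a detection must start with z = 1
inits <- function() list(z = as.numeric(Y > 0))

jm <- jags.model("msom.txt", data = jags_d, inits = inits, n.chains = 3)
update(jm, 5000)  # burn-in
post <- coda.samples(jm,
                     variable.names = c("mean.beta", "sd.beta", "p.detect"),
                     n.iter = 10000)
summary(post)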

Using information theory, specifically the Watanabe-Akaike information criterion (WAIC), we compared this model, which assumes all covariates have random effects, to all combinations of models varying whether each covariate has a fixed or a random effect. See this GitHub repository for all model statements and code.
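For concreteness, WAIC can be computed from the pointwise log-likelihood of each posterior draw. A minimal sketch, assuming ll is an S x Nobs matrix of log-likelihood values (S posterior draws, Nobs observations); monitoring those values is not shown above, so ll is a hypothetical input:

# WAIC from an S x N matrix of pointwise log-likelihoods
waic <- function(ll) {
  lppd <- sum(log(colMeans(exp(ll))))  # log pointwise predictive density
  p_waic <- sum(apply(ll, 2, var))     # effective number of parameters
  -2 * (lppd - p_waic)                 # deviance scale: lower is better
}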

A model that assumes all covariates have random effects, and the data-generating model, in which only covariates 1 and 2 have random effects, performed the best, but were indistinguishable from one another:

This result makes sense because the model with all random effects is able to recover species-specific responses to site-level covariates very well:

However, this model estimates that the 95% HDI of $\sigma_{slope}$ for covariate 3 includes zero, indicating that this covariate effectively has a fixed, rather than random, effect among species.
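That check is straightforward with coda, assuming post is the mcmc.list returned by coda.samples and that sd.beta was among the monitored parameters:

library(coda)

# 95% highest posterior density interval for the among-species
# slope standard deviation of covariate 3
HPDinterval(post[, "sd.beta[3]"], prob = 0.95)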

Thus, we could conclude that the first two covariates have random effects, while the third covariate has a fixed effect. This means that composition shifts along gradients of covariates 1 and 2. We can visualize the relative contribution of covariates 1 and 2's random effects to composition using ordination, as discussed in Jackson et al. (2012). To do this, we compare the linear predictor (i.e. $\text{logit}(\psi_q)$) of the best model that includes only the significant random effects to that of a model without any random effects.
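A minimal sketch of that comparison, assuming lp_full and lp_fixed are Nspecies x Nsite matrices of posterior mean linear predictors from the two models (the object names are illustrative; the full code is linked below):

# the difference in linear predictors isolates the random-effect contribution
lp_diff <- lp_full - lp_fixed

# ordinate sites by their species-level random-effect signatures
ord <- prcomp(t(lp_diff), scale. = TRUE)
plot(ord$x[, 1], ord$x[, 2], xlab = "PC1", ylab = "PC2")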

The code to extract the linear predictors and ordinate the community is provided on GitHub.