Simulating and Fitting Data Distributions

Fitting data distribution curves to a dataset that has intensity of earthworm invasion (biomass) and corresponding macro-arthropod abundance.

The simulations will be using the abundance data (Number of individuals per m^2)

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

# Read in data vector
z <- read.csv("plotdata_macro_constrained_20210105.csv",header=TRUE,sep=",")
str(z)
## 'data.frame':    80 obs. of  9 variables:
##  $ plot           : chr  "CAB.01" "CAB.02" "CAB.03" "CAB.04" ...
##  $ worms          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ richness       : int  18 19 19 12 14 14 17 18 19 12 ...
##  $ forest         : chr  "CAB" "CAB" "CAB" "CAB" ...
##  $ year           : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
##  $ worm_biomass.m2: num  7.968 6.468 0.376 12.004 9.8 ...
##  $ abundance.m2   : int  56 54 76 28 36 46 110 102 132 32 ...
##  $ biomass.g.m2   : num  0.486 0.657 0.411 0.396 0.178 ...
##  $ shannon        : num  2.8 2.85 2.68 2.44 2.58 ...
summary(z)
##      plot               worms        richness        forest         
##  Length:80          Min.   :0.0   Min.   : 7.00   Length:80         
##  Class :character   1st Qu.:0.0   1st Qu.:15.00   Class :character  
##  Mode  :character   Median :0.5   Median :19.00   Mode  :character  
##                     Mean   :0.5   Mean   :19.77                     
##                     3rd Qu.:1.0   3rd Qu.:22.00                     
##                     Max.   :1.0   Max.   :45.00                     
##       year      worm_biomass.m2    abundance.m2    biomass.g.m2   
##  Min.   :2016   Min.   :  0.000   Min.   : 24.0   Min.   :0.1185  
##  1st Qu.:2016   1st Qu.:  0.690   1st Qu.: 56.0   1st Qu.:0.3587  
##  Median :2016   Median :  7.218   Median : 74.0   Median :0.6261  
##  Mean   :2016   Mean   : 22.971   Mean   : 87.4   Mean   :0.8771  
##  3rd Qu.:2016   3rd Qu.: 45.491   3rd Qu.:103.0   3rd Qu.:0.9514  
##  Max.   :2017   Max.   :136.720   Max.   :312.0   Max.   :5.5635  
##     shannon     
##  Min.   :1.294  
##  1st Qu.:2.309  
##  Median :2.598  
##  Mean   :2.569  
##  3rd Qu.:2.838  
##  Max.   :3.420
# Plot histogram of data

p1 <- ggplot(data=z, aes(x=abundance.m2, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

# Add empirical density curve
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

# Get maximum likelihood parameters for normal
normPars <- fitdistr(z$abundance.m2,"normal")
print(normPars)
##      mean         sd    
##   87.400000   53.894712 
##  ( 6.025612) ( 4.260751)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 87.4 53.9
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 6.03 4.26
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 36.3 0 0 18.2
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 80
##  $ loglik  : num -432
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] 
## mean 
## 87.4
# note structure of getting a named attribute
# Plot normal prob. density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$abundance.m2),len=length(z$abundance.m2))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$abundance.m2), args = list(mean = meanML, sd = sdML))
p1 + stat

# Plot exponential prob. density
expoPars <- fitdistr(z$abundance.m2,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$abundance.m2), args = list(rate=rateML))
p1 + stat + stat2

# Plot uniform prob. density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$abundance.m2), args = list(min=min(z$abundance.m2), max=max(z$abundance.m2)))
p1 + stat + stat2 + stat3

# plot gamma prob. density - this one fits best!
gammaPars <- fitdistr(z$abundance.m2,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$abundance.m2), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4

# Plot beta prob density
pSpecial <- ggplot(data=z, aes(x=abundance.m2/(max(abundance.m2 + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$abundance.m2/max(z$abundance.m2 + 0.1),start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$abundance.m2), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

Simulate a new data set. Using the best-fitting distribution, go back to the code and get the maximum likelihood parameters. Use those to simulate a new data set, with the same length as your original vector, and plot that in a histogram and add the probability density curve. Right below that, generate a fresh histogram plot of the original data, and also include the probability density curve.

zsim <- rgamma(n=80, shape=3.6329359, rate = 0.0415673)
zsim <- data.frame(1:80,zsim)
names(zsim) <- list("ID","myVar")
zsim <- zsim[zsim$myVar>0,]
str(zsim)
## 'data.frame':    80 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ myVar: num  50.1 79.2 32.9 50.3 63.9 ...
summary(zsim$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.19   50.23   78.10   87.82  114.22  243.63
psim <- ggplot(data=zsim, aes(x=myVar, y=after_stat(density))) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(psim)

print(p1)

How do the two histogram profiles compare? Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?

The two histogram profiles are not drastically different but there are noticeable differences between the simulated data and the original data. The simulated data using the gamma distribution parameters is a little bit more uniform in spread and has less skewness compared to the original data

If you have entered a large data frame with many columns, try running all of the code on a different variable to see how the simulation performs.

Insect biomass follows similar distribution as insect abundance (gamma)

Shannon diversity follows the normal distribution

Earthworm biomass seems to fit the beta distribution the best

For species richness, the normal distribution fits best