Strategic Coding Practices

library(upscaler)

##add_folder()           ## ONLY RUN ONCE

 

1

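To start, go to this link and scroll down to “Download Data”. From there, sort by site to download the “BART” site dataset for years 2013-2023 (the BART records themselves begin in 2015; see the date range below). In the compressed folder, you should see a list of folders organized by year in the folder name (nine in the download used below, because 2020 appears twice). Store the extracted folder somewhere on your desktop for now.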

https://data.neonscience.org/data-products/DP1.10003.001

BART: January 2015 - December 2023

 

2

Within each year’s folder, you will use only the file with “countdata” in its name. Using for loops, iterate through each year’s folder to gather the file names of these “countdata” .csv files.

## Folder that holds the extracted NEON download, and the prefix shared by
## every "countdata" file name for the BART site.
directory <- "NEON_count-landbird"
startstring <- "NEON.D01.BART.DP1.10003.001.brd_countdata"

## One subfolder per downloaded package. list.files() returns character(0)
## when `directory` does not exist, so everything below degrades to empty
## results instead of failing part-way through.
subfolders <- list.files(directory)

## Collect the paths of all files whose name starts with `startstring`.
##
## directory   - path of the folder that contains `subfolders`
## subfolders  - character vector of folder names inside `directory`
## startstring - prefix a file name must start with to be kept
##
## Returns a character vector of paths ("directory/folder/file"), in folder
## order and, within a folder, in list.files() order. A folder may contribute
## zero, one or several paths.
find_countdata_files <- function(directory, subfolders, startstring) {
  per_folder <- lapply(subfolders, function(folder) {
    files <- list.files(file.path(directory, folder))
    matches <- files[startsWith(files, startstring)]
    ## file.path() yields character(0) when `matches` is empty, so folders
    ## without a countdata file simply drop out of the result.
    file.path(directory, folder, matches)
  })
  ## unlist() of an empty list is NULL; as.character() turns it into
  ## character(0) so callers always get a character vector.
  as.character(unlist(per_folder, use.names = FALSE))
}

filepaths <- find_countdata_files(directory, subfolders, startstring)
filenames <- basename(filepaths)

## Number of countdata files actually found. This is derived from the data
## rather than from the calendar range, because a year can have more than
## one package (or none), so "2023 - 2015 + 1" is only right by coincidence.
years <- length(filenames)

print(filenames)
## [1] "NEON.D01.BART.DP1.10003.001.brd_countdata.2015-06.basic.20241118T065914Z.csv"
## [2] "NEON.D01.BART.DP1.10003.001.brd_countdata.2016-06.basic.20241118T142515Z.csv"
## [3] "NEON.D01.BART.DP1.10003.001.brd_countdata.2017-06.basic.20241118T043125Z.csv"
## [4] "NEON.D01.BART.DP1.10003.001.brd_countdata.2018-06.basic.20241118T105926Z.csv"
## [5] "NEON.D01.BART.DP1.10003.001.brd_countdata.2019-06.basic.20241118T064156Z.csv"
## [6] "NEON.D01.BART.DP1.10003.001.brd_countdata.2020-06.basic.20241118T184512Z.csv"
## [7] "NEON.D01.BART.DP1.10003.001.brd_countdata.2020-07.basic.20241118T010504Z.csv"
## [8] "NEON.D01.BART.DP1.10003.001.brd_countdata.2021-06.basic.20241118T105538Z.csv"
## [9] "NEON.D01.BART.DP1.10003.001.brd_countdata.2022-06.basic.20241118T033934Z.csv"

 

3

Starting with pseudo-code, generate functions to 1) clean the data of any empty/missing cases, 2) extract the year from each file name, 3) calculate Abundance for each year (total number of individuals found), 4) calculate Species Richness for each year (number of unique species found), 5) run a simple regression model for Species Richness (S) vs. Abundance for every year, and 6) generate histograms for both Abundance and Species Richness (S) and store the plots.

## Names of the six helper functions, listed in the order of the tasks above.
functions <- c(
  ## 1) Cleaning the data for any empty/missing cases
  "clean_data",
  ## 2) Extract the year from each file name
  "extract_year",
  ## 3) Calculate Abundance for each year (Total number of individuals found)
  "calc_abundance",
  ## 4) Calculate Species Richness for each year (Number of unique species found)
  "calc_richness",
  ## 5) Run a simple regression model for Species Richness (S) vs. Abundance for every year
  "run_regression",
  ## 6) Generate histograms for both Abundance and Species Richness (S) and store the plots
  "make_histograms"
)

## - - - - - - -

##build_function(functions)           ## ONLY RUN ONCE

## - - - - - - -

## Load every helper definition stored in the "Functions" folder.
## list.files() returns character(0) when that folder is missing or empty.
fxs <- list.files("Functions")

## Loop over the paths themselves rather than over 1:length(fxs): for an
## empty vector 1:length(fxs) counts 1, 0 and would try to source
## "Functions/NA". file.path() returns character(0) for empty input, so the
## loop body is skipped in that case.
for (fx in file.path("Functions", fxs)) {
  source(fx)
}
## Warning: package 'ggplot2' was built under R version 4.3.3

 

4

Create an initial empty data frame to hold the above summary statistics. You should have a column for the file name, one for abundance, one for species richness, one for year, and columns for the regression model summary statistics.

## Summary table with `years` rows; every cell starts as NA and is filled in
## later by the batch loop.
stats <- data.frame(
  filename = rep(NA, years),
  abundance = rep(NA, years),
  richness = rep(NA, years),
  year = rep(NA, years)
)

## Placeholder for the regression coefficient table: one row per model term,
## one column per summary statistic.
regression <- data.frame(
  estimate = rep(NA, 2),
  std.err = rep(NA, 2),
  t.value = rep(NA, 2),
  p.value = rep(NA, 2),
  row.names = c("intercept", "abundance")
)

 

5

Using a for loop, run your created functions as a batch process for each folder, changing the working directory as necessary to read in the correct files, calculating summary statistics with your created functions, and then writing them out into your summary statistics data frame.

## Batch step: for each countdata file, read and clean it via its path, then
## write the file name, abundance, richness and year into row i of `stats`.
## seq_along() is used instead of 1:length(filenames) so that an empty
## `filenames` vector runs zero iterations rather than i = 1, 0.
## NOTE(review): assumes `stats` was created with one row per entry of
## `filenames` - confirm against the chunk that builds it.
for (i in seq_along(filenames)) {
  name_i <- filenames[i]

  ## clean_data() receives the path, not the bare name, so no working
  ## directory change is needed.
  df_i <- clean_data(filepaths[i])

  stats[i, ] <- list(
    name_i,
    calc_abundance(df_i),
    calc_richness(df_i),
    extract_year(name_i)
  )
}

## Show the numeric summary columns (abundance, richness, year).
stats[, 2:4]
##   abundance richness year
## 1       454       40 2015
## 2       883       39 2016
## 3       685       35 2017
## 4       772       37 2018
## 5       628       44 2019
## 6       626       46 2020
## 7        89       18 2020
## 8      1015       50 2021
## 9       699       39 2022
## - - - - - - -

## Fit the model on the per-file richness and abundance values, then copy the
## coefficient table of its summary into the pre-built `regression` frame.
## NOTE(review): presumably richness ~ abundance, judging by the argument
## order and the "abundance" row label - confirm in run_regression().
model <- run_regression(stats$richness, stats$abundance)
regression[, ] <- summary(model)$coefficients
regression
##              estimate     std.err  t.value     p.value
## intercept 21.33140361 5.644479398 3.779162 0.006899822
## abundance  0.02666508 0.008105909 3.289585 0.013310138
## - - - - - - -

## Build the histograms from the per-file summary table (abundance and
## richness, per the task description). The returned value auto-prints at top
## level; the captured output below shows a list with two elements.
make_histograms(stats)
## [[1]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## [[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 

Extra

## Scatter plot of richness against abundance: one diamond per countdata
## file, each labelled with its year just to the right of the point.
ggplot(stats, aes(x = abundance, y = richness, label = year)) +
  geom_point(colour = "seagreen", shape = 18, size = 2.5) +
  geom_text(colour = "grey25", size = 2.5, hjust = 0.25, nudge_x = 25) +
  labs(x = "Abundance", y = "Richness", caption = "Site: BART") +
  ## Widen the margins and push titles, axis labels and caption away from
  ## the panel.
  theme(
    plot.margin = margin(1, 1, 1, 1, "cm"),
    plot.title = element_text(hjust = 0.5, vjust = 4),
    plot.subtitle = element_text(hjust = 0.5, vjust = 4),
    plot.caption = element_text(vjust = -7),
    axis.title.x = element_text(vjust = -4),
    axis.title.y = element_text(vjust = 4)
  )