knitr::opts_chunk$set(echo = TRUE)

In this tutorial we will map an existing vector population data set to the VecDyn format.

Please refer to the 'R for Data Science guide'(https://r4ds.had.co.nz/) for further information on some of the techniques we use here.

Load the following packages (and install if required)

library(tidyverse)
library(readr)
library(lubridate)
library(plyr)

Create the VecDyn data template / data-frame and name it.


vecdyn_mcm_2012  <- data.frame(
                      taxon = character(),
                      location_description = character(),
                      study_collection_area = character(),
                      geo_datum = character(),
                      gps_obfuscation_info = character(),
                      species_id_method = character(),
                      study_design = character(),
                      sampling_strategy = character(),
                      sampling_method = character(),
                      sampling_protocol = character(),
                      measurement_unit = character(),
                      value_transform = character(),
                      sample_start_date = character(),
                      sample_start_time = character(),
                      sample_end_date =  character(),
                      sample_end_time = character(),
                      sample_value = numeric(),
                      sample_sex = character(),
                      sample_stage = character(),
                      sample_location = character(),
                      sample_collection_area = character(),
                      sample_lat_dd = character(),
                      sample_long_dd = character(),
                      sample_environment = character(),
                      additional_location_info = character(),
                      additional_sample_info = character(),
                      sample_name = character(),
                      stringsAsFactors=FALSE)

#write.csv(VecDyn_template, file = "VecDyn_template.csv", row.names = FALSE)

In this example, we will use a Manatee County Mosquito collection data-set. When importing data-sets, make sure you set 'stringsAsFactors=FALSE' to ensure R does not automatically convert strings to factors.


Manatee_County_Mosquito_2012 <- read.csv(url("https://zenodo.org/record/1217702/files/VectorBase-2012-Manatee_County_Mosquito_Control_District_Florida_USA.csv?download=1"), stringsAsFactors=FALSE)

Inspect the data-frame.

head(Manatee_County_Mosquito_2012)

All geo-names should be presented in one field. The Manatee_County data set's study location is characterized using 3 fields, we need to combine this information into one field. We can use a tidyr function called "unite" to do this.

Manatee_County_Mosquito_2012 <-
  Manatee_County_Mosquito_2012 %>%
  tidyr::unite("location_description", c("location_ADM2","location_ADM1", "location_country"), sep = ",")

Extract all the data from the Manatee_County_Mosquito_2012 to individual vectors. Each new vector name represents a name in the VecDyn template


taxon <- Manatee_County_Mosquito_2012$species

sample_value <- Manatee_County_Mosquito_2012$sample_count

sample_sex <- Manatee_County_Mosquito_2012$sex

sample_stage <-  Manatee_County_Mosquito_2012$developmental_stage

sample_start_date <- Manatee_County_Mosquito_2012$collection_date_start

sample_end_date <- Manatee_County_Mosquito_2012$collection_date_end

sample_lat_dd <- Manatee_County_Mosquito_2012$GPS_lat

sample_long_dd <- Manatee_County_Mosquito_2012$GPS_lon

sample_location <- Manatee_County_Mosquito_2012$location

location_description <- Manatee_County_Mosquito_2012$location_description

sampling_method <- Manatee_County_Mosquito_2012$attractant

sampling_protocol <- Manatee_County_Mosquito_2012$trap_type

species_id_method <- Manatee_County_Mosquito_2012$identification_method

sample_name <-  Manatee_County_Mosquito_2012$trap_id


Bind the extracted values together creating and create a new data-frame. We then bind the new data-frame with the vecdyn_mcm_2012 template.


a <- cbind(taxon, sample_value, sample_sex, sample_stage, sample_start_date, sample_end_date, sample_lat_dd, sample_long_dd, sample_location, location_description, sampling_method, sampling_protocol, species_id_method, sample_name)

a <- data.frame(a, stringsAsFactors=FALSE)

vecdyn_mcm_2012 <- rbind.fill(vecdyn_mcm_2012, a)

You should check to see if the number of observations in the newly created vecdyn_mcm_2012 data frame match the number of observations in the original manatee county data frame.

All VecDyn dates need to be formatted as Y-m-d, in this example the dates are already in the correct format.

vecdyn_mcm_2012 <-
  vecdyn_mcm_2012 %>%
  dplyr::mutate(sample_end_date = as.Date(as.character(sample_end_date, format = "%Y/%m/%d"))) %>%
  dplyr::mutate(sample_start_date = as.Date(as.character(sample_start_date, format = "%Y/%m/%d"))) %>%
  # We also convert the sample_value field to numeric so we can create a time series in the next step
  dplyr::mutate(sample_value = as.numeric(sample_value))


We can test our new data frame set against the original data frame to see if they produce the same results.

vecdyn_mcm_2012  %>%
  dplyr::select(sample_value, sample_end_date) %>%
  dplyr::group_by(sample_end_date) %>%
  ggplot(aes(sample_end_date, sample_value)) + geom_line() +
  scale_x_date(date_breaks = "1 month", date_minor_breaks = "1 week",
             date_labels = "%B")


Manatee_County_Mosquito_2012  %>%
  dplyr::select(sample_count, collection_date_end) %>%
  plyr::mutate(collection_date_end = as.Date(as.character(collection_date_end, format = "%Y/%m/%d"))) %>%
  dplyr::mutate(sample_count = as.numeric(sample_count)) %>%
  ggplot(aes(collection_date_end, sample_count)) + geom_line() +
  scale_x_date(date_breaks = "1 month", date_minor_breaks = "1 week",
             date_labels = "%B")

Finally write the data set to CSV, set all missing value to blank ""


write.csv(vecdyn_mcm_2012, "vecdyn_mcm_2012.csv", row.names = FALSE, quote=TRUE, na = "", fileEncoding = "UTF-8")

Next complete the publication information, this can be submitted as a CV or using the online web form when you submit the main data-set. The publication information for our example data-set here is found here https://zenodo.org/record/1217702#.Xvs89ygza70. Note that this information should only be recorded as one line (or row) for each field.

rm(list=setdiff(ls(), c("vecdyn_mcm_2012")))

You will also need to provide publication information for the data-set on the VecDyn data submissions page (example below) .


title <- "Manatee County Mosquito Control District entomological monitoring 2012"
collection_author <- "Joe Bloggs"
dataset_doi <- "10.5281/zenodo.1217702"
description <- "Mosquito surveillance from the Manatee County Mosquito Control District Vector Surveillance program to survey mosquito population"
url <- "https://zenodo.org/record/1217702#.XwbwICgzZPZ"
contact_name <- "Joe Bloggs"
contact_affiliation <- "Manatee County Mosquito Control District"
email <- "Joe Bloggs@joebloggs.com"
dataset_license=character <- "open"


Extracting observational climate data to spatial-temporal sample points [trap locations]

In this section of the tutorial, we will show you how to combine observational climate data and Vecdyn data. You can run through the above steps to get the data set we'll be working with. Please note for simplicity, we should you how to download and process max daily temp and daily precipitation. Depending on the species studied or study methodology, it may be appropriate to use daily min temps, take an average of min / max temps or calculate the range. You can adapt the code below to do this, minimum daily temperature data can also be sourced from the links below.

General guidelines

CPC Global Temperature data provided by the NOAA/OAR/ESRL PSD, Boulder, Colorado, USA, from their Web site at https://www.esrl.noaa.gov/psd/

In order to download the maximum daily temperature in degrees celsius (tmax) datasets (in order for workflow to complete), please follow the link below:

Index of ftp://ftp.cdc.noaa.gov/Datasets/cpc_global_temp/

Store these netCDF files in your main working directory

CPC Global Unified Precipitation (inches) data provided by the NOAA/OAR/ESRL PSD, Boulder, Colorado, USA, from their Web site at https://www.esrl.noaa.gov/psd/

In order to download the daily precipitation datasets (in order for workflow to complete), please follow the link below: ftp://ftp.cdc.noaa.gov/Datasets/cpc_global_precip/

Store these netCDF files in your main working directory

library(tidyverse)
library(lubridate)
library(sf)
library(tmap)

# Create a Spatial reference system
project_crs <- 4326


# rename main data set
vector_abun <- vecdyn_mcm_2012

# Create a year field from the dates, we will use this later for the climate extraction
vector_abun$year <- format(as.Date(vector_abun$`sample_start_date`), "%Y")

# conver long / lats to an sf spatial object
vector_abun = st_as_sf(vector_abun, coords = c("sample_long_dd", "sample_lat_dd"), crs = project_crs,  remove = FALSE)

# Visualize trap locations 

# get unique trap locations
traps <- dplyr::distinct(vector_abun, sample_lat_dd, sample_long_dd, geometry)

# Obtain USA state map from maps package
usa = st_as_sf(maps::map("state", fill = TRUE, plot = FALSE))
usa = st_set_crs(usa, project_crs)

# Obtain Florida from the USA map
state = subset(usa, grepl("florida", usa$ID))
usa <- st_buffer(usa, 0)

# Plot locations 
tm_shape(usa) +
  tm_polygons() + 
  tm_shape(traps) +
  tm_dots()

tm_shape(state) +
  tm_polygons() + 
  tm_shape(traps) +
  tm_dots()



###################################################################################################################
#EXTRACTION TMAX - Note that the CPC climate data spans from 1979 to 2019,remove observations outside of this range
###################################################################################################################
#Load packages
library(RNetCDF)
library(raster)
library(rgdal)
library(sf)
library(sp)

# create a vector of years in the data set (we use this to select tmax files)
years <- st_set_geometry(vector_abun, NULL)
years <- dplyr::distinct(years, year)
years <- as.vector(years$year)

# create start message
print (paste(Sys.time(),"start"))

# count the number of rows in the traps dataframe so we can create an empty df
df_count <- NROW(traps)

# Create a new dataframe with the same number of rows as there are traps 
tmaxAll <- as.data.frame(matrix(0, ncol = 0, nrow = df_count))

# Copy the geometry to the dataframe
tmaxAll$sample_lat_dd <- traps$sample_lat_dd
tmaxAll$sample_long_dd <- traps$sample_long_dd

# loop through each year in the years vector and look for matching file names in the Wd
for (y in 1:length(years))
{
    # create file pattern
    pat <-paste("tmax.",years[y], sep="")

    year <- years[y]

    #list the files matching the pattern
    listfiles <- list.files(pattern = pat)

    # create a loop that runs through every matching file in the wb
    for(f in listfiles)
    {

    # create a raster brick, each file in the brick will represent a day of temperature sampling
      tmax = brick(f)
      # rotate the raster brick to convert to conventional -180 to 180 sample_long_dd
      tmax <- rotate(tmax)
      # replace -999 values with NA 
      tmax <- reclassify(tmax, cbind(-999, NA))
      # create a day variable from the brick from each layer, representing each day in the year 
      days <- nlayers(tmax) 
      # ensure the Coordinate Reference System in the temp data and trap sites are matching
      shp = st_transform(traps, crs(tmax))
      #Extract data from each layer in the brick
      for (i in 1:nlayers(tmax))
        {
        print (paste(Sys.time(),"extracting-", "dateset:", f, ",year",year, ",Day:",i))
        #print(c(i, f))
        tmaxEx <- raster::extract(tmax[[i]], 
                as_Spatial(shp),
                method = 'bilinear', 
                fun = mean,
                na.rm = T)
        # create a dataframe out of the extracted data 
        tmaxexdf <- as.data.frame(tmaxEx)
        # name the column name by year
        colnames(tmaxexdf) <- paste(year, i, sep = "-")
        # bind newly created dataframe to the tmaxall, each loop in the cycle will create a new column
        tmaxAll <- cbind(tmaxAll,tmaxexdf)

      }
    }}

# convert the dataset to the correct format 
max_temp <- tmaxAll %>% gather(date, max_temp, -sample_lat_dd, -sample_long_dd)

# Now we have temperature for every day of each trap location

print (paste(Sys.time(),"finished Run tmax"))

# if you are dealing with a big dataset it would be a good idea to save this as a csv and re import it later
#write.csv(max_temp, file = "max_temp.csv", row.names=FALSE)


rm(list=setdiff(ls(), c("vector_abun", "project_crs", "vecdyn_mcm_2012", "max_temp")))
################################################################################
#EXTRACTION Precip - same steps as above
################################################################################
#Load packages
library(RNetCDF)
library(raster)
library(rgdal)
library(sf)
library(sp)

project_crs <- "+proj=longlat +WGS84 (EPSG: 4326) +init=epsg:4326 +proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0"

# create a vector of years in the data set (we use this to select tmax files)
years <- st_set_geometry(vector_abun, NULL)
years <- dplyr::distinct(years, year)
years <- as.vector(years$year)

# get unique trap locations
traps <- dplyr::distinct(vector_abun, sample_lat_dd, sample_long_dd, geometry)


print (paste(Sys.time(),"start"))

df_count <- NROW(traps)

precAll <- as.data.frame(matrix(0, ncol = 0, nrow = df_count))

precAll$sample_lat_dd <- traps$sample_lat_dd

precAll$sample_long_dd <- traps$sample_long_dd

for (y in 1:length(years))
{
  pat<-paste("precip.",years[y], sep="")
  year <- years[y]
  listfiles <- list.files(pattern = pat)
  for(f in listfiles)
  {
    precip = brick(f)
    precip <- rotate(precip)
    precip <- reclassify(precip, cbind(-999, NA))
    days <- nlayers(precip) 
    shp = st_transform(traps, crs(precip))
    for (i in 1:nlayers(precip))
    {
      print (paste(Sys.time(),"extracting-", "dateset:", f, ",year",year, ",Day:",i))
      precipEx <- raster::extract(precip[[i]], 
                                  as_Spatial(shp),
                                  method = 'bilinear',
                                  fun = mean,
                                  na.rm = T)
      precipexdf <- as.data.frame(precipEx)
      colnames(precipexdf) <- paste(year, i, sep = "-")
      precAll <- cbind(precAll,precipexdf)
    }
  }}
precip <- precAll %>% gather(date, precip, -sample_lat_dd, -sample_long_dd)

print (paste(Sys.time(),"finished Run precip"))


rm(list=setdiff(ls(), c("vector_abun", "project_crs", "vecdyn_mcm_2012", "max_temp", "precip", "traps")))
######################################
#Join the climate data to main data set
######################################


# join climate datasets together by lat / long and date

clim_df <- dplyr::inner_join(max_temp, precip, by = c("sample_lat_dd", "sample_long_dd", "date"))

#convert year-Day of the year into standard date format
clim_df$date <- as.Date(clim_df$date, format="%Y-%j")


For the sake of this tutorial we will simplify the abundance data set - consider now that we have two data sets, we need to join them together. This could be done it in different ways depending on what analysis we want to achieve i.e. a standard linear regression or time series analysis.

For a linear regression we can join the matching mosquito abundance observations to the matching climate observations. We could also take average of all the observations by weeks, since the mosquito traps in this data set are only collected once a week.


# convert dates to epiweeks in the climate data frame and average values by epi week

clim_df_averages <-
clim_df %>%
  dplyr::mutate(epi_week = epiweek(date)) %>%
  dplyr::select(sample_lat_dd, sample_long_dd, epi_week, max_temp, precip) %>%
  dplyr::group_by(sample_lat_dd, sample_long_dd, epi_week) %>%
  dplyr::summarise(max_temp = mean(max_temp), precip = mean(precip))


vector_abun_clim_averages <-
  vector_abun %>%
  dplyr::mutate(epi_week = epiweek(sample_end_date)) %>%
  dplyr::group_by(sample_lat_dd, sample_long_dd, epi_week, taxon, sample_sex) %>%
  dplyr::summarise(sample_value = mean(sample_value)) %>%
  dplyr::left_join(clim_df_epi_week_averages, by = c("sample_lat_dd", "sample_long_dd", "epi_week"))  


# Plot results

ggplot(vector_abun_clim_averages, aes(x=epi_week, y=`sample_value`)) +
  geom_line()


ggplot(vector_abun_clim_averages, aes(x=epi_week, y=max_temp)) +
  geom_line()

ggplot(vector_abun_clim_averages, aes(x=epi_week, y=precip)) +
  geom_line()