Tidy Raw Files

p <- "D:/projects/soilTemperatureMonitoring/data/rawTxtFilesClean"
setwd(p)

# get file names of HOBO temp data
files <- list.files()

# read files
l <- lapply(files, function(x) {
  
  fileName = strsplit(x, '[.]')[[1]][1]
  siteid   = strsplit(x, '_')[[1]][1]
  
  cat(paste("working on", fileName, "\n"))
  f = paste0(p, "/", x)
  f = read.table(file = f, header=TRUE, sep="\t", stringsAsFactors = FALSE)
  f$siteid <- siteid
  names(f)[1:3] <- c("date","tempF","tempC")
  
  f$tempF <-as.numeric(f$tempF)
  f$tempC <-as.numeric(f$tempC)
  
  vars = c("date", "siteid", "tempF", "tempC")
  f    = f[vars]
  })

mastSeries_df <- do.call("rbind", l)


# save cached copy
save(mastSeries_df, file = "D:/projects/soilTemperatureMonitoring/data/R/mastSeries.Rdata")
# load cached versions
load(file = "mastSeries.Rdata")


# Plot sites visually inspect for flat lines and spikes
test <- subset(mastSeries_df, site == "JTNP08")
test.zoo <- read.zoo(test[,c(1,3)],format = "%m/%d/%y %H:%M:%S", tz = "GMT")
plot(test.zoo, ylab = "tempF")


# Aggregate by Year, Month, and Julian day (i.e. 1-365, 366 for leap years)
ms.df <- mastSeries_df
ms.df$date <- as.POSIXlt(ms.df$date, format="%m/%d/%y %H:%M:%S")
ms.df$day  <- as.character(format(ms.df$date, "%m/%d/%y"))
ms.df$Jday <- as.integer(format(ms.df$date, "%j"))

# compute number of days per site
ms.D.df <- aggregate(tempF ~ site + day, data = ms.df, FUN = mean, na.action = na.exclude)
ms.D.df <- aggregate(day ~ site, data = ms.D.df, function(x) sum(!is.na(x)))
names(ms.D.df) <- c("siteid","numDays")

# compute mast per year
ms.Jd.df <- aggregate(tempF ~ siteid + Jday, data = ms.df, mean)
mastSites.df <- aggregate(tempF ~ siteid, data = ms.Jd.df, mean)

# merge mast & numDays
mastSites.df <- merge(mastSites.df, ms.D.df, by = "siteid")
write.csv(mastSites.df, "mastSites.csv")

Exploratory Data Analysis

# Read tempC data
setwd("D:/projects/soilTemperatureMonitoring/data/R")

sites_df <- read.csv("HOBO_List_2013_0923_master.csv")
mast_df  <- read.csv("mastSites.csv")

mast_df <- merge(mast_df, sites_df, by = "siteid")
vars <- c("siteid", "tempF", "numDays", "utmeasting", "utmnorthing")
mast_df <- mast_df[vars]
mast_df$tempC <- (mast_df$tempF - 32) * (5 / 9)


# Read geodata
mast_sp <- mast_df
coordinates(mast_sp) <- ~ utmeasting + utmnorthing
proj4string(mast_sp)<- ("+init=epsg:26911")
mast_sp <- spTransform(mast_sp, CRS("+init=epsg:5070"))
folder <- "D:/geodata/project_data/R8-VIC/"
files <- c(elev   = "ned30m_8VIC_elev5.tif",
           solar  = "ned30m_8VIC_solarcv.tif",
           tc     = "landsat30m_8VIC_tc123.tif",
           precip = "prism30m_8VIC_ppt_1981_2010_annual_mm.tif",
           temp   = "prism30m_8VIC_tmean_1981_2010_annual_C.tif"
           )
geodata_f <- lapply(files, function(x) paste0(folder, x))
geodata_r <- stack(geodata_f)
data <- as.data.frame(extract(geodata_r, mast_sp, sp = TRUE))


# Summary of tempC data
ggplot(data, aes(sample = tempC)) +
  geom_qq() +
  geom_qq_line()

vars <- c("tempC", names(geodata_r))
GGally::ggpairs(data[, vars])

# Compare environmental representativeness of hobo locations 
geodata_s <- as.data.frame(sampleRegular(geodata_r, size = 5000))

geodata_s <- rbind(
  data.frame(source = "sample", data[names(geodata_r)]),
  data.frame(source = "population", geodata_s)
)

geodata_s <- reshape(geodata_s,
                     direction = "long",
                     timevar = "variable", times = names(geodata_r),
                     v.names = "value",  varying = names(geodata_r) 
                     )
ggplot(geodata_s, aes(x = value, fill = source)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~ variable, scales = "free") +
  ggtitle("Evaluation of Sample Representativeness")

Construct Linear Model

# Estimate tempC model
full <- lm(tempC ~ .,data = data, weights = numDays)
mast_lm <- lm(tempC ~ temp + solar + precip + tc_1, data = data, weights = numDays)

plot(data$tempC ~ predict(mast_lm),
     ylab = "Observed tempC",
     xlab = "Predicted tempC",
     main = "Observed vs. predicted tempC")
abline(0,1)

# Validate tempC model
# Create folds
folds <- createFolds(data$tempC, k = 10)
train <- data

# Cross validate
cv_results <- lapply(folds, function(x) {
  train   = train[-x,]
  test    = train[x,]
  model   = lm(tempC ~ temp + solar + precip + tc_1, weights = numDays, data = train)
  actual  = test$tempC
  predict = predict(model, test)
  RMSE    = sqrt(mean((actual - predict)^2, na.rm = TRUE))
  R2      = cor(actual, predict, use = "pairwise")^2
  return(c(RMSE = RMSE, R2 = R2))
  })

# Convert to a data.frame
cv_results <- do.call(rbind, cv_results)

# Summarize results
summary(cv_results)
##       RMSE              R2        
##  Min.   :0.5143   Min.   :0.1556  
##  1st Qu.:0.9488   1st Qu.:0.9237  
##  Median :1.0699   Median :0.9586  
##  Mean   :1.3106   Mean   :0.8727  
##  3rd Qu.:1.4927   3rd Qu.:0.9751  
##  Max.   :3.2340   Max.   :0.9949
# Predict tempC model
predfun <- function(model, data) {
  v <- predict(model, data, se.fit=TRUE)
}
mast.raster <- predict(geodata, mast.lm, fun=predfun, index=1:2, progress='text')
writeRaster(mast.raster[[1]],filename="I:/workspace/soilTemperatureMonitoring/R/mast.raster.new.tif",format="GTiff",datatype="INT1S",overwrite=T,NAflag=-127, options=c("COMPRESS=DEFLATE", "TILED=YES"), progress='text')
test <- raster(mast.raster,layer=2)
writeRaster(test,filename="I:/workspace/soilTemperatureMonitoring/R/mast.se.raster.new.tif",format="GTiff",datatype="INT1S",overwrite=T,NAflag=-127, options=c("COMPRESS=DEFLATE", "TILED=YES"), progress='text')