Load packages
## Auto-install required R packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(aire.zmvm, dplyr, ggplot2, lubridate, readr, stringr, mgcv)Download the data
df <- download_deposition(deposition = "HUMEDO", type = "CONCENTRACION") %>%
filter(pollutant == "PP")
# When does the SIMAT start recording rainfall each year?
knitr::kable(df %>%
group_by(year(date)) %>%
summarise(min = min(date)))| year(date) | min |
|---|---|
| 1997 | 1997-05-05 |
| 1998 | 1998-05-04 |
| 1999 | 1999-02-22 |
| 2000 | 2000-02-14 |
| 2001 | 2001-01-08 |
| 2002 | 2002-02-04 |
| 2003 | 2003-05-05 |
| 2004 | 2004-05-03 |
| 2005 | 2005-05-09 |
| 2006 | 2006-05-08 |
| 2007 | 2007-04-30 |
| 2008 | 2008-05-05 |
| 2009 | 2009-05-11 |
| 2010 | 2010-05-03 |
| 2011 | 2011-05-02 |
| 2012 | 2012-05-07 |
| 2013 | 2013-04-29 |
| 2014 | 2014-04-28 |
| 2015 | 2015-05-04 |
| 2016 | 2016-05-02 |
| 2017 | 2017-05-08 |
| 2018 | 2018-04-23 |
| 2019 | 2019-04-22 |
| 2020 | 2020-06-16 |
| 2021 | 2021-04-26 |
| 2022 | 2022-04-19 |
| 2023 | 2023-04-17 |
Model with a GAM
# rainfall starts being measured around the beginning of May 2003
df <- df %>% filter(year(df$date) >= 2003 & year(df$date) <= 2020)
df$week <- week(df$date)
df$year <- year(df$date)
# no measures of rainfall outside the rainy season, so set them to zero
df <- full_join(df,
data.frame(
year = year(seq(as.Date("2003-01-01"),
as.Date("2016-12-31"),
by = "week")),
week = week(seq(as.Date("2003-01-01"),
as.Date("2016-12-31"),
by = "week"))
),
by = c("year", "week")) %>%
arrange(year, week)
df$value[is.na(df$value)] <- 0
df <- df[!is.na(df$date), ]
df$date <- as.Date(str_c(df$year, "-", df$week,"-",1), format = "%Y-%U-%u")
#fit <- gam(value ~ te(as.numeric(date), week, bs = c("tp", "cc"), k = c(10, 52)),
# data = df, family = nb())
#plot(fit, pers = TRUE)
fit2 <- gam(value ~ s(as.numeric(date)) + s(week), data = df, family = nb())
summary(fit2)
#>
#> Family: Negative Binomial(1.414)
#> Link function: log
#>
#> Formula:
#> value ~ s(as.numeric(date)) + s(week)
#>
#> Parametric coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 3.42410 0.01081 316.9 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Approximate significance of smooth terms:
#> edf Ref.df Chi.sq p-value
#> s(as.numeric(date)) 7.314 8.315 41.21 8.18e-06 ***
#> s(week) 8.318 8.861 1176.31 < 2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> R-sq.(adj) = 0.125 Deviance explained = 13.1%
#> -REML = 28169 Scale est. = 1 n = 6359
df$pred <- predict(fit2, newdata = df, type = "response")Plots
ggplot(df, aes(date, value)) +
geom_point(alpha = .05) +
geom_smooth(method = 'gam', formula = y ~ s(x, k = 53),
method.args= list(family = nb())) +
ggtitle("Weekly rainfall in Mexico City (2003-2016)") +
ylab("mm") +
theme_bw()
ggplot(df, aes(week, value, group = year, color = year)) +
geom_point(alpha = .05) +
ggtitle("Average rainfall in Mexico City",
subtitle = "Based on a GAM model\nSource: SIMAT") +
ylab("rainfall in mm") +
scale_x_continuous(breaks = c(1, week("2016-06-01"),
week("2016-09-01"), week("2016-12-01")),
labels = c("Jan", "Jun", "Aug", "Dec")) +
scale_color_continuous(low="#dddddd", high = "black") +
theme_bw()
ggplot(df, aes(week, pred, group = year, color = year)) +
geom_line() +
ggtitle("Modeled weekly rainfall in Mexico City (2003-2016)",
subtitle = "Based on a GAM model\nSource: SIMAT") +
ylab("rainfall in mm") +
scale_x_continuous(breaks = c(1, week("2016-06-01"),
week("2016-09-01"), week("2016-12-01")),
labels = c("Jan", "Jun", "Aug", "Dec")) +
scale_color_continuous(low="#dddddd", high = "black") +
theme_bw()