We can use rsinaica to find out which city is the most PM2.5-polluted in all of Mexico.

First, we load the packages:

## Auto-install required R packages
packs <- c("dplyr", "ggplot2", "gghighlight", "lubridate", "anomalize", 
           "aire.zmvm", "tidyr", "zoo", "plotly", "rsinaica")
success <- suppressWarnings(sapply(packs, require, character.only = TRUE))
if (any(!success)) {
  install.packages(names(success)[!success])
  sapply(names(success)[!success], require, character.only = TRUE)
}

Download

Then we download the data for the whole year of 2017 using the sinaica_param_data function. Since the maximum date range we can download is one month, we have to use a little mapply magic to stitch together the entire year.

# Download all PM2.5 pollution data for 2017
pm25_2017 <- bind_rows(
  mapply(sinaica_param_data,
         "PM2.5",
         seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"),
         seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1,
         SIMPLIFY = FALSE)
)
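To make the "mapply magic" explicit, here is what those two seq() calls generate: twelve month-start dates paired with the corresponding month-end dates. This is purely illustrative; the starts and ends names are introduced only for this sketch and don't appear in the analysis.

## The month boundaries that mapply() pairs up (first three shown)
starts <- seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month")
ends   <- seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1
head(data.frame(start = starts, end = ends), 3)
#>        start        end
#> 1 2017-01-01 2017-01-31
#> 2 2017-02-01 2017-02-28
#> 3 2017-03-01 2017-03-31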
       

I thought there would be a few cities that collected PM2.5 data manually (they collect it through a filter and send it to an external lab, sometimes in another country, to be weighed). But no air quality station collected manual PM2.5 data in 2017.

# Same query as above, but requesting manually collected ("Manual") measurements
bind_rows(
  mapply(sinaica_param_data,
         "PM2.5",
         seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"),
         seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1,
         "Manual",
         SIMPLIFY = FALSE)
)
#>  [1] id               station_id       station_name     station_code    
#>  [5] network_name     network_code     network_id       date            
#>  [9] hour             parameter        value_actual     valid_actual    
#> [13] validation_level unit             value           
#> <0 rows> (or 0-length row.names)

This is what the data looks like:

knitr::kable(head(pm25_2017))
|id               |station_id |station_name |station_code |network_name |network_code |network_id |date       |hour |parameter |value_original |flag_original |valid_original |value_actual |valid_actual |date_validated |validation_level |unit  |value  |
|:----------------|:----------|:------------|:------------|:------------|:------------|----------:|:----------|----:|:---------|--------------:|-------------:|--------------:|------------:|------------:|:--------------|----------------:|:-----|------:|
|103PM2.517010101 |103        |Las Águilas  |AGU          |Guadalajara  |GDL          |         63 |2017-01-01 |   1 |PM2.5     |         20.668 |           128 |              1 |      20.668 |            1 |NA             |                0 |µg/m³ |20.668 |
|103PM2.517010102 |103        |Las Águilas  |AGU          |Guadalajara  |GDL          |         63 |2017-01-01 |   2 |PM2.5     |         30.338 |           128 |              1 |      30.338 |            1 |NA             |                0 |µg/m³ |30.338 |
|103PM2.517010103 |103        |Las Águilas  |AGU          |Guadalajara  |GDL          |         63 |2017-01-01 |   3 |PM2.5     |         29.509 |           128 |              1 |      29.509 |            1 |NA             |                0 |µg/m³ |29.509 |
|103PM2.517010104 |103        |Las Águilas  |AGU          |Guadalajara  |GDL          |         63 |2017-01-01 |   4 |PM2.5     |         27.284 |           128 |              1 |      27.284 |            1 |NA             |                0 |µg/m³ |27.284 |
|103PM2.517010105 |103        |Las Águilas  |AGU          |Guadalajara  |GDL          |         63 |2017-01-01 |   5 |PM2.5     |         26.677 |           128 |              1 |      26.677 |            1 |NA             |                0 |µg/m³ |26.677 |
|103PM2.517010106 |103        |Las Águilas  |AGU          |Guadalajara  |GDL          |         63 |2017-01-01 |   6 |PM2.5     |         27.639 |           128 |              1 |      27.639 |            1 |NA             |                0 |µg/m³ |27.639 |

Cleanup

Once we’ve downloaded the data, we filter out values below 1 µg/m³, since they’re probably calibration errors, and we only include networks that reported on more than 80% of the days of the year (292 days). We also have to take into account that PM2.5 is measured as a 24-hour average.

# pm25_2017[which(pm25_2017$value_actual != pm25_2017$value_original),]
# pm25_2017[which(!is.na(pm25_2017$date_validated)),]

## Keep only networks that reported at least 47 weeks of the year
df_filtered <- pm25_2017 %>%
  #filter(value < 250) %>%
  # values below 1 µg/m³ are probably calibration errors
  mutate(value = if_else(value < 1, NA_real_, value)) %>%
  group_by(network_name) %>%
  filter(!is.na(value)) %>%
  mutate(nweeks = n_distinct(week(date))) %>%
  filter(nweeks >= 47) %>%
  select(-nweeks) %>%
  ungroup()
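To see which networks the 47-week cutoff drops, a quick tally of reporting weeks works; this is only an illustrative check, not part of the pipeline:

## Illustrative: weeks of valid PM2.5 data reported by each network in 2017
pm25_2017 %>%
  filter(!is.na(value), value >= 1) %>%
  group_by(network_name) %>%
  summarise(nweeks = n_distinct(week(as.Date(date)))) %>%
  arrange(nweeks)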



df_max <- df_filtered %>%
  # make sure every station has a row for every hour of every day of 2017
  complete(station_id,
           hour = 0:23,
           date = as.character(seq(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))) %>%
  group_by(station_id, network_name) %>%
  arrange(station_id, date, hour) %>%
  # right-aligned 24-hour rolling average per station (windows shorter than 24
  # hours are allowed at the start of each series, as long as they cover at
  # least 18 hours)
  mutate(roll24 = rollapply(value, 24, mean, na.rm = TRUE, partial = 18, 
                            fill = NA, align = "right")) %>%
  ungroup() %>%
  #summarise(mean = mean(value, na.rm = TRUE)) %>%
  # highest 24-hour average reported within each network on each day
  group_by(date, network_name) %>%
  summarise(max = max(roll24, na.rm = TRUE)) %>%
  ungroup() %>%
  add_count(network_name) %>%
  ## Only include networks that reported on more than 80% of days (292)
  filter(n >= (365 * .8)) %>%
  select(-n) %>%
  filter(is.finite(max)) %>%
  arrange(date)
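As an aside, here is a minimal sketch of how that rollapply() call behaves, on a made-up vector x (not real measurements): with align = "right" each value is the mean of the current hour and the 23 preceding ones, na.rm = TRUE ignores missing hours inside the window, and the numeric partial = 18 only lets the shorter windows at the start of the series be evaluated once at least 18 positions fall inside them.

## Toy illustration of the 24-hour rolling mean used above (x is made up)
library(zoo)
x <- c(NA, NA, 10, 12, 8, rep(20, 30))
rollapply(x, 24, mean, na.rm = TRUE, partial = 18, fill = NA, align = "right")
## positions 1-17 are NA (fewer than 18 positions fall inside the window);
## later positions average whatever non-missing hours the window contains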

When plotting the daily maximums of the 24-hour rolling average, we can see that there are still some obvious errors in the data.

ggplot(df_max, aes(as.Date(date), max)) +
  geom_line(size = .3, color = "black") +
  facet_wrap(~ network_name, ncol = 3) +
  ggtitle(expression(paste("Maximum daily ", PM[2.5], " concentration by network"))) +
  xlab("date") +
  ylab(expression(paste("daily maximum 24 average of ", PM[2.5], 
                        " (", mu,"g/", m^3, ")"))) +
  theme_bw() +
  theme(axis.text.x=element_text(angle=60,hjust=1))

It looks like we can safely remove values above 200 µg/m³ and get rid of the Aguascalientes, Irapuato and Monclova networks.

Anomalies

We can also use the anomalize package to detect extreme values, but figuring out whether they are actually errors is trickier, since fires can temporarily spike PM2.5 levels, as often happens during the winter holidays when people burn trash and tires and set off fireworks. I’ve opted not to remove them because of the spikes around New Year’s. Since I’m interested in the average for the whole year, these PM2.5 outliers are unlikely to have a substantial effect on the rankings.

df_max <- filter(df_max, !network_name %in% 
                   c("Irapuato", "Monclova", "Aguascalientes")) %>%
  filter(max <= 200)

df_max <- df_max %>%
  ungroup() %>%
  group_by(network_name) %>%
  mutate(date = as.Date(date)) %>%
  time_decompose(max, method = "stl") %>%                 # STL seasonal-trend decomposition
  anomalize(remainder, method = "iqr", alpha = 0.01) %>%  # flag outliers in the remainder
  time_recompose()


# Anomaly Visualization
df_max %>% plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + GESD Methods") 


## Do not remove outliers; if you look at the chart, some values around New Year's
## are detected as spikes, but that's people setting off fireworks and burning trash
## df_max <- filter(df_max, anomaly != "Yes")
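If you want to eyeball what got flagged before making that call, the columns added by anomalize() make it easy. A quick illustrative query, not part of the original analysis:

## List the flagged observations, largest first, to judge whether they look
## like fireworks/trash-burning spikes rather than instrument errors
df_max %>%
  filter(anomaly == "Yes") %>%
  arrange(desc(observed)) %>%
  select(network_name, date, observed)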

I decided to remove Irapuato from the rankings and to keep Tepic, since its PM2.5 values can spike because of fires. Normally I would have removed Mexicali, since it didn’t report any correct data during the winter season, but its PM2.5 levels were so high that I decided to keep it.

Most PM2.5-Polluted City

And here is the most PM2.5-polluted city in Mexico: Toluca.


gghighlight_line(df_max, aes(as.Date(date), observed, 
                              group = network_name, color = network_name),
                 mean(observed, na.rm = TRUE), max_highlight = 1) +
  theme_bw() +
  ggtitle(expression(paste("Pollution measuring network with the highest ", 
                           PM[2.5], " pollution values in 2017")),
          subtitle = "Based on the mean of the highest 24-hour rolling average daily maximums. Source: SINAICA") +
  xlab("date") +
  ylab(expression(paste(PM[2.5], " ", mu,"g/", m^3)))

Top polluted cities

knitr::kable(
  df_max %>%
    group_by(network_name) %>%
    summarise(mean = mean(observed, na.rm = TRUE)) %>%
    arrange(-mean) %>%
    head(10)
  )
|network_name    |     mean|
|:---------------|--------:|
|Toluca          | 52.83158|
|Guadalajara     | 52.38417|
|Cuernavaca      | 43.96163|
|Mexicali        | 43.27889|
|Pachuca         | 42.38443|
|Tijuana         | 39.49127|
|Valle de México | 38.68967|
|Morelia         | 31.08508|
|Durango         | 30.19801|
|Monterrey       | 28.77041|

These are the top 3 most PM2.5-polluted cities in Mexico. Note that Mexicali didn’t report any data during the winter, which is the season with the highest levels of PM2.5 pollution; it could probably have ranked higher if its operators had kept their instruments working.

top3 <- filter(df_max, network_name %in% c("Toluca",
                                           "Cuernavaca",
                                           "Guadalajara"))[, 1:3]

top3 <- spread(top3, network_name, observed)
top3$date <- as.Date(top3$date)
#top3$Toluca <- na.spline(top3$Toluca)
#top3$Guadalajara <- na.spline(top3$Guadalajara)
#top3$Mexicali <- na.spline(top3$Mexicali)
x <- list(
  title = "date"
)
y <- list(
  title = "daily maximum 24 average of PM2.5"
)

plot_ly(as.data.frame(top3), x = ~date, y = ~Toluca, name = "Toluca",
        type = 'scatter', mode = 'lines', line = list(color = '#e41a1c', width = .5)) %>%
  add_trace(y = ~Guadalajara, name = 'Guadalajara', mode = 'lines', line = list(color = '#377eb8', width = .5)) %>%
  add_trace(y = ~Cuernavaca, name = 'Cuernavaca', mode = 'lines', line = list(color = '#4daf4a', width = .5)) %>%
  layout(title = "Top 3 most PM2.5-polluted cities in Mexico", xaxis = x, yaxis = y)

Days with bad air quality

Number of days with bad air quality (IMECA index category MALO):

df_max %>%
  group_by(network_name) %>%
  # 24-hour averages above 45.05 µg/m³ fall in the IMECA MALO band
  filter(observed > 45.05) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head() %>%
  knitr::kable()
|network_name    | count|
|:---------------|-----:|
|Toluca          |   224|
|Guadalajara     |   168|
|Pachuca         |   153|
|Cuernavaca      |   113|
|Mexicali        |   103|
|Valle de México |   103|

Number of days with very bad air quality (IMECA index category MUY MALO):

df_max %>%
  group_by(network_name) %>%
  filter(observed > 97.45) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head() %>%
  knitr::kable()
|network_name | count|
|:------------|-----:|
|Guadalajara  |    31|
|Cuernavaca   |    30|
|Tijuana      |    16|
|Durango      |    12|
|Tepeapulco   |    12|
|Saltillo     |    11|
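As a final cross-check, both thresholds can be tabulated in one pass with cut(). Unlike the two counts above (where a MUY MALO day also satisfies observed > 45.05), each day falls in exactly one band here. This is just a sketch using the same 45.05 and 97.45 cutoffs; the label for everything below 45.05 is my own shorthand.

## Sketch: days per IMECA band for every network, using the cutoffs above
df_max %>%
  ungroup() %>%
  mutate(band = cut(observed,
                    breaks = c(-Inf, 45.05, 97.45, Inf),
                    labels = c("REGULAR or better", "MALO", "MUY MALO"))) %>%
  count(network_name, band) %>%
  spread(band, n, fill = 0) %>%
  knitr::kable()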