We can use rsinaica to find out which city is the most PM10-polluted in all of Mexico.

First, we load the packages:

## Auto-install required R packages
packs <- c("dplyr", "ggplot2", "gghighlight", "lubridate", "anomalize", 
           "aire.zmvm", "tidyr", "zoo", "plotly", "rsinaica")
success <- suppressWarnings(sapply(packs, require, character.only = TRUE))
if (length(names(success)[!success])) {
  install.packages(names(success)[!success])
  sapply(names(success)[!success], require, character.only = TRUE)
}

Download

Then we download the data for the whole year of 2017 using the sinaica_param_data function. Since the maximum data range we can download is 1 month, we have to use a little mapply magic to download the entire year.

# Download all PM10 pollution data in 2017
pm10_2017 <- bind_rows(
  mapply(sinaica_param_data,
         "PM10",
         seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"),
         seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1,
         SIMPLIFY = FALSE)
)

There a few cities that collect PM10 data manually (they collect it through a filter and send it to be weighted to an external lab, sometimes in another country). Let’s also download that data:

# Download all manually collected PM10 pollution data in 2017
pm10_2017m <- bind_rows(
  mapply(sinaica_param_data,
         "PM10",
         seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"),
         seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1,
         "Manual",
         SIMPLIFY = FALSE)
)
pm10_2017 <-  bind_rows(pm10_2017, pm10_2017m)       

This is what the data looks like:

knitr::kable(head(pm10_2017))
id station_id station_name station_code network_name network_code network_id date hour parameter value_original flag_original valid_original value_actual valid_actual date_validated validation_level unit value
101PM1017010100 101 Atemajac ATM Guadalajara GDL 63 2017-01-01 0 PM10 45.619 128 1 45.619 1 NA 0 µg/m³ 45.619
101PM1017010101 101 Atemajac ATM Guadalajara GDL 63 2017-01-01 1 PM10 49.651 128 1 49.651 1 NA 0 µg/m³ 49.651
101PM1017010102 101 Atemajac ATM Guadalajara GDL 63 2017-01-01 2 PM10 76.969 128 1 76.969 1 NA 0 µg/m³ 76.969
101PM1017010103 101 Atemajac ATM Guadalajara GDL 63 2017-01-01 3 PM10 84.916 128 1 84.916 1 NA 0 µg/m³ 84.916
101PM1017010104 101 Atemajac ATM Guadalajara GDL 63 2017-01-01 4 PM10 50.359 128 1 50.359 1 NA 0 µg/m³ 50.359
101PM1017010105 101 Atemajac ATM Guadalajara GDL 63 2017-01-01 5 PM10 53.143 128 1 53.143 1 NA 0 µg/m³ 53.143

Cleanup

Once we’ve downloaded the data we filter values below 1 µg/m³ since they’re probably calibration errors. And we only include stations that reported for more than 80% of days (292). We also have to take into account that PM10 data is measured as a 24 hour rolling average.

# pm10_2017[which(pm10_2017$value_actual != pm10_2017$value_original),]
# pm10_2017[which(!is.na(pm10_2017$date_validated)),]

## filter stations that didn't report at least 47 weeks of the year
df_filtered <- pm10_2017 %>%
  mutate(value = if_else(value < 1, NA_real_, value)) %>%
  group_by(network_name) %>%
  filter(!is.na(value)) %>%
  mutate(nweeks = n_distinct(week(date))) %>%
  filter(nweeks >= 47) %>%
  select(-nweeks) %>%
  ungroup()



df_max <- df_filtered %>%
  complete(station_id,
           hour = 0:23,
           date = as.character(seq(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))) %>%
  group_by(station_id, network_name) %>%
  arrange(station_id, date, hour) %>%
  mutate(roll24 = rollapply(value, 24, mean, na.rm = TRUE, partial = 18, 
                            fill = NA, align = "right")) %>%
  ungroup() %>%
  #summarise(mean = mean(value, na.rm = TRUE)) %>%
  group_by(date, network_name) %>%
  summarise(max = max(roll24, na.rm = TRUE)) %>%
  ungroup() %>%
  add_count(network_name) %>%
  ## Only include stations that reported for more than 80% of days (292)
  filter(n >= (365*.8))  %>%
  select(-n) %>%
  filter(is.finite(max)) %>%
  arrange(date)

When plotting the daily 24 hour average maximums we can see that there are still some obvious errors in the data.

ggplot(df_max, aes(as.Date(date), max)) +
  geom_line(size = .3, color = "black") +
  facet_wrap(~ network_name, ncol = 3) +
  ggtitle(expression(paste("Maximum daily ", 
                           PM[10], " values in 2017"))) +
  xlab("date") +
  ylab(expression(paste("daily maximum 24 average of ", PM[10], 
                        " (", mu,"g/", m^3, ")"))) +
  theme_bw() +
  theme(axis.text.x=element_text(angle=60,hjust=1))

It looks like we can safely remove values above 500 µg/m³ and get rid of the Aguascalientes and Monclova networks.

Anomalies

We can also use the anomalize package to detect extreme values, but actually figuring out if they are errors is a little bit more tricky since fires can temporarily spike PM10 levels as often happens during the winter holidays when people burn trash and tires, and set off fireworks. I’ve opted to remove them, since I’m interested in the average of the whole year these PM10 outliers are unlikely to have a substantial effect on the rankings.

df_max <- df_max %>%
  filter(!network_name %in% c("Aguascalientes", "Monclova")) %>%
  filter(max <= 500) %>%
  ungroup() %>%
  group_by(network_name) %>%
  mutate(date = as.Date(date)) %>%
  time_decompose(max, method = "stl") %>%
  anomalize(remainder, method = "iqr", alpha = 0.015) %>%
  time_recompose()


## Anomaly Visualization
df_max %>% plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + GESD Methods") 


df_max <- filter(df_max, anomaly != "Yes")

Most PM10-Polluted City

And here is the most PM10-polluted city in Mexico: Monterrey

gghighlight_line(df_max, aes(as.Date(date), observed, 
                              group = network_name, color = network_name),
                 mean(observed, na.rm = TRUE), max_highlight = 1) +
  theme_bw() +
  ggtitle(expression(paste("Pollution measuring network with highest ", 
                           PM[10], " pollution values in 2017")),
          subtitle = "Based on the mean of the highest 24-hour rolling average daily maximums. Source: SINAICA") +
  xlab("date") +
  ylab(expression(paste(PM[10], " ", mu,"g/", m^3)))

Top polluted cities

knitr::kable(
  df_max %>%
    group_by(network_name) %>%
    summarise(mean = mean(observed, na.rm = TRUE)) %>%
    arrange(-mean) %>%
    head(10)
  )
network_name mean
Monterrey 115.98512
Guadalajara 109.54051
Torreón 95.49052
Durango 94.05087
Toluca 93.88375
Valle de México 89.24066
León 86.42133
Irapuato 81.83166
San Luis Potosí Estatal 79.91457
Silao 75.39389

The top 3 most PM10-polluted cities in Mexico:

top3 <- filter(df_max, network_name %in% c("Torreón",
                                           "Guadalajara",
                                           "Monterrey"))[, 1:3]

top3 <- spread(top3, network_name, observed)
top3$date <- as.Date(as.Date(top3$date))
x <- list(
  title = "date"
)
y <- list(
  title = "daily maximum 24 average of PM10"
)

plot_ly(as.data.frame(top3), x = ~date, y = ~Monterrey, name = "Monterrey" ,
        type = 'scatter', mode = 'lines', line = list(color = '#e41a1c'), width = .5) %>%
  add_trace(y = ~Guadalajara, name = 'Guadalajara', mode = 'lines', line = list(color = '#377eb8'), width = .5) %>%
  add_trace(y = ~Torreón, name = 'Torreón', mode = 'lines', line = list(color = '#4daf4a'), width = .5) %>%
  layout(title = "Top 3 most PM10-polluted cities in Mexico", xaxis = x, yaxis = y)

Days with bad air quality

Number of days with bad air quality (Índice IMECA MALO)

df_max %>%
  group_by(network_name) %>%
  filter(observed > 75.5) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head() %>%
  knitr::kable()
network_name count
Monterrey 295
Guadalajara 258
Valle de México 212
Toluca 211
Durango 208
León 205

Number of days with very bad air quality (Índice IMECA MUY MALO).

df_max %>%
  group_by(network_name) %>%
  filter(observed > 215.5) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head() %>%
  knitr::kable()
network_name count
Monterrey 19
Guadalajara 16
Durango 10
San Luis Potosí Estatal 7
Silao 4
Valle de México 4