Getting in the Zone

library(ggplot2)
library(maps)
library(glue)
library(dplyr)
library(tidyr)
library(postal)
library(zipcode)

The postal::zips_zones_sample dataset provides a quarter of the ~4 million 3-digit origin-destination pairs. (If you want allll of them, you can run fetch_all and put on a pot of coffee.)

What if we wanted to use that to plot how zones increase as the destination gets farther from the origin?

We’ll need a measure of latitude and longitude for that. Luckily, data from the zipcode package relates every zip codes to its latitude and longitude. We can that along with the partial data in postal::zips_zones_sample to match up zips to latitudes and longitudes.

Let’s load in the randomly sampled postal package data.

data(zips_zones_sample)
zips_zones_sample
#> # A tibble: 1,000,000 x 6
#>    origin_zip dest_zip  zone specific_to_prior… same_ndc has_five_digit_ex…
#>    <chr>      <chr>    <int> <lgl>              <lgl>    <lgl>             
#>  1 003        <NA>        NA NA                 NA       NA                
#>  2 004        <NA>        NA NA                 NA       NA                
#>  3 005        012          2 FALSE              FALSE    FALSE             
#>  4 005        027          2 FALSE              FALSE    FALSE             
#>  5 005        028          2 FALSE              FALSE    FALSE             
#>  6 005        030          3 FALSE              FALSE    FALSE             
#>  7 005        042          3 FALSE              FALSE    FALSE             
#>  8 005        044          4 FALSE              FALSE    FALSE             
#>  9 005        051          3 FALSE              FALSE    FALSE             
#> 10 005        053          3 FALSE              FALSE    FALSE             
#> # … with 999,990 more rows

Later we’ll select away all the other details and focus just on origin_zip, dest_zip, and zone. As we know, the postal::zip_zones data displays all origin zips as 3 digit prefixes and most destination zips as 3 digits as well.

However, the zipcode::zipcode data displays zips in the usual way, as 5 digit.

data(zipcode)
zipcode %>% 
  as_tibble()
#> # A tibble: 44,336 x 5
#>    zip   city       state latitude longitude
#>    <chr> <chr>      <chr>    <dbl>     <dbl>
#>  1 00210 Portsmouth NH        43.0     -71.0
#>  2 00211 Portsmouth NH        43.0     -71.0
#>  3 00212 Portsmouth NH        43.0     -71.0
#>  4 00213 Portsmouth NH        43.0     -71.0
#>  5 00214 Portsmouth NH        43.0     -71.0
#>  6 00215 Portsmouth NH        43.0     -71.0
#>  7 00501 Holtsville NY        40.9     -72.6
#>  8 00544 Holtsville NY        40.9     -72.6
#>  9 00601 Adjuntas   PR        18.2     -66.7
#> 10 00602 Aguada     PR        18.4     -67.2
#> # … with 44,326 more rows

So we’ll want to trim them in order to be able to join on our data, the majority of which are 3 digits.

zips <- 
  zipcode %>% 
  as_tibble() %>% 
  mutate(
    zip_trim = substr(zip, 1, 3)
  )

zips
#> # A tibble: 44,336 x 6
#>    zip   city       state latitude longitude zip_trim
#>    <chr> <chr>      <chr>    <dbl>     <dbl> <chr>   
#>  1 00210 Portsmouth NH        43.0     -71.0 002     
#>  2 00211 Portsmouth NH        43.0     -71.0 002     
#>  3 00212 Portsmouth NH        43.0     -71.0 002     
#>  4 00213 Portsmouth NH        43.0     -71.0 002     
#>  5 00214 Portsmouth NH        43.0     -71.0 002     
#>  6 00215 Portsmouth NH        43.0     -71.0 002     
#>  7 00501 Holtsville NY        40.9     -72.6 005     
#>  8 00544 Holtsville NY        40.9     -72.6 005     
#>  9 00601 Adjuntas   PR        18.2     -66.7 006     
#> 10 00602 Aguada     PR        18.4     -67.2 006     
#> # … with 44,326 more rows

Let’s get a tibble of all possible USPS zips, both origin prefixes and destinations, including our 5 digit destinations.

(usps_zips <-
  tibble(
    zip =
      unique(zips_zones_sample$origin_zip) %>% 
      c(unique(zips_zones_sample$dest_zip)) 
  ) %>% 
  distinct())
#> # A tibble: 91,929 x 1
#>    zip  
#>    <chr>
#>  1 003  
#>  2 004  
#>  3 005  
#>  4 006  
#>  5 007  
#>  6 008  
#>  7 009  
#>  8 010  
#>  9 011  
#> 10 012  
#> # … with 91,919 more rows

Now we can join the zipcode trimmed zips on our usps_zips to get a corresponding lat and long for each 3-digit origin and destination zip.

(zips_lat_long <- 
  zips %>% 
  distinct(zip_trim, .keep_all = TRUE) %>% 
  left_join(usps_zips, by = c("zip_trim" = "zip")) %>% 
  select(zip_trim, latitude, longitude))
#> # A tibble: 955 x 3
#>    zip_trim latitude longitude
#>    <chr>       <dbl>     <dbl>
#>  1 002          43.0     -71.0
#>  2 005          40.9     -72.6
#>  3 006          18.2     -66.7
#>  4 007          18.2     -66.1
#>  5 008          18.3     -65.0
#>  6 009          18.5     -66.1
#>  7 010          42.1     -72.6
#>  8 011          42.2     -72.6
#>  9 012          42.5     -73.3
#> 10 013          42.6     -72.6
#> # … with 945 more rows

Now we have a mapping between each possible 3 digit zip and its latitude and longitude. But for every row in our zips_zones_sample dataset, we have two lats and two longs: one for the origin and one for the destination.

So we’ll want to take our zips_lat_long mapping and use that to attach latitude and longitude to each origin and each destination in zips_zones_sample.

(zips_zones_lat_long <- 
  zips_zones_sample %>% 
  select(origin_zip, dest_zip, zone) %>% 
  left_join(zips_lat_long, by = c("origin_zip" = "zip_trim")) %>% 
  rename(
      lat_origin = latitude,
      long_origin = longitude) %>% 
  left_join(zips_lat_long, by = c("dest_zip" = "zip_trim")) %>% 
  rename(
      lat_dest = latitude,
      long_dest = longitude) %>% 
  drop_na(zone))
#> # A tibble: 999,978 x 7
#>    origin_zip dest_zip  zone lat_origin long_origin lat_dest long_dest
#>    <chr>      <chr>    <int>      <dbl>       <dbl>    <dbl>     <dbl>
#>  1 005        012          2       40.9       -72.6     42.5     -73.3
#>  2 005        027          2       40.9       -72.6     41.8     -71.1
#>  3 005        028          2       40.9       -72.6     41.5     -71.3
#>  4 005        030          3       40.9       -72.6     43.0     -71.2
#>  5 005        042          3       40.9       -72.6     44.1     -70.2
#>  6 005        044          4       40.9       -72.6     44.8     -68.8
#>  7 005        051          3       40.9       -72.6     43.2     -72.5
#>  8 005        053          3       40.9       -72.6     42.8     -72.6
#>  9 005        054          3       40.9       -72.6     44.5     -73.2
#> 10 005        059          3       40.9       -72.6     44.7     -71.7
#> # … with 999,968 more rows

We could use the ggmap package to get a map of the US like get_googlemap("us", zoom = 4) but to go simpler, we’ll use ggplot2’s built-in map_data function.

us <- 
  ggplot2::map_data("state") %>%
  as_tibble() 
origin_prefix <- "041"

Now, using a single origin zip (041), we can plot all of the destination zips’ zones relative to that origin. We’ll also filter out any outliers.

filtered <- 
  zips_zones_lat_long %>% 
  filter(origin_zip == origin_prefix) %>% 
  left_join(us, by = c("lat_dest" = "lat", "long_dest" = "long")) %>% 
  filter(as.numeric(dest_zip) > 10 &
           long_dest < -50 &
           long_dest > -120)

Remember – always confusingly, to me, longitude is on the x and latitude is on the y (!)

The map should look fragmented as our zip_zones data only covers a quarter of the possible origin-destination pairs. Still, the pattern is clear: zones increase as your destination gets farther away.

ggplot() +
  geom_polygon(data = us, aes(x = long, y = lat, group = group), fill = "white", color = "black") +
  geom_density_2d(data = filtered,
             aes(long_dest, lat_dest, colour = factor(zone)),
             alpha = 1) +
  labs(x = "Longitude", y = "Latitude", colour = "Zone") +
  ggtitle("Shipping Zones from Portland, Maine",
          subtitle = glue("Origin zone prefix: {origin_prefix}")) +
  scale_colour_brewer(type = "seq", palette = "BrBG") +
  theme_classic(base_family = "Arial Narrow") +
  coord_quickmap()