Navigating SQL Databases and Google BigQuery

BigQuery
pacman::p_load(sf,
               tidyverse,
               lubridate,
               DBI,
               dbplyr,
               bigrquery,
               sandwich,
               here)
pacman::p_install(rms,force=FALSE)
## Package is already on your system.
sf::sf_use_s2(FALSE)
## Spherical geometry (s2) switched off

0. Setup

dsn_dir = here("data/shapefiles/kiribati_eez/")

## Read in Kiribati shapefile
kiribati_eez = 
  read_sf(dsn = dsn_dir, layer = "kiribati_eez") %>%
  select(GeoName, geometry)

## Get the projection string for later
proj_str = st_crs(kiribati_eez)

## Subset Line Islands
line = 
  kiribati_eez %>%
  subset(GeoName %in% c("Kiribati Exclusive Economic Zone (Line Islands)")) %>%
  rename(region = GeoName) %>%
  mutate(region = "Line Group")

## Subset Gilbert Islands
gilbert = 
  kiribati_eez %>%
  subset(GeoName %in%
            c("Kiribati Exclusive Economic Zone (Gilbert Islands)")) %>%
  rename(region = GeoName) %>%
  mutate(region = "Gilbert Islands")

dsn_dir = here("data/shapefiles/pipa_shapefile/")

## Read in PIPA shapefile
pipa = 
  read_sf(dsn = dsn_dir, layer = "worldheritagemarineprogramme") %>%
  st_transform(proj_str) %>%
  select(region = full_name, geometry) %>%
  mutate(region = "PIPA")

## Merge all regions of interest
all_regions = rbind(line, gilbert, pipa)

## Get bounding box of Kiribati
bbox = 
  kiribati_eez %>%
  st_bbox()

## Extend the bounding box 1 degree in every direction.
## We will use these limits to query the GFW database
min_lon = bbox[["xmin"]] - 1 
max_lon = bbox[["xmax"]] + 1
min_lat = bbox[["ymin"]] - 1
max_lat = bbox[["ymax"]] + 1 

## The EEZ straddles the antimeridian, so a naive bounding box can span the
## full -180 to 180 range. If that happens, recompute the limits from the two
## halves of the EEZ: min_lon from the piece with positive longitudes and
## max_lon from the piece with negative longitudes. Paired with the
## wrap-around longitude filter in the query below, this keeps the query
## window tight around Kiribati.
if(min_lon <= -180) {
  min_lon =
    kiribati_eez %>% 
    st_crop(c(xmin = 0, xmax = 180, ymin = min_lat, ymax = max_lat)) %>%
    st_bbox() %>%
    pluck("xmin") - 1
}

if(max_lon >= 180) {
  max_lon =
    kiribati_eez %>% 
    st_crop(c(xmin = -180, xmax = 0, ymin = min_lat, ymax = max_lat)) %>%
    st_bbox() %>%
    pluck("xmax") + 1
}
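
Given the antimeridian handling above, it is worth a quick sanity check that the final limits wrap the way we expect (min_lon should now be a large positive longitude and max_lon a small negative one):

## Final query window (longitude limits wrap across the antimeridian)
c(min_lon = min_lon, max_lon = max_lon, min_lat = min_lat, max_lat = max_lat)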

## Define mapping resolution in degrees
resolution = 0.5

## Read in regional area sizes
region_areas = read_csv(here("data/region_areas.csv"))

1. Establishing Database Connection

## Authenticate with the service account key file
bq_auth(path = "~/kb_gce_key.json")
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The bigrquery package is using a cached token for 'brewster.kyle@gmail.com'.
billing_id = Sys.getenv("GCE_DEFAULT_PROJECT_ID") 
cony <- dbConnect(
  bigrquery::bigquery(),
  project = "global-fishing-watch",
  dataset = "global_footprint_of_fisheries",
  billing = billing_id)
cony
## <BigQueryConnection>
##   Dataset: global-fishing-watch.global_footprint_of_fisheries
##   Billing: 117287121109189583528

2. Testing and Authenticating Connection

dbListTables(cony)
## [1] "fishing_effort"          "fishing_effort_byvessel"
## [3] "fishing_vessels"         "vessels"

3. Preview of Data

effort = tbl(cony, "fishing_effort")
effort
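
Printing effort only returns a lazy preview of the remote table. To pull a small sample into local memory for a closer look, collect a few rows:

## Bring the first few rows and columns into R
effort %>% select(1:8) %>% head(10) %>% collect()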

4. Query the Database

effort_query = effort %>%
   ## STEP 1: Rescale the stored integer bins to degrees and assign each
   ## observation to the centre of its grid cell at the chosen resolution
   mutate(lon_bin = lon_bin/100,
          lat_bin = lat_bin/100,
          lat_cent_var = floor(lat_bin / !!resolution) * !!resolution + 0.5 * !!resolution,
          lon_cent_var = floor(lon_bin / !!resolution) * !!resolution + 0.5 * !!resolution) %>%
   ## STEP 2: Keep positive fishing effort inside the query window. `!!`
   ## inlines the local R values into the generated SQL; the longitude filter
   ## uses OR because the window wraps across the antimeridian (min_lon > max_lon)
   filter(fishing_hours > 0,
          lat_bin > !!min_lat, lat_bin < !!max_lat,
          lon_bin > !!min_lon | lon_bin < !!max_lon,
          date < '2016-12-31')
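
At this point nothing has actually been sent to BigQuery; dbplyr has only built up a query plan. To inspect the SQL it will generate, use show_query():

## Preview the SQL translation of the dplyr pipeline
effort_query %>% show_query()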

5. Collect the Data

effort_binned = effort_query %>%
   group_by(lat_cent_var, lon_cent_var, date) %>%
   summarise(avg_hrs_year = mean(fishing_hours, na.rm = TRUE)) %>%
   ## Run the query and pull the aggregated results from BigQuery into R
   collect()

6. Disconnect

dbDisconnect(cony)

7. Convert to sf object

## Turn the collected grid-cell centres into point geometries, using the
## projection string saved in the setup step
trans_effort = 
  effort_binned %>%
  st_as_sf(coords = c("lon_cent_var", "lat_cent_var"),
           crs = proj_str)
trans_effort

8. Finalizing the Data

Part 1: Cropping
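
The cropping code is still to come. As a placeholder, here is a minimal sketch of one way this step could work, assuming the goal is to keep only the gridded effort cells that fall inside the regions defined in the setup step (the object name gfw is an assumption, chosen to match the Part 3 code below):

## Sketch: keep only cells that intersect the Line Group, Gilbert Islands,
## or PIPA polygons, attaching the matching region label to each cell
gfw =
  trans_effort %>%
  st_join(all_regions, join = st_intersects, left = FALSE)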

Part 2: Replacing NAs and Tallies
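
This step is also unwritten. A hedged sketch of one plausible implementation, assuming the tally is total hours per region and date and that region-date combinations with no recorded effort should be filled in as zeros (the fishing_hours name is an assumption, chosen so the Part 3 code below runs as written):

## Sketch: tally hours by region and date, then fill in combinations with
## no recorded effort as zero hours
gfw =
  gfw %>%
  st_drop_geometry() %>%
  group_by(region, date) %>%
  summarise(fishing_hours = sum(avg_hrs_year, na.rm = TRUE), .groups = "drop") %>%
  complete(region, date, fill = list(fishing_hours = 0))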

Part 3: Splitting and Normalizing

gfw =
  gfw %>%
  left_join(region_areas) %>%
  ## Pool everything outside PIPA into a single "Control" group
  mutate(region = ifelse(region == "PIPA", "PIPA", "Control")) %>%
  group_by(date, region) %>%
  ## Normalize by total group area: fishing hours per 1,000 km2
  mutate(area_km2 = sum(area_km2),
         fishing_hours_norm = fishing_hours/area_km2*1000) %>%
  summarise(fishing_hours_norm = sum(fishing_hours_norm)) %>% 
  ungroup()
gfw

Plotting the Figure

Work in progress, more updates to come soon!
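
In the meantime, here is a rough sketch of the kind of figure this is building towards (normalized fishing effort over time, PIPA versus control), with the caveat that the final plot will likely differ:

## Sketch: normalized fishing effort over time, PIPA vs. control regions
gfw %>%
  mutate(date = as.Date(date)) %>%  # ensure date is a Date, not a string
  ggplot(aes(x = date, y = fishing_hours_norm, col = region)) +
  geom_line() +
  labs(x = "Date", y = "Fishing hours per 1,000 km2", col = "Region") +
  theme_minimal()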