Feather Upload

This script reads a directory of Feather files as output from XMLParsing.py and pushes the dataframes to a PostgreSQL database.

Note: Since the dbWriteTable statement uses append = TRUE to upload successive dataframes to the same table, if recalling this upload procedure all tables will need to be manually truncated first.

Source code

# docker run --name postgres_london_tube -p 5432:5432 -d -e POSTGRES_PASSWORD=mysecretpassword postgres:alpine
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
host <- "localhost"
user <- "postgres"
password <- "mysecretpassword"

Create database if it doesn’t already exist

# con <- dbConnect(drv, host = host, user = user, password = password,
# dbname = "postgres")
# dbSendQuery(con, "CREATE DATABASE londontubepython;")
# dbDisconnect(con)
get_con <- function() {
dbConnect(drv,
host = host, user = user, password = password,
dbname = "londontubepython")
}
source("r/VariableCreation.R") # Load required Variable Creation functions
library(feather)
con <- get_con()
files <- list.files(file.path(root.dir, "1_data/1_2_processed_data"),
full.names = TRUE)
then <- Sys.time()
lapply(files, function(file){
df <- feather::read_feather(file)
tablename <- substring(basename(tools::file_path_sans_ext(file)), 5)
# Dispatch appropriate modification procedure depending on the tablename
df <- modify_df(tablename, df)
dbWriteTable(con, tablename, df, append = TRUE, row.names = FALSE)
})

RPostgreSQL does not support “time” db data type. Uploads POSIXct as “timestamp with time zone” so we need to strip date and timezone

dbSendQuery(con, paste('ALTER TABLE "VehicleJourneys"',
'ALTER COLUMN "DepartureTime" TYPE time(6) USING "DepartureTime"::time(6);'
))
now <- Sys.time()
dbDisconnect(con)
now - then
#> Time difference of 28.46938 secs

Copyright © Ruaridh Williamson 2017

Powered by TfL Open Data // Contains OS data © Crown copyright and database rights 2016