Get Tables from XPaths

Read in the first XML timetable file for inspection and setup extraction process.

Source code used in ManagingData.Rmd. For full documentation and commentary see that report instead.

Source code

# Get top level nodeset for first file
doc <- data.files[1] %>%
xml2::read_xml() %>%
xml2::xml_ns_strip()
nodeset <- doc %>%
xml2::xml_children()
# Get the xpaths to parents of a terminal node
terminal_xpaths <- nodeset %>%
xmltools::xml_get_paths(only_terminal_parent = TRUE) %>%
# Filter to unique paths
unlist() %>%
unique()

Define subset of xpaths required.

required_xpaths <- list(
NptgLocalities = "/TransXChange/NptgLocalities/AnnotatedNptgLocalityRef"
,StopPoints = c("/TransXChange/StopPoints/StopPoint",
"/TransXChange/StopPoints/StopPoint/Descriptor",
"/TransXChange/StopPoints/StopPoint/Place",
"/TransXChange/StopPoints/StopPoint/Place/Location")
,RouteLinks = c("/TransXChange/RouteSections/RouteSection/RouteLink",
"/TransXChange/RouteSections/RouteSection/RouteLink/From",
"/TransXChange/RouteSections/RouteSection/RouteLink/To")
,Routes = "/TransXChange/Routes/Route"
,JourneyPatternTimingLinks = c("/TransXChange/JourneyPatternSections/JourneyPatternSection/JourneyPatternTimingLink",
"/TransXChange/JourneyPatternSections/JourneyPatternSection/JourneyPatternTimingLink/From",
"/TransXChange/JourneyPatternSections/JourneyPatternSection/JourneyPatternTimingLink/To")
,Services = "/TransXChange/Services/Service"
,JourneyPatterns = "/TransXChange/Services/Service/StandardService/JourneyPattern"
,VehicleJourneys = "/TransXChange/VehicleJourneys/VehicleJourney"
)

Extract the tables by grouping the results from the sets of xpaths

build_tfl <- function(doc, terminal_xpaths, required_terminal_xpaths) {
purrr::map(required_terminal_xpaths, function(xpaths) {
# Use the provided paths
xpaths %>%
# Find all matches of each path
lapply(xml2::xml_find_all, x = doc) %>%
# Extract underlying data
purrr::map(xmltools::xml_dig_df) %>%
# Combine extracted data into one dataframe
purrr::map(dplyr::bind_rows) %>%
dplyr::bind_cols()
})
}

Get XML attributes and parent IDs

library(dplyr)
retrieve_additional_fields <- function(tfl, doc){
RouteSectionsNodeset <-
xml2::xml_find_all(doc, "/TransXChange/RouteSections/RouteSection")
AdditionalRouteLinksData <-
purrr::map(RouteSectionsNodeset, function(Section) {
RouteLinks <- xml2::xml_find_all(Section, "RouteLink")
data.table(
RouteSectionID = xml2::xml_attr(Section, "id"),
RouteLinkID = xml2::xml_attr(RouteLinks, "id")
)
}) %>%
rbindlist %>%
as_tibble
tfl$RouteLinks <- AdditionalRouteLinksData %>%
bind_cols(tfl$RouteLinks) %>%
rename(FromStopPointRef = StopPointRef,
ToStopPointRef = StopPointRef1)
JourneyPatternSectionsNodeset <-
xml2::xml_find_all(doc, "/TransXChange/JourneyPatternSections/JourneyPatternSection")
AdditionalTimingLinksData <-
purrr::map(JourneyPatternSectionsNodeset, function(Section) {
TimingLinks <- xml2::xml_find_all(Section, "JourneyPatternTimingLink")
data.table(
JourneyPatternSectionID = xml2::xml_attr(Section, "id"),
JourneyPatternTimingLinkID = xml2::xml_attr(TimingLinks, "id"),
FromSequenceNumber = TimingLinks %>%
xml2::xml_find_all("From") %>%
xml2::xml_attr("SequenceNumber") %>%
as.integer,
ToSequenceNumber = TimingLinks %>%
xml2::xml_find_all("To") %>%
xml2::xml_attr("SequenceNumber") %>%
as.integer
)
}) %>%
rbindlist %>%
as_tibble
tfl$JourneyPatternTimingLinks <- AdditionalTimingLinksData %>%
bind_cols(tfl$JourneyPatternTimingLinks) %>%
rename(FromActivity = Activity,
FromStopPointRef = StopPointRef,
FromTimingStatus = TimingStatus,
ToActivity = Activity1,
ToStopPointRef = StopPointRef1,
ToTimingStatus = TimingStatus1)
tfl$JourneyPatterns <-
tibble(JourneyPatternID =
"/TransXChange/Services/Service/StandardService/JourneyPattern" %>%
xml2::xml_find_all(x = doc) %>%
xml2::xml_attr("id")
) %>%
bind_cols(tfl$JourneyPatterns)
tfl
}

Full extraction workflow for one file.

extract_file <- function(file){
doc <- xml2::read_xml(file) %>%
xml2::xml_ns_strip()
tfl <- build_tfl(doc, terminal_xpaths, required_xpaths)
tfl %<>% retrieve_additional_fields(doc)
tfl
}
str(extract_file(data.files[1]), 1)

Copyright © Ruaridh Williamson 2017

Powered by TfL Open Data // Contains OS data © Crown copyright and database rights 2016