Read in the first XML timetable file for inspection and setup extraction process.
Source code used in ManagingData.Rmd. For full documentation and commentary see that report instead.
# Get top level nodeset for first file
doc <- data.files[1] %>%
xml2::read_xml() %>%
xml2::xml_ns_strip()
nodeset <- doc %>%
xml2::xml_children()
# Get the xpaths to parents of a terminal node
terminal_xpaths <- nodeset %>%
xmltools::xml_get_paths(only_terminal_parent = TRUE) %>%
# Filter to unique paths
unlist() %>%
unique()
Define subset of xpaths required.
required_xpaths <- list(
NptgLocalities = "/TransXChange/NptgLocalities/AnnotatedNptgLocalityRef"
,StopPoints = c("/TransXChange/StopPoints/StopPoint",
"/TransXChange/StopPoints/StopPoint/Descriptor",
"/TransXChange/StopPoints/StopPoint/Place",
"/TransXChange/StopPoints/StopPoint/Place/Location")
,RouteLinks = c("/TransXChange/RouteSections/RouteSection/RouteLink",
"/TransXChange/RouteSections/RouteSection/RouteLink/From",
"/TransXChange/RouteSections/RouteSection/RouteLink/To")
,Routes = "/TransXChange/Routes/Route"
,JourneyPatternTimingLinks = c("/TransXChange/JourneyPatternSections/JourneyPatternSection/JourneyPatternTimingLink",
"/TransXChange/JourneyPatternSections/JourneyPatternSection/JourneyPatternTimingLink/From",
"/TransXChange/JourneyPatternSections/JourneyPatternSection/JourneyPatternTimingLink/To")
,Services = "/TransXChange/Services/Service"
,JourneyPatterns = "/TransXChange/Services/Service/StandardService/JourneyPattern"
,VehicleJourneys = "/TransXChange/VehicleJourneys/VehicleJourney"
)
Extract the tables by grouping the results from the sets of xpaths
build_tfl <- function(doc, terminal_xpaths, required_terminal_xpaths) {
purrr::map(required_terminal_xpaths, function(xpaths) {
# Use the provided paths
xpaths %>%
# Find all matches of each path
lapply(xml2::xml_find_all, x = doc) %>%
# Extract underlying data
purrr::map(xmltools::xml_dig_df) %>%
# Combine extracted data into one dataframe
purrr::map(dplyr::bind_rows) %>%
dplyr::bind_cols()
})
}
Get XML attributes and parent IDs
library(dplyr)
retrieve_additional_fields <- function(tfl, doc){
RouteSectionsNodeset <-
xml2::xml_find_all(doc, "/TransXChange/RouteSections/RouteSection")
AdditionalRouteLinksData <-
purrr::map(RouteSectionsNodeset, function(Section) {
RouteLinks <- xml2::xml_find_all(Section, "RouteLink")
data.table(
RouteSectionID = xml2::xml_attr(Section, "id"),
RouteLinkID = xml2::xml_attr(RouteLinks, "id")
)
}) %>%
rbindlist %>%
as_tibble
tfl$RouteLinks <- AdditionalRouteLinksData %>%
bind_cols(tfl$RouteLinks) %>%
rename(FromStopPointRef = StopPointRef,
ToStopPointRef = StopPointRef1)
JourneyPatternSectionsNodeset <-
xml2::xml_find_all(doc, "/TransXChange/JourneyPatternSections/JourneyPatternSection")
AdditionalTimingLinksData <-
purrr::map(JourneyPatternSectionsNodeset, function(Section) {
TimingLinks <- xml2::xml_find_all(Section, "JourneyPatternTimingLink")
data.table(
JourneyPatternSectionID = xml2::xml_attr(Section, "id"),
JourneyPatternTimingLinkID = xml2::xml_attr(TimingLinks, "id"),
FromSequenceNumber = TimingLinks %>%
xml2::xml_find_all("From") %>%
xml2::xml_attr("SequenceNumber") %>%
as.integer,
ToSequenceNumber = TimingLinks %>%
xml2::xml_find_all("To") %>%
xml2::xml_attr("SequenceNumber") %>%
as.integer
)
}) %>%
rbindlist %>%
as_tibble
tfl$JourneyPatternTimingLinks <- AdditionalTimingLinksData %>%
bind_cols(tfl$JourneyPatternTimingLinks) %>%
rename(FromActivity = Activity,
FromStopPointRef = StopPointRef,
FromTimingStatus = TimingStatus,
ToActivity = Activity1,
ToStopPointRef = StopPointRef1,
ToTimingStatus = TimingStatus1)
tfl$JourneyPatterns <-
tibble(JourneyPatternID =
"/TransXChange/Services/Service/StandardService/JourneyPattern" %>%
xml2::xml_find_all(x = doc) %>%
xml2::xml_attr("id")
) %>%
bind_cols(tfl$JourneyPatterns)
tfl
}
Full extraction workflow for one file.
extract_file <- function(file){
doc <- xml2::read_xml(file) %>%
xml2::xml_ns_strip()
tfl <- build_tfl(doc, terminal_xpaths, required_xpaths)
tfl %<>% retrieve_additional_fields(doc)
tfl
}
str(extract_file(data.files[1]), 1)
Copyright © Ruaridh Williamson 2017
Powered by TfL Open Data // Contains OS data © Crown copyright and database rights 2016