## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message=FALSE----------------------------------------------------- library(scoringutils) library(data.table) library(ggplot2) library(magrittr) library(magrittr) #pipe operator ## ----eval=!require("ggdist", quietly = TRUE)---------------------------------- # #" @title Plot Predictions vs True Values # #" # #" @description # #" Make a plot of observed and predicted values # #" # #" @param data a data.frame that follows the same specifications outlined in # #" [score()]. To customise your plotting, you can filter your data using the # #" function [make_NA()]. # #" @param by character vector with column names that denote categories by which # #" the plot should be stratified. If for example you want to have a facetted # #" plot, this should be a character vector with the columns used in facetting # #" (note that the facetting still needs to be done outside of the function call) # #" @param x character vector of length one that denotes the name of the variable # #" @param interval_range numeric vector indicating the interval ranges to plot. # #" If 0 is included in `interval_range`, the median prediction will be shown. # #" @return ggplot object with a plot of true vs predicted values # #" @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual theme_light # #" @importFrom ggplot2 facet_wrap facet_grid aes geom_line .data geom_point # #" @importFrom data.table dcast # #" @importFrom ggdist geom_lineribbon # #" @export # #" @examples # #" library(ggplot2) # #" library(magrittr) # #" # #" example_sample_continuous %>% # #" make_NA ( # #" what = "truth", # #" target_end_date >= "2021-07-22", # #" target_end_date < "2021-05-01" # #" ) %>% # #" make_NA ( # #" what = "forecast", # #" model != "EuroCOVIDhub-ensemble", # #" forecast_date != "2021-06-07" # #" ) %>% # #" plot_predictions ( # #" x = "target_end_date", # #" by = c("target_type", "location"), # #" interval_range = c(0, 50, 90, 95) # #" ) + # #" facet_wrap(~ location + target_type, scales = "free_y") + # #" aes(fill = model, color = model) # #" # #" example_sample_continuous %>% # #" make_NA ( # #" what = "truth", # #" target_end_date >= "2021-07-22", # #" target_end_date < "2021-05-01" # #" ) %>% # #" make_NA ( # #" what = "forecast", # #" forecast_date != "2021-06-07" # #" ) %>% # #" plot_predictions ( # #" x = "target_end_date", # #" by = c("target_type", "location"), # #" interval_range = 0 # #" ) + # #" facet_wrap(~ location + target_type, scales = "free_y") + # #" aes(fill = model, color = model) # # # library(ggdist) # plot_predictions <- function(data, # by = NULL, # x = "date", # interval_range = c(0, 50, 90)) { # # # split truth data and forecasts in order to apply different filtering # truth_data <- data.table::as.data.table(data)[!is.na(observed)] # forecasts <- data.table::as.data.table(data)[!is.na(predicted)] # # del_cols <- # colnames(truth_data)[!(colnames(truth_data) %in% c(by, "observed", x))] # # truth_data <- unique(suppressWarnings(truth_data[, eval(del_cols) := NULL])) # # # find out what type of predictions we have. convert sample based to # # interval range data # # if ("quantile_level" %in% colnames(data)) { # forecasts <- scoringutils:::quantile_to_interval( # forecasts, # keep_quantile_col = FALSE # ) # } else if ("sample_id" %in% colnames(data)) { # # using a scoringutils internal function # forecasts <- scoringutils:::sample_to_interval_long( # as_forecast_sample(forecasts), # interval_range = interval_range, # keep_quantile_col = FALSE # ) # } # # # select appropriate boundaries and pivot wider # select <- forecasts$interval_range %in% setdiff(interval_range, 0) # intervals <- forecasts[select, ] # # # delete quantile column in intervals if present. This is important for # # pivoting # if ("quantile_level" %in% names(intervals)) { # intervals[, quantile_level := NULL] # } # # plot <- ggplot(data = data, aes(x = .data[[x]])) + # theme_scoringutils() + # ylab("True and predicted values") # # if (nrow(intervals) != 0) { # # pivot wider and convert range to a factor # intervals <- data.table::dcast(intervals, ... ~ boundary, # value.var = "predicted") # # # only plot interval ranges if there are interval ranges to plot # plot <- plot + # ggdist::geom_lineribbon( # data = intervals, # aes( # ymin = lower, ymax = upper, # # We use the fill_ramp aesthetic for this instead of the default fill # # because we want to keep fill to be able to use it for other # # variables # fill_ramp = factor( # interval_range, # levels = sort(unique(interval_range), decreasing = TRUE) # ) # ), # lwd = 0.4 # ) + # ggdist::scale_fill_ramp_discrete( # name = "interval_range", # # range argument was added to make sure that the line for the median # # and the ribbon don"t have the same opacity, making the line # # invisible # range = c(0.15, 0.75) # ) # } # # # We could treat this step as part of ggdist::geom_lineribbon() but we treat # # it separately here to deal with the case when only the median is provided # # (in which case ggdist::geom_lineribbon() will fail) # if (0 %in% interval_range) { # select_median <- # forecasts$interval_range == 0 & forecasts$boundary == "lower" # median <- forecasts[select_median] # # if (nrow(median) > 0) { # plot <- plot + # geom_line( # data = median, # mapping = aes(y = predicted), # lwd = 0.4 # ) # } # } # # # add observed values # if (nrow(truth_data) > 0) { # plot <- plot + # geom_point( # data = truth_data, # show.legend = FALSE, # inherit.aes = FALSE, # aes(x = .data[[x]], y = observed), # color = "black", # size = 0.5 # ) + # geom_line( # data = truth_data, # inherit.aes = FALSE, # show.legend = FALSE, # aes(x = .data[[x]], y = observed), # linetype = 1, # color = "grey40", # lwd = 0.2 # ) # } # # return(plot) # } ## ----------------------------------------------------------------------------- #" @title Make Rows NA in Data for Plotting #" #" @description #" Filters the data and turns values into `NA` before the data gets passed to #" [plot_predictions()]. The reason to do this is to this is that it allows to #" "filter" prediction and truth data separately. Any value that is NA will then #" be removed in the subsequent call to [plot_predictions()]. #" #" @inheritParams score #" @param what character vector that determines which values should be turned #" into `NA`. If `what = "truth"`, values in the column "observed" will be #" turned into `NA`. If `what = "forecast"`, values in the column "prediction" #" will be turned into `NA`. If `what = "both"`, values in both column will be #" turned into `NA`. #" @param ... logical statements used to filter the data #" @return A data.table #" @importFrom rlang enexprs #" @keywords plotting #" @export #" #" @examples #" make_NA ( #" example_sample_continuous, #" what = "truth", #" target_end_date >= "2021-07-22", #" target_end_date < "2021-05-01" #" ) make_NA <- function(data = NULL, what = c("truth", "forecast", "both"), ...) { stopifnot(is.data.frame(data)) data <- as.data.table(data) what <- match.arg(what) # turn ... arguments into expressions args <- enexprs(...) vars <- NULL if (what %in% c("forecast", "both")) { vars <- c(vars, "predicted") } if (what %in% c("truth", "both")) { vars <- c(vars, "observed") } for (expr in args) { data <- data[eval(expr), eval(vars) := NA_real_] } return(data[]) } ## ----eval=!require("ggdist", quietly = TRUE)---------------------------------- # median_forecasts <- example_quantile[quantile_level == 0.5] # median_forecasts %>% # make_NA(what = "truth", # target_end_date <= "2021-05-01", # target_end_date > "2021-07-22") %>% # make_NA(what = "forecast", # model != "EuroCOVIDhub-ensemble", # forecast_date != "2021-06-07") %>% # plot_predictions( # by = c("location", "target_type"), # x = "target_end_date" # ) + # facet_wrap(location ~ target_type, scales = "free_y") ## ----eval=!require("ggdist", quiet = TRUE)------------------------------------ # example_quantile %>% # make_NA(what = "truth", # target_end_date <= "2021-05-01", # target_end_date > "2021-07-22") %>% # make_NA(what = "forecast", # model != "EuroCOVIDhub-ensemble", # forecast_date != "2021-06-07") %>% # plot_predictions( # by = c("location", "target_type"), # x = "target_end_date", # interval_range = c(0, 10, 20, 30, 40, 50, 60) # ) + # facet_wrap(location ~ target_type, scales = "free_y") ## ----eval=!require("ggdist", quietly = TRUE)---------------------------------- # example_sample_continuous %>% # make_NA(what = "truth", # target_end_date <= "2021-05-01", # target_end_date > "2021-07-22") %>% # make_NA(what = "forecast", # model != "EuroCOVIDhub-ensemble", # forecast_date != "2021-06-07") %>% # plot_predictions( # by = c("location", "target_type"), # x = "target_end_date", # interval_range = c(0, 50, 90, 95) # ) + # facet_wrap(location ~ target_type, scales = "free_y") ## ----eval=!require("ggdist", quietly = TRUE)---------------------------------- # example_quantile %>% # make_NA(what = "truth", # target_end_date > "2021-07-15", # target_end_date <= "2021-05-22") %>% # make_NA(what = "forecast", # !(model %in% c("EuroCOVIDhub-ensemble", "EuroCOVIDhub-baseline")), # forecast_date != "2021-06-28") %>% # plot_predictions(x = "target_end_date", by = c("target_type", "location")) + # aes(colour = model, fill = model) + # facet_wrap(target_type ~ location, ncol = 4, scales = "free_y") + # labs(x = "Target end date") ## ----------------------------------------------------------------------------- #" @title Plot Metrics by Range of the Prediction Interval #" #" @description #" Visualise the metrics by range, e.g. if you are interested how different #" interval ranges contribute to the overall interval score, or how #" sharpness / dispersion changes by range. #" #" @param scores A data.frame of scores based on quantile forecasts as #" produced by [score()] or [summarise_scores()]. Note that "range" must be included #" in the `by` argument when running [summarise_scores()] #" @param y The variable from the scores you want to show on the y-Axis. #" This could be something like "wis" (the default) or "dispersion" #" @param x The variable from the scores you want to show on the x-Axis. #" Usually this will be "model" #" @param colour Character vector of length one used to determine a variable #" for colouring dots. The Default is "range". #" @return A ggplot2 object showing a contributions from the three components of #" the weighted interval score #" @importFrom ggplot2 ggplot aes aes geom_point geom_line #" expand_limits theme theme_light element_text scale_color_continuous labs #" @export #" @examples #" library(ggplot2) #" ex <- data.table::copy(example_quantile) #" ex$range <- scoringutils:::get_range_from_quantile(ex$quantile) #" scores <- suppressWarnings(score(as_forecast_quantile(ex), metrics = list("wis" = wis))) #" summarised <- summarise_scores( #" scores, #" by = c("model", "target_type", "range") #" ) #" plot_interval_ranges(summarised, x = "model") + #" facet_wrap(~target_type, scales = "free") plot_interval_ranges <- function(scores, y = "wis", x = "model", colour = "range") { plot <- ggplot( scores, aes( x = .data[[x]], y = .data[[y]], colour = .data[[colour]] ) ) + geom_point(size = 2) + geom_line(aes(group = range), colour = "black", linewidth = 0.01 ) + scale_color_continuous(low = "steelblue", high = "salmon") + theme_scoringutils() + expand_limits(y = 0) + theme( legend.position = "right", axis.text.x = element_text( angle = 90, vjust = 1, hjust = 1 ) ) return(plot) } ## ----------------------------------------------------------------------------- range_example <- copy(example_quantile) %>% na.omit() %>% .[, range := scoringutils:::get_range_from_quantile(quantile_level)] sum_scores <- range_example %>% as_forecast_quantile() %>% score(metrics = list(wis = wis, dispersion = dispersion_quantile)) %>% summarise_scores(by = c("model", "target_type", "range")) %>% suppressWarnings() plot_interval_ranges(sum_scores, x = "model") + facet_wrap(~target_type, scales = "free") ## ----------------------------------------------------------------------------- plot_interval_ranges(sum_scores, y = "dispersion", x = "model") + facet_wrap(~target_type, scales = "free_y") ## ----------------------------------------------------------------------------- #' @title Plot Coloured Score Table #' #' @description #' Plots a coloured table of summarised scores obtained using #' [score()]. #' #' @param y the variable to be shown on the y-axis. Instead of a single character string, #' you can also specify a vector with column names, e.g. #' `y = c("model", "location")`. These column names will be concatenated #' to create a unique row identifier (e.g. "model1_location1"). #' @param by A character vector that determines how the colour shading for the #' plot gets computed. By default (`NULL`), shading will be determined per #' metric, but you can provide additional column names (see examples). #' @param metrics A character vector with the metrics to show. If set to #' `NULL` (default), all metrics present in `scores` will be shown. #' #' @returns A ggplot object with a coloured table of summarised scores #' @inheritParams get_pairwise_comparisons #' @importFrom ggplot2 ggplot aes element_blank element_text labs coord_cartesian coord_flip #' @importFrom data.table setDT melt #' @importFrom stats sd #' @export #' #' @examples #' library(ggplot2) #' library(magrittr) # pipe operator #' \dontshow{ #' data.table::setDTthreads(2) # restricts number of cores used on CRAN #' } #' #' scores <- score(as_forecast_quantile(example_quantile)) %>% #' summarise_scores(by = c("model", "target_type")) %>% #' summarise_scores(by = c("model", "target_type"), fun = signif, digits = 2) #' #' plot_score_table(scores, y = "model", by = "target_type") + #' facet_wrap(~target_type, ncol = 1) #' #' # can also put target description on the y-axis #' plot_score_table(scores, #' y = c("model", "target_type"), #' by = "target_type") plot_score_table <- function(scores, y = "model", by = NULL, metrics = NULL) { # identify metrics ----------------------------------------------------------- id_vars <- get_forecast_unit(scores) metrics <- get_metrics(scores) cols_to_delete <- names(scores)[!(names(scores) %in% c(metrics, id_vars))] suppressWarnings(scores[, eval(cols_to_delete) := NULL]) # compute scaled values ------------------------------------------------------ # scaling is done in order to colour the different scores # for most metrics larger is worse, but others like bias are better if they # are close to zero and deviations in both directions are bad # define which metrics are scaled using min (larger is worse) and # which not (metrics like bias where deviations in both directions are bad) metrics_zero_good <- c("bias", "interval_coverage_deviation") metrics_no_color <- "coverage" metrics_min_good <- setdiff(metrics, c( metrics_zero_good, metrics_no_color )) # write scale functions that can be used in data.table scale <- function(x) { scaled <- x / sd(x, na.rm = TRUE) return(scaled) } scale_min_good <- function(x) { scaled <- (x - min(x)) / sd(x, na.rm = TRUE) return(scaled) } # pivot longer and add scaled values df <- data.table::melt(scores, value.vars = metrics, id.vars = id_vars, variable.name = "metric" ) df[metric %in% metrics_min_good, value_scaled := scale_min_good(value), by = c("metric", by) ] df[metric %in% metrics_zero_good, value_scaled := scale(value), by = c("metric", by) ] df[metric %in% metrics_no_color, value_scaled := 0, by = c("metric", by) ] # create identifier column for plot ------------------------------------------ # if there is only one column, leave column as is. Reason to do that is that # users can then pass in a factor and keep the ordering of that column intact if (length(y) > 1) { df[, identifCol := do.call(paste, c(.SD, sep = "_")), .SDcols = y[y %in% names(df)]] } else { setnames(df, old = eval(y), new = "identifCol") } # plot ----------------------------------------------------------------------- # make plot with all metrics that are not NA plot <- ggplot( df[!is.na(value), ], aes(y = identifCol, x = metric) ) + geom_tile(aes(fill = value_scaled), colour = "white", show.legend = FALSE) + geom_text(aes(y = identifCol, label = value)) + scale_fill_gradient2(low = "steelblue", high = "salmon") + theme_scoringutils() + theme( legend.title = element_blank(), legend.position = "none", axis.text.x = element_text( angle = 90, vjust = 1, hjust = 1 ) ) + labs(x = "", y = "") + coord_cartesian(expand = FALSE) return(plot) } ## ----------------------------------------------------------------------------- scores <- score(as_forecast_quantile(example_quantile)) %>% summarise_scores(by = c("model", "target_type")) %>% summarise_scores(by = c("model", "target_type"), fun = signif, digits = 2) plot_score_table(scores, y = "model", by = "target_type") + facet_wrap(~target_type, ncol = 1) ## ----------------------------------------------------------------------------- # can also put target description on the y-axis plot_score_table(scores, y = c("model", "target_type"), by = "target_type")