D Prediction Functions

D.1 Predict individual games

predict_game <- function(home, away, neutral = FALSE, visualize = TRUE,
  team_codes = team_counts, chains = model_params) {
  home_code <- team_codes$code[which(team_codes$team == home)]
  away_code <- team_codes$code[which(team_codes$team == away)]
  
  mu <- chains$mu %>% as.vector()
  eta <- ifelse(neutral, 0, chains$eta %>% as.vector())
  home_off <- chains$alpha[, home_code]
  home_def <- chains$delta[, home_code]
  away_off <- chains$alpha[, away_code]
  away_def <- chains$delta[, away_code]
  sigma_g <- chains$sigma_g %>% as.vector()
  game_int <- pmap_dbl(.l = list(sd = sigma_g), rnorm, n = 1, mean = 0)
  
  home_goals <- exp(mu + eta + home_off + away_def + game_int) %>%
    map_dbl(rpois, n = 1)
  away_goals <- exp(mu + away_off + home_def + game_int) %>%
    map_dbl(rpois, n = 1)
  
  outcomes <- data_frame(
    club = c(home, away),
    expected_goals = c(mean(home_goals), mean(away_goals)),
    prob_win = c(length(which(home_goals > away_goals)),
      length(which(away_goals > home_goals))) / length(home_goals),
    prob_tie = rep(length(which(home_goals == away_goals)) / length(home_goals),
      2),
    prob_loss = c(length(which(away_goals > home_goals)),
      length(which(home_goals > away_goals))) / length(home_goals)
  )
  
  if (visualize) {
    heatmap <- data_frame(home_goals, away_goals) %>%
      group_by(home_goals, away_goals) %>%
      summarize(probability = n() / nrow(.)) %>%
      mutate(plot = "Goal Distribution")
    histogram <- data_frame(home_goals, away_goals) %>%
      mutate(home_mov = home_goals - away_goals) %>%
      select(home_mov) %>%
      group_by(home_mov) %>%
      summarize(probability = n() / nrow(.)) %>%
      mutate(plot = "Margin of Victory", winner = ifelse(home_mov > 0, home,
        ifelse(home_mov < 0, away, "Tie"))) %>%
      mutate(winner = factor(winner, levels = c(home, "Tie", away)))
    
    score_dist <- ggplot(heatmap, aes(x = home_goals, y = away_goals)) +
      geom_tile(aes(fill = probability)) +
      scale_fill_gradient(name = "Probability") +
      scale_x_continuous(breaks = seq(0, 100, 1)) +
      scale_y_continuous(breaks = seq(0, 100, 1)) +
      labs(x = paste0(home, " Score"), y = paste0(away, " Score")) +
      theme_minimal() +
      theme(
        panel.grid.minor = element_blank(),
        legend.position = "bottom"
      )
    
    mov_dist <- ggplot(histogram, aes(x = home_mov, y = probability)) +
      geom_col(aes(fill = winner)) +
      scale_fill_brewer(type = "qual", palette = 3, name = "Winner") +
      scale_x_continuous(breaks = seq(-100, 100, 1)) +
      scale_y_continuous(breaks = seq(0, 1, 0.05), labels = scales::percent) +
      labs(x = paste0(home, " Margin of Victory"), y = "Probability") +
      theme_minimal() +
      theme(
        panel.grid.minor.x = element_blank(),
        legend.position = "bottom"
      )
    
    plot_list <- list(score_dist, mov_dist)
  }
  
  if (visualize) {
    list(
      predictions = outcomes,
      plots = plot_list
    )
  } else {
    list(
      predictions = outcomes
    )
  }
}

D.2 Plot multiple plots

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots <- length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
      ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots == 1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
        layout.pos.col = matchidx$col))
    }
  }
}

D.3 Predict leagues

predict_league <- function(league, games = full_data, chains = model_params,
  team_codes = team_counts) {
  lookup <- data_frame(
    league = c("Premier League", "Bundesliga", "La Liga", "Ligue 1", "Serie A"),
    abbrev = c("Prem", "Bund", "La Liga", "Ligue 1", "Serie A"),
    website = c("http://www.espnfc.us/english-premier-league/23/table",
      "http://www.espnfc.us/german-bundesliga/10/table",
      "http://www.espnfc.us/spanish-primera-division/15/table",
      "http://www.espnfc.us/french-ligue-1/9/table",
      "http://www.espnfc.us/italian-serie-a/12/table")
  )
  abbrev <- lookup$abbrev[which(lookup$league == league)]
  website <- lookup$website[which(lookup$league == league)]
  
  safe_read_html <- safely(read_html)
  cont <- TRUE
  while(cont) {
    url_data <- safe_read_html(website)
    
    if(is.null(url_data[[1]])) {
      closeAllConnections()
      Sys.sleep(5)
    } else {
      url_data <- url_data[[1]]
      cont <- FALSE
    }
  }
  league_table <- url_data %>%
    html_nodes(css = "table") %>%
    html_table()
  league_table <- league_table[[1]]
  colnames(league_table) <- as.character(league_table[1,])
  colnames(league_table) <- make.names(colnames(league_table), unique = TRUE)
  league_table <- league_table[-1,]
  
  league_table <- league_table %>%
    select(club = TEAM, goals_for = `F`, goals_against = A, points = PTS) %>%
    mutate(club = trimws(club, which = "both"), points = as.numeric(points),
      goals_for = as.numeric(goals_for),
      goals_against = as.numeric(goals_against))
  club_names <- league_table$club

  future_games <- games %>%
    filter(competition == abbrev, date >= ymd(Sys.Date()), home %in% club_names,
      away %in% club_names)
  
  data_list <- list(
    mu = chains$mu %>% as.vector(),
    eta = chains$eta %>% as.vector(),
    alpha = t(chains$alpha) %>% as_data_frame() %>% as.list(),
    delta = t(chains$delta) %>% as_data_frame() %>% as.list(),
    sigma_g = chains$sigma_g %>% as.vector()
  )
  
  league_sim <- pmap_df(.l = data_list, .f = function(mu, eta, alpha, delta,
    sigma_g, sim_games, sim_league, team_codes) {
    for (g in seq_len(nrow(sim_games))) {
      home_code <- team_codes$code[which(team_codes$team == sim_games$home[g])]
      away_code <- team_codes$code[which(team_codes$team == sim_games$away[g])]
      
      home_off <- alpha[home_code]
      home_def <- delta[home_code]
      away_off <- alpha[away_code]
      away_def <- delta[away_code]
      game_int <- rnorm(1, mean = 0, sd = sigma_g)
      
      sim_games$h_goals[g] <- rpois(1, lambda = exp(mu + eta + home_off +
          away_def + game_int))
      sim_games$a_goals[g] <- rpois(1, lambda = exp(mu + away_off + home_def +
          game_int))
    }
    sim_games <- sim_games %>%
      mutate(
        home_pts = ifelse(h_goals > a_goals, 3, ifelse(h_goals < a_goals, 0, 1)),
        away_pts = ifelse(h_goals > a_goals, 0, ifelse(h_goals < a_goals, 3, 1))
      )
    pts_total <- bind_rows(
      select(sim_games, club = home, sim_pts = home_pts,
        sim_goals_for = h_goals, sim_goals_against = a_goals),
      select(sim_games, club = away, sim_pts = away_pts,
        sim_goals_for = a_goals, sim_goals_against = h_goals)
    ) %>%
      group_by(club) %>%
      summarize(sim_pts = sum(sim_pts), sim_goals_for = sum(sim_goals_for),
        sim_goals_against = sum(sim_goals_against))
    
    final_league <- left_join(sim_league, pts_total, by = "club") %>%
      mutate(
        tot_goals_for = goals_for + sim_goals_for,
        tot_goals_against = goals_against + sim_goals_against,
        goal_diff = tot_goals_for - tot_goals_against,
        tot_points = points + sim_pts
      ) %>%
      arrange(desc(tot_points), desc(goal_diff), desc(tot_goals_for))
    champ <- final_league$club[1]
    
    final_league %>%
      select(club, tot_points) %>%
      arrange(club) %>%
      spread(key = club, value = tot_points) %>%
      mutate(Champion = champ)
    }, sim_games = future_games, sim_league = league_table,
    team_codes = team_codes)
  
  champions <- data_frame(club = league_sim$Champion) %>%
    group_by(club) %>%
    summarize(champ_pct = n() / nrow(league_sim)) %>%
    arrange(desc(champ_pct))
  
  sim_results <- league_sim %>%
    select(-Champion) %>%
    gather(key = club, value = points, everything()) %>%
    group_by(club) %>%
    summarize(sim_points = mean(points)) %>%
    arrange(desc(sim_points)) %>%
    left_join(champions, by = "club")
  
  league_table %>%
    left_join(sim_results, by = "club")
}

D.4 Predict Champions League

predict_ucl <- function(matchups = matchups, games = full_data,
  chains = model_params, team_codes = team_counts) {
  all_clubs <- flatten_chr(matchups)
  
  future_games <- games %>%
    filter(competition == "UCL", home %in% all_clubs, away %in% all_clubs,
      date >= ymd(Sys.Date()) - 30)
  
  data_list <- list(
    mu = chains$mu %>% as.vector(),
    eta = chains$eta %>% as.vector(),
    alpha = t(chains$alpha) %>% as_data_frame() %>% as.list(),
    delta = t(chains$delta) %>% as_data_frame() %>% as.list(),
    sigma_g = chains$sigma_g %>% as.vector()
  )
  
  ucl_sim <- pmap_df(.l = data_list, .f = function(mu, eta, alpha, delta,
    sigma_g, sim_games, sim_ucl, team_codes) {
    results <- as_data_frame(matrix(data = NA, nrow = 1, ncol = 1))
    colnames(results) <- paste0("Round_", length(matchups))
    repeat {
      for (g in seq_len(nrow(sim_games))) {
        if (sim_games$date[g] < ymd(Sys.Date())) {
          next
        }
        home_code <- team_codes$code[which(team_codes$team ==
            sim_games$home[g])]
        away_code <- team_codes$code[which(team_codes$team ==
            sim_games$away[g])]
        
        home_off <- alpha[home_code]
        home_def <- delta[home_code]
        away_off <- alpha[away_code]
        away_def <- delta[away_code]
        game_int <- rnorm(1, mean = 0, sd = sigma_g)
        
        if (length(matchups) == 1) {
          sim_games$h_goals[g] <- rpois(1, lambda = exp(mu + home_off + away_def +
            game_int))
          sim_games$a_goals[g] <- rpois(1, lambda = exp(mu + away_off + home_def +
            game_int))
        } else {
          sim_games$h_goals[g] <- rpois(1, lambda = exp(mu + eta + home_off +
            away_def + game_int))
          sim_games$a_goals[g] <- rpois(1, lambda = exp(mu + away_off + home_def +
            game_int))
        }
      }
      winners <- map_chr(.x = matchups, .f = function(x, games) {
      games <- filter(games, home %in% x, away %in% x)
      scores <- bind_rows(
        select(games, club = home, h_goals) %>% mutate(a_goals = 0),
        select(games, club = away, a_goals) %>% mutate(h_goals = 0)
      ) %>%
        group_by(club) %>%
        summarize(h_goals = sum(h_goals), a_goals = sum(a_goals)) %>%
        mutate(total_goals = h_goals + a_goals) %>%
        arrange(desc(total_goals), desc(a_goals))
      if (length(unique(scores$total_goals)) > 1) {
        return(scores$club[1])
      } else if (length(unique(scores$total_goals)) == 1 &&
          length(matchups) == 1) {
        return(sample(scores$club, 1))
      } else if (length(unique(scores$total_goals)) == 1 &&
          length(unique(scores$a_goals)) > 1) {
        return(scores$club[1])
      } else {
        return(sample(scores$club, 1))
      }
    },
        games = sim_games)
      results[, paste0("Round_", length(winners))] <- paste(sort(winners),
        collapse = ",")
      if (length(winners) == 1) {
        break
      }
      if (length(winners) == 2) {
        matchups <- list(c(winners[1], winners[2]))
        sim_games <- data_frame(date = ymd(Sys.Date()), home = winners[1],
          away = winners[2], h_goals = NA, a_goals = NA, competition = "UCL",
          home_game = 0)
      } else {
        new_match <- data_frame(club = winners) %>%
          mutate(matchup = sample(rep(1:(length(winners) / 2), 2), length(winners),
            replace = FALSE))
        matchups <- list_along(seq_len(length(winners) / 2))
        new_games <- list_along(seq_len(length(winners) / 2))
        for (i in seq_along(matchups)) {
          clubs <- new_match$club[which(new_match$matchup == i)]
          matchups[[i]] <- clubs
          new_games[[i]] <- data_frame(date = ymd(Sys.Date()),
            home = sort(clubs), away = rev(sort(clubs)), h_goals = NA,
            a_goals = NA, competition = "UCL", home_game = 1)
        }
        sim_games <- bind_rows(new_games)
      }
    }
    return(results)
  }, sim_games = future_games,
    sim_ucl = matchups, team_codes = team_codes)
  
  round_results <- list_along(ucl_sim)
  names(round_results) <- colnames(ucl_sim)
  for (i in seq_along(round_results)) {
    round_results[[i]] <- paste(ucl_sim[[i]], collapse = ",") %>%
      strsplit(",") %>%
      unlist()
  }
  total_sim <- length(round_results$Round_1)
  final_results <- data_frame(club = sort(flatten_chr(matchups)),
    Round_8 = 1, Round_4 = 1, Round_2 = 1, Round_1 = 1)
  
  for (i in seq_len(nrow(final_results))) {
    club <- final_results$club[i]
    for (r in seq_along(round_results)) {
      final_results[[names(round_results)[r]]][i] <-
        length(which(round_results[[r]] == club)) / total_sim
    }
  }
  final_results %>%
    select(Club = club, Quarterfinals = Round_8, Semifinals = Round_4,
      Final = Round_2, Champion = Round_1) %>%
    arrange(desc(Champion))
}