D Prediction Functions
D.1 Predict individual games
predict_game <- function(home, away, neutral = FALSE, visualize = TRUE,
team_codes = team_counts, chains = model_params) {
home_code <- team_codes$code[which(team_codes$team == home)]
away_code <- team_codes$code[which(team_codes$team == away)]
mu <- chains$mu %>% as.vector()
eta <- ifelse(neutral, 0, chains$eta %>% as.vector())
home_off <- chains$alpha[, home_code]
home_def <- chains$delta[, home_code]
away_off <- chains$alpha[, away_code]
away_def <- chains$delta[, away_code]
sigma_g <- chains$sigma_g %>% as.vector()
game_int <- pmap_dbl(.l = list(sd = sigma_g), rnorm, n = 1, mean = 0)
home_goals <- exp(mu + eta + home_off + away_def + game_int) %>%
map_dbl(rpois, n = 1)
away_goals <- exp(mu + away_off + home_def + game_int) %>%
map_dbl(rpois, n = 1)
outcomes <- data_frame(
club = c(home, away),
expected_goals = c(mean(home_goals), mean(away_goals)),
prob_win = c(length(which(home_goals > away_goals)),
length(which(away_goals > home_goals))) / length(home_goals),
prob_tie = rep(length(which(home_goals == away_goals)) / length(home_goals),
2),
prob_loss = c(length(which(away_goals > home_goals)),
length(which(home_goals > away_goals))) / length(home_goals)
)
if (visualize) {
heatmap <- data_frame(home_goals, away_goals) %>%
group_by(home_goals, away_goals) %>%
summarize(probability = n() / nrow(.)) %>%
mutate(plot = "Goal Distribution")
histogram <- data_frame(home_goals, away_goals) %>%
mutate(home_mov = home_goals - away_goals) %>%
select(home_mov) %>%
group_by(home_mov) %>%
summarize(probability = n() / nrow(.)) %>%
mutate(plot = "Margin of Victory", winner = ifelse(home_mov > 0, home,
ifelse(home_mov < 0, away, "Tie"))) %>%
mutate(winner = factor(winner, levels = c(home, "Tie", away)))
score_dist <- ggplot(heatmap, aes(x = home_goals, y = away_goals)) +
geom_tile(aes(fill = probability)) +
scale_fill_gradient(name = "Probability") +
scale_x_continuous(breaks = seq(0, 100, 1)) +
scale_y_continuous(breaks = seq(0, 100, 1)) +
labs(x = paste0(home, " Score"), y = paste0(away, " Score")) +
theme_minimal() +
theme(
panel.grid.minor = element_blank(),
legend.position = "bottom"
)
mov_dist <- ggplot(histogram, aes(x = home_mov, y = probability)) +
geom_col(aes(fill = winner)) +
scale_fill_brewer(type = "qual", palette = 3, name = "Winner") +
scale_x_continuous(breaks = seq(-100, 100, 1)) +
scale_y_continuous(breaks = seq(0, 1, 0.05), labels = scales::percent) +
labs(x = paste0(home, " Margin of Victory"), y = "Probability") +
theme_minimal() +
theme(
panel.grid.minor.x = element_blank(),
legend.position = "bottom"
)
plot_list <- list(score_dist, mov_dist)
}
if (visualize) {
list(
predictions = outcomes,
plots = plot_list
)
} else {
list(
predictions = outcomes
)
}
}
D.2 Plot multiple plots
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots <- length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots == 1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
D.3 Predict leagues
predict_league <- function(league, games = full_data, chains = model_params,
team_codes = team_counts) {
lookup <- data_frame(
league = c("Premier League", "Bundesliga", "La Liga", "Ligue 1", "Serie A"),
abbrev = c("Prem", "Bund", "La Liga", "Ligue 1", "Serie A"),
website = c("http://www.espnfc.us/english-premier-league/23/table",
"http://www.espnfc.us/german-bundesliga/10/table",
"http://www.espnfc.us/spanish-primera-division/15/table",
"http://www.espnfc.us/french-ligue-1/9/table",
"http://www.espnfc.us/italian-serie-a/12/table")
)
abbrev <- lookup$abbrev[which(lookup$league == league)]
website <- lookup$website[which(lookup$league == league)]
safe_read_html <- safely(read_html)
cont <- TRUE
while(cont) {
url_data <- safe_read_html(website)
if(is.null(url_data[[1]])) {
closeAllConnections()
Sys.sleep(5)
} else {
url_data <- url_data[[1]]
cont <- FALSE
}
}
league_table <- url_data %>%
html_nodes(css = "table") %>%
html_table()
league_table <- league_table[[1]]
colnames(league_table) <- as.character(league_table[1,])
colnames(league_table) <- make.names(colnames(league_table), unique = TRUE)
league_table <- league_table[-1,]
league_table <- league_table %>%
select(club = TEAM, goals_for = `F`, goals_against = A, points = PTS) %>%
mutate(club = trimws(club, which = "both"), points = as.numeric(points),
goals_for = as.numeric(goals_for),
goals_against = as.numeric(goals_against))
club_names <- league_table$club
future_games <- games %>%
filter(competition == abbrev, date >= ymd(Sys.Date()), home %in% club_names,
away %in% club_names)
data_list <- list(
mu = chains$mu %>% as.vector(),
eta = chains$eta %>% as.vector(),
alpha = t(chains$alpha) %>% as_data_frame() %>% as.list(),
delta = t(chains$delta) %>% as_data_frame() %>% as.list(),
sigma_g = chains$sigma_g %>% as.vector()
)
league_sim <- pmap_df(.l = data_list, .f = function(mu, eta, alpha, delta,
sigma_g, sim_games, sim_league, team_codes) {
for (g in seq_len(nrow(sim_games))) {
home_code <- team_codes$code[which(team_codes$team == sim_games$home[g])]
away_code <- team_codes$code[which(team_codes$team == sim_games$away[g])]
home_off <- alpha[home_code]
home_def <- delta[home_code]
away_off <- alpha[away_code]
away_def <- delta[away_code]
game_int <- rnorm(1, mean = 0, sd = sigma_g)
sim_games$h_goals[g] <- rpois(1, lambda = exp(mu + eta + home_off +
away_def + game_int))
sim_games$a_goals[g] <- rpois(1, lambda = exp(mu + away_off + home_def +
game_int))
}
sim_games <- sim_games %>%
mutate(
home_pts = ifelse(h_goals > a_goals, 3, ifelse(h_goals < a_goals, 0, 1)),
away_pts = ifelse(h_goals > a_goals, 0, ifelse(h_goals < a_goals, 3, 1))
)
pts_total <- bind_rows(
select(sim_games, club = home, sim_pts = home_pts,
sim_goals_for = h_goals, sim_goals_against = a_goals),
select(sim_games, club = away, sim_pts = away_pts,
sim_goals_for = a_goals, sim_goals_against = h_goals)
) %>%
group_by(club) %>%
summarize(sim_pts = sum(sim_pts), sim_goals_for = sum(sim_goals_for),
sim_goals_against = sum(sim_goals_against))
final_league <- left_join(sim_league, pts_total, by = "club") %>%
mutate(
tot_goals_for = goals_for + sim_goals_for,
tot_goals_against = goals_against + sim_goals_against,
goal_diff = tot_goals_for - tot_goals_against,
tot_points = points + sim_pts
) %>%
arrange(desc(tot_points), desc(goal_diff), desc(tot_goals_for))
champ <- final_league$club[1]
final_league %>%
select(club, tot_points) %>%
arrange(club) %>%
spread(key = club, value = tot_points) %>%
mutate(Champion = champ)
}, sim_games = future_games, sim_league = league_table,
team_codes = team_codes)
champions <- data_frame(club = league_sim$Champion) %>%
group_by(club) %>%
summarize(champ_pct = n() / nrow(league_sim)) %>%
arrange(desc(champ_pct))
sim_results <- league_sim %>%
select(-Champion) %>%
gather(key = club, value = points, everything()) %>%
group_by(club) %>%
summarize(sim_points = mean(points)) %>%
arrange(desc(sim_points)) %>%
left_join(champions, by = "club")
league_table %>%
left_join(sim_results, by = "club")
}
D.4 Predict Champions League
predict_ucl <- function(matchups = matchups, games = full_data,
chains = model_params, team_codes = team_counts) {
all_clubs <- flatten_chr(matchups)
future_games <- games %>%
filter(competition == "UCL", home %in% all_clubs, away %in% all_clubs,
date >= ymd(Sys.Date()) - 30)
data_list <- list(
mu = chains$mu %>% as.vector(),
eta = chains$eta %>% as.vector(),
alpha = t(chains$alpha) %>% as_data_frame() %>% as.list(),
delta = t(chains$delta) %>% as_data_frame() %>% as.list(),
sigma_g = chains$sigma_g %>% as.vector()
)
ucl_sim <- pmap_df(.l = data_list, .f = function(mu, eta, alpha, delta,
sigma_g, sim_games, sim_ucl, team_codes) {
results <- as_data_frame(matrix(data = NA, nrow = 1, ncol = 1))
colnames(results) <- paste0("Round_", length(matchups))
repeat {
for (g in seq_len(nrow(sim_games))) {
if (sim_games$date[g] < ymd(Sys.Date())) {
next
}
home_code <- team_codes$code[which(team_codes$team ==
sim_games$home[g])]
away_code <- team_codes$code[which(team_codes$team ==
sim_games$away[g])]
home_off <- alpha[home_code]
home_def <- delta[home_code]
away_off <- alpha[away_code]
away_def <- delta[away_code]
game_int <- rnorm(1, mean = 0, sd = sigma_g)
if (length(matchups) == 1) {
sim_games$h_goals[g] <- rpois(1, lambda = exp(mu + home_off + away_def +
game_int))
sim_games$a_goals[g] <- rpois(1, lambda = exp(mu + away_off + home_def +
game_int))
} else {
sim_games$h_goals[g] <- rpois(1, lambda = exp(mu + eta + home_off +
away_def + game_int))
sim_games$a_goals[g] <- rpois(1, lambda = exp(mu + away_off + home_def +
game_int))
}
}
winners <- map_chr(.x = matchups, .f = function(x, games) {
games <- filter(games, home %in% x, away %in% x)
scores <- bind_rows(
select(games, club = home, h_goals) %>% mutate(a_goals = 0),
select(games, club = away, a_goals) %>% mutate(h_goals = 0)
) %>%
group_by(club) %>%
summarize(h_goals = sum(h_goals), a_goals = sum(a_goals)) %>%
mutate(total_goals = h_goals + a_goals) %>%
arrange(desc(total_goals), desc(a_goals))
if (length(unique(scores$total_goals)) > 1) {
return(scores$club[1])
} else if (length(unique(scores$total_goals)) == 1 &&
length(matchups) == 1) {
return(sample(scores$club, 1))
} else if (length(unique(scores$total_goals)) == 1 &&
length(unique(scores$a_goals)) > 1) {
return(scores$club[1])
} else {
return(sample(scores$club, 1))
}
},
games = sim_games)
results[, paste0("Round_", length(winners))] <- paste(sort(winners),
collapse = ",")
if (length(winners) == 1) {
break
}
if (length(winners) == 2) {
matchups <- list(c(winners[1], winners[2]))
sim_games <- data_frame(date = ymd(Sys.Date()), home = winners[1],
away = winners[2], h_goals = NA, a_goals = NA, competition = "UCL",
home_game = 0)
} else {
new_match <- data_frame(club = winners) %>%
mutate(matchup = sample(rep(1:(length(winners) / 2), 2), length(winners),
replace = FALSE))
matchups <- list_along(seq_len(length(winners) / 2))
new_games <- list_along(seq_len(length(winners) / 2))
for (i in seq_along(matchups)) {
clubs <- new_match$club[which(new_match$matchup == i)]
matchups[[i]] <- clubs
new_games[[i]] <- data_frame(date = ymd(Sys.Date()),
home = sort(clubs), away = rev(sort(clubs)), h_goals = NA,
a_goals = NA, competition = "UCL", home_game = 1)
}
sim_games <- bind_rows(new_games)
}
}
return(results)
}, sim_games = future_games,
sim_ucl = matchups, team_codes = team_codes)
round_results <- list_along(ucl_sim)
names(round_results) <- colnames(ucl_sim)
for (i in seq_along(round_results)) {
round_results[[i]] <- paste(ucl_sim[[i]], collapse = ",") %>%
strsplit(",") %>%
unlist()
}
total_sim <- length(round_results$Round_1)
final_results <- data_frame(club = sort(flatten_chr(matchups)),
Round_8 = 1, Round_4 = 1, Round_2 = 1, Round_1 = 1)
for (i in seq_len(nrow(final_results))) {
club <- final_results$club[i]
for (r in seq_along(round_results)) {
final_results[[names(round_results)[r]]][i] <-
length(which(round_results[[r]] == club)) / total_sim
}
}
final_results %>%
select(Club = club, Quarterfinals = Round_8, Semifinals = Round_4,
Final = Round_2, Champion = Round_1) %>%
arrange(desc(Champion))
}