B Web Scraping Functions

B.1 Scrape league pages

#' Scrape a league standings page
#'
#' Downloads the page at `x`, reads the first HTML table on it as the league
#' standings and attaches the link found for each club in the page's table
#' cells.
#'
#' @param x URL of the league page; handed to `safe_read_html()`.
#' @param max_tries Maximum number of download attempts (>= 1) before an error
#'   is raised. The default, `Inf`, keeps retrying forever, which is the
#'   historical behaviour.
#' @return A tibble with columns `club`, `goals_for`, `goals_against`,
#'   `points` and `club_url` (`NA` when no link text matches the club name).
scrape_league <- function(x, max_tries = Inf) {
  # `safe_read_html()` is defined elsewhere; its first element is NULL when
  # the download failed (presumably a purrr::safely() wrapper -- confirm).
  attempt <- 0
  repeat {
    attempt <- attempt + 1
    page <- safe_read_html(x)[[1]]
    if (!is.null(page)) {
      break
    }
    closeAllConnections()
    if (attempt >= max_tries) {
      stop(
        "Failed to read '", x, "' after ", attempt, " attempt(s).",
        call. = FALSE
      )
    }
    Sys.sleep(5)
  }

  tables <- page %>%
    html_nodes(css = "table") %>%
    html_table()
  if (length(tables) == 0) {
    stop("No table found on '", x, "'.", call. = FALSE)
  }

  # Only the first table is used. Its first row carries the column labels, so
  # promote that row to syntactically valid, unique names and then drop it.
  standings <- tables[[1]]
  colnames(standings) <- make.names(
    as.character(standings[1, ]),
    unique = TRUE
  )
  standings <- standings[-1, , drop = FALSE]

  standings <- standings %>%
    select(club = TEAM, goals_for = `F`, goals_against = A, points = PTS) %>%
    mutate(club = trimws(club, which = "both"))

  # Every link inside a table cell is a candidate club link; text and href are
  # read from the same node set so they stay aligned.
  anchors <- html_nodes(page, "td a")
  club_links <- tibble(
    club = trimws(as.character(html_text(anchors)), which = "both"),
    club_url = as.character(html_attr(anchors, "href"))
  ) %>%
    # Repeated identical links would otherwise duplicate rows in the join.
    distinct()

  standings %>%
    left_join(club_links, by = "club") %>%
    as_tibble()
}

B.2 Scrape international cups

#' Scrape the clubs taking part in an international cup
#'
#' Downloads the page at `x`, collects the `TEAM` column of every HTML table
#' on it and attaches the link found for each club in the page's table cells.
#'
#' @param x URL of the cup page; handed to `safe_read_html()`.
#' @param max_tries Maximum number of download attempts (>= 1) before an error
#'   is raised. The default, `Inf`, keeps retrying forever, which is the
#'   historical behaviour.
#' @return A tibble with columns `club` and `club_url` (`NA` when no link text
#'   matches the club name).
scrape_major_cup <- function(x, max_tries = Inf) {
  # `safe_read_html()` is defined elsewhere; its first element is NULL when
  # the download failed (presumably a purrr::safely() wrapper -- confirm).
  attempt <- 0
  repeat {
    attempt <- attempt + 1
    page <- safe_read_html(x)[[1]]
    if (!is.null(page)) {
      break
    }
    closeAllConnections()
    if (attempt >= max_tries) {
      stop(
        "Failed to read '", x, "' after ", attempt, " attempt(s).",
        call. = FALSE
      )
    }
    Sys.sleep(5)
  }

  group_tables <- page %>%
    html_nodes(css = "table") %>%
    html_table()

  # In each table the first row carries the column labels: promote it to
  # syntactically valid, unique names, drop it, and keep only the club column.
  clubs <- map(group_tables, function(tbl) {
    colnames(tbl) <- make.names(as.character(tbl[1, ]), unique = TRUE)
    tbl <- tbl[-1, , drop = FALSE]
    select(tbl, club = TEAM)
  }) %>%
    bind_rows() %>%
    mutate(club = trimws(club, which = "both"))

  # Every link inside a table cell is a candidate club link; text and href are
  # read from the same node set so they stay aligned.
  anchors <- html_nodes(page, "td a")
  club_links <- tibble(
    club = trimws(as.character(html_text(anchors)), which = "both"),
    club_url = as.character(html_attr(anchors, "href"))
  ) %>%
    # Repeated identical links would otherwise duplicate rows in the join.
    distinct()

  clubs %>%
    left_join(club_links, by = "club") %>%
    as_tibble()
}

B.3 Scrape domestic cups

#' Scrape the clubs listed on a domestic cup page
#'
#' Downloads the page at `x` and returns the text and target of every link
#' inside the element with id `stats-fair-play`.
#'
#' @param x URL of the cup page; handed to `safe_read_html()`.
#' @param max_tries Maximum number of download attempts (>= 1) before an error
#'   is raised. The default, `Inf`, keeps retrying forever, which is the
#'   historical behaviour.
#' @return A tibble with one row per link and columns `club` (trimmed link
#'   text) and `club_url` (the link's `href`).
scrape_dom_cup <- function(x, max_tries = Inf) {
  # `safe_read_html()` is defined elsewhere; its first element is NULL when
  # the download failed (presumably a purrr::safely() wrapper -- confirm).
  attempt <- 0
  repeat {
    attempt <- attempt + 1
    page <- safe_read_html(x)[[1]]
    if (!is.null(page)) {
      break
    }
    closeAllConnections()
    if (attempt >= max_tries) {
      stop(
        "Failed to read '", x, "' after ", attempt, " attempt(s).",
        call. = FALSE
      )
    }
    Sys.sleep(5)
  }

  # Query the page once so link text and href come from the same node set.
  anchors <- html_nodes(page, "#stats-fair-play a")

  tibble(
    club = trimws(as.character(html_text(anchors)), which = "both"),
    club_url = as.character(html_attr(anchors, "href"))
  )
}

B.4 Scrape games

#' Scrape a club's fixtures page
#'
#' Rewrites the club URL from its `/index` form to the `/fixtures` form,
#' downloads that page and assembles one row per game (date, teams, scores and
#' competition).
#'
#' @param x URL of the club page; `/index` is replaced by `/fixtures` before
#'   the page is requested through `safe_read_html()`.
#' @param y Club name, used for the `club` column and as the replacement for
#'   the short name "Sporting" (see below).
#' @param max_tries Maximum number of download attempts (>= 1) before an error
#'   is raised. The default, `Inf`, keeps retrying forever, which is the
#'   historical behaviour.
#' @return A one-row tibble with columns `club`, `abbrev` (the team name that
#'   occurs most often on the page) and `team_data` (a list column holding the
#'   games tibble, or `NA` when fewer than three games were found).
scrape_team <- function(x, y, max_tries = Inf) {
  x <- gsub("/index", "/fixtures", x, fixed = TRUE)

  # `safe_read_html()` is defined elsewhere; its first element is NULL when
  # the download failed (presumably a purrr::safely() wrapper -- confirm).
  attempt <- 0
  repeat {
    attempt <- attempt + 1
    page <- safe_read_html(x)[[1]]
    if (!is.null(page)) {
      break
    }
    closeAllConnections()
    if (attempt >= max_tries) {
      stop(
        "Failed to read '", x, "' after ", attempt, " attempt(s).",
        call. = FALSE
      )
    }
    Sys.sleep(5)
  }

  # Text of every node matching a CSS selector, as a character vector.
  node_text <- function(css) {
    as.character(html_text(html_nodes(page, css)))
  }

  # Score cells: strip spaces and any parenthesised part, then convert to
  # numeric (anything that is not a number becomes NA).
  node_score <- function(css) {
    scores <- gsub(" ", "", node_text(css))
    scores <- gsub(" *\\(.*?\\) *", "", scores)
    as.numeric(scores)
  }

  # A game in progress is headed "LIVE" instead of a date; give it today's
  # date in the same month-day-year layout so that `mdy()` can parse it.
  date <- node_text(".headline")
  date[which(date == "LIVE")] <- format(Sys.Date(), "%b %d, %Y")
  date <- mdy(date)

  team_data <- tibble(
    date = date,
    home = node_text(".score-home-team .team-name"),
    away = node_text(".score-away-team .team-name"),
    home_goals = node_score(".home-score"),
    away_goals = node_score(".away-score"),
    competition = node_text(".score-column.score-competition")
  ) %>%
    arrange(date) %>%
    distinct()

  # Too few games to work with: return a placeholder row.
  if (nrow(team_data) < 3) {
    return(tibble(club = y, abbrev = y, team_data = NA))
  }

  # The club itself is the name that appears most often across both sides.
  # `which.max()` always yields exactly one name, even when counts are tied,
  # so the scalar comparison below is safe.
  name_counts <- table(c(team_data$home, team_data$away))
  abbrev <- names(name_counts)[which.max(name_counts)]

  if (abbrev == "Sporting") {
    # The page names this club just "Sporting": substitute the full club
    # name (presumably several clubs share that short name -- confirm).
    team_data$home[which(team_data$home == "Sporting")] <- y
    team_data$away[which(team_data$away == "Sporting")] <- y
    abbrev <- y
  } else {
    # For every other club, games against a "Sporting" side are dropped.
    team_data <- filter(team_data, home != "Sporting", away != "Sporting")
  }

  tibble(
    club = y,
    abbrev = abbrev,
    team_data = list(team_data)
  )
}