Packages and Data

library(tools)
library(stringi)
library(gtsummary)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(predtools)
library(magrittr)
library(probably)
## 
## Attaching package: 'probably'
## The following objects are masked from 'package:base':
## 
##     as.factor, as.ordered
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tibble       3.2.1
## ✔ dplyr        1.1.4     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()         masks scales::discard()
## ✖ tidyr::extract()         masks magrittr::extract()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ purrr::lift()            masks caret::lift()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ purrr::set_names()       masks magrittr::set_names()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(caTools)
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:gtsummary':
## 
##     select
library(rpart)
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
library(ranger)
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
## 
##     tune
## The following object is masked from 'package:rsample':
## 
##     permutations
## The following object is masked from 'package:parsnip':
## 
##     tune
library(nnet)
library(dplyr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggsoccer)
library(ggtext)
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
## 
##     viridis_pal
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(rms)
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following object is masked from 'package:e1071':
## 
##     impute
## The following object is masked from 'package:parsnip':
## 
##     translate
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
shots_dataset <- read.csv("~/Cose Mie/shots_dataset.csv")
FullData <- read.csv("~/Cose Mie/FullData.csv")
players <- read.csv("~/Cose Mie/players.csv")
new.players <- read.csv("~/Cose Mie/new.players.csv")

kable(head(shots_dataset, 3), "html") %>%
  kable_styling(full_width = T)
id minute result X Y xG player h_a player_id situation season shotType match_id h_team a_team h_goals a_goals date player_assisted lastAction
445975 0 MissedShots 0.759 0.644 0.0186014 Jorrit Hendrix h 9392 OpenPlay 2021 LeftFoot 16265 Spartak Moscow FK Akhmat 2 1 2021-12-04 14:00:00 Quincy Promes Pass
445976 6 MissedShots 0.738 0.352 0.0136572 Jorrit Hendrix h 9392 FromCorner 2021 LeftFoot 16265 Spartak Moscow FK Akhmat 2 1 2021-12-04 14:00:00 None
445977 9 SavedShot 0.732 0.250 0.0310316 Quincy Promes h 2815 DirectFreekick 2021 RightFoot 16265 Spartak Moscow FK Akhmat 2 1 2021-12-04 14:00:00 Standard

Data Cleaning

shots_dataset$player <- stri_trans_general(shots_dataset$player, "Latin-ASCII")
giocatori <- as.data.frame(unique(shots_dataset$player))
kable(head(giocatori, 3), "html") %>%
  kable_styling(full_width = T)
unique(shots_dataset$player)
Jorrit Hendrix
Quincy Promes
Victor Moses
names(giocatori)[names(giocatori) == "unique(shots_dataset$player)"] <- "Name"
foot_dataset <- FullData[, c("Name", "Preffered_Foot")]
foot_dataset$Name <- stri_trans_general(foot_dataset$Name, "Latin-ASCII")
kable(head(foot_dataset, 3), "html") %>%
  kable_styling(full_width = T)
Name Preffered_Foot
Cristiano Ronaldo Right
Lionel Messi Left
Neymar Right
giocatori2 <- merge(giocatori, foot_dataset, by = "Name", all.x = TRUE)
kable(head(giocatori2, 3), "html") %>%
  kable_styling(full_width = T)
Name Preffered_Foot
Aaron Connolly NA
Aaron Cresswell Left
Aaron Hickey NA
colSums((is.na(giocatori2)))
##           Name Preffered_Foot 
##              0           2499
player2 <- players[, c("name", "foot")]
kable(head(player2, 3), "html") %>%
  kable_styling(full_width = T)
name foot
Timo Hildebrand
Martin Petrov
Martin Amedick
player2 <- na.omit(player2)
player2$name <- stri_trans_general(player2$name, "Latin-ASCII")
names(player2)[names(player2) == "name"] <- "Name"
giocatori3 <- merge(giocatori2, player2, by = "Name", all.x = TRUE)
kable(head(giocatori3, 3), "html") %>%
  kable_styling(full_width = T)
Name Preffered_Foot foot
Aaron Connolly NA right
Aaron Cresswell Left left
Aaron Hickey NA both
giocatori3$Preffered_Foot <- ifelse(is.na(giocatori3$Preffered_Foot), giocatori3$foot, giocatori3$Preffered_Foot)
colSums(is.na(giocatori3))
##           Name Preffered_Foot           foot 
##              0            514            651
new.players <- new.players[, c("name", "foot")]
##
nomi_da_cambiare <- c("Andrej Galabinov", "Anssumane Fati", "Arnaud Kalimuendo Muinga", "Daniel Parejo", "Emile Smith-Rowe", "Kephren Thuram", "Kylian Mbappe-Lottin", "Marco Faraoni", "Nemanja Vidci", "Pablo Daniel Osvaldo", "Luca Toni", "Tanguy NDombele Alvaro", "Thiago Alcantara", "Yeremi Pino", "Papis Demba Cisse", "Kevin Kuranyi", "Dimitar Berbatov")

giocatori3 <- giocatori3 %>%
  mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare, "Right", Preffered_Foot))
giocatori3 <- subset(giocatori3, select = -c(foot))

nomi_da_cambiare2 <- c("Dimitri Kombarov", "Lee Kang-In", "Mohammed Ali-Cho", "Yaroslav Rakitskiy", "Pape Alassane Gueye")

giocatori3 <- giocatori3 %>%
  mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare2, "Left", Preffered_Foot))

nomi_da_cambiare3 <- c("Kostas Mitroglu", "Santiago Cazorla", "Son Heung-Min")

giocatori3 <- giocatori3 %>%
  mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare3, "both", Preffered_Foot))

giocatori3 <- giocatori3 %>%
  mutate(across(everything(), na_if, ""))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(everything(), na_if, "")`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
giocatori3$Preffered_Foot <- toTitleCase(giocatori3$Preffered_Foot)

giocatori3$Preffered_Foot <- ifelse(giocatori3$Preffered_Foot == "Right", "RightFoot", ifelse(giocatori3$Preffered_Foot == "Left", "LeftFoot", giocatori3$Preffered_Foot))

names(giocatori3)[names(giocatori3) == "Name"] <- "player"

shots_dataset2 <- merge(shots_dataset, giocatori3, by = "player", all.x = TRUE)

shots_dataset2 <- shots_dataset2 %>%
  mutate(Preffered_Foot = ifelse(player == "Franck Zambo", "Right", Preffered_Foot))

shots_dataset2 <- shots_dataset2[shots_dataset2$result != "OwnGoal", ]

shots_dataset2$result <- ifelse(shots_dataset2$result == "Goal", 1, 0)

shots_dataset2$result <- as.factor(shots_dataset2$result)

shots_dataset2 <- na.omit(shots_dataset2)

names(shots_dataset2)[names(shots_dataset2) == "xG"] <- "xG Understat"

kable(head(shots_dataset2, 3), "html") %>%
  kable_styling(full_width = T)
player id minute result X Y xG Understat h_a player_id situation season shotType match_id h_team a_team h_goals a_goals date player_assisted lastAction Preffered_Foot
Aaron Connolly 427269 51 0 0.913 0.536 0.5629963 h 7991 OpenPlay 2021 LeftFoot 16391 Brighton Watford 2 0 2021-08-21 16:30:00 None RightFoot
Aaron Connolly 329180 49 0 0.944 0.383 0.0140796 h 7991 FromCorner 2019 Head 11746 Brighton Norwich 2 0 2019-11-02 15:00:00 Pascal Groß Aerial RightFoot
Aaron Connolly 338614 91 0 0.797 0.377 0.0308471 h 7991 OpenPlay 2019 RightFoot 11815 Brighton Sheffield United 0 1 2019-12-21 15:00:00 None RightFoot
premier_league <- c("Watford", "Norwich", "Sheffield United", "Brighton", "Southampton", "Chelsea", "Liverpool", "Manchester United", "Wolverhampton Wanderers", "Tottenham", "Aston Villa", "Burnley", "Swansea", "West Ham", "Leeds", "Crystal Palace", "Manchester City", "Newcastle United", "Fulham", "Hull", "Arsenal", "Bournemouth", "West Bromwich Albion", "Stoke", "Sunderland", "Huddersfield", "Everton", "Leicester", "Cardiff", "Middlesbrough", "Queens Park Rangers", "Brentford")
seriea <- c("Bologna", "Genoa", "Salernitana", "AC Milan", "Fiorentina", "Cagliari", "Lazio", "Juventus", "Parma Calcio 1913", "Crotone", "SPAL 2013", "Sassuolo", "Lecce", "Udinese", "Roma", "Torino", "Inter", "Brescia", "Verona", "Parma", "Palermo", "Napoli", "Carpi", "Frosinone", "Venezia", "Sampdoria", "Benevento", "Spezia", "Empoli", "Chievo", "Atalanta", "Cesena", "Pescara")
bundesliga <- c("Hamburger SV", "Wolfsburg", "Eintracht Frankfurt", "RasenBallsport Leipzig", "Freiburg", "Bayern Munich", "Bayer Leverkusen", "Schalke 04", "Hertha Berlin", "Werder Bremen", "VfB Stuttgart", "Darmstadt", "Bochum", "Mainz 05", "Hannover 96", "Arminia Bielefeld", "Fortuna Duesseldorf", "Augsburg", "Hoffenheim", "Borussia M.Gladbach", "Paderborn", "FC Cologne", "Borussia Dortmund", "Greuther Fuerth", "Nuernberg", "Ingolstadt", "Union Berlin")
ligue1 <- c("Toulouse", "Marseille", "Nice", "Bordeaux", "Saint-Etienne", "Lille", "Strasbourg", "Guingamp", "Metz", "Brest", "Lyon", "Montpellier", "Reims", "Dijon", "Nantes", "Angers", "Rennes", "SC Bastia", "Paris Saint Germain", "Monaco", "Caen", "GFC Ajaccio", "Nimes", "Lorient", "Amiens", "Lens", "Troyes", "Evian Thonon Gaillard", "Nancy", "Clermont Foot")
liga <- c("Real Betis", "Celta Vigo", "Alaves", "Levante", "Elche", "Granada", "Deportivo La Coruna", "Villarreal", "Cordoba", "Athletic Club", "Rayo Vallecano", "Mallorca", "Real Valladolid", "Real Sociedad", "Sevilla", "Atletico Madrid", "Real Madrid", "Eibar", "Getafe", "Barcelona", "Malaga", "Girona", "Las Palmas", "Valencia", "SD Huesca", "Sporting Gijon", "Osasuna", "Cadiz", "Leganes", "Almeria")
shots_dataset2$lega <- ifelse(shots_dataset2$h_team %in% premier_league | shots_dataset2$a_team %in% premier_league, "Premier League",
                              ifelse(shots_dataset2$h_team %in% seriea | shots_dataset2$a_team %in% seriea, "Serie A", 
                                     ifelse(shots_dataset2$h_team %in% bundesliga | shots_dataset2$a_team %in% bundesliga, "Bundesliga", 
                                            ifelse(shots_dataset2$h_team %in% ligue1 | shots_dataset2$a_team %in% ligue1, "Ligue 1", 
                                                   ifelse(shots_dataset2$h_team %in% liga | shots_dataset2$a_team %in% liga, "Liga", "Russian PL")))))
subset_penalty <- subset(shots_dataset2, situation == "Penalty")

subset_openplay <- subset(shots_dataset2, situation == "OpenPlay")

subset_setpiece <- subset(shots_dataset2, situation %in% c("SetPiece", "FromCorner", "DirectFreekick"))

kable(head(subset_openplay, 3), "html") %>%
  kable_styling(full_width = T)
player id minute result X Y xG Understat h_a player_id situation season shotType match_id h_team a_team h_goals a_goals date player_assisted lastAction Preffered_Foot lega
1 Aaron Connolly 427269 51 0 0.913 0.536 0.5629963 h 7991 OpenPlay 2021 LeftFoot 16391 Brighton Watford 2 0 2021-08-21 16:30:00 None RightFoot Premier League
3 Aaron Connolly 338614 91 0 0.797 0.377 0.0308471 h 7991 OpenPlay 2019 RightFoot 11815 Brighton Sheffield United 0 1 2019-12-21 15:00:00 None RightFoot Premier League
4 Aaron Connolly 403235 49 0 0.847 0.382 0.1247877 a 7991 OpenPlay 2020 RightFoot 14656 Burnley Brighton 1 1 2021-02-06 15:00:00 Yves Bissouma Tackle RightFoot Premier League
openplay_foot <- subset(subset_openplay, shotType %in% c("LeftFoot", "RightFoot"))
openplay_head <- subset(subset_openplay, shotType == "Head")

setpiece_foot <- subset(subset_setpiece, shotType %in% c("LeftFoot", "RightFoot"))
setpiece_head <- subset(subset_setpiece, shotType == "Head")

tbl_summary(openplay_foot, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic N = 288,8871
minute 49 (26, 71)
result
    0 260,327 (90%)
    1 28,560 (9.9%)
X 0.85 (0.77, 0.89)
Y 0.50 (0.39, 0.62)
shotType
    LeftFoot 106,100 (37%)
    RightFoot 182,787 (63%)
lastAction
    Aerial 3,641 (1.3%)
    BallRecovery 9,647 (3.3%)
    BallTouch 5,416 (1.9%)
    BlockedPass 472 (0.2%)
    Card 27 (<0.1%)
    Challenge 29 (<0.1%)
    ChanceMissed 2 (<0.1%)
    Chipped 15,477 (5.4%)
    Clearance 47 (<0.1%)
    CornerAwarded 197 (<0.1%)
    Cross 16,612 (5.8%)
    Dispossessed 2,383 (0.8%)
    End 104 (<0.1%)
    Error 3 (<0.1%)
    FormationChange 12 (<0.1%)
    Foul 272 (<0.1%)
    Goal 143 (<0.1%)
    GoodSkill 69 (<0.1%)
    HeadPass 6,590 (2.3%)
    Interception 526 (0.2%)
    KeeperPickup 7 (<0.1%)
    KeeperSweeper 1 (<0.1%)
    LayOff 3,836 (1.3%)
    None 32,994 (11%)
    OffsidePass 81 (<0.1%)
    OffsideProvoked 7 (<0.1%)
    Pass 148,079 (51%)
    Punch 1 (<0.1%)
    Rebound 11,636 (4.0%)
    Save 13 (<0.1%)
    ShieldBallOpp 2 (<0.1%)
    Smother 1 (<0.1%)
    Start 10 (<0.1%)
    SubstitutionOn 36 (<0.1%)
    Tackle 969 (0.3%)
    TakeOn 21,584 (7.5%)
    Throughball 7,961 (2.8%)
1 Median (Q1, Q3); n (%)
tbl_summary(openplay_head, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic N = 30,8551
minute 49 (26, 73)
result
    0 27,181 (88%)
    1 3,674 (12%)
X 0.911 (0.890, 0.929)
Y 0.50 (0.46, 0.55)
shotType
    Head 30,855 (100%)
lastAction
    Aerial 9,053 (29%)
    BallRecovery 8 (<0.1%)
    BallTouch 65 (0.2%)
    BlockedPass 7 (<0.1%)
    Card 6 (<0.1%)
    Challenge 10 (<0.1%)
    Chipped 2,743 (8.9%)
    Clearance 2 (<0.1%)
    CornerAwarded 59 (0.2%)
    Cross 16,320 (53%)
    Dispossessed 3 (<0.1%)
    End 37 (0.1%)
    Error 2 (<0.1%)
    FormationChange 1 (<0.1%)
    Foul 67 (0.2%)
    Goal 26 (<0.1%)
    HeadPass 392 (1.3%)
    Interception 7 (<0.1%)
    KeeperPickup 3 (<0.1%)
    LayOff 6 (<0.1%)
    None 846 (2.7%)
    OffsidePass 7 (<0.1%)
    Pass 315 (1.0%)
    Rebound 735 (2.4%)
    Save 1 (<0.1%)
    SubstitutionOff 1 (<0.1%)
    SubstitutionOn 11 (<0.1%)
    Tackle 12 (<0.1%)
    TakeOn 21 (<0.1%)
    Throughball 89 (0.3%)
1 Median (Q1, Q3); n (%)
tbl_summary(setpiece_foot, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic N = 69,4751
minute 49 (26, 71)
result
    0 64,065 (92%)
    1 5,410 (7.8%)
X 0.80 (0.75, 0.89)
Y 0.51 (0.41, 0.61)
shotType
    LeftFoot 24,145 (35%)
    RightFoot 45,330 (65%)
lastAction
    Aerial 3,397 (4.9%)
    BallRecovery 65 (<0.1%)
    BallTouch 1,316 (1.9%)
    BlockedPass 24 (<0.1%)
    Card 14 (<0.1%)
    Challenge 9 (<0.1%)
    Chipped 1,528 (2.2%)
    Clearance 5 (<0.1%)
    CornerAwarded 154 (0.2%)
    Cross 5,988 (8.6%)
    Dispossessed 118 (0.2%)
    End 103 (0.1%)
    Error 2 (<0.1%)
    FormationChange 3 (<0.1%)
    Foul 139 (0.2%)
    Goal 55 (<0.1%)
    GoodSkill 3 (<0.1%)
    HeadPass 2,736 (3.9%)
    Interception 47 (<0.1%)
    KeeperPickup 4 (<0.1%)
    KeeperSweeper 1 (<0.1%)
    LayOff 182 (0.3%)
    None 16,313 (23%)
    OffsidePass 9 (<0.1%)
    OffsideProvoked 1 (<0.1%)
    Pass 9,535 (14%)
    Rebound 5,585 (8.0%)
    Save 5 (<0.1%)
    Standard 20,936 (30%)
    Start 1 (<0.1%)
    SubstitutionOn 7 (<0.1%)
    Tackle 93 (0.1%)
    TakeOn 987 (1.4%)
    Throughball 110 (0.2%)
1 Median (Q1, Q3); n (%)
tbl_summary(setpiece_head, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic N = 45,8331
minute 49 (26, 71)
result
    0 41,601 (91%)
    1 4,232 (9.2%)
X 0.917 (0.899, 0.934)
Y 0.50 (0.46, 0.55)
shotType
    Head 45,833 (100%)
lastAction
    Aerial 16,390 (36%)
    BallRecovery 13 (<0.1%)
    BallTouch 157 (0.3%)
    BlockedPass 42 (<0.1%)
    Card 23 (<0.1%)
    Challenge 16 (<0.1%)
    Chipped 2,327 (5.1%)
    Clearance 9 (<0.1%)
    CornerAwarded 146 (0.3%)
    Cross 23,526 (51%)
    Dispossessed 16 (<0.1%)
    End 98 (0.2%)
    Error 1 (<0.1%)
    FormationChange 2 (<0.1%)
    Foul 160 (0.3%)
    Goal 93 (0.2%)
    HeadPass 918 (2.0%)
    Interception 16 (<0.1%)
    KeeperPickup 4 (<0.1%)
    LayOff 6 (<0.1%)
    None 894 (2.0%)
    OffsidePass 35 (<0.1%)
    OffsideProvoked 2 (<0.1%)
    Pass 162 (0.4%)
    Rebound 710 (1.5%)
    Save 2 (<0.1%)
    ShieldBallOpp 3 (<0.1%)
    Standard 2 (<0.1%)
    Start 1 (<0.1%)
    SubstitutionOn 9 (<0.1%)
    Tackle 20 (<0.1%)
    TakeOn 25 (<0.1%)
    Throughball 5 (<0.1%)
1 Median (Q1, Q3); n (%)

Removing last action <10

openplay_foot <- subset(openplay_foot, !lastAction %in% c("ChanceMissed", "Error", "KeeperPickup", "KeeperSweeper", "OffsideProvoked", "Punch", "ShieldBallOpp", "Smother"))
openplay_head <- subset(openplay_head, !lastAction %in% c("BallRecovery", "BlockedPass", "Card", "Clearance", "Dispossessed", "Error", "FormationChange", "Interception", "KeeperPickup", "LayOff", "OffsidePass", "Save", "SubstiotutionOff"))
setpiece_foot <- subset(setpiece_foot, !lastAction %in% c("Challenge", "Clearance", "Error", "FormationChange", "KeeperPickup", "KeeperSweeper", "OffsidePass", "OffsideProvoked", "Save", "Start", "SubstiotutionOn"))
setpiece_head <- subset(setpiece_head, !lastAction %in% c("Clearance", "Error", "FormationChange", "KeeperPickup", "LayOff", "OffsideProvoked", "Save", "ShieldBallOpp", "Standard", "Start", "SubstiotutionOn", "Throughball"))

Introduction of is_weakfoot variable

openplay_foot$is_weakfoot <- ifelse(openplay_foot$Preffered_Foot == openplay_foot$shotType, "No", ifelse(openplay_foot$Preffered_Foot == "both", "No", "Yes"))
setpiece_foot$is_weakfoot <- ifelse(setpiece_foot$Preffered_Foot == setpiece_foot$shotType, "No", ifelse(setpiece_foot$Preffered_Foot == "both", "No", "Yes"))

understat_xG_op_foot <- openplay_foot$xG
understat_xG_op_head <- openplay_head$xG
understat_xG_sp_foot <- setpiece_foot$xG
understat_xG_sp_head <- setpiece_head$xG

kable(head(openplay_foot, 3), "html") %>%
  kable_styling(full_width = T)
player id minute result X Y xG Understat h_a player_id situation season shotType match_id h_team a_team h_goals a_goals date player_assisted lastAction Preffered_Foot lega is_weakfoot
1 Aaron Connolly 427269 51 0 0.913 0.536 0.5629963 h 7991 OpenPlay 2021 LeftFoot 16391 Brighton Watford 2 0 2021-08-21 16:30:00 None RightFoot Premier League Yes
3 Aaron Connolly 338614 91 0 0.797 0.377 0.0308471 h 7991 OpenPlay 2019 RightFoot 11815 Brighton Sheffield United 0 1 2019-12-21 15:00:00 None RightFoot Premier League No
4 Aaron Connolly 403235 49 0 0.847 0.382 0.1247877 a 7991 OpenPlay 2020 RightFoot 14656 Burnley Brighton 1 1 2021-02-06 15:00:00 Yves Bissouma Tackle RightFoot Premier League No

Data Modeling

Key

For better interpretation, this is how I refer to the different models:

For shot type, the numbering is as follows:

  1. Open Play Foot
  2. Open Play Head
  3. Set Piece Foot
  4. Set Piece Head

For modeling, the numbering is as follows: 1. Logit 2. Logit with interactions 3. Discriminant Analysis 4. Random Forest 5. Bagging 8. Neural Network

For example model1.5 is bagging model for Open Play Foot

Note: For brevity, the stepwise procedure is not shown in this report. Where there is no model with interactions, it is because there is no significant improvement that would justify such a model, which is very costly in terms of computational time.

Open Play Foot Model

set.seed(123)
index_opf <- sample.split(Y = openplay_foot$result, SplitRatio = 0.75)
train_opf <- openplay_foot[index_opf, ]

indice_test_opf <-  which(index_opf == FALSE)
test_opf <- openplay_foot[indice_test_opf, ] 

Logit

modello1.1 <- glm(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, 
                                         data = train_opf, family = binomial)
summary(modello1.1)
## 
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
##     exp(Y^2) + h_a, family = binomial, data = train_opf)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.327e+01  1.594e-01 -83.287  < 2e-16 ***
## minute                     1.444e-03  2.824e-04   5.113 3.16e-07 ***
## lastActionBallRecovery     8.943e-01  1.114e-01   8.029 9.85e-16 ***
## lastActionBallTouch        4.734e-01  1.169e-01   4.049 5.13e-05 ***
## lastActionBlockedPass      6.401e-01  2.185e-01   2.929 0.003400 ** 
## lastActionCard             1.393e-01  7.542e-01   0.185 0.853428    
## lastActionChallenge        4.390e-01  1.049e+00   0.418 0.675667    
## lastActionChipped          3.860e-01  1.055e-01   3.660 0.000253 ***
## lastActionClearance        1.148e-01  7.561e-01   0.152 0.879348    
## lastActionCornerAwarded   -2.692e+00  1.010e+00  -2.666 0.007675 ** 
## lastActionCross            4.258e-01  1.040e-01   4.095 4.23e-05 ***
## lastActionDispossessed     5.274e-01  1.384e-01   3.811 0.000139 ***
## lastActionEnd             -2.009e+00  1.015e+00  -1.978 0.047884 *  
## lastActionFormationChange  8.962e-01  1.093e+00   0.820 0.412368    
## lastActionFoul             3.885e-03  2.842e-01   0.014 0.989093    
## lastActionGoal            -1.739e+00  7.240e-01  -2.402 0.016307 *  
## lastActionGoodSkill        1.241e+00  4.443e-01   2.793 0.005219 ** 
## lastActionHeadPass         4.361e-01  1.135e-01   3.842 0.000122 ***
## lastActionInterception     7.496e-01  2.350e-01   3.190 0.001422 ** 
## lastActionLayOff           6.903e-01  1.306e-01   5.285 1.26e-07 ***
## lastActionNone             5.927e-01  1.031e-01   5.747 9.08e-09 ***
## lastActionOffsidePass     -2.190e-02  5.418e-01  -0.040 0.967758    
## lastActionPass             6.951e-01  1.010e-01   6.884 5.83e-12 ***
## lastActionRebound          1.110e+00  1.042e-01  10.652  < 2e-16 ***
## lastActionSave            -7.818e+00  3.653e+01  -0.214 0.830549    
## lastActionStart            2.035e+00  1.197e+00   1.699 0.089265 .  
## lastActionSubstitutionOn   1.017e+00  5.857e-01   1.736 0.082476 .  
## lastActionTackle           7.795e-01  1.732e-01   4.501 6.76e-06 ***
## lastActionTakeOn           7.280e-01  1.039e-01   7.009 2.40e-12 ***
## lastActionThroughball      1.488e+00  1.047e-01  14.216  < 2e-16 ***
## is_weakfootYes            -1.602e-01  1.742e-02  -9.194  < 2e-16 ***
## X                          1.291e+01  1.289e-01 100.099  < 2e-16 ***
## exp(Y^2)                  -5.834e-01  4.082e-02 -14.294  < 2e-16 ***
## h_ah                       2.449e-02  1.511e-02   1.621 0.105116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 139769  on 216646  degrees of freedom
## Residual deviance: 123770  on 216613  degrees of freedom
## AIC: 123838
## 
## Number of Fisher Scoring iterations: 9
predict1.1 <- predict.glm(modello1.1, newdata = test_opf, type = "response")

roc1.1 <- roc(test_opf$result ~ predict1.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

confusione1.1 <- confusionMatrix(as.factor(ifelse(predict1.1>0.3, 1, 0)), as.factor(test_opf$result)) 
confusione1.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 63353  6307
##          1  1723   833
##                                           
##                Accuracy : 0.8888          
##                  95% CI : (0.8865, 0.8911)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1263          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9735          
##             Specificity : 0.1167          
##          Pos Pred Value : 0.9095          
##          Neg Pred Value : 0.3259          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8773          
##    Detection Prevalence : 0.9646          
##       Balanced Accuracy : 0.5451          
##                                           
##        'Positive' Class : 0               
## 
rms::val.prob(as.numeric(as.character(predict1.1)), as.numeric(as.character(test_opf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.937122e-01  7.468561e-01  1.422587e-01  7.001681e-02  5.057334e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  3.125414e-05  4.257049e+00  1.190128e-01  6.998556e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.249545e-02 -5.805545e-02  9.688800e-01  1.046446e-01  5.777724e-03 
##          Eavg           S:z           S:p 
##  2.633787e-03  9.329611e-01  3.508401e-01

Logit with interactions

modello1.2 <- glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
                    exp(Y^2) + h_a + lastAction:X + is_weakfoot:exp(Y^2) + X:exp(Y^2) + 
                    is_weakfoot:X + exp(Y^2):h_a + minute:lastAction + minute:h_a + 
                    minute:exp(Y^2) + lastAction:h_a, family = binomial, data = train_opf)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(modello1.2)
## 
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
##     exp(Y^2) + h_a + lastAction:X + is_weakfoot:exp(Y^2) + X:exp(Y^2) + 
##     is_weakfoot:X + exp(Y^2):h_a + minute:lastAction + minute:h_a + 
##     minute:exp(Y^2) + lastAction:h_a, family = binomial, data = train_opf)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -1.493e+01  1.521e+00  -9.817  < 2e-16 ***
## minute                            1.603e-02  4.342e-03   3.692 0.000222 ***
## lastActionBallRecovery            7.784e-01  1.427e+00   0.545 0.585464    
## lastActionBallTouch              -1.297e+00  1.591e+00  -0.815 0.414990    
## lastActionBlockedPass            -3.624e+00  3.283e+00  -1.104 0.269698    
## lastActionCard                    1.157e+01  1.284e+01   0.901 0.367548    
## lastActionChallenge              -5.879e+02  1.246e+04  -0.047 0.962374    
## lastActionChipped                 3.317e+00  1.405e+00   2.361 0.018243 *  
## lastActionClearance              -5.464e+00  1.576e+03  -0.003 0.997233    
## lastActionCornerAwarded           1.727e+02  2.529e+03   0.068 0.945571    
## lastActionCross                  -5.733e+00  1.488e+00  -3.854 0.000116 ***
## lastActionDispossessed           -1.104e+00  1.880e+00  -0.587 0.556960    
## lastActionEnd                     5.735e+02  3.299e+03   0.174 0.861998    
## lastActionFormationChange         2.008e+02  1.270e+05   0.002 0.998739    
## lastActionFoul                    7.324e+00  3.945e+00   1.857 0.063370 .  
## lastActionGoal                    1.089e+01  8.492e+02   0.013 0.989772    
## lastActionGoodSkill               1.025e+00  5.205e+00   0.197 0.843953    
## lastActionHeadPass               -1.060e+00  1.551e+00  -0.683 0.494455    
## lastActionInterception           -4.864e+00  3.025e+00  -1.608 0.107889    
## lastActionLayOff                 -9.187e-01  1.790e+00  -0.513 0.607762    
## lastActionNone                   -1.188e+00  1.349e+00  -0.881 0.378568    
## lastActionOffsidePass             4.325e+00  7.721e+00   0.560 0.575313    
## lastActionPass                   -1.781e+00  1.319e+00  -1.351 0.176837    
## lastActionRebound                -5.337e+00  1.421e+00  -3.755 0.000173 ***
## lastActionSave                   -5.817e+00  2.059e+04   0.000 0.999775    
## lastActionStart                  -1.121e+01  6.523e+03  -0.002 0.998629    
## lastActionSubstitutionOn         -1.680e+01  1.601e+01  -1.050 0.293930    
## lastActionTackle                 -8.704e-01  2.259e+00  -0.385 0.700041    
## lastActionTakeOn                  4.561e-01  1.361e+00   0.335 0.737519    
## lastActionThroughball             1.381e+01  1.473e+00   9.374  < 2e-16 ***
## is_weakfootYes                    1.098e-01  2.893e-01   0.379 0.704371    
## X                                 1.398e+01  1.743e+00   8.022 1.04e-15 ***
## exp(Y^2)                          2.016e+00  5.946e-01   3.390 0.000698 ***
## h_ah                             -5.780e-01  2.296e-01  -2.517 0.011825 *  
## lastActionBallRecovery:X          4.389e-01  1.648e+00   0.266 0.789939    
## lastActionBallTouch:X             2.787e+00  1.820e+00   1.531 0.125701    
## lastActionBlockedPass:X           5.876e+00  3.709e+00   1.584 0.113156    
## lastActionCard:X                 -1.265e+01  1.444e+01  -0.876 0.380987    
## lastActionChallenge:X             2.126e+02  6.053e+03   0.035 0.971982    
## lastActionChipped:X              -2.796e+00  1.609e+00  -1.738 0.082216 .  
## lastActionClearance:X            -1.139e+01  4.333e+00  -2.630 0.008541 ** 
## lastActionCornerAwarded:X        -2.254e+02  3.388e+03  -0.067 0.946963    
## lastActionCross:X                 7.177e+00  1.687e+00   4.255 2.09e-05 ***
## lastActionDispossessed:X          2.644e+00  2.156e+00   1.226 0.220024    
## lastActionEnd:X                  -8.521e+02  4.825e+03  -0.177 0.859831    
## lastActionFormationChange:X      -3.728e+02  1.731e+05  -0.002 0.998282    
## lastActionFoul:X                 -8.592e+00  4.424e+00  -1.942 0.052096 .  
## lastActionGoal:X                 -2.884e+01  1.176e+01  -2.452 0.014193 *  
## lastActionGoodSkill:X            -1.715e+00  5.784e+00  -0.297 0.766777    
## lastActionHeadPass:X              2.285e+00  1.773e+00   1.289 0.197357    
## lastActionInterception:X          6.750e+00  3.388e+00   1.992 0.046331 *  
## lastActionLayOff:X                2.191e+00  2.089e+00   1.049 0.294095    
## lastActionNone:X                  2.571e+00  1.548e+00   1.661 0.096766 .  
## lastActionOffsidePass:X          -3.654e+00  7.972e+00  -0.458 0.646686    
## lastActionPass:X                  3.227e+00  1.516e+00   2.129 0.033218 *  
## lastActionRebound:X               7.672e+00  1.619e+00   4.740 2.14e-06 ***
## lastActionSave:X                 -1.085e+01  2.480e+04   0.000 0.999651    
## lastActionStart:X                -5.294e-01  1.248e+01  -0.042 0.966174    
## lastActionSubstitutionOn:X        1.010e+01  1.490e+01   0.678 0.497963    
## lastActionTackle:X                2.135e+00  2.569e+00   0.831 0.405788    
## lastActionTakeOn:X                8.590e-01  1.562e+00   0.550 0.582282    
## lastActionThroughball:X          -1.324e+01  1.684e+00  -7.860 3.83e-15 ***
## is_weakfootYes:exp(Y^2)          -8.210e-01  9.906e-02  -8.288  < 2e-16 ***
## X:exp(Y^2)                       -2.709e+00  6.750e-01  -4.013 5.99e-05 ***
## is_weakfootYes:X                  9.331e-01  3.061e-01   3.048 0.002306 ** 
## exp(Y^2):h_ah                     2.060e-01  8.160e-02   2.525 0.011577 *  
## minute:lastActionBallRecovery    -8.250e-03  4.226e-03  -1.952 0.050886 .  
## minute:lastActionBallTouch       -1.175e-02  4.456e-03  -2.636 0.008381 ** 
## minute:lastActionBlockedPass     -1.606e-02  8.260e-03  -1.944 0.051918 .  
## minute:lastActionCard             2.070e-03  3.606e-02   0.057 0.954232    
## minute:lastActionChallenge        5.181e+00  1.032e+02   0.050 0.959962    
## minute:lastActionChipped         -8.817e-03  4.026e-03  -2.190 0.028521 *  
## minute:lastActionClearance       -1.062e-02  2.520e-02  -0.421 0.673493    
## minute:lastActionCornerAwarded   -9.022e-01  1.472e+01  -0.061 0.951122    
## minute:lastActionCross           -9.620e-03  3.972e-03  -2.422 0.015441 *  
## minute:lastActionDispossessed    -1.557e-02  5.282e-03  -2.947 0.003210 ** 
## minute:lastActionEnd              1.572e+00  1.062e+01   0.148 0.882327    
## minute:lastActionFormationChange  1.324e+00  1.670e+02   0.008 0.993674    
## minute:lastActionFoul            -1.878e-02  1.103e-02  -1.703 0.088586 .  
## minute:lastActionGoal            -3.517e-02  3.015e-02  -1.167 0.243331    
## minute:lastActionGoodSkill        3.062e-02  2.189e-02   1.399 0.161903    
## minute:lastActionHeadPass        -1.166e-02  4.309e-03  -2.705 0.006827 ** 
## minute:lastActionInterception     2.194e-03  9.279e-03   0.236 0.813085    
## minute:lastActionLayOff          -5.453e-03  4.961e-03  -1.099 0.271714    
## minute:lastActionNone            -1.066e-02  3.946e-03  -2.702 0.006897 ** 
## minute:lastActionOffsidePass     -4.063e-02  2.443e-02  -1.664 0.096210 .  
## minute:lastActionPass            -9.567e-03  3.871e-03  -2.471 0.013461 *  
## minute:lastActionRebound         -1.211e-02  3.993e-03  -3.034 0.002417 ** 
## minute:lastActionSave            -1.389e-02  7.405e+01   0.000 0.999850    
## minute:lastActionStart           -3.936e-01  1.449e+02  -0.003 0.997833    
## minute:lastActionSubstitutionOn   1.278e-01  7.682e-02   1.663 0.096248 .  
## minute:lastActionTackle          -7.866e-03  6.632e-03  -1.186 0.235583    
## minute:lastActionTakeOn          -1.117e-02  3.976e-03  -2.811 0.004943 ** 
## minute:lastActionThroughball     -1.113e-02  3.997e-03  -2.786 0.005339 ** 
## minute:h_ah                      -9.621e-04  5.706e-04  -1.686 0.091747 .  
## minute:exp(Y^2)                  -3.052e-03  1.528e-03  -1.998 0.045769 *  
## lastActionBallRecovery:h_ah       3.278e-01  2.222e-01   1.475 0.140144    
## lastActionBallTouch:h_ah          3.752e-02  2.339e-01   0.160 0.872590    
## lastActionBlockedPass:h_ah       -4.142e-02  4.531e-01  -0.091 0.927167    
## lastActionCard:h_ah              -1.666e+01  2.035e+03  -0.008 0.993468    
## lastActionChallenge:h_ah          1.521e+01  1.010e+03   0.015 0.987981    
## lastActionChipped:h_ah            2.693e-01  2.102e-01   1.281 0.200194    
## lastActionClearance:h_ah          1.664e+01  1.576e+03   0.011 0.991573    
## lastActionCornerAwarded:h_ah      1.093e+01  6.754e+02   0.016 0.987086    
## lastActionCross:h_ah              4.412e-01  2.075e-01   2.126 0.033506 *  
## lastActionDispossessed:h_ah       3.282e-01  2.773e-01   1.183 0.236643    
## lastActionEnd:h_ah               -2.914e+02  1.725e+03  -0.169 0.865815    
## lastActionFormationChange:h_ah    6.069e+01  2.276e+04   0.003 0.997872    
## lastActionFoul:h_ah               2.016e+00  8.014e-01   2.515 0.011904 *  
## lastActionGoal:h_ah               1.591e+01  8.491e+02   0.019 0.985052    
## lastActionGoodSkill:h_ah          3.968e-01  9.520e-01   0.417 0.676807    
## lastActionHeadPass:h_ah           3.384e-01  2.269e-01   1.491 0.135855    
## lastActionInterception:h_ah      -4.266e-01  4.976e-01  -0.857 0.391232    
## lastActionLayOff:h_ah             9.304e-02  2.614e-01   0.356 0.721864    
## lastActionNone:h_ah               2.946e-01  2.060e-01   1.430 0.152650    
## lastActionOffsidePass:h_ah        1.294e+00  1.343e+00   0.963 0.335473    
## lastActionPass:h_ah               4.271e-01  2.017e-01   2.118 0.034190 *  
## lastActionRebound:h_ah            3.789e-01  2.085e-01   1.817 0.069197 .  
## lastActionSave:h_ah               4.704e-01  4.439e+03   0.000 0.999915    
## lastActionStart:h_ah              1.464e+01  6.523e+03   0.002 0.998210    
## lastActionSubstitutionOn:h_ah    -1.440e+00  1.878e+00  -0.767 0.443286    
## lastActionTackle:h_ah             5.257e-01  3.483e-01   1.509 0.131209    
## lastActionTakeOn:h_ah             3.459e-01  2.073e-01   1.668 0.095298 .  
## lastActionThroughball:h_ah        4.527e-01  2.087e-01   2.169 0.030077 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 139769  on 216646  degrees of freedom
## Residual deviance: 122830  on 216523  degrees of freedom
## AIC: 123078
## 
## Number of Fisher Scoring iterations: 17
predict1.2 <- predict.glm(modello1.2, newdata = test_opf, type = "response")

confusione1.2 <- confusionMatrix(as.factor(ifelse(predict1.2>0.3, 1, 0)), as.factor(test_opf$result))
confusione1.2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 63054  6204
##          1  2022   936
##                                           
##                Accuracy : 0.8861          
##                  95% CI : (0.8838, 0.8884)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1353          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9689          
##             Specificity : 0.1311          
##          Pos Pred Value : 0.9104          
##          Neg Pred Value : 0.3164          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8731          
##    Detection Prevalence : 0.9590          
##       Balanced Accuracy : 0.5500          
##                                           
##        'Positive' Class : 0               
## 
rms::val.prob(as.numeric(as.character(predict1.2)), as.numeric(as.character(test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict1.2)),
## as.numeric(as.character(test_opf$result))): 2 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.992168e-01  7.496084e-01  1.392605e-01  6.849007e-02  4.946942e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  5.514068e-04  4.181929e+01  8.299584e-10  6.793867e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.220725e-02 -1.447628e-01  9.225944e-01  2.886326e-01  3.951237e-03 
##          Eavg           S:z           S:p 
##  2.745378e-03  1.345419e+00  1.784899e-01
roc1.2 <- roc(test_opf$result ~ predict1.2, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.2), 4)), lwd = 2, box.lwd = 0, bg = "white")

Linear Discriminant Analysis

set.seed(1234)

new_train_opf <- train_opf

new_train_opf$Y <- exp(new_train_opf$Y^2)

new_test_opf <- test_opf

new_test_opf$Y <- exp(new_test_opf$Y^2)

set.seed(1234)


modello1.3 <- lda(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, data = train_opf)

summary(modello1.3)
##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   66     -none- numeric  
## scaling 33     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  3     -none- list
predict1.3.1 <- predict(modello1.3, newdata = test_opf)

predict1.3 <- predict1.3.1$posterior[, 2]


confusione1.3.1 <- confusionMatrix(as.factor(predict1.3.1$class), as.factor(test_opf$result))
confusione1.3.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 64747  6980
##          1   329   160
##                                          
##                Accuracy : 0.8988         
##                  95% CI : (0.8966, 0.901)
##     No Information Rate : 0.9011         
##     P-Value [Acc > NIR] : 0.9825         
##                                          
##                   Kappa : 0.0296         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.99494        
##             Specificity : 0.02241        
##          Pos Pred Value : 0.90269        
##          Neg Pred Value : 0.32720        
##              Prevalence : 0.90113        
##          Detection Rate : 0.89657        
##    Detection Prevalence : 0.99323        
##       Balanced Accuracy : 0.50868        
##                                          
##        'Positive' Class : 0              
## 
confusione1.3 <- confusionMatrix(as.factor(ifelse(predict1.3>0.3, 1, 0)), as.factor(test_opf$result))


roc1.3 <- roc(test_opf$result ~ predict1.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.3)), as.numeric(as.character(test_opf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.895863e-01  7.447931e-01  1.329346e-01  6.527350e-02  4.714791e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  3.116984e-04  2.450961e+01  4.762170e-06  6.496181e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.333969e-02 -1.391553e-01  9.295095e-01  2.800603e-01  2.064602e-02 
##          Eavg           S:z           S:p 
##  1.156119e-02  3.082489e+00  2.052772e-03

Random Forest

set.seed(1234)

modello1.4 <- ranger(train_opf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_opf, num.trees = 500, mtry = 3, seed = 42, probability = T)

train_predict_1.4.1 <- predict(modello1.4, data = new_train_opf)
train_predict1.4 <- train_predict_1.4.1$predictions[, 2]

predict1.4.1 <- predict(modello1.4, data = new_test_opf)

predict1.4 <- predict1.4.1$predictions[, 2]

confusione1.4 <- confusionMatrix(as.factor(ifelse(predict1.4 >0.3, 1, 0)), as.factor(test_opf$result))
confusione1.4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 61892  3957
##          1  3184  3183
##                                           
##                Accuracy : 0.9011          
##                  95% CI : (0.8989, 0.9033)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 0.5081          
##                                           
##                   Kappa : 0.417           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9511          
##             Specificity : 0.4458          
##          Pos Pred Value : 0.9399          
##          Neg Pred Value : 0.4999          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8570          
##    Detection Prevalence : 0.9118          
##       Balanced Accuracy : 0.6984          
##                                           
##        'Positive' Class : 0               
## 
roc1.4 <- roc(test_opf$result ~ as.numeric(predict1.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.4)), as.numeric(as.character(new_test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict1.4)),
## as.numeric(as.character(new_test_opf$result))): 929 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.706364e-01  8.353182e-01  3.232237e-01  1.679441e-01  1.197323e+04 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  6.651975e-02 -2.583708e-01  8.922615e-01  4.151460e-02  3.601427e-02 
##          Eavg           S:z           S:p 
##  1.228770e-02 -3.416589e+00  6.341090e-04

Bagging

set.seed(1234)

modello1.5 <- ranger(train_opf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_opf, num.trees = 500, seed = 42, probability = T)

predict1.5.1 <- predict(modello1.5, data = new_test_opf)

predict1.5 <- predict1.5.1$predictions[, 2]

confusione1.5 <- confusionMatrix(as.factor(ifelse(predict1.5>0.3, 1, 0)), as.factor(new_test_opf$result))
confusione1.5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 62474  4157
##          1  2602  2983
##                                           
##                Accuracy : 0.9064          
##                  95% CI : (0.9043, 0.9085)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 8.749e-07       
##                                           
##                   Kappa : 0.4184          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9600          
##             Specificity : 0.4178          
##          Pos Pred Value : 0.9376          
##          Neg Pred Value : 0.5341          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8651          
##    Detection Prevalence : 0.9227          
##       Balanced Accuracy : 0.6889          
##                                           
##        'Positive' Class : 0               
## 
roc1.5 <- roc(new_test_opf$result ~ as.numeric(predict1.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.5)), as.numeric(as.character(new_test_opf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.784610e-01  8.392305e-01  3.261361e-01  1.684718e-01  1.216736e+04 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  9.957298e-04  7.390762e+01  1.110223e-16  1.674760e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  6.710256e-02  1.193215e-01  1.096641e+00  1.542791e-01  1.849012e-02 
##          Eavg           S:z           S:p 
##  1.020388e-02 -8.226908e+00  1.921075e-16

Neural Network

modello1.8 <- nnet(result ~ minute + lastAction + is_weakfoot + X + Y + h_a, data = new_train_opf, size = 10, maxit = 1000, linout = FALSE)
## # weights:  351
## initial  value 118545.093917 
## iter  10 value 69981.009756
## iter  20 value 69192.592115
## iter  30 value 67888.711525
## iter  40 value 63628.809594
## iter  50 value 62525.075031
## iter  60 value 62057.467967
## iter  70 value 61604.685356
## iter  80 value 61532.876008
## iter  90 value 61487.160400
## iter 100 value 61233.657946
## iter 110 value 60338.128851
## iter 120 value 59616.426996
## iter 130 value 58733.031793
## iter 140 value 58349.312834
## iter 150 value 58148.958166
## iter 160 value 57935.848822
## iter 170 value 57756.338123
## iter 180 value 57639.865936
## iter 190 value 57573.532413
## iter 200 value 57512.936093
## iter 210 value 57426.532572
## iter 220 value 57327.745162
## iter 230 value 57217.958189
## iter 240 value 57172.723167
## iter 250 value 57148.220125
## iter 260 value 57133.696190
## iter 270 value 57116.527271
## iter 280 value 57102.667221
## iter 290 value 57084.939379
## iter 300 value 57056.657224
## iter 310 value 57048.010413
## iter 320 value 57038.515424
## iter 330 value 57021.976784
## iter 340 value 57009.160524
## iter 350 value 57001.456556
## iter 360 value 56993.776476
## iter 370 value 56988.466171
## iter 380 value 56985.363909
## iter 390 value 56984.264328
## iter 400 value 56983.744595
## iter 410 value 56983.261082
## iter 420 value 56982.689945
## iter 430 value 56981.621792
## iter 440 value 56979.853764
## iter 450 value 56977.893636
## iter 460 value 56975.779385
## iter 470 value 56973.815073
## iter 480 value 56972.577633
## iter 490 value 56971.942203
## iter 500 value 56971.328528
## iter 510 value 56969.780842
## iter 520 value 56969.663506
## iter 530 value 56969.309645
## iter 540 value 56968.354652
## iter 550 value 56967.126957
## iter 560 value 56964.815551
## iter 570 value 56961.936920
## iter 580 value 56957.827947
## iter 590 value 56955.201372
## iter 600 value 56954.353889
## iter 610 value 56953.875346
## iter 620 value 56953.530963
## iter 630 value 56953.236351
## iter 640 value 56953.022966
## iter 650 value 56952.857124
## iter 660 value 56952.730886
## iter 670 value 56952.602141
## iter 680 value 56952.471966
## iter 690 value 56952.311041
## iter 700 value 56952.158581
## iter 710 value 56951.968581
## iter 720 value 56951.865165
## iter 730 value 56951.791552
## iter 740 value 56951.713886
## iter 750 value 56951.575373
## iter 760 value 56951.479369
## iter 770 value 56951.335809
## iter 780 value 56951.186863
## iter 790 value 56951.084841
## iter 800 value 56950.981560
## iter 810 value 56950.861977
## iter 820 value 56950.727533
## iter 830 value 56950.537039
## iter 840 value 56950.345343
## iter 850 value 56950.119654
## iter 860 value 56949.843306
## iter 870 value 56949.747925
## iter 880 value 56949.637210
## iter 890 value 56949.555767
## iter 900 value 56949.517758
## iter 910 value 56949.381139
## iter 920 value 56949.296510
## iter 930 value 56949.210179
## iter 940 value 56949.124307
## iter 950 value 56948.983340
## iter 960 value 56948.841103
## iter 970 value 56948.594921
## iter 980 value 56948.099597
## iter 990 value 56947.901399
## iter1000 value 56947.590591
## final  value 56947.590591 
## stopped after 1000 iterations
predict1.8 <- predict(modello1.8, newdata = new_test_opf)

confusione1.8 <- confusionMatrix(as.factor(ifelse(predict1.8>0.3, 1, 0)), new_test_opf$result)
confusione1.8
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 61956  4927
##          1  3120  2213
##                                           
##                Accuracy : 0.8886          
##                  95% CI : (0.8863, 0.8909)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2953          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9521          
##             Specificity : 0.3099          
##          Pos Pred Value : 0.9263          
##          Neg Pred Value : 0.4150          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8579          
##    Detection Prevalence : 0.9262          
##       Balanced Accuracy : 0.6310          
##                                           
##        'Positive' Class : 0               
## 
roc1.8 <- roc(new_test_opf$result ~ as.numeric(predict1.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.8)), as.numeric(as.character(new_test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict1.8)),
## as.numeric(as.character(new_test_opf$result))): 315 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.834815e-01  7.917408e-01  2.184253e-01  1.097806e-01  7.894334e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  7.652105e-02 -7.947524e-02  9.565840e-01  4.063039e-02  7.167874e-03 
##          Eavg           S:z           S:p 
##  3.731212e-03  1.660904e+00  9.673279e-02

Understat Result

confusione1.us <- confusionMatrix(as.factor(ifelse(test_opf$`xG Understat`>0.3, 1, 0)), test_opf$result)
confusione1.us
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 60545  3589
##          1  4531  3551
##                                           
##                Accuracy : 0.8876          
##                  95% CI : (0.8852, 0.8899)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.404           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9304          
##             Specificity : 0.4973          
##          Pos Pred Value : 0.9440          
##          Neg Pred Value : 0.4394          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8384          
##    Detection Prevalence : 0.8881          
##       Balanced Accuracy : 0.7139          
##                                           
##        'Positive' Class : 0               
## 
roc1.us <- roc(test_opf$result ~ as.numeric(test_opf$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_opf$`xG Understat`)), as.numeric(as.character(test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(test_opf$`xG Understat`)), : 6
## observations deleted from logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.294039e-01  8.147019e-01  2.776084e-01  1.415280e-01  1.022074e+04 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  7.128048e-02 -1.240680e-01  9.296240e-01  8.520251e-02  8.860956e-03 
##          Eavg           S:z           S:p 
##  5.630629e-03  3.822074e+00  1.323339e-04

Open Play Head

set.seed(123)
index_oph <- sample.split(Y = openplay_head$result, SplitRatio = 0.75)
train_oph <- openplay_head[index_oph, ]

indice_test_oph <-  which(index_oph == FALSE)
test_oph <- openplay_head[indice_test_oph, ] 

Logit

modello2.1 <- glm(result ~ minute + lastAction + X + exp(Y^2) + h_a, 
                  data = train_oph, family = binomial)

summary2.1 <- summary(modello2.1)
summary2.1
## 
## Call:
## glm(formula = result ~ minute + lastAction + X + exp(Y^2) + h_a, 
##     family = binomial, data = train_oph)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -2.680e+01  7.831e-01 -34.219  < 2e-16 ***
## minute                    -9.752e-04  7.921e-04  -1.231  0.21826    
## lastActionBallTouch       -2.660e-01  6.064e-01  -0.439  0.66098    
## lastActionChallenge       -1.277e+01  6.023e+02  -0.021  0.98309    
## lastActionChipped          9.195e-01  8.387e-02  10.964  < 2e-16 ***
## lastActionCornerAwarded   -1.305e+01  2.282e+02  -0.057  0.95440    
## lastActionCross            7.696e-01  5.820e-02  13.222  < 2e-16 ***
## lastActionEnd             -1.284e+01  2.877e+02  -0.045  0.96441    
## lastActionFoul            -1.302e+01  1.854e+02  -0.070  0.94400    
## lastActionGoal            -1.338e+01  3.162e+02  -0.042  0.96626    
## lastActionHeadPass         9.408e-01  1.567e-01   6.002 1.95e-09 ***
## lastActionNone             5.923e-01  1.292e-01   4.585 4.54e-06 ***
## lastActionPass             7.710e-01  1.802e-01   4.279 1.88e-05 ***
## lastActionRebound          1.274e+00  1.125e-01  11.322  < 2e-16 ***
## lastActionSubstitutionOff  1.729e+01  1.455e+03   0.012  0.99052    
## lastActionSubstitutionOn  -4.954e-01  1.063e+00  -0.466  0.64106    
## lastActionTackle          -1.282e+01  4.519e+02  -0.028  0.97737    
## lastActionTakeOn           1.697e+00  6.706e-01   2.530  0.01141 *  
## lastActionThroughball      2.603e+00  2.779e-01   9.368  < 2e-16 ***
## X                          2.731e+01  7.882e-01  34.656  < 2e-16 ***
## exp(Y^2)                  -7.093e-01  2.478e-01  -2.862  0.00421 ** 
## h_ah                       5.532e-02  4.326e-02   1.279  0.20097    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16885  on 23101  degrees of freedom
## Residual deviance: 15006  on 23080  degrees of freedom
## AIC: 15050
## 
## Number of Fisher Scoring iterations: 14
predict2.1 <- predict.glm(modello2.1, newdata = test_oph, type = "response")

roc2.1 <- roc(test_oph$result ~ predict2.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

confusione2.1 <- confusionMatrix(as.factor(ifelse(predict2.1>0.3, 1, 0)), as.factor(test_oph$result)) 
confusione2.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6430  743
##          1  352  175
##                                           
##                Accuracy : 0.8578          
##                  95% CI : (0.8498, 0.8655)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.17            
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9481          
##             Specificity : 0.1906          
##          Pos Pred Value : 0.8964          
##          Neg Pred Value : 0.3321          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8351          
##    Detection Prevalence : 0.9316          
##       Balanced Accuracy : 0.5694          
##                                           
##        'Positive' Class : 0               
## 
rms::val.prob(as.numeric(as.character(predict2.1)), as.numeric(as.character(test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.741098e-01  7.370549e-01  1.357465e-01  7.284670e-02  5.619196e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  2.890682e-05  2.222582e+00  3.291337e-01  7.281779e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  9.674780e-02 -1.001238e-01  9.372446e-01  1.817344e-01  1.510059e-02 
##          Eavg           S:z           S:p 
##  6.637031e-03  8.586321e-01  3.905435e-01

Linear Discriminant Analysis

set.seed(1234)

new_train_oph <- train_oph

new_train_oph$Y <- exp(new_train_oph$Y^2)

new_test_oph <- test_oph

new_test_oph$Y <- exp(new_test_oph$Y^2)

set.seed(1234)


modello2.3 <- lda(result ~ minute + lastAction + X + exp(Y^2) + h_a, data = train_oph)

summary(modello2.3)
##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   42     -none- numeric  
## scaling 21     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  2     -none- list
predict2.3.1 <- predict(modello2.3, newdata = test_oph)

predict2.3 <- predict2.3.1$posterior[, 2]

confusione2.3.1 <- confusionMatrix(as.factor(predict2.3.1$class), as.factor(test_oph$result))
confusione2.3.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6735  885
##          1   47   33
##                                           
##                Accuracy : 0.879           
##                  95% CI : (0.8715, 0.8862)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 0.6961          
##                                           
##                   Kappa : 0.0479          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99307         
##             Specificity : 0.03595         
##          Pos Pred Value : 0.88386         
##          Neg Pred Value : 0.41250         
##              Prevalence : 0.88078         
##          Detection Rate : 0.87468         
##    Detection Prevalence : 0.98961         
##       Balanced Accuracy : 0.51451         
##                                           
##        'Positive' Class : 0               
## 
confusione2.3 <- confusionMatrix(as.factor(ifelse(predict2.3>0.3, 1, 0)), as.factor(test_oph$result))


roc2.3 <- roc(test_oph$result ~ predict2.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.3)), as.numeric(as.character(test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.699917e-01  7.349958e-01  1.318688e-01  7.068646e-02  5.452857e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  4.950705e-04  5.812043e+00  5.469290e-02  7.019139e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  9.774986e-02 -1.619795e-01  9.020178e-01  3.755251e-01  2.307149e-02 
##          Eavg           S:z           S:p 
##  1.214045e-02  1.746477e+00  8.072811e-02

Random Forest

set.seed(1234)

modello2.4 <- ranger(train_oph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_oph, num.trees = 500, mtry = 3, seed = 42, probability = T)

predict2.4.1 <- predict(modello2.4, data = new_test_oph)

predict2.4 <- predict2.4.1$predictions[, 2]

confusione2.4 <- confusionMatrix(as.factor(ifelse(predict2.4 >0.3, 1, 0)), as.factor(test_oph$result))
confusione2.4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6272  522
##          1  510  396
##                                           
##                Accuracy : 0.866           
##                  95% CI : (0.8582, 0.8735)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1.000           
##                                           
##                   Kappa : 0.3582          
##                                           
##  Mcnemar's Test P-Value : 0.732           
##                                           
##             Sensitivity : 0.9248          
##             Specificity : 0.4314          
##          Pos Pred Value : 0.9232          
##          Neg Pred Value : 0.4371          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8145          
##    Detection Prevalence : 0.8823          
##       Balanced Accuracy : 0.6781          
##                                           
##        'Positive' Class : 0               
## 
roc2.4 <- roc(test_oph$result ~ as.numeric(predict2.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.4)), as.numeric(as.character(new_test_oph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict2.4)),
## as.numeric(as.character(new_test_oph$result))): 245 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.957707e-01  7.978854e-01  2.638290e-01  1.492232e-01  1.113459e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  7.485408e-03  5.780371e+01  2.805534e-13  1.417378e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  8.376312e-02 -4.099566e-01  7.751907e-01  9.897154e-02  5.407294e-02 
##          Eavg           S:z           S:p 
##  2.153259e-02  1.694824e+00  9.010883e-02

Bagging

set.seed(1234)

modello2.5 <- ranger(train_oph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_oph, num.trees = 500, seed = 42, probability = T)

predict2.5.1 <- predict(modello2.5, data = new_test_oph)

predict2.5 <- predict2.5.1$predictions[, 2]

confusione2.5 <- confusionMatrix(as.factor(ifelse(predict2.5>0.3, 1, 0)), as.factor(new_test_oph$result))
confusione2.5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6332  530
##          1  450  388
##                                           
##                Accuracy : 0.8727          
##                  95% CI : (0.8651, 0.8801)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 0.98542         
##                                           
##                   Kappa : 0.3703          
##                                           
##  Mcnemar's Test P-Value : 0.01162         
##                                           
##             Sensitivity : 0.9336          
##             Specificity : 0.4227          
##          Pos Pred Value : 0.9228          
##          Neg Pred Value : 0.4630          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8223          
##    Detection Prevalence : 0.8912          
##       Balanced Accuracy : 0.6782          
##                                           
##        'Positive' Class : 0               
## 
roc2.5 <- roc(new_test_oph$result ~ as.numeric(predict2.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.5)), as.numeric(as.character(new_test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.244548e-01  8.122274e-01  2.908950e-01  1.633468e-01  1.258770e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  1.468163e-03  1.330485e+01  1.290887e-03  1.618786e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  8.252587e-02 -2.167724e-01  8.903037e-01  6.141308e-02  3.284286e-02 
##          Eavg           S:z           S:p 
##  1.285068e-02  3.637999e-01  7.160074e-01

Neural Network

modello2.8 <- nnet(result ~ minute + lastAction + X + Y + h_a, data = new_train_oph, size = 10, maxit = 1000, linout = FALSE)
## # weights:  231
## initial  value 8722.578293 
## iter  10 value 8435.748320
## iter  20 value 8279.696679
## iter  30 value 8089.025131
## iter  40 value 8065.396259
## iter  50 value 7820.389397
## iter  60 value 7620.533019
## iter  70 value 7527.182160
## iter  80 value 7347.012489
## iter  90 value 7306.470349
## iter 100 value 7270.602773
## iter 110 value 7209.485135
## iter 120 value 7149.740344
## iter 130 value 7108.479248
## iter 140 value 7089.482769
## iter 150 value 7081.089074
## iter 160 value 7075.318128
## iter 170 value 7072.006341
## iter 180 value 7065.815611
## iter 190 value 7061.494404
## iter 200 value 7059.751906
## iter 210 value 7057.881921
## iter 220 value 7056.740067
## iter 230 value 7055.132476
## iter 240 value 7052.237977
## iter 250 value 7051.064152
## iter 260 value 7050.361597
## iter 270 value 7049.824061
## iter 280 value 7049.270668
## iter 290 value 7048.685893
## iter 300 value 7047.999021
## iter 310 value 7047.421214
## iter 320 value 7047.147571
## iter 330 value 7046.797723
## iter 340 value 7046.235374
## iter 350 value 7045.552039
## iter 360 value 7043.659123
## iter 370 value 7043.059688
## iter 380 value 7042.916778
## iter 390 value 7042.807888
## iter 400 value 7042.674153
## iter 410 value 7042.430997
## iter 420 value 7042.132006
## iter 430 value 7041.922490
## iter 440 value 7041.629317
## iter 450 value 7041.465949
## iter 460 value 7041.325280
## iter 470 value 7041.021732
## iter 480 value 7040.845830
## iter 490 value 7040.576073
## iter 500 value 7040.369471
## iter 510 value 7040.308663
## iter 520 value 7040.267512
## iter 530 value 7040.203691
## iter 540 value 7040.109465
## iter 550 value 7040.064034
## iter 560 value 7039.967961
## iter 570 value 7039.905303
## iter 580 value 7039.802194
## iter 590 value 7039.726709
## iter 600 value 7039.678707
## iter 610 value 7039.625371
## iter 620 value 7039.569185
## iter 630 value 7039.529814
## iter 640 value 7039.473146
## iter 650 value 7039.436103
## iter 660 value 7039.409985
## final  value 7039.386145 
## converged
predict2.8 <- predict(modello2.8, newdata = new_test_oph)

confusione2.8 <- confusionMatrix(as.factor(ifelse(predict2.8>0.3, 1, 0)), new_test_oph$result)
confusione2.8
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6407  655
##          1  375  263
##                                           
##                Accuracy : 0.8662          
##                  95% CI : (0.8584, 0.8738)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2663          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9447          
##             Specificity : 0.2865          
##          Pos Pred Value : 0.9073          
##          Neg Pred Value : 0.4122          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8321          
##    Detection Prevalence : 0.9171          
##       Balanced Accuracy : 0.6156          
##                                           
##        'Positive' Class : 0               
## 
roc2.8 <- roc(new_test_oph$result ~ as.numeric(predict2.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.8)), as.numeric(as.character(new_test_oph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict2.8)),
## as.numeric(as.character(new_test_oph$result))): 64 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##   0.536170235   0.768085117   0.187960276   0.102744421 785.556401386 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf   0.000000000          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##   0.092247046  -0.148281506   0.904902415   0.153865186   0.008181917 
##          Eavg           S:z           S:p 
##   0.006748645   1.625219257   0.104115849

Understat Result

confusione2.us <- confusionMatrix(as.factor(ifelse(test_oph$`xG Understat`>0.3, 1, 0)), test_oph$result)
confusione2.us
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6021  391
##          1  761  527
##                                           
##                Accuracy : 0.8504          
##                  95% CI : (0.8422, 0.8583)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3933          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8878          
##             Specificity : 0.5741          
##          Pos Pred Value : 0.9390          
##          Neg Pred Value : 0.4092          
##              Prevalence : 0.8808          
##          Detection Rate : 0.7819          
##    Detection Prevalence : 0.8327          
##       Balanced Accuracy : 0.7309          
##                                           
##        'Positive' Class : 0               
## 
roc2.us <- roc(test_oph$result ~ as.numeric(test_oph$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_oph$`xG Understat`)), as.numeric(as.character(test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.478921e-01  8.239461e-01  2.930995e-01  1.646936e-01  1.269140e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  2.090539e-03  1.809715e+01  1.175584e-04  1.626030e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  8.339068e-02 -1.780625e-01  9.907059e-01  4.643716e-02  3.833526e-02 
##          Eavg           S:z           S:p 
##  1.470120e-02 -3.084414e+00  2.039538e-03

Set Piece Foot

set.seed(123)
index_spf <- sample.split(Y = setpiece_foot$result, SplitRatio = 0.75)
train_spf <- setpiece_foot[index_spf, ]

indice_test_spf <-  which(index_spf == FALSE)
test_spf <- setpiece_foot[indice_test_spf, ] 

Logit

modello3.1 <- glm(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a + situation, 
                  data = train_spf, family = binomial)

summary(modello3.1)
## 
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
##     exp(Y^2) + h_a + situation, family = binomial, data = train_spf)
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.421e+01  2.918e-01 -48.687  < 2e-16 ***
## minute                    3.158e-04  6.464e-04   0.489  0.62515    
## lastActionBallRecovery    2.785e-01  7.601e-01   0.366  0.71411    
## lastActionBallTouch      -4.691e-03  1.859e-01  -0.025  0.97986    
## lastActionBlockedPass    -1.258e+01  3.202e+02  -0.039  0.96867    
## lastActionCard           -1.336e+01  4.690e+02  -0.028  0.97728    
## lastActionChipped         1.790e-01  1.591e-01   1.125  0.26060    
## lastActionCornerAwarded  -2.713e-01  3.710e-01  -0.731  0.46466    
## lastActionCross           1.187e-01  1.261e-01   0.941  0.34681    
## lastActionDispossessed   -5.646e-01  7.315e-01  -0.772  0.44014    
## lastActionEnd            -1.317e+01  1.526e+02  -0.086  0.93122    
## lastActionFoul            2.640e-01  3.382e-01   0.781  0.43504    
## lastActionGoal           -1.334e+01  2.110e+02  -0.063  0.94958    
## lastActionGoodSkill      -1.355e+01  8.259e+02  -0.016  0.98691    
## lastActionHeadPass        8.723e-01  1.301e-01   6.705 2.01e-11 ***
## lastActionInterception   -4.658e-01  1.049e+00  -0.444  0.65714    
## lastActionLayOff          9.714e-01  3.336e-01   2.912  0.00360 ** 
## lastActionNone            5.073e-01  1.203e-01   4.216 2.48e-05 ***
## lastActionPass            6.736e-01  1.242e-01   5.424 5.83e-08 ***
## lastActionRebound         1.219e+00  1.216e-01  10.030  < 2e-16 ***
## lastActionStandard        1.470e+00  1.248e-01  11.781  < 2e-16 ***
## lastActionSubstitutionOn  2.382e+00  1.061e+00   2.246  0.02473 *  
## lastActionTackle         -2.138e-01  7.419e-01  -0.288  0.77326    
## lastActionTakeOn          2.452e-02  2.147e-01   0.114  0.90908    
## lastActionThroughball     1.445e+00  2.679e-01   5.393 6.95e-08 ***
## is_weakfootYes           -1.214e-01  4.351e-02  -2.791  0.00526 ** 
## X                         1.400e+01  2.924e-01  47.871  < 2e-16 ***
## exp(Y^2)                 -6.425e-01  8.422e-02  -7.629 2.36e-14 ***
## h_ah                     -8.534e-02  3.469e-02  -2.460  0.01389 *  
## situationFromCorner      -9.978e-02  4.192e-02  -2.380  0.01730 *  
## situationSetPiece                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28504  on 52076  degrees of freedom
## Residual deviance: 24469  on 52047  degrees of freedom
## AIC: 24529
## 
## Number of Fisher Scoring iterations: 14
predict3.1 <- predict.glm(modello3.1, newdata = test_spf, type = "response")

roc3.1 <- roc(test_spf$result ~ predict3.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

confusione3.1 <- confusionMatrix(as.factor(ifelse(predict3.1>0.3, 1, 0)), as.factor(test_spf$result)) 
confusione3.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15617  1107
##          1   389   245
##                                           
##                Accuracy : 0.9138          
##                  95% CI : (0.9095, 0.9179)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2073          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9757          
##             Specificity : 0.1812          
##          Pos Pred Value : 0.9338          
##          Neg Pred Value : 0.3864          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8997          
##    Detection Prevalence : 0.9635          
##       Balanced Accuracy : 0.5785          
##                                           
##        'Positive' Class : 0               
## 
rms::val.prob(as.numeric(as.character(predict3.1)), as.numeric(as.character(test_spf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.080409e-01  7.540205e-01  1.505219e-01  6.547547e-02  1.137523e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  3.787373e-04  8.574122e+00  1.374527e-02  6.509673e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  6.520744e-02 -1.766042e-01  9.150318e-01  1.923828e-02  1.603711e-02 
##          Eavg           S:z           S:p 
##  7.930469e-03  3.248913e-01  7.452633e-01

Linear Discriminant Analysis

set.seed(1234)

new_train_spf <- train_spf

new_train_spf$Y <- exp(new_train_spf$Y^2)

new_test_spf <- test_spf

new_test_spf$Y <- exp(new_test_spf$Y^2)

set.seed(1234)


modello3.3 <- lda(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, data = train_spf)

summary(modello3.3)
##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   56     -none- numeric  
## scaling 28     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  3     -none- list
predict3.3.1 <- predict(modello3.3, newdata = test_spf)

predict3.3 <- predict3.3.1$posterior[, 2]

confusione3.3.1 <- confusionMatrix(as.factor(predict3.3.1$class), as.factor(test_spf$result))
confusione3.3.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15775  1193
##          1   231   159
##                                          
##                Accuracy : 0.918          
##                  95% CI : (0.9138, 0.922)
##     No Information Rate : 0.9221         
##     P-Value [Acc > NIR] : 0.9794         
##                                          
##                   Kappa : 0.153          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9856         
##             Specificity : 0.1176         
##          Pos Pred Value : 0.9297         
##          Neg Pred Value : 0.4077         
##              Prevalence : 0.9221         
##          Detection Rate : 0.9088         
##    Detection Prevalence : 0.9775         
##       Balanced Accuracy : 0.5516         
##                                          
##        'Positive' Class : 0              
## 
confusione3.3 <- confusionMatrix(as.factor(ifelse(predict3.3>0.3, 1, 0)), as.factor(test_spf$result))

roc3.3 <- roc(test_spf$result ~ predict3.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.3)), as.numeric(as.character(test_spf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##   0.487283753   0.743641877   0.129926651   0.056251210 977.408498151 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA   0.006455379 114.052465074   0.000000000   0.049795831 
##         Brier     Intercept         Slope          Emax           E90 
##   0.066883628  -0.556372588   0.751043723   0.343102307   0.019185729 
##          Eavg           S:z           S:p 
##   0.016310743   3.167461664   0.001537760

Random Forest

set.seed(1234)

modello3.4 <- ranger(train_spf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_spf, num.trees = 500, mtry = 3, seed = 42, probability = T)

train_predict_3.4.1 <- predict(modello3.4, data = new_train_spf)
train_predict3.4 <- train_predict_3.4.1$predictions[, 2]

predict3.4.1 <- predict(modello3.4, data = new_test_spf)

predict3.4 <- predict3.4.1$predictions[, 2]

confusione3.4 <- confusionMatrix(as.factor(ifelse(predict3.4 >0.3, 1, 0)), as.factor(test_spf$result))
confusione3.4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15519   717
##          1   487   635
##                                           
##                Accuracy : 0.9306          
##                  95% CI : (0.9268, 0.9344)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1.086e-05       
##                                           
##                   Kappa : 0.4763          
##                                           
##  Mcnemar's Test P-Value : 4.121e-11       
##                                           
##             Sensitivity : 0.9696          
##             Specificity : 0.4697          
##          Pos Pred Value : 0.9558          
##          Neg Pred Value : 0.5660          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8941          
##    Detection Prevalence : 0.9354          
##       Balanced Accuracy : 0.7196          
##                                           
##        'Positive' Class : 0               
## 
roc3.4 <- roc(test_spf$result ~ as.numeric(predict3.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.4)), as.numeric(as.character(new_test_spf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict3.4)),
## as.numeric(as.character(new_test_spf$result))): 383 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.650859e-01  8.325429e-01  3.375476e-01  1.553417e-01  2.637926e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  2.683426e-03  4.755116e+01  4.724943e-11  1.526583e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  5.104138e-02 -3.530922e-01  8.691969e-01  6.181181e-02  4.470280e-02 
##          Eavg           S:z           S:p 
##  1.567332e-02 -2.138949e+00  3.243979e-02

Bagging

set.seed(1234)

modello3.5 <- ranger(train_spf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_spf, num.trees = 500, seed = 42, probability = T)

predict3.5.1 <- predict(modello3.5, data = new_test_spf)

predict3.5 <- predict3.5.1$predictions[, 2]

confusione3.5 <- confusionMatrix(as.factor(ifelse(predict3.5>0.3, 1, 0)), as.factor(new_test_spf$result))
confusione3.5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15591   758
##          1   415   594
##                                           
##                Accuracy : 0.9324          
##                  95% CI : (0.9286, 0.9361)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1.239e-07       
##                                           
##                   Kappa : 0.4677          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9741          
##             Specificity : 0.4393          
##          Pos Pred Value : 0.9536          
##          Neg Pred Value : 0.5887          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8982          
##    Detection Prevalence : 0.9419          
##       Balanced Accuracy : 0.7067          
##                                           
##        'Positive' Class : 0               
## 
roc3.5 <- roc(new_test_spf$result ~ as.numeric(predict3.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.5)), as.numeric(as.character(new_test_spf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.772853e-01  8.386426e-01  3.486116e-01  1.588321e-01  2.758008e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  4.519128e-04  9.844303e+00  7.283445e-03  1.583802e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  5.136852e-02  4.054414e-02  1.056891e+00  9.273364e-02  2.470800e-02 
##          Eavg           S:z           S:p 
##  8.638274e-03 -3.800341e+00  1.444973e-04

Neural Network

set.seed(1234)

modello3.8 <- nnet(result ~ minute + lastAction + is_weakfoot + X + Y + h_a, data = new_train_spf, size = 10, maxit = 1000, linout = FALSE)
## # weights:  301
## initial  value 23522.705483 
## iter  10 value 14179.880092
## iter  20 value 13625.005698
## iter  30 value 13418.429983
## iter  40 value 13173.057450
## iter  50 value 12640.458928
## iter  60 value 12265.364331
## iter  70 value 12038.069262
## iter  80 value 11934.605007
## iter  90 value 11896.243136
## iter 100 value 11893.367623
## iter 110 value 11884.155700
## iter 120 value 11866.894186
## iter 130 value 11850.193146
## iter 140 value 11837.232488
## iter 150 value 11820.460297
## iter 160 value 11805.131341
## iter 170 value 11790.186472
## iter 180 value 11778.943767
## iter 190 value 11768.430048
## iter 200 value 11754.212725
## iter 210 value 11746.100902
## iter 220 value 11740.054020
## iter 230 value 11733.960416
## iter 240 value 11728.417755
## iter 250 value 11724.163176
## iter 260 value 11721.026534
## iter 270 value 11718.333051
## iter 280 value 11714.357414
## iter 290 value 11713.123854
## iter 300 value 11712.121098
## iter 310 value 11711.590404
## iter 320 value 11710.029488
## iter 330 value 11706.959369
## iter 340 value 11705.784964
## iter 350 value 11704.471415
## iter 360 value 11703.823626
## iter 370 value 11703.137776
## iter 380 value 11702.167481
## iter 390 value 11700.827529
## iter 400 value 11699.873236
## iter 410 value 11698.942583
## final  value 11698.629976 
## converged
predict3.8 <- predict(modello3.8, newdata = new_test_spf)

confusione3.8 <- confusionMatrix(as.factor(ifelse(predict3.8>0.3, 1, 0)), new_test_spf$result)
confusione3.8
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15425   951
##          1   581   401
##                                           
##                Accuracy : 0.9117          
##                  95% CI : (0.9074, 0.9159)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2976          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9637          
##             Specificity : 0.2966          
##          Pos Pred Value : 0.9419          
##          Neg Pred Value : 0.4084          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8886          
##    Detection Prevalence : 0.9434          
##       Balanced Accuracy : 0.6301          
##                                           
##        'Positive' Class : 0               
## 
roc3.8 <- roc(new_test_spf$result ~ as.numeric(predict3.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.8)), as.numeric(as.character(new_test_spf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict3.8)),
## as.numeric(as.character(new_test_spf$result))): 73 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.362365e-01  7.681183e-01  1.831183e-01  8.020757e-02  1.387388e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  6.358302e-02 -2.002981e-01  9.038563e-01  1.008896e-01  8.304276e-03 
##          Eavg           S:z           S:p 
##  5.422550e-03  1.609048e+00  1.076059e-01

Understat Result

confusione3.us <- confusionMatrix(as.factor(ifelse(test_spf$`xG Understat`>0.3, 1, 0)), test_spf$result)
confusione3.us
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15438   770
##          1   568   582
##                                           
##                Accuracy : 0.9229          
##                  95% CI : (0.9188, 0.9268)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 0.3524          
##                                           
##                   Kappa : 0.424           
##                                           
##  Mcnemar's Test P-Value : 3.907e-08       
##                                           
##             Sensitivity : 0.9645          
##             Specificity : 0.4305          
##          Pos Pred Value : 0.9525          
##          Neg Pred Value : 0.5061          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8894          
##    Detection Prevalence : 0.9337          
##       Balanced Accuracy : 0.6975          
##                                           
##        'Positive' Class : 0               
## 
roc3.us <- roc(test_spf$result ~ as.numeric(test_spf$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_spf$`xG Understat`)), as.numeric(as.character(test_spf$result)))
## Warning in rms::val.prob(as.numeric(as.character(test_spf$`xG Understat`)), :
## 13 observations deleted from logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.895407e-01  7.947703e-01  2.724341e-01  1.219437e-01  2.116114e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  5.332985e-04  1.125006e+01  3.606451e-03  1.214104e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  5.646719e-02 -1.819309e-01  9.393373e-01  4.516177e-02  6.395073e-03 
##          Eavg           S:z           S:p 
##  5.191630e-03 -5.115445e-02  9.592024e-01

Set Piece Head

set.seed(123)
index_sph <- sample.split(Y = setpiece_head$result, SplitRatio = 0.75)
train_sph <- setpiece_head[index_sph, ]

indice_test_sph <-  which(index_sph == FALSE)
test_sph <- setpiece_head[indice_test_sph, ] 

Logit

modello4.1 <- glm(result ~ minute + lastAction + X + exp(Y^2) + h_a, 
                  data = train_sph, family = binomial)

summary(modello4.1)
## 
## Call:
## glm(formula = result ~ minute + lastAction + X + exp(Y^2) + h_a, 
##     family = binomial, data = train_sph)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.420e+01  7.364e-01 -32.868  < 2e-16 ***
## minute                   -1.725e-03  7.315e-04  -2.358  0.01836 *  
## lastActionBallRecovery   -1.282e+01  4.131e+02  -0.031  0.97524    
## lastActionBallTouch      -1.418e-01  3.709e-01  -0.382  0.70231    
## lastActionBlockedPass    -1.300e+01  2.409e+02  -0.054  0.95697    
## lastActionCard           -1.281e+01  3.355e+02  -0.038  0.96954    
## lastActionChallenge      -1.291e+01  4.496e+02  -0.029  0.97708    
## lastActionChipped         6.573e-01  1.014e-01   6.481 9.14e-11 ***
## lastActionCornerAwarded  -1.219e+00  7.181e-01  -1.698  0.08954 .  
## lastActionCross           7.615e-01  4.764e-02  15.986  < 2e-16 ***
## lastActionDispossessed    7.975e-01  1.060e+00   0.753  0.45168    
## lastActionEnd            -4.132e-01  5.973e-01  -0.692  0.48913    
## lastActionFoul           -8.870e-01  5.903e-01  -1.502  0.13297    
## lastActionGoal           -1.269e+01  1.798e+02  -0.071  0.94375    
## lastActionHeadPass        1.108e+00  1.048e-01  10.566  < 2e-16 ***
## lastActionInterception   -1.269e+01  4.071e+02  -0.031  0.97513    
## lastActionNone            3.315e-01  1.422e-01   2.332  0.01970 *  
## lastActionOffsidePass    -1.249e+01  2.712e+02  -0.046  0.96326    
## lastActionPass            1.859e+00  2.146e-01   8.660  < 2e-16 ***
## lastActionRebound         1.199e+00  1.118e-01  10.724  < 2e-16 ***
## lastActionSubstitutionOn -1.292e+01  5.695e+02  -0.023  0.98190    
## lastActionTackle         -1.305e+01  3.234e+02  -0.040  0.96782    
## lastActionTakeOn         -1.262e+01  3.072e+02  -0.041  0.96722    
## X                         2.461e+01  7.397e-01  33.267  < 2e-16 ***
## exp(Y^2)                 -9.621e-01  2.060e-01  -4.671 2.99e-06 ***
## h_ah                     -1.052e-01  3.876e-02  -2.715  0.00663 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 21149  on 34346  degrees of freedom
## Residual deviance: 19194  on 34321  degrees of freedom
## AIC: 19246
## 
## Number of Fisher Scoring iterations: 14
predict4.1 <- predict.glm(modello4.1, newdata = test_sph, type = "response")

confusione4.1 <- confusionMatrix(as.factor(ifelse(predict4.1>0.3, 1, 0)), as.factor(test_sph$result)) 
confusione4.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10215   962
##          1   177    95
##                                           
##                Accuracy : 0.9005          
##                  95% CI : (0.8949, 0.9059)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 0.9958          
##                                           
##                   Kappa : 0.1093          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98297         
##             Specificity : 0.08988         
##          Pos Pred Value : 0.91393         
##          Neg Pred Value : 0.34926         
##              Prevalence : 0.90768         
##          Detection Rate : 0.89222         
##    Detection Prevalence : 0.97624         
##       Balanced Accuracy : 0.53642         
##                                           
##        'Positive' Class : 0               
## 
roc4.1 <- roc(test_sph$result ~ predict4.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.1)), as.numeric(as.character(test_sph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.251367e-01  7.125684e-01  1.037063e-01  4.876773e-02  5.593418e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  1.560155e-04  3.786222e+00  1.506026e-01  4.861172e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  7.906927e-02 -1.739121e-01  9.218147e-01  2.812030e-02  1.657528e-02 
##          Eavg           S:z           S:p 
##  5.413573e-03 -2.727723e-02  9.782386e-01

Linear Discriminant Analysis

set.seed(1234)

new_train_sph <- train_sph

new_train_sph$Y <- exp(new_train_sph$Y^2)

new_test_sph <- test_sph

new_test_sph$Y <- exp(new_test_sph$Y^2)

set.seed(1234)


modello4.3 <- lda(result ~ minute + lastAction + X + exp(Y^2) + h_a, data = train_sph)

summary(modello4.3)
##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   50     -none- numeric  
## scaling 25     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  2     -none- list
predict4.3.1 <- predict(modello4.3, newdata = test_sph)

predict4.3 <- predict4.3.1$posterior[, 2]

confusione4.3.1 <- confusionMatrix(as.factor(predict4.3.1$class), as.factor(test_sph$result))
confusione4.3.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10324  1011
##          1    68    46
##                                          
##                Accuracy : 0.9058         
##                  95% CI : (0.9003, 0.911)
##     No Information Rate : 0.9077         
##     P-Value [Acc > NIR] : 0.7668         
##                                          
##                   Kappa : 0.0617         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.99346        
##             Specificity : 0.04352        
##          Pos Pred Value : 0.91081        
##          Neg Pred Value : 0.40351        
##              Prevalence : 0.90768        
##          Detection Rate : 0.90174        
##    Detection Prevalence : 0.99004        
##       Balanced Accuracy : 0.51849        
##                                          
##        'Positive' Class : 0              
## 
confusione4.3 <- confusionMatrix(as.factor(ifelse(predict4.3>0.3, 1, 0)), as.factor(test_sph$result))

roc4.3 <- roc(test_sph$result ~ predict4.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.3)), as.numeric(as.character(test_sph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.176003e-01  7.088001e-01  9.620260e-02  4.515158e-02  5.179404e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  1.869583e-03  2.340486e+01  8.273686e-06  4.328199e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.005259e-02 -3.824800e-01  8.242660e-01  3.101720e-01  1.341532e-02 
##          Eavg           S:z           S:p 
##  1.179437e-02  1.354577e+00  1.755522e-01

Random Forest

set.seed(1234)

modello4.4 <- ranger(train_sph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_sph, num.trees = 500, mtry = 3, seed = 42, probability = T)

train_predict_4.4.1 <- predict(modello4.4, data = new_train_sph)
train_predict4.4 <- train_predict_4.4.1$predictions[, 2]

predict4.4.1 <- predict(modello4.4, data = new_test_sph)

predict4.4 <- predict4.4.1$predictions[, 2]

confusione4.4 <- confusionMatrix(as.factor(ifelse(predict4.4 >0.3, 1, 0)), as.factor(test_sph$result))
confusione4.4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9942  631
##          1  450  426
##                                           
##                Accuracy : 0.9056          
##                  95% CI : (0.9001, 0.9109)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 0.786           
##                                           
##                   Kappa : 0.3897          
##                                           
##  Mcnemar's Test P-Value : 4.383e-08       
##                                           
##             Sensitivity : 0.9567          
##             Specificity : 0.4030          
##          Pos Pred Value : 0.9403          
##          Neg Pred Value : 0.4863          
##              Prevalence : 0.9077          
##          Detection Rate : 0.8684          
##    Detection Prevalence : 0.9235          
##       Balanced Accuracy : 0.6799          
##                                           
##        'Positive' Class : 0               
## 
roc4.4 <- roc(test_sph$result ~ as.numeric(predict4.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.4)), as.numeric(as.character(new_test_sph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict4.4)),
## as.numeric(as.character(new_test_sph$result))): 869 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##    0.60666454    0.80333227    0.28605354    0.14680716 1554.21971840 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf    0.00000000          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##    0.06304306   -0.46515693    0.76715854    0.09687157    0.05778390 
##          Eavg           S:z           S:p 
##    0.02110086    0.94272703    0.34582054

Bagging

set.seed(1234)

modello4.5 <- ranger(train_sph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_sph, num.trees = 500, seed = 42, probability = T)

predict4.5.1 <- predict(modello4.5, data = new_test_sph)

predict4.5 <- predict4.5.1$predictions[, 2]

confusione4.5 <- confusionMatrix(as.factor(ifelse(predict4.5>0.3, 1, 0)), as.factor(new_test_sph$result))
confusione4.5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10042   658
##          1   350   399
##                                           
##                Accuracy : 0.912           
##                  95% CI : (0.9066, 0.9171)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 0.05793         
##                                           
##                   Kappa : 0.3956          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9663          
##             Specificity : 0.3775          
##          Pos Pred Value : 0.9385          
##          Neg Pred Value : 0.5327          
##              Prevalence : 0.9077          
##          Detection Rate : 0.8771          
##    Detection Prevalence : 0.9346          
##       Balanced Accuracy : 0.6719          
##                                           
##        'Positive' Class : 0               
## 
roc4.5 <- roc(new_test_sph$result ~ as.numeric(predict4.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.5)), as.numeric(as.character(new_test_sph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict4.5)),
## as.numeric(as.character(new_test_sph$result))): 32 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.444300e-01  8.222150e-01  3.168346e-01  1.575874e-01  1.800175e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  6.581974e-04  9.514640e+00  8.588598e-03  1.569292e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  6.248466e-02 -1.808876e-01  9.179875e-01  1.028428e-01  3.948918e-02 
##          Eavg           S:z           S:p 
##  1.451649e-02 -8.034238e-01  4.217298e-01

Neural Network

modello4.8 <- nnet(result ~ minute + lastAction + X + Y + h_a, data = new_train_sph, size = 10, maxit = 1000, linout = FALSE)
## # weights:  271
## initial  value 14444.509965 
## iter  10 value 10571.777439
## iter  20 value 10555.297209
## iter  30 value 10535.397340
## iter  40 value 10531.264920
## iter  50 value 10529.572248
## iter  60 value 10528.543519
## final  value 10528.519092 
## converged
predict4.8 <- predict(modello4.8, newdata = new_test_sph)

confusione4.8 <- confusionMatrix(as.factor(ifelse(predict4.8>0.3, 1, 0)), new_test_sph$result)
confusione4.8
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10390  1054
##          1     2     3
##                                          
##                Accuracy : 0.9078         
##                  95% CI : (0.9023, 0.913)
##     No Information Rate : 0.9077         
##     P-Value [Acc > NIR] : 0.4953         
##                                          
##                   Kappa : 0.0048         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.999808       
##             Specificity : 0.002838       
##          Pos Pred Value : 0.907899       
##          Neg Pred Value : 0.600000       
##              Prevalence : 0.907678       
##          Detection Rate : 0.907503       
##    Detection Prevalence : 0.999563       
##       Balanced Accuracy : 0.501323       
##                                          
##        'Positive' Class : 0              
## 
roc4.8 <- roc(new_test_sph$result ~ as.numeric(predict4.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.8)), as.numeric(as.character(new_test_sph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  1.874104e-02  5.093705e-01  2.306491e-03  9.736699e-04  1.214755e+01 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  6.033001e-05  2.690718e+00  2.604461e-01  9.133399e-04 
##         Brier     Intercept         Slope          Emax           E90 
##  8.366746e-02 -6.346217e-01  7.213535e-01  8.956164e-02  1.159593e-03 
##          Eavg           S:z           S:p 
##  2.291678e-03  9.299804e-02  9.259051e-01

Understat Result

confusione4.us <- confusionMatrix(as.factor(ifelse(test_sph$`xG Understat`>0.3, 1, 0)), test_sph$result)
confusione4.us
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9805  626
##          1  587  431
##                                           
##                Accuracy : 0.8941          
##                  95% CI : (0.8883, 0.8996)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.3572          
##                                           
##  Mcnemar's Test P-Value : 0.2752          
##                                           
##             Sensitivity : 0.9435          
##             Specificity : 0.4078          
##          Pos Pred Value : 0.9400          
##          Neg Pred Value : 0.4234          
##              Prevalence : 0.9077          
##          Detection Rate : 0.8564          
##    Detection Prevalence : 0.9111          
##       Balanced Accuracy : 0.6756          
##                                           
##        'Positive' Class : 0               
## 
roc4.us <- roc(test_sph$result ~ as.numeric(test_sph$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_sph$`xG Understat`)), as.numeric(as.character(test_sph$result)))
## Warning in rms::val.prob(as.numeric(as.character(test_sph$`xG Understat`)), : 2
## observations deleted from logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.820363e-01  7.910181e-01  2.272736e-01  1.102882e-01  1.263469e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  5.036564e-04  7.765355e+00  2.059561e-02  1.097845e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  7.092230e-02 -1.467926e-01  9.228842e-01  1.822378e-01  4.303519e-03 
##          Eavg           S:z           S:p 
##  4.759339e-03  1.399728e+00  1.615947e-01

Final Result

accuracy_opf <- data.frame(confusione1.us$overall["Accuracy"], confusione1.1$overall["Accuracy"], confusione1.2$overall["Accuracy"], confusione1.3$overall["Accuracy"], confusione1.4$overall["Accuracy"], confusione1.5$overall["Accuracy"], confusione1.8$overall["Accuracy"])
nomi_acc1 <- c("Understat", "Lineare", "Interazione", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_opf) <- nomi_acc1

auc_opf <- data.frame(roc1.us$auc, roc1.1$auc, roc1.2$auc, roc1.3$auc, roc1.4$auc, roc1.5$auc, roc1.8$auc)
names(auc_opf) <- nomi_acc1

acc_auc1 <- rbind(accuracy_opf, auc_opf)
rownames(acc_auc1) <- c("Accuracy", "AUC")
acc_auc1
##          Understat   Lineare Interazione Discriminante Random_Forest   Bagging
## Accuracy 0.8875595 0.8888058   0.8860917     0.8850532     0.9011161 0.9064058
## AUC      0.8145883 0.7468567   0.7495843     0.7447884     0.8374263 0.8392269
##          Neural_Net
## Accuracy  0.8885704
## AUC       0.7913828
accuracy_oph <- data.frame(confusione2.us$overall["Accuracy"], confusione2.1$overall["Accuracy"], confusione2.3$overall["Accuracy"], confusione2.4$overall["Accuracy"], confusione2.5$overall["Accuracy"], confusione2.8$overall["Accuracy"])
nomi_acc2 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_oph) <- nomi_acc2

auc_oph <- data.frame(roc1.us$auc, roc1.1$auc, roc1.3$auc, roc1.4$auc, roc1.5$auc, roc1.8$auc)
names(auc_oph) <- nomi_acc2

acc_auc2 <- rbind.data.frame(accuracy_oph, auc_oph)
rownames(acc_auc2) <- c("Accuracy", "AUC")
acc_auc2
##          Understat   Lineare Discriminante Random_Forest   Bagging Neural_Net
## Accuracy 0.8503896 0.8577922     0.8579221     0.8659740 0.8727273  0.8662338
## AUC      0.8145883 0.7468567     0.7447884     0.8374263 0.8392269  0.7913828
accuracy_spf <- data.frame(confusione3.us$overall["Accuracy"], confusione3.1$overall["Accuracy"], confusione3.3$overall["Accuracy"], confusione3.4$overall["Accuracy"], confusione3.5$overall["Accuracy"], confusione3.8$overall["Accuracy"])
nomi_acc3 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_spf) <- nomi_acc3

auc_spf <- data.frame(roc3.us$auc, roc3.1$auc, roc3.3$auc, roc3.4$auc, roc3.5$auc, roc3.8$auc)
names(auc_spf) <- nomi_acc3


acc_auc3 <- rbind.data.frame(accuracy_spf, auc_spf)
rownames(acc_auc3) <- c("Accuracy", "AUC")
acc_auc3
##          Understat   Lineare Discriminante Random_Forest   Bagging Neural_Net
## Accuracy 0.9229174 0.9138150     0.9033875     0.9306372 0.9324231  0.9117410
## AUC      0.7949585 0.7540138     0.7436472     0.8365386 0.8386365  0.7650791
accuracy_sph <- data.frame(confusione4.us$overall["Accuracy"], confusione4.1$overall["Accuracy"], confusione4.3$overall["Accuracy"], confusione4.4$overall["Accuracy"], confusione4.5$overall["Accuracy"], confusione4.8$overall["Accuracy"])
nomi_acc2 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_sph) <- nomi_acc2

auc_sph <- data.frame(roc4.us$auc, roc4.1$auc, roc4.3$auc, roc4.4$auc, roc4.5$auc, roc4.8$auc)
names(auc_sph) <- nomi_acc2

acc_auc4 <- rbind.data.frame(accuracy_sph, auc_sph)
rownames(acc_auc4) <- c("Accuracy", "AUC")
acc_auc4
##          Understat   Lineare Discriminante Random_Forest   Bagging Neural_Net
## Accuracy 0.8940519 0.9005153     0.8967595     0.9055813 0.9119574  0.9077649
## AUC      0.7910585 0.7125744     0.7087912     0.8190310 0.8227611  0.5094168

Final Plot

Messi’s Plot

test_opf_rf <- cbind(test_opf, predict1.4)
names(test_opf_rf)[names(test_opf_rf) == "predict1.4"] <- "xG_Random_Forest"
train_opf_rf <- cbind(train_opf, train_predict1.4)
names(train_opf_rf)[names(train_opf_rf) == "train_predict1.4"] <- "xG_Random_Forest"

messi_shots_opf_test <- test_opf_rf %>% filter(player == "Lionel Messi")
messi_shots_opf_train <- train_opf_rf %>% filter(player == "Lionel Messi")
messi_shots_opf <- bind_rows(messi_shots_opf_train, messi_shots_opf_test)

messi_shots_opf$shotType <- ifelse(messi_shots_opf$shotType == "LeftFoot", "Left", "Right")
messi_shots_opf <- messi_shots_opf %>%
  mutate(X = X * 100, Y = Y * 100)

messi_shots_opf <- messi_shots_opf %>% 
  mutate(result = factor(result, levels = c(0, 1), labels = c("No Goal", "Goal")),
         match_info = paste(h_team, "vs", a_team, "<br>", h_goals, "-", a_goals),
          date_formatted = format(as.Date(date), "%d-%m-%Y"))

generate_center_circle_arc <- function(center_x = 50, center_y = 50, r = 9.15, n_points = 100) {
  theta <- seq(-pi / 2, pi / 2, length.out = n_points)  # Modificato per metà arco, da -pi/2 a pi/2
  data.frame(
    x = center_x + r * cos(theta),  # Mantiene il centro sul bordo visibile del campo
    y = center_y + r * sin(theta)
  )
}

# Genera i punti della lunetta del centrocampo
center_circle_arc <- generate_center_circle_arc(center_x = 50, center_y = 50)

# Esempio di dataset con coordinate normalizzate (da 0 a 1)
p <- ggplot() +
  annotate_pitch(colour = "black", fill = "white") + # disegna il campo
  theme_pitch() + # tema predefinito per il campo
  theme(panel.background = element_rect(fill = "green4")) + # colore sfondo campo
  # Aggiungi la lunetta del centrocampo
  geom_path(data = center_circle_arc, aes(x = x, y = y), color = "black", size = 0.8) +  
  geom_point(data = messi_shots_opf, 
             aes(x = X, y = Y, 
                 color = factor(result), # colori differenti per goal e no goal
                 shape = factor(result), # forme differenti per goal e no goal
                 text = paste("Random Forest xG:", round(xG_Random_Forest, 4), 
                              "<br>X:", round(X, 2), 
                              "<br>Y:", round(Y, 2), 
                              "<br>Foot:", shotType,
                              "<br>", match_info,
                              "<br>", date_formatted,
                              "<br>Minute:", minute)),
             size = 2, stroke = 0.3) +
  scale_color_manual(values = c("No Goal" = "black", "Goal" = "red"),
                     name = "Esito Tiro") +
  scale_shape_manual(values = c("No Goal" = 1, "Goal" = 16),
                     name = "Esito Tiro") +
  labs(title = "<span style='font-size:16pt;'>Tiri effettuati da Lionel Messi in campionato</span><br><span style='font-size:12pt;'>di piede e in situazione di Open Play tra il 2014 e il 2021", x = " ", y = " ") +
  theme_minimal() +
  coord_cartesian(xlim = c(50, 100))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in geom_point(data = messi_shots_opf, aes(x = X, y = Y, color =
## factor(result), : Ignoring unknown aesthetics: text
p_interactive <- ggplotly(p, tooltip = "text")
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
p_interactive

Serie A’s Plot

test_spf_rf <- cbind(test_spf, predict3.4)
names(test_spf_rf)[names(test_spf_rf) == "predict3.4"] <- "xG_Random_Forest"
test_sph_rf <- cbind(test_sph, predict4.4)
names(test_sph_rf)[names(test_sph_rf) == "predict4.4"] <- "xG_Random_Forest"
train_spf_rf <- cbind(new_train_spf, train_predict3.4)
names(test_spf_rf)[names(test_spf_rf) == "train_predict3.4"] <- "xG_Random_Forest"
train_sph_rf <- cbind(new_train_sph, train_predict4.4)
names(test_sph_rf)[names(test_sph_rf) == "train_predict4.4"] <- "xG_Random_Forest"

seriea_corner_spf_test <- test_spf_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")

seriea_corner_sph_test <- test_sph_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")

seriea_corner_spf_train <- train_spf_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")

seriea_corner_sph_train <- train_sph_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")  


seriea_corner <- bind_rows(seriea_corner_spf_test, seriea_corner_sph_test, seriea_corner_spf_train, seriea_corner_sph_train)
seriea_corner$foot <- ifelse(seriea_corner$shotType %in% c("RightFoot", "LeftFoot"), 
                             "foot", 
                             "head")

seriea_corner <- seriea_corner %>%
  mutate(team_name = ifelse(h_a == "h", h_team, a_team),
         match_info = paste(h_team, "vs", a_team, "<br>", h_goals, "-", a_goals),
         date_formatted = format(as.Date(date), "%d-%m-%Y"),
         result = factor(result, levels = c(0, 1), labels = c("No Goal", "Goal")))

seriea_corner <- seriea_corner %>%
  mutate(X = X * 100, Y = Y * 100)

seriea_corner_20_21 <- seriea_corner %>%
  filter(season == "2020")
seriea_corner_19_21 <- seriea_corner %>%
  filter(season %in% c("2019", "2020"))

generate_center_circle_arc <- function(center_x = 50, center_y = 50, r = 9.15, n_points = 100) {
  theta <- seq(-pi / 2, pi / 2, length.out = n_points)  # Modificato per metà arco, da -pi/2 a pi/2
  data.frame(
    x = center_x + r * cos(theta),  # Mantiene il centro sul bordo visibile del campo
    y = center_y + r * sin(theta)
  )
}

# Genera i punti della lunetta del centrocampo
center_circle_arc <- generate_center_circle_arc(center_x = 50, center_y = 50)

# Esempio di dataset con coordinate normalizzate (da 0 a 1)
p_seriea <- ggplot() +
  annotate_pitch(colour = "black", fill = "white") + # disegna il campo
  theme_pitch() + # tema predefinito per il campo
  theme(panel.background = element_rect(fill = "green4")) + # colore sfondo campo
  # Aggiungi la lunetta del centrocampo
  geom_path(data = center_circle_arc, aes(x = x, y = y), color = "black", size = 0.8) +  
  geom_point(data = seriea_corner_19_21, 
             aes(x = X, y = Y, 
                 color = factor(result), # colori differenti per goal e no goal
                 shape = factor(result), # forme differenti per goal e no goal
                 text = paste("Random Forest xG:", round(xG_Random_Forest, 4), 
                              "<br>X:", round(X, 2), 
                              "<br>Y:", round(Y, 2), 
                              "<br>Player:", player,
                              "<br>Type:", shotType,
                              "<br>Assist:", player_assisted,
                              "<br>Team:", team_name,
                              "<br>", match_info,
                              "<br>", date_formatted,
                              "<br>Minute:", minute)),
             size = 2, stroke = 0.3) +
  scale_color_manual(values = c("No Goal" = "black", "Goal" = "red"),
                     name = "Esito Tiro") +
  scale_shape_manual(values = c("No Goal" = 1, "Goal" = 16),
                     name = "Esito Tiro") +
  labs(title = "<span style='font-size:16pt;'>Tiri effettuati in Serie A da calcio d'angolo</span><br><span style='font-size:12pt;'>nelle stagioni 2019/20 e 2020/21</span>", x = " ", y = " ") +
  theme_minimal() +
  coord_cartesian(xlim = c(50, 100))
## Warning in geom_point(data = seriea_corner_19_21, aes(x = X, y = Y, color =
## factor(result), : Ignoring unknown aesthetics: text
p_interactive_seriea <- ggplotly(p_seriea, tooltip = "text")
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
p_interactive_seriea

xG’s Heatmap

tot_opf_rf <- bind_rows(train_opf_rf, test_opf_rf)
tot_opf_rf <- tot_opf_rf %>%
  mutate(X = X * 100, Y = Y * 100)

p_heatmap_custom <- ggplot(tot_opf_rf, aes(x = X, y = Y)) +
  annotate_pitch(colour = "black", fill = "white") + 
  theme_pitch() + 
  theme(panel.background = element_rect(fill = "lightblue")) +
  
  # Usa stat_summary_2d invece di geom_tile, per calcolare densità in ogni area
  stat_summary_2d(aes(z = xG_Random_Forest), fun = mean, bins = 65, alpha = 0.8) + 
  scale_fill_viridis(option = "plasma", name = "xG Medio", direction = -1, 
                     breaks = pretty_breaks(n = 5), 
                     limits = c(0, max(tot_opf_rf$xG_Random_Forest, na.rm = TRUE))) + 
  
  # Migliora i dettagli del grafico
  labs(
    title = "Heat Map degli Expected Goals (xG)\nTiri Effettuati in Situazione di Open Play",
    x = NULL, y = NULL
  ) +
  
  # Coordinata per mostrare solo la metà campo
  coord_cartesian(xlim = c(50, 100), ylim = c(0, 100)) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),    # Nascondi i valori degli assi
    axis.ticks = element_blank(),   # Nascondi i ticks degli assi
    panel.grid = element_blank(),   # Rimuovi le griglie
    legend.position = "bottom",     # Posiziona la legenda in basso
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)  # Centra il titolo e lo rende più visibile
  )

# Visualizza il grafico
p_heatmap_custom