library(tools)
library(stringi)
library(gtsummary)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(predtools)
library(magrittr)
library(probably)
##
## Attaching package: 'probably'
## The following objects are masked from 'package:base':
##
## as.factor, as.ordered
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tibble 3.2.1
## ✔ dplyr 1.1.4 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ yardstick::precision() masks caret::precision()
## ✖ yardstick::recall() masks caret::recall()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(caTools)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:gtsummary':
##
## select
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
##
## tune
## The following object is masked from 'package:rsample':
##
## permutations
## The following object is masked from 'package:parsnip':
##
## tune
library(nnet)
library(dplyr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggsoccer)
library(ggtext)
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:parsnip':
##
## translate
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
shots_dataset <- read.csv("~/Cose Mie/shots_dataset.csv")
FullData <- read.csv("~/Cose Mie/FullData.csv")
players <- read.csv("~/Cose Mie/players.csv")
new.players <- read.csv("~/Cose Mie/new.players.csv")
kable(head(shots_dataset, 3), "html") %>%
kable_styling(full_width = T)
id | minute | result | X | Y | xG | player | h_a | player_id | situation | season | shotType | match_id | h_team | a_team | h_goals | a_goals | date | player_assisted | lastAction |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
445975 | 0 | MissedShots | 0.759 | 0.644 | 0.0186014 | Jorrit Hendrix | h | 9392 | OpenPlay | 2021 | LeftFoot | 16265 | Spartak Moscow | FK Akhmat | 2 | 1 | 2021-12-04 14:00:00 | Quincy Promes | Pass |
445976 | 6 | MissedShots | 0.738 | 0.352 | 0.0136572 | Jorrit Hendrix | h | 9392 | FromCorner | 2021 | LeftFoot | 16265 | Spartak Moscow | FK Akhmat | 2 | 1 | 2021-12-04 14:00:00 | None | |
445977 | 9 | SavedShot | 0.732 | 0.250 | 0.0310316 | Quincy Promes | h | 2815 | DirectFreekick | 2021 | RightFoot | 16265 | Spartak Moscow | FK Akhmat | 2 | 1 | 2021-12-04 14:00:00 | Standard |
shots_dataset$player <- stri_trans_general(shots_dataset$player, "Latin-ASCII")
giocatori <- as.data.frame(unique(shots_dataset$player))
kable(head(giocatori, 3), "html") %>%
kable_styling(full_width = T)
unique(shots_dataset$player) |
---|
Jorrit Hendrix |
Quincy Promes |
Victor Moses |
names(giocatori)[names(giocatori) == "unique(shots_dataset$player)"] <- "Name"
foot_dataset <- FullData[, c("Name", "Preffered_Foot")]
foot_dataset$Name <- stri_trans_general(foot_dataset$Name, "Latin-ASCII")
kable(head(foot_dataset, 3), "html") %>%
kable_styling(full_width = T)
Name | Preffered_Foot |
---|---|
Cristiano Ronaldo | Right |
Lionel Messi | Left |
Neymar | Right |
giocatori2 <- merge(giocatori, foot_dataset, by = "Name", all.x = TRUE)
kable(head(giocatori2, 3), "html") %>%
kable_styling(full_width = T)
Name | Preffered_Foot |
---|---|
Aaron Connolly | NA |
Aaron Cresswell | Left |
Aaron Hickey | NA |
colSums((is.na(giocatori2)))
## Name Preffered_Foot
## 0 2499
player2 <- players[, c("name", "foot")]
kable(head(player2, 3), "html") %>%
kable_styling(full_width = T)
name | foot |
---|---|
Timo Hildebrand | |
Martin Petrov | |
Martin Amedick |
player2 <- na.omit(player2)
player2$name <- stri_trans_general(player2$name, "Latin-ASCII")
names(player2)[names(player2) == "name"] <- "Name"
giocatori3 <- merge(giocatori2, player2, by = "Name", all.x = TRUE)
kable(head(giocatori3, 3), "html") %>%
kable_styling(full_width = T)
Name | Preffered_Foot | foot |
---|---|---|
Aaron Connolly | NA | right |
Aaron Cresswell | Left | left |
Aaron Hickey | NA | both |
giocatori3$Preffered_Foot <- ifelse(is.na(giocatori3$Preffered_Foot), giocatori3$foot, giocatori3$Preffered_Foot)
colSums(is.na(giocatori3))
## Name Preffered_Foot foot
## 0 514 651
new.players <- new.players[, c("name", "foot")]
##
nomi_da_cambiare <- c("Andrej Galabinov", "Anssumane Fati", "Arnaud Kalimuendo Muinga", "Daniel Parejo", "Emile Smith-Rowe", "Kephren Thuram", "Kylian Mbappe-Lottin", "Marco Faraoni", "Nemanja Vidci", "Pablo Daniel Osvaldo", "Luca Toni", "Tanguy NDombele Alvaro", "Thiago Alcantara", "Yeremi Pino", "Papis Demba Cisse", "Kevin Kuranyi", "Dimitar Berbatov")
giocatori3 <- giocatori3 %>%
mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare, "Right", Preffered_Foot))
giocatori3 <- subset(giocatori3, select = -c(foot))
nomi_da_cambiare2 <- c("Dimitri Kombarov", "Lee Kang-In", "Mohammed Ali-Cho", "Yaroslav Rakitskiy", "Pape Alassane Gueye")
giocatori3 <- giocatori3 %>%
mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare2, "Left", Preffered_Foot))
nomi_da_cambiare3 <- c("Kostas Mitroglu", "Santiago Cazorla", "Son Heung-Min")
giocatori3 <- giocatori3 %>%
mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare3, "both", Preffered_Foot))
giocatori3 <- giocatori3 %>%
mutate(across(everything(), na_if, ""))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(everything(), na_if, "")`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
giocatori3$Preffered_Foot <- toTitleCase(giocatori3$Preffered_Foot)
giocatori3$Preffered_Foot <- ifelse(giocatori3$Preffered_Foot == "Right", "RightFoot", ifelse(giocatori3$Preffered_Foot == "Left", "LeftFoot", giocatori3$Preffered_Foot))
names(giocatori3)[names(giocatori3) == "Name"] <- "player"
shots_dataset2 <- merge(shots_dataset, giocatori3, by = "player", all.x = TRUE)
shots_dataset2 <- shots_dataset2 %>%
mutate(Preffered_Foot = ifelse(player == "Franck Zambo", "Right", Preffered_Foot))
shots_dataset2 <- shots_dataset2[shots_dataset2$result != "OwnGoal", ]
shots_dataset2$result <- ifelse(shots_dataset2$result == "Goal", 1, 0)
shots_dataset2$result <- as.factor(shots_dataset2$result)
shots_dataset2 <- na.omit(shots_dataset2)
names(shots_dataset2)[names(shots_dataset2) == "xG"] <- "xG Understat"
kable(head(shots_dataset2, 3), "html") %>%
kable_styling(full_width = T)
player | id | minute | result | X | Y | xG Understat | h_a | player_id | situation | season | shotType | match_id | h_team | a_team | h_goals | a_goals | date | player_assisted | lastAction | Preffered_Foot |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Aaron Connolly | 427269 | 51 | 0 | 0.913 | 0.536 | 0.5629963 | h | 7991 | OpenPlay | 2021 | LeftFoot | 16391 | Brighton | Watford | 2 | 0 | 2021-08-21 16:30:00 | None | RightFoot | |
Aaron Connolly | 329180 | 49 | 0 | 0.944 | 0.383 | 0.0140796 | h | 7991 | FromCorner | 2019 | Head | 11746 | Brighton | Norwich | 2 | 0 | 2019-11-02 15:00:00 | Pascal Groß | Aerial | RightFoot |
Aaron Connolly | 338614 | 91 | 0 | 0.797 | 0.377 | 0.0308471 | h | 7991 | OpenPlay | 2019 | RightFoot | 11815 | Brighton | Sheffield United | 0 | 1 | 2019-12-21 15:00:00 | None | RightFoot |
premier_league <- c("Watford", "Norwich", "Sheffield United", "Brighton", "Southampton", "Chelsea", "Liverpool", "Manchester United", "Wolverhampton Wanderers", "Tottenham", "Aston Villa", "Burnley", "Swansea", "West Ham", "Leeds", "Crystal Palace", "Manchester City", "Newcastle United", "Fulham", "Hull", "Arsenal", "Bournemouth", "West Bromwich Albion", "Stoke", "Sunderland", "Huddersfield", "Everton", "Leicester", "Cardiff", "Middlesbrough", "Queens Park Rangers", "Brentford")
seriea <- c("Bologna", "Genoa", "Salernitana", "AC Milan", "Fiorentina", "Cagliari", "Lazio", "Juventus", "Parma Calcio 1913", "Crotone", "SPAL 2013", "Sassuolo", "Lecce", "Udinese", "Roma", "Torino", "Inter", "Brescia", "Verona", "Parma", "Palermo", "Napoli", "Carpi", "Frosinone", "Venezia", "Sampdoria", "Benevento", "Spezia", "Empoli", "Chievo", "Atalanta", "Cesena", "Pescara")
bundesliga <- c("Hamburger SV", "Wolfsburg", "Eintracht Frankfurt", "RasenBallsport Leipzig", "Freiburg", "Bayern Munich", "Bayer Leverkusen", "Schalke 04", "Hertha Berlin", "Werder Bremen", "VfB Stuttgart", "Darmstadt", "Bochum", "Mainz 05", "Hannover 96", "Arminia Bielefeld", "Fortuna Duesseldorf", "Augsburg", "Hoffenheim", "Borussia M.Gladbach", "Paderborn", "FC Cologne", "Borussia Dortmund", "Greuther Fuerth", "Nuernberg", "Ingolstadt", "Union Berlin")
ligue1 <- c("Toulouse", "Marseille", "Nice", "Bordeaux", "Saint-Etienne", "Lille", "Strasbourg", "Guingamp", "Metz", "Brest", "Lyon", "Montpellier", "Reims", "Dijon", "Nantes", "Angers", "Rennes", "SC Bastia", "Paris Saint Germain", "Monaco", "Caen", "GFC Ajaccio", "Nimes", "Lorient", "Amiens", "Lens", "Troyes", "Evian Thonon Gaillard", "Nancy", "Clermont Foot")
liga <- c("Real Betis", "Celta Vigo", "Alaves", "Levante", "Elche", "Granada", "Deportivo La Coruna", "Villarreal", "Cordoba", "Athletic Club", "Rayo Vallecano", "Mallorca", "Real Valladolid", "Real Sociedad", "Sevilla", "Atletico Madrid", "Real Madrid", "Eibar", "Getafe", "Barcelona", "Malaga", "Girona", "Las Palmas", "Valencia", "SD Huesca", "Sporting Gijon", "Osasuna", "Cadiz", "Leganes", "Almeria")
shots_dataset2$lega <- ifelse(shots_dataset2$h_team %in% premier_league | shots_dataset2$a_team %in% premier_league, "Premier League",
ifelse(shots_dataset2$h_team %in% seriea | shots_dataset2$a_team %in% seriea, "Serie A",
ifelse(shots_dataset2$h_team %in% bundesliga | shots_dataset2$a_team %in% bundesliga, "Bundesliga",
ifelse(shots_dataset2$h_team %in% ligue1 | shots_dataset2$a_team %in% ligue1, "Ligue 1",
ifelse(shots_dataset2$h_team %in% liga | shots_dataset2$a_team %in% liga, "Liga", "Russian PL")))))
subset_penalty <- subset(shots_dataset2, situation == "Penalty")
subset_openplay <- subset(shots_dataset2, situation == "OpenPlay")
subset_setpiece <- subset(shots_dataset2, situation %in% c("SetPiece", "FromCorner", "DirectFreekick"))
kable(head(subset_openplay, 3), "html") %>%
kable_styling(full_width = T)
player | id | minute | result | X | Y | xG Understat | h_a | player_id | situation | season | shotType | match_id | h_team | a_team | h_goals | a_goals | date | player_assisted | lastAction | Preffered_Foot | lega | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Aaron Connolly | 427269 | 51 | 0 | 0.913 | 0.536 | 0.5629963 | h | 7991 | OpenPlay | 2021 | LeftFoot | 16391 | Brighton | Watford | 2 | 0 | 2021-08-21 16:30:00 | None | RightFoot | Premier League | |
3 | Aaron Connolly | 338614 | 91 | 0 | 0.797 | 0.377 | 0.0308471 | h | 7991 | OpenPlay | 2019 | RightFoot | 11815 | Brighton | Sheffield United | 0 | 1 | 2019-12-21 15:00:00 | None | RightFoot | Premier League | |
4 | Aaron Connolly | 403235 | 49 | 0 | 0.847 | 0.382 | 0.1247877 | a | 7991 | OpenPlay | 2020 | RightFoot | 14656 | Burnley | Brighton | 1 | 1 | 2021-02-06 15:00:00 | Yves Bissouma | Tackle | RightFoot | Premier League |
openplay_foot <- subset(subset_openplay, shotType %in% c("LeftFoot", "RightFoot"))
openplay_head <- subset(subset_openplay, shotType == "Head")
setpiece_foot <- subset(subset_setpiece, shotType %in% c("LeftFoot", "RightFoot"))
setpiece_head <- subset(subset_setpiece, shotType == "Head")
tbl_summary(openplay_foot, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic | N = 288,8871 |
---|---|
minute | 49 (26, 71) |
result | |
    0 | 260,327 (90%) |
    1 | 28,560 (9.9%) |
X | 0.85 (0.77, 0.89) |
Y | 0.50 (0.39, 0.62) |
shotType | |
    LeftFoot | 106,100 (37%) |
    RightFoot | 182,787 (63%) |
lastAction | |
    Aerial | 3,641 (1.3%) |
    BallRecovery | 9,647 (3.3%) |
    BallTouch | 5,416 (1.9%) |
    BlockedPass | 472 (0.2%) |
    Card | 27 (<0.1%) |
    Challenge | 29 (<0.1%) |
    ChanceMissed | 2 (<0.1%) |
    Chipped | 15,477 (5.4%) |
    Clearance | 47 (<0.1%) |
    CornerAwarded | 197 (<0.1%) |
    Cross | 16,612 (5.8%) |
    Dispossessed | 2,383 (0.8%) |
    End | 104 (<0.1%) |
    Error | 3 (<0.1%) |
    FormationChange | 12 (<0.1%) |
    Foul | 272 (<0.1%) |
    Goal | 143 (<0.1%) |
    GoodSkill | 69 (<0.1%) |
    HeadPass | 6,590 (2.3%) |
    Interception | 526 (0.2%) |
    KeeperPickup | 7 (<0.1%) |
    KeeperSweeper | 1 (<0.1%) |
    LayOff | 3,836 (1.3%) |
    None | 32,994 (11%) |
    OffsidePass | 81 (<0.1%) |
    OffsideProvoked | 7 (<0.1%) |
    Pass | 148,079 (51%) |
    Punch | 1 (<0.1%) |
    Rebound | 11,636 (4.0%) |
    Save | 13 (<0.1%) |
    ShieldBallOpp | 2 (<0.1%) |
    Smother | 1 (<0.1%) |
    Start | 10 (<0.1%) |
    SubstitutionOn | 36 (<0.1%) |
    Tackle | 969 (0.3%) |
    TakeOn | 21,584 (7.5%) |
    Throughball | 7,961 (2.8%) |
1 Median (Q1, Q3); n (%) |
tbl_summary(openplay_head, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic | N = 30,8551 |
---|---|
minute | 49 (26, 73) |
result | |
    0 | 27,181 (88%) |
    1 | 3,674 (12%) |
X | 0.911 (0.890, 0.929) |
Y | 0.50 (0.46, 0.55) |
shotType | |
    Head | 30,855 (100%) |
lastAction | |
    Aerial | 9,053 (29%) |
    BallRecovery | 8 (<0.1%) |
    BallTouch | 65 (0.2%) |
    BlockedPass | 7 (<0.1%) |
    Card | 6 (<0.1%) |
    Challenge | 10 (<0.1%) |
    Chipped | 2,743 (8.9%) |
    Clearance | 2 (<0.1%) |
    CornerAwarded | 59 (0.2%) |
    Cross | 16,320 (53%) |
    Dispossessed | 3 (<0.1%) |
    End | 37 (0.1%) |
    Error | 2 (<0.1%) |
    FormationChange | 1 (<0.1%) |
    Foul | 67 (0.2%) |
    Goal | 26 (<0.1%) |
    HeadPass | 392 (1.3%) |
    Interception | 7 (<0.1%) |
    KeeperPickup | 3 (<0.1%) |
    LayOff | 6 (<0.1%) |
    None | 846 (2.7%) |
    OffsidePass | 7 (<0.1%) |
    Pass | 315 (1.0%) |
    Rebound | 735 (2.4%) |
    Save | 1 (<0.1%) |
    SubstitutionOff | 1 (<0.1%) |
    SubstitutionOn | 11 (<0.1%) |
    Tackle | 12 (<0.1%) |
    TakeOn | 21 (<0.1%) |
    Throughball | 89 (0.3%) |
1 Median (Q1, Q3); n (%) |
tbl_summary(setpiece_foot, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic | N = 69,4751 |
---|---|
minute | 49 (26, 71) |
result | |
    0 | 64,065 (92%) |
    1 | 5,410 (7.8%) |
X | 0.80 (0.75, 0.89) |
Y | 0.51 (0.41, 0.61) |
shotType | |
    LeftFoot | 24,145 (35%) |
    RightFoot | 45,330 (65%) |
lastAction | |
    Aerial | 3,397 (4.9%) |
    BallRecovery | 65 (<0.1%) |
    BallTouch | 1,316 (1.9%) |
    BlockedPass | 24 (<0.1%) |
    Card | 14 (<0.1%) |
    Challenge | 9 (<0.1%) |
    Chipped | 1,528 (2.2%) |
    Clearance | 5 (<0.1%) |
    CornerAwarded | 154 (0.2%) |
    Cross | 5,988 (8.6%) |
    Dispossessed | 118 (0.2%) |
    End | 103 (0.1%) |
    Error | 2 (<0.1%) |
    FormationChange | 3 (<0.1%) |
    Foul | 139 (0.2%) |
    Goal | 55 (<0.1%) |
    GoodSkill | 3 (<0.1%) |
    HeadPass | 2,736 (3.9%) |
    Interception | 47 (<0.1%) |
    KeeperPickup | 4 (<0.1%) |
    KeeperSweeper | 1 (<0.1%) |
    LayOff | 182 (0.3%) |
    None | 16,313 (23%) |
    OffsidePass | 9 (<0.1%) |
    OffsideProvoked | 1 (<0.1%) |
    Pass | 9,535 (14%) |
    Rebound | 5,585 (8.0%) |
    Save | 5 (<0.1%) |
    Standard | 20,936 (30%) |
    Start | 1 (<0.1%) |
    SubstitutionOn | 7 (<0.1%) |
    Tackle | 93 (0.1%) |
    TakeOn | 987 (1.4%) |
    Throughball | 110 (0.2%) |
1 Median (Q1, Q3); n (%) |
tbl_summary(setpiece_head, include = c(minute, result, X, Y, shotType, lastAction))
Characteristic | N = 45,8331 |
---|---|
minute | 49 (26, 71) |
result | |
    0 | 41,601 (91%) |
    1 | 4,232 (9.2%) |
X | 0.917 (0.899, 0.934) |
Y | 0.50 (0.46, 0.55) |
shotType | |
    Head | 45,833 (100%) |
lastAction | |
    Aerial | 16,390 (36%) |
    BallRecovery | 13 (<0.1%) |
    BallTouch | 157 (0.3%) |
    BlockedPass | 42 (<0.1%) |
    Card | 23 (<0.1%) |
    Challenge | 16 (<0.1%) |
    Chipped | 2,327 (5.1%) |
    Clearance | 9 (<0.1%) |
    CornerAwarded | 146 (0.3%) |
    Cross | 23,526 (51%) |
    Dispossessed | 16 (<0.1%) |
    End | 98 (0.2%) |
    Error | 1 (<0.1%) |
    FormationChange | 2 (<0.1%) |
    Foul | 160 (0.3%) |
    Goal | 93 (0.2%) |
    HeadPass | 918 (2.0%) |
    Interception | 16 (<0.1%) |
    KeeperPickup | 4 (<0.1%) |
    LayOff | 6 (<0.1%) |
    None | 894 (2.0%) |
    OffsidePass | 35 (<0.1%) |
    OffsideProvoked | 2 (<0.1%) |
    Pass | 162 (0.4%) |
    Rebound | 710 (1.5%) |
    Save | 2 (<0.1%) |
    ShieldBallOpp | 3 (<0.1%) |
    Standard | 2 (<0.1%) |
    Start | 1 (<0.1%) |
    SubstitutionOn | 9 (<0.1%) |
    Tackle | 20 (<0.1%) |
    TakeOn | 25 (<0.1%) |
    Throughball | 5 (<0.1%) |
1 Median (Q1, Q3); n (%) |
Removing last action <10
openplay_foot <- subset(openplay_foot, !lastAction %in% c("ChanceMissed", "Error", "KeeperPickup", "KeeperSweeper", "OffsideProvoked", "Punch", "ShieldBallOpp", "Smother"))
openplay_head <- subset(openplay_head, !lastAction %in% c("BallRecovery", "BlockedPass", "Card", "Clearance", "Dispossessed", "Error", "FormationChange", "Interception", "KeeperPickup", "LayOff", "OffsidePass", "Save", "SubstiotutionOff"))
setpiece_foot <- subset(setpiece_foot, !lastAction %in% c("Challenge", "Clearance", "Error", "FormationChange", "KeeperPickup", "KeeperSweeper", "OffsidePass", "OffsideProvoked", "Save", "Start", "SubstiotutionOn"))
setpiece_head <- subset(setpiece_head, !lastAction %in% c("Clearance", "Error", "FormationChange", "KeeperPickup", "LayOff", "OffsideProvoked", "Save", "ShieldBallOpp", "Standard", "Start", "SubstiotutionOn", "Throughball"))
Introduction of is_weakfoot variable
openplay_foot$is_weakfoot <- ifelse(openplay_foot$Preffered_Foot == openplay_foot$shotType, "No", ifelse(openplay_foot$Preffered_Foot == "both", "No", "Yes"))
setpiece_foot$is_weakfoot <- ifelse(setpiece_foot$Preffered_Foot == setpiece_foot$shotType, "No", ifelse(setpiece_foot$Preffered_Foot == "both", "No", "Yes"))
understat_xG_op_foot <- openplay_foot$xG
understat_xG_op_head <- openplay_head$xG
understat_xG_sp_foot <- setpiece_foot$xG
understat_xG_sp_head <- setpiece_head$xG
kable(head(openplay_foot, 3), "html") %>%
kable_styling(full_width = T)
player | id | minute | result | X | Y | xG Understat | h_a | player_id | situation | season | shotType | match_id | h_team | a_team | h_goals | a_goals | date | player_assisted | lastAction | Preffered_Foot | lega | is_weakfoot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Aaron Connolly | 427269 | 51 | 0 | 0.913 | 0.536 | 0.5629963 | h | 7991 | OpenPlay | 2021 | LeftFoot | 16391 | Brighton | Watford | 2 | 0 | 2021-08-21 16:30:00 | None | RightFoot | Premier League | Yes | |
3 | Aaron Connolly | 338614 | 91 | 0 | 0.797 | 0.377 | 0.0308471 | h | 7991 | OpenPlay | 2019 | RightFoot | 11815 | Brighton | Sheffield United | 0 | 1 | 2019-12-21 15:00:00 | None | RightFoot | Premier League | No | |
4 | Aaron Connolly | 403235 | 49 | 0 | 0.847 | 0.382 | 0.1247877 | a | 7991 | OpenPlay | 2020 | RightFoot | 14656 | Burnley | Brighton | 1 | 1 | 2021-02-06 15:00:00 | Yves Bissouma | Tackle | RightFoot | Premier League | No |
For better interpretation, this is how I refer to the different models:
For shot type, the numbering is as follows:
For modeling, the numbering is as follows: 1. Logit 2. Logit with interactions 3. Discriminant Analysis 4. Random Forest 5. Bagging 8. Neural Network
For example model1.5 is bagging model for Open Play Foot
Note: For brevity, the stepwise procedure is not shown in this report. Where there is no model with interactions, it is because there is no significant improvement that would justify such a model, which is very costly in terms of computational time.
set.seed(123)
index_opf <- sample.split(Y = openplay_foot$result, SplitRatio = 0.75)
train_opf <- openplay_foot[index_opf, ]
indice_test_opf <- which(index_opf == FALSE)
test_opf <- openplay_foot[indice_test_opf, ]
modello1.1 <- glm(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a,
data = train_opf, family = binomial)
summary(modello1.1)
##
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X +
## exp(Y^2) + h_a, family = binomial, data = train_opf)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.327e+01 1.594e-01 -83.287 < 2e-16 ***
## minute 1.444e-03 2.824e-04 5.113 3.16e-07 ***
## lastActionBallRecovery 8.943e-01 1.114e-01 8.029 9.85e-16 ***
## lastActionBallTouch 4.734e-01 1.169e-01 4.049 5.13e-05 ***
## lastActionBlockedPass 6.401e-01 2.185e-01 2.929 0.003400 **
## lastActionCard 1.393e-01 7.542e-01 0.185 0.853428
## lastActionChallenge 4.390e-01 1.049e+00 0.418 0.675667
## lastActionChipped 3.860e-01 1.055e-01 3.660 0.000253 ***
## lastActionClearance 1.148e-01 7.561e-01 0.152 0.879348
## lastActionCornerAwarded -2.692e+00 1.010e+00 -2.666 0.007675 **
## lastActionCross 4.258e-01 1.040e-01 4.095 4.23e-05 ***
## lastActionDispossessed 5.274e-01 1.384e-01 3.811 0.000139 ***
## lastActionEnd -2.009e+00 1.015e+00 -1.978 0.047884 *
## lastActionFormationChange 8.962e-01 1.093e+00 0.820 0.412368
## lastActionFoul 3.885e-03 2.842e-01 0.014 0.989093
## lastActionGoal -1.739e+00 7.240e-01 -2.402 0.016307 *
## lastActionGoodSkill 1.241e+00 4.443e-01 2.793 0.005219 **
## lastActionHeadPass 4.361e-01 1.135e-01 3.842 0.000122 ***
## lastActionInterception 7.496e-01 2.350e-01 3.190 0.001422 **
## lastActionLayOff 6.903e-01 1.306e-01 5.285 1.26e-07 ***
## lastActionNone 5.927e-01 1.031e-01 5.747 9.08e-09 ***
## lastActionOffsidePass -2.190e-02 5.418e-01 -0.040 0.967758
## lastActionPass 6.951e-01 1.010e-01 6.884 5.83e-12 ***
## lastActionRebound 1.110e+00 1.042e-01 10.652 < 2e-16 ***
## lastActionSave -7.818e+00 3.653e+01 -0.214 0.830549
## lastActionStart 2.035e+00 1.197e+00 1.699 0.089265 .
## lastActionSubstitutionOn 1.017e+00 5.857e-01 1.736 0.082476 .
## lastActionTackle 7.795e-01 1.732e-01 4.501 6.76e-06 ***
## lastActionTakeOn 7.280e-01 1.039e-01 7.009 2.40e-12 ***
## lastActionThroughball 1.488e+00 1.047e-01 14.216 < 2e-16 ***
## is_weakfootYes -1.602e-01 1.742e-02 -9.194 < 2e-16 ***
## X 1.291e+01 1.289e-01 100.099 < 2e-16 ***
## exp(Y^2) -5.834e-01 4.082e-02 -14.294 < 2e-16 ***
## h_ah 2.449e-02 1.511e-02 1.621 0.105116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 139769 on 216646 degrees of freedom
## Residual deviance: 123770 on 216613 degrees of freedom
## AIC: 123838
##
## Number of Fisher Scoring iterations: 9
predict1.1 <- predict.glm(modello1.1, newdata = test_opf, type = "response")
roc1.1 <- roc(test_opf$result ~ predict1.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.1), 4)), lwd = 2, box.lwd = 0, bg = "white")
confusione1.1 <- confusionMatrix(as.factor(ifelse(predict1.1>0.3, 1, 0)), as.factor(test_opf$result))
confusione1.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 63353 6307
## 1 1723 833
##
## Accuracy : 0.8888
## 95% CI : (0.8865, 0.8911)
## No Information Rate : 0.9011
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1263
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9735
## Specificity : 0.1167
## Pos Pred Value : 0.9095
## Neg Pred Value : 0.3259
## Prevalence : 0.9011
## Detection Rate : 0.8773
## Detection Prevalence : 0.9646
## Balanced Accuracy : 0.5451
##
## 'Positive' Class : 0
##
rms::val.prob(as.numeric(as.character(predict1.1)), as.numeric(as.character(test_opf$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 4.937122e-01 7.468561e-01 1.422587e-01 7.001681e-02 5.057334e+03
## D:p U U:Chi-sq U:p Q
## NA 3.125414e-05 4.257049e+00 1.190128e-01 6.998556e-02
## Brier Intercept Slope Emax E90
## 8.249545e-02 -5.805545e-02 9.688800e-01 1.046446e-01 5.777724e-03
## Eavg S:z S:p
## 2.633787e-03 9.329611e-01 3.508401e-01
modello1.2 <- glm(formula = result ~ minute + lastAction + is_weakfoot + X +
exp(Y^2) + h_a + lastAction:X + is_weakfoot:exp(Y^2) + X:exp(Y^2) +
is_weakfoot:X + exp(Y^2):h_a + minute:lastAction + minute:h_a +
minute:exp(Y^2) + lastAction:h_a, family = binomial, data = train_opf)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(modello1.2)
##
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X +
## exp(Y^2) + h_a + lastAction:X + is_weakfoot:exp(Y^2) + X:exp(Y^2) +
## is_weakfoot:X + exp(Y^2):h_a + minute:lastAction + minute:h_a +
## minute:exp(Y^2) + lastAction:h_a, family = binomial, data = train_opf)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.493e+01 1.521e+00 -9.817 < 2e-16 ***
## minute 1.603e-02 4.342e-03 3.692 0.000222 ***
## lastActionBallRecovery 7.784e-01 1.427e+00 0.545 0.585464
## lastActionBallTouch -1.297e+00 1.591e+00 -0.815 0.414990
## lastActionBlockedPass -3.624e+00 3.283e+00 -1.104 0.269698
## lastActionCard 1.157e+01 1.284e+01 0.901 0.367548
## lastActionChallenge -5.879e+02 1.246e+04 -0.047 0.962374
## lastActionChipped 3.317e+00 1.405e+00 2.361 0.018243 *
## lastActionClearance -5.464e+00 1.576e+03 -0.003 0.997233
## lastActionCornerAwarded 1.727e+02 2.529e+03 0.068 0.945571
## lastActionCross -5.733e+00 1.488e+00 -3.854 0.000116 ***
## lastActionDispossessed -1.104e+00 1.880e+00 -0.587 0.556960
## lastActionEnd 5.735e+02 3.299e+03 0.174 0.861998
## lastActionFormationChange 2.008e+02 1.270e+05 0.002 0.998739
## lastActionFoul 7.324e+00 3.945e+00 1.857 0.063370 .
## lastActionGoal 1.089e+01 8.492e+02 0.013 0.989772
## lastActionGoodSkill 1.025e+00 5.205e+00 0.197 0.843953
## lastActionHeadPass -1.060e+00 1.551e+00 -0.683 0.494455
## lastActionInterception -4.864e+00 3.025e+00 -1.608 0.107889
## lastActionLayOff -9.187e-01 1.790e+00 -0.513 0.607762
## lastActionNone -1.188e+00 1.349e+00 -0.881 0.378568
## lastActionOffsidePass 4.325e+00 7.721e+00 0.560 0.575313
## lastActionPass -1.781e+00 1.319e+00 -1.351 0.176837
## lastActionRebound -5.337e+00 1.421e+00 -3.755 0.000173 ***
## lastActionSave -5.817e+00 2.059e+04 0.000 0.999775
## lastActionStart -1.121e+01 6.523e+03 -0.002 0.998629
## lastActionSubstitutionOn -1.680e+01 1.601e+01 -1.050 0.293930
## lastActionTackle -8.704e-01 2.259e+00 -0.385 0.700041
## lastActionTakeOn 4.561e-01 1.361e+00 0.335 0.737519
## lastActionThroughball 1.381e+01 1.473e+00 9.374 < 2e-16 ***
## is_weakfootYes 1.098e-01 2.893e-01 0.379 0.704371
## X 1.398e+01 1.743e+00 8.022 1.04e-15 ***
## exp(Y^2) 2.016e+00 5.946e-01 3.390 0.000698 ***
## h_ah -5.780e-01 2.296e-01 -2.517 0.011825 *
## lastActionBallRecovery:X 4.389e-01 1.648e+00 0.266 0.789939
## lastActionBallTouch:X 2.787e+00 1.820e+00 1.531 0.125701
## lastActionBlockedPass:X 5.876e+00 3.709e+00 1.584 0.113156
## lastActionCard:X -1.265e+01 1.444e+01 -0.876 0.380987
## lastActionChallenge:X 2.126e+02 6.053e+03 0.035 0.971982
## lastActionChipped:X -2.796e+00 1.609e+00 -1.738 0.082216 .
## lastActionClearance:X -1.139e+01 4.333e+00 -2.630 0.008541 **
## lastActionCornerAwarded:X -2.254e+02 3.388e+03 -0.067 0.946963
## lastActionCross:X 7.177e+00 1.687e+00 4.255 2.09e-05 ***
## lastActionDispossessed:X 2.644e+00 2.156e+00 1.226 0.220024
## lastActionEnd:X -8.521e+02 4.825e+03 -0.177 0.859831
## lastActionFormationChange:X -3.728e+02 1.731e+05 -0.002 0.998282
## lastActionFoul:X -8.592e+00 4.424e+00 -1.942 0.052096 .
## lastActionGoal:X -2.884e+01 1.176e+01 -2.452 0.014193 *
## lastActionGoodSkill:X -1.715e+00 5.784e+00 -0.297 0.766777
## lastActionHeadPass:X 2.285e+00 1.773e+00 1.289 0.197357
## lastActionInterception:X 6.750e+00 3.388e+00 1.992 0.046331 *
## lastActionLayOff:X 2.191e+00 2.089e+00 1.049 0.294095
## lastActionNone:X 2.571e+00 1.548e+00 1.661 0.096766 .
## lastActionOffsidePass:X -3.654e+00 7.972e+00 -0.458 0.646686
## lastActionPass:X 3.227e+00 1.516e+00 2.129 0.033218 *
## lastActionRebound:X 7.672e+00 1.619e+00 4.740 2.14e-06 ***
## lastActionSave:X -1.085e+01 2.480e+04 0.000 0.999651
## lastActionStart:X -5.294e-01 1.248e+01 -0.042 0.966174
## lastActionSubstitutionOn:X 1.010e+01 1.490e+01 0.678 0.497963
## lastActionTackle:X 2.135e+00 2.569e+00 0.831 0.405788
## lastActionTakeOn:X 8.590e-01 1.562e+00 0.550 0.582282
## lastActionThroughball:X -1.324e+01 1.684e+00 -7.860 3.83e-15 ***
## is_weakfootYes:exp(Y^2) -8.210e-01 9.906e-02 -8.288 < 2e-16 ***
## X:exp(Y^2) -2.709e+00 6.750e-01 -4.013 5.99e-05 ***
## is_weakfootYes:X 9.331e-01 3.061e-01 3.048 0.002306 **
## exp(Y^2):h_ah 2.060e-01 8.160e-02 2.525 0.011577 *
## minute:lastActionBallRecovery -8.250e-03 4.226e-03 -1.952 0.050886 .
## minute:lastActionBallTouch -1.175e-02 4.456e-03 -2.636 0.008381 **
## minute:lastActionBlockedPass -1.606e-02 8.260e-03 -1.944 0.051918 .
## minute:lastActionCard 2.070e-03 3.606e-02 0.057 0.954232
## minute:lastActionChallenge 5.181e+00 1.032e+02 0.050 0.959962
## minute:lastActionChipped -8.817e-03 4.026e-03 -2.190 0.028521 *
## minute:lastActionClearance -1.062e-02 2.520e-02 -0.421 0.673493
## minute:lastActionCornerAwarded -9.022e-01 1.472e+01 -0.061 0.951122
## minute:lastActionCross -9.620e-03 3.972e-03 -2.422 0.015441 *
## minute:lastActionDispossessed -1.557e-02 5.282e-03 -2.947 0.003210 **
## minute:lastActionEnd 1.572e+00 1.062e+01 0.148 0.882327
## minute:lastActionFormationChange 1.324e+00 1.670e+02 0.008 0.993674
## minute:lastActionFoul -1.878e-02 1.103e-02 -1.703 0.088586 .
## minute:lastActionGoal -3.517e-02 3.015e-02 -1.167 0.243331
## minute:lastActionGoodSkill 3.062e-02 2.189e-02 1.399 0.161903
## minute:lastActionHeadPass -1.166e-02 4.309e-03 -2.705 0.006827 **
## minute:lastActionInterception 2.194e-03 9.279e-03 0.236 0.813085
## minute:lastActionLayOff -5.453e-03 4.961e-03 -1.099 0.271714
## minute:lastActionNone -1.066e-02 3.946e-03 -2.702 0.006897 **
## minute:lastActionOffsidePass -4.063e-02 2.443e-02 -1.664 0.096210 .
## minute:lastActionPass -9.567e-03 3.871e-03 -2.471 0.013461 *
## minute:lastActionRebound -1.211e-02 3.993e-03 -3.034 0.002417 **
## minute:lastActionSave -1.389e-02 7.405e+01 0.000 0.999850
## minute:lastActionStart -3.936e-01 1.449e+02 -0.003 0.997833
## minute:lastActionSubstitutionOn 1.278e-01 7.682e-02 1.663 0.096248 .
## minute:lastActionTackle -7.866e-03 6.632e-03 -1.186 0.235583
## minute:lastActionTakeOn -1.117e-02 3.976e-03 -2.811 0.004943 **
## minute:lastActionThroughball -1.113e-02 3.997e-03 -2.786 0.005339 **
## minute:h_ah -9.621e-04 5.706e-04 -1.686 0.091747 .
## minute:exp(Y^2) -3.052e-03 1.528e-03 -1.998 0.045769 *
## lastActionBallRecovery:h_ah 3.278e-01 2.222e-01 1.475 0.140144
## lastActionBallTouch:h_ah 3.752e-02 2.339e-01 0.160 0.872590
## lastActionBlockedPass:h_ah -4.142e-02 4.531e-01 -0.091 0.927167
## lastActionCard:h_ah -1.666e+01 2.035e+03 -0.008 0.993468
## lastActionChallenge:h_ah 1.521e+01 1.010e+03 0.015 0.987981
## lastActionChipped:h_ah 2.693e-01 2.102e-01 1.281 0.200194
## lastActionClearance:h_ah 1.664e+01 1.576e+03 0.011 0.991573
## lastActionCornerAwarded:h_ah 1.093e+01 6.754e+02 0.016 0.987086
## lastActionCross:h_ah 4.412e-01 2.075e-01 2.126 0.033506 *
## lastActionDispossessed:h_ah 3.282e-01 2.773e-01 1.183 0.236643
## lastActionEnd:h_ah -2.914e+02 1.725e+03 -0.169 0.865815
## lastActionFormationChange:h_ah 6.069e+01 2.276e+04 0.003 0.997872
## lastActionFoul:h_ah 2.016e+00 8.014e-01 2.515 0.011904 *
## lastActionGoal:h_ah 1.591e+01 8.491e+02 0.019 0.985052
## lastActionGoodSkill:h_ah 3.968e-01 9.520e-01 0.417 0.676807
## lastActionHeadPass:h_ah 3.384e-01 2.269e-01 1.491 0.135855
## lastActionInterception:h_ah -4.266e-01 4.976e-01 -0.857 0.391232
## lastActionLayOff:h_ah 9.304e-02 2.614e-01 0.356 0.721864
## lastActionNone:h_ah 2.946e-01 2.060e-01 1.430 0.152650
## lastActionOffsidePass:h_ah 1.294e+00 1.343e+00 0.963 0.335473
## lastActionPass:h_ah 4.271e-01 2.017e-01 2.118 0.034190 *
## lastActionRebound:h_ah 3.789e-01 2.085e-01 1.817 0.069197 .
## lastActionSave:h_ah 4.704e-01 4.439e+03 0.000 0.999915
## lastActionStart:h_ah 1.464e+01 6.523e+03 0.002 0.998210
## lastActionSubstitutionOn:h_ah -1.440e+00 1.878e+00 -0.767 0.443286
## lastActionTackle:h_ah 5.257e-01 3.483e-01 1.509 0.131209
## lastActionTakeOn:h_ah 3.459e-01 2.073e-01 1.668 0.095298 .
## lastActionThroughball:h_ah 4.527e-01 2.087e-01 2.169 0.030077 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 139769 on 216646 degrees of freedom
## Residual deviance: 122830 on 216523 degrees of freedom
## AIC: 123078
##
## Number of Fisher Scoring iterations: 17
predict1.2 <- predict.glm(modello1.2, newdata = test_opf, type = "response")
confusione1.2 <- confusionMatrix(as.factor(ifelse(predict1.2>0.3, 1, 0)), as.factor(test_opf$result))
confusione1.2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 63054 6204
## 1 2022 936
##
## Accuracy : 0.8861
## 95% CI : (0.8838, 0.8884)
## No Information Rate : 0.9011
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1353
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9689
## Specificity : 0.1311
## Pos Pred Value : 0.9104
## Neg Pred Value : 0.3164
## Prevalence : 0.9011
## Detection Rate : 0.8731
## Detection Prevalence : 0.9590
## Balanced Accuracy : 0.5500
##
## 'Positive' Class : 0
##
rms::val.prob(as.numeric(as.character(predict1.2)), as.numeric(as.character(test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict1.2)),
## as.numeric(as.character(test_opf$result))): 2 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 4.992168e-01 7.496084e-01 1.392605e-01 6.849007e-02 4.946942e+03
## D:p U U:Chi-sq U:p Q
## NA 5.514068e-04 4.181929e+01 8.299584e-10 6.793867e-02
## Brier Intercept Slope Emax E90
## 8.220725e-02 -1.447628e-01 9.225944e-01 2.886326e-01 3.951237e-03
## Eavg S:z S:p
## 2.745378e-03 1.345419e+00 1.784899e-01
roc1.2 <- roc(test_opf$result ~ predict1.2, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.2), 4)), lwd = 2, box.lwd = 0, bg = "white")
set.seed(1234)
new_train_opf <- train_opf
new_train_opf$Y <- exp(new_train_opf$Y^2)
new_test_opf <- test_opf
new_test_opf$Y <- exp(new_test_opf$Y^2)
set.seed(1234)
modello1.3 <- lda(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, data = train_opf)
summary(modello1.3)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 66 -none- numeric
## scaling 33 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 3 -none- list
predict1.3.1 <- predict(modello1.3, newdata = test_opf)
predict1.3 <- predict1.3.1$posterior[, 2]
confusione1.3.1 <- confusionMatrix(as.factor(predict1.3.1$class), as.factor(test_opf$result))
confusione1.3.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 64747 6980
## 1 329 160
##
## Accuracy : 0.8988
## 95% CI : (0.8966, 0.901)
## No Information Rate : 0.9011
## P-Value [Acc > NIR] : 0.9825
##
## Kappa : 0.0296
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99494
## Specificity : 0.02241
## Pos Pred Value : 0.90269
## Neg Pred Value : 0.32720
## Prevalence : 0.90113
## Detection Rate : 0.89657
## Detection Prevalence : 0.99323
## Balanced Accuracy : 0.50868
##
## 'Positive' Class : 0
##
confusione1.3 <- confusionMatrix(as.factor(ifelse(predict1.3>0.3, 1, 0)), as.factor(test_opf$result))
roc1.3 <- roc(test_opf$result ~ predict1.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.3), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict1.3)), as.numeric(as.character(test_opf$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 4.895863e-01 7.447931e-01 1.329346e-01 6.527350e-02 4.714791e+03
## D:p U U:Chi-sq U:p Q
## NA 3.116984e-04 2.450961e+01 4.762170e-06 6.496181e-02
## Brier Intercept Slope Emax E90
## 8.333969e-02 -1.391553e-01 9.295095e-01 2.800603e-01 2.064602e-02
## Eavg S:z S:p
## 1.156119e-02 3.082489e+00 2.052772e-03
set.seed(1234)
modello1.4 <- ranger(train_opf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a,
data = new_train_opf, num.trees = 500, mtry = 3, seed = 42, probability = T)
train_predict_1.4.1 <- predict(modello1.4, data = new_train_opf)
train_predict1.4 <- train_predict_1.4.1$predictions[, 2]
predict1.4.1 <- predict(modello1.4, data = new_test_opf)
predict1.4 <- predict1.4.1$predictions[, 2]
confusione1.4 <- confusionMatrix(as.factor(ifelse(predict1.4 >0.3, 1, 0)), as.factor(test_opf$result))
confusione1.4
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 61892 3957
## 1 3184 3183
##
## Accuracy : 0.9011
## 95% CI : (0.8989, 0.9033)
## No Information Rate : 0.9011
## P-Value [Acc > NIR] : 0.5081
##
## Kappa : 0.417
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9511
## Specificity : 0.4458
## Pos Pred Value : 0.9399
## Neg Pred Value : 0.4999
## Prevalence : 0.9011
## Detection Rate : 0.8570
## Detection Prevalence : 0.9118
## Balanced Accuracy : 0.6984
##
## 'Positive' Class : 0
##
roc1.4 <- roc(test_opf$result ~ as.numeric(predict1.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.4), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict1.4)), as.numeric(as.character(new_test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict1.4)),
## as.numeric(as.character(new_test_opf$result))): 929 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 6.706364e-01 8.353182e-01 3.232237e-01 1.679441e-01 1.197323e+04
## D:p U U:Chi-sq U:p Q
## NA Inf Inf 0.000000e+00 -Inf
## Brier Intercept Slope Emax E90
## 6.651975e-02 -2.583708e-01 8.922615e-01 4.151460e-02 3.601427e-02
## Eavg S:z S:p
## 1.228770e-02 -3.416589e+00 6.341090e-04
set.seed(1234)
modello1.5 <- ranger(train_opf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a,
data = new_train_opf, num.trees = 500, seed = 42, probability = T)
predict1.5.1 <- predict(modello1.5, data = new_test_opf)
predict1.5 <- predict1.5.1$predictions[, 2]
confusione1.5 <- confusionMatrix(as.factor(ifelse(predict1.5>0.3, 1, 0)), as.factor(new_test_opf$result))
confusione1.5
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 62474 4157
## 1 2602 2983
##
## Accuracy : 0.9064
## 95% CI : (0.9043, 0.9085)
## No Information Rate : 0.9011
## P-Value [Acc > NIR] : 8.749e-07
##
## Kappa : 0.4184
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9600
## Specificity : 0.4178
## Pos Pred Value : 0.9376
## Neg Pred Value : 0.5341
## Prevalence : 0.9011
## Detection Rate : 0.8651
## Detection Prevalence : 0.9227
## Balanced Accuracy : 0.6889
##
## 'Positive' Class : 0
##
roc1.5 <- roc(new_test_opf$result ~ as.numeric(predict1.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.5), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict1.5)), as.numeric(as.character(new_test_opf$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 6.784610e-01 8.392305e-01 3.261361e-01 1.684718e-01 1.216736e+04
## D:p U U:Chi-sq U:p Q
## NA 9.957298e-04 7.390762e+01 1.110223e-16 1.674760e-01
## Brier Intercept Slope Emax E90
## 6.710256e-02 1.193215e-01 1.096641e+00 1.542791e-01 1.849012e-02
## Eavg S:z S:p
## 1.020388e-02 -8.226908e+00 1.921075e-16
modello1.8 <- nnet(result ~ minute + lastAction + is_weakfoot + X + Y + h_a, data = new_train_opf, size = 10, maxit = 1000, linout = FALSE)
## # weights: 351
## initial value 118545.093917
## iter 10 value 69981.009756
## iter 20 value 69192.592115
## iter 30 value 67888.711525
## iter 40 value 63628.809594
## iter 50 value 62525.075031
## iter 60 value 62057.467967
## iter 70 value 61604.685356
## iter 80 value 61532.876008
## iter 90 value 61487.160400
## iter 100 value 61233.657946
## iter 110 value 60338.128851
## iter 120 value 59616.426996
## iter 130 value 58733.031793
## iter 140 value 58349.312834
## iter 150 value 58148.958166
## iter 160 value 57935.848822
## iter 170 value 57756.338123
## iter 180 value 57639.865936
## iter 190 value 57573.532413
## iter 200 value 57512.936093
## iter 210 value 57426.532572
## iter 220 value 57327.745162
## iter 230 value 57217.958189
## iter 240 value 57172.723167
## iter 250 value 57148.220125
## iter 260 value 57133.696190
## iter 270 value 57116.527271
## iter 280 value 57102.667221
## iter 290 value 57084.939379
## iter 300 value 57056.657224
## iter 310 value 57048.010413
## iter 320 value 57038.515424
## iter 330 value 57021.976784
## iter 340 value 57009.160524
## iter 350 value 57001.456556
## iter 360 value 56993.776476
## iter 370 value 56988.466171
## iter 380 value 56985.363909
## iter 390 value 56984.264328
## iter 400 value 56983.744595
## iter 410 value 56983.261082
## iter 420 value 56982.689945
## iter 430 value 56981.621792
## iter 440 value 56979.853764
## iter 450 value 56977.893636
## iter 460 value 56975.779385
## iter 470 value 56973.815073
## iter 480 value 56972.577633
## iter 490 value 56971.942203
## iter 500 value 56971.328528
## iter 510 value 56969.780842
## iter 520 value 56969.663506
## iter 530 value 56969.309645
## iter 540 value 56968.354652
## iter 550 value 56967.126957
## iter 560 value 56964.815551
## iter 570 value 56961.936920
## iter 580 value 56957.827947
## iter 590 value 56955.201372
## iter 600 value 56954.353889
## iter 610 value 56953.875346
## iter 620 value 56953.530963
## iter 630 value 56953.236351
## iter 640 value 56953.022966
## iter 650 value 56952.857124
## iter 660 value 56952.730886
## iter 670 value 56952.602141
## iter 680 value 56952.471966
## iter 690 value 56952.311041
## iter 700 value 56952.158581
## iter 710 value 56951.968581
## iter 720 value 56951.865165
## iter 730 value 56951.791552
## iter 740 value 56951.713886
## iter 750 value 56951.575373
## iter 760 value 56951.479369
## iter 770 value 56951.335809
## iter 780 value 56951.186863
## iter 790 value 56951.084841
## iter 800 value 56950.981560
## iter 810 value 56950.861977
## iter 820 value 56950.727533
## iter 830 value 56950.537039
## iter 840 value 56950.345343
## iter 850 value 56950.119654
## iter 860 value 56949.843306
## iter 870 value 56949.747925
## iter 880 value 56949.637210
## iter 890 value 56949.555767
## iter 900 value 56949.517758
## iter 910 value 56949.381139
## iter 920 value 56949.296510
## iter 930 value 56949.210179
## iter 940 value 56949.124307
## iter 950 value 56948.983340
## iter 960 value 56948.841103
## iter 970 value 56948.594921
## iter 980 value 56948.099597
## iter 990 value 56947.901399
## iter1000 value 56947.590591
## final value 56947.590591
## stopped after 1000 iterations
predict1.8 <- predict(modello1.8, newdata = new_test_opf)
confusione1.8 <- confusionMatrix(as.factor(ifelse(predict1.8>0.3, 1, 0)), new_test_opf$result)
confusione1.8
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 61956 4927
## 1 3120 2213
##
## Accuracy : 0.8886
## 95% CI : (0.8863, 0.8909)
## No Information Rate : 0.9011
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2953
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9521
## Specificity : 0.3099
## Pos Pred Value : 0.9263
## Neg Pred Value : 0.4150
## Prevalence : 0.9011
## Detection Rate : 0.8579
## Detection Prevalence : 0.9262
## Balanced Accuracy : 0.6310
##
## 'Positive' Class : 0
##
roc1.8 <- roc(new_test_opf$result ~ as.numeric(predict1.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.8), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict1.8)), as.numeric(as.character(new_test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict1.8)),
## as.numeric(as.character(new_test_opf$result))): 315 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 5.834815e-01 7.917408e-01 2.184253e-01 1.097806e-01 7.894334e+03
## D:p U U:Chi-sq U:p Q
## NA Inf Inf 0.000000e+00 -Inf
## Brier Intercept Slope Emax E90
## 7.652105e-02 -7.947524e-02 9.565840e-01 4.063039e-02 7.167874e-03
## Eavg S:z S:p
## 3.731212e-03 1.660904e+00 9.673279e-02
confusione1.us <- confusionMatrix(as.factor(ifelse(test_opf$`xG Understat`>0.3, 1, 0)), test_opf$result)
confusione1.us
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 60545 3589
## 1 4531 3551
##
## Accuracy : 0.8876
## 95% CI : (0.8852, 0.8899)
## No Information Rate : 0.9011
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.404
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9304
## Specificity : 0.4973
## Pos Pred Value : 0.9440
## Neg Pred Value : 0.4394
## Prevalence : 0.9011
## Detection Rate : 0.8384
## Detection Prevalence : 0.8881
## Balanced Accuracy : 0.7139
##
## 'Positive' Class : 0
##
roc1.us <- roc(test_opf$result ~ as.numeric(test_opf$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc1.us), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(test_opf$`xG Understat`)), as.numeric(as.character(test_opf$result)))
## Warning in rms::val.prob(as.numeric(as.character(test_opf$`xG Understat`)), : 6
## observations deleted from logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 6.294039e-01 8.147019e-01 2.776084e-01 1.415280e-01 1.022074e+04
## D:p U U:Chi-sq U:p Q
## NA Inf Inf 0.000000e+00 -Inf
## Brier Intercept Slope Emax E90
## 7.128048e-02 -1.240680e-01 9.296240e-01 8.520251e-02 8.860956e-03
## Eavg S:z S:p
## 5.630629e-03 3.822074e+00 1.323339e-04
set.seed(123)
index_oph <- sample.split(Y = openplay_head$result, SplitRatio = 0.75)
train_oph <- openplay_head[index_oph, ]
indice_test_oph <- which(index_oph == FALSE)
test_oph <- openplay_head[indice_test_oph, ]
modello2.1 <- glm(result ~ minute + lastAction + X + exp(Y^2) + h_a,
data = train_oph, family = binomial)
summary2.1 <- summary(modello2.1)
summary2.1
##
## Call:
## glm(formula = result ~ minute + lastAction + X + exp(Y^2) + h_a,
## family = binomial, data = train_oph)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.680e+01 7.831e-01 -34.219 < 2e-16 ***
## minute -9.752e-04 7.921e-04 -1.231 0.21826
## lastActionBallTouch -2.660e-01 6.064e-01 -0.439 0.66098
## lastActionChallenge -1.277e+01 6.023e+02 -0.021 0.98309
## lastActionChipped 9.195e-01 8.387e-02 10.964 < 2e-16 ***
## lastActionCornerAwarded -1.305e+01 2.282e+02 -0.057 0.95440
## lastActionCross 7.696e-01 5.820e-02 13.222 < 2e-16 ***
## lastActionEnd -1.284e+01 2.877e+02 -0.045 0.96441
## lastActionFoul -1.302e+01 1.854e+02 -0.070 0.94400
## lastActionGoal -1.338e+01 3.162e+02 -0.042 0.96626
## lastActionHeadPass 9.408e-01 1.567e-01 6.002 1.95e-09 ***
## lastActionNone 5.923e-01 1.292e-01 4.585 4.54e-06 ***
## lastActionPass 7.710e-01 1.802e-01 4.279 1.88e-05 ***
## lastActionRebound 1.274e+00 1.125e-01 11.322 < 2e-16 ***
## lastActionSubstitutionOff 1.729e+01 1.455e+03 0.012 0.99052
## lastActionSubstitutionOn -4.954e-01 1.063e+00 -0.466 0.64106
## lastActionTackle -1.282e+01 4.519e+02 -0.028 0.97737
## lastActionTakeOn 1.697e+00 6.706e-01 2.530 0.01141 *
## lastActionThroughball 2.603e+00 2.779e-01 9.368 < 2e-16 ***
## X 2.731e+01 7.882e-01 34.656 < 2e-16 ***
## exp(Y^2) -7.093e-01 2.478e-01 -2.862 0.00421 **
## h_ah 5.532e-02 4.326e-02 1.279 0.20097
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16885 on 23101 degrees of freedom
## Residual deviance: 15006 on 23080 degrees of freedom
## AIC: 15050
##
## Number of Fisher Scoring iterations: 14
predict2.1 <- predict.glm(modello2.1, newdata = test_oph, type = "response")
roc2.1 <- roc(test_oph$result ~ predict2.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.1), 4)), lwd = 2, box.lwd = 0, bg = "white")
confusione2.1 <- confusionMatrix(as.factor(ifelse(predict2.1>0.3, 1, 0)), as.factor(test_oph$result))
confusione2.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6430 743
## 1 352 175
##
## Accuracy : 0.8578
## 95% CI : (0.8498, 0.8655)
## No Information Rate : 0.8808
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.17
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9481
## Specificity : 0.1906
## Pos Pred Value : 0.8964
## Neg Pred Value : 0.3321
## Prevalence : 0.8808
## Detection Rate : 0.8351
## Detection Prevalence : 0.9316
## Balanced Accuracy : 0.5694
##
## 'Positive' Class : 0
##
rms::val.prob(as.numeric(as.character(predict2.1)), as.numeric(as.character(test_oph$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 4.741098e-01 7.370549e-01 1.357465e-01 7.284670e-02 5.619196e+02
## D:p U U:Chi-sq U:p Q
## NA 2.890682e-05 2.222582e+00 3.291337e-01 7.281779e-02
## Brier Intercept Slope Emax E90
## 9.674780e-02 -1.001238e-01 9.372446e-01 1.817344e-01 1.510059e-02
## Eavg S:z S:p
## 6.637031e-03 8.586321e-01 3.905435e-01
set.seed(1234)
new_train_oph <- train_oph
new_train_oph$Y <- exp(new_train_oph$Y^2)
new_test_oph <- test_oph
new_test_oph$Y <- exp(new_test_oph$Y^2)
set.seed(1234)
modello2.3 <- lda(result ~ minute + lastAction + X + exp(Y^2) + h_a, data = train_oph)
summary(modello2.3)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 42 -none- numeric
## scaling 21 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 2 -none- list
predict2.3.1 <- predict(modello2.3, newdata = test_oph)
predict2.3 <- predict2.3.1$posterior[, 2]
confusione2.3.1 <- confusionMatrix(as.factor(predict2.3.1$class), as.factor(test_oph$result))
confusione2.3.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6735 885
## 1 47 33
##
## Accuracy : 0.879
## 95% CI : (0.8715, 0.8862)
## No Information Rate : 0.8808
## P-Value [Acc > NIR] : 0.6961
##
## Kappa : 0.0479
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99307
## Specificity : 0.03595
## Pos Pred Value : 0.88386
## Neg Pred Value : 0.41250
## Prevalence : 0.88078
## Detection Rate : 0.87468
## Detection Prevalence : 0.98961
## Balanced Accuracy : 0.51451
##
## 'Positive' Class : 0
##
confusione2.3 <- confusionMatrix(as.factor(ifelse(predict2.3>0.3, 1, 0)), as.factor(test_oph$result))
roc2.3 <- roc(test_oph$result ~ predict2.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.3), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict2.3)), as.numeric(as.character(test_oph$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 4.699917e-01 7.349958e-01 1.318688e-01 7.068646e-02 5.452857e+02
## D:p U U:Chi-sq U:p Q
## NA 4.950705e-04 5.812043e+00 5.469290e-02 7.019139e-02
## Brier Intercept Slope Emax E90
## 9.774986e-02 -1.619795e-01 9.020178e-01 3.755251e-01 2.307149e-02
## Eavg S:z S:p
## 1.214045e-02 1.746477e+00 8.072811e-02
set.seed(1234)
modello2.4 <- ranger(train_oph$result ~ minute + lastAction + X + Y + h_a,
data = new_train_oph, num.trees = 500, mtry = 3, seed = 42, probability = T)
predict2.4.1 <- predict(modello2.4, data = new_test_oph)
predict2.4 <- predict2.4.1$predictions[, 2]
confusione2.4 <- confusionMatrix(as.factor(ifelse(predict2.4 >0.3, 1, 0)), as.factor(test_oph$result))
confusione2.4
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6272 522
## 1 510 396
##
## Accuracy : 0.866
## 95% CI : (0.8582, 0.8735)
## No Information Rate : 0.8808
## P-Value [Acc > NIR] : 1.000
##
## Kappa : 0.3582
##
## Mcnemar's Test P-Value : 0.732
##
## Sensitivity : 0.9248
## Specificity : 0.4314
## Pos Pred Value : 0.9232
## Neg Pred Value : 0.4371
## Prevalence : 0.8808
## Detection Rate : 0.8145
## Detection Prevalence : 0.8823
## Balanced Accuracy : 0.6781
##
## 'Positive' Class : 0
##
roc2.4 <- roc(test_oph$result ~ as.numeric(predict2.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.4), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict2.4)), as.numeric(as.character(new_test_oph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict2.4)),
## as.numeric(as.character(new_test_oph$result))): 245 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 5.957707e-01 7.978854e-01 2.638290e-01 1.492232e-01 1.113459e+03
## D:p U U:Chi-sq U:p Q
## NA 7.485408e-03 5.780371e+01 2.805534e-13 1.417378e-01
## Brier Intercept Slope Emax E90
## 8.376312e-02 -4.099566e-01 7.751907e-01 9.897154e-02 5.407294e-02
## Eavg S:z S:p
## 2.153259e-02 1.694824e+00 9.010883e-02
set.seed(1234)
modello2.5 <- ranger(train_oph$result ~ minute + lastAction + X + Y + h_a,
data = new_train_oph, num.trees = 500, seed = 42, probability = T)
predict2.5.1 <- predict(modello2.5, data = new_test_oph)
predict2.5 <- predict2.5.1$predictions[, 2]
confusione2.5 <- confusionMatrix(as.factor(ifelse(predict2.5>0.3, 1, 0)), as.factor(new_test_oph$result))
confusione2.5
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6332 530
## 1 450 388
##
## Accuracy : 0.8727
## 95% CI : (0.8651, 0.8801)
## No Information Rate : 0.8808
## P-Value [Acc > NIR] : 0.98542
##
## Kappa : 0.3703
##
## Mcnemar's Test P-Value : 0.01162
##
## Sensitivity : 0.9336
## Specificity : 0.4227
## Pos Pred Value : 0.9228
## Neg Pred Value : 0.4630
## Prevalence : 0.8808
## Detection Rate : 0.8223
## Detection Prevalence : 0.8912
## Balanced Accuracy : 0.6782
##
## 'Positive' Class : 0
##
roc2.5 <- roc(new_test_oph$result ~ as.numeric(predict2.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.5), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict2.5)), as.numeric(as.character(new_test_oph$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 6.244548e-01 8.122274e-01 2.908950e-01 1.633468e-01 1.258770e+03
## D:p U U:Chi-sq U:p Q
## NA 1.468163e-03 1.330485e+01 1.290887e-03 1.618786e-01
## Brier Intercept Slope Emax E90
## 8.252587e-02 -2.167724e-01 8.903037e-01 6.141308e-02 3.284286e-02
## Eavg S:z S:p
## 1.285068e-02 3.637999e-01 7.160074e-01
modello2.8 <- nnet(result ~ minute + lastAction + X + Y + h_a, data = new_train_oph, size = 10, maxit = 1000, linout = FALSE)
## # weights: 231
## initial value 8722.578293
## iter 10 value 8435.748320
## iter 20 value 8279.696679
## iter 30 value 8089.025131
## iter 40 value 8065.396259
## iter 50 value 7820.389397
## iter 60 value 7620.533019
## iter 70 value 7527.182160
## iter 80 value 7347.012489
## iter 90 value 7306.470349
## iter 100 value 7270.602773
## iter 110 value 7209.485135
## iter 120 value 7149.740344
## iter 130 value 7108.479248
## iter 140 value 7089.482769
## iter 150 value 7081.089074
## iter 160 value 7075.318128
## iter 170 value 7072.006341
## iter 180 value 7065.815611
## iter 190 value 7061.494404
## iter 200 value 7059.751906
## iter 210 value 7057.881921
## iter 220 value 7056.740067
## iter 230 value 7055.132476
## iter 240 value 7052.237977
## iter 250 value 7051.064152
## iter 260 value 7050.361597
## iter 270 value 7049.824061
## iter 280 value 7049.270668
## iter 290 value 7048.685893
## iter 300 value 7047.999021
## iter 310 value 7047.421214
## iter 320 value 7047.147571
## iter 330 value 7046.797723
## iter 340 value 7046.235374
## iter 350 value 7045.552039
## iter 360 value 7043.659123
## iter 370 value 7043.059688
## iter 380 value 7042.916778
## iter 390 value 7042.807888
## iter 400 value 7042.674153
## iter 410 value 7042.430997
## iter 420 value 7042.132006
## iter 430 value 7041.922490
## iter 440 value 7041.629317
## iter 450 value 7041.465949
## iter 460 value 7041.325280
## iter 470 value 7041.021732
## iter 480 value 7040.845830
## iter 490 value 7040.576073
## iter 500 value 7040.369471
## iter 510 value 7040.308663
## iter 520 value 7040.267512
## iter 530 value 7040.203691
## iter 540 value 7040.109465
## iter 550 value 7040.064034
## iter 560 value 7039.967961
## iter 570 value 7039.905303
## iter 580 value 7039.802194
## iter 590 value 7039.726709
## iter 600 value 7039.678707
## iter 610 value 7039.625371
## iter 620 value 7039.569185
## iter 630 value 7039.529814
## iter 640 value 7039.473146
## iter 650 value 7039.436103
## iter 660 value 7039.409985
## final value 7039.386145
## converged
predict2.8 <- predict(modello2.8, newdata = new_test_oph)
confusione2.8 <- confusionMatrix(as.factor(ifelse(predict2.8>0.3, 1, 0)), new_test_oph$result)
confusione2.8
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6407 655
## 1 375 263
##
## Accuracy : 0.8662
## 95% CI : (0.8584, 0.8738)
## No Information Rate : 0.8808
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2663
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9447
## Specificity : 0.2865
## Pos Pred Value : 0.9073
## Neg Pred Value : 0.4122
## Prevalence : 0.8808
## Detection Rate : 0.8321
## Detection Prevalence : 0.9171
## Balanced Accuracy : 0.6156
##
## 'Positive' Class : 0
##
roc2.8 <- roc(new_test_oph$result ~ as.numeric(predict2.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.8), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict2.8)), as.numeric(as.character(new_test_oph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict2.8)),
## as.numeric(as.character(new_test_oph$result))): 64 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 0.536170235 0.768085117 0.187960276 0.102744421 785.556401386
## D:p U U:Chi-sq U:p Q
## NA Inf Inf 0.000000000 -Inf
## Brier Intercept Slope Emax E90
## 0.092247046 -0.148281506 0.904902415 0.153865186 0.008181917
## Eavg S:z S:p
## 0.006748645 1.625219257 0.104115849
confusione2.us <- confusionMatrix(as.factor(ifelse(test_oph$`xG Understat`>0.3, 1, 0)), test_oph$result)
confusione2.us
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6021 391
## 1 761 527
##
## Accuracy : 0.8504
## 95% CI : (0.8422, 0.8583)
## No Information Rate : 0.8808
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3933
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8878
## Specificity : 0.5741
## Pos Pred Value : 0.9390
## Neg Pred Value : 0.4092
## Prevalence : 0.8808
## Detection Rate : 0.7819
## Detection Prevalence : 0.8327
## Balanced Accuracy : 0.7309
##
## 'Positive' Class : 0
##
roc2.us <- roc(test_oph$result ~ as.numeric(test_oph$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc2.us), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(test_oph$`xG Understat`)), as.numeric(as.character(test_oph$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 6.478921e-01 8.239461e-01 2.930995e-01 1.646936e-01 1.269140e+03
## D:p U U:Chi-sq U:p Q
## NA 2.090539e-03 1.809715e+01 1.175584e-04 1.626030e-01
## Brier Intercept Slope Emax E90
## 8.339068e-02 -1.780625e-01 9.907059e-01 4.643716e-02 3.833526e-02
## Eavg S:z S:p
## 1.470120e-02 -3.084414e+00 2.039538e-03
set.seed(123)
index_spf <- sample.split(Y = setpiece_foot$result, SplitRatio = 0.75)
train_spf <- setpiece_foot[index_spf, ]
indice_test_spf <- which(index_spf == FALSE)
test_spf <- setpiece_foot[indice_test_spf, ]
modello3.1 <- glm(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a + situation,
data = train_spf, family = binomial)
summary(modello3.1)
##
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X +
## exp(Y^2) + h_a + situation, family = binomial, data = train_spf)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.421e+01 2.918e-01 -48.687 < 2e-16 ***
## minute 3.158e-04 6.464e-04 0.489 0.62515
## lastActionBallRecovery 2.785e-01 7.601e-01 0.366 0.71411
## lastActionBallTouch -4.691e-03 1.859e-01 -0.025 0.97986
## lastActionBlockedPass -1.258e+01 3.202e+02 -0.039 0.96867
## lastActionCard -1.336e+01 4.690e+02 -0.028 0.97728
## lastActionChipped 1.790e-01 1.591e-01 1.125 0.26060
## lastActionCornerAwarded -2.713e-01 3.710e-01 -0.731 0.46466
## lastActionCross 1.187e-01 1.261e-01 0.941 0.34681
## lastActionDispossessed -5.646e-01 7.315e-01 -0.772 0.44014
## lastActionEnd -1.317e+01 1.526e+02 -0.086 0.93122
## lastActionFoul 2.640e-01 3.382e-01 0.781 0.43504
## lastActionGoal -1.334e+01 2.110e+02 -0.063 0.94958
## lastActionGoodSkill -1.355e+01 8.259e+02 -0.016 0.98691
## lastActionHeadPass 8.723e-01 1.301e-01 6.705 2.01e-11 ***
## lastActionInterception -4.658e-01 1.049e+00 -0.444 0.65714
## lastActionLayOff 9.714e-01 3.336e-01 2.912 0.00360 **
## lastActionNone 5.073e-01 1.203e-01 4.216 2.48e-05 ***
## lastActionPass 6.736e-01 1.242e-01 5.424 5.83e-08 ***
## lastActionRebound 1.219e+00 1.216e-01 10.030 < 2e-16 ***
## lastActionStandard 1.470e+00 1.248e-01 11.781 < 2e-16 ***
## lastActionSubstitutionOn 2.382e+00 1.061e+00 2.246 0.02473 *
## lastActionTackle -2.138e-01 7.419e-01 -0.288 0.77326
## lastActionTakeOn 2.452e-02 2.147e-01 0.114 0.90908
## lastActionThroughball 1.445e+00 2.679e-01 5.393 6.95e-08 ***
## is_weakfootYes -1.214e-01 4.351e-02 -2.791 0.00526 **
## X 1.400e+01 2.924e-01 47.871 < 2e-16 ***
## exp(Y^2) -6.425e-01 8.422e-02 -7.629 2.36e-14 ***
## h_ah -8.534e-02 3.469e-02 -2.460 0.01389 *
## situationFromCorner -9.978e-02 4.192e-02 -2.380 0.01730 *
## situationSetPiece NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28504 on 52076 degrees of freedom
## Residual deviance: 24469 on 52047 degrees of freedom
## AIC: 24529
##
## Number of Fisher Scoring iterations: 14
predict3.1 <- predict.glm(modello3.1, newdata = test_spf, type = "response")
roc3.1 <- roc(test_spf$result ~ predict3.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.1), 4)), lwd = 2, box.lwd = 0, bg = "white")
confusione3.1 <- confusionMatrix(as.factor(ifelse(predict3.1>0.3, 1, 0)), as.factor(test_spf$result))
confusione3.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15617 1107
## 1 389 245
##
## Accuracy : 0.9138
## 95% CI : (0.9095, 0.9179)
## No Information Rate : 0.9221
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2073
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9757
## Specificity : 0.1812
## Pos Pred Value : 0.9338
## Neg Pred Value : 0.3864
## Prevalence : 0.9221
## Detection Rate : 0.8997
## Detection Prevalence : 0.9635
## Balanced Accuracy : 0.5785
##
## 'Positive' Class : 0
##
rms::val.prob(as.numeric(as.character(predict3.1)), as.numeric(as.character(test_spf$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 5.080409e-01 7.540205e-01 1.505219e-01 6.547547e-02 1.137523e+03
## D:p U U:Chi-sq U:p Q
## NA 3.787373e-04 8.574122e+00 1.374527e-02 6.509673e-02
## Brier Intercept Slope Emax E90
## 6.520744e-02 -1.766042e-01 9.150318e-01 1.923828e-02 1.603711e-02
## Eavg S:z S:p
## 7.930469e-03 3.248913e-01 7.452633e-01
set.seed(1234)
new_train_spf <- train_spf
new_train_spf$Y <- exp(new_train_spf$Y^2)
new_test_spf <- test_spf
new_test_spf$Y <- exp(new_test_spf$Y^2)
set.seed(1234)
modello3.3 <- lda(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, data = train_spf)
summary(modello3.3)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 56 -none- numeric
## scaling 28 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 3 -none- list
predict3.3.1 <- predict(modello3.3, newdata = test_spf)
predict3.3 <- predict3.3.1$posterior[, 2]
confusione3.3.1 <- confusionMatrix(as.factor(predict3.3.1$class), as.factor(test_spf$result))
confusione3.3.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15775 1193
## 1 231 159
##
## Accuracy : 0.918
## 95% CI : (0.9138, 0.922)
## No Information Rate : 0.9221
## P-Value [Acc > NIR] : 0.9794
##
## Kappa : 0.153
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9856
## Specificity : 0.1176
## Pos Pred Value : 0.9297
## Neg Pred Value : 0.4077
## Prevalence : 0.9221
## Detection Rate : 0.9088
## Detection Prevalence : 0.9775
## Balanced Accuracy : 0.5516
##
## 'Positive' Class : 0
##
confusione3.3 <- confusionMatrix(as.factor(ifelse(predict3.3>0.3, 1, 0)), as.factor(test_spf$result))
roc3.3 <- roc(test_spf$result ~ predict3.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.3), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict3.3)), as.numeric(as.character(test_spf$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 0.487283753 0.743641877 0.129926651 0.056251210 977.408498151
## D:p U U:Chi-sq U:p Q
## NA 0.006455379 114.052465074 0.000000000 0.049795831
## Brier Intercept Slope Emax E90
## 0.066883628 -0.556372588 0.751043723 0.343102307 0.019185729
## Eavg S:z S:p
## 0.016310743 3.167461664 0.001537760
set.seed(1234)
modello3.4 <- ranger(train_spf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a,
data = new_train_spf, num.trees = 500, mtry = 3, seed = 42, probability = T)
train_predict_3.4.1 <- predict(modello3.4, data = new_train_spf)
train_predict3.4 <- train_predict_3.4.1$predictions[, 2]
predict3.4.1 <- predict(modello3.4, data = new_test_spf)
predict3.4 <- predict3.4.1$predictions[, 2]
confusione3.4 <- confusionMatrix(as.factor(ifelse(predict3.4 >0.3, 1, 0)), as.factor(test_spf$result))
confusione3.4
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15519 717
## 1 487 635
##
## Accuracy : 0.9306
## 95% CI : (0.9268, 0.9344)
## No Information Rate : 0.9221
## P-Value [Acc > NIR] : 1.086e-05
##
## Kappa : 0.4763
##
## Mcnemar's Test P-Value : 4.121e-11
##
## Sensitivity : 0.9696
## Specificity : 0.4697
## Pos Pred Value : 0.9558
## Neg Pred Value : 0.5660
## Prevalence : 0.9221
## Detection Rate : 0.8941
## Detection Prevalence : 0.9354
## Balanced Accuracy : 0.7196
##
## 'Positive' Class : 0
##
roc3.4 <- roc(test_spf$result ~ as.numeric(predict3.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.4), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict3.4)), as.numeric(as.character(new_test_spf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict3.4)),
## as.numeric(as.character(new_test_spf$result))): 383 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 6.650859e-01 8.325429e-01 3.375476e-01 1.553417e-01 2.637926e+03
## D:p U U:Chi-sq U:p Q
## NA 2.683426e-03 4.755116e+01 4.724943e-11 1.526583e-01
## Brier Intercept Slope Emax E90
## 5.104138e-02 -3.530922e-01 8.691969e-01 6.181181e-02 4.470280e-02
## Eavg S:z S:p
## 1.567332e-02 -2.138949e+00 3.243979e-02
set.seed(1234)
modello3.5 <- ranger(train_spf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a,
data = new_train_spf, num.trees = 500, seed = 42, probability = T)
predict3.5.1 <- predict(modello3.5, data = new_test_spf)
predict3.5 <- predict3.5.1$predictions[, 2]
confusione3.5 <- confusionMatrix(as.factor(ifelse(predict3.5>0.3, 1, 0)), as.factor(new_test_spf$result))
confusione3.5
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15591 758
## 1 415 594
##
## Accuracy : 0.9324
## 95% CI : (0.9286, 0.9361)
## No Information Rate : 0.9221
## P-Value [Acc > NIR] : 1.239e-07
##
## Kappa : 0.4677
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9741
## Specificity : 0.4393
## Pos Pred Value : 0.9536
## Neg Pred Value : 0.5887
## Prevalence : 0.9221
## Detection Rate : 0.8982
## Detection Prevalence : 0.9419
## Balanced Accuracy : 0.7067
##
## 'Positive' Class : 0
##
roc3.5 <- roc(new_test_spf$result ~ as.numeric(predict3.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.5), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict3.5)), as.numeric(as.character(new_test_spf$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 6.772853e-01 8.386426e-01 3.486116e-01 1.588321e-01 2.758008e+03
## D:p U U:Chi-sq U:p Q
## NA 4.519128e-04 9.844303e+00 7.283445e-03 1.583802e-01
## Brier Intercept Slope Emax E90
## 5.136852e-02 4.054414e-02 1.056891e+00 9.273364e-02 2.470800e-02
## Eavg S:z S:p
## 8.638274e-03 -3.800341e+00 1.444973e-04
set.seed(1234)
modello3.8 <- nnet(result ~ minute + lastAction + is_weakfoot + X + Y + h_a, data = new_train_spf, size = 10, maxit = 1000, linout = FALSE)
## # weights: 301
## initial value 23522.705483
## iter 10 value 14179.880092
## iter 20 value 13625.005698
## iter 30 value 13418.429983
## iter 40 value 13173.057450
## iter 50 value 12640.458928
## iter 60 value 12265.364331
## iter 70 value 12038.069262
## iter 80 value 11934.605007
## iter 90 value 11896.243136
## iter 100 value 11893.367623
## iter 110 value 11884.155700
## iter 120 value 11866.894186
## iter 130 value 11850.193146
## iter 140 value 11837.232488
## iter 150 value 11820.460297
## iter 160 value 11805.131341
## iter 170 value 11790.186472
## iter 180 value 11778.943767
## iter 190 value 11768.430048
## iter 200 value 11754.212725
## iter 210 value 11746.100902
## iter 220 value 11740.054020
## iter 230 value 11733.960416
## iter 240 value 11728.417755
## iter 250 value 11724.163176
## iter 260 value 11721.026534
## iter 270 value 11718.333051
## iter 280 value 11714.357414
## iter 290 value 11713.123854
## iter 300 value 11712.121098
## iter 310 value 11711.590404
## iter 320 value 11710.029488
## iter 330 value 11706.959369
## iter 340 value 11705.784964
## iter 350 value 11704.471415
## iter 360 value 11703.823626
## iter 370 value 11703.137776
## iter 380 value 11702.167481
## iter 390 value 11700.827529
## iter 400 value 11699.873236
## iter 410 value 11698.942583
## final value 11698.629976
## converged
predict3.8 <- predict(modello3.8, newdata = new_test_spf)
confusione3.8 <- confusionMatrix(as.factor(ifelse(predict3.8>0.3, 1, 0)), new_test_spf$result)
confusione3.8
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15425 951
## 1 581 401
##
## Accuracy : 0.9117
## 95% CI : (0.9074, 0.9159)
## No Information Rate : 0.9221
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2976
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9637
## Specificity : 0.2966
## Pos Pred Value : 0.9419
## Neg Pred Value : 0.4084
## Prevalence : 0.9221
## Detection Rate : 0.8886
## Detection Prevalence : 0.9434
## Balanced Accuracy : 0.6301
##
## 'Positive' Class : 0
##
roc3.8 <- roc(new_test_spf$result ~ as.numeric(predict3.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.8), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict3.8)), as.numeric(as.character(new_test_spf$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict3.8)),
## as.numeric(as.character(new_test_spf$result))): 73 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 5.362365e-01 7.681183e-01 1.831183e-01 8.020757e-02 1.387388e+03
## D:p U U:Chi-sq U:p Q
## NA Inf Inf 0.000000e+00 -Inf
## Brier Intercept Slope Emax E90
## 6.358302e-02 -2.002981e-01 9.038563e-01 1.008896e-01 8.304276e-03
## Eavg S:z S:p
## 5.422550e-03 1.609048e+00 1.076059e-01
confusione3.us <- confusionMatrix(as.factor(ifelse(test_spf$`xG Understat`>0.3, 1, 0)), test_spf$result)
confusione3.us
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15438 770
## 1 568 582
##
## Accuracy : 0.9229
## 95% CI : (0.9188, 0.9268)
## No Information Rate : 0.9221
## P-Value [Acc > NIR] : 0.3524
##
## Kappa : 0.424
##
## Mcnemar's Test P-Value : 3.907e-08
##
## Sensitivity : 0.9645
## Specificity : 0.4305
## Pos Pred Value : 0.9525
## Neg Pred Value : 0.5061
## Prevalence : 0.9221
## Detection Rate : 0.8894
## Detection Prevalence : 0.9337
## Balanced Accuracy : 0.6975
##
## 'Positive' Class : 0
##
roc3.us <- roc(test_spf$result ~ as.numeric(test_spf$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc3.us), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(test_spf$`xG Understat`)), as.numeric(as.character(test_spf$result)))
## Warning in rms::val.prob(as.numeric(as.character(test_spf$`xG Understat`)), :
## 13 observations deleted from logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 5.895407e-01 7.947703e-01 2.724341e-01 1.219437e-01 2.116114e+03
## D:p U U:Chi-sq U:p Q
## NA 5.332985e-04 1.125006e+01 3.606451e-03 1.214104e-01
## Brier Intercept Slope Emax E90
## 5.646719e-02 -1.819309e-01 9.393373e-01 4.516177e-02 6.395073e-03
## Eavg S:z S:p
## 5.191630e-03 -5.115445e-02 9.592024e-01
set.seed(123)
index_sph <- sample.split(Y = setpiece_head$result, SplitRatio = 0.75)
train_sph <- setpiece_head[index_sph, ]
indice_test_sph <- which(index_sph == FALSE)
test_sph <- setpiece_head[indice_test_sph, ]
modello4.1 <- glm(result ~ minute + lastAction + X + exp(Y^2) + h_a,
data = train_sph, family = binomial)
summary(modello4.1)
##
## Call:
## glm(formula = result ~ minute + lastAction + X + exp(Y^2) + h_a,
## family = binomial, data = train_sph)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.420e+01 7.364e-01 -32.868 < 2e-16 ***
## minute -1.725e-03 7.315e-04 -2.358 0.01836 *
## lastActionBallRecovery -1.282e+01 4.131e+02 -0.031 0.97524
## lastActionBallTouch -1.418e-01 3.709e-01 -0.382 0.70231
## lastActionBlockedPass -1.300e+01 2.409e+02 -0.054 0.95697
## lastActionCard -1.281e+01 3.355e+02 -0.038 0.96954
## lastActionChallenge -1.291e+01 4.496e+02 -0.029 0.97708
## lastActionChipped 6.573e-01 1.014e-01 6.481 9.14e-11 ***
## lastActionCornerAwarded -1.219e+00 7.181e-01 -1.698 0.08954 .
## lastActionCross 7.615e-01 4.764e-02 15.986 < 2e-16 ***
## lastActionDispossessed 7.975e-01 1.060e+00 0.753 0.45168
## lastActionEnd -4.132e-01 5.973e-01 -0.692 0.48913
## lastActionFoul -8.870e-01 5.903e-01 -1.502 0.13297
## lastActionGoal -1.269e+01 1.798e+02 -0.071 0.94375
## lastActionHeadPass 1.108e+00 1.048e-01 10.566 < 2e-16 ***
## lastActionInterception -1.269e+01 4.071e+02 -0.031 0.97513
## lastActionNone 3.315e-01 1.422e-01 2.332 0.01970 *
## lastActionOffsidePass -1.249e+01 2.712e+02 -0.046 0.96326
## lastActionPass 1.859e+00 2.146e-01 8.660 < 2e-16 ***
## lastActionRebound 1.199e+00 1.118e-01 10.724 < 2e-16 ***
## lastActionSubstitutionOn -1.292e+01 5.695e+02 -0.023 0.98190
## lastActionTackle -1.305e+01 3.234e+02 -0.040 0.96782
## lastActionTakeOn -1.262e+01 3.072e+02 -0.041 0.96722
## X 2.461e+01 7.397e-01 33.267 < 2e-16 ***
## exp(Y^2) -9.621e-01 2.060e-01 -4.671 2.99e-06 ***
## h_ah -1.052e-01 3.876e-02 -2.715 0.00663 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 21149 on 34346 degrees of freedom
## Residual deviance: 19194 on 34321 degrees of freedom
## AIC: 19246
##
## Number of Fisher Scoring iterations: 14
predict4.1 <- predict.glm(modello4.1, newdata = test_sph, type = "response")
confusione4.1 <- confusionMatrix(as.factor(ifelse(predict4.1>0.3, 1, 0)), as.factor(test_sph$result))
confusione4.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10215 962
## 1 177 95
##
## Accuracy : 0.9005
## 95% CI : (0.8949, 0.9059)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 0.9958
##
## Kappa : 0.1093
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98297
## Specificity : 0.08988
## Pos Pred Value : 0.91393
## Neg Pred Value : 0.34926
## Prevalence : 0.90768
## Detection Rate : 0.89222
## Detection Prevalence : 0.97624
## Balanced Accuracy : 0.53642
##
## 'Positive' Class : 0
##
roc4.1 <- roc(test_sph$result ~ predict4.1, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.1), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict4.1)), as.numeric(as.character(test_sph$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 4.251367e-01 7.125684e-01 1.037063e-01 4.876773e-02 5.593418e+02
## D:p U U:Chi-sq U:p Q
## NA 1.560155e-04 3.786222e+00 1.506026e-01 4.861172e-02
## Brier Intercept Slope Emax E90
## 7.906927e-02 -1.739121e-01 9.218147e-01 2.812030e-02 1.657528e-02
## Eavg S:z S:p
## 5.413573e-03 -2.727723e-02 9.782386e-01
set.seed(1234)
new_train_sph <- train_sph
new_train_sph$Y <- exp(new_train_sph$Y^2)
new_test_sph <- test_sph
new_test_sph$Y <- exp(new_test_sph$Y^2)
set.seed(1234)
modello4.3 <- lda(result ~ minute + lastAction + X + exp(Y^2) + h_a, data = train_sph)
summary(modello4.3)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 50 -none- numeric
## scaling 25 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 2 -none- list
predict4.3.1 <- predict(modello4.3, newdata = test_sph)
predict4.3 <- predict4.3.1$posterior[, 2]
confusione4.3.1 <- confusionMatrix(as.factor(predict4.3.1$class), as.factor(test_sph$result))
confusione4.3.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10324 1011
## 1 68 46
##
## Accuracy : 0.9058
## 95% CI : (0.9003, 0.911)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 0.7668
##
## Kappa : 0.0617
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99346
## Specificity : 0.04352
## Pos Pred Value : 0.91081
## Neg Pred Value : 0.40351
## Prevalence : 0.90768
## Detection Rate : 0.90174
## Detection Prevalence : 0.99004
## Balanced Accuracy : 0.51849
##
## 'Positive' Class : 0
##
confusione4.3 <- confusionMatrix(as.factor(ifelse(predict4.3>0.3, 1, 0)), as.factor(test_sph$result))
roc4.3 <- roc(test_sph$result ~ predict4.3, plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.3), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict4.3)), as.numeric(as.character(test_sph$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 4.176003e-01 7.088001e-01 9.620260e-02 4.515158e-02 5.179404e+02
## D:p U U:Chi-sq U:p Q
## NA 1.869583e-03 2.340486e+01 8.273686e-06 4.328199e-02
## Brier Intercept Slope Emax E90
## 8.005259e-02 -3.824800e-01 8.242660e-01 3.101720e-01 1.341532e-02
## Eavg S:z S:p
## 1.179437e-02 1.354577e+00 1.755522e-01
set.seed(1234)
modello4.4 <- ranger(train_sph$result ~ minute + lastAction + X + Y + h_a,
data = new_train_sph, num.trees = 500, mtry = 3, seed = 42, probability = T)
train_predict_4.4.1 <- predict(modello4.4, data = new_train_sph)
train_predict4.4 <- train_predict_4.4.1$predictions[, 2]
predict4.4.1 <- predict(modello4.4, data = new_test_sph)
predict4.4 <- predict4.4.1$predictions[, 2]
confusione4.4 <- confusionMatrix(as.factor(ifelse(predict4.4 >0.3, 1, 0)), as.factor(test_sph$result))
confusione4.4
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9942 631
## 1 450 426
##
## Accuracy : 0.9056
## 95% CI : (0.9001, 0.9109)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 0.786
##
## Kappa : 0.3897
##
## Mcnemar's Test P-Value : 4.383e-08
##
## Sensitivity : 0.9567
## Specificity : 0.4030
## Pos Pred Value : 0.9403
## Neg Pred Value : 0.4863
## Prevalence : 0.9077
## Detection Rate : 0.8684
## Detection Prevalence : 0.9235
## Balanced Accuracy : 0.6799
##
## 'Positive' Class : 0
##
roc4.4 <- roc(test_sph$result ~ as.numeric(predict4.4), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.4), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict4.4)), as.numeric(as.character(new_test_sph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict4.4)),
## as.numeric(as.character(new_test_sph$result))): 869 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 0.60666454 0.80333227 0.28605354 0.14680716 1554.21971840
## D:p U U:Chi-sq U:p Q
## NA Inf Inf 0.00000000 -Inf
## Brier Intercept Slope Emax E90
## 0.06304306 -0.46515693 0.76715854 0.09687157 0.05778390
## Eavg S:z S:p
## 0.02110086 0.94272703 0.34582054
set.seed(1234)
modello4.5 <- ranger(train_sph$result ~ minute + lastAction + X + Y + h_a,
data = new_train_sph, num.trees = 500, seed = 42, probability = T)
predict4.5.1 <- predict(modello4.5, data = new_test_sph)
predict4.5 <- predict4.5.1$predictions[, 2]
confusione4.5 <- confusionMatrix(as.factor(ifelse(predict4.5>0.3, 1, 0)), as.factor(new_test_sph$result))
confusione4.5
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10042 658
## 1 350 399
##
## Accuracy : 0.912
## 95% CI : (0.9066, 0.9171)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 0.05793
##
## Kappa : 0.3956
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9663
## Specificity : 0.3775
## Pos Pred Value : 0.9385
## Neg Pred Value : 0.5327
## Prevalence : 0.9077
## Detection Rate : 0.8771
## Detection Prevalence : 0.9346
## Balanced Accuracy : 0.6719
##
## 'Positive' Class : 0
##
roc4.5 <- roc(new_test_sph$result ~ as.numeric(predict4.5), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.5), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict4.5)), as.numeric(as.character(new_test_sph$result)))
## Warning in rms::val.prob(as.numeric(as.character(predict4.5)),
## as.numeric(as.character(new_test_sph$result))): 32 observations deleted from
## logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 6.444300e-01 8.222150e-01 3.168346e-01 1.575874e-01 1.800175e+03
## D:p U U:Chi-sq U:p Q
## NA 6.581974e-04 9.514640e+00 8.588598e-03 1.569292e-01
## Brier Intercept Slope Emax E90
## 6.248466e-02 -1.808876e-01 9.179875e-01 1.028428e-01 3.948918e-02
## Eavg S:z S:p
## 1.451649e-02 -8.034238e-01 4.217298e-01
modello4.8 <- nnet(result ~ minute + lastAction + X + Y + h_a, data = new_train_sph, size = 10, maxit = 1000, linout = FALSE)
## # weights: 271
## initial value 14444.509965
## iter 10 value 10571.777439
## iter 20 value 10555.297209
## iter 30 value 10535.397340
## iter 40 value 10531.264920
## iter 50 value 10529.572248
## iter 60 value 10528.543519
## final value 10528.519092
## converged
predict4.8 <- predict(modello4.8, newdata = new_test_sph)
confusione4.8 <- confusionMatrix(as.factor(ifelse(predict4.8>0.3, 1, 0)), new_test_sph$result)
confusione4.8
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10390 1054
## 1 2 3
##
## Accuracy : 0.9078
## 95% CI : (0.9023, 0.913)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 0.4953
##
## Kappa : 0.0048
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.999808
## Specificity : 0.002838
## Pos Pred Value : 0.907899
## Neg Pred Value : 0.600000
## Prevalence : 0.907678
## Detection Rate : 0.907503
## Detection Prevalence : 0.999563
## Balanced Accuracy : 0.501323
##
## 'Positive' Class : 0
##
roc4.8 <- roc(new_test_sph$result ~ as.numeric(predict4.8), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.8), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(predict4.8)), as.numeric(as.character(new_test_sph$result)))
## Dxy C (ROC) R2 D D:Chi-sq
## 1.874104e-02 5.093705e-01 2.306491e-03 9.736699e-04 1.214755e+01
## D:p U U:Chi-sq U:p Q
## NA 6.033001e-05 2.690718e+00 2.604461e-01 9.133399e-04
## Brier Intercept Slope Emax E90
## 8.366746e-02 -6.346217e-01 7.213535e-01 8.956164e-02 1.159593e-03
## Eavg S:z S:p
## 2.291678e-03 9.299804e-02 9.259051e-01
confusione4.us <- confusionMatrix(as.factor(ifelse(test_sph$`xG Understat`>0.3, 1, 0)), test_sph$result)
confusione4.us
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9805 626
## 1 587 431
##
## Accuracy : 0.8941
## 95% CI : (0.8883, 0.8996)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.3572
##
## Mcnemar's Test P-Value : 0.2752
##
## Sensitivity : 0.9435
## Specificity : 0.4078
## Pos Pred Value : 0.9400
## Neg Pred Value : 0.4234
## Prevalence : 0.9077
## Detection Rate : 0.8564
## Detection Prevalence : 0.9111
## Balanced Accuracy : 0.6756
##
## 'Positive' Class : 0
##
roc4.us <- roc(test_sph$result ~ as.numeric(test_sph$`xG Understat`), plot = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = paste("AUC =", round(auc(roc4.us), 4)), lwd = 2, box.lwd = 0, bg = "white")
rms::val.prob(as.numeric(as.character(test_sph$`xG Understat`)), as.numeric(as.character(test_sph$result)))
## Warning in rms::val.prob(as.numeric(as.character(test_sph$`xG Understat`)), : 2
## observations deleted from logistic calibration due to probs. of 0 or 1
## Dxy C (ROC) R2 D D:Chi-sq
## 5.820363e-01 7.910181e-01 2.272736e-01 1.102882e-01 1.263469e+03
## D:p U U:Chi-sq U:p Q
## NA 5.036564e-04 7.765355e+00 2.059561e-02 1.097845e-01
## Brier Intercept Slope Emax E90
## 7.092230e-02 -1.467926e-01 9.228842e-01 1.822378e-01 4.303519e-03
## Eavg S:z S:p
## 4.759339e-03 1.399728e+00 1.615947e-01
accuracy_opf <- data.frame(confusione1.us$overall["Accuracy"], confusione1.1$overall["Accuracy"], confusione1.2$overall["Accuracy"], confusione1.3$overall["Accuracy"], confusione1.4$overall["Accuracy"], confusione1.5$overall["Accuracy"], confusione1.8$overall["Accuracy"])
nomi_acc1 <- c("Understat", "Lineare", "Interazione", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_opf) <- nomi_acc1
auc_opf <- data.frame(roc1.us$auc, roc1.1$auc, roc1.2$auc, roc1.3$auc, roc1.4$auc, roc1.5$auc, roc1.8$auc)
names(auc_opf) <- nomi_acc1
acc_auc1 <- rbind(accuracy_opf, auc_opf)
rownames(acc_auc1) <- c("Accuracy", "AUC")
acc_auc1
## Understat Lineare Interazione Discriminante Random_Forest Bagging
## Accuracy 0.8875595 0.8888058 0.8860917 0.8850532 0.9011161 0.9064058
## AUC 0.8145883 0.7468567 0.7495843 0.7447884 0.8374263 0.8392269
## Neural_Net
## Accuracy 0.8885704
## AUC 0.7913828
accuracy_oph <- data.frame(confusione2.us$overall["Accuracy"], confusione2.1$overall["Accuracy"], confusione2.3$overall["Accuracy"], confusione2.4$overall["Accuracy"], confusione2.5$overall["Accuracy"], confusione2.8$overall["Accuracy"])
nomi_acc2 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_oph) <- nomi_acc2
auc_oph <- data.frame(roc1.us$auc, roc1.1$auc, roc1.3$auc, roc1.4$auc, roc1.5$auc, roc1.8$auc)
names(auc_oph) <- nomi_acc2
acc_auc2 <- rbind.data.frame(accuracy_oph, auc_oph)
rownames(acc_auc2) <- c("Accuracy", "AUC")
acc_auc2
## Understat Lineare Discriminante Random_Forest Bagging Neural_Net
## Accuracy 0.8503896 0.8577922 0.8579221 0.8659740 0.8727273 0.8662338
## AUC 0.8145883 0.7468567 0.7447884 0.8374263 0.8392269 0.7913828
accuracy_spf <- data.frame(confusione3.us$overall["Accuracy"], confusione3.1$overall["Accuracy"], confusione3.3$overall["Accuracy"], confusione3.4$overall["Accuracy"], confusione3.5$overall["Accuracy"], confusione3.8$overall["Accuracy"])
nomi_acc3 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_spf) <- nomi_acc3
auc_spf <- data.frame(roc3.us$auc, roc3.1$auc, roc3.3$auc, roc3.4$auc, roc3.5$auc, roc3.8$auc)
names(auc_spf) <- nomi_acc3
acc_auc3 <- rbind.data.frame(accuracy_spf, auc_spf)
rownames(acc_auc3) <- c("Accuracy", "AUC")
acc_auc3
## Understat Lineare Discriminante Random_Forest Bagging Neural_Net
## Accuracy 0.9229174 0.9138150 0.9033875 0.9306372 0.9324231 0.9117410
## AUC 0.7949585 0.7540138 0.7436472 0.8365386 0.8386365 0.7650791
accuracy_sph <- data.frame(confusione4.us$overall["Accuracy"], confusione4.1$overall["Accuracy"], confusione4.3$overall["Accuracy"], confusione4.4$overall["Accuracy"], confusione4.5$overall["Accuracy"], confusione4.8$overall["Accuracy"])
nomi_acc2 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_sph) <- nomi_acc2
auc_sph <- data.frame(roc4.us$auc, roc4.1$auc, roc4.3$auc, roc4.4$auc, roc4.5$auc, roc4.8$auc)
names(auc_sph) <- nomi_acc2
acc_auc4 <- rbind.data.frame(accuracy_sph, auc_sph)
rownames(acc_auc4) <- c("Accuracy", "AUC")
acc_auc4
## Understat Lineare Discriminante Random_Forest Bagging Neural_Net
## Accuracy 0.8940519 0.9005153 0.8967595 0.9055813 0.9119574 0.9077649
## AUC 0.7910585 0.7125744 0.7087912 0.8190310 0.8227611 0.5094168
test_opf_rf <- cbind(test_opf, predict1.4)
names(test_opf_rf)[names(test_opf_rf) == "predict1.4"] <- "xG_Random_Forest"
train_opf_rf <- cbind(train_opf, train_predict1.4)
names(train_opf_rf)[names(train_opf_rf) == "train_predict1.4"] <- "xG_Random_Forest"
messi_shots_opf_test <- test_opf_rf %>% filter(player == "Lionel Messi")
messi_shots_opf_train <- train_opf_rf %>% filter(player == "Lionel Messi")
messi_shots_opf <- bind_rows(messi_shots_opf_train, messi_shots_opf_test)
messi_shots_opf$shotType <- ifelse(messi_shots_opf$shotType == "LeftFoot", "Left", "Right")
messi_shots_opf <- messi_shots_opf %>%
mutate(X = X * 100, Y = Y * 100)
messi_shots_opf <- messi_shots_opf %>%
mutate(result = factor(result, levels = c(0, 1), labels = c("No Goal", "Goal")),
match_info = paste(h_team, "vs", a_team, "<br>", h_goals, "-", a_goals),
date_formatted = format(as.Date(date), "%d-%m-%Y"))
generate_center_circle_arc <- function(center_x = 50, center_y = 50, r = 9.15, n_points = 100) {
theta <- seq(-pi / 2, pi / 2, length.out = n_points) # Modificato per metà arco, da -pi/2 a pi/2
data.frame(
x = center_x + r * cos(theta), # Mantiene il centro sul bordo visibile del campo
y = center_y + r * sin(theta)
)
}
# Genera i punti della lunetta del centrocampo
center_circle_arc <- generate_center_circle_arc(center_x = 50, center_y = 50)
# Esempio di dataset con coordinate normalizzate (da 0 a 1)
p <- ggplot() +
annotate_pitch(colour = "black", fill = "white") + # disegna il campo
theme_pitch() + # tema predefinito per il campo
theme(panel.background = element_rect(fill = "green4")) + # colore sfondo campo
# Aggiungi la lunetta del centrocampo
geom_path(data = center_circle_arc, aes(x = x, y = y), color = "black", size = 0.8) +
geom_point(data = messi_shots_opf,
aes(x = X, y = Y,
color = factor(result), # colori differenti per goal e no goal
shape = factor(result), # forme differenti per goal e no goal
text = paste("Random Forest xG:", round(xG_Random_Forest, 4),
"<br>X:", round(X, 2),
"<br>Y:", round(Y, 2),
"<br>Foot:", shotType,
"<br>", match_info,
"<br>", date_formatted,
"<br>Minute:", minute)),
size = 2, stroke = 0.3) +
scale_color_manual(values = c("No Goal" = "black", "Goal" = "red"),
name = "Esito Tiro") +
scale_shape_manual(values = c("No Goal" = 1, "Goal" = 16),
name = "Esito Tiro") +
labs(title = "<span style='font-size:16pt;'>Tiri effettuati da Lionel Messi in campionato</span><br><span style='font-size:12pt;'>di piede e in situazione di Open Play tra il 2014 e il 2021", x = " ", y = " ") +
theme_minimal() +
coord_cartesian(xlim = c(50, 100))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in geom_point(data = messi_shots_opf, aes(x = X, y = Y, color =
## factor(result), : Ignoring unknown aesthetics: text
p_interactive <- ggplotly(p, tooltip = "text")
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
p_interactive
test_spf_rf <- cbind(test_spf, predict3.4)
names(test_spf_rf)[names(test_spf_rf) == "predict3.4"] <- "xG_Random_Forest"
test_sph_rf <- cbind(test_sph, predict4.4)
names(test_sph_rf)[names(test_sph_rf) == "predict4.4"] <- "xG_Random_Forest"
train_spf_rf <- cbind(new_train_spf, train_predict3.4)
names(test_spf_rf)[names(test_spf_rf) == "train_predict3.4"] <- "xG_Random_Forest"
train_sph_rf <- cbind(new_train_sph, train_predict4.4)
names(test_sph_rf)[names(test_sph_rf) == "train_predict4.4"] <- "xG_Random_Forest"
seriea_corner_spf_test <- test_spf_rf %>%
filter(situation == "FromCorner", lega == "Serie A")
seriea_corner_sph_test <- test_sph_rf %>%
filter(situation == "FromCorner", lega == "Serie A")
seriea_corner_spf_train <- train_spf_rf %>%
filter(situation == "FromCorner", lega == "Serie A")
seriea_corner_sph_train <- train_sph_rf %>%
filter(situation == "FromCorner", lega == "Serie A")
seriea_corner <- bind_rows(seriea_corner_spf_test, seriea_corner_sph_test, seriea_corner_spf_train, seriea_corner_sph_train)
seriea_corner$foot <- ifelse(seriea_corner$shotType %in% c("RightFoot", "LeftFoot"),
"foot",
"head")
seriea_corner <- seriea_corner %>%
mutate(team_name = ifelse(h_a == "h", h_team, a_team),
match_info = paste(h_team, "vs", a_team, "<br>", h_goals, "-", a_goals),
date_formatted = format(as.Date(date), "%d-%m-%Y"),
result = factor(result, levels = c(0, 1), labels = c("No Goal", "Goal")))
seriea_corner <- seriea_corner %>%
mutate(X = X * 100, Y = Y * 100)
seriea_corner_20_21 <- seriea_corner %>%
filter(season == "2020")
seriea_corner_19_21 <- seriea_corner %>%
filter(season %in% c("2019", "2020"))
generate_center_circle_arc <- function(center_x = 50, center_y = 50, r = 9.15, n_points = 100) {
theta <- seq(-pi / 2, pi / 2, length.out = n_points) # Modificato per metà arco, da -pi/2 a pi/2
data.frame(
x = center_x + r * cos(theta), # Mantiene il centro sul bordo visibile del campo
y = center_y + r * sin(theta)
)
}
# Genera i punti della lunetta del centrocampo
center_circle_arc <- generate_center_circle_arc(center_x = 50, center_y = 50)
# Esempio di dataset con coordinate normalizzate (da 0 a 1)
p_seriea <- ggplot() +
annotate_pitch(colour = "black", fill = "white") + # disegna il campo
theme_pitch() + # tema predefinito per il campo
theme(panel.background = element_rect(fill = "green4")) + # colore sfondo campo
# Aggiungi la lunetta del centrocampo
geom_path(data = center_circle_arc, aes(x = x, y = y), color = "black", size = 0.8) +
geom_point(data = seriea_corner_19_21,
aes(x = X, y = Y,
color = factor(result), # colori differenti per goal e no goal
shape = factor(result), # forme differenti per goal e no goal
text = paste("Random Forest xG:", round(xG_Random_Forest, 4),
"<br>X:", round(X, 2),
"<br>Y:", round(Y, 2),
"<br>Player:", player,
"<br>Type:", shotType,
"<br>Assist:", player_assisted,
"<br>Team:", team_name,
"<br>", match_info,
"<br>", date_formatted,
"<br>Minute:", minute)),
size = 2, stroke = 0.3) +
scale_color_manual(values = c("No Goal" = "black", "Goal" = "red"),
name = "Esito Tiro") +
scale_shape_manual(values = c("No Goal" = 1, "Goal" = 16),
name = "Esito Tiro") +
labs(title = "<span style='font-size:16pt;'>Tiri effettuati in Serie A da calcio d'angolo</span><br><span style='font-size:12pt;'>nelle stagioni 2019/20 e 2020/21</span>", x = " ", y = " ") +
theme_minimal() +
coord_cartesian(xlim = c(50, 100))
## Warning in geom_point(data = seriea_corner_19_21, aes(x = X, y = Y, color =
## factor(result), : Ignoring unknown aesthetics: text
p_interactive_seriea <- ggplotly(p_seriea, tooltip = "text")
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
p_interactive_seriea
tot_opf_rf <- bind_rows(train_opf_rf, test_opf_rf)
tot_opf_rf <- tot_opf_rf %>%
mutate(X = X * 100, Y = Y * 100)
p_heatmap_custom <- ggplot(tot_opf_rf, aes(x = X, y = Y)) +
annotate_pitch(colour = "black", fill = "white") +
theme_pitch() +
theme(panel.background = element_rect(fill = "lightblue")) +
# Usa stat_summary_2d invece di geom_tile, per calcolare densità in ogni area
stat_summary_2d(aes(z = xG_Random_Forest), fun = mean, bins = 65, alpha = 0.8) +
scale_fill_viridis(option = "plasma", name = "xG Medio", direction = -1,
breaks = pretty_breaks(n = 5),
limits = c(0, max(tot_opf_rf$xG_Random_Forest, na.rm = TRUE))) +
# Migliora i dettagli del grafico
labs(
title = "Heat Map degli Expected Goals (xG)\nTiri Effettuati in Situazione di Open Play",
x = NULL, y = NULL
) +
# Coordinata per mostrare solo la metà campo
coord_cartesian(xlim = c(50, 100), ylim = c(0, 100)) +
theme_minimal() +
theme(
axis.text = element_blank(), # Nascondi i valori degli assi
axis.ticks = element_blank(), # Nascondi i ticks degli assi
panel.grid = element_blank(), # Rimuovi le griglie
legend.position = "bottom", # Posiziona la legenda in basso
plot.title = element_text(size = 14, face = "bold", hjust = 0.5) # Centra il titolo e lo rende più visibile
)
# Visualizza il grafico
p_heatmap_custom