Packages and Data

library(tools)
library(stringi)
library(gtsummary)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(predtools)
library(magrittr)
library(probably)

## 
## Attaching package: 'probably'

## The following objects are masked from 'package:base':
## 
##     as.factor, as.ordered

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tibble       3.2.1
## ✔ dplyr        1.1.4     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()         masks scales::discard()
## ✖ tidyr::extract()         masks magrittr::extract()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ purrr::lift()            masks caret::lift()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ purrr::set_names()       masks magrittr::set_names()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

library(caTools)
library(boot)

## 
## Attaching package: 'boot'

## The following object is masked from 'package:lattice':
## 
##     melanoma

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## The following object is masked from 'package:gtsummary':
## 
##     select

library(rpart)

## 
## Attaching package: 'rpart'

## The following object is masked from 'package:dials':
## 
##     prune

library(ranger)

## 
## Attaching package: 'ranger'

## The following object is masked from 'package:randomForest':
## 
##     importance

library(gbm)

## Loaded gbm 2.2.2

## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

library(e1071)

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:tune':
## 
##     tune

## The following object is masked from 'package:rsample':
## 
##     permutations

## The following object is masked from 'package:parsnip':
## 
##     tune

library(nnet)
library(dplyr)
library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:xgboost':
## 
##     slice

## The following object is masked from 'package:MASS':
## 
##     select

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggsoccer)
library(ggtext)
library(viridis)

## Loading required package: viridisLite

## 
## Attaching package: 'viridis'

## The following object is masked from 'package:scales':
## 
##     viridis_pal

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(rms)

## Loading required package: Hmisc

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:plotly':
## 
##     subplot

## The following object is masked from 'package:e1071':
## 
##     impute

## The following object is masked from 'package:parsnip':
## 
##     translate

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

shots_dataset <- read.csv("~/Cose Mie/shots_dataset.csv")
FullData <- read.csv("~/Cose Mie/FullData.csv")
players <- read.csv("~/Cose Mie/players.csv")
new.players <- read.csv("~/Cose Mie/new.players.csv")

kable(head(shots_dataset, 3), "html") %>%
  kable_styling(full_width = T)

id	minute	result	X	Y	xG	player	h_a	player_id	situation	season	shotType	match_id	h_team	a_team	h_goals	a_goals	date	player_assisted	lastAction
445975	0	MissedShots	0.759	0.644	0.0186014	Jorrit Hendrix	h	9392	OpenPlay	2021	LeftFoot	16265	Spartak Moscow	FK Akhmat	2	1	2021-12-04 14:00:00	Quincy Promes	Pass
445976	6	MissedShots	0.738	0.352	0.0136572	Jorrit Hendrix	h	9392	FromCorner	2021	LeftFoot	16265	Spartak Moscow	FK Akhmat	2	1	2021-12-04 14:00:00		None
445977	9	SavedShot	0.732	0.250	0.0310316	Quincy Promes	h	2815	DirectFreekick	2021	RightFoot	16265	Spartak Moscow	FK Akhmat	2	1	2021-12-04 14:00:00		Standard

Data Cleaning

shots_dataset$player <- stri_trans_general(shots_dataset$player, "Latin-ASCII")
giocatori <- as.data.frame(unique(shots_dataset$player))
kable(head(giocatori, 3), "html") %>%
  kable_styling(full_width = T)

unique(shots_dataset$player)
Jorrit Hendrix
Quincy Promes
Victor Moses

names(giocatori)[names(giocatori) == "unique(shots_dataset$player)"] <- "Name"
foot_dataset <- FullData[, c("Name", "Preffered_Foot")]
foot_dataset$Name <- stri_trans_general(foot_dataset$Name, "Latin-ASCII")
kable(head(foot_dataset, 3), "html") %>%
  kable_styling(full_width = T)

Name	Preffered_Foot
Cristiano Ronaldo	Right
Lionel Messi	Left
Neymar	Right

giocatori2 <- merge(giocatori, foot_dataset, by = "Name", all.x = TRUE)
kable(head(giocatori2, 3), "html") %>%
  kable_styling(full_width = T)

Name	Preffered_Foot
Aaron Connolly	NA
Aaron Cresswell	Left
Aaron Hickey	NA

colSums((is.na(giocatori2)))

##           Name Preffered_Foot 
##              0           2499

player2 <- players[, c("name", "foot")]
kable(head(player2, 3), "html") %>%
  kable_styling(full_width = T)

name	foot
Timo Hildebrand
Martin Petrov
Martin Amedick

player2 <- na.omit(player2)
player2$name <- stri_trans_general(player2$name, "Latin-ASCII")
names(player2)[names(player2) == "name"] <- "Name"

giocatori3 <- merge(giocatori2, player2, by = "Name", all.x = TRUE)
kable(head(giocatori3, 3), "html") %>%
  kable_styling(full_width = T)

Name	Preffered_Foot	foot
Aaron Connolly	NA	right
Aaron Cresswell	Left	left
Aaron Hickey	NA	both

giocatori3$Preffered_Foot <- ifelse(is.na(giocatori3$Preffered_Foot), giocatori3$foot, giocatori3$Preffered_Foot)
colSums(is.na(giocatori3))

##           Name Preffered_Foot           foot 
##              0            514            651

new.players <- new.players[, c("name", "foot")]
##
nomi_da_cambiare <- c("Andrej Galabinov", "Anssumane Fati", "Arnaud Kalimuendo Muinga", "Daniel Parejo", "Emile Smith-Rowe", "Kephren Thuram", "Kylian Mbappe-Lottin", "Marco Faraoni", "Nemanja Vidci", "Pablo Daniel Osvaldo", "Luca Toni", "Tanguy NDombele Alvaro", "Thiago Alcantara", "Yeremi Pino", "Papis Demba Cisse", "Kevin Kuranyi", "Dimitar Berbatov")

giocatori3 <- giocatori3 %>%
  mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare, "Right", Preffered_Foot))
giocatori3 <- subset(giocatori3, select = -c(foot))

nomi_da_cambiare2 <- c("Dimitri Kombarov", "Lee Kang-In", "Mohammed Ali-Cho", "Yaroslav Rakitskiy", "Pape Alassane Gueye")

giocatori3 <- giocatori3 %>%
  mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare2, "Left", Preffered_Foot))

nomi_da_cambiare3 <- c("Kostas Mitroglu", "Santiago Cazorla", "Son Heung-Min")

giocatori3 <- giocatori3 %>%
  mutate(Preffered_Foot = ifelse(Name %in% nomi_da_cambiare3, "both", Preffered_Foot))

giocatori3 <- giocatori3 %>%
  mutate(across(everything(), na_if, ""))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(everything(), na_if, "")`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

giocatori3$Preffered_Foot <- toTitleCase(giocatori3$Preffered_Foot)

giocatori3$Preffered_Foot <- ifelse(giocatori3$Preffered_Foot == "Right", "RightFoot", ifelse(giocatori3$Preffered_Foot == "Left", "LeftFoot", giocatori3$Preffered_Foot))

names(giocatori3)[names(giocatori3) == "Name"] <- "player"

shots_dataset2 <- merge(shots_dataset, giocatori3, by = "player", all.x = TRUE)

shots_dataset2 <- shots_dataset2 %>%
  mutate(Preffered_Foot = ifelse(player == "Franck Zambo", "Right", Preffered_Foot))

shots_dataset2 <- shots_dataset2[shots_dataset2$result != "OwnGoal", ]

shots_dataset2$result <- ifelse(shots_dataset2$result == "Goal", 1, 0)

shots_dataset2$result <- as.factor(shots_dataset2$result)

shots_dataset2 <- na.omit(shots_dataset2)

names(shots_dataset2)[names(shots_dataset2) == "xG"] <- "xG Understat"

kable(head(shots_dataset2, 3), "html") %>%
  kable_styling(full_width = T)

player	id	minute	X	Y	xG Understat	h_a	player_id	situation	season	shotType	match_id	h_team	a_team	h_goals	a_goals	date	player_assisted	lastAction	Preffered_Foot
Aaron Connolly	427269	51	0.913	0.536	0.5629963	h	7991	OpenPlay	2021	LeftFoot	16391	Brighton	Watford	2	0	2021-08-21 16:30:00		None	RightFoot
Aaron Connolly	329180	49	0.944	0.383	0.0140796	h	7991	FromCorner	2019	Head	11746	Brighton	Norwich	2	0	2019-11-02 15:00:00	Pascal Groß	Aerial	RightFoot
Aaron Connolly	338614	91	0.797	0.377	0.0308471	h	7991	OpenPlay	2019	RightFoot	11815	Brighton	Sheffield United	0	1	2019-12-21 15:00:00		None	RightFoot

premier_league <- c("Watford", "Norwich", "Sheffield United", "Brighton", "Southampton", "Chelsea", "Liverpool", "Manchester United", "Wolverhampton Wanderers", "Tottenham", "Aston Villa", "Burnley", "Swansea", "West Ham", "Leeds", "Crystal Palace", "Manchester City", "Newcastle United", "Fulham", "Hull", "Arsenal", "Bournemouth", "West Bromwich Albion", "Stoke", "Sunderland", "Huddersfield", "Everton", "Leicester", "Cardiff", "Middlesbrough", "Queens Park Rangers", "Brentford")
seriea <- c("Bologna", "Genoa", "Salernitana", "AC Milan", "Fiorentina", "Cagliari", "Lazio", "Juventus", "Parma Calcio 1913", "Crotone", "SPAL 2013", "Sassuolo", "Lecce", "Udinese", "Roma", "Torino", "Inter", "Brescia", "Verona", "Parma", "Palermo", "Napoli", "Carpi", "Frosinone", "Venezia", "Sampdoria", "Benevento", "Spezia", "Empoli", "Chievo", "Atalanta", "Cesena", "Pescara")
bundesliga <- c("Hamburger SV", "Wolfsburg", "Eintracht Frankfurt", "RasenBallsport Leipzig", "Freiburg", "Bayern Munich", "Bayer Leverkusen", "Schalke 04", "Hertha Berlin", "Werder Bremen", "VfB Stuttgart", "Darmstadt", "Bochum", "Mainz 05", "Hannover 96", "Arminia Bielefeld", "Fortuna Duesseldorf", "Augsburg", "Hoffenheim", "Borussia M.Gladbach", "Paderborn", "FC Cologne", "Borussia Dortmund", "Greuther Fuerth", "Nuernberg", "Ingolstadt", "Union Berlin")
ligue1 <- c("Toulouse", "Marseille", "Nice", "Bordeaux", "Saint-Etienne", "Lille", "Strasbourg", "Guingamp", "Metz", "Brest", "Lyon", "Montpellier", "Reims", "Dijon", "Nantes", "Angers", "Rennes", "SC Bastia", "Paris Saint Germain", "Monaco", "Caen", "GFC Ajaccio", "Nimes", "Lorient", "Amiens", "Lens", "Troyes", "Evian Thonon Gaillard", "Nancy", "Clermont Foot")
liga <- c("Real Betis", "Celta Vigo", "Alaves", "Levante", "Elche", "Granada", "Deportivo La Coruna", "Villarreal", "Cordoba", "Athletic Club", "Rayo Vallecano", "Mallorca", "Real Valladolid", "Real Sociedad", "Sevilla", "Atletico Madrid", "Real Madrid", "Eibar", "Getafe", "Barcelona", "Malaga", "Girona", "Las Palmas", "Valencia", "SD Huesca", "Sporting Gijon", "Osasuna", "Cadiz", "Leganes", "Almeria")
shots_dataset2$lega <- ifelse(shots_dataset2$h_team %in% premier_league | shots_dataset2$a_team %in% premier_league, "Premier League",
                              ifelse(shots_dataset2$h_team %in% seriea | shots_dataset2$a_team %in% seriea, "Serie A", 
                                     ifelse(shots_dataset2$h_team %in% bundesliga | shots_dataset2$a_team %in% bundesliga, "Bundesliga", 
                                            ifelse(shots_dataset2$h_team %in% ligue1 | shots_dataset2$a_team %in% ligue1, "Ligue 1", 
                                                   ifelse(shots_dataset2$h_team %in% liga | shots_dataset2$a_team %in% liga, "Liga", "Russian PL")))))

subset_penalty <- subset(shots_dataset2, situation == "Penalty")

subset_openplay <- subset(shots_dataset2, situation == "OpenPlay")

subset_setpiece <- subset(shots_dataset2, situation %in% c("SetPiece", "FromCorner", "DirectFreekick"))

kable(head(subset_openplay, 3), "html") %>%
  kable_styling(full_width = T)

	player	id	minute	X	Y	xG Understat	h_a	player_id	situation	season	shotType	match_id	h_team	a_team	h_goals	a_goals	date	player_assisted	lastAction	Preffered_Foot	lega
1	Aaron Connolly	427269	51	0.913	0.536	0.5629963	h	7991	OpenPlay	2021	LeftFoot	16391	Brighton	Watford	2	0	2021-08-21 16:30:00		None	RightFoot	Premier League
3	Aaron Connolly	338614	91	0.797	0.377	0.0308471	h	7991	OpenPlay	2019	RightFoot	11815	Brighton	Sheffield United	0	1	2019-12-21 15:00:00		None	RightFoot	Premier League
4	Aaron Connolly	403235	49	0.847	0.382	0.1247877	a	7991	OpenPlay	2020	RightFoot	14656	Burnley	Brighton	1	1	2021-02-06 15:00:00	Yves Bissouma	Tackle	RightFoot	Premier League

openplay_foot <- subset(subset_openplay, shotType %in% c("LeftFoot", "RightFoot"))
openplay_head <- subset(subset_openplay, shotType == "Head")

setpiece_foot <- subset(subset_setpiece, shotType %in% c("LeftFoot", "RightFoot"))
setpiece_head <- subset(subset_setpiece, shotType == "Head")

tbl_summary(openplay_foot, include = c(minute, result, X, Y, shotType, lastAction))

Characteristic	N = 288,887¹
minute	49 (26, 71)
result
0	260,327 (90%)
1	28,560 (9.9%)
X	0.85 (0.77, 0.89)
Y	0.50 (0.39, 0.62)
shotType
LeftFoot	106,100 (37%)
RightFoot	182,787 (63%)
lastAction
Aerial	3,641 (1.3%)
BallRecovery	9,647 (3.3%)
BallTouch	5,416 (1.9%)
BlockedPass	472 (0.2%)
Card	27 (<0.1%)
Challenge	29 (<0.1%)
ChanceMissed	2 (<0.1%)
Chipped	15,477 (5.4%)
Clearance	47 (<0.1%)
CornerAwarded	197 (<0.1%)
Cross	16,612 (5.8%)
Dispossessed	2,383 (0.8%)
End	104 (<0.1%)
Error	3 (<0.1%)
FormationChange	12 (<0.1%)
Foul	272 (<0.1%)
Goal	143 (<0.1%)
GoodSkill	69 (<0.1%)
HeadPass	6,590 (2.3%)
Interception	526 (0.2%)
KeeperPickup	7 (<0.1%)
KeeperSweeper	1 (<0.1%)
LayOff	3,836 (1.3%)
None	32,994 (11%)
OffsidePass	81 (<0.1%)
OffsideProvoked	7 (<0.1%)
Pass	148,079 (51%)
Punch	1 (<0.1%)
Rebound	11,636 (4.0%)
Save	13 (<0.1%)
ShieldBallOpp	2 (<0.1%)
Smother	1 (<0.1%)
Start	10 (<0.1%)
SubstitutionOn	36 (<0.1%)
Tackle	969 (0.3%)
TakeOn	21,584 (7.5%)
Throughball	7,961 (2.8%)
¹ Median (Q1, Q3); n (%)

tbl_summary(openplay_head, include = c(minute, result, X, Y, shotType, lastAction))

Characteristic	N = 30,855¹
minute	49 (26, 73)
result
0	27,181 (88%)
1	3,674 (12%)
X	0.911 (0.890, 0.929)
Y	0.50 (0.46, 0.55)
shotType
Head	30,855 (100%)
lastAction
Aerial	9,053 (29%)
BallRecovery	8 (<0.1%)
BallTouch	65 (0.2%)
BlockedPass	7 (<0.1%)
Card	6 (<0.1%)
Challenge	10 (<0.1%)
Chipped	2,743 (8.9%)
Clearance	2 (<0.1%)
CornerAwarded	59 (0.2%)
Cross	16,320 (53%)
Dispossessed	3 (<0.1%)
End	37 (0.1%)
Error	2 (<0.1%)
FormationChange	1 (<0.1%)
Foul	67 (0.2%)
Goal	26 (<0.1%)
HeadPass	392 (1.3%)
Interception	7 (<0.1%)
KeeperPickup	3 (<0.1%)
LayOff	6 (<0.1%)
None	846 (2.7%)
OffsidePass	7 (<0.1%)
Pass	315 (1.0%)
Rebound	735 (2.4%)
Save	1 (<0.1%)
SubstitutionOff	1 (<0.1%)
SubstitutionOn	11 (<0.1%)
Tackle	12 (<0.1%)
TakeOn	21 (<0.1%)
Throughball	89 (0.3%)
¹ Median (Q1, Q3); n (%)

tbl_summary(setpiece_foot, include = c(minute, result, X, Y, shotType, lastAction))

Characteristic	N = 69,475¹
minute	49 (26, 71)
result
0	64,065 (92%)
1	5,410 (7.8%)
X	0.80 (0.75, 0.89)
Y	0.51 (0.41, 0.61)
shotType
LeftFoot	24,145 (35%)
RightFoot	45,330 (65%)
lastAction
Aerial	3,397 (4.9%)
BallRecovery	65 (<0.1%)
BallTouch	1,316 (1.9%)
BlockedPass	24 (<0.1%)
Card	14 (<0.1%)
Challenge	9 (<0.1%)
Chipped	1,528 (2.2%)
Clearance	5 (<0.1%)
CornerAwarded	154 (0.2%)
Cross	5,988 (8.6%)
Dispossessed	118 (0.2%)
End	103 (0.1%)
Error	2 (<0.1%)
FormationChange	3 (<0.1%)
Foul	139 (0.2%)
Goal	55 (<0.1%)
GoodSkill	3 (<0.1%)
HeadPass	2,736 (3.9%)
Interception	47 (<0.1%)
KeeperPickup	4 (<0.1%)
KeeperSweeper	1 (<0.1%)
LayOff	182 (0.3%)
None	16,313 (23%)
OffsidePass	9 (<0.1%)
OffsideProvoked	1 (<0.1%)
Pass	9,535 (14%)
Rebound	5,585 (8.0%)
Save	5 (<0.1%)
Standard	20,936 (30%)
Start	1 (<0.1%)
SubstitutionOn	7 (<0.1%)
Tackle	93 (0.1%)
TakeOn	987 (1.4%)
Throughball	110 (0.2%)
¹ Median (Q1, Q3); n (%)

tbl_summary(setpiece_head, include = c(minute, result, X, Y, shotType, lastAction))

Characteristic	N = 45,833¹
minute	49 (26, 71)
result
0	41,601 (91%)
1	4,232 (9.2%)
X	0.917 (0.899, 0.934)
Y	0.50 (0.46, 0.55)
shotType
Head	45,833 (100%)
lastAction
Aerial	16,390 (36%)
BallRecovery	13 (<0.1%)
BallTouch	157 (0.3%)
BlockedPass	42 (<0.1%)
Card	23 (<0.1%)
Challenge	16 (<0.1%)
Chipped	2,327 (5.1%)
Clearance	9 (<0.1%)
CornerAwarded	146 (0.3%)
Cross	23,526 (51%)
Dispossessed	16 (<0.1%)
End	98 (0.2%)
Error	1 (<0.1%)
FormationChange	2 (<0.1%)
Foul	160 (0.3%)
Goal	93 (0.2%)
HeadPass	918 (2.0%)
Interception	16 (<0.1%)
KeeperPickup	4 (<0.1%)
LayOff	6 (<0.1%)
None	894 (2.0%)
OffsidePass	35 (<0.1%)
OffsideProvoked	2 (<0.1%)
Pass	162 (0.4%)
Rebound	710 (1.5%)
Save	2 (<0.1%)
ShieldBallOpp	3 (<0.1%)
Standard	2 (<0.1%)
Start	1 (<0.1%)
SubstitutionOn	9 (<0.1%)
Tackle	20 (<0.1%)
TakeOn	25 (<0.1%)
Throughball	5 (<0.1%)
¹ Median (Q1, Q3); n (%)

Removing last action <10

openplay_foot <- subset(openplay_foot, !lastAction %in% c("ChanceMissed", "Error", "KeeperPickup", "KeeperSweeper", "OffsideProvoked", "Punch", "ShieldBallOpp", "Smother"))
openplay_head <- subset(openplay_head, !lastAction %in% c("BallRecovery", "BlockedPass", "Card", "Clearance", "Dispossessed", "Error", "FormationChange", "Interception", "KeeperPickup", "LayOff", "OffsidePass", "Save", "SubstiotutionOff"))
setpiece_foot <- subset(setpiece_foot, !lastAction %in% c("Challenge", "Clearance", "Error", "FormationChange", "KeeperPickup", "KeeperSweeper", "OffsidePass", "OffsideProvoked", "Save", "Start", "SubstiotutionOn"))
setpiece_head <- subset(setpiece_head, !lastAction %in% c("Clearance", "Error", "FormationChange", "KeeperPickup", "LayOff", "OffsideProvoked", "Save", "ShieldBallOpp", "Standard", "Start", "SubstiotutionOn", "Throughball"))

Introduction of is_weakfoot variable

openplay_foot$is_weakfoot <- ifelse(openplay_foot$Preffered_Foot == openplay_foot$shotType, "No", ifelse(openplay_foot$Preffered_Foot == "both", "No", "Yes"))
setpiece_foot$is_weakfoot <- ifelse(setpiece_foot$Preffered_Foot == setpiece_foot$shotType, "No", ifelse(setpiece_foot$Preffered_Foot == "both", "No", "Yes"))

understat_xG_op_foot <- openplay_foot$xG
understat_xG_op_head <- openplay_head$xG
understat_xG_sp_foot <- setpiece_foot$xG
understat_xG_sp_head <- setpiece_head$xG

kable(head(openplay_foot, 3), "html") %>%
  kable_styling(full_width = T)

	player	id	minute	X	Y	xG Understat	h_a	player_id	situation	season	shotType	match_id	h_team	a_team	h_goals	a_goals	date	player_assisted	lastAction	Preffered_Foot	lega	is_weakfoot
1	Aaron Connolly	427269	51	0.913	0.536	0.5629963	h	7991	OpenPlay	2021	LeftFoot	16391	Brighton	Watford	2	0	2021-08-21 16:30:00		None	RightFoot	Premier League	Yes
3	Aaron Connolly	338614	91	0.797	0.377	0.0308471	h	7991	OpenPlay	2019	RightFoot	11815	Brighton	Sheffield United	0	1	2019-12-21 15:00:00		None	RightFoot	Premier League	No
4	Aaron Connolly	403235	49	0.847	0.382	0.1247877	a	7991	OpenPlay	2020	RightFoot	14656	Burnley	Brighton	1	1	2021-02-06 15:00:00	Yves Bissouma	Tackle	RightFoot	Premier League	No

Data Modeling

Key

For better interpretation, this is how I refer to the different models:

For shot type, the numbering is as follows:

Open Play Foot
Open Play Head
Set Piece Foot
Set Piece Head

For modeling, the numbering is as follows: 1. Logit 2. Logit with interactions 3. Discriminant Analysis 4. Random Forest 5. Bagging 8. Neural Network

For example model1.5 is bagging model for Open Play Foot

Note: For brevity, the stepwise procedure is not shown in this report. Where there is no model with interactions, it is because there is no significant improvement that would justify such a model, which is very costly in terms of computational time.

Open Play Foot Model

set.seed(123)
index_opf <- sample.split(Y = openplay_foot$result, SplitRatio = 0.75)
train_opf <- openplay_foot[index_opf, ]

indice_test_opf <-  which(index_opf == FALSE)
test_opf <- openplay_foot[indice_test_opf, ]

Logit

modello1.1 <- glm(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, 
                                         data = train_opf, family = binomial)
summary(modello1.1)

## 
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
##     exp(Y^2) + h_a, family = binomial, data = train_opf)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.327e+01  1.594e-01 -83.287  < 2e-16 ***
## minute                     1.444e-03  2.824e-04   5.113 3.16e-07 ***
## lastActionBallRecovery     8.943e-01  1.114e-01   8.029 9.85e-16 ***
## lastActionBallTouch        4.734e-01  1.169e-01   4.049 5.13e-05 ***
## lastActionBlockedPass      6.401e-01  2.185e-01   2.929 0.003400 ** 
## lastActionCard             1.393e-01  7.542e-01   0.185 0.853428    
## lastActionChallenge        4.390e-01  1.049e+00   0.418 0.675667    
## lastActionChipped          3.860e-01  1.055e-01   3.660 0.000253 ***
## lastActionClearance        1.148e-01  7.561e-01   0.152 0.879348    
## lastActionCornerAwarded   -2.692e+00  1.010e+00  -2.666 0.007675 ** 
## lastActionCross            4.258e-01  1.040e-01   4.095 4.23e-05 ***
## lastActionDispossessed     5.274e-01  1.384e-01   3.811 0.000139 ***
## lastActionEnd             -2.009e+00  1.015e+00  -1.978 0.047884 *  
## lastActionFormationChange  8.962e-01  1.093e+00   0.820 0.412368    
## lastActionFoul             3.885e-03  2.842e-01   0.014 0.989093    
## lastActionGoal            -1.739e+00  7.240e-01  -2.402 0.016307 *  
## lastActionGoodSkill        1.241e+00  4.443e-01   2.793 0.005219 ** 
## lastActionHeadPass         4.361e-01  1.135e-01   3.842 0.000122 ***
## lastActionInterception     7.496e-01  2.350e-01   3.190 0.001422 ** 
## lastActionLayOff           6.903e-01  1.306e-01   5.285 1.26e-07 ***
## lastActionNone             5.927e-01  1.031e-01   5.747 9.08e-09 ***
## lastActionOffsidePass     -2.190e-02  5.418e-01  -0.040 0.967758    
## lastActionPass             6.951e-01  1.010e-01   6.884 5.83e-12 ***
## lastActionRebound          1.110e+00  1.042e-01  10.652  < 2e-16 ***
## lastActionSave            -7.818e+00  3.653e+01  -0.214 0.830549    
## lastActionStart            2.035e+00  1.197e+00   1.699 0.089265 .  
## lastActionSubstitutionOn   1.017e+00  5.857e-01   1.736 0.082476 .  
## lastActionTackle           7.795e-01  1.732e-01   4.501 6.76e-06 ***
## lastActionTakeOn           7.280e-01  1.039e-01   7.009 2.40e-12 ***
## lastActionThroughball      1.488e+00  1.047e-01  14.216  < 2e-16 ***
## is_weakfootYes            -1.602e-01  1.742e-02  -9.194  < 2e-16 ***
## X                          1.291e+01  1.289e-01 100.099  < 2e-16 ***
## exp(Y^2)                  -5.834e-01  4.082e-02 -14.294  < 2e-16 ***
## h_ah                       2.449e-02  1.511e-02   1.621 0.105116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 139769  on 216646  degrees of freedom
## Residual deviance: 123770  on 216613  degrees of freedom
## AIC: 123838
## 
## Number of Fisher Scoring iterations: 9

predict1.1 <- predict.glm(modello1.1, newdata = test_opf, type = "response")

roc1.1 <- roc(test_opf$result ~ predict1.1, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc1.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

confusione1.1 <- confusionMatrix(as.factor(ifelse(predict1.1>0.3, 1, 0)), as.factor(test_opf$result)) 
confusione1.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 63353  6307
##          1  1723   833
##                                           
##                Accuracy : 0.8888          
##                  95% CI : (0.8865, 0.8911)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1263          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9735          
##             Specificity : 0.1167          
##          Pos Pred Value : 0.9095          
##          Neg Pred Value : 0.3259          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8773          
##    Detection Prevalence : 0.9646          
##       Balanced Accuracy : 0.5451          
##                                           
##        'Positive' Class : 0               
##

rms::val.prob(as.numeric(as.character(predict1.1)), as.numeric(as.character(test_opf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.937122e-01  7.468561e-01  1.422587e-01  7.001681e-02  5.057334e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  3.125414e-05  4.257049e+00  1.190128e-01  6.998556e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.249545e-02 -5.805545e-02  9.688800e-01  1.046446e-01  5.777724e-03 
##          Eavg           S:z           S:p 
##  2.633787e-03  9.329611e-01  3.508401e-01

Logit with interactions

modello1.2 <- glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
                    exp(Y^2) + h_a + lastAction:X + is_weakfoot:exp(Y^2) + X:exp(Y^2) + 
                    is_weakfoot:X + exp(Y^2):h_a + minute:lastAction + minute:h_a + 
                    minute:exp(Y^2) + lastAction:h_a, family = binomial, data = train_opf)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(modello1.2)

## 
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
##     exp(Y^2) + h_a + lastAction:X + is_weakfoot:exp(Y^2) + X:exp(Y^2) + 
##     is_weakfoot:X + exp(Y^2):h_a + minute:lastAction + minute:h_a + 
##     minute:exp(Y^2) + lastAction:h_a, family = binomial, data = train_opf)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -1.493e+01  1.521e+00  -9.817  < 2e-16 ***
## minute                            1.603e-02  4.342e-03   3.692 0.000222 ***
## lastActionBallRecovery            7.784e-01  1.427e+00   0.545 0.585464    
## lastActionBallTouch              -1.297e+00  1.591e+00  -0.815 0.414990    
## lastActionBlockedPass            -3.624e+00  3.283e+00  -1.104 0.269698    
## lastActionCard                    1.157e+01  1.284e+01   0.901 0.367548    
## lastActionChallenge              -5.879e+02  1.246e+04  -0.047 0.962374    
## lastActionChipped                 3.317e+00  1.405e+00   2.361 0.018243 *  
## lastActionClearance              -5.464e+00  1.576e+03  -0.003 0.997233    
## lastActionCornerAwarded           1.727e+02  2.529e+03   0.068 0.945571    
## lastActionCross                  -5.733e+00  1.488e+00  -3.854 0.000116 ***
## lastActionDispossessed           -1.104e+00  1.880e+00  -0.587 0.556960    
## lastActionEnd                     5.735e+02  3.299e+03   0.174 0.861998    
## lastActionFormationChange         2.008e+02  1.270e+05   0.002 0.998739    
## lastActionFoul                    7.324e+00  3.945e+00   1.857 0.063370 .  
## lastActionGoal                    1.089e+01  8.492e+02   0.013 0.989772    
## lastActionGoodSkill               1.025e+00  5.205e+00   0.197 0.843953    
## lastActionHeadPass               -1.060e+00  1.551e+00  -0.683 0.494455    
## lastActionInterception           -4.864e+00  3.025e+00  -1.608 0.107889    
## lastActionLayOff                 -9.187e-01  1.790e+00  -0.513 0.607762    
## lastActionNone                   -1.188e+00  1.349e+00  -0.881 0.378568    
## lastActionOffsidePass             4.325e+00  7.721e+00   0.560 0.575313    
## lastActionPass                   -1.781e+00  1.319e+00  -1.351 0.176837    
## lastActionRebound                -5.337e+00  1.421e+00  -3.755 0.000173 ***
## lastActionSave                   -5.817e+00  2.059e+04   0.000 0.999775    
## lastActionStart                  -1.121e+01  6.523e+03  -0.002 0.998629    
## lastActionSubstitutionOn         -1.680e+01  1.601e+01  -1.050 0.293930    
## lastActionTackle                 -8.704e-01  2.259e+00  -0.385 0.700041    
## lastActionTakeOn                  4.561e-01  1.361e+00   0.335 0.737519    
## lastActionThroughball             1.381e+01  1.473e+00   9.374  < 2e-16 ***
## is_weakfootYes                    1.098e-01  2.893e-01   0.379 0.704371    
## X                                 1.398e+01  1.743e+00   8.022 1.04e-15 ***
## exp(Y^2)                          2.016e+00  5.946e-01   3.390 0.000698 ***
## h_ah                             -5.780e-01  2.296e-01  -2.517 0.011825 *  
## lastActionBallRecovery:X          4.389e-01  1.648e+00   0.266 0.789939    
## lastActionBallTouch:X             2.787e+00  1.820e+00   1.531 0.125701    
## lastActionBlockedPass:X           5.876e+00  3.709e+00   1.584 0.113156    
## lastActionCard:X                 -1.265e+01  1.444e+01  -0.876 0.380987    
## lastActionChallenge:X             2.126e+02  6.053e+03   0.035 0.971982    
## lastActionChipped:X              -2.796e+00  1.609e+00  -1.738 0.082216 .  
## lastActionClearance:X            -1.139e+01  4.333e+00  -2.630 0.008541 ** 
## lastActionCornerAwarded:X        -2.254e+02  3.388e+03  -0.067 0.946963    
## lastActionCross:X                 7.177e+00  1.687e+00   4.255 2.09e-05 ***
## lastActionDispossessed:X          2.644e+00  2.156e+00   1.226 0.220024    
## lastActionEnd:X                  -8.521e+02  4.825e+03  -0.177 0.859831    
## lastActionFormationChange:X      -3.728e+02  1.731e+05  -0.002 0.998282    
## lastActionFoul:X                 -8.592e+00  4.424e+00  -1.942 0.052096 .  
## lastActionGoal:X                 -2.884e+01  1.176e+01  -2.452 0.014193 *  
## lastActionGoodSkill:X            -1.715e+00  5.784e+00  -0.297 0.766777    
## lastActionHeadPass:X              2.285e+00  1.773e+00   1.289 0.197357    
## lastActionInterception:X          6.750e+00  3.388e+00   1.992 0.046331 *  
## lastActionLayOff:X                2.191e+00  2.089e+00   1.049 0.294095    
## lastActionNone:X                  2.571e+00  1.548e+00   1.661 0.096766 .  
## lastActionOffsidePass:X          -3.654e+00  7.972e+00  -0.458 0.646686    
## lastActionPass:X                  3.227e+00  1.516e+00   2.129 0.033218 *  
## lastActionRebound:X               7.672e+00  1.619e+00   4.740 2.14e-06 ***
## lastActionSave:X                 -1.085e+01  2.480e+04   0.000 0.999651    
## lastActionStart:X                -5.294e-01  1.248e+01  -0.042 0.966174    
## lastActionSubstitutionOn:X        1.010e+01  1.490e+01   0.678 0.497963    
## lastActionTackle:X                2.135e+00  2.569e+00   0.831 0.405788    
## lastActionTakeOn:X                8.590e-01  1.562e+00   0.550 0.582282    
## lastActionThroughball:X          -1.324e+01  1.684e+00  -7.860 3.83e-15 ***
## is_weakfootYes:exp(Y^2)          -8.210e-01  9.906e-02  -8.288  < 2e-16 ***
## X:exp(Y^2)                       -2.709e+00  6.750e-01  -4.013 5.99e-05 ***
## is_weakfootYes:X                  9.331e-01  3.061e-01   3.048 0.002306 ** 
## exp(Y^2):h_ah                     2.060e-01  8.160e-02   2.525 0.011577 *  
## minute:lastActionBallRecovery    -8.250e-03  4.226e-03  -1.952 0.050886 .  
## minute:lastActionBallTouch       -1.175e-02  4.456e-03  -2.636 0.008381 ** 
## minute:lastActionBlockedPass     -1.606e-02  8.260e-03  -1.944 0.051918 .  
## minute:lastActionCard             2.070e-03  3.606e-02   0.057 0.954232    
## minute:lastActionChallenge        5.181e+00  1.032e+02   0.050 0.959962    
## minute:lastActionChipped         -8.817e-03  4.026e-03  -2.190 0.028521 *  
## minute:lastActionClearance       -1.062e-02  2.520e-02  -0.421 0.673493    
## minute:lastActionCornerAwarded   -9.022e-01  1.472e+01  -0.061 0.951122    
## minute:lastActionCross           -9.620e-03  3.972e-03  -2.422 0.015441 *  
## minute:lastActionDispossessed    -1.557e-02  5.282e-03  -2.947 0.003210 ** 
## minute:lastActionEnd              1.572e+00  1.062e+01   0.148 0.882327    
## minute:lastActionFormationChange  1.324e+00  1.670e+02   0.008 0.993674    
## minute:lastActionFoul            -1.878e-02  1.103e-02  -1.703 0.088586 .  
## minute:lastActionGoal            -3.517e-02  3.015e-02  -1.167 0.243331    
## minute:lastActionGoodSkill        3.062e-02  2.189e-02   1.399 0.161903    
## minute:lastActionHeadPass        -1.166e-02  4.309e-03  -2.705 0.006827 ** 
## minute:lastActionInterception     2.194e-03  9.279e-03   0.236 0.813085    
## minute:lastActionLayOff          -5.453e-03  4.961e-03  -1.099 0.271714    
## minute:lastActionNone            -1.066e-02  3.946e-03  -2.702 0.006897 ** 
## minute:lastActionOffsidePass     -4.063e-02  2.443e-02  -1.664 0.096210 .  
## minute:lastActionPass            -9.567e-03  3.871e-03  -2.471 0.013461 *  
## minute:lastActionRebound         -1.211e-02  3.993e-03  -3.034 0.002417 ** 
## minute:lastActionSave            -1.389e-02  7.405e+01   0.000 0.999850    
## minute:lastActionStart           -3.936e-01  1.449e+02  -0.003 0.997833    
## minute:lastActionSubstitutionOn   1.278e-01  7.682e-02   1.663 0.096248 .  
## minute:lastActionTackle          -7.866e-03  6.632e-03  -1.186 0.235583    
## minute:lastActionTakeOn          -1.117e-02  3.976e-03  -2.811 0.004943 ** 
## minute:lastActionThroughball     -1.113e-02  3.997e-03  -2.786 0.005339 ** 
## minute:h_ah                      -9.621e-04  5.706e-04  -1.686 0.091747 .  
## minute:exp(Y^2)                  -3.052e-03  1.528e-03  -1.998 0.045769 *  
## lastActionBallRecovery:h_ah       3.278e-01  2.222e-01   1.475 0.140144    
## lastActionBallTouch:h_ah          3.752e-02  2.339e-01   0.160 0.872590    
## lastActionBlockedPass:h_ah       -4.142e-02  4.531e-01  -0.091 0.927167    
## lastActionCard:h_ah              -1.666e+01  2.035e+03  -0.008 0.993468    
## lastActionChallenge:h_ah          1.521e+01  1.010e+03   0.015 0.987981    
## lastActionChipped:h_ah            2.693e-01  2.102e-01   1.281 0.200194    
## lastActionClearance:h_ah          1.664e+01  1.576e+03   0.011 0.991573    
## lastActionCornerAwarded:h_ah      1.093e+01  6.754e+02   0.016 0.987086    
## lastActionCross:h_ah              4.412e-01  2.075e-01   2.126 0.033506 *  
## lastActionDispossessed:h_ah       3.282e-01  2.773e-01   1.183 0.236643    
## lastActionEnd:h_ah               -2.914e+02  1.725e+03  -0.169 0.865815    
## lastActionFormationChange:h_ah    6.069e+01  2.276e+04   0.003 0.997872    
## lastActionFoul:h_ah               2.016e+00  8.014e-01   2.515 0.011904 *  
## lastActionGoal:h_ah               1.591e+01  8.491e+02   0.019 0.985052    
## lastActionGoodSkill:h_ah          3.968e-01  9.520e-01   0.417 0.676807    
## lastActionHeadPass:h_ah           3.384e-01  2.269e-01   1.491 0.135855    
## lastActionInterception:h_ah      -4.266e-01  4.976e-01  -0.857 0.391232    
## lastActionLayOff:h_ah             9.304e-02  2.614e-01   0.356 0.721864    
## lastActionNone:h_ah               2.946e-01  2.060e-01   1.430 0.152650    
## lastActionOffsidePass:h_ah        1.294e+00  1.343e+00   0.963 0.335473    
## lastActionPass:h_ah               4.271e-01  2.017e-01   2.118 0.034190 *  
## lastActionRebound:h_ah            3.789e-01  2.085e-01   1.817 0.069197 .  
## lastActionSave:h_ah               4.704e-01  4.439e+03   0.000 0.999915    
## lastActionStart:h_ah              1.464e+01  6.523e+03   0.002 0.998210    
## lastActionSubstitutionOn:h_ah    -1.440e+00  1.878e+00  -0.767 0.443286    
## lastActionTackle:h_ah             5.257e-01  3.483e-01   1.509 0.131209    
## lastActionTakeOn:h_ah             3.459e-01  2.073e-01   1.668 0.095298 .  
## lastActionThroughball:h_ah        4.527e-01  2.087e-01   2.169 0.030077 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 139769  on 216646  degrees of freedom
## Residual deviance: 122830  on 216523  degrees of freedom
## AIC: 123078
## 
## Number of Fisher Scoring iterations: 17

predict1.2 <- predict.glm(modello1.2, newdata = test_opf, type = "response")

confusione1.2 <- confusionMatrix(as.factor(ifelse(predict1.2>0.3, 1, 0)), as.factor(test_opf$result))
confusione1.2

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 63054  6204
##          1  2022   936
##                                           
##                Accuracy : 0.8861          
##                  95% CI : (0.8838, 0.8884)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1353          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9689          
##             Specificity : 0.1311          
##          Pos Pred Value : 0.9104          
##          Neg Pred Value : 0.3164          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8731          
##    Detection Prevalence : 0.9590          
##       Balanced Accuracy : 0.5500          
##                                           
##        'Positive' Class : 0               
##

rms::val.prob(as.numeric(as.character(predict1.2)), as.numeric(as.character(test_opf$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict1.2)),
## as.numeric(as.character(test_opf$result))): 2 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.992168e-01  7.496084e-01  1.392605e-01  6.849007e-02  4.946942e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  5.514068e-04  4.181929e+01  8.299584e-10  6.793867e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.220725e-02 -1.447628e-01  9.225944e-01  2.886326e-01  3.951237e-03 
##          Eavg           S:z           S:p 
##  2.745378e-03  1.345419e+00  1.784899e-01

roc1.2 <- roc(test_opf$result ~ predict1.2, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc1.2), 4)), lwd = 2, box.lwd = 0, bg = "white")

Linear Discriminant Analysis

set.seed(1234)

new_train_opf <- train_opf

new_train_opf$Y <- exp(new_train_opf$Y^2)

new_test_opf <- test_opf

new_test_opf$Y <- exp(new_test_opf$Y^2)

set.seed(1234)


modello1.3 <- lda(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, data = train_opf)

summary(modello1.3)

##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   66     -none- numeric  
## scaling 33     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  3     -none- list

predict1.3.1 <- predict(modello1.3, newdata = test_opf)

predict1.3 <- predict1.3.1$posterior[, 2]


confusione1.3.1 <- confusionMatrix(as.factor(predict1.3.1$class), as.factor(test_opf$result))
confusione1.3.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 64747  6980
##          1   329   160
##                                          
##                Accuracy : 0.8988         
##                  95% CI : (0.8966, 0.901)
##     No Information Rate : 0.9011         
##     P-Value [Acc > NIR] : 0.9825         
##                                          
##                   Kappa : 0.0296         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.99494        
##             Specificity : 0.02241        
##          Pos Pred Value : 0.90269        
##          Neg Pred Value : 0.32720        
##              Prevalence : 0.90113        
##          Detection Rate : 0.89657        
##    Detection Prevalence : 0.99323        
##       Balanced Accuracy : 0.50868        
##                                          
##        'Positive' Class : 0              
##

confusione1.3 <- confusionMatrix(as.factor(ifelse(predict1.3>0.3, 1, 0)), as.factor(test_opf$result))


roc1.3 <- roc(test_opf$result ~ predict1.3, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc1.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.3)), as.numeric(as.character(test_opf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.895863e-01  7.447931e-01  1.329346e-01  6.527350e-02  4.714791e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  3.116984e-04  2.450961e+01  4.762170e-06  6.496181e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.333969e-02 -1.391553e-01  9.295095e-01  2.800603e-01  2.064602e-02 
##          Eavg           S:z           S:p 
##  1.156119e-02  3.082489e+00  2.052772e-03

Random Forest

set.seed(1234)

modello1.4 <- ranger(train_opf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_opf, num.trees = 500, mtry = 3, seed = 42, probability = T)

train_predict_1.4.1 <- predict(modello1.4, data = new_train_opf)
train_predict1.4 <- train_predict_1.4.1$predictions[, 2]

predict1.4.1 <- predict(modello1.4, data = new_test_opf)

predict1.4 <- predict1.4.1$predictions[, 2]

confusione1.4 <- confusionMatrix(as.factor(ifelse(predict1.4 >0.3, 1, 0)), as.factor(test_opf$result))
confusione1.4

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 61892  3957
##          1  3184  3183
##                                           
##                Accuracy : 0.9011          
##                  95% CI : (0.8989, 0.9033)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 0.5081          
##                                           
##                   Kappa : 0.417           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9511          
##             Specificity : 0.4458          
##          Pos Pred Value : 0.9399          
##          Neg Pred Value : 0.4999          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8570          
##    Detection Prevalence : 0.9118          
##       Balanced Accuracy : 0.6984          
##                                           
##        'Positive' Class : 0               
##

roc1.4 <- roc(test_opf$result ~ as.numeric(predict1.4), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc1.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.4)), as.numeric(as.character(new_test_opf$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict1.4)),
## as.numeric(as.character(new_test_opf$result))): 929 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.706364e-01  8.353182e-01  3.232237e-01  1.679441e-01  1.197323e+04 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  6.651975e-02 -2.583708e-01  8.922615e-01  4.151460e-02  3.601427e-02 
##          Eavg           S:z           S:p 
##  1.228770e-02 -3.416589e+00  6.341090e-04

Bagging

set.seed(1234)

modello1.5 <- ranger(train_opf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_opf, num.trees = 500, seed = 42, probability = T)

predict1.5.1 <- predict(modello1.5, data = new_test_opf)

predict1.5 <- predict1.5.1$predictions[, 2]

confusione1.5 <- confusionMatrix(as.factor(ifelse(predict1.5>0.3, 1, 0)), as.factor(new_test_opf$result))
confusione1.5

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 62474  4157
##          1  2602  2983
##                                           
##                Accuracy : 0.9064          
##                  95% CI : (0.9043, 0.9085)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 8.749e-07       
##                                           
##                   Kappa : 0.4184          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9600          
##             Specificity : 0.4178          
##          Pos Pred Value : 0.9376          
##          Neg Pred Value : 0.5341          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8651          
##    Detection Prevalence : 0.9227          
##       Balanced Accuracy : 0.6889          
##                                           
##        'Positive' Class : 0               
##

roc1.5 <- roc(new_test_opf$result ~ as.numeric(predict1.5), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc1.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.5)), as.numeric(as.character(new_test_opf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.784610e-01  8.392305e-01  3.261361e-01  1.684718e-01  1.216736e+04 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  9.957298e-04  7.390762e+01  1.110223e-16  1.674760e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  6.710256e-02  1.193215e-01  1.096641e+00  1.542791e-01  1.849012e-02 
##          Eavg           S:z           S:p 
##  1.020388e-02 -8.226908e+00  1.921075e-16

Neural Network

modello1.8 <- nnet(result ~ minute + lastAction + is_weakfoot + X + Y + h_a, data = new_train_opf, size = 10, maxit = 1000, linout = FALSE)

## # weights:  351
## initial  value 118545.093917 
## iter  10 value 69981.009756
## iter  20 value 69192.592115
## iter  30 value 67888.711525
## iter  40 value 63628.809594
## iter  50 value 62525.075031
## iter  60 value 62057.467967
## iter  70 value 61604.685356
## iter  80 value 61532.876008
## iter  90 value 61487.160400
## iter 100 value 61233.657946
## iter 110 value 60338.128851
## iter 120 value 59616.426996
## iter 130 value 58733.031793
## iter 140 value 58349.312834
## iter 150 value 58148.958166
## iter 160 value 57935.848822
## iter 170 value 57756.338123
## iter 180 value 57639.865936
## iter 190 value 57573.532413
## iter 200 value 57512.936093
## iter 210 value 57426.532572
## iter 220 value 57327.745162
## iter 230 value 57217.958189
## iter 240 value 57172.723167
## iter 250 value 57148.220125
## iter 260 value 57133.696190
## iter 270 value 57116.527271
## iter 280 value 57102.667221
## iter 290 value 57084.939379
## iter 300 value 57056.657224
## iter 310 value 57048.010413
## iter 320 value 57038.515424
## iter 330 value 57021.976784
## iter 340 value 57009.160524
## iter 350 value 57001.456556
## iter 360 value 56993.776476
## iter 370 value 56988.466171
## iter 380 value 56985.363909
## iter 390 value 56984.264328
## iter 400 value 56983.744595
## iter 410 value 56983.261082
## iter 420 value 56982.689945
## iter 430 value 56981.621792
## iter 440 value 56979.853764
## iter 450 value 56977.893636
## iter 460 value 56975.779385
## iter 470 value 56973.815073
## iter 480 value 56972.577633
## iter 490 value 56971.942203
## iter 500 value 56971.328528
## iter 510 value 56969.780842
## iter 520 value 56969.663506
## iter 530 value 56969.309645
## iter 540 value 56968.354652
## iter 550 value 56967.126957
## iter 560 value 56964.815551
## iter 570 value 56961.936920
## iter 580 value 56957.827947
## iter 590 value 56955.201372
## iter 600 value 56954.353889
## iter 610 value 56953.875346
## iter 620 value 56953.530963
## iter 630 value 56953.236351
## iter 640 value 56953.022966
## iter 650 value 56952.857124
## iter 660 value 56952.730886
## iter 670 value 56952.602141
## iter 680 value 56952.471966
## iter 690 value 56952.311041
## iter 700 value 56952.158581
## iter 710 value 56951.968581
## iter 720 value 56951.865165
## iter 730 value 56951.791552
## iter 740 value 56951.713886
## iter 750 value 56951.575373
## iter 760 value 56951.479369
## iter 770 value 56951.335809
## iter 780 value 56951.186863
## iter 790 value 56951.084841
## iter 800 value 56950.981560
## iter 810 value 56950.861977
## iter 820 value 56950.727533
## iter 830 value 56950.537039
## iter 840 value 56950.345343
## iter 850 value 56950.119654
## iter 860 value 56949.843306
## iter 870 value 56949.747925
## iter 880 value 56949.637210
## iter 890 value 56949.555767
## iter 900 value 56949.517758
## iter 910 value 56949.381139
## iter 920 value 56949.296510
## iter 930 value 56949.210179
## iter 940 value 56949.124307
## iter 950 value 56948.983340
## iter 960 value 56948.841103
## iter 970 value 56948.594921
## iter 980 value 56948.099597
## iter 990 value 56947.901399
## iter1000 value 56947.590591
## final  value 56947.590591 
## stopped after 1000 iterations

predict1.8 <- predict(modello1.8, newdata = new_test_opf)

confusione1.8 <- confusionMatrix(as.factor(ifelse(predict1.8>0.3, 1, 0)), new_test_opf$result)
confusione1.8

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 61956  4927
##          1  3120  2213
##                                           
##                Accuracy : 0.8886          
##                  95% CI : (0.8863, 0.8909)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2953          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9521          
##             Specificity : 0.3099          
##          Pos Pred Value : 0.9263          
##          Neg Pred Value : 0.4150          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8579          
##    Detection Prevalence : 0.9262          
##       Balanced Accuracy : 0.6310          
##                                           
##        'Positive' Class : 0               
##

roc1.8 <- roc(new_test_opf$result ~ as.numeric(predict1.8), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc1.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict1.8)), as.numeric(as.character(new_test_opf$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict1.8)),
## as.numeric(as.character(new_test_opf$result))): 315 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.834815e-01  7.917408e-01  2.184253e-01  1.097806e-01  7.894334e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  7.652105e-02 -7.947524e-02  9.565840e-01  4.063039e-02  7.167874e-03 
##          Eavg           S:z           S:p 
##  3.731212e-03  1.660904e+00  9.673279e-02

Understat Result

confusione1.us <- confusionMatrix(as.factor(ifelse(test_opf$`xG Understat`>0.3, 1, 0)), test_opf$result)
confusione1.us

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 60545  3589
##          1  4531  3551
##                                           
##                Accuracy : 0.8876          
##                  95% CI : (0.8852, 0.8899)
##     No Information Rate : 0.9011          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.404           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9304          
##             Specificity : 0.4973          
##          Pos Pred Value : 0.9440          
##          Neg Pred Value : 0.4394          
##              Prevalence : 0.9011          
##          Detection Rate : 0.8384          
##    Detection Prevalence : 0.8881          
##       Balanced Accuracy : 0.7139          
##                                           
##        'Positive' Class : 0               
##

roc1.us <- roc(test_opf$result ~ as.numeric(test_opf$`xG Understat`), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc1.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_opf$`xG Understat`)), as.numeric(as.character(test_opf$result)))

## Warning in rms::val.prob(as.numeric(as.character(test_opf$`xG Understat`)), : 6
## observations deleted from logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.294039e-01  8.147019e-01  2.776084e-01  1.415280e-01  1.022074e+04 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  7.128048e-02 -1.240680e-01  9.296240e-01  8.520251e-02  8.860956e-03 
##          Eavg           S:z           S:p 
##  5.630629e-03  3.822074e+00  1.323339e-04

Open Play Head

set.seed(123)
index_oph <- sample.split(Y = openplay_head$result, SplitRatio = 0.75)
train_oph <- openplay_head[index_oph, ]

indice_test_oph <-  which(index_oph == FALSE)
test_oph <- openplay_head[indice_test_oph, ]

Logit

modello2.1 <- glm(result ~ minute + lastAction + X + exp(Y^2) + h_a, 
                  data = train_oph, family = binomial)

summary2.1 <- summary(modello2.1)
summary2.1

## 
## Call:
## glm(formula = result ~ minute + lastAction + X + exp(Y^2) + h_a, 
##     family = binomial, data = train_oph)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -2.680e+01  7.831e-01 -34.219  < 2e-16 ***
## minute                    -9.752e-04  7.921e-04  -1.231  0.21826    
## lastActionBallTouch       -2.660e-01  6.064e-01  -0.439  0.66098    
## lastActionChallenge       -1.277e+01  6.023e+02  -0.021  0.98309    
## lastActionChipped          9.195e-01  8.387e-02  10.964  < 2e-16 ***
## lastActionCornerAwarded   -1.305e+01  2.282e+02  -0.057  0.95440    
## lastActionCross            7.696e-01  5.820e-02  13.222  < 2e-16 ***
## lastActionEnd             -1.284e+01  2.877e+02  -0.045  0.96441    
## lastActionFoul            -1.302e+01  1.854e+02  -0.070  0.94400    
## lastActionGoal            -1.338e+01  3.162e+02  -0.042  0.96626    
## lastActionHeadPass         9.408e-01  1.567e-01   6.002 1.95e-09 ***
## lastActionNone             5.923e-01  1.292e-01   4.585 4.54e-06 ***
## lastActionPass             7.710e-01  1.802e-01   4.279 1.88e-05 ***
## lastActionRebound          1.274e+00  1.125e-01  11.322  < 2e-16 ***
## lastActionSubstitutionOff  1.729e+01  1.455e+03   0.012  0.99052    
## lastActionSubstitutionOn  -4.954e-01  1.063e+00  -0.466  0.64106    
## lastActionTackle          -1.282e+01  4.519e+02  -0.028  0.97737    
## lastActionTakeOn           1.697e+00  6.706e-01   2.530  0.01141 *  
## lastActionThroughball      2.603e+00  2.779e-01   9.368  < 2e-16 ***
## X                          2.731e+01  7.882e-01  34.656  < 2e-16 ***
## exp(Y^2)                  -7.093e-01  2.478e-01  -2.862  0.00421 ** 
## h_ah                       5.532e-02  4.326e-02   1.279  0.20097    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16885  on 23101  degrees of freedom
## Residual deviance: 15006  on 23080  degrees of freedom
## AIC: 15050
## 
## Number of Fisher Scoring iterations: 14

predict2.1 <- predict.glm(modello2.1, newdata = test_oph, type = "response")

roc2.1 <- roc(test_oph$result ~ predict2.1, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc2.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

confusione2.1 <- confusionMatrix(as.factor(ifelse(predict2.1>0.3, 1, 0)), as.factor(test_oph$result)) 
confusione2.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6430  743
##          1  352  175
##                                           
##                Accuracy : 0.8578          
##                  95% CI : (0.8498, 0.8655)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.17            
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9481          
##             Specificity : 0.1906          
##          Pos Pred Value : 0.8964          
##          Neg Pred Value : 0.3321          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8351          
##    Detection Prevalence : 0.9316          
##       Balanced Accuracy : 0.5694          
##                                           
##        'Positive' Class : 0               
##

rms::val.prob(as.numeric(as.character(predict2.1)), as.numeric(as.character(test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.741098e-01  7.370549e-01  1.357465e-01  7.284670e-02  5.619196e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  2.890682e-05  2.222582e+00  3.291337e-01  7.281779e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  9.674780e-02 -1.001238e-01  9.372446e-01  1.817344e-01  1.510059e-02 
##          Eavg           S:z           S:p 
##  6.637031e-03  8.586321e-01  3.905435e-01

Linear Discriminant Analysis

set.seed(1234)

new_train_oph <- train_oph

new_train_oph$Y <- exp(new_train_oph$Y^2)

new_test_oph <- test_oph

new_test_oph$Y <- exp(new_test_oph$Y^2)

set.seed(1234)


modello2.3 <- lda(result ~ minute + lastAction + X + exp(Y^2) + h_a, data = train_oph)

summary(modello2.3)

##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   42     -none- numeric  
## scaling 21     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  2     -none- list

predict2.3.1 <- predict(modello2.3, newdata = test_oph)

predict2.3 <- predict2.3.1$posterior[, 2]

confusione2.3.1 <- confusionMatrix(as.factor(predict2.3.1$class), as.factor(test_oph$result))
confusione2.3.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6735  885
##          1   47   33
##                                           
##                Accuracy : 0.879           
##                  95% CI : (0.8715, 0.8862)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 0.6961          
##                                           
##                   Kappa : 0.0479          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99307         
##             Specificity : 0.03595         
##          Pos Pred Value : 0.88386         
##          Neg Pred Value : 0.41250         
##              Prevalence : 0.88078         
##          Detection Rate : 0.87468         
##    Detection Prevalence : 0.98961         
##       Balanced Accuracy : 0.51451         
##                                           
##        'Positive' Class : 0               
##

confusione2.3 <- confusionMatrix(as.factor(ifelse(predict2.3>0.3, 1, 0)), as.factor(test_oph$result))


roc2.3 <- roc(test_oph$result ~ predict2.3, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc2.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.3)), as.numeric(as.character(test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.699917e-01  7.349958e-01  1.318688e-01  7.068646e-02  5.452857e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  4.950705e-04  5.812043e+00  5.469290e-02  7.019139e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  9.774986e-02 -1.619795e-01  9.020178e-01  3.755251e-01  2.307149e-02 
##          Eavg           S:z           S:p 
##  1.214045e-02  1.746477e+00  8.072811e-02

Random Forest

set.seed(1234)

modello2.4 <- ranger(train_oph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_oph, num.trees = 500, mtry = 3, seed = 42, probability = T)

predict2.4.1 <- predict(modello2.4, data = new_test_oph)

predict2.4 <- predict2.4.1$predictions[, 2]

confusione2.4 <- confusionMatrix(as.factor(ifelse(predict2.4 >0.3, 1, 0)), as.factor(test_oph$result))
confusione2.4

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6272  522
##          1  510  396
##                                           
##                Accuracy : 0.866           
##                  95% CI : (0.8582, 0.8735)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1.000           
##                                           
##                   Kappa : 0.3582          
##                                           
##  Mcnemar's Test P-Value : 0.732           
##                                           
##             Sensitivity : 0.9248          
##             Specificity : 0.4314          
##          Pos Pred Value : 0.9232          
##          Neg Pred Value : 0.4371          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8145          
##    Detection Prevalence : 0.8823          
##       Balanced Accuracy : 0.6781          
##                                           
##        'Positive' Class : 0               
##

roc2.4 <- roc(test_oph$result ~ as.numeric(predict2.4), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc2.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.4)), as.numeric(as.character(new_test_oph$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict2.4)),
## as.numeric(as.character(new_test_oph$result))): 245 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.957707e-01  7.978854e-01  2.638290e-01  1.492232e-01  1.113459e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  7.485408e-03  5.780371e+01  2.805534e-13  1.417378e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  8.376312e-02 -4.099566e-01  7.751907e-01  9.897154e-02  5.407294e-02 
##          Eavg           S:z           S:p 
##  2.153259e-02  1.694824e+00  9.010883e-02

Bagging

set.seed(1234)

modello2.5 <- ranger(train_oph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_oph, num.trees = 500, seed = 42, probability = T)

predict2.5.1 <- predict(modello2.5, data = new_test_oph)

predict2.5 <- predict2.5.1$predictions[, 2]

confusione2.5 <- confusionMatrix(as.factor(ifelse(predict2.5>0.3, 1, 0)), as.factor(new_test_oph$result))
confusione2.5

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6332  530
##          1  450  388
##                                           
##                Accuracy : 0.8727          
##                  95% CI : (0.8651, 0.8801)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 0.98542         
##                                           
##                   Kappa : 0.3703          
##                                           
##  Mcnemar's Test P-Value : 0.01162         
##                                           
##             Sensitivity : 0.9336          
##             Specificity : 0.4227          
##          Pos Pred Value : 0.9228          
##          Neg Pred Value : 0.4630          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8223          
##    Detection Prevalence : 0.8912          
##       Balanced Accuracy : 0.6782          
##                                           
##        'Positive' Class : 0               
##

roc2.5 <- roc(new_test_oph$result ~ as.numeric(predict2.5), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc2.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.5)), as.numeric(as.character(new_test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.244548e-01  8.122274e-01  2.908950e-01  1.633468e-01  1.258770e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  1.468163e-03  1.330485e+01  1.290887e-03  1.618786e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  8.252587e-02 -2.167724e-01  8.903037e-01  6.141308e-02  3.284286e-02 
##          Eavg           S:z           S:p 
##  1.285068e-02  3.637999e-01  7.160074e-01

Neural Network

modello2.8 <- nnet(result ~ minute + lastAction + X + Y + h_a, data = new_train_oph, size = 10, maxit = 1000, linout = FALSE)

## # weights:  231
## initial  value 8722.578293 
## iter  10 value 8435.748320
## iter  20 value 8279.696679
## iter  30 value 8089.025131
## iter  40 value 8065.396259
## iter  50 value 7820.389397
## iter  60 value 7620.533019
## iter  70 value 7527.182160
## iter  80 value 7347.012489
## iter  90 value 7306.470349
## iter 100 value 7270.602773
## iter 110 value 7209.485135
## iter 120 value 7149.740344
## iter 130 value 7108.479248
## iter 140 value 7089.482769
## iter 150 value 7081.089074
## iter 160 value 7075.318128
## iter 170 value 7072.006341
## iter 180 value 7065.815611
## iter 190 value 7061.494404
## iter 200 value 7059.751906
## iter 210 value 7057.881921
## iter 220 value 7056.740067
## iter 230 value 7055.132476
## iter 240 value 7052.237977
## iter 250 value 7051.064152
## iter 260 value 7050.361597
## iter 270 value 7049.824061
## iter 280 value 7049.270668
## iter 290 value 7048.685893
## iter 300 value 7047.999021
## iter 310 value 7047.421214
## iter 320 value 7047.147571
## iter 330 value 7046.797723
## iter 340 value 7046.235374
## iter 350 value 7045.552039
## iter 360 value 7043.659123
## iter 370 value 7043.059688
## iter 380 value 7042.916778
## iter 390 value 7042.807888
## iter 400 value 7042.674153
## iter 410 value 7042.430997
## iter 420 value 7042.132006
## iter 430 value 7041.922490
## iter 440 value 7041.629317
## iter 450 value 7041.465949
## iter 460 value 7041.325280
## iter 470 value 7041.021732
## iter 480 value 7040.845830
## iter 490 value 7040.576073
## iter 500 value 7040.369471
## iter 510 value 7040.308663
## iter 520 value 7040.267512
## iter 530 value 7040.203691
## iter 540 value 7040.109465
## iter 550 value 7040.064034
## iter 560 value 7039.967961
## iter 570 value 7039.905303
## iter 580 value 7039.802194
## iter 590 value 7039.726709
## iter 600 value 7039.678707
## iter 610 value 7039.625371
## iter 620 value 7039.569185
## iter 630 value 7039.529814
## iter 640 value 7039.473146
## iter 650 value 7039.436103
## iter 660 value 7039.409985
## final  value 7039.386145 
## converged

predict2.8 <- predict(modello2.8, newdata = new_test_oph)

confusione2.8 <- confusionMatrix(as.factor(ifelse(predict2.8>0.3, 1, 0)), new_test_oph$result)
confusione2.8

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6407  655
##          1  375  263
##                                           
##                Accuracy : 0.8662          
##                  95% CI : (0.8584, 0.8738)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2663          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9447          
##             Specificity : 0.2865          
##          Pos Pred Value : 0.9073          
##          Neg Pred Value : 0.4122          
##              Prevalence : 0.8808          
##          Detection Rate : 0.8321          
##    Detection Prevalence : 0.9171          
##       Balanced Accuracy : 0.6156          
##                                           
##        'Positive' Class : 0               
##

roc2.8 <- roc(new_test_oph$result ~ as.numeric(predict2.8), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc2.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict2.8)), as.numeric(as.character(new_test_oph$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict2.8)),
## as.numeric(as.character(new_test_oph$result))): 64 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##   0.536170235   0.768085117   0.187960276   0.102744421 785.556401386 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf   0.000000000          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##   0.092247046  -0.148281506   0.904902415   0.153865186   0.008181917 
##          Eavg           S:z           S:p 
##   0.006748645   1.625219257   0.104115849

Understat Result

confusione2.us <- confusionMatrix(as.factor(ifelse(test_oph$`xG Understat`>0.3, 1, 0)), test_oph$result)
confusione2.us

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6021  391
##          1  761  527
##                                           
##                Accuracy : 0.8504          
##                  95% CI : (0.8422, 0.8583)
##     No Information Rate : 0.8808          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3933          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8878          
##             Specificity : 0.5741          
##          Pos Pred Value : 0.9390          
##          Neg Pred Value : 0.4092          
##              Prevalence : 0.8808          
##          Detection Rate : 0.7819          
##    Detection Prevalence : 0.8327          
##       Balanced Accuracy : 0.7309          
##                                           
##        'Positive' Class : 0               
##

roc2.us <- roc(test_oph$result ~ as.numeric(test_oph$`xG Understat`), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc2.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_oph$`xG Understat`)), as.numeric(as.character(test_oph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.478921e-01  8.239461e-01  2.930995e-01  1.646936e-01  1.269140e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  2.090539e-03  1.809715e+01  1.175584e-04  1.626030e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  8.339068e-02 -1.780625e-01  9.907059e-01  4.643716e-02  3.833526e-02 
##          Eavg           S:z           S:p 
##  1.470120e-02 -3.084414e+00  2.039538e-03

Set Piece Foot

set.seed(123)
index_spf <- sample.split(Y = setpiece_foot$result, SplitRatio = 0.75)
train_spf <- setpiece_foot[index_spf, ]

indice_test_spf <-  which(index_spf == FALSE)
test_spf <- setpiece_foot[indice_test_spf, ]

Logit

modello3.1 <- glm(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a + situation, 
                  data = train_spf, family = binomial)

summary(modello3.1)

## 
## Call:
## glm(formula = result ~ minute + lastAction + is_weakfoot + X + 
##     exp(Y^2) + h_a + situation, family = binomial, data = train_spf)
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.421e+01  2.918e-01 -48.687  < 2e-16 ***
## minute                    3.158e-04  6.464e-04   0.489  0.62515    
## lastActionBallRecovery    2.785e-01  7.601e-01   0.366  0.71411    
## lastActionBallTouch      -4.691e-03  1.859e-01  -0.025  0.97986    
## lastActionBlockedPass    -1.258e+01  3.202e+02  -0.039  0.96867    
## lastActionCard           -1.336e+01  4.690e+02  -0.028  0.97728    
## lastActionChipped         1.790e-01  1.591e-01   1.125  0.26060    
## lastActionCornerAwarded  -2.713e-01  3.710e-01  -0.731  0.46466    
## lastActionCross           1.187e-01  1.261e-01   0.941  0.34681    
## lastActionDispossessed   -5.646e-01  7.315e-01  -0.772  0.44014    
## lastActionEnd            -1.317e+01  1.526e+02  -0.086  0.93122    
## lastActionFoul            2.640e-01  3.382e-01   0.781  0.43504    
## lastActionGoal           -1.334e+01  2.110e+02  -0.063  0.94958    
## lastActionGoodSkill      -1.355e+01  8.259e+02  -0.016  0.98691    
## lastActionHeadPass        8.723e-01  1.301e-01   6.705 2.01e-11 ***
## lastActionInterception   -4.658e-01  1.049e+00  -0.444  0.65714    
## lastActionLayOff          9.714e-01  3.336e-01   2.912  0.00360 ** 
## lastActionNone            5.073e-01  1.203e-01   4.216 2.48e-05 ***
## lastActionPass            6.736e-01  1.242e-01   5.424 5.83e-08 ***
## lastActionRebound         1.219e+00  1.216e-01  10.030  < 2e-16 ***
## lastActionStandard        1.470e+00  1.248e-01  11.781  < 2e-16 ***
## lastActionSubstitutionOn  2.382e+00  1.061e+00   2.246  0.02473 *  
## lastActionTackle         -2.138e-01  7.419e-01  -0.288  0.77326    
## lastActionTakeOn          2.452e-02  2.147e-01   0.114  0.90908    
## lastActionThroughball     1.445e+00  2.679e-01   5.393 6.95e-08 ***
## is_weakfootYes           -1.214e-01  4.351e-02  -2.791  0.00526 ** 
## X                         1.400e+01  2.924e-01  47.871  < 2e-16 ***
## exp(Y^2)                 -6.425e-01  8.422e-02  -7.629 2.36e-14 ***
## h_ah                     -8.534e-02  3.469e-02  -2.460  0.01389 *  
## situationFromCorner      -9.978e-02  4.192e-02  -2.380  0.01730 *  
## situationSetPiece                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28504  on 52076  degrees of freedom
## Residual deviance: 24469  on 52047  degrees of freedom
## AIC: 24529
## 
## Number of Fisher Scoring iterations: 14

predict3.1 <- predict.glm(modello3.1, newdata = test_spf, type = "response")

roc3.1 <- roc(test_spf$result ~ predict3.1, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc3.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

confusione3.1 <- confusionMatrix(as.factor(ifelse(predict3.1>0.3, 1, 0)), as.factor(test_spf$result)) 
confusione3.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15617  1107
##          1   389   245
##                                           
##                Accuracy : 0.9138          
##                  95% CI : (0.9095, 0.9179)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2073          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9757          
##             Specificity : 0.1812          
##          Pos Pred Value : 0.9338          
##          Neg Pred Value : 0.3864          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8997          
##    Detection Prevalence : 0.9635          
##       Balanced Accuracy : 0.5785          
##                                           
##        'Positive' Class : 0               
##

rms::val.prob(as.numeric(as.character(predict3.1)), as.numeric(as.character(test_spf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.080409e-01  7.540205e-01  1.505219e-01  6.547547e-02  1.137523e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  3.787373e-04  8.574122e+00  1.374527e-02  6.509673e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  6.520744e-02 -1.766042e-01  9.150318e-01  1.923828e-02  1.603711e-02 
##          Eavg           S:z           S:p 
##  7.930469e-03  3.248913e-01  7.452633e-01

Linear Discriminant Analysis

set.seed(1234)

new_train_spf <- train_spf

new_train_spf$Y <- exp(new_train_spf$Y^2)

new_test_spf <- test_spf

new_test_spf$Y <- exp(new_test_spf$Y^2)

set.seed(1234)


modello3.3 <- lda(result ~ minute + lastAction + is_weakfoot + X + exp(Y^2) + h_a, data = train_spf)

summary(modello3.3)

##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   56     -none- numeric  
## scaling 28     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  3     -none- list

predict3.3.1 <- predict(modello3.3, newdata = test_spf)

predict3.3 <- predict3.3.1$posterior[, 2]

confusione3.3.1 <- confusionMatrix(as.factor(predict3.3.1$class), as.factor(test_spf$result))
confusione3.3.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15775  1193
##          1   231   159
##                                          
##                Accuracy : 0.918          
##                  95% CI : (0.9138, 0.922)
##     No Information Rate : 0.9221         
##     P-Value [Acc > NIR] : 0.9794         
##                                          
##                   Kappa : 0.153          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9856         
##             Specificity : 0.1176         
##          Pos Pred Value : 0.9297         
##          Neg Pred Value : 0.4077         
##              Prevalence : 0.9221         
##          Detection Rate : 0.9088         
##    Detection Prevalence : 0.9775         
##       Balanced Accuracy : 0.5516         
##                                          
##        'Positive' Class : 0              
##

confusione3.3 <- confusionMatrix(as.factor(ifelse(predict3.3>0.3, 1, 0)), as.factor(test_spf$result))

roc3.3 <- roc(test_spf$result ~ predict3.3, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc3.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.3)), as.numeric(as.character(test_spf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##   0.487283753   0.743641877   0.129926651   0.056251210 977.408498151 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA   0.006455379 114.052465074   0.000000000   0.049795831 
##         Brier     Intercept         Slope          Emax           E90 
##   0.066883628  -0.556372588   0.751043723   0.343102307   0.019185729 
##          Eavg           S:z           S:p 
##   0.016310743   3.167461664   0.001537760

Random Forest

set.seed(1234)

modello3.4 <- ranger(train_spf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_spf, num.trees = 500, mtry = 3, seed = 42, probability = T)

train_predict_3.4.1 <- predict(modello3.4, data = new_train_spf)
train_predict3.4 <- train_predict_3.4.1$predictions[, 2]

predict3.4.1 <- predict(modello3.4, data = new_test_spf)

predict3.4 <- predict3.4.1$predictions[, 2]

confusione3.4 <- confusionMatrix(as.factor(ifelse(predict3.4 >0.3, 1, 0)), as.factor(test_spf$result))
confusione3.4

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15519   717
##          1   487   635
##                                           
##                Accuracy : 0.9306          
##                  95% CI : (0.9268, 0.9344)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1.086e-05       
##                                           
##                   Kappa : 0.4763          
##                                           
##  Mcnemar's Test P-Value : 4.121e-11       
##                                           
##             Sensitivity : 0.9696          
##             Specificity : 0.4697          
##          Pos Pred Value : 0.9558          
##          Neg Pred Value : 0.5660          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8941          
##    Detection Prevalence : 0.9354          
##       Balanced Accuracy : 0.7196          
##                                           
##        'Positive' Class : 0               
##

roc3.4 <- roc(test_spf$result ~ as.numeric(predict3.4), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc3.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.4)), as.numeric(as.character(new_test_spf$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict3.4)),
## as.numeric(as.character(new_test_spf$result))): 383 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.650859e-01  8.325429e-01  3.375476e-01  1.553417e-01  2.637926e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  2.683426e-03  4.755116e+01  4.724943e-11  1.526583e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  5.104138e-02 -3.530922e-01  8.691969e-01  6.181181e-02  4.470280e-02 
##          Eavg           S:z           S:p 
##  1.567332e-02 -2.138949e+00  3.243979e-02

Bagging

set.seed(1234)

modello3.5 <- ranger(train_spf$result ~ minute + lastAction + is_weakfoot + X + Y + h_a, 
                     data = new_train_spf, num.trees = 500, seed = 42, probability = T)

predict3.5.1 <- predict(modello3.5, data = new_test_spf)

predict3.5 <- predict3.5.1$predictions[, 2]

confusione3.5 <- confusionMatrix(as.factor(ifelse(predict3.5>0.3, 1, 0)), as.factor(new_test_spf$result))
confusione3.5

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15591   758
##          1   415   594
##                                           
##                Accuracy : 0.9324          
##                  95% CI : (0.9286, 0.9361)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1.239e-07       
##                                           
##                   Kappa : 0.4677          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9741          
##             Specificity : 0.4393          
##          Pos Pred Value : 0.9536          
##          Neg Pred Value : 0.5887          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8982          
##    Detection Prevalence : 0.9419          
##       Balanced Accuracy : 0.7067          
##                                           
##        'Positive' Class : 0               
##

roc3.5 <- roc(new_test_spf$result ~ as.numeric(predict3.5), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc3.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.5)), as.numeric(as.character(new_test_spf$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.772853e-01  8.386426e-01  3.486116e-01  1.588321e-01  2.758008e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  4.519128e-04  9.844303e+00  7.283445e-03  1.583802e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  5.136852e-02  4.054414e-02  1.056891e+00  9.273364e-02  2.470800e-02 
##          Eavg           S:z           S:p 
##  8.638274e-03 -3.800341e+00  1.444973e-04

Neural Network

set.seed(1234)

modello3.8 <- nnet(result ~ minute + lastAction + is_weakfoot + X + Y + h_a, data = new_train_spf, size = 10, maxit = 1000, linout = FALSE)

## # weights:  301
## initial  value 23522.705483 
## iter  10 value 14179.880092
## iter  20 value 13625.005698
## iter  30 value 13418.429983
## iter  40 value 13173.057450
## iter  50 value 12640.458928
## iter  60 value 12265.364331
## iter  70 value 12038.069262
## iter  80 value 11934.605007
## iter  90 value 11896.243136
## iter 100 value 11893.367623
## iter 110 value 11884.155700
## iter 120 value 11866.894186
## iter 130 value 11850.193146
## iter 140 value 11837.232488
## iter 150 value 11820.460297
## iter 160 value 11805.131341
## iter 170 value 11790.186472
## iter 180 value 11778.943767
## iter 190 value 11768.430048
## iter 200 value 11754.212725
## iter 210 value 11746.100902
## iter 220 value 11740.054020
## iter 230 value 11733.960416
## iter 240 value 11728.417755
## iter 250 value 11724.163176
## iter 260 value 11721.026534
## iter 270 value 11718.333051
## iter 280 value 11714.357414
## iter 290 value 11713.123854
## iter 300 value 11712.121098
## iter 310 value 11711.590404
## iter 320 value 11710.029488
## iter 330 value 11706.959369
## iter 340 value 11705.784964
## iter 350 value 11704.471415
## iter 360 value 11703.823626
## iter 370 value 11703.137776
## iter 380 value 11702.167481
## iter 390 value 11700.827529
## iter 400 value 11699.873236
## iter 410 value 11698.942583
## final  value 11698.629976 
## converged

predict3.8 <- predict(modello3.8, newdata = new_test_spf)

confusione3.8 <- confusionMatrix(as.factor(ifelse(predict3.8>0.3, 1, 0)), new_test_spf$result)
confusione3.8

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15425   951
##          1   581   401
##                                           
##                Accuracy : 0.9117          
##                  95% CI : (0.9074, 0.9159)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2976          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9637          
##             Specificity : 0.2966          
##          Pos Pred Value : 0.9419          
##          Neg Pred Value : 0.4084          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8886          
##    Detection Prevalence : 0.9434          
##       Balanced Accuracy : 0.6301          
##                                           
##        'Positive' Class : 0               
##

roc3.8 <- roc(new_test_spf$result ~ as.numeric(predict3.8), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc3.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict3.8)), as.numeric(as.character(new_test_spf$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict3.8)),
## as.numeric(as.character(new_test_spf$result))): 73 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.362365e-01  7.681183e-01  1.831183e-01  8.020757e-02  1.387388e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf  0.000000e+00          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##  6.358302e-02 -2.002981e-01  9.038563e-01  1.008896e-01  8.304276e-03 
##          Eavg           S:z           S:p 
##  5.422550e-03  1.609048e+00  1.076059e-01

Understat Result

confusione3.us <- confusionMatrix(as.factor(ifelse(test_spf$`xG Understat`>0.3, 1, 0)), test_spf$result)
confusione3.us

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15438   770
##          1   568   582
##                                           
##                Accuracy : 0.9229          
##                  95% CI : (0.9188, 0.9268)
##     No Information Rate : 0.9221          
##     P-Value [Acc > NIR] : 0.3524          
##                                           
##                   Kappa : 0.424           
##                                           
##  Mcnemar's Test P-Value : 3.907e-08       
##                                           
##             Sensitivity : 0.9645          
##             Specificity : 0.4305          
##          Pos Pred Value : 0.9525          
##          Neg Pred Value : 0.5061          
##              Prevalence : 0.9221          
##          Detection Rate : 0.8894          
##    Detection Prevalence : 0.9337          
##       Balanced Accuracy : 0.6975          
##                                           
##        'Positive' Class : 0               
##

roc3.us <- roc(test_spf$result ~ as.numeric(test_spf$`xG Understat`), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc3.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_spf$`xG Understat`)), as.numeric(as.character(test_spf$result)))

## Warning in rms::val.prob(as.numeric(as.character(test_spf$`xG Understat`)), :
## 13 observations deleted from logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.895407e-01  7.947703e-01  2.724341e-01  1.219437e-01  2.116114e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  5.332985e-04  1.125006e+01  3.606451e-03  1.214104e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  5.646719e-02 -1.819309e-01  9.393373e-01  4.516177e-02  6.395073e-03 
##          Eavg           S:z           S:p 
##  5.191630e-03 -5.115445e-02  9.592024e-01

Set Piece Head

set.seed(123)
index_sph <- sample.split(Y = setpiece_head$result, SplitRatio = 0.75)
train_sph <- setpiece_head[index_sph, ]

indice_test_sph <-  which(index_sph == FALSE)
test_sph <- setpiece_head[indice_test_sph, ]

Logit

modello4.1 <- glm(result ~ minute + lastAction + X + exp(Y^2) + h_a, 
                  data = train_sph, family = binomial)

summary(modello4.1)

## 
## Call:
## glm(formula = result ~ minute + lastAction + X + exp(Y^2) + h_a, 
##     family = binomial, data = train_sph)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.420e+01  7.364e-01 -32.868  < 2e-16 ***
## minute                   -1.725e-03  7.315e-04  -2.358  0.01836 *  
## lastActionBallRecovery   -1.282e+01  4.131e+02  -0.031  0.97524    
## lastActionBallTouch      -1.418e-01  3.709e-01  -0.382  0.70231    
## lastActionBlockedPass    -1.300e+01  2.409e+02  -0.054  0.95697    
## lastActionCard           -1.281e+01  3.355e+02  -0.038  0.96954    
## lastActionChallenge      -1.291e+01  4.496e+02  -0.029  0.97708    
## lastActionChipped         6.573e-01  1.014e-01   6.481 9.14e-11 ***
## lastActionCornerAwarded  -1.219e+00  7.181e-01  -1.698  0.08954 .  
## lastActionCross           7.615e-01  4.764e-02  15.986  < 2e-16 ***
## lastActionDispossessed    7.975e-01  1.060e+00   0.753  0.45168    
## lastActionEnd            -4.132e-01  5.973e-01  -0.692  0.48913    
## lastActionFoul           -8.870e-01  5.903e-01  -1.502  0.13297    
## lastActionGoal           -1.269e+01  1.798e+02  -0.071  0.94375    
## lastActionHeadPass        1.108e+00  1.048e-01  10.566  < 2e-16 ***
## lastActionInterception   -1.269e+01  4.071e+02  -0.031  0.97513    
## lastActionNone            3.315e-01  1.422e-01   2.332  0.01970 *  
## lastActionOffsidePass    -1.249e+01  2.712e+02  -0.046  0.96326    
## lastActionPass            1.859e+00  2.146e-01   8.660  < 2e-16 ***
## lastActionRebound         1.199e+00  1.118e-01  10.724  < 2e-16 ***
## lastActionSubstitutionOn -1.292e+01  5.695e+02  -0.023  0.98190    
## lastActionTackle         -1.305e+01  3.234e+02  -0.040  0.96782    
## lastActionTakeOn         -1.262e+01  3.072e+02  -0.041  0.96722    
## X                         2.461e+01  7.397e-01  33.267  < 2e-16 ***
## exp(Y^2)                 -9.621e-01  2.060e-01  -4.671 2.99e-06 ***
## h_ah                     -1.052e-01  3.876e-02  -2.715  0.00663 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 21149  on 34346  degrees of freedom
## Residual deviance: 19194  on 34321  degrees of freedom
## AIC: 19246
## 
## Number of Fisher Scoring iterations: 14

predict4.1 <- predict.glm(modello4.1, newdata = test_sph, type = "response")

confusione4.1 <- confusionMatrix(as.factor(ifelse(predict4.1>0.3, 1, 0)), as.factor(test_sph$result)) 
confusione4.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10215   962
##          1   177    95
##                                           
##                Accuracy : 0.9005          
##                  95% CI : (0.8949, 0.9059)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 0.9958          
##                                           
##                   Kappa : 0.1093          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98297         
##             Specificity : 0.08988         
##          Pos Pred Value : 0.91393         
##          Neg Pred Value : 0.34926         
##              Prevalence : 0.90768         
##          Detection Rate : 0.89222         
##    Detection Prevalence : 0.97624         
##       Balanced Accuracy : 0.53642         
##                                           
##        'Positive' Class : 0               
##

roc4.1 <- roc(test_sph$result ~ predict4.1, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc4.1), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.1)), as.numeric(as.character(test_sph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.251367e-01  7.125684e-01  1.037063e-01  4.876773e-02  5.593418e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  1.560155e-04  3.786222e+00  1.506026e-01  4.861172e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  7.906927e-02 -1.739121e-01  9.218147e-01  2.812030e-02  1.657528e-02 
##          Eavg           S:z           S:p 
##  5.413573e-03 -2.727723e-02  9.782386e-01

Linear Discriminant Analysis

set.seed(1234)

new_train_sph <- train_sph

new_train_sph$Y <- exp(new_train_sph$Y^2)

new_test_sph <- test_sph

new_test_sph$Y <- exp(new_test_sph$Y^2)

set.seed(1234)


modello4.3 <- lda(result ~ minute + lastAction + X + exp(Y^2) + h_a, data = train_sph)

summary(modello4.3)

##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   50     -none- numeric  
## scaling 25     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  2     -none- list

predict4.3.1 <- predict(modello4.3, newdata = test_sph)

predict4.3 <- predict4.3.1$posterior[, 2]

confusione4.3.1 <- confusionMatrix(as.factor(predict4.3.1$class), as.factor(test_sph$result))
confusione4.3.1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10324  1011
##          1    68    46
##                                          
##                Accuracy : 0.9058         
##                  95% CI : (0.9003, 0.911)
##     No Information Rate : 0.9077         
##     P-Value [Acc > NIR] : 0.7668         
##                                          
##                   Kappa : 0.0617         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.99346        
##             Specificity : 0.04352        
##          Pos Pred Value : 0.91081        
##          Neg Pred Value : 0.40351        
##              Prevalence : 0.90768        
##          Detection Rate : 0.90174        
##    Detection Prevalence : 0.99004        
##       Balanced Accuracy : 0.51849        
##                                          
##        'Positive' Class : 0              
##

confusione4.3 <- confusionMatrix(as.factor(ifelse(predict4.3>0.3, 1, 0)), as.factor(test_sph$result))

roc4.3 <- roc(test_sph$result ~ predict4.3, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc4.3), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.3)), as.numeric(as.character(test_sph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.176003e-01  7.088001e-01  9.620260e-02  4.515158e-02  5.179404e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  1.869583e-03  2.340486e+01  8.273686e-06  4.328199e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  8.005259e-02 -3.824800e-01  8.242660e-01  3.101720e-01  1.341532e-02 
##          Eavg           S:z           S:p 
##  1.179437e-02  1.354577e+00  1.755522e-01

Random Forest

set.seed(1234)

modello4.4 <- ranger(train_sph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_sph, num.trees = 500, mtry = 3, seed = 42, probability = T)

train_predict_4.4.1 <- predict(modello4.4, data = new_train_sph)
train_predict4.4 <- train_predict_4.4.1$predictions[, 2]

predict4.4.1 <- predict(modello4.4, data = new_test_sph)

predict4.4 <- predict4.4.1$predictions[, 2]

confusione4.4 <- confusionMatrix(as.factor(ifelse(predict4.4 >0.3, 1, 0)), as.factor(test_sph$result))
confusione4.4

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9942  631
##          1  450  426
##                                           
##                Accuracy : 0.9056          
##                  95% CI : (0.9001, 0.9109)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 0.786           
##                                           
##                   Kappa : 0.3897          
##                                           
##  Mcnemar's Test P-Value : 4.383e-08       
##                                           
##             Sensitivity : 0.9567          
##             Specificity : 0.4030          
##          Pos Pred Value : 0.9403          
##          Neg Pred Value : 0.4863          
##              Prevalence : 0.9077          
##          Detection Rate : 0.8684          
##    Detection Prevalence : 0.9235          
##       Balanced Accuracy : 0.6799          
##                                           
##        'Positive' Class : 0               
##

roc4.4 <- roc(test_sph$result ~ as.numeric(predict4.4), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc4.4), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.4)), as.numeric(as.character(new_test_sph$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict4.4)),
## as.numeric(as.character(new_test_sph$result))): 869 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##    0.60666454    0.80333227    0.28605354    0.14680716 1554.21971840 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA           Inf           Inf    0.00000000          -Inf 
##         Brier     Intercept         Slope          Emax           E90 
##    0.06304306   -0.46515693    0.76715854    0.09687157    0.05778390 
##          Eavg           S:z           S:p 
##    0.02110086    0.94272703    0.34582054

Bagging

set.seed(1234)

modello4.5 <- ranger(train_sph$result ~ minute + lastAction + X + Y + h_a, 
                     data = new_train_sph, num.trees = 500, seed = 42, probability = T)

predict4.5.1 <- predict(modello4.5, data = new_test_sph)

predict4.5 <- predict4.5.1$predictions[, 2]

confusione4.5 <- confusionMatrix(as.factor(ifelse(predict4.5>0.3, 1, 0)), as.factor(new_test_sph$result))
confusione4.5

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10042   658
##          1   350   399
##                                           
##                Accuracy : 0.912           
##                  95% CI : (0.9066, 0.9171)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 0.05793         
##                                           
##                   Kappa : 0.3956          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9663          
##             Specificity : 0.3775          
##          Pos Pred Value : 0.9385          
##          Neg Pred Value : 0.5327          
##              Prevalence : 0.9077          
##          Detection Rate : 0.8771          
##    Detection Prevalence : 0.9346          
##       Balanced Accuracy : 0.6719          
##                                           
##        'Positive' Class : 0               
##

roc4.5 <- roc(new_test_sph$result ~ as.numeric(predict4.5), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc4.5), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.5)), as.numeric(as.character(new_test_sph$result)))

## Warning in rms::val.prob(as.numeric(as.character(predict4.5)),
## as.numeric(as.character(new_test_sph$result))): 32 observations deleted from
## logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  6.444300e-01  8.222150e-01  3.168346e-01  1.575874e-01  1.800175e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  6.581974e-04  9.514640e+00  8.588598e-03  1.569292e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  6.248466e-02 -1.808876e-01  9.179875e-01  1.028428e-01  3.948918e-02 
##          Eavg           S:z           S:p 
##  1.451649e-02 -8.034238e-01  4.217298e-01

Neural Network

modello4.8 <- nnet(result ~ minute + lastAction + X + Y + h_a, data = new_train_sph, size = 10, maxit = 1000, linout = FALSE)

## # weights:  271
## initial  value 14444.509965 
## iter  10 value 10571.777439
## iter  20 value 10555.297209
## iter  30 value 10535.397340
## iter  40 value 10531.264920
## iter  50 value 10529.572248
## iter  60 value 10528.543519
## final  value 10528.519092 
## converged

predict4.8 <- predict(modello4.8, newdata = new_test_sph)

confusione4.8 <- confusionMatrix(as.factor(ifelse(predict4.8>0.3, 1, 0)), new_test_sph$result)
confusione4.8

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10390  1054
##          1     2     3
##                                          
##                Accuracy : 0.9078         
##                  95% CI : (0.9023, 0.913)
##     No Information Rate : 0.9077         
##     P-Value [Acc > NIR] : 0.4953         
##                                          
##                   Kappa : 0.0048         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.999808       
##             Specificity : 0.002838       
##          Pos Pred Value : 0.907899       
##          Neg Pred Value : 0.600000       
##              Prevalence : 0.907678       
##          Detection Rate : 0.907503       
##    Detection Prevalence : 0.999563       
##       Balanced Accuracy : 0.501323       
##                                          
##        'Positive' Class : 0              
##

roc4.8 <- roc(new_test_sph$result ~ as.numeric(predict4.8), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc4.8), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(predict4.8)), as.numeric(as.character(new_test_sph$result)))

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  1.874104e-02  5.093705e-01  2.306491e-03  9.736699e-04  1.214755e+01 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  6.033001e-05  2.690718e+00  2.604461e-01  9.133399e-04 
##         Brier     Intercept         Slope          Emax           E90 
##  8.366746e-02 -6.346217e-01  7.213535e-01  8.956164e-02  1.159593e-03 
##          Eavg           S:z           S:p 
##  2.291678e-03  9.299804e-02  9.259051e-01

Understat Result

confusione4.us <- confusionMatrix(as.factor(ifelse(test_sph$`xG Understat`>0.3, 1, 0)), test_sph$result)
confusione4.us

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9805  626
##          1  587  431
##                                           
##                Accuracy : 0.8941          
##                  95% CI : (0.8883, 0.8996)
##     No Information Rate : 0.9077          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.3572          
##                                           
##  Mcnemar's Test P-Value : 0.2752          
##                                           
##             Sensitivity : 0.9435          
##             Specificity : 0.4078          
##          Pos Pred Value : 0.9400          
##          Neg Pred Value : 0.4234          
##              Prevalence : 0.9077          
##          Detection Rate : 0.8564          
##    Detection Prevalence : 0.9111          
##       Balanced Accuracy : 0.6756          
##                                           
##        'Positive' Class : 0               
##

roc4.us <- roc(test_sph$result ~ as.numeric(test_sph$`xG Understat`), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

legend("bottomright", legend = paste("AUC =", round(auc(roc4.us), 4)), lwd = 2, box.lwd = 0, bg = "white")

rms::val.prob(as.numeric(as.character(test_sph$`xG Understat`)), as.numeric(as.character(test_sph$result)))

## Warning in rms::val.prob(as.numeric(as.character(test_sph$`xG Understat`)), : 2
## observations deleted from logistic calibration due to probs. of 0 or 1

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.820363e-01  7.910181e-01  2.272736e-01  1.102882e-01  1.263469e+03 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA  5.036564e-04  7.765355e+00  2.059561e-02  1.097845e-01 
##         Brier     Intercept         Slope          Emax           E90 
##  7.092230e-02 -1.467926e-01  9.228842e-01  1.822378e-01  4.303519e-03 
##          Eavg           S:z           S:p 
##  4.759339e-03  1.399728e+00  1.615947e-01

Final Result

accuracy_opf <- data.frame(confusione1.us$overall["Accuracy"], confusione1.1$overall["Accuracy"], confusione1.2$overall["Accuracy"], confusione1.3$overall["Accuracy"], confusione1.4$overall["Accuracy"], confusione1.5$overall["Accuracy"], confusione1.8$overall["Accuracy"])
nomi_acc1 <- c("Understat", "Lineare", "Interazione", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_opf) <- nomi_acc1

auc_opf <- data.frame(roc1.us$auc, roc1.1$auc, roc1.2$auc, roc1.3$auc, roc1.4$auc, roc1.5$auc, roc1.8$auc)
names(auc_opf) <- nomi_acc1

acc_auc1 <- rbind(accuracy_opf, auc_opf)
rownames(acc_auc1) <- c("Accuracy", "AUC")
acc_auc1

##          Understat   Lineare Interazione Discriminante Random_Forest   Bagging
## Accuracy 0.8875595 0.8888058   0.8860917     0.8850532     0.9011161 0.9064058
## AUC      0.8145883 0.7468567   0.7495843     0.7447884     0.8374263 0.8392269
##          Neural_Net
## Accuracy  0.8885704
## AUC       0.7913828

accuracy_oph <- data.frame(confusione2.us$overall["Accuracy"], confusione2.1$overall["Accuracy"], confusione2.3$overall["Accuracy"], confusione2.4$overall["Accuracy"], confusione2.5$overall["Accuracy"], confusione2.8$overall["Accuracy"])
nomi_acc2 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_oph) <- nomi_acc2

auc_oph <- data.frame(roc1.us$auc, roc1.1$auc, roc1.3$auc, roc1.4$auc, roc1.5$auc, roc1.8$auc)
names(auc_oph) <- nomi_acc2

acc_auc2 <- rbind.data.frame(accuracy_oph, auc_oph)
rownames(acc_auc2) <- c("Accuracy", "AUC")
acc_auc2

##          Understat   Lineare Discriminante Random_Forest   Bagging Neural_Net
## Accuracy 0.8503896 0.8577922     0.8579221     0.8659740 0.8727273  0.8662338
## AUC      0.8145883 0.7468567     0.7447884     0.8374263 0.8392269  0.7913828

accuracy_spf <- data.frame(confusione3.us$overall["Accuracy"], confusione3.1$overall["Accuracy"], confusione3.3$overall["Accuracy"], confusione3.4$overall["Accuracy"], confusione3.5$overall["Accuracy"], confusione3.8$overall["Accuracy"])
nomi_acc3 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_spf) <- nomi_acc3

auc_spf <- data.frame(roc3.us$auc, roc3.1$auc, roc3.3$auc, roc3.4$auc, roc3.5$auc, roc3.8$auc)
names(auc_spf) <- nomi_acc3


acc_auc3 <- rbind.data.frame(accuracy_spf, auc_spf)
rownames(acc_auc3) <- c("Accuracy", "AUC")
acc_auc3

##          Understat   Lineare Discriminante Random_Forest   Bagging Neural_Net
## Accuracy 0.9229174 0.9138150     0.9033875     0.9306372 0.9324231  0.9117410
## AUC      0.7949585 0.7540138     0.7436472     0.8365386 0.8386365  0.7650791

accuracy_sph <- data.frame(confusione4.us$overall["Accuracy"], confusione4.1$overall["Accuracy"], confusione4.3$overall["Accuracy"], confusione4.4$overall["Accuracy"], confusione4.5$overall["Accuracy"], confusione4.8$overall["Accuracy"])
nomi_acc2 <- c("Understat", "Lineare", "Discriminante", "Random_Forest", "Bagging", "Neural_Net")
names(accuracy_sph) <- nomi_acc2

auc_sph <- data.frame(roc4.us$auc, roc4.1$auc, roc4.3$auc, roc4.4$auc, roc4.5$auc, roc4.8$auc)
names(auc_sph) <- nomi_acc2

acc_auc4 <- rbind.data.frame(accuracy_sph, auc_sph)
rownames(acc_auc4) <- c("Accuracy", "AUC")
acc_auc4

##          Understat   Lineare Discriminante Random_Forest   Bagging Neural_Net
## Accuracy 0.8940519 0.9005153     0.8967595     0.9055813 0.9119574  0.9077649
## AUC      0.7910585 0.7125744     0.7087912     0.8190310 0.8227611  0.5094168

Final Plot

Messi’s Plot

test_opf_rf <- cbind(test_opf, predict1.4)
names(test_opf_rf)[names(test_opf_rf) == "predict1.4"] <- "xG_Random_Forest"
train_opf_rf <- cbind(train_opf, train_predict1.4)
names(train_opf_rf)[names(train_opf_rf) == "train_predict1.4"] <- "xG_Random_Forest"

messi_shots_opf_test <- test_opf_rf %>% filter(player == "Lionel Messi")
messi_shots_opf_train <- train_opf_rf %>% filter(player == "Lionel Messi")
messi_shots_opf <- bind_rows(messi_shots_opf_train, messi_shots_opf_test)

messi_shots_opf$shotType <- ifelse(messi_shots_opf$shotType == "LeftFoot", "Left", "Right")
messi_shots_opf <- messi_shots_opf %>%
  mutate(X = X * 100, Y = Y * 100)

messi_shots_opf <- messi_shots_opf %>% 
  mutate(result = factor(result, levels = c(0, 1), labels = c("No Goal", "Goal")),
         match_info = paste(h_team, "vs", a_team, "<br>", h_goals, "-", a_goals),
          date_formatted = format(as.Date(date), "%d-%m-%Y"))

generate_center_circle_arc <- function(center_x = 50, center_y = 50, r = 9.15, n_points = 100) {
  theta <- seq(-pi / 2, pi / 2, length.out = n_points)  # Modificato per metà arco, da -pi/2 a pi/2
  data.frame(
    x = center_x + r * cos(theta),  # Mantiene il centro sul bordo visibile del campo
    y = center_y + r * sin(theta)
  )
}

# Genera i punti della lunetta del centrocampo
center_circle_arc <- generate_center_circle_arc(center_x = 50, center_y = 50)

# Esempio di dataset con coordinate normalizzate (da 0 a 1)
p <- ggplot() +
  annotate_pitch(colour = "black", fill = "white") + # disegna il campo
  theme_pitch() + # tema predefinito per il campo
  theme(panel.background = element_rect(fill = "green4")) + # colore sfondo campo
  # Aggiungi la lunetta del centrocampo
  geom_path(data = center_circle_arc, aes(x = x, y = y), color = "black", size = 0.8) +  
  geom_point(data = messi_shots_opf, 
             aes(x = X, y = Y, 
                 color = factor(result), # colori differenti per goal e no goal
                 shape = factor(result), # forme differenti per goal e no goal
                 text = paste("Random Forest xG:", round(xG_Random_Forest, 4), 
                              "<br>X:", round(X, 2), 
                              "<br>Y:", round(Y, 2), 
                              "<br>Foot:", shotType,
                              "<br>", match_info,
                              "<br>", date_formatted,
                              "<br>Minute:", minute)),
             size = 2, stroke = 0.3) +
  scale_color_manual(values = c("No Goal" = "black", "Goal" = "red"),
                     name = "Esito Tiro") +
  scale_shape_manual(values = c("No Goal" = 1, "Goal" = 16),
                     name = "Esito Tiro") +
  labs(title = "<span style='font-size:16pt;'>Tiri effettuati da Lionel Messi in campionato</span><br><span style='font-size:12pt;'>di piede e in situazione di Open Play tra il 2014 e il 2021", x = " ", y = " ") +
  theme_minimal() +
  coord_cartesian(xlim = c(50, 100))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning in geom_point(data = messi_shots_opf, aes(x = X, y = Y, color =
## factor(result), : Ignoring unknown aesthetics: text

p_interactive <- ggplotly(p, tooltip = "text")

## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

p_interactive

Serie A’s Plot

test_spf_rf <- cbind(test_spf, predict3.4)
names(test_spf_rf)[names(test_spf_rf) == "predict3.4"] <- "xG_Random_Forest"
test_sph_rf <- cbind(test_sph, predict4.4)
names(test_sph_rf)[names(test_sph_rf) == "predict4.4"] <- "xG_Random_Forest"
train_spf_rf <- cbind(new_train_spf, train_predict3.4)
names(test_spf_rf)[names(test_spf_rf) == "train_predict3.4"] <- "xG_Random_Forest"
train_sph_rf <- cbind(new_train_sph, train_predict4.4)
names(test_sph_rf)[names(test_sph_rf) == "train_predict4.4"] <- "xG_Random_Forest"

seriea_corner_spf_test <- test_spf_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")

seriea_corner_sph_test <- test_sph_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")

seriea_corner_spf_train <- train_spf_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")

seriea_corner_sph_train <- train_sph_rf %>%
  filter(situation == "FromCorner", lega == "Serie A")  


seriea_corner <- bind_rows(seriea_corner_spf_test, seriea_corner_sph_test, seriea_corner_spf_train, seriea_corner_sph_train)
seriea_corner$foot <- ifelse(seriea_corner$shotType %in% c("RightFoot", "LeftFoot"), 
                             "foot", 
                             "head")

seriea_corner <- seriea_corner %>%
  mutate(team_name = ifelse(h_a == "h", h_team, a_team),
         match_info = paste(h_team, "vs", a_team, "<br>", h_goals, "-", a_goals),
         date_formatted = format(as.Date(date), "%d-%m-%Y"),
         result = factor(result, levels = c(0, 1), labels = c("No Goal", "Goal")))

seriea_corner <- seriea_corner %>%
  mutate(X = X * 100, Y = Y * 100)

seriea_corner_20_21 <- seriea_corner %>%
  filter(season == "2020")
seriea_corner_19_21 <- seriea_corner %>%
  filter(season %in% c("2019", "2020"))

generate_center_circle_arc <- function(center_x = 50, center_y = 50, r = 9.15, n_points = 100) {
  theta <- seq(-pi / 2, pi / 2, length.out = n_points)  # Modificato per metà arco, da -pi/2 a pi/2
  data.frame(
    x = center_x + r * cos(theta),  # Mantiene il centro sul bordo visibile del campo
    y = center_y + r * sin(theta)
  )
}

# Genera i punti della lunetta del centrocampo
center_circle_arc <- generate_center_circle_arc(center_x = 50, center_y = 50)

# Esempio di dataset con coordinate normalizzate (da 0 a 1)
p_seriea <- ggplot() +
  annotate_pitch(colour = "black", fill = "white") + # disegna il campo
  theme_pitch() + # tema predefinito per il campo
  theme(panel.background = element_rect(fill = "green4")) + # colore sfondo campo
  # Aggiungi la lunetta del centrocampo
  geom_path(data = center_circle_arc, aes(x = x, y = y), color = "black", size = 0.8) +  
  geom_point(data = seriea_corner_19_21, 
             aes(x = X, y = Y, 
                 color = factor(result), # colori differenti per goal e no goal
                 shape = factor(result), # forme differenti per goal e no goal
                 text = paste("Random Forest xG:", round(xG_Random_Forest, 4), 
                              "<br>X:", round(X, 2), 
                              "<br>Y:", round(Y, 2), 
                              "<br>Player:", player,
                              "<br>Type:", shotType,
                              "<br>Assist:", player_assisted,
                              "<br>Team:", team_name,
                              "<br>", match_info,
                              "<br>", date_formatted,
                              "<br>Minute:", minute)),
             size = 2, stroke = 0.3) +
  scale_color_manual(values = c("No Goal" = "black", "Goal" = "red"),
                     name = "Esito Tiro") +
  scale_shape_manual(values = c("No Goal" = 1, "Goal" = 16),
                     name = "Esito Tiro") +
  labs(title = "<span style='font-size:16pt;'>Tiri effettuati in Serie A da calcio d'angolo</span><br><span style='font-size:12pt;'>nelle stagioni 2019/20 e 2020/21</span>", x = " ", y = " ") +
  theme_minimal() +
  coord_cartesian(xlim = c(50, 100))

## Warning in geom_point(data = seriea_corner_19_21, aes(x = X, y = Y, color =
## factor(result), : Ignoring unknown aesthetics: text

p_interactive_seriea <- ggplotly(p_seriea, tooltip = "text")

## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

p_interactive_seriea

xG’s Heatmap

tot_opf_rf <- bind_rows(train_opf_rf, test_opf_rf)
tot_opf_rf <- tot_opf_rf %>%
  mutate(X = X * 100, Y = Y * 100)

p_heatmap_custom <- ggplot(tot_opf_rf, aes(x = X, y = Y)) +
  annotate_pitch(colour = "black", fill = "white") + 
  theme_pitch() + 
  theme(panel.background = element_rect(fill = "lightblue")) +
  
  # Usa stat_summary_2d invece di geom_tile, per calcolare densità in ogni area
  stat_summary_2d(aes(z = xG_Random_Forest), fun = mean, bins = 65, alpha = 0.8) + 
  scale_fill_viridis(option = "plasma", name = "xG Medio", direction = -1, 
                     breaks = pretty_breaks(n = 5), 
                     limits = c(0, max(tot_opf_rf$xG_Random_Forest, na.rm = TRUE))) + 
  
  # Migliora i dettagli del grafico
  labs(
    title = "Heat Map degli Expected Goals (xG)\nTiri Effettuati in Situazione di Open Play",
    x = NULL, y = NULL
  ) +
  
  # Coordinata per mostrare solo la metà campo
  coord_cartesian(xlim = c(50, 100), ylim = c(0, 100)) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),    # Nascondi i valori degli assi
    axis.ticks = element_blank(),   # Nascondi i ticks degli assi
    panel.grid = element_blank(),   # Rimuovi le griglie
    legend.position = "bottom",     # Posiziona la legenda in basso
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)  # Centra il titolo e lo rende più visibile
  )

# Visualizza il grafico
p_heatmap_custom

xG Model

Matteo Morella

19 October, 2024

Packages and Data

Data Cleaning

Data Modeling

Key

Open Play Foot Model

Logit

Logit with interactions

Linear Discriminant Analysis

Random Forest

Bagging

Neural Network

Understat Result

Open Play Head

Logit

Linear Discriminant Analysis

Random Forest

Bagging

Neural Network

Understat Result

Set Piece Foot

Logit

Linear Discriminant Analysis

Random Forest

Bagging

Neural Network

Understat Result

Set Piece Head

Logit

Linear Discriminant Analysis

Random Forest

Bagging

Neural Network

Understat Result

Final Result

Final Plot

Messi’s Plot

Serie A’s Plot

xG’s Heatmap