Decision Tree
· 10 min read
0) Importing Packages & Data
library(tidyverse)
## -- Attaching packages ------------------------------------------------------ tidyverse 1.3.0 --
## √ ggplot2 3.3.1 √ purrr 0.3.4
## √ tibble 3.0.1 √ dplyr 1.0.0
## √ tidyr 1.1.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts --------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
library(plot3D)
# Load the team-season records; the path is relative to the working directory.
data <- read_csv(file = "data/baseball.csv")
## Parsed with column specification:
## cols(
## Team = col_character(),
## League = col_character(),
## Year = col_double(),
## RS = col_double(),
## RA = col_double(),
## W = col_double(),
## OBP = col_double(),
## SLG = col_double(),
## BA = col_double(),
## Playoffs = col_double(),
## RankSeason = col_double(),
## RankPlayoffs = col_double(),
## G = col_double(),
## OOBP = col_double(),
## OSLG = col_double()
## )
# Preview the first six rows of the raw table.
data %>% head(n = 6)
## # A tibble: 6 x 15
## Team League Year RS RA W OBP SLG BA Playoffs RankSeason
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARI NL 2012 734 688 81 0.328 0.418 0.259 0 NA
## 2 ATL NL 2012 700 600 94 0.32 0.389 0.247 1 4
## 3 BAL AL 2012 712 705 93 0.311 0.417 0.247 1 5
## 4 BOS AL 2012 734 806 69 0.315 0.415 0.26 0 NA
## 5 CHC NL 2012 613 759 61 0.302 0.378 0.24 0 NA
## 6 CHW AL 2012 748 676 85 0.318 0.422 0.255 0 NA
## # ... with 4 more variables: RankPlayoffs <dbl>, G <dbl>, OOBP <dbl>,
## # OSLG <dbl>
이 데이터는 메이저리그 팀들의 시즌 기록과 플레이오프 진출 여부를 담고 있다. 이 중 의미 있는 X 변수들을 뽑아 플레이오프 진출 여부를 분류(classification)해 보겠다. 승수(W), 득점(RS), 실점(RA)은 플레이오프 진출 여부와 상관관계가 굉장히 높을 것으로 예상되므로, 트리를 좀 더 키워서 연습해 보기 위해 이 변수들은 제거하였다.
# Keep only the response (Playoffs) and the five rate statistics used as
# predictors; every other column is dropped.
data <- data %>%
  select(Playoffs, OBP, SLG, BA, OOBP, OSLG)
OBP는 출루율, SLG는 장타율, BA는 타율, OOBP는 피출루율, OSLG는 피장타율을 나타낸다.
1) Train - Test split
# Reproducible 50/50 hold-out split: half of the row indices are drawn at
# random for training, and every remaining row goes to the test set.
set.seed(2026)
nobs <- nrow(data)
train_ind <- sample(seq_len(nobs), size = round(0.5 * nobs))
train <- data[train_ind, ]
test <- data[-train_ind, ]
2) Decision Tree & AUROC 계산
# Grid search over rpart's `minsplit` and `cp`: fit one classification tree on
# `train` per grid cell and score it by AUROC on `test`.
#
# Results:
#   mcr   - matrix, mcr[i, j] is the AUROC for minsplit[i] and cp[j]
#   mcrdf - data frame with one row per grid cell (minsplit, cp, AUC), ordered
#           with minsplit varying slowest and cp varying fastest
minsplit <- seq(1, 46, by = 5)
cp <- seq(0.001, 0.01, by = 0.001)

mcr <- matrix(NA_real_, nrow = length(minsplit), ncol = length(cp))

# One slot per grid cell; the rows are bound once after the loop instead of
# growing a data frame with rbind() on every iteration.
grid_rows <- vector("list", length(minsplit) * length(cp))

# Loop bounds follow the grid vectors, so changing the grid above cannot
# desynchronise the loop from `mcr`.
for (i in seq_along(minsplit)) {
  for (j in seq_along(cp)) {
    m <- minsplit[i]
    cp_j <- cp[j] # deliberately not named `c`, which would shadow base::c()

    # xval = 0 turns off rpart's internal cross-validation; models are
    # compared on the held-out test set instead.
    my.control <- rpart.control(minsplit = m, cp = cp_j, xval = 0)
    tree <- rpart(Playoffs ~ ., data = train, method = "class",
                  control = my.control)

    # Class-probability matrix; the second column is the probability of
    # Playoffs == 1.
    pred <- predict(tree, newdata = test, type = "prob")

    # Pin levels and direction instead of letting pROC auto-detect them for
    # each fit: auto-detection may choose a different direction per model
    # (always the one giving AUC >= 0.5), which biases the comparison upward.
    # quiet = TRUE suppresses the "Setting levels/direction" messages.
    roccurve0 <- roc(
      response = test$Playoffs,
      predictor = pred[, 2],
      levels = c(0, 1),
      direction = "<",
      quiet = TRUE
    )
    auc_ij <- as.numeric(roccurve0$auc) # not `auc`, which would shadow pROC::auc()

    mcr[i, j] <- auc_ij
    grid_rows[[(i - 1L) * length(cp) + j]] <-
      data.frame(minsplit = m, cp = cp_j, AUC = auc_ij)
  }
}

mcrdf <- do.call(rbind, grid_rows)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
3) 3D Surface Graph
# Draw the AUROC surface over the (minsplit, cp) grid.
persp3D(
  x = minsplit, y = cp, z = mcr,
  theta = 30, phi = 50,
  axes = TRUE, scale = 0.75, box = TRUE,
  nticks = 5, ticktype = "detailed",
  xlab = "minsplit", ylab = "cp", zlab = "AUROC"
)

# Give the long-format results table descriptive column names.
names(mcrdf) <- c("minsplit", "cp", "AUC")
그래프를 보면 minsplit 10 근처, cp 0.008~0.010 구간에 짙은 적색으로 표시되는 최댓값이 위치하는 것으로 보인다.
4) AUC를 max로 하는 minsplit과 cp 찾기
# Keep every grid cell that attains the highest AUROC (ties are all retained).
best_auc <- max(mcrdf$AUC)
df <- filter(mcrdf, AUC == best_auc)
print(df)
## minsplit cp AUC
## 1 11 0.009 0.7125245
## 2 11 0.010 0.7125245