Decision Tree

0) Importing Packages & Data

library(tidyverse)
## -- Attaching packages ------------------------------------------------------ tidyverse 1.3.0 --
## √ ggplot2 3.3.1     √ purrr   0.3.4
## √ tibble  3.0.1     √ dplyr   1.0.0
## √ tidyr   1.1.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts --------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rpart)
library(plot3D)

# Load the baseball data set; the path is relative to the working directory.
data <- read_csv(file = "data/baseball.csv")
## Parsed with column specification:
## cols(
##   Team = col_character(),
##   League = col_character(),
##   Year = col_double(),
##   RS = col_double(),
##   RA = col_double(),
##   W = col_double(),
##   OBP = col_double(),
##   SLG = col_double(),
##   BA = col_double(),
##   Playoffs = col_double(),
##   RankSeason = col_double(),
##   RankPlayoffs = col_double(),
##   G = col_double(),
##   OOBP = col_double(),
##   OSLG = col_double()
## )
# Preview the first six rows to check the parsed columns.
head(data, n = 6L)
## # A tibble: 6 x 15
##   Team  League  Year    RS    RA     W   OBP   SLG    BA Playoffs RankSeason
##   <chr> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>      <dbl>
## 1 ARI   NL      2012   734   688    81 0.328 0.418 0.259        0         NA
## 2 ATL   NL      2012   700   600    94 0.32  0.389 0.247        1          4
## 3 BAL   AL      2012   712   705    93 0.311 0.417 0.247        1          5
## 4 BOS   AL      2012   734   806    69 0.315 0.415 0.26         0         NA
## 5 CHC   NL      2012   613   759    61 0.302 0.378 0.24         0         NA
## 6 CHW   AL      2012   748   676    85 0.318 0.422 0.255        0         NA
## # ... with 4 more variables: RankPlayoffs <dbl>, G <dbl>, OOBP <dbl>,
## #   OSLG <dbl>

이 데이터는 메이저리그 팀들의 시즌 기록과 플레이오프 진출 여부를 담고 있다. 이 중 의미 있는 설명변수(X)를 골라 플레이오프 진출 여부를 분류(classification)해 보겠다. 승수(W), 득점(RS), 실점(RA)은 플레이오프 진출 여부와 상관관계가 매우 높을 것으로 예상되므로, 트리를 좀 더 크게 키워 보는 연습을 위해 이 변수들은 제외하였다.

# Keep only the response (Playoffs) and the five batting/pitching rate columns.
data <- data %>%
  select(Playoffs, OBP, SLG, BA, OOBP, OSLG)

OBP는 출루율, SLG는 장타율, BA는 타율, OOBP는 피출루율, OSLG는 피장타율을 나타낸다.

1) Train - Test split

# Fix the RNG seed so the random split below is reproducible.
set.seed(2026)
nobs <- nrow(data)

# Draw half of the row indices (rounded) without replacement for training;
# every row that was not drawn goes to the test set.
train_size <- round(nobs * 0.5)
train_ind <- sample.int(nobs, size = train_size)

train <- data[train_ind, ]
test <- data[-train_ind, ]

2) Decision Tree & AUROC 계산

# Hyper-parameter grid for rpart: `minsplit` and the complexity parameter `cp`.
minsplit <- seq(1, 46, by = 5)
cp <- seq(0.001, 0.01, by = 0.001)

# mcr[i, j] holds the AUROC on `test` for the tree fitted with
# minsplit[i] and cp[j].
# NOTE(review): the AUROC is computed on `test`, so picking (minsplit, cp) by
# it makes the reported best AUC optimistic; a separate validation split would
# be needed for an unbiased estimate.
mcr <- matrix(NA_real_, nrow = length(minsplit), ncol = length(cp))

# Loop bounds follow the grid vectors, so changing the grids above cannot
# leave cells unfilled or index out of range.
for (i in seq_along(minsplit)) {
  for (j in seq_along(cp)) {
    # xval = 0: no internal cross-validation is run while fitting.
    grid_control <- rpart.control(minsplit = minsplit[i], cp = cp[j], xval = 0)
    tree <- rpart(Playoffs ~ ., data = train, method = "class",
                  control = grid_control)

    # With type = "prob" the second column is the probability of Playoffs = 1.
    pred <- predict(tree, newdata = test, type = "prob")

    # Fix `levels` and `direction` explicitly instead of letting roc() guess
    # them on every call: auto-detected direction can flip for a
    # worse-than-chance model and inflate its AUC, and the explicit values also
    # silence the "Setting levels/direction" messages.
    roc_fit <- roc(response = test$Playoffs, predictor = pred[, 2],
                   levels = c(0, 1), direction = "<")
    mcr[i, j] <- as.numeric(roc_fit$auc)
  }
}

# Long-format copy of the grid results, built once after the loop instead of
# growing a data frame with rbind() on every iteration. Rows are ordered with
# cp varying fastest within each minsplit, matching the loop order.
mcrdf <- data.frame(
  minsplit = rep(minsplit, each = length(cp)),
  cp = rep(cp, times = length(minsplit)),
  AUC = as.vector(t(mcr))
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

3) 3D Surface Graph

# 3D surface of the AUROC matrix over the (minsplit, cp) grid, AUROC on the
# z axis.
# NOTE(review): graphics::persp documents `scale` as a logical; 0.75 is
# presumably treated as TRUE, and `expand = 0.75` may have been intended —
# confirm before changing.
persp3D(
  x = minsplit, y = cp, z = mcr,
  theta = 30, phi = 50,
  axes = TRUE, box = TRUE, scale = 0.75,
  nticks = 5, ticktype = "detailed",
  xlab = "minsplit", ylab = "cp", zlab = "AUROC"
)

# Give the long-format results table readable column names.
names(mcrdf) <- c("minsplit", "cp", "AUC")

그래프를 보면 minsplit 10 근처, cp 0.008~0.010 구간에 짙은 적색으로 표시된 AUROC 최댓값이 위치하는 것으로 보인다.

4) AUC를 max로 하는 minsplit과 cp 찾기

# Keep every grid point whose AUC ties for the maximum, then display them.
best_auc <- max(mcrdf$AUC)
df <- filter(mcrdf, AUC == best_auc)

print(df)
##   minsplit    cp       AUC
## 1       11 0.009 0.7125245
## 2       11 0.010 0.7125245

그래프에서 예상한 대로, minsplit = 11이고 cp가 0.009 또는 0.010일 때 AUC가 약 0.7125로 가장 높다.