Linear Regression

Cleaning data

airbnb 데이터를 활용해 선형회귀분석을 해보자

Importing data

library(tidyverse)
setwd("C:/Users/dhxog/Desktop/4-1학기/데이터마이닝")
data <- readxl::read_xlsx("airbnb.xlsx")
summary(data)
##        id             log_price     property_type       room_type        
##  Min.   :     344   Min.   :0.000   Length:74111       Length:74111      
##  1st Qu.: 6261964   1st Qu.:4.317   Class :character   Class :character  
##  Median :12254147   Median :4.710   Mode  :character   Mode  :character  
##  Mean   :11266617   Mean   :4.782                                        
##  3rd Qu.:16402260   3rd Qu.:5.220                                        
##  Max.   :21230903   Max.   :7.600                                        
##                                                                          
##   amenities          accommodates      bathrooms       bed_type        
##  Length:74111       Min.   : 1.000   Min.   :0.000   Length:74111      
##  Class :character   1st Qu.: 2.000   1st Qu.:1.000   Class :character  
##  Mode  :character   Median : 2.000   Median :1.000   Mode  :character  
##                     Mean   : 3.155   Mean   :1.235                     
##                     3rd Qu.: 4.000   3rd Qu.:1.000                     
##                     Max.   :16.000   Max.   :8.000                     
##                                      NA's   :200                       
##  cancellation_policy cleaning_fee        city           description       
##  Length:74111        Mode :logical   Length:74111       Length:74111      
##  Class :character    FALSE:19708     Class :character   Class :character  
##  Mode  :character    TRUE :54403     Mode  :character   Mode  :character  
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##   first_review                 host_has_profile_pic host_identity_verified
##  Min.   :1930-01-11 00:00:00   Length:74111         Length:74111          
##  1st Qu.:2006-06-16 00:00:00   Class :character     Class :character      
##  Median :2014-06-15 00:00:00   Mode  :character     Mode  :character      
##  Mean   :2010-07-22 09:03:47                                              
##  3rd Qu.:2021-09-17 00:00:00                                              
##  Max.   :2029-12-16 00:00:00                                              
##  NA's   :15864                                                            
##  host_response_rate   host_since                  instant_bookable  
##  Min.   :0.000      Min.   :1930-01-10 00:00:00   Length:74111      
##  1st Qu.:1.000      1st Qu.:2007-02-10 12:00:00   Class :character  
##  Median :1.000      Median :2014-07-13 00:00:00   Mode  :character  
##  Mean   :0.944      Mean   :2011-02-11 15:13:38                     
##  3rd Qu.:1.000      3rd Qu.:2022-01-14 12:00:00                     
##  Max.   :1.000      Max.   :2029-12-16 00:00:00                     
##  NA's   :18299      NA's   :188                                     
##   last_review                     latitude       longitude      
##  Min.   :1930-01-15 00:00:00   Min.   :33.34   Min.   :-122.51  
##  1st Qu.:2006-06-14 00:00:00   1st Qu.:34.13   1st Qu.:-118.34  
##  Median :2016-04-17 00:00:00   Median :40.66   Median : -77.00  
##  Mean   :2010-01-08 15:39:02   Mean   :38.45   Mean   : -92.40  
##  3rd Qu.:2023-04-17 00:00:00   3rd Qu.:40.75   3rd Qu.: -73.95  
##  Max.   :2029-12-16 00:00:00   Max.   :42.39   Max.   : -70.99  
##  NA's   :15827                                                  
##      name           neighbourhood      number_of_reviews review_scores_rating
##  Length:74111       Length:74111       Min.   :  0.0     Min.   : 20.00      
##  Class :character   Class :character   1st Qu.:  1.0     1st Qu.: 92.00      
##  Mode  :character   Mode  :character   Median :  6.0     Median : 96.00      
##                                        Mean   : 20.9     Mean   : 94.07      
##                                        3rd Qu.: 23.0     3rd Qu.:100.00      
##                                        Max.   :605.0     Max.   :100.00      
##                                                          NA's   :16722       
##  thumbnail_url        zipcode             bedrooms           beds       
##  Length:74111       Length:74111       Min.   : 0.000   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.: 1.000   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 1.000   Median : 1.000  
##                                        Mean   : 1.266   Mean   : 1.711  
##                                        3rd Qu.: 1.000   3rd Qu.: 2.000  
##                                        Max.   :10.000   Max.   :18.000  
##                                        NA's   :91       NA's   :131

Changing variables

a. number_of_reviews 11이상만

data <- data %>%
  filter(data$number_of_reviews >= 11)

b. amenities를 “,”를 기준으로 나누고, 나뉜 갯수(편의용품 갯수)로 바꾸기

temp <- data$amenities %>% strsplit(split = ",") %>% summary()
data$amenities <- as.numeric(temp[,1])

c. 가격비 변수 만들기

mean_bycity = data %>% 
  group_by(city) %>%
  summarise(mbc = mean(exp(log_price)))
## `summarise()` ungrouping output (override with `.groups` argument)
data <- merge(data, mean_bycity)
data = data %>% 
  mutate(priceratio = exp(log_price) / mbc * 100)

d. 회귀분석과 관련 없는 변수 or y변수 생성에 쓰인 변수 등 빼기

data = subset(data, select = c(-id,-log_price,
-description, -first_review, -host_since, -last_review, -name,
-neighbourhood, -thumbnail_url,-zipcode,-mbc ))

e. NA 다루기

일단 칼럼별 NA의 비율을 구하자

nas = rep(0, length(data[1,]))
for(i in 1:length(data[1,])){
  nas[i] = sum(is.na(data[,i]))
  colname = names(data)
  print(paste(colname[i],  ":" , round(nas[i]/nrow(data)*100, digits = 2), "%"))
} 
## [1] "city : 0 %"
## [1] "property_type : 0 %"
## [1] "room_type : 0 %"
## [1] "amenities : 0 %"
## [1] "accommodates : 0 %"
## [1] "bathrooms : 0.35 %"
## [1] "bed_type : 0 %"
## [1] "cancellation_policy : 0 %"
## [1] "cleaning_fee : 0 %"
## [1] "host_has_profile_pic : 0.22 %"
## [1] "host_identity_verified : 0.22 %"
## [1] "host_response_rate : 5.5 %"
## [1] "instant_bookable : 0 %"
## [1] "latitude : 0 %"
## [1] "longitude : 0 %"
## [1] "number_of_reviews : 0 %"
## [1] "review_scores_rating : 0.08 %"
## [1] "bedrooms : 0.14 %"
## [1] "beds : 0.06 %"
## [1] "priceratio : 0 %"

NA의 비율이 상당히 낮기 때문에 빼기로 결정

data <- data %>% na.omit()

f. property_type 변환

data$property_type[data$property_type != "Apartment" & data$property_type != "House"] = "Other"

g. bed_type 변환

data$bed_type[data$bed_type != "Real Bed"] = "Other"
data$bed_type[data$bed_type == "Real Bed"] = "Bed"

h. instant_bookable 변환

data$instant_bookable[data$instant_bookable=="t"] = as.logical(TRUE)
data$instant_bookable[data$instant_bookable=="f"] = as.logical(FALSE)
data$instant_bookable <- as.logical(data$instant_bookable)

i. host_identity_verified 변환

data$host_identity_verified[data$host_identity_verified=='t'] = TRUE
data$host_identity_verified[data$host_identity_verified=='f'] = FALSE
data$host_identity_verified <- as.logical(data$host_identity_verified)

j. host_has_profile_pic 변환

data$host_has_profile_pic[data$host_has_profile_pic=='t'] = as.logical(TRUE)
data$host_has_profile_pic[data$host_has_profile_pic=='f'] = as.logical(FALSE)
data$host_has_profile_pic <- as.logical(data$host_has_profile_pic)

변환 결과

str(data)
## 'data.frame':    27268 obs. of  20 variables:
##  $ city                  : chr  "Boston" "Boston" "Boston" "Boston" ...
##  $ property_type         : chr  "House" "Apartment" "Apartment" "Apartment" ...
##  $ room_type             : chr  "Entire home/apt" "Entire home/apt" "Entire home/apt" "Entire home/apt" ...
##  $ amenities             : num  28 22 20 20 16 15 24 7 18 18 ...
##  $ accommodates          : num  10 2 3 5 1 4 2 1 2 6 ...
##  $ bathrooms             : num  2.5 1 1 1 1 1 1 1 1 1 ...
##  $ bed_type              : chr  "Bed" "Bed" "Bed" "Bed" ...
##  $ cancellation_policy   : chr  "strict" "strict" "moderate" "moderate" ...
##  $ cleaning_fee          : logi  TRUE TRUE TRUE TRUE FALSE TRUE ...
##  $ host_has_profile_pic  : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ host_identity_verified: logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ host_response_rate    : num  1 1 1 1 1 0.88 1 1 1 1 ...
##  $ instant_bookable      : logi  TRUE TRUE TRUE TRUE FALSE FALSE ...
##  $ latitude              : num  42.4 42.4 42.4 42.4 42.3 ...
##  $ longitude             : num  -71.1 -71.1 -71.1 -71 -71.1 ...
##  $ number_of_reviews     : num  128 138 159 165 94 12 37 130 12 92 ...
##  $ review_scores_rating  : num  98 93 96 91 91 87 93 97 70 95 ...
##  $ bedrooms              : num  3 0 1 3 1 2 1 1 0 3 ...
##  $ beds                  : num  9 1 2 3 1 2 1 1 1 4 ...
##  $ priceratio            : num  229.5 114.8 104.3 98.4 23 ...
##  - attr(*, "na.action")= 'omit' Named int  3 8 26 31 88 97 145 225 290 313 ...
##   ..- attr(*, "names")= chr  "3" "8" "26" "31" ...
summary(data)
##      city           property_type       room_type           amenities   
##  Length:27268       Length:27268       Length:27268       Min.   : 1.0  
##  Class :character   Class :character   Class :character   1st Qu.:15.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :20.0  
##                                                           Mean   :19.9  
##                                                           3rd Qu.:23.0  
##                                                           Max.   :77.0  
##   accommodates      bathrooms      bed_type         cancellation_policy
##  Min.   : 1.000   Min.   :0.00   Length:27268       Length:27268       
##  1st Qu.: 2.000   1st Qu.:1.00   Class :character   Class :character   
##  Median : 2.000   Median :1.00   Mode  :character   Mode  :character   
##  Mean   : 3.424   Mean   :1.22                                         
##  3rd Qu.: 4.000   3rd Qu.:1.00                                         
##  Max.   :16.000   Max.   :8.00                                         
##  cleaning_fee    host_has_profile_pic host_identity_verified host_response_rate
##  Mode :logical   Mode :logical        Mode :logical          Min.   :0.0000    
##  FALSE:4189      FALSE:24             FALSE:5838             1st Qu.:1.0000    
##  TRUE :23079     TRUE :27244          TRUE :21430            Median :1.0000    
##                                                              Mean   :0.9692    
##                                                              3rd Qu.:1.0000    
##                                                              Max.   :1.0000    
##  instant_bookable    latitude       longitude       number_of_reviews
##  Mode :logical    Min.   :33.34   Min.   :-122.51   Min.   : 11.00   
##  FALSE:18879      1st Qu.:34.10   1st Qu.:-118.35   1st Qu.: 19.00   
##  TRUE :8389       Median :40.64   Median : -77.04   Median : 33.00   
##                   Mean   :38.34   Mean   : -93.84   Mean   : 50.64   
##                   3rd Qu.:40.75   3rd Qu.: -73.96   3rd Qu.: 64.00   
##                   Max.   :42.39   Max.   : -71.00   Max.   :605.00   
##  review_scores_rating    bedrooms           beds          priceratio     
##  Min.   : 58.00       Min.   : 0.000   Min.   : 0.000   Min.   :   7.29  
##  1st Qu.: 92.00       1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:  52.59  
##  Median : 95.00       Median : 1.000   Median : 1.000   Median :  77.14  
##  Mean   : 94.28       Mean   : 1.268   Mean   : 1.841   Mean   : 100.05  
##  3rd Qu.: 98.00       3rd Qu.: 1.000   3rd Qu.: 2.000   3rd Qu.: 119.18  
##  Max.   :100.00       Max.   :10.000   Max.   :18.000   Max.   :1423.76

훨씬 깔끔하다!

Problem 1. 가격비의 평균과 표준편차 구하기

#mean
data$priceratio %>% mean
## [1] 100.0486
#sd
data$priceratio %>% sd
## [1] 83.8321

Problem 2. 가격비를 y변수로 선형회귀분석

lm1 = lm(priceratio~., data = data)
step_used1 <- step(lm1, direction = 'both')
plot(step_used1, which = 1)

plot(step_used1, which = 2) 

등분산성과 정규성을 충족하지 못하는 것으로 보이므로, y변수에 대해 log변환을 실시하였다.

lm2 = lm(log(priceratio)~., data = data)
step_used2 = step(lm2, direction = "both")
## Start:  AIC=-53791.28
## log(priceratio) ~ city + property_type + room_type + amenities + 
##     accommodates + bathrooms + bed_type + cancellation_policy + 
##     cleaning_fee + host_has_profile_pic + host_identity_verified + 
##     host_response_rate + instant_bookable + latitude + longitude + 
##     number_of_reviews + review_scores_rating + bedrooms + beds
## 
##                          Df Sum of Sq    RSS    AIC
## - host_has_profile_pic    1      0.05 3784.5 -53793
## - number_of_reviews       1      0.23 3784.7 -53792
## <none>                                3784.4 -53791
## - host_identity_verified  1      0.32 3784.8 -53791
## - cleaning_fee            1      0.37 3784.8 -53791
## - latitude                1      0.52 3785.0 -53790
## - bed_type                1      1.25 3785.7 -53784
## - instant_bookable        1      5.99 3790.4 -53750
## - host_response_rate      1      6.22 3790.7 -53749
## - property_type           2     10.95 3795.4 -53716
## - amenities               1     13.27 3797.7 -53698
## - beds                    1     22.13 3806.6 -53634
## - cancellation_policy     4     24.02 3808.5 -53627
## - bathrooms               1     75.01 3859.4 -53258
## - accommodates            1    130.52 3915.0 -52869
## - review_scores_rating    1    131.42 3915.9 -52862
## - bedrooms                1    200.92 3985.4 -52383
## - longitude               1    244.81 4029.2 -52084
## - city                    5    292.37 4076.8 -51772
## - room_type               2   1735.16 5519.6 -43504
## 
## Step:  AIC=-53792.92
## log(priceratio) ~ city + property_type + room_type + amenities + 
##     accommodates + bathrooms + bed_type + cancellation_policy + 
##     cleaning_fee + host_identity_verified + host_response_rate + 
##     instant_bookable + latitude + longitude + number_of_reviews + 
##     review_scores_rating + bedrooms + beds
## 
##                          Df Sum of Sq    RSS    AIC
## - number_of_reviews       1      0.23 3784.7 -53793
## <none>                                3784.5 -53793
## - host_identity_verified  1      0.31 3784.8 -53793
## - cleaning_fee            1      0.37 3784.9 -53792
## + host_has_profile_pic    1      0.05 3784.4 -53791
## - latitude                1      0.52 3785.0 -53791
## - bed_type                1      1.25 3785.7 -53786
## - instant_bookable        1      5.98 3790.5 -53752
## - host_response_rate      1      6.22 3790.7 -53750
## - property_type           2     10.94 3795.4 -53718
## - amenities               1     13.26 3797.7 -53700
## - beds                    1     22.10 3806.6 -53636
## - cancellation_policy     4     24.01 3808.5 -53628
## - bathrooms               1     75.00 3859.5 -53260
## - accommodates            1    130.51 3915.0 -52870
## - review_scores_rating    1    131.42 3915.9 -52864
## - bedrooms                1    200.87 3985.4 -52385
## - longitude               1    244.76 4029.3 -52086
## - city                    5    292.34 4076.8 -51774
## - room_type               2   1735.25 5519.7 -43505
## 
## Step:  AIC=-53793.28
## log(priceratio) ~ city + property_type + room_type + amenities + 
##     accommodates + bathrooms + bed_type + cancellation_policy + 
##     cleaning_fee + host_identity_verified + host_response_rate + 
##     instant_bookable + latitude + longitude + review_scores_rating + 
##     bedrooms + beds
## 
##                          Df Sum of Sq    RSS    AIC
## - host_identity_verified  1      0.26 3785.0 -53793
## <none>                                3784.7 -53793
## - cleaning_fee            1      0.33 3785.0 -53793
## + number_of_reviews       1      0.23 3784.5 -53793
## + host_has_profile_pic    1      0.05 3784.7 -53792
## - latitude                1      0.54 3785.3 -53791
## - bed_type                1      1.26 3786.0 -53786
## - instant_bookable        1      6.13 3790.9 -53751
## - host_response_rate      1      6.42 3791.1 -53749
## - property_type           2     11.08 3795.8 -53718
## - amenities               1     13.17 3797.9 -53701
## - beds                    1     22.15 3806.9 -53636
## - cancellation_policy     4     23.86 3808.6 -53630
## - bathrooms               1     75.39 3860.1 -53257
## - accommodates            1    130.32 3915.0 -52872
## - review_scores_rating    1    131.86 3916.6 -52861
## - bedrooms                1    202.76 3987.5 -52372
## - longitude               1    244.61 4029.3 -52088
## - city                    5    292.26 4077.0 -51775
## - room_type               2   1735.94 5520.7 -43503
## 
## Step:  AIC=-53793.4
## log(priceratio) ~ city + property_type + room_type + amenities + 
##     accommodates + bathrooms + bed_type + cancellation_policy + 
##     cleaning_fee + host_response_rate + instant_bookable + latitude + 
##     longitude + review_scores_rating + bedrooms + beds
## 
##                          Df Sum of Sq    RSS    AIC
## <none>                                3785.0 -53793
## + host_identity_verified  1      0.26 3784.7 -53793
## - cleaning_fee            1      0.30 3785.3 -53793
## + number_of_reviews       1      0.18 3784.8 -53793
## + host_has_profile_pic    1      0.04 3784.9 -53792
## - latitude                1      0.53 3785.5 -53792
## - bed_type                1      1.25 3786.2 -53786
## - instant_bookable        1      6.39 3791.4 -53749
## - host_response_rate      1      6.41 3791.4 -53749
## - property_type           2     11.06 3796.0 -53718
## - amenities               1     13.59 3798.6 -53698
## - beds                    1     22.14 3807.1 -53636
## - cancellation_policy     4     24.12 3809.1 -53628
## - bathrooms               1     75.35 3860.3 -53258
## - accommodates            1    130.31 3915.3 -52872
## - review_scores_rating    1    132.34 3917.3 -52858
## - bedrooms                1    202.74 3987.7 -52373
## - longitude               1    244.67 4029.7 -52087
## - city                    5    292.01 4077.0 -51777
## - room_type               2   1735.79 5520.8 -43504
summary(step_used2)
## 
## Call:
## lm(formula = log(priceratio) ~ city + property_type + room_type + 
##     amenities + accommodates + bathrooms + bed_type + cancellation_policy + 
##     cleaning_fee + host_response_rate + instant_bookable + latitude + 
##     longitude + review_scores_rating + bedrooms + beds, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.55964 -0.23796 -0.00661  0.23018  2.59788 
## 
## Coefficients:
##                                      Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                        -7.807e+01  2.291e+00  -34.072  < 2e-16 ***
## cityChicago                        -1.825e+01  4.346e-01  -42.007  < 2e-16 ***
## cityDC                             -6.337e+00  1.976e-01  -32.062  < 2e-16 ***
## cityLA                             -5.136e+01  1.278e+00  -40.189  < 2e-16 ***
## cityNYC                            -2.991e+00  9.496e-02  -31.496  < 2e-16 ***
## citySF                             -5.601e+01  1.358e+00  -41.257  < 2e-16 ***
## property_typeHouse                 -4.532e-02  5.979e-03   -7.579 3.59e-14 ***
## property_typeOther                  1.715e-02  7.317e-03    2.344  0.01909 *  
## room_typePrivate room              -5.860e-01  5.715e-03 -102.531  < 2e-16 ***
## room_typeShared room               -1.108e+00  1.687e-02  -65.679  < 2e-16 ***
## amenities                           3.494e-03  3.533e-04    9.889  < 2e-16 ***
## accommodates                        6.676e-02  2.180e-03   30.625  < 2e-16 ***
## bathrooms                           1.172e-01  5.034e-03   23.288  < 2e-16 ***
## bed_typeOther                      -4.160e-02  1.389e-02   -2.995  0.00275 ** 
## cancellation_policymoderate         3.002e-02  7.566e-03    3.968 7.26e-05 ***
## cancellation_policystrict           7.549e-02  7.255e-03   10.405  < 2e-16 ***
## cancellation_policysuper_strict_30  4.078e-01  7.103e-02    5.742 9.48e-09 ***
## cancellation_policysuper_strict_60  6.674e-01  2.641e-01    2.527  0.01152 *  
## cleaning_feeTRUE                   -9.666e-03  6.614e-03   -1.462  0.14389    
## host_response_rate                 -1.568e-01  2.309e-02   -6.791 1.13e-11 ***
## instant_bookableTRUE               -3.420e-02  5.043e-03   -6.782 1.21e-11 ***
## latitude                            6.594e-02  3.363e-02    1.961  0.04994 *  
## longitude                          -1.097e+00  2.614e-02  -41.964  < 2e-16 ***
## review_scores_rating                1.614e-02  5.230e-04   30.862  < 2e-16 ***
## bedrooms                            1.651e-01  4.322e-03   38.200  < 2e-16 ***
## beds                               -3.995e-02  3.164e-03  -12.624  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3727 on 27242 degrees of freedom
## Multiple R-squared:  0.6468, Adjusted R-squared:  0.6464 
## F-statistic:  1995 on 25 and 27242 DF,  p-value: < 2.2e-16

R^2가 0.6468으로. 나름 괜찮아 보인다.

plot(step_used2, which = 1)

plot(step_used2, which = 2) 

등분산성성과 정규성이 이전보다 더 충족되어 보인다.