Linear Regression
· 12 min read
Cleaning data
airbnb 데이터를 활용해 선형회귀분석을 해보자
Importing data
library(tidyverse)
setwd("C:/Users/dhxog/Desktop/4-1학기/데이터마이닝")
data <- readxl::read_xlsx("airbnb.xlsx")
summary(data)
## id log_price property_type room_type
## Min. : 344 Min. :0.000 Length:74111 Length:74111
## 1st Qu.: 6261964 1st Qu.:4.317 Class :character Class :character
## Median :12254147 Median :4.710 Mode :character Mode :character
## Mean :11266617 Mean :4.782
## 3rd Qu.:16402260 3rd Qu.:5.220
## Max. :21230903 Max. :7.600
##
## amenities accommodates bathrooms bed_type
## Length:74111 Min. : 1.000 Min. :0.000 Length:74111
## Class :character 1st Qu.: 2.000 1st Qu.:1.000 Class :character
## Mode :character Median : 2.000 Median :1.000 Mode :character
## Mean : 3.155 Mean :1.235
## 3rd Qu.: 4.000 3rd Qu.:1.000
## Max. :16.000 Max. :8.000
## NA's :200
## cancellation_policy cleaning_fee city description
## Length:74111 Mode :logical Length:74111 Length:74111
## Class :character FALSE:19708 Class :character Class :character
## Mode :character TRUE :54403 Mode :character Mode :character
##
##
##
##
## first_review host_has_profile_pic host_identity_verified
## Min. :1930-01-11 00:00:00 Length:74111 Length:74111
## 1st Qu.:2006-06-16 00:00:00 Class :character Class :character
## Median :2014-06-15 00:00:00 Mode :character Mode :character
## Mean :2010-07-22 09:03:47
## 3rd Qu.:2021-09-17 00:00:00
## Max. :2029-12-16 00:00:00
## NA's :15864
## host_response_rate host_since instant_bookable
## Min. :0.000 Min. :1930-01-10 00:00:00 Length:74111
## 1st Qu.:1.000 1st Qu.:2007-02-10 12:00:00 Class :character
## Median :1.000 Median :2014-07-13 00:00:00 Mode :character
## Mean :0.944 Mean :2011-02-11 15:13:38
## 3rd Qu.:1.000 3rd Qu.:2022-01-14 12:00:00
## Max. :1.000 Max. :2029-12-16 00:00:00
## NA's :18299 NA's :188
## last_review latitude longitude
## Min. :1930-01-15 00:00:00 Min. :33.34 Min. :-122.51
## 1st Qu.:2006-06-14 00:00:00 1st Qu.:34.13 1st Qu.:-118.34
## Median :2016-04-17 00:00:00 Median :40.66 Median : -77.00
## Mean :2010-01-08 15:39:02 Mean :38.45 Mean : -92.40
## 3rd Qu.:2023-04-17 00:00:00 3rd Qu.:40.75 3rd Qu.: -73.95
## Max. :2029-12-16 00:00:00 Max. :42.39 Max. : -70.99
## NA's :15827
## name neighbourhood number_of_reviews review_scores_rating
## Length:74111 Length:74111 Min. : 0.0 Min. : 20.00
## Class :character Class :character 1st Qu.: 1.0 1st Qu.: 92.00
## Mode :character Mode :character Median : 6.0 Median : 96.00
## Mean : 20.9 Mean : 94.07
## 3rd Qu.: 23.0 3rd Qu.:100.00
## Max. :605.0 Max. :100.00
## NA's :16722
## thumbnail_url zipcode bedrooms beds
## Length:74111 Length:74111 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 1.000 1st Qu.: 1.000
## Mode :character Mode :character Median : 1.000 Median : 1.000
## Mean : 1.266 Mean : 1.711
## 3rd Qu.: 1.000 3rd Qu.: 2.000
## Max. :10.000 Max. :18.000
## NA's :91 NA's :131
Changing variables
a. number_of_reviews 11이상만
data <- data %>%
filter(data$number_of_reviews >= 11)
b. amenities를 “,”를 기준으로 나누고, 나뉜 갯수(편의용품 갯수)로 바꾸기
temp <- data$amenities %>% strsplit(split = ",") %>% summary()
data$amenities <- as.numeric(temp[,1])
c. 가격비 변수 만들기
mean_bycity = data %>%
group_by(city) %>%
summarise(mbc = mean(exp(log_price)))
## `summarise()` ungrouping output (override with `.groups` argument)
data <- merge(data, mean_bycity)
data = data %>%
mutate(priceratio = exp(log_price) / mbc * 100)
d. 회귀분석과 관련 없는 변수 or y변수 생성에 쓰인 변수 등 빼기
data = subset(data, select = c(-id,-log_price,
-description, -first_review, -host_since, -last_review, -name,
-neighbourhood, -thumbnail_url,-zipcode,-mbc ))
e. NA 다루기
일단 칼럼별 NA의 비율을 구하자
nas = rep(0, length(data[1,]))
for(i in 1:length(data[1,])){
nas[i] = sum(is.na(data[,i]))
colname = names(data)
print(paste(colname[i], ":" , round(nas[i]/nrow(data)*100, digits = 2), "%"))
}
## [1] "city : 0 %"
## [1] "property_type : 0 %"
## [1] "room_type : 0 %"
## [1] "amenities : 0 %"
## [1] "accommodates : 0 %"
## [1] "bathrooms : 0.35 %"
## [1] "bed_type : 0 %"
## [1] "cancellation_policy : 0 %"
## [1] "cleaning_fee : 0 %"
## [1] "host_has_profile_pic : 0.22 %"
## [1] "host_identity_verified : 0.22 %"
## [1] "host_response_rate : 5.5 %"
## [1] "instant_bookable : 0 %"
## [1] "latitude : 0 %"
## [1] "longitude : 0 %"
## [1] "number_of_reviews : 0 %"
## [1] "review_scores_rating : 0.08 %"
## [1] "bedrooms : 0.14 %"
## [1] "beds : 0.06 %"
## [1] "priceratio : 0 %"
NA의 비율이 상당히 낮기 때문에 빼기로 결정
data <- data %>% na.omit()
f. property_type 변환
data$property_type[data$property_type != "Apartment" & data$property_type != "House"] = "Other"
g. bed_type 변환
data$bed_type[data$bed_type != "Real Bed"] = "Other"
data$bed_type[data$bed_type == "Real Bed"] = "Bed"
h. instant_bookable 변환
data$instant_bookable[data$instant_bookable=="t"] = as.logical(TRUE)
data$instant_bookable[data$instant_bookable=="f"] = as.logical(FALSE)
data$instant_bookable <- as.logical(data$instant_bookable)
i. host_identity_verified 변환
data$host_identity_verified[data$host_identity_verified=='t'] = TRUE
data$host_identity_verified[data$host_identity_verified=='f'] = FALSE
data$host_identity_verified <- as.logical(data$host_identity_verified)
j. host_has_profile_pic 변환
data$host_has_profile_pic[data$host_has_profile_pic=='t'] = as.logical(TRUE)
data$host_has_profile_pic[data$host_has_profile_pic=='f'] = as.logical(FALSE)
data$host_has_profile_pic <- as.logical(data$host_has_profile_pic)
변환 결과
str(data)
## 'data.frame': 27268 obs. of 20 variables:
## $ city : chr "Boston" "Boston" "Boston" "Boston" ...
## $ property_type : chr "House" "Apartment" "Apartment" "Apartment" ...
## $ room_type : chr "Entire home/apt" "Entire home/apt" "Entire home/apt" "Entire home/apt" ...
## $ amenities : num 28 22 20 20 16 15 24 7 18 18 ...
## $ accommodates : num 10 2 3 5 1 4 2 1 2 6 ...
## $ bathrooms : num 2.5 1 1 1 1 1 1 1 1 1 ...
## $ bed_type : chr "Bed" "Bed" "Bed" "Bed" ...
## $ cancellation_policy : chr "strict" "strict" "moderate" "moderate" ...
## $ cleaning_fee : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ host_has_profile_pic : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ host_identity_verified: logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ host_response_rate : num 1 1 1 1 1 0.88 1 1 1 1 ...
## $ instant_bookable : logi TRUE TRUE TRUE TRUE FALSE FALSE ...
## $ latitude : num 42.4 42.4 42.4 42.4 42.3 ...
## $ longitude : num -71.1 -71.1 -71.1 -71 -71.1 ...
## $ number_of_reviews : num 128 138 159 165 94 12 37 130 12 92 ...
## $ review_scores_rating : num 98 93 96 91 91 87 93 97 70 95 ...
## $ bedrooms : num 3 0 1 3 1 2 1 1 0 3 ...
## $ beds : num 9 1 2 3 1 2 1 1 1 4 ...
## $ priceratio : num 229.5 114.8 104.3 98.4 23 ...
## - attr(*, "na.action")= 'omit' Named int 3 8 26 31 88 97 145 225 290 313 ...
## ..- attr(*, "names")= chr "3" "8" "26" "31" ...
summary(data)
## city property_type room_type amenities
## Length:27268 Length:27268 Length:27268 Min. : 1.0
## Class :character Class :character Class :character 1st Qu.:15.0
## Mode :character Mode :character Mode :character Median :20.0
## Mean :19.9
## 3rd Qu.:23.0
## Max. :77.0
## accommodates bathrooms bed_type cancellation_policy
## Min. : 1.000 Min. :0.00 Length:27268 Length:27268
## 1st Qu.: 2.000 1st Qu.:1.00 Class :character Class :character
## Median : 2.000 Median :1.00 Mode :character Mode :character
## Mean : 3.424 Mean :1.22
## 3rd Qu.: 4.000 3rd Qu.:1.00
## Max. :16.000 Max. :8.00
## cleaning_fee host_has_profile_pic host_identity_verified host_response_rate
## Mode :logical Mode :logical Mode :logical Min. :0.0000
## FALSE:4189 FALSE:24 FALSE:5838 1st Qu.:1.0000
## TRUE :23079 TRUE :27244 TRUE :21430 Median :1.0000
## Mean :0.9692
## 3rd Qu.:1.0000
## Max. :1.0000
## instant_bookable latitude longitude number_of_reviews
## Mode :logical Min. :33.34 Min. :-122.51 Min. : 11.00
## FALSE:18879 1st Qu.:34.10 1st Qu.:-118.35 1st Qu.: 19.00
## TRUE :8389 Median :40.64 Median : -77.04 Median : 33.00
## Mean :38.34 Mean : -93.84 Mean : 50.64
## 3rd Qu.:40.75 3rd Qu.: -73.96 3rd Qu.: 64.00
## Max. :42.39 Max. : -71.00 Max. :605.00
## review_scores_rating bedrooms beds priceratio
## Min. : 58.00 Min. : 0.000 Min. : 0.000 Min. : 7.29
## 1st Qu.: 92.00 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 52.59
## Median : 95.00 Median : 1.000 Median : 1.000 Median : 77.14
## Mean : 94.28 Mean : 1.268 Mean : 1.841 Mean : 100.05
## 3rd Qu.: 98.00 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 119.18
## Max. :100.00 Max. :10.000 Max. :18.000 Max. :1423.76
훨씬 깔끔하다!
Problem 1. 가격비의 평균과 표준편차 구하기
#mean
data$priceratio %>% mean
## [1] 100.0486
#sd
data$priceratio %>% sd
## [1] 83.8321
Problem 2. 가격비를 y변수로 선형회귀분석
lm1 = lm(priceratio~., data = data)
step_used1 <- step(lm1, direction = 'both')
plot(step_used1, which = 1)
plot(step_used1, which = 2)
등분산성과 정규성을 충족하지 못하는 것으로 보이므로, y변수에 대해 log변환을 실시하였다.
lm2 = lm(log(priceratio)~., data = data)
step_used2 = step(lm2, direction = "both")
## Start: AIC=-53791.28
## log(priceratio) ~ city + property_type + room_type + amenities +
## accommodates + bathrooms + bed_type + cancellation_policy +
## cleaning_fee + host_has_profile_pic + host_identity_verified +
## host_response_rate + instant_bookable + latitude + longitude +
## number_of_reviews + review_scores_rating + bedrooms + beds
##
## Df Sum of Sq RSS AIC
## - host_has_profile_pic 1 0.05 3784.5 -53793
## - number_of_reviews 1 0.23 3784.7 -53792
## <none> 3784.4 -53791
## - host_identity_verified 1 0.32 3784.8 -53791
## - cleaning_fee 1 0.37 3784.8 -53791
## - latitude 1 0.52 3785.0 -53790
## - bed_type 1 1.25 3785.7 -53784
## - instant_bookable 1 5.99 3790.4 -53750
## - host_response_rate 1 6.22 3790.7 -53749
## - property_type 2 10.95 3795.4 -53716
## - amenities 1 13.27 3797.7 -53698
## - beds 1 22.13 3806.6 -53634
## - cancellation_policy 4 24.02 3808.5 -53627
## - bathrooms 1 75.01 3859.4 -53258
## - accommodates 1 130.52 3915.0 -52869
## - review_scores_rating 1 131.42 3915.9 -52862
## - bedrooms 1 200.92 3985.4 -52383
## - longitude 1 244.81 4029.2 -52084
## - city 5 292.37 4076.8 -51772
## - room_type 2 1735.16 5519.6 -43504
##
## Step: AIC=-53792.92
## log(priceratio) ~ city + property_type + room_type + amenities +
## accommodates + bathrooms + bed_type + cancellation_policy +
## cleaning_fee + host_identity_verified + host_response_rate +
## instant_bookable + latitude + longitude + number_of_reviews +
## review_scores_rating + bedrooms + beds
##
## Df Sum of Sq RSS AIC
## - number_of_reviews 1 0.23 3784.7 -53793
## <none> 3784.5 -53793
## - host_identity_verified 1 0.31 3784.8 -53793
## - cleaning_fee 1 0.37 3784.9 -53792
## + host_has_profile_pic 1 0.05 3784.4 -53791
## - latitude 1 0.52 3785.0 -53791
## - bed_type 1 1.25 3785.7 -53786
## - instant_bookable 1 5.98 3790.5 -53752
## - host_response_rate 1 6.22 3790.7 -53750
## - property_type 2 10.94 3795.4 -53718
## - amenities 1 13.26 3797.7 -53700
## - beds 1 22.10 3806.6 -53636
## - cancellation_policy 4 24.01 3808.5 -53628
## - bathrooms 1 75.00 3859.5 -53260
## - accommodates 1 130.51 3915.0 -52870
## - review_scores_rating 1 131.42 3915.9 -52864
## - bedrooms 1 200.87 3985.4 -52385
## - longitude 1 244.76 4029.3 -52086
## - city 5 292.34 4076.8 -51774
## - room_type 2 1735.25 5519.7 -43505
##
## Step: AIC=-53793.28
## log(priceratio) ~ city + property_type + room_type + amenities +
## accommodates + bathrooms + bed_type + cancellation_policy +
## cleaning_fee + host_identity_verified + host_response_rate +
## instant_bookable + latitude + longitude + review_scores_rating +
## bedrooms + beds
##
## Df Sum of Sq RSS AIC
## - host_identity_verified 1 0.26 3785.0 -53793
## <none> 3784.7 -53793
## - cleaning_fee 1 0.33 3785.0 -53793
## + number_of_reviews 1 0.23 3784.5 -53793
## + host_has_profile_pic 1 0.05 3784.7 -53792
## - latitude 1 0.54 3785.3 -53791
## - bed_type 1 1.26 3786.0 -53786
## - instant_bookable 1 6.13 3790.9 -53751
## - host_response_rate 1 6.42 3791.1 -53749
## - property_type 2 11.08 3795.8 -53718
## - amenities 1 13.17 3797.9 -53701
## - beds 1 22.15 3806.9 -53636
## - cancellation_policy 4 23.86 3808.6 -53630
## - bathrooms 1 75.39 3860.1 -53257
## - accommodates 1 130.32 3915.0 -52872
## - review_scores_rating 1 131.86 3916.6 -52861
## - bedrooms 1 202.76 3987.5 -52372
## - longitude 1 244.61 4029.3 -52088
## - city 5 292.26 4077.0 -51775
## - room_type 2 1735.94 5520.7 -43503
##
## Step: AIC=-53793.4
## log(priceratio) ~ city + property_type + room_type + amenities +
## accommodates + bathrooms + bed_type + cancellation_policy +
## cleaning_fee + host_response_rate + instant_bookable + latitude +
## longitude + review_scores_rating + bedrooms + beds
##
## Df Sum of Sq RSS AIC
## <none> 3785.0 -53793
## + host_identity_verified 1 0.26 3784.7 -53793
## - cleaning_fee 1 0.30 3785.3 -53793
## + number_of_reviews 1 0.18 3784.8 -53793
## + host_has_profile_pic 1 0.04 3784.9 -53792
## - latitude 1 0.53 3785.5 -53792
## - bed_type 1 1.25 3786.2 -53786
## - instant_bookable 1 6.39 3791.4 -53749
## - host_response_rate 1 6.41 3791.4 -53749
## - property_type 2 11.06 3796.0 -53718
## - amenities 1 13.59 3798.6 -53698
## - beds 1 22.14 3807.1 -53636
## - cancellation_policy 4 24.12 3809.1 -53628
## - bathrooms 1 75.35 3860.3 -53258
## - accommodates 1 130.31 3915.3 -52872
## - review_scores_rating 1 132.34 3917.3 -52858
## - bedrooms 1 202.74 3987.7 -52373
## - longitude 1 244.67 4029.7 -52087
## - city 5 292.01 4077.0 -51777
## - room_type 2 1735.79 5520.8 -43504
summary(step_used2)
##
## Call:
## lm(formula = log(priceratio) ~ city + property_type + room_type +
## amenities + accommodates + bathrooms + bed_type + cancellation_policy +
## cleaning_fee + host_response_rate + instant_bookable + latitude +
## longitude + review_scores_rating + bedrooms + beds, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.55964 -0.23796 -0.00661 0.23018 2.59788
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.807e+01 2.291e+00 -34.072 < 2e-16 ***
## cityChicago -1.825e+01 4.346e-01 -42.007 < 2e-16 ***
## cityDC -6.337e+00 1.976e-01 -32.062 < 2e-16 ***
## cityLA -5.136e+01 1.278e+00 -40.189 < 2e-16 ***
## cityNYC -2.991e+00 9.496e-02 -31.496 < 2e-16 ***
## citySF -5.601e+01 1.358e+00 -41.257 < 2e-16 ***
## property_typeHouse -4.532e-02 5.979e-03 -7.579 3.59e-14 ***
## property_typeOther 1.715e-02 7.317e-03 2.344 0.01909 *
## room_typePrivate room -5.860e-01 5.715e-03 -102.531 < 2e-16 ***
## room_typeShared room -1.108e+00 1.687e-02 -65.679 < 2e-16 ***
## amenities 3.494e-03 3.533e-04 9.889 < 2e-16 ***
## accommodates 6.676e-02 2.180e-03 30.625 < 2e-16 ***
## bathrooms 1.172e-01 5.034e-03 23.288 < 2e-16 ***
## bed_typeOther -4.160e-02 1.389e-02 -2.995 0.00275 **
## cancellation_policymoderate 3.002e-02 7.566e-03 3.968 7.26e-05 ***
## cancellation_policystrict 7.549e-02 7.255e-03 10.405 < 2e-16 ***
## cancellation_policysuper_strict_30 4.078e-01 7.103e-02 5.742 9.48e-09 ***
## cancellation_policysuper_strict_60 6.674e-01 2.641e-01 2.527 0.01152 *
## cleaning_feeTRUE -9.666e-03 6.614e-03 -1.462 0.14389
## host_response_rate -1.568e-01 2.309e-02 -6.791 1.13e-11 ***
## instant_bookableTRUE -3.420e-02 5.043e-03 -6.782 1.21e-11 ***
## latitude 6.594e-02 3.363e-02 1.961 0.04994 *
## longitude -1.097e+00 2.614e-02 -41.964 < 2e-16 ***
## review_scores_rating 1.614e-02 5.230e-04 30.862 < 2e-16 ***
## bedrooms 1.651e-01 4.322e-03 38.200 < 2e-16 ***
## beds -3.995e-02 3.164e-03 -12.624 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3727 on 27242 degrees of freedom
## Multiple R-squared: 0.6468, Adjusted R-squared: 0.6464
## F-statistic: 1995 on 25 and 27242 DF, p-value: < 2.2e-16
R^2가 0.6468으로. 나름 괜찮아 보인다.
plot(step_used2, which = 1)
plot(step_used2, which = 2)