R_AIRBNB_Explaratory Data Analysis

[INDEX]

0. LOAD

1. EDA ( REFINE )

2. VISUALIZING

3. MODELLING

0. LOAD

For visualizing, get the “ggplot2”

library(ggplot2)
library(xgboost)
library(lattice)
library(caret)
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load the airbnb data which are train and test.

train <- read.csv("train_users_2.csv", header = T)
test <- read.csv("test_users.csv", header = T)
air <- rbind(train[ ,-ncol(train)], test)

Check the row number of both table.

paste("train data :", nrow(train), ", test data :", nrow(test),
      "is available.")
## [1] "train data : 213451 , test data : 62096 is available."
paste("In total, train + test : ", nrow(air))
## [1] "In total, train + test :  275547"

1. EDA

Before we go with xgboost, we should take several questions such as 1. Is it realistic for data modelling? 2. Is “NA” values acceptable range? 3. Is Outlier values acceptable range?

1.0 ID delete for Explaratory Data Analysis

air <- air[,-1]

1.1 Finding Missing data

summary(air)
##  date_account_created timestamp_first_active  date_first_booking
##  2014-07-23:  1105    Min.   :2.009e+13                :124543  
##  2014-07-22:  1052    1st Qu.:2.013e+13      2014-05-22:   248  
##  2014-07-17:   978    Median :2.014e+13      2014-06-11:   231  
##  2014-07-24:   923    Mean   :2.013e+13      2014-06-24:   226  
##  2014-07-18:   892    3rd Qu.:2.014e+13      2014-05-21:   225  
##  2014-07-21:   888    Max.   :2.014e+13      (Other)   : 87978  
##  (Other)   :269709                           NA's      : 62096  
##        gender            age           signup_method     signup_flow    
##  -unknown-:129480   Min.   :   1.00   basic   :198222   Min.   : 0.000  
##  FEMALE   : 77524   1st Qu.:  28.00   facebook: 74864   1st Qu.: 0.000  
##  MALE     : 68209   Median :  33.00   google  :  2438   Median : 0.000  
##  OTHER    :   334   Mean   :  47.15   weibo   :    23   Mean   : 4.292  
##                     3rd Qu.:  42.00                     3rd Qu.: 1.000  
##                     Max.   :2014.00                     Max.   :25.000  
##                     NA's   :116866                                      
##     language          affiliate_channel   affiliate_provider
##  en     :265538   direct       :181571   direct    :181270  
##  zh     :  2634   sem-brand    : 36439   google    : 65956  
##  fr     :  1508   sem-non-brand: 20075   other     : 13036  
##  es     :  1174   seo          : 14362   facebook  :  3996  
##  ko     :  1116   other        :  9547   bing      :  3719  
##  de     :   977   api          :  8167   craigslist:  3475  
##  (Other):  2600   (Other)      :  5386   (Other)   :  4095  
##   first_affiliate_tracked   signup_app           first_device_type 
##  untracked    :143181     Android: 10519   Mac Desktop    :106328  
##  linked       : 62064     iOS    : 34593   Windows Desktop: 86948  
##  omg          : 54859     Moweb  : 10517   iPhone         : 39814  
##  tracked-other:  6655     Web    :219918   iPad           : 18036  
##               :  6085                      Other/Unknown  : 11167  
##  product      :  2353                      Android Phone  :  9458  
##  (Other)      :   350                      (Other)        :  3796  
##        first_browser  
##  Chrome       :78671  
##  Safari       :53302  
##  -unknown-    :44394  
##  Firefox      :38665  
##  Mobile Safari:29636  
##  IE           :24744  
##  (Other)      : 6135

time_first_active data form is different with date_account_created and date_first_booking, so should convert this into same date form.

1.2 Convert into dateform

air[,"timestamp_first_active"] <- trimws(air$timestamp_first_active) # remove white space
air[, "timestamp_first_active"] <- substr(air$timestamp_first_active, 1, 8) # trim hour, minutes and seconds
x <- air$timestamp_first_active
air$timestamp_first_active <- as.Date(x, "%Y%m%d") # convert string to date

1.3 Gender is unknown, so we need to convert this value as NA

train[, "gender"] <- gsub("-unknown-", NA , train[, "gender"])

1.4 Percentage of “NA” values

paste(" NA Percentages of each columns ")
## [1] " NA Percentages of each columns "
for( i in 1 : ncol(train)){
  x <- sum(is.na(train[,i]))
  if (x != 0){
    y <- paste(colnames(train)[i] ," : ", round(x/nrow(train), 3) *100, "%")
    print(y)
  }
}
## [1] "gender  :  44.8 %"
## [1] "age  :  41.2 %"

Each of Gender and Age has 45% and 41% of NA Values, So it isn’t enough for modelling. >> Let’s make them more fit

1.5 Refine age values

summary(train$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   28.00   34.00   49.67   43.00 2014.00   87990

There is weired values in age, which is “2014”. (Remeber, this is “age”)

x1 <- train[ train$age >= 120 & !is.na(train$age), "age"]
table(x1) 
## x1
##  132  150 1924 1925 1926 1927 1928 1929 1931 1932 1933 1935 1936 1938 1942 
##    1    1    2    1    1    1    2    2    3    3    1    1    2    1    1 
## 1947 1949 1952 1953 1995 2008 2013 2014 
##    2    3    1    1    1    1   39  710
sum(x1==x1) 
## [1] 781
x2 <- train[ train$age < 18 & !is.na(train$age), "age"]
table(x2) 
## x2
##  1  2  4  5 15 16 17 
##  2  7  3 45  8 26 67
sum(x2==x2) 
## [1] 158

There are 781 values who are higer than 120, and there are 158 values which stands for under 18 years even if airbnb doesn’t allow under 18 years. So we should change this whole values into mean value.

age_temp <- na.omit(train$age)
age_temp <- age_temp[age_temp > 18 & age_temp <= 120 ]
age_temp <- round(mean(age_temp),2)

Average age is 37.55 years.

train[ (train$age < 18 | train$age >= 120 ) & !is.na(train$age) , "age"]  <- age_temp
train[ is.na(train$age), "age"] <- age_temp
table(is.na(train[, "age"]))
## 
##  FALSE 
## 213451

Refine complete.

2. EDA - VISUALIZING

x <- data.frame(table(air$gender))
colnames(x) <- c("gender","count")

ggplot(x, aes(x=gender,y=count)) +
  geom_bar(stat="identity", width = 0.5, fill = "deepskyblue") +
  geom_text(aes(label=count), vjust = 1.5, color = "white", size = 3.7) +
  theme_minimal()

As we can see in this graph, gender is not the values which gives diffrences.

x2 <- data.frame(table(air$signup_method))
colnames(x2) <- c("signup_method","count")

ggplot(x2, aes(x=signup_method, y = count)) +
  geom_bar(stat="identity", width = 0.5, fill = "orangered2") +
  geom_text(aes(label = count), vjust = 1.5 , color = "black", size = 3.7) +
  theme_minimal()

3. Apply XGBoost

### TRAIN/TEST

train_temp <- air[1:nrow(train),]
train_temp$label <- train$country_destination

#test <- air[nrow(train_temp)+1:nrow(air),] # ?????? test set
### train_temp / test_temp??? train set 30%
set <- round(nrow(train_temp)*0.7)
temp_train <- train_temp[1:set,]
temp_test <- train_temp[(set+1):nrow(train_temp),]
#actual <- temp_test$label

df_train <- lapply(temp_train, as.numeric)
df_train <- as.data.frame(df_train)
df_train$label <- df_train$label-1

#temp_test <- na.omit(temp_test)
df_test <- lapply(temp_test, as.numeric)
df_test <- as.data.frame(df_test)
actual <- df_test$label

xgb_data <- xgb.DMatrix(data=data.matrix(df_train[,-ncol(df_train)]), label=df_train$label)
xgb <- xgb.train(data = xgb_data, 
                 eta = 0.3, ## eta : Learning Rate, x??? ????????? (default = 0.5)
                 max_depth = 3,  ## max_depth, decision tree??? ?????? ???????????????
                 nround = 73, ## the max number of iterations (weak learners number)
                 subsample = 0.8, ## Sample Number of each sample group (?????? 0.5~0.8, max??? 1??????.)
                 colsample_bytree = 0.8,
                 seed = 1,
                 eval_metric = "merror", ## merror Multiclass classification error rate. It is calculated as (# wrong cases) / (# all cases).
                 objective = "multi:softprob",
                 num_class = 12,
                 print_every_n = 10,
                 gamma=2,
                 min_child_weight=6,
                 lambda=1,
                 alpha=0
)
# temp_test : Check the accuracy
test_pred <- predict(xgb, newdata=data.matrix(df_test[,-ncol(df_test)]))
test_prediction <- matrix(test_pred, nrow = 12,
                          ncol=length(test_pred)/12) %>%
  t() %>%
  data.frame() %>%
  mutate(label = actual,
         max_prob = max.col(., "last"))

results <- table(test_prediction$label, test_prediction$max_prob)
accuracy <- sum(test_prediction$label == test_prediction$max_prob)/nrow(temp_test)
print(accuracy)  
## [1] 0.8835793


반응형
Posted by JoeSung
,