Loading the reqired libraries

ifelse(require(pscl),{library("pscl")},install.packages("pscl"))
## [1] "pscl"
ifelse(require(DMwR),{library("DMwR")},install.packages("DMwR"))
## [1] "DMwR"
ifelse(require(boot),{library("boot")},install.packages("boot"))
## [1] "boot"
library(dplyr)
library(ggplot2)
library(ggthemes)
library(reshape)

The braziltourism file contains data from a survey of Brazilian tourists in the Atlantic Coastal Forest of Brazil. The goal of this survey was to study how the number of trips to adventure tourism areas in the region could be modeled as a function of characteristics of the tourists.

travel_data<-read.csv(file.choose(),header = T)

All features are self explanatory except Active - the number of adventure activities (e.g. mountaineering and hiking), passive - passive activities (e.g. sightseeing and picnicking) access.road - the importance to the respondent of the access road into the region and Travel.cost - estimated travel cost (in USD) of the respondents.

head(travel_data)
##   Age Sex  Income Travel.cost Access.road Active Passive Logged.income
## 1  35   0  250.00     48.0003           1      1       0      5.521461
## 2  25   1  218.75     52.2000           0      2       2      5.387930
## 3  37   0 1875.00    163.5000           0      0       0      7.536364
## 4  29   1  781.25     87.0000           0      0       0      6.660895
## 5  54   0  562.50     21.0000           0      0       0      6.332391
## 6  34   0  218.75     60.0000           1      0       1      5.387930
##   Trips
## 1     0
## 2     0
## 3     0
## 4     0
## 5     0
## 6     0
print("number of rows")
## [1] "number of rows"
nrow(travel_data)
## [1] 412
print("number of NAs")
## [1] "number of NAs"
apply(is.na(travel_data),2,sum)
##           Age           Sex        Income   Travel.cost   Access.road 
##             1             1            44             3             0 
##        Active       Passive Logged.income         Trips 
##             2             1            44             0
print("percentage of NAs")
## [1] "percentage of NAs"
apply(is.na(travel_data),2,function(x){ (sum(x)/nrow(travel_data))*100})
##           Age           Sex        Income   Travel.cost   Access.road 
##     0.2427184     0.2427184    10.6796117     0.7281553     0.0000000 
##        Active       Passive Logged.income         Trips 
##     0.4854369     0.2427184    10.6796117     0.0000000
percent_nas<-melt(data.frame(lapply((travel_data),function(x){ (sum(is.na(x))/nrow(travel_data))*100})))
## Using  as id variables
 # visualization

ggplot(percent_nas, aes(x = reorder(variable,value), y = value)) +
  geom_bar( stat="identity") +
  geom_text(aes(x = variable, y = 0.5, label = sprintf( "%.2f%%", value)),
            hjust=0, vjust=0.55, size = 4, colour = 'black',fontface = 'bold') +
  labs(x = 'Attributes', title = 'Percentage of Missing values') +
  coord_flip() + 
  theme_fivethirtyeight()

Lets perform a knn imputation on the data set - knn imputation finds k (default is 10) nearest neighbours to fill in the unknown (NA) values in a data set. For each case with any NA value it will search for its k most similar cases and use the values of these cases to fill in the unknowns.

noNa_data<-knnImputation(travel_data,4)
apply(is.na(noNa_data),2,function(x){ (sum(x)/nrow(travel_data))*100})
##           Age           Sex        Income   Travel.cost   Access.road 
##             0             0             0             0             0 
##        Active       Passive Logged.income         Trips 
##             0             0             0             0
#table(noNa_data$Sex)
#table(noNa_data$Active)
#table(noNa_data$Passive)
#table(noNa_data$Age)
#table(noNa_data$Access.road)
#table(noNa_data$Trips)
noNa_data$Sex<- round(noNa_data$Sex)
noNa_data$Active<- round(noNa_data$Active)
noNa_data$Passive<- round(noNa_data$Passive)
noNa_data$Age<- round(noNa_data$Age)
noNa_data$Access.road<- round(noNa_data$Access.road)
#Converting Sex, Access Road to factor from integer

noNa_data$Sex[noNa_data$Sex =='4'] <-1
noNa_data$Sex <- as.factor(noNa_data$Sex) 

noNa_data$Access.road <- as.factor(noNa_data$Access.road)

str(noNa_data)
## 'data.frame':    412 obs. of  9 variables:
##  $ Age          : num  35 25 37 29 54 34 40 32 62 36 ...
##  $ Sex          : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 1 2 2 2 ...
##  $ Income       : num  250 219 1875 781 562 ...
##  $ Travel.cost  : num  48 52.2 163.5 87 21 ...
##  $ Access.road  : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 2 1 1 1 ...
##  $ Active       : num  1 2 0 0 0 0 0 1 2 2 ...
##  $ Passive      : num  0 2 0 0 0 1 0 1 2 1 ...
##  $ Logged.income: num  5.52 5.39 7.54 6.66 6.33 ...
##  $ Trips        : int  0 0 0 0 0 0 0 0 0 0 ...

To find the distribution of the trip variable

ggplot(noNa_data, aes(Trips)) + geom_histogram( ) +
  labs(x = 'Trips', title = 'Histogram of Trips') +
  theme_light()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

This code runs step wise variable selection algorithm to check for the most significant variables for poisson regression computation.

step(glm(Trips~1,family="poisson",data=noNa_data),
     scope = list(upper=glm(Trips~.,family="poisson",data=noNa_data)),
     direction="both",
     test="Chisq",
     data=Data)
## Start:  AIC=719.24
## Trips ~ 1
## 
##                 Df Deviance    AIC    LRT  Pr(>Chi)    
## + Active         1   431.73 650.54 70.696 < 2.2e-16 ***
## + Travel.cost    1   478.22 697.03 24.210 8.637e-07 ***
## + Access.road    1   484.70 703.50 17.735 2.539e-05 ***
## + Passive        1   485.23 704.04 17.201 3.362e-05 ***
## + Sex            1   487.93 706.74 14.499 0.0001402 ***
## + Age            1   493.04 711.85  9.390 0.0021817 ** 
## <none>               502.43 719.24                     
## + Logged.income  1   501.42 720.22  1.015 0.3136888    
## + Income         1   502.41 721.22  0.016 0.9002186    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=650.54
## Trips ~ Active
## 
##                 Df Deviance    AIC    LRT  Pr(>Chi)    
## + Sex            1   415.39 636.19 16.349 5.269e-05 ***
## + Access.road    1   418.84 639.65 12.897 0.0003291 ***
## + Travel.cost    1   419.78 640.59 11.956 0.0005448 ***
## + Logged.income  1   429.43 650.24  2.305 0.1289474    
## <none>               431.73 650.54                     
## + Age            1   430.59 651.40  1.142 0.2851856    
## + Passive        1   430.81 651.62  0.921 0.3372735    
## + Income         1   431.34 652.15  0.396 0.5292455    
## - Active         1   502.43 719.24 70.696 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=636.19
## Trips ~ Active + Sex
## 
##                 Df Deviance    AIC    LRT  Pr(>Chi)    
## + Travel.cost    1   403.32 626.13 12.068 0.0005128 ***
## + Access.road    1   403.46 626.26 11.930 0.0005524 ***
## <none>               415.39 636.19                     
## + Age            1   413.43 636.24  1.954 0.1621970    
## + Logged.income  1   413.85 636.66  1.534 0.2155146    
## + Income         1   415.23 638.04  0.155 0.6936947    
## + Passive        1   415.38 638.19  0.008 0.9299276    
## - Sex            1   431.73 650.54 16.349 5.269e-05 ***
## - Active         1   487.93 706.74 72.546 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=626.13
## Trips ~ Active + Sex + Travel.cost
## 
##                 Df Deviance    AIC    LRT  Pr(>Chi)    
## + Access.road    1   390.57 615.38 12.748 0.0003565 ***
## <none>               403.32 626.13                     
## + Age            1   401.84 626.65  1.476 0.2244047    
## + Logged.income  1   402.85 627.66  0.466 0.4948259    
## + Passive        1   403.27 628.08  0.046 0.8305840    
## + Income         1   403.32 628.13  0.001 0.9778877    
## - Travel.cost    1   415.39 636.19 12.068 0.0005128 ***
## - Sex            1   419.78 640.59 16.461 4.966e-05 ***
## - Active         1   462.02 682.83 58.703 1.834e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=615.38
## Trips ~ Active + Sex + Travel.cost + Access.road
## 
##                 Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>               390.57 615.38                     
## + Age            1   389.55 616.36  1.022 0.3120337    
## + Logged.income  1   390.19 617.00  0.377 0.5390434    
## + Passive        1   390.38 617.19  0.185 0.6673127    
## + Income         1   390.57 617.38  0.000 0.9936966    
## - Access.road    1   403.32 626.13 12.748 0.0003565 ***
## - Travel.cost    1   403.46 626.26 12.886 0.0003311 ***
## - Sex            1   405.52 628.32 14.946 0.0001106 ***
## - Active         1   443.82 666.62 53.246 2.943e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:  glm(formula = Trips ~ Active + Sex + Travel.cost + Access.road, 
##     family = "poisson", data = noNa_data)
## 
## Coefficients:
##  (Intercept)        Active          Sex1   Travel.cost  Access.road1  
##    -0.857884      0.393149     -0.660492     -0.003782     -1.053709  
## 
## Degrees of Freedom: 411 Total (i.e. Null);  407 Residual
## Null Deviance:       502.4 
## Residual Deviance: 390.6     AIC: 615.4

We observe the AIC (Akaike Information Criterion) of different models and choose the model with smallest AIC. The parsimonious model with high prediction accuracy we obtain is

poisson_model<-glm(formula = Trips ~ Active + Sex + Travel.cost + Access.road, 
    family = "poisson", data = noNa_data)
 ## is the selected Model.
summary(poisson_model)
## 
## Call:
## glm(formula = Trips ~ Active + Sex + Travel.cost + Access.road, 
##     family = "poisson", data = noNa_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9661  -0.8410  -0.6045  -0.2973   5.2506  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.857884   0.179428  -4.781 1.74e-06 ***
## Active        0.393149   0.050129   7.843 4.41e-15 ***
## Sex1         -0.660492   0.176982  -3.732  0.00019 ***
## Travel.cost  -0.003782   0.001252  -3.021  0.00252 ** 
## Access.road1 -1.053709   0.344992  -3.054  0.00226 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 502.43  on 411  degrees of freedom
## Residual deviance: 390.57  on 407  degrees of freedom
## AIC: 615.38
## 
## Number of Fisher Scoring iterations: 6

Deviance is another metric that can be used to compare models. Higher numbers indicate worse fit. R reports two forms of deviance - the null deviance and the residual deviance. The null deviance shows how well the response variable is predicted by a model that includes only the intercept (grand mean)

with(poisson_model, cbind(res.deviance = deviance,df = df.residual, p = pchisq(deviance, df.residual,lower.tail=FALSE)))
##      res.deviance  df         p
## [1,]     390.5697 407 0.7123866
par(mfrow=c(1,4))
plot(poisson_model)+ theme_light()

## NULL

Plots of poisson model: Residual vs fitted plot: In this plot the residuals should lie around the central line, but most of the residuals lie to one side of the line. From the qq plot we can see that the residuals do not follow normal distribution. The third scale-location plot shows that the spread is not consistent with the mean from the above plots. We can conclude that the poisson model does not fit the data properly.

This data contains lot of records with trip variable as 0. Hence, zero inflated model fits the data better.

Zero-inflated poisson regression is used to model count data that has an excess of zero. In this process, the excess zeros are generated by a one process and then the poisson model is developed on that model. Thus, the zip model has two parts, a poisson count model and the logit model for predicting excess zeros. You may want to review these Data Analysis Example pages, Poisson Regression and Logit Regression.

zrifn_Model<-zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road | Active, link="logit", data = noNa_data)
summary(zrifn_Model)
## 
## Call:
## zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road | 
##     Active, data = noNa_data, link = "logit")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1564 -0.4606 -0.3491 -0.2041  9.0050 
## 
## Count model coefficients (poisson with log link):
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.121172   0.229893   0.527  0.59814   
## Active        0.181702   0.065085   2.792  0.00524 **
## Sex1         -0.607606   0.202180  -3.005  0.00265 **
## Travel.cost  -0.003349   0.001312  -2.554  0.01066 * 
## Access.road1 -1.089113   0.371757  -2.930  0.00339 **
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.7713     0.3328   2.317 0.020485 *  
## Active       -0.5858     0.1689  -3.469 0.000523 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 29 
## Log-likelihood: -290.3 on 7 Df
AIC(zrifn_Model,poisson_model)
##               df      AIC
## zrifn_Model    7 594.5938
## poisson_model  5 615.3785
zrifn_Model_negbin<-zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road | Active ,dist="negbin", link="logit", data = noNa_data)
summary(zrifn_Model_negbin)
## 
## Call:
## zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road | 
##     Active, data = noNa_data, dist = "negbin", link = "logit")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9181 -0.4486 -0.3449 -0.2107  8.7905 
## 
## Count model coefficients (negbin with log link):
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.210938   0.340827  -0.619  0.53598   
## Active        0.234530   0.086479   2.712  0.00669 **
## Sex1         -0.656773   0.216770  -3.030  0.00245 **
## Travel.cost  -0.003300   0.001334  -2.474  0.01336 * 
## Access.road1 -1.043227   0.383499  -2.720  0.00652 **
## Log(theta)    0.792414   0.707002   1.121  0.26237   
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.1967     0.6200   0.317  0.75107   
## Active       -0.6569     0.2495  -2.632  0.00848 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 2.2087 
## Number of iterations in BFGS optimization: 34 
## Log-likelihood: -287.9 on 8 Df
AIC(zrifn_Model_negbin,zrifn_Model)
##                    df      AIC
## zrifn_Model_negbin  8 591.7322
## zrifn_Model         7 594.5938

Zero inflated model with negative binomial distribution gave us the most accurate results.Negative binomial regression does better with over dispersed data, data with variance much larger than the mean.

table(noNa_data$Travel.cost)
## 
##               21             25.2             29.4               45 
##                6                1                8                3 
##               48          48.0003             52.2             56.4 
##               54              171               10                2 
##               60             82.2               87            112.5 
##                6                7                5                1 
##       117.300075            118.5            125.4              138 
##                1                3                1                2 
##            158.4            160.2            161.7            163.5 
##                2                2               19                2 
##            170.4            176.4            177.9            180.9 
##               28               10                2                1 
##            186.9            190.8            196.2            197.4 
##                2                7                2                6 
## 199.687720486667            203.7            208.5            216.3 
##                1                3                1                1 
##            239.1            261.3              288            303.6 
##                3                3                4                7 
##            348.9            349.2            403.8            420.9 
##                1                1                3                1 
##            457.8 488.555464227225            520.2            535.5 
##                5                1                1                6 
##            551.7            595.2            988.5           1005.9 
##                1                3                1                1
newdata1 <- expand.grid((0:6),factor(0:1),seq(100,1000,100),factor(0:1))
colnames(newdata1) <- c("Active", "Sex", "Travel.cost","Access.road")
#newdata1 <- subset(newdata1, subset=(child<=persons))
newdata1$Trips <- predict(zrifn_Model_negbin, newdata1)
ggplot(newdata1, aes(x = Active, y = Trips, color=factor(Travel.cost))) +
  geom_jitter(shape=16, position=position_jitter(0.2)) +
   
  facet_wrap(~Sex) +
  theme_light()+
  labs(x = "Active", y = "Trips")

Now that we have developed a model on the data, see if you can derive insights from the abbove graph.You will find a clear trend between Active activities, trips and Travel cost.