Poisson and Zero inflated poisson regression
seetharam annepu
30 April 2018
Loading the reqired libraries
ifelse(require(pscl),{library("pscl")},install.packages("pscl"))
## [1] "pscl"
ifelse(require(DMwR),{library("DMwR")},install.packages("DMwR"))
## [1] "DMwR"
ifelse(require(boot),{library("boot")},install.packages("boot"))
## [1] "boot"
library(dplyr)
library(ggplot2)
library(ggthemes)
library(reshape)
The braziltourism file contains data from a survey of Brazilian tourists in the Atlantic Coastal Forest of Brazil. The goal of this survey was to study how the number of trips to adventure tourism areas in the region could be modeled as a function of characteristics of the tourists.
travel_data<-read.csv(file.choose(),header = T)
All features are self explanatory except Active - the number of adventure activities (e.g. mountaineering and hiking), passive - passive activities (e.g. sightseeing and picnicking) access.road - the importance to the respondent of the access road into the region and Travel.cost - estimated travel cost (in USD) of the respondents.
head(travel_data)
## Age Sex Income Travel.cost Access.road Active Passive Logged.income
## 1 35 0 250.00 48.0003 1 1 0 5.521461
## 2 25 1 218.75 52.2000 0 2 2 5.387930
## 3 37 0 1875.00 163.5000 0 0 0 7.536364
## 4 29 1 781.25 87.0000 0 0 0 6.660895
## 5 54 0 562.50 21.0000 0 0 0 6.332391
## 6 34 0 218.75 60.0000 1 0 1 5.387930
## Trips
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
print("number of rows")
## [1] "number of rows"
nrow(travel_data)
## [1] 412
print("number of NAs")
## [1] "number of NAs"
apply(is.na(travel_data),2,sum)
## Age Sex Income Travel.cost Access.road
## 1 1 44 3 0
## Active Passive Logged.income Trips
## 2 1 44 0
print("percentage of NAs")
## [1] "percentage of NAs"
apply(is.na(travel_data),2,function(x){ (sum(x)/nrow(travel_data))*100})
## Age Sex Income Travel.cost Access.road
## 0.2427184 0.2427184 10.6796117 0.7281553 0.0000000
## Active Passive Logged.income Trips
## 0.4854369 0.2427184 10.6796117 0.0000000
percent_nas<-melt(data.frame(lapply((travel_data),function(x){ (sum(is.na(x))/nrow(travel_data))*100})))
## Using as id variables
# visualization
ggplot(percent_nas, aes(x = reorder(variable,value), y = value)) +
geom_bar( stat="identity") +
geom_text(aes(x = variable, y = 0.5, label = sprintf( "%.2f%%", value)),
hjust=0, vjust=0.55, size = 4, colour = 'black',fontface = 'bold') +
labs(x = 'Attributes', title = 'Percentage of Missing values') +
coord_flip() +
theme_fivethirtyeight()
Lets perform a knn imputation on the data set - knn imputation finds k (default is 10) nearest neighbours to fill in the unknown (NA) values in a data set. For each case with any NA value it will search for its k most similar cases and use the values of these cases to fill in the unknowns.
noNa_data<-knnImputation(travel_data,4)
apply(is.na(noNa_data),2,function(x){ (sum(x)/nrow(travel_data))*100})
## Age Sex Income Travel.cost Access.road
## 0 0 0 0 0
## Active Passive Logged.income Trips
## 0 0 0 0
#table(noNa_data$Sex)
#table(noNa_data$Active)
#table(noNa_data$Passive)
#table(noNa_data$Age)
#table(noNa_data$Access.road)
#table(noNa_data$Trips)
noNa_data$Sex<- round(noNa_data$Sex)
noNa_data$Active<- round(noNa_data$Active)
noNa_data$Passive<- round(noNa_data$Passive)
noNa_data$Age<- round(noNa_data$Age)
noNa_data$Access.road<- round(noNa_data$Access.road)
#Converting Sex, Access Road to factor from integer
noNa_data$Sex[noNa_data$Sex =='4'] <-1
noNa_data$Sex <- as.factor(noNa_data$Sex)
noNa_data$Access.road <- as.factor(noNa_data$Access.road)
str(noNa_data)
## 'data.frame': 412 obs. of 9 variables:
## $ Age : num 35 25 37 29 54 34 40 32 62 36 ...
## $ Sex : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 1 2 2 2 ...
## $ Income : num 250 219 1875 781 562 ...
## $ Travel.cost : num 48 52.2 163.5 87 21 ...
## $ Access.road : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 2 1 1 1 ...
## $ Active : num 1 2 0 0 0 0 0 1 2 2 ...
## $ Passive : num 0 2 0 0 0 1 0 1 2 1 ...
## $ Logged.income: num 5.52 5.39 7.54 6.66 6.33 ...
## $ Trips : int 0 0 0 0 0 0 0 0 0 0 ...
To find the distribution of the trip variable
ggplot(noNa_data, aes(Trips)) + geom_histogram( ) +
labs(x = 'Trips', title = 'Histogram of Trips') +
theme_light()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This code runs step wise variable selection algorithm to check for the most significant variables for poisson regression computation.
step(glm(Trips~1,family="poisson",data=noNa_data),
scope = list(upper=glm(Trips~.,family="poisson",data=noNa_data)),
direction="both",
test="Chisq",
data=Data)
## Start: AIC=719.24
## Trips ~ 1
##
## Df Deviance AIC LRT Pr(>Chi)
## + Active 1 431.73 650.54 70.696 < 2.2e-16 ***
## + Travel.cost 1 478.22 697.03 24.210 8.637e-07 ***
## + Access.road 1 484.70 703.50 17.735 2.539e-05 ***
## + Passive 1 485.23 704.04 17.201 3.362e-05 ***
## + Sex 1 487.93 706.74 14.499 0.0001402 ***
## + Age 1 493.04 711.85 9.390 0.0021817 **
## <none> 502.43 719.24
## + Logged.income 1 501.42 720.22 1.015 0.3136888
## + Income 1 502.41 721.22 0.016 0.9002186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=650.54
## Trips ~ Active
##
## Df Deviance AIC LRT Pr(>Chi)
## + Sex 1 415.39 636.19 16.349 5.269e-05 ***
## + Access.road 1 418.84 639.65 12.897 0.0003291 ***
## + Travel.cost 1 419.78 640.59 11.956 0.0005448 ***
## + Logged.income 1 429.43 650.24 2.305 0.1289474
## <none> 431.73 650.54
## + Age 1 430.59 651.40 1.142 0.2851856
## + Passive 1 430.81 651.62 0.921 0.3372735
## + Income 1 431.34 652.15 0.396 0.5292455
## - Active 1 502.43 719.24 70.696 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=636.19
## Trips ~ Active + Sex
##
## Df Deviance AIC LRT Pr(>Chi)
## + Travel.cost 1 403.32 626.13 12.068 0.0005128 ***
## + Access.road 1 403.46 626.26 11.930 0.0005524 ***
## <none> 415.39 636.19
## + Age 1 413.43 636.24 1.954 0.1621970
## + Logged.income 1 413.85 636.66 1.534 0.2155146
## + Income 1 415.23 638.04 0.155 0.6936947
## + Passive 1 415.38 638.19 0.008 0.9299276
## - Sex 1 431.73 650.54 16.349 5.269e-05 ***
## - Active 1 487.93 706.74 72.546 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=626.13
## Trips ~ Active + Sex + Travel.cost
##
## Df Deviance AIC LRT Pr(>Chi)
## + Access.road 1 390.57 615.38 12.748 0.0003565 ***
## <none> 403.32 626.13
## + Age 1 401.84 626.65 1.476 0.2244047
## + Logged.income 1 402.85 627.66 0.466 0.4948259
## + Passive 1 403.27 628.08 0.046 0.8305840
## + Income 1 403.32 628.13 0.001 0.9778877
## - Travel.cost 1 415.39 636.19 12.068 0.0005128 ***
## - Sex 1 419.78 640.59 16.461 4.966e-05 ***
## - Active 1 462.02 682.83 58.703 1.834e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=615.38
## Trips ~ Active + Sex + Travel.cost + Access.road
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 390.57 615.38
## + Age 1 389.55 616.36 1.022 0.3120337
## + Logged.income 1 390.19 617.00 0.377 0.5390434
## + Passive 1 390.38 617.19 0.185 0.6673127
## + Income 1 390.57 617.38 0.000 0.9936966
## - Access.road 1 403.32 626.13 12.748 0.0003565 ***
## - Travel.cost 1 403.46 626.26 12.886 0.0003311 ***
## - Sex 1 405.52 628.32 14.946 0.0001106 ***
## - Active 1 443.82 666.62 53.246 2.943e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call: glm(formula = Trips ~ Active + Sex + Travel.cost + Access.road,
## family = "poisson", data = noNa_data)
##
## Coefficients:
## (Intercept) Active Sex1 Travel.cost Access.road1
## -0.857884 0.393149 -0.660492 -0.003782 -1.053709
##
## Degrees of Freedom: 411 Total (i.e. Null); 407 Residual
## Null Deviance: 502.4
## Residual Deviance: 390.6 AIC: 615.4
We observe the AIC (Akaike Information Criterion) of different models and choose the model with smallest AIC. The parsimonious model with high prediction accuracy we obtain is
poisson_model<-glm(formula = Trips ~ Active + Sex + Travel.cost + Access.road,
family = "poisson", data = noNa_data)
## is the selected Model.
summary(poisson_model)
##
## Call:
## glm(formula = Trips ~ Active + Sex + Travel.cost + Access.road,
## family = "poisson", data = noNa_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9661 -0.8410 -0.6045 -0.2973 5.2506
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.857884 0.179428 -4.781 1.74e-06 ***
## Active 0.393149 0.050129 7.843 4.41e-15 ***
## Sex1 -0.660492 0.176982 -3.732 0.00019 ***
## Travel.cost -0.003782 0.001252 -3.021 0.00252 **
## Access.road1 -1.053709 0.344992 -3.054 0.00226 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 502.43 on 411 degrees of freedom
## Residual deviance: 390.57 on 407 degrees of freedom
## AIC: 615.38
##
## Number of Fisher Scoring iterations: 6
Deviance is another metric that can be used to compare models. Higher numbers indicate worse fit. R reports two forms of deviance - the null deviance and the residual deviance. The null deviance shows how well the response variable is predicted by a model that includes only the intercept (grand mean)
with(poisson_model, cbind(res.deviance = deviance,df = df.residual, p = pchisq(deviance, df.residual,lower.tail=FALSE)))
## res.deviance df p
## [1,] 390.5697 407 0.7123866
par(mfrow=c(1,4))
plot(poisson_model)+ theme_light()
## NULL
Plots of poisson model: Residual vs fitted plot: In this plot the residuals should lie around the central line, but most of the residuals lie to one side of the line. From the qq plot we can see that the residuals do not follow normal distribution. The third scale-location plot shows that the spread is not consistent with the mean from the above plots. We can conclude that the poisson model does not fit the data properly.
This data contains lot of records with trip variable as 0. Hence, zero inflated model fits the data better.
Zero-inflated poisson regression is used to model count data that has an excess of zero. In this process, the excess zeros are generated by a one process and then the poisson model is developed on that model. Thus, the zip model has two parts, a poisson count model and the logit model for predicting excess zeros. You may want to review these Data Analysis Example pages, Poisson Regression and Logit Regression.
zrifn_Model<-zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road | Active, link="logit", data = noNa_data)
summary(zrifn_Model)
##
## Call:
## zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road |
## Active, data = noNa_data, link = "logit")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.1564 -0.4606 -0.3491 -0.2041 9.0050
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.121172 0.229893 0.527 0.59814
## Active 0.181702 0.065085 2.792 0.00524 **
## Sex1 -0.607606 0.202180 -3.005 0.00265 **
## Travel.cost -0.003349 0.001312 -2.554 0.01066 *
## Access.road1 -1.089113 0.371757 -2.930 0.00339 **
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.7713 0.3328 2.317 0.020485 *
## Active -0.5858 0.1689 -3.469 0.000523 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 29
## Log-likelihood: -290.3 on 7 Df
AIC(zrifn_Model,poisson_model)
## df AIC
## zrifn_Model 7 594.5938
## poisson_model 5 615.3785
zrifn_Model_negbin<-zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road | Active ,dist="negbin", link="logit", data = noNa_data)
summary(zrifn_Model_negbin)
##
## Call:
## zeroinfl(formula = Trips ~ Active + Sex + Travel.cost + Access.road |
## Active, data = noNa_data, dist = "negbin", link = "logit")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.9181 -0.4486 -0.3449 -0.2107 8.7905
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.210938 0.340827 -0.619 0.53598
## Active 0.234530 0.086479 2.712 0.00669 **
## Sex1 -0.656773 0.216770 -3.030 0.00245 **
## Travel.cost -0.003300 0.001334 -2.474 0.01336 *
## Access.road1 -1.043227 0.383499 -2.720 0.00652 **
## Log(theta) 0.792414 0.707002 1.121 0.26237
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1967 0.6200 0.317 0.75107
## Active -0.6569 0.2495 -2.632 0.00848 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 2.2087
## Number of iterations in BFGS optimization: 34
## Log-likelihood: -287.9 on 8 Df
AIC(zrifn_Model_negbin,zrifn_Model)
## df AIC
## zrifn_Model_negbin 8 591.7322
## zrifn_Model 7 594.5938
Zero inflated model with negative binomial distribution gave us the most accurate results.Negative binomial regression does better with over dispersed data, data with variance much larger than the mean.
table(noNa_data$Travel.cost)
##
## 21 25.2 29.4 45
## 6 1 8 3
## 48 48.0003 52.2 56.4
## 54 171 10 2
## 60 82.2 87 112.5
## 6 7 5 1
## 117.300075 118.5 125.4 138
## 1 3 1 2
## 158.4 160.2 161.7 163.5
## 2 2 19 2
## 170.4 176.4 177.9 180.9
## 28 10 2 1
## 186.9 190.8 196.2 197.4
## 2 7 2 6
## 199.687720486667 203.7 208.5 216.3
## 1 3 1 1
## 239.1 261.3 288 303.6
## 3 3 4 7
## 348.9 349.2 403.8 420.9
## 1 1 3 1
## 457.8 488.555464227225 520.2 535.5
## 5 1 1 6
## 551.7 595.2 988.5 1005.9
## 1 3 1 1
newdata1 <- expand.grid((0:6),factor(0:1),seq(100,1000,100),factor(0:1))
colnames(newdata1) <- c("Active", "Sex", "Travel.cost","Access.road")
#newdata1 <- subset(newdata1, subset=(child<=persons))
newdata1$Trips <- predict(zrifn_Model_negbin, newdata1)
ggplot(newdata1, aes(x = Active, y = Trips, color=factor(Travel.cost))) +
geom_jitter(shape=16, position=position_jitter(0.2)) +
facet_wrap(~Sex) +
theme_light()+
labs(x = "Active", y = "Trips")
Now that we have developed a model on the data, see if you can derive insights from the abbove graph.You will find a clear trend between Active activities, trips and Travel cost.