combined_data <- read_csv("combined_data.csv")
combined_data <- combined_data[, -1]
test <- combined_data%>%
select(participant_key, date_of_birth_dt, date_of_death_dt,
admission_dt, discharge_dt, length_of_stay_day_cnt, deceased,
bmi_q1, cig_day_avg)%>%
mutate(days_after = ifelse(date_of_death_dt <= discharge_dt, 0,
date_of_death_dt - discharge_dt),
month_death = month(date_of_death_dt),
season_death = case_when(
month_death < 3 | month_death == 12 ~ 'Winter',
month_death < 6 ~ 'Spring',
month_death < 9 ~ 'Summer',
month_death < 12 ~ 'Fall'))
test$days_after <- as.numeric(test$days_after)
test$month_death <- factor(test$month_death)
test$season_death <- factor(test$season_death)
# test subset of selected variables
ggplot(test, aes(x = days_after))+
geom_histogram(na.rm = TRUE)+
labs(title = 'Frequency of Deaths Days After Discharge Date',
x = 'Days after Discharge',
y = 'Frequency')
# Heavily inflated zeros with patients dying during stay in hospital
table(test$days_after == 0)
##
## FALSE TRUE
## 64711 3728
# 3,728 patients died within their stay
length(unique(test$participant_key))
## [1] 44494
# 44,494 unique patients in dataset
table(test$month_death)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 6479 5905 6124 5638 5434 5405 5424 5063 5375 5795 5793 6004
test <- test[complete.cases(test),]
ggplot(test, aes(x = month_death))+
geom_bar()+
labs(title = 'Frequency of Deaths Throughout the Year',
x = 'Month',
y = 'Frequency')
# high amounts of reported deaths for the winter months. With the lowest amount during the spring summer and then a gradual increase towards the end of the year.
ggplot(test, aes(x = season_death))+
geom_bar()+
labs(title = 'Recorded Deaths in Seasons',
x = 'Season',
y = 'Frequency')
# winter had the most deaths compared to summer being the lowest amounts of deaths
mod <- glm(deceased ~ bmi_q1 + cig_day_avg + season_death,
family = 'binomial', data = test)
## Warning: glm.fit: algorithm did not converge
summary(mod)
##
## Call:
## glm(formula = deceased ~ bmi_q1 + cig_day_avg + season_death,
## family = "binomial", data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## 2.409e-06 2.409e-06 2.409e-06 2.409e-06 2.409e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.657e+01 1.109e+04 0.002 0.998
## bmi_q1 -7.152e-11 3.832e+02 0.000 1.000
## cig_day_avg -3.052e-10 2.051e+02 0.000 1.000
## season_deathSpring 8.440e-10 6.212e+03 0.000 1.000
## season_deathSummer 5.821e-10 6.319e+03 0.000 1.000
## season_deathWinter -2.757e-08 6.100e+03 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 0.0000e+00 on 26590 degrees of freedom
## Residual deviance: 1.5427e-07 on 26585 degrees of freedom
## AIC: 12
##
## Number of Fisher Scoring iterations: 25
exp(mod$coefficients)
## (Intercept) bmi_q1 cig_day_avg season_deathSpring
## 344742742140 1 1 1
## season_deathSummer season_deathWinter
## 1 1
tibble(
parameter = names(mod$coefficients),
or = round(exp(mod$coefficients), 2),
as.data.frame.matrix(round(exp(confint.default(mod)), 2))
)
## # A tibble: 6 x 4
## parameter or `2.5 %` `97.5 %`
## <chr> <dbl> <dbl> <dbl>
## 1 (Intercept) 344742742140. 0 Inf
## 2 bmi_q1 1 0 Inf
## 3 cig_day_avg 1 0 3.94e174
## 4 season_deathSpring 1 0 Inf
## 5 season_deathSummer 1 0 Inf
## 6 season_deathWinter 1 0 Inf
mod2 <- glm(deceased ~ season_death + days_after + bmi_q1 + cig_day_avg,
family = 'binomial', data = test)
## Warning: glm.fit: algorithm did not converge
tibble(
parameter = names(mod2$coefficients),
or = round(exp(mod2$coefficients), 2),
as.data.frame.matrix(round(exp(confint.default(mod2)), 2))
)
## # A tibble: 7 x 4
## parameter or `2.5 %` `97.5 %`
## <chr> <dbl> <dbl> <dbl>
## 1 (Intercept) 344742918472. 0 Inf
## 2 season_deathSpring 1 0 Inf
## 3 season_deathSummer 1 0 Inf
## 4 season_deathWinter 1 0 Inf
## 5 days_after 1 0.05 1.92e 1
## 6 bmi_q1 1 0 Inf
## 7 cig_day_avg 1 0 4.16e174