combined_data <- read_csv("combined_data.csv")
combined_data <- combined_data[, -1]


test <- combined_data%>%
  select(participant_key, date_of_birth_dt, date_of_death_dt,
         admission_dt, discharge_dt, length_of_stay_day_cnt, deceased,
         bmi_q1, cig_day_avg)%>%
  mutate(days_after = ifelse(date_of_death_dt <= discharge_dt, 0,
                             date_of_death_dt - discharge_dt),
         month_death = month(date_of_death_dt),
         season_death = case_when(
           month_death < 3 | month_death == 12 ~ 'Winter',
           month_death < 6 ~ 'Spring',
           month_death < 9 ~ 'Summer',
           month_death < 12 ~ 'Fall'))

test$days_after <- as.numeric(test$days_after)
test$month_death <- factor(test$month_death)
test$season_death <- factor(test$season_death)

# test subset of selected variables
ggplot(test, aes(x = days_after))+
  geom_histogram(na.rm = TRUE)+
  labs(title = 'Frequency of Deaths Days After Discharge Date',
       x = 'Days after Discharge',
       y = 'Frequency')

# Heavily inflated zeros with patients dying during stay in hospital

table(test$days_after == 0)
## 
## FALSE  TRUE 
## 64711  3728
# 3,728 patients died within their stay

length(unique(test$participant_key))
## [1] 44494
# 44,494 unique patients in dataset
table(test$month_death)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12 
## 6479 5905 6124 5638 5434 5405 5424 5063 5375 5795 5793 6004
test <- test[complete.cases(test),]

ggplot(test, aes(x = month_death))+
  geom_bar()+
  labs(title = 'Frequency of Deaths Throughout the Year',
       x = 'Month',
       y = 'Frequency')

# high amounts of reported deaths for the winter months. With the lowest amount during the spring summer and then a gradual increase towards the end of the year.

ggplot(test, aes(x = season_death))+
  geom_bar()+
  labs(title = 'Recorded Deaths in Seasons',
       x = 'Season',
       y = 'Frequency')

# winter had the most deaths compared to summer being the lowest amounts of deaths
mod <- glm(deceased ~ bmi_q1 + cig_day_avg + season_death,
           family = 'binomial', data = test)
## Warning: glm.fit: algorithm did not converge
summary(mod)
## 
## Call:
## glm(formula = deceased ~ bmi_q1 + cig_day_avg + season_death, 
##     family = "binomial", data = test)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## 2.409e-06  2.409e-06  2.409e-06  2.409e-06  2.409e-06  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)         2.657e+01  1.109e+04   0.002    0.998
## bmi_q1             -7.152e-11  3.832e+02   0.000    1.000
## cig_day_avg        -3.052e-10  2.051e+02   0.000    1.000
## season_deathSpring  8.440e-10  6.212e+03   0.000    1.000
## season_deathSummer  5.821e-10  6.319e+03   0.000    1.000
## season_deathWinter -2.757e-08  6.100e+03   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 0.0000e+00  on 26590  degrees of freedom
## Residual deviance: 1.5427e-07  on 26585  degrees of freedom
## AIC: 12
## 
## Number of Fisher Scoring iterations: 25
exp(mod$coefficients)
##        (Intercept)             bmi_q1        cig_day_avg season_deathSpring 
##       344742742140                  1                  1                  1 
## season_deathSummer season_deathWinter 
##                  1                  1
tibble(
  parameter = names(mod$coefficients),
  or = round(exp(mod$coefficients), 2),
  as.data.frame.matrix(round(exp(confint.default(mod)), 2))
)
## # A tibble: 6 x 4
##   parameter                     or `2.5 %`   `97.5 %`
##   <chr>                      <dbl>   <dbl>      <dbl>
## 1 (Intercept)        344742742140.       0 Inf       
## 2 bmi_q1                        1        0 Inf       
## 3 cig_day_avg                   1        0   3.94e174
## 4 season_deathSpring            1        0 Inf       
## 5 season_deathSummer            1        0 Inf       
## 6 season_deathWinter            1        0 Inf
mod2 <- glm(deceased ~ season_death + days_after + bmi_q1 + cig_day_avg,
    family = 'binomial', data = test)
## Warning: glm.fit: algorithm did not converge
tibble(
  parameter = names(mod2$coefficients),
  or = round(exp(mod2$coefficients), 2),
  as.data.frame.matrix(round(exp(confint.default(mod2)), 2))
)
## # A tibble: 7 x 4
##   parameter                     or `2.5 %`   `97.5 %`
##   <chr>                      <dbl>   <dbl>      <dbl>
## 1 (Intercept)        344742918472.    0    Inf       
## 2 season_deathSpring            1     0    Inf       
## 3 season_deathSummer            1     0    Inf       
## 4 season_deathWinter            1     0    Inf       
## 5 days_after                    1     0.05   1.92e  1
## 6 bmi_q1                        1     0    Inf       
## 7 cig_day_avg                   1     0      4.16e174