# Author: Enoch Yi-Tung Chen 
# Website: enochytchen.com
# Title: Replicate a Cox model using Poisson regression (using R)
# Purpose: Compare Cox PH model and Poisson regression with time-splitting
#          Replicate Paul Dickman's tutorials in Stata to R
# References:
# Paul Dickman. Replicate a Cox model using Poisson regression. 2019. https://www.pauldickman.com/software/stata/compare-cox-poisson/
# Bendix Carstensen. Who needs the Cox model anyway? 2023. http://bendixcarstensen.com/WntCma.pdf

#############################
## Load necessary packages ##
#############################
library(haven)
library(survival)
library(dplyr)
library(biostat3)

######################
## Data Preparation ##
######################
# Load the colon dataset (same data as biostat3::colon) 
colon <- read_dta("http://pauldickman.com/data/colon.dta")

colon_data <- colon |>
              filter(stage==1) |>
              mutate(# Event: 1 if Dead: cancer; 0 otherwise
                     dead = as.numeric(status==1),
                     # Female as the reference group
                     female = ifelse(sex == 2, 1, 0),
                     # Age group as a factor
                     agegrp_factor = as.factor(agegrp)) |>
              mutate(# Right-censored at 10 years
                     surv_mm = pmin(surv_mm, 120),
                     dead = ifelse(surv_mm >= 120, 0, dead))

str(colon_data)
table(colon_data$dead)
table(colon_data$female)
table(colon_data$agegrp_factor)

########################################
## 1. Cox PH Model on individual data ##
########################################
fit1 <- coxph(Surv(surv_mm, dead) ~ female + year8594 + agegrp_factor,
              ties = "breslow", # Caution: for handling tied data,
                                # default in R is "efron", not "breslow" (in Stata)
              data = colon_data)
summary(fit1) 
round(eform(fit1), 4)

#########################################
## 2. GLM with one-year time-splitting ##
#########################################
# This creates a piecewise exponential model with 1-year (12-month) time bands.
max_time <- max(colon_data$surv_mm)
cut_points <- seq(0, max_time, by = 12)
cut_points <- cut_points[cut_points < max_time]

colon_data_splityr <- survSplit(data = colon_data,
                                cut=cut_points,
                                start="start",
                                end="surv_mm",
                                event="dead")

## Calculate person time and claim fu as a factor
colon_data_splityr2 <- transform(colon_data_splityr,
                                 pt = surv_mm - start,
                                 fu = as.factor(start))

## Collapse
## This is the key of using aggregate level data to model non-parametric baseline
## If we can obtain this level of data, we can fit a glm
colon_data_splityr3 <- colon_data_splityr2 |>
                       group_by(female, fu, year8594, agegrp_factor) |>
                       summarise(pt=sum(pt), 
                                 dead=sum(dead)) |>
                       data.frame()

# Fit GLM with time band factor to model non-constant baseline
# time_years - start is the exact duration in the split interval 
fit2 <- glm(dead ~ female + fu + year8594 + agegrp_factor + offset(log(pt)),
            family = poisson,
            data = colon_data_splityr3)

summary(fit2)
round(eform(fit2), 4)

#################################
## 3. GLM split at event times ##
#################################
# Splitting at every unique event time makes the model non-parametric
# We expect this leads to very similar regression coefficient as the Cox model.
event_times <- colon_data |>
               filter(dead == 1) |>
               pull(surv_mm) |>
               unique() |>
               sort()

colon_data_splitevent <- survSplit(data = colon_data,
                                   cut = event_times,
                                   start = "start",
                                   end = "surv_mm",
                                   event = "dead")

## Calculate person time and claim fu as a factor
colon_data_splitevent2 <- transform(colon_data_splitevent,
                                    pt = surv_mm - start,
                                    fu = as.factor(start))

## Collapse
colon_data_splitevent3 <- colon_data_splitevent2 |>
                          group_by(female, fu, year8594, agegrp_factor) |>
                          summarise(pt=sum(pt), 
                                    dead=sum(dead),
                                    .groups = 'drop') |>
                          # Sort by the covariates first, then by fu
                          arrange(female, year8594, agegrp_factor, fu) |>
                          # Order the variables as female, year8594, agegrp_factor, fu, pt, dead
                          dplyr::select(female, year8594, agegrp_factor, fu, pt, dead) |>
                          data.frame()

head(colon_data_splitevent3)

head(colon_data_splitevent3)

# Fit GLM with time-split data
fit3 <- glm(dead ~ female + fu + year8594 + agegrp_factor + offset(log(pt)),
            family = poisson,
            data = colon_data_splitevent3)

summary(fit3)
round(eform(fit3), 4)

#############
## Summary ##
#############
round(exp(coef(fit1)["female"]), 4) # Cox PH model on individual data
round(exp(coef(fit2)["female"]), 4) # GLM with one-year time-splitting
round(exp(coef(fit3)["female"]), 4) # GLM with time-splitting at event times

#################
# Author: Enoch Yi-Tung Chen

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


