The goal of this workshop is to introduce core statistical concepts using simple base R syntax and a real clinical dataset from the OncoDataSets package.
Topics covered:
#install.packages("OncoDataSets")
#install.packages("survival")
library(OncoDataSets)
library(survival)
We will use the BreastCancer_df dataset, which contains
demographic, tumour, and survival information for patients with breast
cancer. Paste these commands into your script to rename the dataset and
generate an (artificial) tumor size variable, which would not otherwise
be present in the dataset.
data("WBreastCancer_tbl_df")
BreastCancer_df <- WBreastCancer_tbl_df
BreastCancer_df$tumour_size <- abs(round(15 + rnorm(length(BreastCancer_df$age),25,10)
- 0.25*BreastCancer_df$age + BreastCancer_df$histgrad
*-(rgamma(length(BreastCancer_df$histgrad),3)),0))
head(BreastCancer_df)
## id time status er age histgrad ln_yesno pathsd pr tumour_size
## 1 1 9.466667 0 0 60 3 0 NA 0 13
## 2 2 8.600000 0 NA 79 NA 0 NA NA NA
## 3 3 19.333333 0 NA 82 2 0 NA NA 19
## 4 4 16.333333 0 1 66 2 0 NA 1 28
## 5 5 8.500000 0 NA 52 3 0 NA NA 11
## 6 6 9.400000 0 NA 58 NA 0 NA NA NA
str(BreastCancer_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1207 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "format.spss")= chr "F8.0"
## ..- attr(*, "display_width")= int 4
## $ time : num 9.47 8.6 19.33 16.33 8.5 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ status : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F8.0"
## $ er : num 0 NA NA 1 NA NA 1 0 NA 1 ...
## ..- attr(*, "format.spss")= chr "F6.0"
## $ age : num 60 79 82 66 52 58 50 83 46 54 ...
## ..- attr(*, "format.spss")= chr "F8.0"
## $ histgrad : num 3 NA 2 2 3 NA 2 3 NA 2 ...
## ..- attr(*, "format.spss")= chr "F8.0"
## $ ln_yesno : num 0 0 0 0 0 0 0 0 1 1 ...
## ..- attr(*, "format.spss")= chr "F8.0"
## $ pathsd : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "format.spss")= chr "F8.0"
## $ pr : num 0 NA NA 1 NA NA 0 0 NA 1 ...
## ..- attr(*, "format.spss")= chr "F6.0"
## $ tumour_size: num 13 NA 19 28 11 NA 0 3 NA 12 ...
## ..- attr(*, "format.spss")= chr "F8.0"
summary(BreastCancer_df)
## id time status er
## Min. : 1.0 Min. : 2.633 Min. :0.00000 Min. :0.000
## 1st Qu.: 310.5 1st Qu.: 22.550 1st Qu.:0.00000 1st Qu.:0.000
## Median : 619.0 Median : 42.967 Median :0.00000 Median :1.000
## Mean : 621.1 Mean : 46.956 Mean :0.05965 Mean :0.611
## 3rd Qu.: 931.5 3rd Qu.: 65.583 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :1266.0 Max. :133.800 Max. :1.00000 Max. :1.000
## NA's :338
## age histgrad ln_yesno pathsd
## Min. :22.00 Min. :1.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:46.00 1st Qu.:2.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median :56.00 Median :2.00 Median :0.0000 Median :0.0000
## Mean :56.39 Mean :2.27 Mean :0.2303 Mean :0.2632
## 3rd Qu.:66.50 3rd Qu.:3.00 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :88.00 Max. :3.00 Max. :1.0000 Max. :1.0000
## NA's :287 NA's :86
## pr tumour_size
## Min. :0.0000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:10.00
## Median :1.0000 Median :18.00
## Mean :0.5429 Mean :18.76
## 3rd Qu.:1.0000 3rd Qu.:26.00
## Max. :1.0000 Max. :51.00
## NA's :356 NA's :287
hist(BreastCancer_df$age,
main = "Age Distribution in Breast Cancer Patients",
xlab = "Age (years)",
col = "lightblue")
barplot(table(BreastCancer_df$er),
main = "Estrogen Receptor Status",
xlab = "ER Status (0 = Negative, 1 = Positive)",
ylab = "Number of Patients")
hist(BreastCancer_df$tumour_size,
main = "Tumour Size",
xlab = "Tumour Size in mm")
Is age different between ER-positive and ER-negative patients?
table(BreastCancer_df$er)
##
## 0 1
## 338 531
t.test(age ~ er, data = BreastCancer_df)
##
## Welch Two Sample t-test
##
## data: age by er
## t = -7.5115, df = 685.01, p-value = 1.83e-13
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -8.605477 -5.038948
## sample estimates:
## mean in group 0 mean in group 1
## 52.16272 58.98493
Is ER status associated with nodal involvement?
contingency <- table(BreastCancer_df$er, BreastCancer_df$ln_yesno)
contingency
##
## 0 1
## 0 255 83
## 1 394 137
chisq.test(contingency)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contingency
## X-squared = 0.10969, df = 1, p-value = 0.7405
Does age predict tumour size?
model <- lm(tumour_size ~ age, data = BreastCancer_df)
summary(model)
##
## Call:
## lm(formula = tumour_size ~ age, data = BreastCancer_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.341 -7.879 -0.337 6.914 32.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.25664 1.46887 21.279 <2e-16 ***
## age -0.22310 0.02551 -8.747 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.33 on 918 degrees of freedom
## (287 observations deleted due to missingness)
## Multiple R-squared: 0.07693, Adjusted R-squared: 0.07592
## F-statistic: 76.51 on 1 and 918 DF, p-value: < 2.2e-16
plot(BreastCancer_df$age, BreastCancer_df$tumour_size,
xlab = "Age",
ylab = "tumour Size (mm)")
abline(model, col = "red")
Assume status == 1 indicates death and 0
indicates censoring.
surv_obj <- Surv(BreastCancer_df$time, BreastCancer_df$status)
km_fit <- survfit(surv_obj ~ er, data = BreastCancer_df)
plot(km_fit,
col = c("blue", "red"),
xlab = "Time (months)",
ylab = "Survival Probability")
legend("topright",
legend = c("ER Negative", "ER Positive"),
col = c("blue", "red"),
lty = 1)
survdiff(surv_obj ~ er, data = BreastCancer_df)
## Call:
## survdiff(formula = surv_obj ~ er, data = BreastCancer_df)
##
## n=869, 338 observations deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## er=0 338 28 19.8 3.39 5.54
## er=1 531 23 31.2 2.15 5.54
##
## Chisq= 5.5 on 1 degrees of freedom, p= 0.02