1 Set up package environment

We first want to install all of the packages used in this analysis. We use the package renv as a way to manage package versions. The following code was initially run to set up the package library.

install.packages("renv")
library(renv)

renv::init(bare = TRUE)
install.packages("BiocManager")
library(BiocManager)
install.packages(c("ggplot2", "magrittr", "knitr", "kableExtra", "ggrepel",
                   "rmarkdown", "formatR", "bitops", "caTools","cowplot",
                   "Rcpp","digest","evaluate","htmltools","jsonlite",
                   "base64enc","rprojroot","highr","xfun","stringi"),
                 repo = BiocManager::repositories())
renv::snapshot()

If you are attempting to reproduce this analysis, you can 1) install/load renv, 2) copy/paste the renv.lock file into your current working directory and 3) run the renv::restore() command to automatically install all of the packages with the same version.

# Make sure the 'renv.lock' file is in your current working directory.
install.packages("renv")
library(renv)
renv::restore()

2 Load in scripts and R packages

Next, R packages will be loaded and scripts will be sourced.

source("logistic_regression_util.R")
source("maf_utils.R")
library("ggplot2")
library("cowplot")
library("magrittr")
library("knitr")
library("kableExtra")
library("ggrepel")
knitr::opts_chunk$set(fig.width=12, fig.height=8)

3 Read in data files

The file containing Mutation burden (i.e. mutation per Mb) and MSI or POLE status was obtained from the DNA damage TCGA working group from the PanCanAtlas consortium. This work was published in Cell and can be found at:

https://www.cell.com/cell-reports/pdf/S2211-1247(18)30437-6.pdf.

Data from the paper can be found at:

https://gdc.cancer.gov/about-data/publications/PanCan-DDR-2018.

The ABSOLUTE call file can be found at:

http://api.gdc.cancer.gov/data/4f277128-f793-4354-a13d-30cc7fe9f6b5.

3.1 Read in tumor annotation

# Read in mutation rate data
mut.rates = read.table("../../Data/Mutation_Rate_and-MSI.txt", header=TRUE, stringsAsFactors = FALSE, sep="\t", row.names = 1)

# Read in ABSOLUTE calls and combine with mut.rates
absolute = read.table("../../Data/TCGA_mastercalls.abs_tables_JSedit.fixed.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t", row.names = 1)
mut.rates$Ploidy = absolute[rownames(mut.rates), "ploidy"]
mut.rates$PloidyCorrected_MutationPerMb = log10(mut.rates$Non.silentMutationsperMb) / mut.rates$Ploidy

mut.rates$WGD = factor(ifelse(mut.rates$Genome_doublings > 0, "WGD", "Non-WGD"), levels = c("Non-WGD", "WGD"))
mut.rates$MSI_POLE_Status = factor(ifelse(mut.rates$MSIPOLE == "MSIPOLE", "Yes", "No"), levels = c("No", "Yes"))
mut.rates$Log10_MutsPerMB = log10(mut.rates$Non.silentMutationsperMb)

# Make tumor type a factor ordered by median mutation rate
agg = aggregate(mut.rates$Non.silentMutationsperMb,
                by=list(mut.rates$Type),
                "median")
tumor.type = agg[order(agg[,2]),1]
mut.rates$Type = factor(mut.rates$Type, levels = tumor.type)

# Create subsets
mut.rates.hyper = subset(mut.rates, MSI_POLE_Status == "Yes")
mut.rates.nonhyper = subset(mut.rates, MSI_POLE_Status == "No")

3.2 Read in and process gene mutations

All tumor ids in mut.rates match an entry in the PanCanAtlas MC3 maf, however some of the tumors are no longer present after filtering out poor quality variants. Therefore, the Tumor_Sample_Barcode column within the maf is converted to a factor and the levels are set to the tumors found in mut.rates so each tumor will get a column in the muts table generated by xtabs. The maf file was obtained from http://api.gdc.cancer.gov/data/1c8cfe5f-e52d-41ba-94da-f15ea1337efc. PanCan MutSig2CV results were obtained from https://api.gdc.cancer.gov/data/2e8d4650-5498-4288-a461-23667ff553e2.

# Read in file and subset to filtered variants
maf = read.table(gzfile("../../Data/mc3.v0.2.8.PUBLIC.maf.gz"), sep="\t", header=TRUE, stringsAsFactors = FALSE, quote="")
maf = subset(maf, FILTER %in% c("PASS", "wga", "native_wga_mix"))

# Substring ID and convert to factor with appropriate levels matching all tumors in mut.rates
maf$Tumor_Sample_Barcode = factor(substring(maf$Tumor_Sample_Barcode, 1, 15), levels = rownames(mut.rates))

# Subset to nonsynonymous variants
maf.c = maf.coding(maf)
muts = xtabs(~ Hugo_Symbol + Tumor_Sample_Barcode, data = maf.c)

# Subset further to significantly mutated genes from MutSig2CV
fs = list.files(path="../../Data/MutSig2CV/", full.names = TRUE)
fs = setdiff(fs, "../../Data/MutSig2CV//README.txt")
genes = c()
for(i in fs) {
  temp = read.table(i, header=TRUE, stringsAsFactors=FALSE, sep="\t", quote="")
  temp = subset(temp, qvalue < 0.1)
  genes = c(genes, temp[,1])
}
genes = sort(intersect(unique(genes), rownames(muts)))

muts = muts[genes,]

# Clear up some memory
rm(maf)
rm(maf.c)

4 Plotting distribution of mutation rates

We need to examine the distribution of mutation rates to determine appropriate statistical tests. In parts A and B, the density of non-silent mutations is plotted across 9414 tumors from TCGA. Densities are plotted separately for tumors with MSI or POLE tumors, which are expected to have higher mutation rates in general. By applying a log10 transformation in part B, we can more clearly see the distubtions.

In parts B and C, the density of non-silent mutations is plotted across 9240 tumors from TCGA that do NOT fall into MSI or POLE categories. In part C, we can see that although hypermutated samples are excluded, there is still a long tailed distribution. The application of log10 in part D makes the distrubution appear more “bell-shapped”. Therefore, we should either use non-parametric tests (e.g. Wilcoxon) or use logged transformed values with parametric tests (e.g. t-test).

p1 = ggplot(mut.rates, aes(x=Non.silentMutationsperMb, fill=MSI_POLE_Status)) + geom_density(alpha = 0.5) + theme_bw() + ggtitle("All tumors, no log transformation")
p2 = ggplot(mut.rates, aes(x=Non.silentMutationsperMb, fill=MSI_POLE_Status)) + geom_density(alpha = 0.5) + theme_bw() + ggtitle("All tumors, with Log10 transformation") + scale_x_log10()

p3 = ggplot(mut.rates.nonhyper, aes(x=Non.silentMutationsperMb, fill=MSI_POLE_Status)) + geom_density(alpha = 0.5) + theme_bw() + ggtitle("Non-hypermutated tumors, no log transformation")
p4 = ggplot(mut.rates.nonhyper, aes(x=Non.silentMutationsperMb, fill=MSI_POLE_Status)) + geom_density(alpha = 0.5) + theme_bw() + ggtitle("Non-hypermutated tumors, with Log10 transformation") + scale_x_log10()

plot_grid(p1, p2, p3, p4, labels = "AUTO")

5 Testing associations between mutation burden and WGD status

Next, we will use linear modeling to examine associations between mutation burden per Mb and WGD status while controlling for tumor type and MSI/POLE status. Note that in the “Estimate” column in the “Coefficients” section within the summary tables, the (Intercept) is the baseline group and refers to the mean mutation rate for , non-WGD, non-MSI/POLE tumors. WGDWGD refers to the difference between the baseline group and WGD tumors. MSI_POLE_StatusYes refers to the difference between the baseline group and tumors with MSI/POLE status. Rows for each tumor type (e.g. TypeBLCA) refer to the mean difference between each tumor type and the baseline group.

There are no significant associations between WGD status and mutation rate when using the non-transformed values. However, when using log-transformed values, we see strong associations with WGD status. This is true when including MSI/POLE tumors and controlling for this in the model AND when subsetting to non-MSI/POLE tumors and estimating a new model.

Overall, these results show that there is an overall strong association between WGD status and mutation rate when averaging across all tumor types . This analysis does not show whether this trend is true for all tumor types or if a subset of tumor types are driving the association.

# Controlling for tumor type and Hypermutation
model.full = lm(Non.silentMutationsperMb ~ WGD + MSI_POLE_Status + Type, data=mut.rates)
summary(model.full)
## 
## Call:
## lm(formula = Non.silentMutationsperMb ~ WGD + MSI_POLE_Status + 
##     Type, data = mut.rates)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -81.93  -1.90  -0.61   0.07 650.40 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.27857    1.36847   0.204 0.838698    
## WGDWGD             -0.07853    0.40587  -0.193 0.846574    
## MSI_POLE_StatusYes 76.16014    1.43716  52.994  < 2e-16 ***
## TypeTHCA            0.01462    1.58542   0.009 0.992641    
## TypeTHYM            0.16719    2.16576   0.077 0.938469    
## TypeTGCT            0.12371    2.00686   0.062 0.950848    
## TypeUVM             0.20736    2.35327   0.088 0.929786    
## TypeLAML            1.07605    2.13611   0.504 0.614455    
## TypeKICH           -0.68550    2.52654  -0.271 0.786151    
## TypeACC             0.17775    2.27859   0.078 0.937822    
## TypeMESO            0.48090    2.36288   0.204 0.838730    
## TypeCHOL            0.84852    3.16544   0.268 0.788660    
## TypePRAD            0.80560    1.58180   0.509 0.610558    
## TypeLGG             1.05494    1.56799   0.673 0.501094    
## TypeSARC            0.67159    1.78782   0.376 0.707185    
## TypePAAD            2.73553    1.94006   1.410 0.158565    
## TypeBRCA            1.52186    1.47839   1.029 0.303317    
## TypeUCS             2.05381    2.68098   0.766 0.443655    
## TypeGBM             2.53184    1.63092   1.552 0.120600    
## TypeKIRC            1.20545    1.64979   0.731 0.465000    
## TypeKIRP            1.18523    1.71607   0.691 0.489792    
## TypeUCEC            6.39443    1.60428   3.986 6.77e-05 ***
## TypeCESC            3.36050    1.98163   1.696 0.089952 .  
## TypeLIHC            2.22482    1.68165   1.323 0.185869    
## TypeOV              2.80459    1.63025   1.720 0.085404 .  
## TypeESCA            2.28297    1.93751   1.178 0.238706    
## TypeREAD            6.41732    2.07244   3.097 0.001964 ** 
## TypeDLBC            2.81262    3.13054   0.898 0.368972    
## TypeHNSC            3.62029    1.57362   2.301 0.021436 *  
## TypeCOAD            4.13320    1.64851   2.507 0.012185 *  
## TypeSTAD            8.42660    1.60426   5.253 1.53e-07 ***
## TypeBLCA            6.15916    1.62559   3.789 0.000152 ***
## TypeLUAD            7.61471    1.57895   4.823 1.44e-06 ***
## TypeLUSC            7.11767    1.58949   4.478 7.63e-06 ***
## TypeSKCM           14.45046    1.58825   9.098  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.13 on 9379 degrees of freedom
## Multiple R-squared:  0.2972, Adjusted R-squared:  0.2946 
## F-statistic: 116.7 on 34 and 9379 DF,  p-value: < 2.2e-16
# Controlling for tumor type and Hypermutation with log transformation
model.full.log10 = lm(log10(Non.silentMutationsperMb) ~ WGD + MSI_POLE_Status + Type, data=mut.rates)
summary(model.full.log10)
## 
## Call:
## lm(formula = log10(Non.silentMutationsperMb) ~ WGD + MSI_POLE_Status + 
##     Type, data = mut.rates)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.34038 -0.19150 -0.01097  0.16847  2.16187 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.659856   0.029526 -22.348  < 2e-16 ***
## WGDWGD              0.087711   0.008757  10.016  < 2e-16 ***
## MSI_POLE_StatusYes  1.181884   0.031008  38.116  < 2e-16 ***
## TypeTHCA            0.024806   0.034207   0.725  0.46836    
## TypeTHYM            0.032610   0.046728   0.698  0.48528    
## TypeTGCT            0.014279   0.043300   0.330  0.74158    
## TypeUVM             0.175998   0.050774   3.466  0.00053 ***
## TypeLAML            0.150160   0.046088   3.258  0.00113 ** 
## TypeKICH            0.309860   0.054512   5.684 1.35e-08 ***
## TypeACC             0.495454   0.049163  10.078  < 2e-16 ***
## TypeMESO            0.445372   0.050981   8.736  < 2e-16 ***
## TypeCHOL            0.493489   0.068297   7.226 5.38e-13 ***
## TypePRAD            0.485802   0.034129  14.234  < 2e-16 ***
## TypeLGG             0.481022   0.033831  14.218  < 2e-16 ***
## TypeSARC            0.498500   0.038574  12.923  < 2e-16 ***
## TypePAAD            0.556086   0.041858  13.285  < 2e-16 ***
## TypeBRCA            0.661379   0.031898  20.734  < 2e-16 ***
## TypeUCS             0.751159   0.057844  12.986  < 2e-16 ***
## TypeGBM             0.830007   0.035188  23.588  < 2e-16 ***
## TypeKIRC            0.755898   0.035596  21.236  < 2e-16 ***
## TypeKIRP            0.761471   0.037026  20.566  < 2e-16 ***
## TypeUCEC            0.968828   0.034614  27.990  < 2e-16 ***
## TypeCESC            0.971555   0.042755  22.724  < 2e-16 ***
## TypeLIHC            0.914194   0.036283  25.196  < 2e-16 ***
## TypeOV              0.921154   0.035174  26.188  < 2e-16 ***
## TypeESCA            0.961971   0.041803  23.012  < 2e-16 ***
## TypeREAD            1.029012   0.044715  23.013  < 2e-16 ***
## TypeDLBC            1.057293   0.067544  15.653  < 2e-16 ***
## TypeHNSC            1.065720   0.033952  31.389  < 2e-16 ***
## TypeCOAD            1.162253   0.035568  32.677  < 2e-16 ***
## TypeSTAD            1.216415   0.034613  35.143  < 2e-16 ***
## TypeBLCA            1.274766   0.035073  36.346  < 2e-16 ***
## TypeLUAD            1.284862   0.034067  37.716  < 2e-16 ***
## TypeLUSC            1.389252   0.034295  40.509  < 2e-16 ***
## TypeSKCM            1.499240   0.034268  43.751  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3696 on 9379 degrees of freedom
## Multiple R-squared:  0.6017, Adjusted R-squared:  0.6002 
## F-statistic: 416.7 on 34 and 9379 DF,  p-value: < 2.2e-16
# Testing hypermutated samples only
model.hyper = lm(Non.silentMutationsperMb ~ WGD + Type, data=mut.rates.hyper)
summary(model.hyper)
## 
## Call:
## lm(formula = Non.silentMutationsperMb ~ WGD + Type, data = mut.rates.hyper)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -161.55  -69.97  -37.57   33.57  651.01 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -2.967    116.849  -0.025   0.9798  
## WGDWGD        19.460     50.694   0.384   0.7016  
## TypeACC      -11.331    128.941  -0.088   0.9301  
## TypePRAD     191.176    157.281   1.216   0.2260  
## TypeLGG      324.692    157.281   2.064   0.0406 *
## TypeSARC       4.085    157.281   0.026   0.9793  
## TypePAAD     389.267    157.281   2.475   0.0144 *
## TypeBRCA      22.773    117.097   0.194   0.8461  
## TypeUCS      106.988    157.281   0.680   0.4974  
## TypeGBM      170.342    122.735   1.388   0.1672  
## TypeUCEC      85.189    117.058   0.728   0.4679  
## TypeCESC      47.300    157.281   0.301   0.7640  
## TypeESCA     -11.979    148.888  -0.080   0.9360  
## TypeREAD     194.948    128.159   1.521   0.1302  
## TypeCOAD      47.550    118.022   0.403   0.6876  
## TypeSTAD     114.071    125.978   0.905   0.3666  
## TypeBLCA      47.809    131.408   0.364   0.7165  
## TypeSKCM      36.950    157.281   0.235   0.8146  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105.3 on 156 degrees of freedom
## Multiple R-squared:  0.1566, Adjusted R-squared:  0.06472 
## F-statistic: 1.704 on 17 and 156 DF,  p-value: 0.04708
# Testing hypermutated samples only with log transformation
model.hyper.log10 = lm(log10(Non.silentMutationsperMb) ~ WGD + Type, data=mut.rates.hyper)
summary(model.hyper.log10)
## 
## Call:
## lm(formula = log10(Non.silentMutationsperMb) ~ WGD + Type, data = mut.rates.hyper)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5723 -0.4571 -0.0354  0.4282  1.7118 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.23593    0.67551   1.830   0.0692 .
## WGDWGD      -0.01863    0.29306  -0.064   0.9494  
## TypeACC     -0.69754    0.74541  -0.936   0.3508  
## TypePRAD     1.03871    0.90925   1.142   0.2550  
## TypeLGG      1.27156    0.90925   1.398   0.1640  
## TypeSARC    -1.18738    0.90925  -1.306   0.1935  
## TypePAAD     1.35100    0.90925   1.486   0.1393  
## TypeBRCA    -0.74880    0.67695  -1.106   0.2704  
## TypeUCS      0.78120    0.90925   0.859   0.3916  
## TypeGBM      0.77676    0.70954   1.095   0.2753  
## TypeUCEC     0.27193    0.67672   0.402   0.6884  
## TypeCESC     0.41080    0.90925   0.452   0.6520  
## TypeESCA    -0.56276    0.86073  -0.654   0.5142  
## TypeREAD     0.90232    0.74089   1.218   0.2251  
## TypeCOAD     0.25087    0.68229   0.368   0.7136  
## TypeSTAD     0.71108    0.72829   0.976   0.3304  
## TypeBLCA     0.21315    0.75968   0.281   0.7794  
## TypeSKCM     0.29534    0.90925   0.325   0.7458  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6086 on 156 degrees of freedom
## Multiple R-squared:  0.2226, Adjusted R-squared:  0.1379 
## F-statistic: 2.628 on 17 and 156 DF,  p-value: 0.0008978
# Testing non-hypermutated samples only
model.nonhyper = lm(Non.silentMutationsperMb ~ WGD + Type, data=mut.rates.nonhyper)
summary(model.nonhyper)
## 
## Call:
## lm(formula = Non.silentMutationsperMb ~ WGD + Type, data = mut.rates.nonhyper)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.819  -1.690  -0.458   0.102 271.100 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.27718    0.69589   0.398 0.690412    
## WGDWGD      -0.06941    0.20703  -0.335 0.737425    
## TypeTHCA     0.01578    0.80620   0.020 0.984386    
## TypeTHYM     0.16762    1.10131   0.152 0.879033    
## TypeTGCT     0.11636    1.02059   0.114 0.909227    
## TypeUVM      0.20819    1.19666   0.174 0.861892    
## TypeLAML     1.07711    1.08623   0.992 0.321420    
## TypeKICH     0.23932    1.29180   0.185 0.853026    
## TypeACC      1.81514    1.16679   1.556 0.119820    
## TypeMESO     0.48056    1.20155   0.400 0.689199    
## TypeCHOL     0.84789    1.60966   0.527 0.598378    
## TypePRAD     0.56660    0.80458   0.704 0.481313    
## TypeLGG      0.56582    0.79753   0.709 0.478054    
## TypeSARC     1.00938    0.90998   1.109 0.267359    
## TypePAAD     0.74023    0.98814   0.749 0.453807    
## TypeBRCA     1.76034    0.75202   2.341 0.019263 *  
## TypeUCS      1.58325    1.37253   1.154 0.248725    
## TypeGBM      1.70511    0.83033   2.054 0.040048 *  
## TypeKIRC     1.20541    0.83894   1.437 0.150799    
## TypeKIRP     1.18602    0.87264   1.359 0.174140    
## TypeUCEC     6.53743    0.82614   7.913 2.80e-15 ***
## TypeCESC     3.60851    1.00953   3.574 0.000353 ***
## TypeLIHC     2.22301    0.85514   2.600 0.009349 ** 
## TypeOV       2.80091    0.82903   3.379 0.000732 ***
## TypeESCA     2.75142    0.98673   2.788 0.005307 ** 
## TypeREAD     2.71502    1.06367   2.552 0.010711 *  
## TypeDLBC     2.81179    1.59191   1.766 0.077379 .  
## TypeHNSC     3.61806    0.80021   4.521 6.22e-06 ***
## TypeCOAD     6.45664    0.84563   7.635 2.48e-14 ***
## TypeSTAD     8.11038    0.81709   9.926  < 2e-16 ***
## TypeBLCA     6.29669    0.82722   7.612 2.97e-14 ***
## TypeLUAD     7.61072    0.80294   9.479  < 2e-16 ***
## TypeLUSC     7.11400    0.80830   8.801  < 2e-16 ***
## TypeSKCM    14.57326    0.80788  18.039  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.71 on 9206 degrees of freedom
## Multiple R-squared:  0.1477, Adjusted R-squared:  0.1447 
## F-statistic: 48.36 on 33 and 9206 DF,  p-value: < 2.2e-16
# Testing non-hypermutated samples only with log transformation
model.nonhyper.log10 = lm(log10(Non.silentMutationsperMb) ~ WGD + Type, data=mut.rates.nonhyper)
summary(model.nonhyper.log10)
## 
## Call:
## lm(formula = log10(Non.silentMutationsperMb) ~ WGD + Type, data = mut.rates.nonhyper)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.34097 -0.18836 -0.00993  0.16700  2.16255 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.660044   0.028925 -22.819  < 2e-16 ***
## WGDWGD       0.088940   0.008605  10.336  < 2e-16 ***
## TypeTHCA     0.024961   0.033510   0.745 0.456353    
## TypeTHYM     0.032668   0.045776   0.714 0.475467    
## TypeTGCT     0.013290   0.042421   0.313 0.754076    
## TypeUVM      0.176109   0.049739   3.541 0.000401 ***
## TypeLAML     0.150303   0.045149   3.329 0.000875 ***
## TypeKICH     0.305224   0.053694   5.685 1.35e-08 ***
## TypeACC      0.508450   0.048498  10.484  < 2e-16 ***
## TypeMESO     0.445326   0.049942   8.917  < 2e-16 ***
## TypeCHOL     0.493403   0.066905   7.375 1.79e-13 ***
## TypePRAD     0.483156   0.033443  14.447  < 2e-16 ***
## TypeLGG      0.477800   0.033150  14.413  < 2e-16 ***
## TypeSARC     0.502457   0.037823  13.284  < 2e-16 ***
## TypePAAD     0.546158   0.041072  13.298  < 2e-16 ***
## TypeBRCA     0.664974   0.031258  21.274  < 2e-16 ***
## TypeUCS      0.736770   0.057049  12.915  < 2e-16 ***
## TypeGBM      0.825220   0.034513  23.911  < 2e-16 ***
## TypeKIRC     0.755893   0.034870  21.677  < 2e-16 ***
## TypeKIRP     0.761577   0.036271  20.997  < 2e-16 ***
## TypeUCEC     0.963528   0.034339  28.060  < 2e-16 ***
## TypeCESC     0.970240   0.041961  23.122  < 2e-16 ***
## TypeLIHC     0.913951   0.035544  25.713  < 2e-16 ***
## TypeOV       0.920658   0.034459  26.718  < 2e-16 ***
## TypeESCA     0.967312   0.041013  23.585  < 2e-16 ***
## TypeREAD     1.008680   0.044212  22.815  < 2e-16 ***
## TypeDLBC     1.057182   0.066168  15.977  < 2e-16 ***
## TypeHNSC     1.065420   0.033261  32.032  < 2e-16 ***
## TypeCOAD     1.175318   0.035149  33.438  < 2e-16 ***
## TypeSTAD     1.213602   0.033962  35.734  < 2e-16 ***
## TypeBLCA     1.276233   0.034384  37.118  < 2e-16 ***
## TypeLUAD     1.284325   0.033374  38.482  < 2e-16 ***
## TypeLUSC     1.388758   0.033597  41.336  < 2e-16 ***
## TypeSKCM     1.500010   0.033580  44.670  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.362 on 9206 degrees of freedom
## Multiple R-squared:  0.5766, Adjusted R-squared:  0.575 
## F-statistic: 379.9 on 33 and 9206 DF,  p-value: < 2.2e-16

Next, we calculate the differences between WGD and non-WGD samples within each tumor type using the non-parametric Wilcoxon rank-sum test.

res = c()
tumor.type = unique(unique(mut.rates.nonhyper$Type))
for(i in tumor.type) {
  mut.rates.type = subset(mut.rates.nonhyper, Type == i)
  model = wilcox.test(Non.silentMutationsperMb ~ WGD, data = mut.rates.type)
  agg.median = aggregate(mut.rates.type$Non.silentMutationsperMb,
                  by=list(mut.rates.type$WGD),
                  median)
  agg.mean = aggregate(mut.rates.type$Non.silentMutationsperMb,
                  by=list(mut.rates.type$WGD),
                  mean)
  res = rbind(res, c(agg.median[,2], agg.mean[,2], model$p.value))
}
colnames(res) = c("Non-WGD_Median", "WGD_Median",
                  "Non-WGD_Mean", "WGD_Mean",
                  "Pvalue")
res = data.frame(Type = tumor.type,
                 res,
                 FDR=p.adjust(res[,"Pvalue"], "fdr"),
                 Bonferroni = p.adjust(res[,"Pvalue"], "bon"),
                 check.names = FALSE)

res = res[order(res$Pvalue),]

# Show results in table
kable(res, style = 'html', row.names = FALSE) %>%
  kable_styling(bootstrap_options = "striped") %>%
  scroll_box(width = "100%", height = "800px")
Type Non-WGD_Median WGD_Median Non-WGD_Mean WGD_Mean Pvalue FDR Bonferroni
BRCA 0.7887883 1.2911623 1.6425024 2.4745046 0.0000000 0.0000000 0.0000000
LUAD 3.3057242 6.9627762 5.3857255 9.5580078 0.0000000 0.0000000 0.0000000
SARC 0.5722591 0.9102237 0.8857385 1.6516875 0.0000000 0.0000000 0.0000000
BLCA 3.1230637 5.7202472 4.9002245 7.5739151 0.0000000 0.0000000 0.0000000
PRAD 0.6903266 0.9494249 0.8251362 0.9890484 0.0000003 0.0000023 0.0000114
LUSC 5.6169853 6.7154968 6.2240072 8.2582152 0.0000309 0.0001700 0.0010199
LIHC 1.8065517 2.3281210 2.3667956 2.6778088 0.0003893 0.0018351 0.0128459
THYM 0.2360255 0.4123456 0.4194798 0.5894340 0.0014128 0.0058279 0.0466235
COAD 2.9237316 2.6972247 9.3617234 3.1911594 0.0053887 0.0176937 0.1778274
ACC 0.5187245 0.7906970 1.7644122 2.3151797 0.0055410 0.0176937 0.1828544
DLBC 2.3135876 4.0776043 2.6005717 4.5390239 0.0058979 0.0176937 0.1946312
LGG 0.7095434 0.7982604 0.7667416 0.9274871 0.0167829 0.0440212 0.5538356
OV 1.9435882 2.1925929 3.6942358 2.5163338 0.0173417 0.0440212 0.5722760
KIRC 1.3492120 1.5275596 1.4253739 1.7204570 0.0380972 0.0898006 1.0000000
ESCA 1.9362814 2.3858724 3.2732779 2.7673311 0.0428929 0.0943644 1.0000000
CHOL 0.6594951 0.9153742 1.1797529 0.8642641 0.0486301 0.1002995 1.0000000
PAAD 0.8976690 1.0303383 0.9533860 1.1304550 0.0556280 0.1079837 1.0000000
SKCM 8.2243260 9.0626483 13.1679982 17.2816952 0.1007054 0.1846265 1.0000000
HNSC 2.6010606 2.8967389 3.9823461 3.6935965 0.1178907 0.2047576 1.0000000
THCA 0.2364548 0.3012428 0.2901206 0.3287041 0.1261393 0.2081298 1.0000000
PCPG 0.2258893 0.2489308 0.2569747 0.3197262 0.1480959 0.2294626 1.0000000
KICH 0.4203093 0.5863136 0.4770349 0.6882803 0.1529751 0.2294626 1.0000000
KIRP 1.4209476 1.7404218 1.4333290 1.8170041 0.1726414 0.2477028 1.0000000
UCEC 1.3706469 1.3505228 8.5867437 2.0701521 0.2183405 0.3002182 1.0000000
UVM 0.3215546 0.3881548 0.4841662 0.4339046 0.2489854 0.3286607 1.0000000
MESO 0.6233613 0.7900537 0.6622938 1.0955767 0.2686183 0.3409386 1.0000000
GBM 1.3522352 1.4722469 2.0098155 1.7378375 0.3199001 0.3909891 1.0000000
STAD 2.7489453 3.4362930 10.9640890 4.6330983 0.4673039 0.5480047 1.0000000
CESC 1.7448954 2.3039068 4.3870496 2.8647043 0.4815799 0.5480047 1.0000000
READ 2.3601104 2.5257189 3.2471526 2.6678267 0.5904206 0.6494627 1.0000000
TGCT 0.2769972 0.2783775 0.3482366 0.3261160 0.6908412 0.7354116 1.0000000
UCS 1.3342429 1.3009094 3.5672642 1.5004883 0.9160979 0.9447260 1.0000000
LAML 0.3387778 0.3521584 1.3907336 0.3280533 0.9550171 0.9550171 1.0000000

We can plot the mutation counts per Mb for each tumor type divided into WGD and non-WGD.

ggplot(mut.rates.nonhyper, aes(x=Type, y=Non.silentMutationsperMb, fill=WGD)) +
  geom_violin() + 
  theme_bw() +
  scale_y_log10() +
  ylab("Non-silent mutations per Mb (log10)") + 
  xlab("Tumor Type") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank())