First lets set up the environment: upload the data and useful packages.
# Packages
library(limma)
library(edgeR)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(car)
library(lattice)
library(GGally)
library(RColorBrewer)
library(gplots)
library(DESeq2)
library(viridis)
library(VennDiagram)
# makes everything reproductible
set.seed(1234)
transData <- read.table("Data/NHBE_transcriptome_data.txt", sep="\t", header=TRUE, row.names = 1)
designMat <- read.table("Data/NHBE_design.txt", sep="\t", header=TRUE, row.names = 1)
prepareData <- function(myGenes) {
miniDat <- t(transData[myGenes, ])
miniDat <- suppressWarnings(data.frame(gExp = as.vector(miniDat),
gene = rep(colnames(miniDat), each = nrow(miniDat))))
miniDat <- suppressWarnings(data.frame(designMat, miniDat))
miniDat
}
str(transData)
## 'data.frame': 22737 obs. of 23 variables:
## $ GSE10718_Biomat_1 : num 7.9 6.13 6.82 6.63 6.95 ...
## $ GSE10718_Biomat_10: num 7.62 5.85 7.62 5.7 7.51 ...
## $ GSE10718_Biomat_11: num 8.22 6.3 7.22 6.22 7.29 ...
## $ GSE10718_Biomat_12: num 8.01 5.91 6.17 6.06 7.95 ...
## $ GSE10718_Biomat_13: num 7.61 6.62 7.2 6.89 7.87 ...
## $ GSE10718_Biomat_14: num 7.45 6.82 7.4 7.02 7.55 ...
## $ GSE10718_Biomat_15: num 7.41 6.88 6.78 6.59 7.27 ...
## $ GSE10718_Biomat_16: num 7.67 6.83 6.99 6.45 7.56 ...
## $ GSE10718_Biomat_17: num 7.89 6.45 5.85 6.18 7.24 ...
## $ GSE10718_Biomat_19: num 7.8 6.66 6.83 6.19 6.96 ...
## $ GSE10718_Biomat_2 : num 7.72 6.57 6.67 6.74 7.05 ...
## $ GSE10718_Biomat_20: num 7.55 5.81 6.28 5.57 8.04 ...
## $ GSE10718_Biomat_21: num 6.34 7.06 7.23 5.77 7.62 ...
## $ GSE10718_Biomat_22: num 7.8 6.49 6.75 5.59 7.32 ...
## $ GSE10718_Biomat_23: num 7.95 7.08 7.11 7.14 7.83 ...
## $ GSE10718_Biomat_24: num 7.88 6.24 6.97 6.34 7.03 ...
## $ GSE10718_Biomat_3 : num 7.51 6.38 6.98 6.73 7.06 ...
## $ GSE10718_Biomat_4 : num 7.31 6.89 7.62 6.48 8.46 ...
## $ GSE10718_Biomat_5 : num 8 7.1 7.29 5.72 7.73 ...
## $ GSE10718_Biomat_6 : num 7.5 6.42 7.68 6.28 7.54 ...
## $ GSE10718_Biomat_7 : num 7.5 6.27 7.12 6.63 8.17 ...
## $ GSE10718_Biomat_8 : num 7.51 6.53 6.61 6.52 7.67 ...
## $ GSE10718_Biomat_9 : num 7.74 6.89 6.71 7.06 7.42 ...
show(designMat)
## ExternalID Treatment time
## GSE10718_Biomat_1 GSM270883 control 24_h
## GSE10718_Biomat_10 GSM270874 control 1_h
## GSE10718_Biomat_11 GSM270873 control 1_h
## GSE10718_Biomat_12 GSM270872 control 1_h
## GSE10718_Biomat_13 GSM270881 control 4_h
## GSE10718_Biomat_14 GSM270880 control 4_h
## GSE10718_Biomat_15 GSM270879 control 4_h
## GSE10718_Biomat_16 GSM270887 cigarette_smoke 1_h
## GSE10718_Biomat_17 GSM270886 cigarette_smoke 1_h
## GSE10718_Biomat_19 GSM270889 cigarette_smoke 2_h
## GSE10718_Biomat_2 GSM270882 control 24_h
## GSE10718_Biomat_20 GSM270888 cigarette_smoke 2_h
## GSE10718_Biomat_21 GSM270890 cigarette_smoke 2_h
## GSE10718_Biomat_22 GSM270891 cigarette_smoke 4_h
## GSE10718_Biomat_23 GSM270892 cigarette_smoke 4_h
## GSE10718_Biomat_24 GSM270893 cigarette_smoke 4_h
## GSE10718_Biomat_3 GSM270884 control 24_h
## GSE10718_Biomat_4 GSM270894 cigarette_smoke 24_h
## GSE10718_Biomat_5 GSM270895 cigarette_smoke 24_h
## GSE10718_Biomat_6 GSM270896 cigarette_smoke 24_h
## GSE10718_Biomat_7 GSM270878 control 2_h
## GSE10718_Biomat_8 GSM270876 control 2_h
## GSE10718_Biomat_9 GSM270875 control 2_h
if(all(rownames(designMat) == colnames(transData))) {print("The design matrix and data agree")}
## [1] "The design matrix and data agree"
if (!anyNA(transData)) { print("There are no missing values in Data") }
## [1] "There are no missing values in Data"
The dataset contains the expression of:
The different samples are :
Adds a quantitative variable for time in the design matrix.
designMat$timeCont <- car::recode(designMat$time,
"'1_h'=1; '2_h'=2; '4_h'=4; '24_h'=24",
as.factor.result = FALSE)
First lets plot all in a single graph.
randomRowNumber <- sample(1:nrow(transData), 1)
randomGeneName <- rownames(transData)[[randomRowNumber]]
# aggregates the meta data and data, using the function coded above
randGene <- prepareData(randomGeneName)
plotRandGene <- ggplot(randGene,aes(x=Treatment,y=gExp,color=factor(timeCont)))+
geom_point(aes(stroke=1.3)) +
theme_minimal() +
stat_summary(fun.y = "mean", aes(colour = "mean"), size = 3, geom="point", shape=3) +
scale_color_manual("Time since exposure (hours)", values = c("1"="#e31a1c","2"="#fd8d3c","4"="#fecc5c","24"="#ffffb2","mean"="black")) +
ggtitle("Differential expression due to tabacco") +
ylab("Gene expression (relative)")
ggplotly(plotRandGene)
We could also decide to use different facets. It is easier to compare the effect of treatments at each time point, but harder to compare between time points.
label_names <- c(
"1" = "After 1 hour",
"2" = "After 2 hour",
"4" = "After 3 hour",
"24" = "After 24 hour"
)
ggplot(randGene,aes(x=Treatment,y=gExp,color=factor(timeCont)))+
geom_point(aes(stroke=2.1)) +
stat_summary(fun.y = "mean", aes(colour = "mean"), size = 4, geom="point", shape=3) +
scale_color_manual("Time (hours)", values = c("1"="#e31a1c","2"="#fd8d3c","4"="#fecc5c","24"="#ffffb2","mean"="black")) +
ggtitle("Differential expression due to tabacco with respect to time since last exposure") +
theme(plot.title = element_text(hjust = 0.5, size = 15)) +
facet_wrap(~ timeCont, ncol = 4, labeller=as_labeller(label_names)) +
ylab("Gene expression (relative)")
Note: For both plots I’ve written that the gene expression is in a relative unit. In reality they represent log2 normalized fluorescence intensity values which is unitless.
Note: In both heat maps below, I decided to take out the diagonal as it will always be filled with ones. This enables us to have a more informative color gradient.
# function that will be used to extract the last 2 digits of ExternalID
substrRight <- function(x){
c <- as.character(x)
substr(c, nchar(c)-1, nchar(c))
}
# REORDERING
# here orders by time first and then treatment
indexTimeTreatment <- order(designMat$timeCont, designMat$Treatment)
designTimeTreatment <- designMat[indexTimeTreatment,]
sampleNameTimeTreatment <- rownames(designTimeTreatment)
correlationTimeTreatment <- cor(transData[,sampleNameTimeTreatment])
treatmentShort <- as.character(designTimeTreatment$Treatment)
treatmentShort <- replace(treatmentShort,treatmentShort=="cigarette_smoke","smoke")
niceLabelTimeTreatment <- paste(designTimeTreatment$time, treatmentShort, substrRight(designTimeTreatment$ExternalID))
# sets the diagonal to empty because will always be 1 and want to be able to see
# small differences between other samples
diag(correlationTimeTreatment) = NA
m <- list(l=150, r=50, b=150, t=60, pad=4)
# needs to force the non-alphabetical ordering
xylabTimeTreatment = list(categoryorder = "array", categoryarray = niceLabelTimeTreatment)
plot_ly(x = niceLabelTimeTreatment, y=niceLabelTimeTreatment, z = ~correlationTimeTreatment, type = "heatmap", colorscale = "Greys") %>%
layout(title = 'Correlation heatmap: ordering by time and then treatment', margin=m,xaxis = xylabTimeTreatment, yaxis = xylabTimeTreatment)
# REORDERING
# here orders by treatment first and then by time
indexTreatmentTime <- order(designMat$Treatment, designMat$timeCont)
designTreatmentTime <- designMat[indexTreatmentTime,]
sampleNameTreatmentTime <- rownames(designTreatmentTime)
correlationTreatmentTime <- cor(transData[,sampleNameTreatmentTime])
treatmentShort <- as.character(designTreatmentTime$Treatment)
treatmentShort <- replace(treatmentShort,treatmentShort=="cigarette_smoke","smoke")
niceLabelTreatmentTime <- paste(designTreatmentTime$time, treatmentShort, substrRight(designTreatmentTime$ExternalID))
# sets the diagonal to empty because will always be 1 and want to be able to see
# small differences between other samples
diag(correlationTreatmentTime) = NA
m <- list(l=150, r=50, b=150, t=60, pad=4)
# needs to force the non-alphabetical ordering
xylabTreatmentTime = list(categoryorder = "array", categoryarray = niceLabelTreatmentTime)
plot_ly(x = niceLabelTreatmentTime, y=niceLabelTreatmentTime, z = ~correlationTreatmentTime, type = "heatmap", colorscale = "Greys") %>%
layout(title = 'Correlation heatmap: ordering by treatment and then time', margin=m,xaxis = xylabTreatmentTime, yaxis = xylabTreatmentTime)
We see that every coorelation result is very high: approximatively between [0.88,0.94]. This high correlation is not very suprising as it means that when a gene is relatively highly expressed, it will be relatively highly expressed in every sample. To be more precise (default correlation in R is pearson) the gene expression in different samples are close to linearly dependant (with a positive slope). This very probably due to a confounding variable, which is simply the mean expression of a gene.
In the first heat map, we see a pattern of 3*3 squarres (not perfect but distinguishable). This is not suprising as the axis are ordered by: 3 “smoke” samples then 3 “control” sample then 3 “smoke” … These square patterns tend to be more blue when comparing 3 “control samples” with 3 “smoke samples” and red otherwise. This indicates us that the Treatment is very important and seems to be more important than the time. It is interesting to notice that these pattern do not seem to be present for “1_h smoke” samples. We could thus hypothesize that the smoke didn’t have the chance to make an effect yet on these samples.
In the second heat map, we distinguish relatively well 4 squares, one at each corner. As before, this is not suprising as they show the effect of teh Treatment. As before, we see that the top left and bottom right square (different treatment) are more blue while the 2 others (same treatment) are more red. We see it very well if we do not look at the “1_h smoke” sample (which I encourage you to do as the heat map is interactive). Anew information which was complicated to see before is that all the control samples are very highly correlates together (on the other heat map the difference between control-control and smoke-smoke was harder to see).
One intersting information that we can catch sight of quickly, is the fact that “1_h control 74” has a very low correlation with all other genes. This could indicate a possible problem with that sample (outlier). We can also see 2 other samples that have a low overall correlation with other samples (“24_h smoke 94” and “24_h smoke 95”). Inspite of their low correlations with other samples, they seem to be highly correlated bewtween them and with the third 24_h sample : “24_h smoke 96”. This indicates that contrary to the the first “outlier” we decribed (which could be simply an error), these 2 seem very interesting! On the second heatmap we also realize that sample “24_h smoke 96” seems suprisingly highly correlated with every other sample. It could be interesting to keep that sample in mind and maybe also test its outlierness.
To look more closely at the outlierness of these samples, let’s have a look at the kernel density plot:
Note I’m keeping the diagonal empty as what were are interested in as it will always be 1 and does not give any valuable information! Keeping it could indeed increas the the number of false negative uselessly for smaller number of samples (shouldn’t change anything for 23).
# plots kernel density
correlationMeans <- rowMeans(correlationTreatmentTime, na.rm = TRUE)
plot(density(correlationMeans),
main = "Average correlation with other samples")
rug(correlationMeans, lwd = 1.5, ticksize = 0.2)
rug(min(correlationMeans), col = "red", lwd = 2.5, ticksize = 0.2)
On the density plot we do see that one of the samples (“1_h control 74”) could be an outlier, although it doesn’t seem shocking enough to remove it.
Note: Here we a only comparing the mean correlation of the sample to the average total mean correlation. This gives doesn’t take into account an important information which is that the sample are grouped by 3 (or 2). Let’s look also plot the groups:
niceLabelNoId <- unique(paste(designTreatmentTime$time, treatmentShort))
# plots kernel density
plot(density(correlationMeans),
main = "Average correlation with other samples")
iOld = 1
k=1
for (i in seq(from = 2, to = 23, by = 3)) {
# plots ticks by group
rug(correlationMeans[iOld:i], col = rainbow(8)[k], lwd = 2.5, ticksize = 0.2)
iOld = i
# color counter
k=k+1
}
# adds manual legen
par(new=TRUE)
plot(1, type="n", axes=FALSE, xlab="", ylab="")
legend(1, 1, legend = niceLabelNoId, col=rainbow(8),lwd=3, cex=0.9, xjust=-1.5, yjust=0)
This plot is very interesting because we now see the group and population correlation average. Now sample (“1_h control 74”) seems very likely to be an outlier as it is very different from the 2 other “1_h control” samples.
To test these hypothesis quantitatively (although not taking into account the group average: see other techniques for better methods that would take it into account). Let’s compute a t-test to compare the sample which could be an utlier with all other samples.
Note: We have to use var.equal = T (not Welch test) because we cannot compute \(s_x\) as the sample size is of 1. But this assumption of equal variance seems plausible as it would be the expression variance of every cell in our body. Note that with this t-test we assume that identically distributed, which is not true as we know that there are sub groups. This should be better taken into account using for example bayesian models (see below).
alpha = 0.95
for (i in c(1:23)){
testOutlierT <- t.test(correlationMeans[i],correlationMeans, alternatives = "two.sided", conf.level = alpha, var.equal = T)
# if 0 is not in confidence interval
if(sign(testOutlierT$conf.int[1]) == sign(testOutlierT$conf.int[2])){
print(paste(niceLabelTreatmentTime[i], "is significantly different from the others, withp-value=", testOutlierT$p.value))
}
}
## [1] "1_h control 74 is significantly different from the others, withp-value= 0.0421846532518661"
We therefore see that even if we don not take into account the multiple groups (which would make “1_h control 74” even mor outlier) it is already an outlier.
Other techniques that I could have used:
Assume that each sample lives in a 23 dimensional world (one dimension for the correlation with each other sample) and that it is sampled by a multivariate gaussian. So we could fit a multivariate gaussian and compute the probability that each sample comes from that distribution. A sample would be considered as an outlier if it’s probability under the multivariate gaussian is beneath a threshold. Problem: gaussian distribution is not robust to outliers. This distribution would not take into account the fact that that there are 8 sub-goups that should be very similar.
Use a dimensionality reduction algorithm and “see” if there are outliers. Ex: t-sne or PCA. Problem: PCA is based on squarred error and is thus not robust. T-SNE could be a good idea! These 2 methods would still not take into account the fact that that there are 8 sub-goups that should be very similar.
Fit a empirical bayesian hierarchical model whith parameters \(\phi\) and \(\theta\) where we can assume conditional independance such that \(p(sample = x | \phi , \theta) = p(sample=x|\theta)p(\theta|\phi)p(\phi)\). With \(\theta\) the parameter of the group of samples (8 groups: same treatment and same “time”). \(\phi\) the parameter of the distribution of all the samples. Finally compute the probability under these priors. This would probably be the best way if we use a robust distribution such as the laplace or t distribtuion.
desTreatment <- model.matrix(~Treatment, designMat)
fitTreatment <- lmFit(transData, desTreatment )
fitTreatmentEb <- eBayes(fitTreatment)
ttTreatment <- topTable(fitTreatmentEb)
## Removing intercept from test coefficients
show(ttTreatment)
## logFC AveExpr t P.Value adj.P.Val
## 200779_at -0.7613410 13.983184 -8.577428 1.419183e-08 0.0001750755
## 202912_at -1.7199499 12.764302 -8.161121 3.365296e-08 0.0001750755
## 214696_at -1.9074941 11.730937 -8.155234 3.407195e-08 0.0001750755
## 223394_at -1.2644771 10.909886 -8.147206 3.465212e-08 0.0001750755
## 223774_at -1.1696606 9.529931 -8.080341 3.989741e-08 0.0001750755
## 209020_at -1.0223766 10.677979 -7.939997 5.373691e-08 0.0001750755
## 202672_s_at -3.5302487 9.716310 -7.904577 5.795528e-08 0.0001750755
## 220468_at -2.4132680 6.488711 -7.876047 6.160022e-08 0.0001750755
## 223982_s_at -0.9256205 10.907060 -7.729633 8.438527e-08 0.0001933190
## 226924_at -0.6366539 9.588851 -7.726141 8.502396e-08 0.0001933190
## B
## 200779_at 9.623149
## 202912_at 8.840436
## 214696_at 8.829176
## 223394_at 8.813808
## 223774_at 8.685431
## 209020_at 8.413709
## 202672_s_at 8.344646
## 220468_at 8.288875
## 223982_s_at 8.000666
## 226924_at 7.993752
I decided to use the \(ref + effect\) model and not the cell means because I find that the p–value computed is the more natural one. In plain english : the model I’m fitting is taking the reference as “cigarette_smoke”. So all the expression values are first modelled by the mean of the “cigarette_smoke” samples. It then substracts a parameter only for “control” variables. These 2 parameters give us the model, the difference with the given expression value is the residue \(\epsilon\). The linear model is fitted by minimizing \(\epsilon\) (computing and finding it’s minimum by equalizing the derivative to 0).
In more mathy way here is the model fitted: \[Y_{ij} = \theta + \tau_j + \epsilon_{ij}\] which can be written in matrix form \[\begin{align*} y&=X \alpha + \epsilon \\ \left( \begin{matrix} y_{11}\\ \vdots\\ y_{n1}\\ y_{12}\\ \vdots\\ y_{n2} \end{matrix} \right) &= \left( \begin{matrix} 1 & 0\\ \vdots & \vdots \\ 1 & 0 \\ 1 & 1\\ \vdots & \vdots \\ 1 & 1 \end{matrix} \right) * \left( \begin{matrix} mean\_smoke \\ \delta_{control - smoke} \end{matrix} \right) + \left( \begin{matrix} \epsilon_{11}\\ \vdots\\ \epsilon_{n1}\\ \epsilon_{12}\\ \vdots\\ \epsilon_{n2} \end{matrix} \right) \end{align*}\]Here the adjusted p-value therefore tests if the expression difference between “smoke” samples and “control” samples is statistically significant. i.e: The null hypothesis \(H0\) is that the mean of control samples and smoke samples is the same. The adjusted p-value use the Benjamin and Hocberg method to adjust the p-value (important beacsue of the huge number of tests). It therefore gives the proprtion of false positives we are expected to get if we call this feature significant (i.e. targets FDR).
cutoffP <- 1e-03
cutoffPadj <- 0.05
numhitsTreatment <- nrow(topTable(fitTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none"))
## Removing intercept from test coefficients
numBHhitsTreatment <- nrow(topTable(fitTreatmentEb, p.value = cutoffPadj, n = Inf))
## Removing intercept from test coefficients
There are 805 probes with p-values smaller than 0.001. While there are 1238 probes with adjusted p-values smaller than 0.05.
nHit = 50
hitsTreatment <- topTable(fitTreatmentEb, n = nHit)
## Removing intercept from test coefficients
hitNames <- rev(rownames(hitsTreatment))
gExpression <- as.matrix(transData[hitNames,sampleNameTreatmentTime])
m <- list(l=150, r=50, b=150, t=60, pad=4)
# needs to force the non-alphabetical ordering
xlabHits = list(categoryorder = "array", categoryarray = niceLabelTreatmentTime)
ylabHits = list(categoryorder = "array", categoryarray = hitNames)
plot_ly( x=niceLabelTreatmentTime , y=~hitNames ,z = ~gExpression, type = "heatmap", colorscale = "Greys") %>%
layout(title = 'Heatmap: top50 genes transcription' , margin=m,xaxis = xlabHits, yaxis = ylabHits)
By definition \(FDR = \frac{F}{S}\) so \(F=S*FDR\). Where \(FDR\) is the False Discovery Rate threshold used (i.e the max FDR of the hits). \(S\) is number of significant gene which we can estimate by simply using the number of genes with \(p-value \leq t\).
FDRhit <- max(hitsTreatment$adj.P.Val)
estimatedF = nHit * FDRhit
We therefore conclude that the estimated false discovery rate is 0.001036 and that the number of false positives in our 50 hits is 0.0517996.
# use time as quantitative covariate
desTime <- model.matrix(~timeCont, designMat)
fitTime <- lmFit(transData, desTime )
fitTimeEb <- eBayes(fitTime)
ttTime <- topTable(fitTimeEb)
## Removing intercept from test coefficients
show(ttTime)
## logFC AveExpr t P.Value adj.P.Val
## 202586_at -0.04321190 9.045753 -11.559699 5.595299e-11 1.272203e-06
## 203201_at -0.04242793 10.057592 -10.121063 7.109181e-10 6.663751e-06
## 227559_at -0.05501598 8.182478 -9.958972 9.609875e-10 6.663751e-06
## 202769_at 0.04317888 12.095002 9.769107 1.373425e-09 6.663751e-06
## 226775_at -0.03828553 8.647287 -9.734894 1.465398e-09 6.663751e-06
## 213113_s_at -0.04650788 10.391574 -9.412004 2.721027e-09 1.031133e-05
## 202770_s_at 0.03591883 11.510308 9.272069 3.572582e-09 1.160426e-05
## 226226_at 0.04323136 9.561381 9.139895 4.630899e-09 1.316159e-05
## 202887_s_at 0.04197944 13.601392 8.972788 6.449452e-09 1.629347e-05
## 200810_s_at -0.03436033 11.350772 -8.828058 8.617559e-09 1.674623e-05
## B
## 202586_at 15.28331
## 203201_at 12.74567
## 227559_at 12.44384
## 202769_at 12.08602
## 226775_at 12.02105
## 213113_s_at 11.40035
## 202770_s_at 11.12709
## 226226_at 10.86660
## 202887_s_at 10.53391
## 200810_s_at 10.24274
numhitsTime <- nrow(topTable(fitTimeEb, p.value = cutoffP, n = Inf, adjust.method="none"))
## Removing intercept from test coefficients
numBHhitsTime <- nrow(topTable(fitTimeEb, p.value = cutoffPadj, n = Inf))
## Removing intercept from test coefficients
#will not be needed: don't keep big data
remove(ttTime)
remove(fitTimeEb)
remove(fitTime)
There are 958 probes with p-values smaller than 0.001. While there are 1451 probes with adjusted p-values smaller than 0.05.
# use time as quantitative covariate
desTimeTreatment <- model.matrix(~timeCont*Treatment, designMat)
fitTimeTreatment <- lmFit(transData, desTimeTreatment )
fitTimeTreatmentEb <- eBayes(fitTimeTreatment)
ttTimeTreatment <- topTable(fitTimeTreatmentEb)
## Removing intercept from test coefficients
show(ttTimeTreatment)
## timeCont Treatmentcontrol timeCont.Treatmentcontrol
## 214290_s_at 0.09410100 -0.16551666 -0.07394082
## 202870_s_at -0.08934809 0.22069666 0.06287376
## 208955_at -0.06812775 -0.17577142 0.05748477
## 203108_at 0.08038465 0.59479660 -0.11407096
## 218542_at -0.06617139 0.09262906 0.05928858
## 212281_s_at -0.06795005 -0.13169137 0.04787848
## 202954_at -0.08021917 -0.06430425 0.06099165
## 225687_at -0.06742085 -0.01686898 0.06235953
## 202779_s_at -0.06825002 0.01796472 0.04901065
## 203764_at -0.08326001 -0.08054327 0.08864904
## AveExpr F P.Value adj.P.Val
## 214290_s_at 12.36283 156.55408 2.659658e-14 6.047264e-10
## 202870_s_at 11.80425 133.84586 1.230797e-13 1.399232e-09
## 208955_at 10.09237 123.82646 2.621916e-13 1.987150e-09
## 203108_at 12.71756 101.30302 1.817024e-12 1.032842e-08
## 218542_at 11.21883 90.07826 5.573399e-12 2.534448e-08
## 212281_s_at 10.96036 86.82289 7.903859e-12 2.782995e-08
## 202954_at 11.74982 86.08671 8.567956e-12 2.782995e-08
## 225687_at 11.25114 84.17688 1.059449e-11 3.011088e-08
## 202779_s_at 12.15344 74.95279 3.157967e-11 7.978077e-08
## 203764_at 10.82074 73.08677 3.997812e-11 9.089826e-08
numhitsTimeTreatment <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "Treatmentcontrol"))
numBHhitsTimeTreatment <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffPadj, n = Inf, coef = "Treatmentcontrol"))
There are 621 probes with p-values smaller than 0.001. While there are 768 probes with adjusted p-values smaller than 0.05. This means that there are now 184 less probes with unadjusted p-value smaller than 0.001. While there are 470 less probes with adjusted p-values smaller than 0.05. This is because we are now controlling for the interaction between Time and Treatment. One mathematical way to understand it is that we added parameters because the model has now 4 parameters (before it had 2), some variance that was explained by the Treatment parameter is now explained by the Time and /or Time-Treatment interaction parameter. The treatment has therefore less weight! Note: there are also hits that were not present when cosidering only the Treatment but are present now, the reason behind it has to do with variance. Indeed, by adding new parameters we incereased the number of explained variance (i.e we decreased the residue \(\epsilon\)) this is easy to understand as the parameters were found to minimize \(\epsilon\), so if it didn’t decrease the parameters would have been put to 0. By decreasing the variance it means that some variables that “were closed to be significative” before can become significative.
namesHitTreatment <- rownames(topTable(fitTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "Treatmentcontrol"))
namesHitTimeTreatment <- rownames(topTable(fitTimeTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "Treatmentcontrol"))
nOverlap <- length(intersect(namesHitTreatment,namesHitTimeTreatment))
length(setdiff(namesHitTimeTreatment,namesHitTreatment))
## [1] 293
length(setdiff(namesHitTreatment,namesHitTimeTreatment))
## [1] 477
There are 328 overlaping probes, which represents 52.8180354 % of the probes now that we control the interaction variable.
pValueTreatment <- topTable(fitTreatmentEb, n = Inf, coef = "Treatmentcontrol")$P.Value
pValueTreatmentTime <- topTable(fitTimeTreatmentEb, n = Inf, coef = "Treatmentcontrol")$P.Value
plot(density(pValueTreatment), xlim=c(0.0,1), ylim=c(0.0,2.5), main="Density of model with and without interaction",col="darkblue"); par(new=TRUE); plot(density(pValueTreatmentTime),axes=FALSE, xlab='', ylab='', xlim=c(0.0,1), ylim=c(0.0,2.5), main="",col="red1"); legend("right", legend = c("Treatment-Time","Treatment"),text.col=c("red1","darkblue")); abline(v = 0.025, col = "gray10"); text(0.025,2.37, "0.05", col = "gray10", adj = c(-.2, -.1), cex=2)
Not suprisingly we see that the distribution of the P-value is very close for both models. We are also happy to see a spike around \(\alpha = 0.025\) which shows that the p-values are not unifromaly distributed. This would be the case if \(H0\) were true.
With the interaction model we are adding the term \(effect\_smoke*time\), which means that we are hypothesizing that the difference between gene expression of “smoke” and “control” samples change significantly when varying the time. One possible example might be that expression of “control” samples is always constant, while “smoke” samples start with the same expression (no effect yet) then change the expression (effect of tobacco on cells) and finally goes back to the “control” expression (effect disapears with time).
numhitsTimeTreatmentInteraction <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "timeCont:Treatmentcontrol"))
numBHhitsTimeTreatmentInteraction <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffPadj, n = Inf, coef = "timeCont:Treatmentcontrol"))
There are 573 probes with p-values smaller than 0.001. While there are 664 probes with adjusted p-values smaller than 0.05.
With the interaction model we are adding the term \(effect\_smoke*time\), which means that we are hypothesizing that the difference between gene expression of “smoke” and “control” samples change significantly when varying the time. One possible example might be that expression of “control” samples is always constant, while “smoke” samples start with the same expression (no effect yet) then change the expression (effect of tobacco on cells) and finally goes back to the “control” expression (effect disapears with time).
nameInteracHit <- rownames(topTable(fitTimeTreatmentEb, n = 4, sort.by = "p", coef = "timeCont:Treatmentcontrol"))
nameNoInteracHit <- rev(rownames(topTable(fitTimeTreatmentEb, n = Inf, sort.by = "P", coef = "timeCont:Treatmentcontrol")))[1:4]
dataInterac <- prepareData(nameInteracHit)
dataNoInterac <- prepareData(nameNoInteracHit)
gplotInterac <- ggplot(dataInterac, aes(x=timeCont,y=gExp,color=Treatment)) +
geom_point() +
facet_wrap(~ gene, ncol = 4) +
geom_smooth(method='lm', se=F) +
ggtitle("Interaction between treatment and time")
ggplotly(gplotInterac)
gplotNoInterac <- ggplot(dataNoInterac, aes(x=timeCont,y=gExp,color=Treatment)) +
geom_point() +
facet_wrap(~ gene, ncol = 4) +
geom_smooth(method='lm', se=F) +
ggtitle("Interaction between treatment and time")
ggplotly(gplotNoInterac)
#will not be needed: don't keep big data
remove(ttTimeTreatment)
remove(fitTreatmentEb)
remove(fitTimeTreatmentEb)
remove(fitTreatment)
remove(fitTimeTreatment)
Note: I’ve plotted the plots above with a continuous time, as the model was made using a continuous time variable.
# removes previous data that won't be necessary
remove(transData)
remove(designMat)
transData2 <- read.table("Data/GSE37599-data.tsv", sep="\t", header=TRUE, row.names = 1)
str(transData2)
## 'data.frame': 10928 obs. of 6 variables:
## $ b1: num 11.15 2.16 1.49 9.01 6.95 ...
## $ b2: num 6.8 3.18 1.43 9.46 6.9 ...
## $ b3: num 6.71 3.13 1.82 9.23 6.96 ...
## $ c1: num 10.95 2.5 1.46 8.97 6.85 ...
## $ c2: num 6.7 3.05 2.08 9.28 6.9 ...
## $ c3: num 11.07 2.44 1.62 9 6.89 ...
samples <- colnames(transData2)
if (!anyNA(transData2)) { print("There are no missing values in Data") }
## [1] "There are no missing values in Data"
The dataset contains the expression of:
# plot parameters
lowerFn <- function(data, mapping, method = "lm", ...) {
p <- ggplot(data = data, mapping = mapping) +
geom_point(colour = "blue") +
geom_smooth(method = method, color = "red", ...)
p
}
ggpairs(transData2,
upper = list(continuous = wrap("cor", size = 7)),
lower = list(continuous = wrap(lowerFn, method = "lm"))) +
ggtitle("Correlation scatter plot matrix") +
theme(plot.title = element_text(hjust = 0.5, size = 15),
legend.position = "none",
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
panel.border = element_rect(linetype = "dashed", colour = "black", fill = NA))
In order to find the switched label we can simply focus on one of the 2 condition (b or c) and try to find a replicate which seems very different than the others. To be more precise we can look at the correlation between the samples in order to find a sigle sample which isn’t as much correlated to the others. Note: due to high correlation of transcription data, it is highly unprobable to see a sample which isn’t correlated to the others we thus need to find one which is “less” correlated. Replicates should indeed be extrimely highly correlated as they (in a theoretically perfect experience and world) should have the same gene transcription.
Looking at the scatter plots for the batch replicates, we clearly see that b3 and b2 a pperfect straight line, while b1-b2 and b1-b3 seem to have a higher variability. We can thus conclude that b1 is the most different sample from the 3, this can be easily seen looking at the correlation values of 0.962 for b1-b2 and b1-b3 while it close to “perfect” for b2-b3 (0.998). In the chemostat replicates, the same can be seen for c2. We thus hypothesize that c2 and b1 have switched label. Note that if that were true we would also see a high correlation between c2-b2, c2-b3, b1-c1 and b1-c3. This is also important to check as it could also be that b1 and c2 are less highly coreelated but weren’t switched. Looking at the respective correlations of 0.998, 0.997, 0.998, 0.999 we see that this is true and that it is very probable that there has been a label switch.
First I will fit a linear model like in question 3.2 to find the top 100 hits. I will then plot the expression of each of these genes.
# fits linear model for finding top 100 genes
fitCondition <- lmFit(transData2)
fitConditionEb <- eBayes(fitCondition)
nHit = 100
hitsCondition <- topTable(fitConditionEb, n = nHit)
hitNames <- rev(rownames(hitsCondition))
gExpression <- as.matrix(transData2[hitNames,])
m <- list(l=150, r=50, b=50, t=60, pad=4)
# needs to force the non-alphabetical ordering
plot_ly(x=samples, y=hitNames, z = ~gExpression, type = "heatmap", colorscale = "Greys") %>%
layout(title = 'Heatmap: top100 genes transcription', margin=m)
#will not be needed: don't keep big data
remove(fitConditionEb)
remove(fitCondition)
Note: on the figure above we do not see much because the differences between genes are higher than between samples. Let us thus only look at the difference of each sample transcription compared to its respective gene average transcription (simply substracts corresponding gene average).
gExpressionDifference <- t(apply(gExpression, 1, function(x)(x-mean(x))))
m <- list(l=150, r=50, b=50, t=60, pad=4)
# needs to force the non-alphabetical ordering
xlabHits = list(categoryorder = "array", categoryarray = samples)
ylabHits = list(categoryorder = "array", categoryarray = hitNames)
plot_ly(x=samples, y=hitNames, z = ~gExpressionDifference, type = "heatmap", colorscale = "Greys") %>%
layout(title = 'Heatmap: top100 genes transcription compared to average', margin=m,xaxis = xlabHits, yaxis = ylabHits)
On that figure it is still pretty complicated to see that 2 samples were switched. Although c2 does seem to be different than c1 and c3. But I wouldn’t bet based on that figure :) .
# calculating sample-sample correlations, default is pearson but still specifies here
corelationMatrix <- cor(transData2, method = "pearson")
mypal <- rev(colorRampPalette(brewer.pal(11,"RdBu"))(100)) #setting colour palette
heatmap.2(corelationMatrix, symm = T, #making a heatmap
trace = "none", col = mypal,
cexCol = 2.77, cexRow = 2.77,
main = "Sample-Sample Correlations")
From the previous questions we know that the the samples that seem to be switched are b1 and c2. Here we see the same information as in the scatter plot matrix but with color. Note: as before we clearly see that b1 is closer to c1 and c3 than to the other b (vis versa for c2). Note: it is interesting to see that when the package heatmap.2 clusters teh samples it clusters b1 with the c’s and c2 with the b’s. The dendogramm also shows us that b1 is even closer to c3 than c1 to c3.
#compute PCs with normalizing
pcs <- prcomp(transData2, center = T, scale = T)
pcs2 <- as.data.frame(pcs$rotation[,c("PC1","PC2")])
g <- ggplot(pcs2, aes(x=PC1, y=PC2)) +
geom_point(aes(colour=samples), size=5) +
ggtitle("PCA") +
theme_minimal()
ggplotly(g)
Here again we clearly see that b1 and c2 seem to be swapped. Note that PC2 seem to be able distinguish between the type of sample (b or c). We should keep that in mind as it probably is a linear combinantion of most of genes that are of interest to us.
# swaps c2 and b1
fixedSamples = c("c2","b2","b3","c1","b1","c3")
transDataFix <- transData2[,fixedSamples]
#will not be needed: don't keep big data
remove(transData2)
First lets fit the linear model.
# Retrive gene ID
library("yeast2.db") # Load the yeast2.db package
gene.ids <- unlist(mget(rownames(transDataFix), yeast2ORF)) # See all the genes for the list of probes
# makes a design matrix
conditions <- c("batch","batch","batch","chemostat","chemostat","chemostat")
designMatFix <- data.frame(conditions)
row.names(designMatFix) <- fixedSamples
designCondition <- model.matrix(~ conditions, designMatFix)
# fits linear model
fitCondition <- lmFit(transDataFix, designCondition)
fitConditionEb <- eBayes(fitCondition)
ttCondition <- topTable(fitConditionEb)
show(ttCondition)
## logFC AveExpr t P.Value adj.P.Val B
## 1772391_at 10.111470 7.043828 108.92958 9.477652e-14 1.035718e-09 20.96284
## 1774122_at 8.473827 6.852455 89.86550 4.277079e-13 2.336996e-09 20.05204
## 1774070_at 8.990866 8.378757 80.69881 9.932237e-13 3.617983e-09 19.47328
## 1776153_at 7.304449 5.427876 72.70264 2.248189e-12 5.717049e-09 18.86696
## 1774072_at 8.577513 6.886731 71.30964 2.615780e-12 5.717049e-09 18.74990
## 1776304_at 8.081401 6.729502 69.09726 3.347614e-12 5.881184e-09 18.55620
## 1770580_at 8.444769 7.037351 67.77269 3.895175e-12 5.881184e-09 18.43543
## 1774019_at 8.926037 8.104420 66.44446 4.547787e-12 5.881184e-09 18.31054
## 1776285_at 4.895122 7.692760 65.91159 4.843582e-12 5.881184e-09 18.25934
## 1776791_at 6.053323 6.743367 61.70349 8.116417e-12 8.521536e-09 17.83138
Now lets simply put that in a data frame.
# Makes the asked data frame
ttCondition <- topTable(fitConditionEb, number=Inf, sort.by="none")
## Removing intercept from test coefficients
limmaDF <- data.frame(probe.id = rownames(transDataFix),
gene.id = gene.ids,
p.value = ttCondition$P.Value,
q.value = ttCondition$adj.P.Val,
log.fc = ttCondition$logFC,
test.stat = ttCondition$t)
#will not be needed: don't keep big data
remove(ttCondition)
remove(fitConditionEb)
remove(fitCondition)
# COunts differentially expressed genes before deleting rows
cutoffPadj <- 1e-5
numBHhitsConditionUnmatched <- nrow(limmaDF[limmaDF$q.value < cutoffPadj,])
# Delete rows that have NA in gene ID
limmaDF <- limmaDF[!is.na(limmaDF$gene.id),]
# saves data frame. Note: not really necessary as I'm not removing it, and everything is in the same doc
write.csv(limmaDF, file = "Data/limmaDF.csv",row.names = TRUE)
As already stated before, the total number of probes is 10928. Now, after having removed unmatched genes, the number of probes is only of 5705.
prepareData <- function(myGenes) {
miniDat <- t(transDataFix[myGenes, ])
miniDat <- suppressWarnings(data.frame(gExp = as.vector(miniDat),
gene = rep(colnames(miniDat), each = nrow(miniDat))))
miniDat <- suppressWarnings(data.frame(designMatFix, miniDat))
miniDat
}
# stores the prob id of the top hit
topHitProbID <- as.character(limmaDF[limmaDF$q.value == min(limmaDF$q.value), "probe.id"])
preparedTopHit <- prepareData(topHitProbID)
stripplot(gExp ~ conditions | gene, preparedTopHit,
jitter.data = TRUE,
auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
Here we clearly see that the top gene has a more expressed in the chemostat than in the batch condition.
Wanting a FDR of \(10^{-5}\) can simply be approximated by setting a threshold of the adjusted p-value at \(10^{-5}\).
cutoffPadj <- 1e-5
numBHhitsCondition <- nrow(limmaDF[limmaDF$q.value < cutoffPadj,])
There are 725 genes at a FDR of \(10^{-5}\). What is interested to remark is that before removing the unmatched genes there were only 7 more “differentially expressed genes”. This isn’t suprising at all as probes that aren’t overlapping with genes shouldn’t be differentially expressed. Their shouldn’t even be these 7 probs!
# removes previous data that won't be necessary
#remove(transDataFix)
#remove(designMatFix)
countData <- read.table("Data/stampy.deep.counts.tsv", sep="\t", header=TRUE, row.names = 1)
# makes a design matrix
samples <- colnames(countData)
conditions <- c("batch","batch","batch","chemostat","chemostat","chemostat")
designMatCount <- data.frame(conditions)
row.names(designMatCount) <- samples
designConditionCount <- model.matrix(~ conditions, designMatCount)
str(countData)
## 'data.frame': 6542 obs. of 6 variables:
## $ b1: int 11 40 31 63 112 17 0 4 5 0 ...
## $ b2: int 3 26 52 87 106 21 0 2 8 1 ...
## $ b3: int 5 10 40 53 60 15 0 2 1 0 ...
## $ c1: int 13 71 12 51 139 7 0 11 5 1 ...
## $ c2: int 5 41 18 78 142 6 0 0 5 0 ...
## $ c3: int 17 48 32 100 127 2 2 3 2 1 ...
if (!anyNA(countData)) { print("There are no missing values in Data") }
## [1] "There are no missing values in Data"
The dataset contains the RNA seq count data of:
6542 genes. Note before the rows were the probes on a microarray, here they are genes.
6 samples: there are 3 replicates of 2 different conditions in which the the yeast were grown (b* stands for batch replicate number* and c* stands for chemostat replicate number*)
As before we can check whether samples have been swapped with the heatmap of the correlation matrix.
# calculating sample-sample correlations, default is pearson but still specifies here
corelationMatrix <- cor(countData, method = "pearson")
mypal <- rev(colorRampPalette(brewer.pal(11,"RdBu"))(100)) #setting colour palette
heatmap.2(corelationMatrix, symm = T, #making a heatmap
trace = "none", col = mypal,
cexCol = 2.77, cexRow = 2.77,
main = "Sample-Sample Correlations")
On the contrary to before, nothing shows us that there might have meen a swap. Indeed each of the b’s and c’s seem to be clustered together.
First needs to prepare data for edgeR.
dge_obj <- DGEList(counts=countData, group=factor(conditions))
# normalize and filter based on cpm
dge_obj_cpm <- cpm(dge_obj)
# Manual said "a gene is required to have a count of 5-10" in a library to be considered expressed in that library". p.12
# compute smallest library size
min_lib_size_million <- min(dge_obj$samples$lib.size) / 1000000
# cpm of ~2 is equivalent to ~5 count in the smallest library
threshold <- 5/min_lib_size_million
keep <- rowSums(dge_obj_cpm > threshold)
# filter for gene expressed in at least all of the samples of one conditions
keep <- keep >=3
dge_obj_filtered <- dge_obj[keep, , keep.lib.sizes=FALSE] # keep.lib.sizes recalculates the library size.
head(dge_obj_filtered$samples)
## group lib.size norm.factors
## b1 batch 3419742 1
## b2 batch 4486378 1
## b3 batch 3414470 1
## c1 chemostat 2528577 1
## c2 chemostat 2989865 1
## c3 chemostat 3694270 1
#will not be needed: don't keep big data
remove(countData)
remove(dge_obj_cpm)
Out of curiosity let’s look at the difference in the library size for each sample due to the filtering:
filtered_lib <- (dge_obj$samples$lib.size - dge_obj_filtered$samples$lib.size)
head(data.frame(dge_obj$samples$group, filtered_lib))
## dge_obj.samples.group filtered_lib
## 1 batch 1109
## 2 batch 1207
## 3 batch 908
## 4 chemostat 994
## 5 chemostat 1091
## 6 chemostat 1402
We have removed 552 genes.
The sequence depth doesn’t need to be normalized for, as EdgeR will automatically adjust take that into account. But we do need to adjust for the fact that some some genes are over-expressed which can make other genes seem under-expressed after rescaling. We need to adjust for that using TMM (trimmed mean of M-values).
dge_obj_filtered_norm <- calcNormFactors(dge_obj_filtered)
head(dge_obj_filtered_norm$sample)
## group lib.size norm.factors
## b1 batch 3419742 1.0010227
## b2 batch 4486378 0.9799708
## b3 batch 3414470 0.9516236
## c1 chemostat 2528577 1.0369536
## c2 chemostat 2989865 1.0077786
## c3 chemostat 3694270 1.0250694
#will not be needed: don't keep big data
remove(dge_obj)
remove(dge_obj_filtered)
# Estimate trend-wise (all tag) dispersion and then tag-wise (one tag) dispersion in one go.
dge_obj_filtered_norm_disp <- estimateDisp(dge_obj_filtered_norm, designConditionCount)
plotBCV(dge_obj_filtered_norm_disp)
#will not be needed: don't keep big data
remove(dge_obj_filtered_norm)
From The plot above we see that the BCV trend is close to the common BCV. This thus shows that the following analysis with EdgeR will can be trusted, indeed the the common BCV that will be used isn’t to bad at representing the Tagwise BCV. Also most of gene-specific BCV cluster auround the trend which indicates that the estimated BCV can be used in edgeR. Note: a last good thing we see is that the common BCV is around 0.08 which is less than a typical BCV or genetically identicall models :0.1 ( source ). Having a low BCV means a low dispersion which means less variability and therefore a better statistical power.
Fit a negative binomial generalized linear model for each tag.
fit <- glmFit(dge_obj_filtered_norm_disp, designConditionCount)
lrt <- glmLRT(fit,coef="conditionschemostat")
# top genes
DEGenes <- topTags(lrt, n=Inf)
head(DEGenes$table)
## logFC logCPM LR PValue FDR
## YIL057C 10.019940 8.214836 2236.095 0 0
## YMR175W 9.362595 7.954929 1722.644 0 0
## YJR095W 9.355021 7.755606 2353.156 0 0
## YMR107W 9.009533 8.737205 2767.988 0 0
## YKL217W 8.512287 11.391069 2126.357 0 0
## YPL201C 7.589903 7.047413 1865.612 0 0
lattice::histogram(DEGenes$table$PValue)
# removes previous data that won't be necessary
remove(fit)
remove(lrt)
remove(dge_obj_filtered_norm_disp)
This plot is very suprizing: the number of genes that have a very low p-value is very big! Nearly 80 % this is crazy and tells us that there is many genes are diffenretialy expressed when in chemostat!
# makes the data fram that we were asked
edger.deep.results <- data.frame(gene.id = rownames(DEGenes$table),
p.value = DEGenes$table$PValue,
q.value = DEGenes$table$FDR,
log.fc = DEGenes$table$logFC,
test.stat = DEGenes$table$LR)
write.csv(edger.deep.results, file = "Data/edger.deep.results.csv",row.names = TRUE)
cutoffPadj <- 1e-5
numBHhitsConditionCount <- nrow(edger.deep.results[edger.deep.results$q.value < cutoffPadj,])
# removes previous data that won't be necessary
remove(DEGenes)
There are 2758 genes at a FDR of \(10^{-5}\). Which is close to half of the genes! That seems crazy again.
countData <- read.table("Data/stampy.low.counts.tsv", sep="\t", header=TRUE, row.names = 1)
dge_obj <- DGEList(counts=countData, group=factor(conditions))
# normalize and filter based on cpm
dge_obj_cpm <- cpm(dge_obj)
# Manual said "a gene is required to have a count of 5-10" in a library to be considered expressed in that library". p.12
# compute smallest library size
min_lib_size_million <- min(dge_obj$samples$lib.size) / 1000000
# cpm of ~2 is equivalent to ~5 count in the smallest library
threshold <- 5/min_lib_size_million
keep <- rowSums(dge_obj_cpm > threshold)
# filter for gene expressed in at least all of the samples of one conditions
keep <- keep >=3
dge_obj_filtered <- dge_obj[keep, , keep.lib.sizes=FALSE] # keep.lib.sizes recalculates the library size.
head(dge_obj_filtered$samples)
## group lib.size norm.factors
## b1 batch 67794 1
## b2 batch 89342 1
## b3 batch 67061 1
## c1 chemostat 47927 1
## c2 chemostat 57461 1
## c3 chemostat 70771 1
#will not be needed: don't keep big data
#remove(countData)
remove(dge_obj_cpm)
Here we see that the library size is a lot smaller than in 7.2 (as its supposed to be). Out of curiosity let’s look at the difference in the library size for each sample due to the filtering:
filtered_lib <- (dge_obj$samples$lib.size - dge_obj_filtered$samples$lib.size)
head(data.frame(dge_obj$samples$group, filtered_lib))
## dge_obj.samples.group filtered_lib
## 1 batch 9196
## 2 batch 11611
## 3 batch 8374
## 4 chemostat 7831
## 5 chemostat 9297
## 6 chemostat 11446
We have removed 4298 genes. Note: That is way more than before but doesn’t seem correct: we will always be removing more genes if the library size is smaller!
The sequence depth doesn’t need to be normalized for, as EdgeR will automatically adjust take that into account. But we do need to adjust for the fact that some some genes are over-expressed which can make other genes seem under-expressed after rescaling. We need to adjust for that using TMM (trimmed mean of M-values).
dge_obj_filtered_norm <- calcNormFactors(dge_obj_filtered)
head(dge_obj_filtered_norm$sample)
## group lib.size norm.factors
## b1 batch 67794 1.0265139
## b2 batch 89342 1.0227914
## b3 batch 67061 1.0015433
## c1 chemostat 47927 0.9927162
## c2 chemostat 57461 0.9792398
## c3 chemostat 70771 0.9782823
#will not be needed: don't keep big data
remove(dge_obj)
remove(dge_obj_filtered)
# Estimate trend-wise (all tag) dispersion and then tag-wise (one tag) dispersion in one go.
dge_obj_filtered_norm_disp <- estimateDisp(dge_obj_filtered_norm, designConditionCount)
plotBCV(dge_obj_filtered_norm_disp, cex=0.7, ylim=c(0, 0.5))
#will not be needed: don't keep big data
remove(dge_obj_filtered_norm)
From The plot above we see that the BCV trend is close to the common BCV when the log CPM is big but at low values it is pretty far away. In order to see the points I’ve made them a lot bigger, indeed with the default parameters we didn’t see them. The trend isn’t smooth and seems strange because there aren’t enough genes that weren’t filtered out, as result the trend simply joins each point. To be more correct we should probably be fitting an other spline which doesn’t overfit.
Fit a negative binomial generalized linear model for each tag.
fit <- glmFit(dge_obj_filtered_norm_disp, designConditionCount)
lrt <- glmLRT(fit,coef="conditionschemostat")
# top genes
DEGenes <- topTags(lrt, n=Inf)
head(DEGenes$table)
## logFC logCPM LR PValue FDR
## YOR374W 6.650904 12.27331 1399.5606 2.617691e-306 7.402829e-303
## YKL217W 8.718002 11.53380 1137.5732 2.252581e-249 3.185150e-246
## YBR072W 7.656596 11.38314 1032.7129 1.392397e-226 1.312566e-223
## YBL075C 6.989949 10.71024 730.8448 5.868076e-161 4.148730e-158
## YDR343C 5.354107 10.96674 722.9866 3.000759e-159 1.697229e-156
## YDR342C 4.788023 11.13263 717.8749 3.879329e-158 1.828457e-155
lattice::histogram(DEGenes$table$PValue)
# removes previous data that won't be necessary
remove(fit)
remove(lrt)
remove(dge_obj_filtered_norm_disp)
As before we see that there still is a lot of genes that “seem to be” differentially expressed between the condintions. The propotion of genes with an extremely low p-value is still lower than before of approximatively 20 %. Note: in reality the number of hits is way lower than before because here we look at the proportion, but a lot of genes have been filtered out. We thus conclude that low sequence deth results in more genes filtered out but also in a lower porportion of hits.
# makes the data fram that we were asked
edger.low.results <- data.frame(gene.id = rownames(DEGenes$table),
p.value = DEGenes$table$PValue,
q.value = DEGenes$table$FDR,
log.fc = DEGenes$table$logFC,
test.stat = DEGenes$table$LR)
write.csv(edger.deep.results, file = "Data/edger_low_results.csv",row.names = TRUE)
cutoffPadj <- 1e-5
numBHhitsConditionCount <- nrow(edger.low.results[edger.low.results$q.value < cutoffPadj,])
# removes previous data that won't be necessary
remove(DEGenes)
There are 470 genes at a FDR of \(10^{-5}\). Which is nearly 6 times less than before! This really shows us the loss of statistical power due to a lower sequencing depth.
cutoffPadj <- 1e-5
# finds gene id
hitsLow <- as.character(edger.low.results[edger.low.results$q.value < cutoffPadj,"gene.id"])
hitsDeep <- as.character(edger.deep.results[edger.deep.results$q.value < cutoffPadj,"gene.id"])
nGenesBoth <- sum(hitsLow %in% hitsDeep)
nGenesLow <- length(hitsLow)
nGenesDeep <- length(hitsDeep)
grid.newpage()
draw.pairwise.venn(nGenesDeep, nGenesLow, nGenesBoth, category = c("Deep", "Low"), lty = rep("blank",
2), fill = c("light blue", "pink"), alpha = rep(0.5, 2))
## (polygon[GRID.polygon.2226], polygon[GRID.polygon.2227], polygon[GRID.polygon.2228], polygon[GRID.polygon.2229], text[GRID.text.2230], text[GRID.text.2231], text[GRID.text.2232], text[GRID.text.2233])
We thus see from the image above that all the genes that were found in the low sequencing depth were also found in the deep one. We thus conclude that having a deeper depth only gives a higher statistical power, without “hiding” results. This seems pretty logical. In all the analysis there were 2758 genes that have been found (which corresponds to the number of genes in the “deep” experiment). While there has been 470 genes that were found (which corresponds to the number of genes in the “low” experiment).
Note for all th plots I’ve used the same axis limit so that you can compare between plots, but as a result it seems sometimes that the plot hasn’t the good scale.
# defines a function to draw side by side count and Expression
plotData <- function(myGenes) {
myProb <- as.character(limmaDF[limmaDF$gene.id == myGenes,"probe.id"])
# plot DEA
miniDat <- t(transDataFix[myProb, ])
miniDat <- suppressWarnings(data.frame(gExp = as.vector(miniDat),
gene = rep(colnames(miniDat), each = nrow(miniDat))))
miniDat <- suppressWarnings(data.frame(designMatFix, miniDat))
plotDEA <- stripplot(gExp ~ conditions | gene, miniDat ,
jitter.data = TRUE,
auto.key = TRUE, type = c('p', 'a'), grid = TRUE, ylim= c(6.5,13.3))
# plot RNA seq
miniDat <- t(countData[myGenes, ])
miniDat <- suppressWarnings(data.frame(count = as.vector(miniDat),
gene = rep(colnames(miniDat), each = nrow(miniDat))))
miniDat <- suppressWarnings(data.frame(designMatCount, miniDat))
plotRNA <- stripplot(count ~ conditions | gene, miniDat ,
jitter.data = TRUE,
auto.key = TRUE, type = c('p', 'a'), grid = TRUE, ylim= c(0,120))
# plot side by side
print(plotDEA, position = c(0, 0, 0.5, 1), more = TRUE)
print(plotRNA, position = c(0.5, 0, 1, 1))
}
# defines genes that will be needed
hitsRNAseq <- as.character(edger.low.results[edger.low.results$q.value < cutoffPadj,"gene.id"])
hitsDEA <- as.character(limmaDF[limmaDF$q.value < cutoffPadj,"gene.id"])
notHitsRNAseq <- as.character(edger.low.results[edger.low.results$q.value > cutoffPadj,"gene.id"])
notHitsDEA <- as.character(limmaDF[limmaDF$q.value > cutoffPadj,"gene.id"])
isRNAinDEA <- (hitsRNAseq %in% hitsDEA)
isNotDEAinRNA <- (notHitsDEA %in% hitsRNAseq )
isNotRNAinDEA <- (notHitsRNAseq %in% hitsDEA)
isNotRNAinNotDEA <- (notHitsRNAseq %in% notHitsDEA)
# finds a gene in both
geneBoth <- head(hitsRNAseq[isRNAinDEA],20)
# plots 1
plotData(geneBoth[19])
# plots 2
plotData(geneBoth[20])
# finds a gene only DEA
geneNotRNA <- head(notHitsRNAseq[isNotRNAinDEA],1)
# plots
plotData(geneNotRNA)
# finds a gene in both
geneNotDEA <- head(notHitsDEA[isNotDEAinRNA],1)
# plots
plotData(geneNotDEA)
# finds a gene in both
geneNeither <- head(notHitsRNAseq[isNotRNAinNotDEA],1)
# plots
plotData(geneNeither)