Question 1: Data inspection

1.1 Download and inspect the data

First let's set up the environment: load the data and the useful packages.

# Packages
library(limma)
library(edgeR)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(car) 
library(lattice) 
library(GGally)
library(RColorBrewer)
library(gplots)
library(DESeq2)
library(viridis)
library(VennDiagram)

# makes everything reproducible
set.seed(1234) 

transData <- read.table("Data/NHBE_transcriptome_data.txt", sep="\t", header=TRUE, row.names = 1)
designMat <- read.table("Data/NHBE_design.txt", sep="\t", header=TRUE, row.names = 1)

prepareData <- function(myGenes) {
    miniDat <- t(transData[myGenes, ])
    miniDat <- suppressWarnings(data.frame(gExp = as.vector(miniDat),
                          gene = rep(colnames(miniDat), each = nrow(miniDat))))
    miniDat <- suppressWarnings(data.frame(designMat, miniDat))
    miniDat
}
str(transData)
## 'data.frame':    22737 obs. of  23 variables:
##  $ GSE10718_Biomat_1 : num  7.9 6.13 6.82 6.63 6.95 ...
##  $ GSE10718_Biomat_10: num  7.62 5.85 7.62 5.7 7.51 ...
##  $ GSE10718_Biomat_11: num  8.22 6.3 7.22 6.22 7.29 ...
##  $ GSE10718_Biomat_12: num  8.01 5.91 6.17 6.06 7.95 ...
##  $ GSE10718_Biomat_13: num  7.61 6.62 7.2 6.89 7.87 ...
##  $ GSE10718_Biomat_14: num  7.45 6.82 7.4 7.02 7.55 ...
##  $ GSE10718_Biomat_15: num  7.41 6.88 6.78 6.59 7.27 ...
##  $ GSE10718_Biomat_16: num  7.67 6.83 6.99 6.45 7.56 ...
##  $ GSE10718_Biomat_17: num  7.89 6.45 5.85 6.18 7.24 ...
##  $ GSE10718_Biomat_19: num  7.8 6.66 6.83 6.19 6.96 ...
##  $ GSE10718_Biomat_2 : num  7.72 6.57 6.67 6.74 7.05 ...
##  $ GSE10718_Biomat_20: num  7.55 5.81 6.28 5.57 8.04 ...
##  $ GSE10718_Biomat_21: num  6.34 7.06 7.23 5.77 7.62 ...
##  $ GSE10718_Biomat_22: num  7.8 6.49 6.75 5.59 7.32 ...
##  $ GSE10718_Biomat_23: num  7.95 7.08 7.11 7.14 7.83 ...
##  $ GSE10718_Biomat_24: num  7.88 6.24 6.97 6.34 7.03 ...
##  $ GSE10718_Biomat_3 : num  7.51 6.38 6.98 6.73 7.06 ...
##  $ GSE10718_Biomat_4 : num  7.31 6.89 7.62 6.48 8.46 ...
##  $ GSE10718_Biomat_5 : num  8 7.1 7.29 5.72 7.73 ...
##  $ GSE10718_Biomat_6 : num  7.5 6.42 7.68 6.28 7.54 ...
##  $ GSE10718_Biomat_7 : num  7.5 6.27 7.12 6.63 8.17 ...
##  $ GSE10718_Biomat_8 : num  7.51 6.53 6.61 6.52 7.67 ...
##  $ GSE10718_Biomat_9 : num  7.74 6.89 6.71 7.06 7.42 ...
show(designMat)
##                    ExternalID       Treatment time
## GSE10718_Biomat_1   GSM270883         control 24_h
## GSE10718_Biomat_10  GSM270874         control  1_h
## GSE10718_Biomat_11  GSM270873         control  1_h
## GSE10718_Biomat_12  GSM270872         control  1_h
## GSE10718_Biomat_13  GSM270881         control  4_h
## GSE10718_Biomat_14  GSM270880         control  4_h
## GSE10718_Biomat_15  GSM270879         control  4_h
## GSE10718_Biomat_16  GSM270887 cigarette_smoke  1_h
## GSE10718_Biomat_17  GSM270886 cigarette_smoke  1_h
## GSE10718_Biomat_19  GSM270889 cigarette_smoke  2_h
## GSE10718_Biomat_2   GSM270882         control 24_h
## GSE10718_Biomat_20  GSM270888 cigarette_smoke  2_h
## GSE10718_Biomat_21  GSM270890 cigarette_smoke  2_h
## GSE10718_Biomat_22  GSM270891 cigarette_smoke  4_h
## GSE10718_Biomat_23  GSM270892 cigarette_smoke  4_h
## GSE10718_Biomat_24  GSM270893 cigarette_smoke  4_h
## GSE10718_Biomat_3   GSM270884         control 24_h
## GSE10718_Biomat_4   GSM270894 cigarette_smoke 24_h
## GSE10718_Biomat_5   GSM270895 cigarette_smoke 24_h
## GSE10718_Biomat_6   GSM270896 cigarette_smoke 24_h
## GSE10718_Biomat_7   GSM270878         control  2_h
## GSE10718_Biomat_8   GSM270876         control  2_h
## GSE10718_Biomat_9   GSM270875         control  2_h
if(all(rownames(designMat) == colnames(transData))) {print("The design matrix and data agree")}
## [1] "The design matrix and data agree"
if (!anyNA(transData)) { print("There are no missing values in Data") }
## [1] "There are no missing values in Data"

The dataset contains the expression of:

  • 22737 genes
  • 23 samples. For each of the 2*4 = 8 treatment-time combinations there should be 3 samples; one "cigarette_smoke" "1_h" sample is missing, hence 23 rather than 24 (see the cross-tabulation below).

The experimental factors are:

  • Treatment: cigarette_smoke or control, i.e. whether the cells were exposed to smoke or to air ("mock control").
  • time: 1_h, 2_h, 4_h or 24_h, i.e. the time after which the transcriptome alterations were assessed with microarrays.
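
As a quick sanity check of the replicate counts, we can cross-tabulate the two factors using the designMat loaded above:

# every cell should contain 3, except cigarette_smoke / 1_h which contains 2
table(designMat$Treatment, designMat$time)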

1.2 Basic data manipulation

Let's add a quantitative time variable to the design matrix.

designMat$timeCont <- car::recode(designMat$time, 
                         "'1_h'=1; '2_h'=2; '4_h'=4; '24_h'=24", 
                         as.factor.result = FALSE)
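
A quick check that the recoding did what we expect:

# each time label should map to exactly one numeric value
table(designMat$time, designMat$timeCont)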

1.3 Basic graphing

First let's plot everything in a single graph.

randomRowNumber <- sample(1:nrow(transData), 1)
randomGeneName <- rownames(transData)[[randomRowNumber]]
# aggregates the meta data and data, using the function coded above
randGene <- prepareData(randomGeneName)
plotRandGene <- ggplot(randGene,aes(x=Treatment,y=gExp,color=factor(timeCont)))+ 
  geom_point(aes(stroke=1.3)) +
  theme_minimal() +
  stat_summary(fun.y = "mean", aes(colour = "mean"), size = 3, geom="point", shape=3) +
  scale_color_manual("Time since exposure (hours)", values = c("1"="#e31a1c","2"="#fd8d3c","4"="#fecc5c","24"="#ffffb2","mean"="black")) +
  ggtitle("Differential expression due to tabacco") +
  ylab("Gene expression (relative)")

ggplotly(plotRandGene) 

We could also facet by time point: this makes it easier to compare the effect of the treatments at each time point, but harder to compare between time points.

label_names <- c(
                    "1" = "After 1 hour",
                    "2" = "After 2 hours",
                    "4" = "After 4 hours",
                    "24" = "After 24 hours"
                    )

ggplot(randGene,aes(x=Treatment,y=gExp,color=factor(timeCont)))+ 
  geom_point(aes(stroke=2.1)) +
  stat_summary(fun.y = "mean", aes(colour = "mean"), size = 4, geom="point", shape=3) +
  scale_color_manual("Time (hours)", values = c("1"="#e31a1c","2"="#fd8d3c","4"="#fecc5c","24"="#ffffb2","mean"="black")) +
  ggtitle("Differential expression due to tabacco with respect to time since last exposure") + 
  theme(plot.title = element_text(hjust = 0.5, size = 15)) +
  facet_wrap(~ timeCont, ncol = 4, labeller=as_labeller(label_names)) +
  ylab("Gene expression (relative)") 

Note: For both plots I've written that the gene expression is in a relative unit. In reality the values are log2-normalized fluorescence intensities, which are unitless.

Question 2: Assessing data quality

2.1 Examine the sample-to-sample correlations in a heatmap

Note: In both heat maps below, I decided to take out the diagonal as it will always be filled with ones. This enables us to have a more informative color gradient.

# function that will be used to extract the last 2 digits of ExternalID
substrRight <- function(x){
  c <- as.character(x)
  substr(c, nchar(c)-1, nchar(c))
}

# REORDERING
# here orders by time first and then treatment
indexTimeTreatment <- order(designMat$timeCont, designMat$Treatment)
designTimeTreatment <- designMat[indexTimeTreatment,]
sampleNameTimeTreatment <- rownames(designTimeTreatment)

correlationTimeTreatment <- cor(transData[,sampleNameTimeTreatment])
treatmentShort <- as.character(designTimeTreatment$Treatment)
treatmentShort <- replace(treatmentShort,treatmentShort=="cigarette_smoke","smoke")
niceLabelTimeTreatment <- paste(designTimeTreatment$time, treatmentShort, substrRight(designTimeTreatment$ExternalID))
# sets the diagonal to empty because will always be 1 and want to be able to see
# small differences between other samples
diag(correlationTimeTreatment) = NA

m <- list(l=150, r=50, b=150, t=60, pad=4)
# needs to force the non-alphabetical ordering
xylabTimeTreatment = list(categoryorder = "array", categoryarray = niceLabelTimeTreatment)
plot_ly(x = niceLabelTimeTreatment, y=niceLabelTimeTreatment, z = ~correlationTimeTreatment, type = "heatmap", colorscale = "Greys") %>% 
  layout(title = 'Correlation heatmap: ordering by time and then treatment', margin=m,xaxis = xylabTimeTreatment, yaxis = xylabTimeTreatment)
# REORDERING
# here orders by treatment first and then by time
indexTreatmentTime <- order(designMat$Treatment, designMat$timeCont)
designTreatmentTime <- designMat[indexTreatmentTime,]
sampleNameTreatmentTime <- rownames(designTreatmentTime)

correlationTreatmentTime <- cor(transData[,sampleNameTreatmentTime])
treatmentShort <- as.character(designTreatmentTime$Treatment)
treatmentShort <- replace(treatmentShort,treatmentShort=="cigarette_smoke","smoke")
niceLabelTreatmentTime <- paste(designTreatmentTime$time, treatmentShort, substrRight(designTreatmentTime$ExternalID))
# sets the diagonal to empty because will always be 1 and want to be able to see
# small differences between other samples
diag(correlationTreatmentTime) = NA

m <- list(l=150, r=50, b=150, t=60, pad=4)
# needs to force the non-alphabetical ordering
xylabTreatmentTime = list(categoryorder = "array", categoryarray = niceLabelTreatmentTime)
plot_ly(x = niceLabelTreatmentTime, y=niceLabelTreatmentTime, z = ~correlationTreatmentTime, type = "heatmap", colorscale = "Greys") %>% 
  layout(title = 'Correlation heatmap: ordering by treatment and then time', margin=m,xaxis = xylabTreatmentTime, yaxis = xylabTreatmentTime)

We see that every correlation is very high, approximately in [0.88, 0.94]. This high correlation is not very surprising: it means that a gene that is relatively highly expressed in one sample tends to be relatively highly expressed in every sample. More precisely (the default correlation in R is Pearson), the gene expression profiles of different samples are close to linearly dependent, with a positive slope. This is very probably due to a confounding variable, which is simply the mean expression of each gene.
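
To support this claim, here is a minimal sketch: if the uniformly high correlations are driven by each gene's mean expression, then centering every gene (subtracting its row mean) should shrink the sample-sample correlations considerably.

# row-centers the expression matrix and recomputes sample-sample correlations
exprMat <- as.matrix(transData)
centeredCor <- cor(exprMat - rowMeans(exprMat))
summary(centeredCor[lower.tri(centeredCor)])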

In the first heatmap, we see a pattern of 3*3 squares (not perfect, but distinguishable). This is not surprising, as the axes are ordered as 3 "smoke" samples, then 3 "control" samples, then 3 "smoke" samples, and so on. These square patterns tend to be more blue when comparing 3 "control" samples with 3 "smoke" samples, and more red otherwise. This indicates that the Treatment is very important and seems to matter more than the time. It is interesting to notice that these patterns do not seem to be present for the "1_h smoke" samples. We could thus hypothesize that the smoke hadn't yet had the chance to take effect on these samples.

In the second heatmap, we distinguish four squares relatively well, one at each corner. As before, this is not surprising, as they show the effect of the Treatment. As before, the top-left and bottom-right squares (different treatments) are more blue, while the two others (same treatment) are more red. This is very clear if we ignore the "1_h smoke" samples (which I encourage you to do, as the heatmap is interactive). A new piece of information, which was hard to see before, is that all the control samples are very highly correlated with one another (on the other heatmap the difference between control-control and smoke-smoke was harder to see).

2.2 Assess the presence of outlier samples

One interesting thing we can catch sight of quickly is that "1_h control 74" has a very low correlation with all the other samples. This could indicate a possible problem with that sample (an outlier). We can also see 2 other samples with low overall correlation with the other samples ("24_h smoke 94" and "24_h smoke 95"). In spite of their low correlations with the other samples, they seem highly correlated with each other and with the third 24_h sample, "24_h smoke 96". Contrary to the first "outlier" we described (which could simply be an error), these 2 seem very interesting! On the second heatmap we also notice that "24_h smoke 96" seems surprisingly highly correlated with every other sample. It could be worth keeping that sample in mind and maybe also testing its outlierness.

To look more closely at the outlierness of these samples, let’s have a look at the kernel density plot:

Note: I'm keeping the diagonal empty, as it will always be 1 and does not give any valuable information. Keeping it could needlessly increase the number of false negatives for smaller sample sizes (it shouldn't change anything noticeable for 23).

# plots kernel density
correlationMeans <- rowMeans(correlationTreatmentTime, na.rm = TRUE)
plot(density(correlationMeans), 
  main = "Average correlation with other samples") 
rug(correlationMeans, lwd = 1.5, ticksize = 0.2)
rug(min(correlationMeans), col = "red", lwd = 2.5, ticksize = 0.2)

On the density plot we do see that one of the samples (“1_h control 74”) could be an outlier, although it doesn’t seem shocking enough to remove it.

Note: Here we are only comparing each sample's mean correlation to the overall mean correlation. This doesn't take into account an important piece of information: the samples are grouped by 3 (or 2). Let's also plot the groups:

niceLabelNoId <- unique(paste(designTreatmentTime$time, treatmentShort))
# plots kernel density
plot(density(correlationMeans), 
  main = "Average correlation with other samples") 
iOld = 1
k = 1
for (i in seq(from = 2, to = 23, by = 3)) {
  # plots ticks by group (groups are contiguous after the reordering above)
  rug(correlationMeans[iOld:i], col = rainbow(8)[k], lwd = 2.5, ticksize = 0.2)
  # moves to the first index of the next group
  iOld = i + 1
  # color counter
  k = k + 1
}
# adds manual legend
par(new=TRUE)
plot(1, type="n", axes=FALSE, xlab="", ylab="")
legend(1, 1, legend = niceLabelNoId, col=rainbow(8),lwd=3, cex=0.9, xjust=-1.5, yjust=0)

This plot is very interesting because we now see the group and population correlation averages. Now sample "1_h control 74" seems very likely to be an outlier, as it is very different from the 2 other "1_h control" samples.

To test this hypothesis quantitatively (although without taking the group averages into account; see the other techniques below for methods that would), let's run a t-test comparing each possible outlier with all the other samples.

Note: We have to use var.equal = T (not the Welch test) because we cannot compute \(s_x\) when one "group" contains a single sample. The equal-variance assumption seems plausible here, though. Note also that this t-test assumes the observations are identically distributed, which is not true, as we know there are sub-groups. This would be better handled with, for example, a Bayesian hierarchical model (see below).

alpha = 0.95 # confidence level
for (i in c(1:23)){
  testOutlierT <- t.test(correlationMeans[i], correlationMeans, alternative = "two.sided", conf.level = alpha, var.equal = T)
  # if 0 is not in the confidence interval
  if(sign(testOutlierT$conf.int[1])  == sign(testOutlierT$conf.int[2])){
    print(paste(niceLabelTreatmentTime[i], "is significantly different from the others, with p-value =", testOutlierT$p.value))
  }
}
## [1] "1_h control 74 is significantly different from the others, with p-value = 0.0421846532518661"

We therefore see that even without taking the group structure into account (which would make "1_h control 74" look even more extreme), it already qualifies as an outlier.

Other techniques that I could have used:

  • Assume that each sample lives in a 23-dimensional space (one dimension for the correlation with each other sample) and is drawn from a multivariate Gaussian. We could fit that Gaussian and compute the probability of each sample under it; a sample would be considered an outlier if its probability under the multivariate Gaussian falls below a threshold. Problem: the Gaussian distribution is not robust to outliers, and this model would not take into account the fact that there are 8 sub-groups that should be very similar.

  • Use a dimensionality-reduction algorithm and "see" whether there are outliers, e.g. t-SNE or PCA (a quick sketch of the PCA idea follows this list). Problem: PCA is based on squared error and is thus not robust. t-SNE could be a good idea! These 2 methods would still not take into account the fact that there are 8 sub-groups that should be very similar.

  • Fit an empirical Bayesian hierarchical model with parameters \(\phi\) and \(\theta\), where we can assume the conditional factorization \(p(x, \theta, \phi) = p(x|\theta)\,p(\theta|\phi)\,p(\phi)\), with \(\theta\) the parameter of the sample's group (8 groups: same treatment and same "time") and \(\phi\) the parameter of the distribution of all the samples. Finally, compute each sample's probability under these priors. This would probably be the best way, provided we use a robust distribution such as the Laplace or t distribution.
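
Here is a minimal sketch of the dimensionality-reduction idea (exploratory only, not a formal test; it reuses the substrRight helper defined above to label the points):

# projects the 23 samples onto the first two principal components and looks
# for points sitting far from the rest
pcaSamples <- prcomp(t(transData), center = TRUE, scale. = FALSE)
plot(pcaSamples$x[, 1:2], type = "n", main = "Samples in PC space")
text(pcaSamples$x[, 1], pcaSamples$x[, 2],
     labels = substrRight(designMat$ExternalID))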

Question 3: Differential expression with respect to treatment

3.1 Linear model

desTreatment <- model.matrix(~Treatment, designMat)
fitTreatment <- lmFit(transData, desTreatment )
fitTreatmentEb <- eBayes(fitTreatment)
ttTreatment <- topTable(fitTreatmentEb)
## Removing intercept from test coefficients
show(ttTreatment)
##                  logFC   AveExpr         t      P.Value    adj.P.Val
## 200779_at   -0.7613410 13.983184 -8.577428 1.419183e-08 0.0001750755
## 202912_at   -1.7199499 12.764302 -8.161121 3.365296e-08 0.0001750755
## 214696_at   -1.9074941 11.730937 -8.155234 3.407195e-08 0.0001750755
## 223394_at   -1.2644771 10.909886 -8.147206 3.465212e-08 0.0001750755
## 223774_at   -1.1696606  9.529931 -8.080341 3.989741e-08 0.0001750755
## 209020_at   -1.0223766 10.677979 -7.939997 5.373691e-08 0.0001750755
## 202672_s_at -3.5302487  9.716310 -7.904577 5.795528e-08 0.0001750755
## 220468_at   -2.4132680  6.488711 -7.876047 6.160022e-08 0.0001750755
## 223982_s_at -0.9256205 10.907060 -7.729633 8.438527e-08 0.0001933190
## 226924_at   -0.6366539  9.588851 -7.726141 8.502396e-08 0.0001933190
##                    B
## 200779_at   9.623149
## 202912_at   8.840436
## 214696_at   8.829176
## 223394_at   8.813808
## 223774_at   8.685431
## 209020_at   8.413709
## 202672_s_at 8.344646
## 220468_at   8.288875
## 223982_s_at 8.000666
## 226924_at   7.993752

I decided to use the \(ref + effect\) parameterization and not the cell-means one because I find the p-value it computes the more natural one. In plain English: the model takes "cigarette_smoke" as the reference, so all expression values are first modelled by the mean of the "cigarette_smoke" samples; an additional effect parameter is then added for the "control" samples only. These 2 parameters give us the model; the difference between the fitted and the observed expression values is the residual \(\epsilon\). The linear model is fitted by minimizing the sum of squared residuals (setting the derivative to 0).

More formally, the fitted model is \[Y_{ij} = \theta + \tau_j + \epsilon_{ij}\] which can be written in matrix form \[\begin{align*} y&=X \alpha + \epsilon \\ \left( \begin{matrix} y_{11}\\ \vdots\\ y_{n1}\\ y_{12}\\ \vdots\\ y_{n2} \end{matrix} \right) &= \left( \begin{matrix} 1 & 0\\ \vdots & \vdots \\ 1 & 0 \\ 1 & 1\\ \vdots & \vdots \\ 1 & 1 \end{matrix} \right) * \left( \begin{matrix} mean\_smoke \\ \delta_{control - smoke} \end{matrix} \right) + \left( \begin{matrix} \epsilon_{11}\\ \vdots\\ \epsilon_{n1}\\ \epsilon_{12}\\ \vdots\\ \epsilon_{n2} \end{matrix} \right) \end{align*}\]

Here the adjusted p-value therefore tests whether the expression difference between "smoke" samples and "control" samples is statistically significant; i.e. the null hypothesis \(H_0\) is that the means of the control samples and the smoke samples are equal. The adjustment uses the Benjamini-Hochberg method (important because of the huge number of tests). The adjusted p-value therefore gives the proportion of false positives we expect to get if we call this feature significant (i.e. it targets the FDR).
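
As a sanity check of this interpretation, a minimal sketch: for the top probe above, the fitted "Treatmentcontrol" coefficient should simply be the difference of the two group means.

# difference of group means for probe 200779_at
topProbe <- "200779_at"
controlSamples <- rownames(designMat)[designMat$Treatment == "control"]
smokeSamples <- rownames(designMat)[designMat$Treatment == "cigarette_smoke"]
mean(unlist(transData[topProbe, controlSamples])) -
  mean(unlist(transData[topProbe, smokeSamples]))
# should be ~ -0.76, matching the logFC of 200779_at in the table above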

3.2 Look at the hits

cutoffP <- 1e-03
cutoffPadj <- 0.05

numhitsTreatment <- nrow(topTable(fitTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none"))
## Removing intercept from test coefficients
numBHhitsTreatment <- nrow(topTable(fitTreatmentEb, p.value = cutoffPadj, n = Inf))
## Removing intercept from test coefficients

There are 805 probes with p-values smaller than 0.001, and 1238 probes with adjusted p-values smaller than 0.05.

nHit = 50
hitsTreatment <- topTable(fitTreatmentEb, n = nHit)
## Removing intercept from test coefficients
hitNames <- rev(rownames(hitsTreatment))
gExpression <- as.matrix(transData[hitNames,sampleNameTreatmentTime])

m <- list(l=150, r=50, b=150, t=60, pad=4)
# needs to force the non-alphabetical ordering
xlabHits = list(categoryorder = "array", categoryarray = niceLabelTreatmentTime)
ylabHits = list(categoryorder = "array", categoryarray = hitNames)
plot_ly( x=niceLabelTreatmentTime , y=~hitNames ,z = ~gExpression, type = "heatmap", colorscale = "Greys") %>% 
  layout(title = 'Heatmap: top50 genes transcription' , margin=m,xaxis = xlabHits, yaxis = ylabHits)

By definition \(FDR = \frac{F}{S}\), so \(F = S \cdot FDR\), where \(FDR\) is the false discovery rate threshold used (i.e. the maximum adjusted p-value among the hits) and \(S\) is the number of genes called significant (here, the 50 hits).

FDRhit <- max(hitsTreatment$adj.P.Val)
estimatedF = nHit * FDRhit

We therefore conclude that the estimated false discovery rate is 0.001036 and that the expected number of false positives among our 50 hits is 0.0517996.

Question 4: Differential expression with respect to time

4.1 Linear model

# use time as quantitative covariate
desTime <- model.matrix(~timeCont, designMat)
fitTime <- lmFit(transData, desTime )
fitTimeEb <- eBayes(fitTime)
ttTime <- topTable(fitTimeEb)
## Removing intercept from test coefficients
show(ttTime)
##                   logFC   AveExpr          t      P.Value    adj.P.Val
## 202586_at   -0.04321190  9.045753 -11.559699 5.595299e-11 1.272203e-06
## 203201_at   -0.04242793 10.057592 -10.121063 7.109181e-10 6.663751e-06
## 227559_at   -0.05501598  8.182478  -9.958972 9.609875e-10 6.663751e-06
## 202769_at    0.04317888 12.095002   9.769107 1.373425e-09 6.663751e-06
## 226775_at   -0.03828553  8.647287  -9.734894 1.465398e-09 6.663751e-06
## 213113_s_at -0.04650788 10.391574  -9.412004 2.721027e-09 1.031133e-05
## 202770_s_at  0.03591883 11.510308   9.272069 3.572582e-09 1.160426e-05
## 226226_at    0.04323136  9.561381   9.139895 4.630899e-09 1.316159e-05
## 202887_s_at  0.04197944 13.601392   8.972788 6.449452e-09 1.629347e-05
## 200810_s_at -0.03436033 11.350772  -8.828058 8.617559e-09 1.674623e-05
##                    B
## 202586_at   15.28331
## 203201_at   12.74567
## 227559_at   12.44384
## 202769_at   12.08602
## 226775_at   12.02105
## 213113_s_at 11.40035
## 202770_s_at 11.12709
## 226226_at   10.86660
## 202887_s_at 10.53391
## 200810_s_at 10.24274
numhitsTime <- nrow(topTable(fitTimeEb, p.value = cutoffP, n = Inf, adjust.method="none"))
## Removing intercept from test coefficients
numBHhitsTime <- nrow(topTable(fitTimeEb, p.value = cutoffPadj, n = Inf))
## Removing intercept from test coefficients
#will not be needed: don't keep big data
remove(ttTime)
remove(fitTimeEb)
remove(fitTime)

There are 958 probes with p-values smaller than 0.001, and 1451 probes with adjusted p-values smaller than 0.05.

Question 5: Differential expression analysis with a full model

5.1 Quantify the number of hits for treatment

# full model: continuous time, treatment, and their interaction
desTimeTreatment <- model.matrix(~timeCont*Treatment, designMat)
fitTimeTreatment <- lmFit(transData, desTimeTreatment )
fitTimeTreatmentEb <- eBayes(fitTimeTreatment)
ttTimeTreatment <- topTable(fitTimeTreatmentEb)
## Removing intercept from test coefficients
show(ttTimeTreatment)
##                timeCont Treatmentcontrol timeCont.Treatmentcontrol
## 214290_s_at  0.09410100      -0.16551666               -0.07394082
## 202870_s_at -0.08934809       0.22069666                0.06287376
## 208955_at   -0.06812775      -0.17577142                0.05748477
## 203108_at    0.08038465       0.59479660               -0.11407096
## 218542_at   -0.06617139       0.09262906                0.05928858
## 212281_s_at -0.06795005      -0.13169137                0.04787848
## 202954_at   -0.08021917      -0.06430425                0.06099165
## 225687_at   -0.06742085      -0.01686898                0.06235953
## 202779_s_at -0.06825002       0.01796472                0.04901065
## 203764_at   -0.08326001      -0.08054327                0.08864904
##              AveExpr         F      P.Value    adj.P.Val
## 214290_s_at 12.36283 156.55408 2.659658e-14 6.047264e-10
## 202870_s_at 11.80425 133.84586 1.230797e-13 1.399232e-09
## 208955_at   10.09237 123.82646 2.621916e-13 1.987150e-09
## 203108_at   12.71756 101.30302 1.817024e-12 1.032842e-08
## 218542_at   11.21883  90.07826 5.573399e-12 2.534448e-08
## 212281_s_at 10.96036  86.82289 7.903859e-12 2.782995e-08
## 202954_at   11.74982  86.08671 8.567956e-12 2.782995e-08
## 225687_at   11.25114  84.17688 1.059449e-11 3.011088e-08
## 202779_s_at 12.15344  74.95279 3.157967e-11 7.978077e-08
## 203764_at   10.82074  73.08677 3.997812e-11 9.089826e-08
numhitsTimeTreatment <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "Treatmentcontrol"))
numBHhitsTimeTreatment <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffPadj, n = Inf, coef = "Treatmentcontrol"))

There are 621 probes with p-values smaller than 0.001, and 768 probes with adjusted p-values smaller than 0.05. There are thus 184 fewer probes with unadjusted p-values below 0.001, and 470 fewer probes with adjusted p-values below 0.05. This is because we are now also modelling time and the time-treatment interaction. One mathematical way to understand it: the model now has 4 parameters (before it had 2), so some variance that was explained by the Treatment parameter is now explained by the time and/or time-treatment interaction parameters. The treatment therefore carries less weight! Note that there are also hits that were not present when considering only the Treatment but are present now; the reason has to do with variance. Indeed, by adding new parameters we increased the amount of explained variance (i.e. we decreased the residuals \(\epsilon\)). This is easy to understand, as the parameters are chosen to minimize \(\epsilon\), so if they didn't decrease it they would have been set to 0. With a smaller residual variance, some genes that were "close to significant" before can become significant (see the sketch below).
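
A minimal sketch of the variance argument: comparing the residual standard deviations of the two fits (both still in memory at this point) should show that the richer model leaves smaller residuals for most probes.

# ratios < 1 mean the interaction model explains more variance for that probe
summary(fitTimeTreatment$sigma / fitTreatment$sigma)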

namesHitTreatment <- rownames(topTable(fitTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "Treatmentcontrol"))
namesHitTimeTreatment <- rownames(topTable(fitTimeTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "Treatmentcontrol"))
nOverlap <- length(intersect(namesHitTreatment,namesHitTimeTreatment))
length(setdiff(namesHitTimeTreatment,namesHitTreatment))
## [1] 293
length(setdiff(namesHitTreatment,namesHitTimeTreatment))
## [1] 477

There are 328 overlapping probes, which represents 52.8180354 % of the probes now that we control for the interaction.

pValueTreatment <- topTable(fitTreatmentEb, n = Inf, coef = "Treatmentcontrol")$P.Value
pValueTreatmentTime <- topTable(fitTimeTreatmentEb, n = Inf,  coef = "Treatmentcontrol")$P.Value

plot(density(pValueTreatment), xlim=c(0.0,1), ylim=c(0.0,2.5),
     main="Density of p-values with and without interaction", col="darkblue")
par(new=TRUE)
plot(density(pValueTreatmentTime), axes=FALSE, xlab='', ylab='',
     xlim=c(0.0,1), ylim=c(0.0,2.5), main="", col="red1")
legend("right", legend = c("Treatment-Time","Treatment"), text.col=c("red1","darkblue"))
abline(v = 0.025, col = "gray10")
text(0.025, 2.37, "0.025", col = "gray10", adj = c(-.2, -.1), cex=2)

Not surprisingly, we see that the p-value distributions of the two models are very close. We are also happy to see a spike near zero (left of \(\alpha = 0.025\)), which shows that the p-values are not uniformly distributed, as they would be if \(H_0\) were true for every gene.
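
A minimal sketch of why uniformity is the null expectation: two-sample t-tests on pure noise (12 vs 11 observations, mirroring our group sizes) give a flat p-value histogram, unlike the spike near 0 seen above.

# p-values of t-tests on pure noise are uniform on [0, 1]
nullP <- replicate(10000, t.test(rnorm(12), rnorm(11))$p.value)
hist(nullP, breaks = 20, main = "p-values under H0")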

5.2 Test the null hypothesis

With the interaction model we are adding the term \(effect\_smoke \times time\), which means we are hypothesizing that the difference in gene expression between "smoke" and "control" samples changes significantly with time. One possible example: the expression of "control" samples stays constant, while "smoke" samples start at the same expression (no effect yet), then change expression (effect of tobacco on the cells), and finally return to the "control" level (the effect disappears with time).
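
Concretely, the design matrix built above now contains a fourth column, the interaction term, which lets the slope over time differ between the two treatment groups:

# columns: (Intercept), timeCont, Treatmentcontrol, timeCont:Treatmentcontrol
head(desTimeTreatment)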

numhitsTimeTreatmentInteraction <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffP, n = Inf, adjust.method="none", coef = "timeCont:Treatmentcontrol"))
numBHhitsTimeTreatmentInteraction <- nrow(topTable(fitTimeTreatmentEb, p.value = cutoffPadj, n = Inf, coef = "timeCont:Treatmentcontrol"))

There are 573 probes with p-values smaller than 0.001, and 664 probes with adjusted p-values smaller than 0.05.

5.3 Plot a few probes where the interaction does and does not matter


nameInteracHit <- rownames(topTable(fitTimeTreatmentEb, n = 4, sort.by = "p", coef = "timeCont:Treatmentcontrol"))
nameNoInteracHit <- rev(rownames(topTable(fitTimeTreatmentEb, n = Inf, sort.by = "P", coef = "timeCont:Treatmentcontrol")))[1:4]
dataInterac <- prepareData(nameInteracHit)
dataNoInterac <- prepareData(nameNoInteracHit)

gplotInterac <- ggplot(dataInterac, aes(x=timeCont,y=gExp,color=Treatment)) + 
  geom_point() + 
  facet_wrap(~ gene, ncol = 4) +
  geom_smooth(method='lm', se=F) +
  ggtitle("Interaction between treatment and time")

ggplotly(gplotInterac)
gplotNoInterac <- ggplot(dataNoInterac, aes(x=timeCont,y=gExp,color=Treatment)) + 
  geom_point() + 
  facet_wrap(~ gene, ncol = 4) +
  geom_smooth(method='lm', se=F) +
  ggtitle("Interaction between treatment and time")
  
ggplotly(gplotNoInterac)
#will not be needed: don't keep big data
remove(ttTimeTreatment)
remove(fitTreatmentEb)
remove(fitTimeTreatmentEb)
remove(fitTreatment)
remove(fitTimeTreatment)

Note: I've plotted the figures above with time as a continuous variable, since the model was fitted with a continuous time covariate.

Question 6: Microarray analysis

6.1 Data loading and QC

Loading and summarize data

# removes previous data that won't be necessary
remove(transData)
remove(designMat)


transData2 <- read.table("Data/GSE37599-data.tsv", sep="\t", header=TRUE, row.names = 1)
str(transData2)
## 'data.frame':    10928 obs. of  6 variables:
##  $ b1: num  11.15 2.16 1.49 9.01 6.95 ...
##  $ b2: num  6.8 3.18 1.43 9.46 6.9 ...
##  $ b3: num  6.71 3.13 1.82 9.23 6.96 ...
##  $ c1: num  10.95 2.5 1.46 8.97 6.85 ...
##  $ c2: num  6.7 3.05 2.08 9.28 6.9 ...
##  $ c3: num  11.07 2.44 1.62 9 6.89 ...
samples <- colnames(transData2)

if (!anyNA(transData2)) { print("There are no missing values in Data") }
## [1] "There are no missing values in Data"

The dataset contains the expression of:

  • 10928 genes
  • 6 samples: there are 3 replicates for each of the 2 conditions in which the yeast were grown (b1-b3 are the batch replicates, c1-c3 the chemostat replicates)

Finding the swapped labels

Scatter plot matrix

# plot parameters
lowerFn <- function(data, mapping, method = "lm", ...) {
  p <- ggplot(data = data, mapping = mapping) +
    geom_point(colour = "blue") +
    geom_smooth(method = method, color = "red", ...)
  p
}

ggpairs(transData2, 
        upper = list(continuous = wrap("cor", size = 7)),
        lower = list(continuous = wrap(lowerFn, method = "lm"))) +
  ggtitle("Correlation scatter plot matrix") +  
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        legend.position = "none", 
        panel.grid.major = element_blank(), 
        axis.ticks = element_blank(), 
        panel.border = element_rect(linetype = "dashed", colour = "black", fill = NA))

In order to find the switched labels we can simply focus on one of the 2 conditions (b or c) and try to find a replicate that seems very different from the others. To be more precise, we can look at the correlations between the samples in order to find a single sample that is less correlated with the rest. Note: due to the generally high correlation of transcription data, it is highly improbable to see a sample that is simply uncorrelated with the others; we need to find one that is "less" correlated. Replicates should indeed be extremely highly correlated, as (in a theoretically perfect experiment and world) they should have the same gene transcription.

Looking at the scatter plots for the batch replicates, we clearly see that b2 and b3 form an almost perfect straight line, while b1-b2 and b1-b3 show higher variability. We can thus conclude that b1 is the most different of the three, which is easily seen in the correlation values: 0.962 for b1-b2 and b1-b3, while it is close to "perfect" for b2-b3 (0.998). Among the chemostat replicates, the same can be seen for c2. We thus hypothesize that c2 and b1 have switched labels. Note that if that were true, we should also see high correlations for c2-b2, c2-b3, b1-c1 and b1-c3. This is important to check, as b1 and c2 could also just be less correlated without having been switched. The respective correlations of 0.998, 0.997, 0.998 and 0.999 confirm this, so a label switch is very probable.
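
A minimal sketch that automates this reasoning: for each sample, find its most correlated partner. A swapped sample should pick a partner from the other condition.

# nearest neighbour (by correlation) of each sample;
# b1 should pick a c sample and c2 a b sample
corNN <- cor(transData2)
diag(corNN) <- NA
apply(corNN, 1, function(r) names(which.max(r)))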

Heatmap top 100 genes

First I will fit a linear model like in question 3.2 to find the top 100 hits. I will then plot the expression of each of these genes.

# fits a linear model for finding the top 100 genes
# note: with no design matrix, lmFit fits an intercept-only model, so these
# "hits" are simply the probes ranked by moderated mean expression
fitCondition <- lmFit(transData2)
fitConditionEb <- eBayes(fitCondition)

nHit = 100
hitsCondition <- topTable(fitConditionEb, n = nHit)
hitNames <- rev(rownames(hitsCondition))
gExpression <- as.matrix(transData2[hitNames,])

m <- list(l=150, r=50, b=50, t=60, pad=4)
# needs to force the non-alphabetical ordering
plot_ly(x=samples, y=hitNames, z = ~gExpression, type = "heatmap", colorscale = "Greys") %>% 
  layout(title = 'Heatmap: top100 genes transcription', margin=m)
#will not be needed: don't keep big data
remove(fitConditionEb)
remove(fitCondition)

Note: in the figure above we do not see much, because the differences between genes are larger than the differences between samples. Let us therefore look only at each sample's expression relative to the corresponding gene's average transcription (simply subtract each gene's mean).

gExpressionDifference <- t(apply(gExpression, 1, function(x)(x-mean(x))))

m <- list(l=150, r=50, b=50, t=60, pad=4)
# needs to force the non-alphabetical ordering
xlabHits = list(categoryorder = "array", categoryarray = samples)
ylabHits = list(categoryorder = "array", categoryarray = hitNames)

plot_ly(x=samples, y=hitNames, z = ~gExpressionDifference, type = "heatmap", colorscale = "Greys") %>% 
  layout(title = 'Heatmap: top100 genes transcription compared to average', margin=m,xaxis = xlabHits, yaxis = ylabHits)

On that figure it is still pretty hard to see that 2 samples were switched, although c2 does seem different from c1 and c3. But I wouldn't bet based on that figure :) .

Heatmap of Pearson correlations

# calculating sample-sample correlations, default is pearson but still specifies here
corelationMatrix <- cor(transData2, method = "pearson")  

mypal <- rev(colorRampPalette(brewer.pal(11,"RdBu"))(100)) #setting colour palette

heatmap.2(corelationMatrix, symm = T, #making a heatmap
          trace = "none",  col = mypal,
          cexCol = 2.77, cexRow = 2.77,  
          main = "Sample-Sample Correlations")

From the previous questions we know that the samples that seem to be switched are b1 and c2. Here we see the same information as in the scatter plot matrix, but with color. Note: as before, we clearly see that b1 is closer to c1 and c3 than to the other b's (and vice versa for c2). It is interesting to see that when heatmap.2 clusters the samples, it clusters b1 with the c's and c2 with the b's. The dendrogram also shows us that b1 is even closer to c3 than c1 is to c3.

PCA

#compute PCs with normalizing
pcs <- prcomp(transData2, center = T, scale = T) 
pcs2 <- as.data.frame(pcs$rotation[,c("PC1","PC2")])

g <- ggplot(pcs2, aes(x=PC1, y=PC2)) + 
  geom_point(aes(colour=samples), size=5) +
  ggtitle("PCA") +
  theme_minimal()

ggplotly(g)

Here again we clearly see that b1 and c2 seem to be swapped. Note that PC2 seems able to distinguish between the two sample types (b or c). We should keep that in mind, as it is probably a linear combination of many of the genes that are of interest to us.

6.2 Microarray DEA

Fix swap

# swaps c2 and b1
fixedSamples = c("c2","b2","b3","c1","b1","c3")
transDataFix <- transData2[,fixedSamples]

#will not be needed: don't keep big data
remove(transData2)
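
A quick sanity check of the relabelling (assuming the swap hypothesis is right, the within-condition correlations should now be the highest ones):

# correlation matrix of the fixed data; the batch block (c2, b2, b3) and the
# chemostat block (c1, b1, c3) should each be strongly correlated internally
round(cor(transDataFix), 3)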

Fit linear model

First lets fit the linear model.

# Retrieves gene IDs
library("yeast2.db")                            # loads the yeast2.db annotation package
gene.ids <- unlist(mget(rownames(transDataFix), yeast2ORF))  # maps each probe to its yeast ORF (gene)

# makes a design matrix
conditions <- c("batch","batch","batch","chemostat","chemostat","chemostat")
designMatFix <- data.frame(conditions)
row.names(designMatFix) <- fixedSamples
designCondition <- model.matrix(~ conditions, designMatFix)

# fits linear model  
fitCondition <- lmFit(transDataFix, designCondition)
fitConditionEb <- eBayes(fitCondition)
ttCondition <- topTable(fitConditionEb)
show(ttCondition)
##                logFC  AveExpr         t      P.Value    adj.P.Val        B
## 1772391_at 10.111470 7.043828 108.92958 9.477652e-14 1.035718e-09 20.96284
## 1774122_at  8.473827 6.852455  89.86550 4.277079e-13 2.336996e-09 20.05204
## 1774070_at  8.990866 8.378757  80.69881 9.932237e-13 3.617983e-09 19.47328
## 1776153_at  7.304449 5.427876  72.70264 2.248189e-12 5.717049e-09 18.86696
## 1774072_at  8.577513 6.886731  71.30964 2.615780e-12 5.717049e-09 18.74990
## 1776304_at  8.081401 6.729502  69.09726 3.347614e-12 5.881184e-09 18.55620
## 1770580_at  8.444769 7.037351  67.77269 3.895175e-12 5.881184e-09 18.43543
## 1774019_at  8.926037 8.104420  66.44446 4.547787e-12 5.881184e-09 18.31054
## 1776285_at  4.895122 7.692760  65.91159 4.843582e-12 5.881184e-09 18.25934
## 1776791_at  6.053323 6.743367  61.70349 8.116417e-12 8.521536e-09 17.83138

Now let's simply put that in a data frame.

# Makes the asked data frame
ttCondition <- topTable(fitConditionEb, number=Inf, sort.by="none")
## Removing intercept from test coefficients
limmaDF <- data.frame(probe.id = rownames(transDataFix),
                      gene.id = gene.ids, 
                      p.value = ttCondition$P.Value,
                      q.value = ttCondition$adj.P.Val,
                      log.fc = ttCondition$logFC, 
                      test.stat = ttCondition$t)

#will not be needed: don't keep big data
remove(ttCondition)
remove(fitConditionEb)
remove(fitCondition)

6.3 Microarray DEA continued

Removes unmatched genes

# counts differentially expressed genes before deleting rows
cutoffPadj <- 1e-5
numBHhitsConditionUnmatched <- nrow(limmaDF[limmaDF$q.value < cutoffPadj,])

# Delete rows that have NA in gene ID
limmaDF <- limmaDF[!is.na(limmaDF$gene.id),]

# saves data frame. Note: not really necessary as I'm not removing it, and everything is in the same doc
write.csv(limmaDF, file = "Data/limmaDF.csv",row.names = TRUE)

As already stated before, the total number of probes is 10928. After removing the unmatched genes, only 5705 probes remain.

Identify differentially expressed genes

Illustrating top hit

prepareData <- function(myGenes) {
    miniDat <- t(transDataFix[myGenes, ])
    miniDat <- suppressWarnings(data.frame(gExp = as.vector(miniDat),
                          gene = rep(colnames(miniDat), each = nrow(miniDat))))
    miniDat <- suppressWarnings(data.frame(designMatFix, miniDat))
    miniDat
}

# stores the prob id of the top hit
topHitProbID <- as.character(limmaDF[limmaDF$q.value == min(limmaDF$q.value), "probe.id"])

preparedTopHit <- prepareData(topHitProbID)

stripplot(gExp ~ conditions | gene, preparedTopHit,
          jitter.data = TRUE,
          auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

Here we clearly see that the top gene is more highly expressed in the chemostat condition than in the batch condition.

FDR of \(10^{-5}\)

Targeting an FDR of \(10^{-5}\) can simply be approximated by thresholding the adjusted p-values at \(10^{-5}\).

cutoffPadj <- 1e-5
numBHhitsCondition <- nrow(limmaDF[limmaDF$q.value < cutoffPadj,])

There are 725 genes at an FDR of \(10^{-5}\). It is interesting to note that before removing the unmatched genes there were only 7 more "differentially expressed" probes. This isn't surprising at all, as probes that don't map to any gene shouldn't be differentially expressed. There shouldn't even be those 7 probes!

Question 7: RNA-seq analysis

7.1 Load RNA Count Data and Sanity Check

Load Sample

# removes previous data that won't be necessary
#remove(transDataFix)
#remove(designMatFix)

countData <- read.table("Data/stampy.deep.counts.tsv", sep="\t", header=TRUE, row.names = 1)

# makes a design matrix
samples <- colnames(countData)
conditions <- c("batch","batch","batch","chemostat","chemostat","chemostat")
designMatCount <- data.frame(conditions)
row.names(designMatCount) <- samples
designConditionCount <- model.matrix(~ conditions, designMatCount)

str(countData)
## 'data.frame':    6542 obs. of  6 variables:
##  $ b1: int  11 40 31 63 112 17 0 4 5 0 ...
##  $ b2: int  3 26 52 87 106 21 0 2 8 1 ...
##  $ b3: int  5 10 40 53 60 15 0 2 1 0 ...
##  $ c1: int  13 71 12 51 139 7 0 11 5 1 ...
##  $ c2: int  5 41 18 78 142 6 0 0 5 0 ...
##  $ c3: int  17 48 32 100 127 2 2 3 2 1 ...
if (!anyNA(countData)) { print("There are no missing values in Data") }
## [1] "There are no missing values in Data"

The dataset contains the RNA seq count data of:

  • 6542 genes. Note that before, the rows were probes on a microarray; here they are genes.

  • 6 samples: there are 3 replicates for each of the 2 conditions in which the yeast were grown (b1-b3 batch, c1-c3 chemostat)

Swap check

As before we can check whether samples have been swapped with the heatmap of the correlation matrix.

# calculating sample-sample correlations, default is pearson but still specifies here
corelationMatrix <- cor(countData, method = "pearson")  

mypal <- rev(colorRampPalette(brewer.pal(11,"RdBu"))(100)) #setting colour palette

heatmap.2(corelationMatrix, symm = T, #making a heatmap
          trace = "none",  col = mypal,
          cexCol = 2.77, cexRow = 2.77,  
          main = "Sample-Sample Correlations")

Contrary to before, nothing suggests that there might have been a swap: the b's and the c's each cluster together.

7.2 DEA of deep sequencing data

First needs to prepare data for edgeR.

Filtering

dge_obj <- DGEList(counts=countData, group=factor(conditions))
# computes counts per million (CPM), used below for filtering
dge_obj_cpm <- cpm(dge_obj)

# the edgeR manual (p. 12) says a gene is required to have "a count of 5-10
# in a library to be considered expressed in that library"
# computes the smallest library size (in millions)
min_lib_size_million <- min(dge_obj$samples$lib.size) / 1000000
# the CPM equivalent of ~5 counts in the smallest library (~2 CPM here)
threshold <- 5/min_lib_size_million 
keep <- rowSums(dge_obj_cpm > threshold) 
# keeps genes expressed in at least all 3 samples of one condition
keep <- keep >= 3  
dge_obj_filtered <- dge_obj[keep, , keep.lib.sizes=FALSE] # keep.lib.sizes=FALSE recalculates the library sizes
head(dge_obj_filtered$samples)
##        group lib.size norm.factors
## b1     batch  3419742            1
## b2     batch  4486378            1
## b3     batch  3414470            1
## c1 chemostat  2528577            1
## c2 chemostat  2989865            1
## c3 chemostat  3694270            1
#will not be needed: don't keep big data
remove(countData)
remove(dge_obj_cpm)

Out of curiosity, let's look at how many counts the filtering removed from each sample's library:

filtered_lib <- (dge_obj$samples$lib.size - dge_obj_filtered$samples$lib.size)
head(data.frame(dge_obj$samples$group, filtered_lib))
##   dge_obj.samples.group filtered_lib
## 1                 batch         1109
## 2                 batch         1207
## 3                 batch          908
## 4             chemostat          994
## 5             chemostat         1091
## 6             chemostat         1402

We have removed 552 genes.

Normalizing

Sequencing depth doesn't need to be normalized for explicitly, as edgeR takes library sizes into account automatically. But we do need to adjust for composition effects: when some genes are very highly expressed in one sample, they can make other genes seem under-expressed after rescaling. We adjust for this using TMM (trimmed mean of M-values) normalization.

dge_obj_filtered_norm <- calcNormFactors(dge_obj_filtered)
head(dge_obj_filtered_norm$sample)
##        group lib.size norm.factors
## b1     batch  3419742    1.0010227
## b2     batch  4486378    0.9799708
## b3     batch  3414470    0.9516236
## c1 chemostat  2528577    1.0369536
## c2 chemostat  2989865    1.0077786
## c3 chemostat  3694270    1.0250694
#will not be needed: don't keep big data
remove(dge_obj)
remove(dge_obj_filtered)
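
Note that TMM does not modify the counts themselves; it only rescales each library's effective size, which downstream calculations use. A quick sketch:

# effective library sizes = lib.size * norm.factors
with(dge_obj_filtered_norm$samples, lib.size * norm.factors)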

Dispersion

# Estimate trend-wise (all tag) dispersion and then tag-wise (one tag) dispersion in one go. 
dge_obj_filtered_norm_disp <- estimateDisp(dge_obj_filtered_norm, designConditionCount)
plotBCV(dge_obj_filtered_norm_disp)

#will not be needed: don't keep big data
remove(dge_obj_filtered_norm)

From the plot above we see that the BCV trend is close to the common BCV. This suggests the following edgeR analysis can be trusted: the common BCV that will be used isn't too bad at representing the tagwise BCV. Also, most gene-specific BCVs cluster around the trend, which indicates that the estimated BCV can be used in edgeR. A last good sign: the common BCV is around 0.08, which is below the 0.1 typical of genetically identical model organisms (source). A low BCV means low dispersion, which means less variability and therefore better statistical power.
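
For reference, the common BCV quoted above is just the square root of the common dispersion estimated by estimateDisp:

# common BCV = sqrt(common dispersion); should be around 0.08 here
sqrt(dge_obj_filtered_norm_disp$common.dispersion)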

Find differentially expressed genes

Fit negative binomial GLM

Fit a negative binomial generalized linear model for each tag.

fit <- glmFit(dge_obj_filtered_norm_disp, designConditionCount)
lrt <- glmLRT(fit,coef="conditionschemostat")
# top genes
DEGenes <- topTags(lrt, n=Inf)
head(DEGenes$table)
##             logFC    logCPM       LR PValue FDR
## YIL057C 10.019940  8.214836 2236.095      0   0
## YMR175W  9.362595  7.954929 1722.644      0   0
## YJR095W  9.355021  7.755606 2353.156      0   0
## YMR107W  9.009533  8.737205 2767.988      0   0
## YKL217W  8.512287 11.391069 2126.357      0   0
## YPL201C  7.589903  7.047413 1865.612      0   0
lattice::histogram(DEGenes$table$PValue) 

# removes previous data that won't be necessary
remove(fit)
remove(lrt)
remove(dge_obj_filtered_norm_disp)

This plot is very surprising: the number of genes with a very low p-value is huge, nearly 80%! This tells us that many genes are differentially expressed in chemostat versus batch growth!
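
To put a number on that spike, a one-line sketch:

# fraction of genes with a p-value below 0.001
mean(DEGenes$table$PValue < 1e-3)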

Results

# makes the data frame that we were asked for
edger.deep.results <- data.frame(gene.id = rownames(DEGenes$table),
                      p.value = DEGenes$table$PValue,
                      q.value = DEGenes$table$FDR,
                      log.fc = DEGenes$table$logFC, 
                      test.stat = DEGenes$table$LR)

write.csv(edger.deep.results, file = "Data/edger.deep.results.csv",row.names = TRUE)

cutoffPadj <- 1e-5
numBHhitsConditionCount <- nrow(edger.deep.results[edger.deep.results$q.value < cutoffPadj,])

# removes previous data that won't be necessary
remove(DEGenes)

There are 2758 genes at an FDR of \(10^{-5}\), which is close to half of the genes! That seems crazy again.

7.3 DEA of low sequencing data

Importing data

countData <- read.table("Data/stampy.low.counts.tsv", sep="\t", header=TRUE, row.names = 1)

Filtering

dge_obj <- DGEList(counts=countData, group=factor(conditions))
# computes counts per million (CPM), used below for filtering
dge_obj_cpm <- cpm(dge_obj)

# the edgeR manual (p. 12) says a gene is required to have "a count of 5-10
# in a library to be considered expressed in that library"
# computes the smallest library size (in millions)
min_lib_size_million <- min(dge_obj$samples$lib.size) / 1000000
# the CPM equivalent of ~5 counts in the smallest library (much higher here)
threshold <- 5/min_lib_size_million 
keep <- rowSums(dge_obj_cpm > threshold) 
# keeps genes expressed in at least all 3 samples of one condition
keep <- keep >= 3  
dge_obj_filtered <- dge_obj[keep, , keep.lib.sizes=FALSE] # keep.lib.sizes=FALSE recalculates the library sizes
head(dge_obj_filtered$samples)
##        group lib.size norm.factors
## b1     batch    67794            1
## b2     batch    89342            1
## b3     batch    67061            1
## c1 chemostat    47927            1
## c2 chemostat    57461            1
## c3 chemostat    70771            1
#will not be needed: don't keep big data
#remove(countData)
remove(dge_obj_cpm)

Here we see that the library sizes are a lot smaller than in 7.2 (as they are supposed to be). Out of curiosity, let's again look at how many counts the filtering removed from each sample's library:

filtered_lib <- (dge_obj$samples$lib.size - dge_obj_filtered$samples$lib.size)
head(data.frame(dge_obj$samples$group, filtered_lib))
##   dge_obj.samples.group filtered_lib
## 1                 batch         9196
## 2                 batch        11611
## 3                 batch         8374
## 4             chemostat         7831
## 5             chemostat         9297
## 6             chemostat        11446

We have removed 4298 genes. Note: that is way more than before, and it highlights a caveat of this filtering rule: we will always remove more genes when the library size is smaller (see the sketch below)!
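
A quick sketch of why: the same "5 reads in the smallest library" rule translates into a much harsher CPM cutoff at low depth than it did for the deep data (roughly 2 CPM there).

# CPM threshold implied by 5 counts in the smallest low-depth library (~104 CPM)
5 / (min(dge_obj$samples$lib.size) / 1e6)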

Normalizing

As before, we normalize for composition effects using TMM (trimmed mean of M-values).

dge_obj_filtered_norm <- calcNormFactors(dge_obj_filtered)
head(dge_obj_filtered_norm$sample)
##        group lib.size norm.factors
## b1     batch    67794    1.0265139
## b2     batch    89342    1.0227914
## b3     batch    67061    1.0015433
## c1 chemostat    47927    0.9927162
## c2 chemostat    57461    0.9792398
## c3 chemostat    70771    0.9782823
#will not be needed: don't keep big data
remove(dge_obj)
remove(dge_obj_filtered)

Dispersion

# Estimate trend-wise (all tag) dispersion and then tag-wise (one tag) dispersion in one go. 
dge_obj_filtered_norm_disp <- estimateDisp(dge_obj_filtered_norm, designConditionCount)
plotBCV(dge_obj_filtered_norm_disp, cex=0.7, ylim=c(0, 0.5))

#will not be needed: don't keep big data
remove(dge_obj_filtered_norm)

From the plot above we see that the BCV trend is close to the common BCV when the log-CPM is large, but quite far from it at low values. In order to see the points I've made them a lot bigger; with the default parameters they were invisible. The trend isn't smooth and looks strange because too few genes survived the filtering, so the trend essentially joins the individual points. To be more correct we should probably fit a different spline that doesn't overfit.

Find differentially expressed genes

Fit negative binomial GLM

Fit a negative binomial generalized linear model for each tag.

fit <- glmFit(dge_obj_filtered_norm_disp, designConditionCount)
lrt <- glmLRT(fit,coef="conditionschemostat")
# top genes
DEGenes <- topTags(lrt, n=Inf)
head(DEGenes$table)
##            logFC   logCPM        LR        PValue           FDR
## YOR374W 6.650904 12.27331 1399.5606 2.617691e-306 7.402829e-303
## YKL217W 8.718002 11.53380 1137.5732 2.252581e-249 3.185150e-246
## YBR072W 7.656596 11.38314 1032.7129 1.392397e-226 1.312566e-223
## YBL075C 6.989949 10.71024  730.8448 5.868076e-161 4.148730e-158
## YDR343C 5.354107 10.96674  722.9866 3.000759e-159 1.697229e-156
## YDR342C 4.788023 11.13263  717.8749 3.879329e-158 1.828457e-155
lattice::histogram(DEGenes$table$PValue) 

# removes previous data that won't be necessary
remove(fit)
remove(lrt)
remove(dge_obj_filtered_norm_disp)

As before, we see that a lot of genes still "seem to be" differentially expressed between the conditions, although the proportion of genes with an extremely low p-value is about 20 percentage points lower than before. Note: in reality the number of hits is far lower than before, because here we look at proportions and a lot of genes have been filtered out. We thus conclude that low sequencing depth results in more genes being filtered out and also in a lower proportion of hits.

Results

# makes the data frame that we were asked for
edger.low.results <- data.frame(gene.id = rownames(DEGenes$table),
                      p.value = DEGenes$table$PValue,
                      q.value = DEGenes$table$FDR,
                      log.fc = DEGenes$table$logFC, 
                      test.stat = DEGenes$table$LR)

write.csv(edger.deep.results, file = "Data/edger_low_results.csv",row.names = TRUE)

cutoffPadj <- 1e-5
numBHhitsConditionCount <- nrow(edger.low.results[edger.low.results$q.value < cutoffPadj,])

# removes previous data that won't be necessary
remove(DEGenes)

There are 470 genes at an FDR of \(10^{-5}\), which is nearly 6 times fewer than before! This really shows the loss of statistical power caused by lower sequencing depth.

7.4 Compare DEA of deep and low sequencing data

cutoffPadj <- 1e-5
# finds gene id
hitsLow <- as.character(edger.low.results[edger.low.results$q.value < cutoffPadj,"gene.id"])
hitsDeep <- as.character(edger.deep.results[edger.deep.results$q.value < cutoffPadj,"gene.id"])

nGenesBoth <- sum(hitsLow %in% hitsDeep)
nGenesLow <- length(hitsLow) 
nGenesDeep <- length(hitsDeep) 

grid.newpage()
draw.pairwise.venn(nGenesDeep, nGenesLow, nGenesBoth, category = c("Deep", "Low"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2))

## (polygon[GRID.polygon.2226], polygon[GRID.polygon.2227], polygon[GRID.polygon.2228], polygon[GRID.polygon.2229], text[GRID.text.2230], text[GRID.text.2231], text[GRID.text.2232], text[GRID.text.2233])

We thus see from the Venn diagram above that all the genes found at low sequencing depth were also found at deep depth. We conclude that a deeper depth only gives higher statistical power, without "hiding" results, which seems pretty logical. Across both analyses, 2758 genes were found in total (the number of genes in the "deep" experiment), of which 470 were found in both (the number of genes in the "low" experiment).

Question 8: Compare DEA results from RNA-Seq and arrays

8.1 Plots of interesting and boring genes

Note: for all the plots below I've used the same axis limits so that you can compare between plots; as a result, some individual plots may look poorly scaled.

Defines function

# defines a function to draw side by side count and Expression

plotData <- function(myGenes) {
    myProb <- as.character(limmaDF[limmaDF$gene.id == myGenes,"probe.id"])
  
    # plot DEA
    miniDat <- t(transDataFix[myProb, ])
    miniDat <- suppressWarnings(data.frame(gExp = as.vector(miniDat),
                          gene = rep(colnames(miniDat), each = nrow(miniDat))))
    miniDat <- suppressWarnings(data.frame(designMatFix, miniDat))
    
    plotDEA <- stripplot(gExp ~ conditions | gene, miniDat ,
              jitter.data = TRUE,
              auto.key = TRUE, type = c('p', 'a'), grid = TRUE, ylim= c(6.5,13.3))
    
    # plot RNA seq
    miniDat <- t(countData[myGenes, ])
    miniDat <- suppressWarnings(data.frame(count = as.vector(miniDat),
                          gene = rep(colnames(miniDat), each = nrow(miniDat))))
    miniDat <- suppressWarnings(data.frame(designMatCount, miniDat))
    
  
    plotRNA <- stripplot(count ~ conditions | gene, miniDat ,
          jitter.data = TRUE,
          auto.key = TRUE, type = c('p', 'a'), grid = TRUE, ylim= c(0,120))
    
    # plot side by side 
    
    print(plotDEA, position = c(0, 0, 0.5, 1), more = TRUE)
    print(plotRNA, position = c(0.5, 0, 1, 1))
}

# defines genes that will be needed
hitsRNAseq <- as.character(edger.low.results[edger.low.results$q.value < cutoffPadj,"gene.id"])
hitsDEA <- as.character(limmaDF[limmaDF$q.value < cutoffPadj,"gene.id"])
notHitsRNAseq <- as.character(edger.low.results[edger.low.results$q.value > cutoffPadj,"gene.id"])
notHitsDEA <- as.character(limmaDF[limmaDF$q.value > cutoffPadj,"gene.id"])

isRNAinDEA <- (hitsRNAseq %in% hitsDEA)
isNotDEAinRNA <- (notHitsDEA %in% hitsRNAseq )
isNotRNAinDEA <- (notHitsRNAseq %in% hitsDEA)
isNotRNAinNotDEA <- (notHitsRNAseq %in% notHitsDEA)

Plot 2 genes in both

# finds a gene in both 
geneBoth <- head(hitsRNAseq[isRNAinDEA],20)
# plots 1
plotData(geneBoth[19])

# plots 2
plotData(geneBoth[20])

Plot gene in DEA but not RNAseq

# finds a gene that is a hit only in the microarray DEA
geneNotRNA <- head(notHitsRNAseq[isNotRNAinDEA],1)
# plots 
plotData(geneNotRNA)

Plot gene in RNAseq but not DEA

# finds a gene that is a hit only in RNA-seq
geneNotDEA <- head(notHitsDEA[isNotDEAinRNA],1)
# plots 
plotData(geneNotDEA)

Plot gene in neither

# finds a gene that is a hit in neither
geneNeither <- head(notHitsRNAseq[isNotRNAinNotDEA],1)
# plots 
plotData(geneNeither)