Assignment 1

CE7412: Computational and Systems Biology

author: Siyue Zhang

date: 2023-02-06

Question 1

1. Download the Ecoli bacterial genome sequences by using: library(“BSgenome.Ecoli.NCBI.20080805”) and selects the smallest of the sequences to answer the following:

library("BSgenome.Ecoli.NCBI.20080805")

The smallest sequence, NC_000913, has 4639675 neucleotides.

seqlens <- seqlengths(Ecoli)
target <- seqlens[which.min(seqlens)]
target

## NC_000913 
##   4639675

seq <- Ecoli$NC_000913
seq

## 4639675-letter DNAString object
## seq: AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC...ATCACCAAATAAAAAACGCCTTAGTAAGTATTTTTC

a. Compute neucleotide probabilities $p(X)$ and dinucleotide probabilities $p(XY)$ where $X,Y \in \Omega_{DNA}$ and $Y$ follows $X$.

Neucleotide probabilities are calculated as follows:

p <- letterFrequency(seq, c("A", "C", "G", "T"), as.prob = TRUE)
p

##         A         C         G         T 
## 0.2461871 0.2542320 0.2536650 0.2459159

Dinucleotide probabilities are calculated as follows:

pp <- dinucleotideFrequency(seq, as.prob = TRUE)
pp

##         AA         AC         AG         AT         CA         CC         CG 
## 0.07282193 0.05531897 0.05127020 0.06677603 0.07008014 0.05855433 0.07471861 
##         CT         GA         GC         GG         GT         TA         TC 
## 0.05087879 0.05760038 0.08274956 0.05822327 0.05509180 0.04568446 0.05760922 
##         TG         TT 
## 0.06945294 0.07316936

b. Determine the entropy and the divergence of nucleotides and dinucleotides.

H = 0
for (X in 1:4)
  H = H - p[X]*log(p[X])
H

##        A 
## 1.386169

Neucleotide entropy = 1.386169

D = 0
for (i in 1:4)
  D = D + p[i]*log(4*p[i])
D

##            A 
## 0.0001251216

Neucleotide divergene = 0.0001251216

HH = 0
for (i in 1:16)
    HH = HH - pp[i]*log(pp[i])
HH

##       AA 
## 2.759576

Dinucleotide entropy = 2.759576

D = 0
for (i in 1:16)
  D = D + pp[i]*log(16*pp[i])
D

##        AA 
## 0.0130129

Dinucleotide divergene = 0.0130129

c. Find the conditional probabilities $p(Y|X)$.

pp = matrix(pp,4,4,TRUE)
tpp = as.table(pp)
colnames(tpp) = c("A", "C", "G", "T")
rownames(tpp)= c("A", "C", "G", "T")
names(attributes(tpp)$dimnames) = c("X", "Y")
tpp

##    Y
## X            A          C          G          T
##   A 0.07282193 0.05531897 0.05127020 0.06677603
##   C 0.07008014 0.05855433 0.07471861 0.05087879
##   G 0.05760038 0.08274956 0.05822327 0.05509180
##   T 0.04568446 0.05760922 0.06945294 0.07316936

Conditional Probabilities (nucleotides) are as follows:

cp = matrix(0, 4, 4)
for (i in 1:4)
  for (j in 1:4)
    cp[j,i] = pp[i,j]/p[i]
tcp = as.table(cp)
colnames(tcp) = c("A", "C", "G", "T")
rownames(tcp) = c("A", "C", "G", "T")
names(attributes(tcp)$dimnames) = c("X", "Y")
tcp

##    Y
## X           A         C         G         T
##   A 0.2957992 0.2756542 0.2270727 0.1857727
##   C 0.2247030 0.2303185 0.3262160 0.2342639
##   G 0.2082571 0.2938993 0.2295282 0.2824255
##   T 0.2712410 0.2001274 0.2171833 0.2975381

d. Using following definitions, find mutual information $M(XY)$ and conditional entropy $H(Y|X)$ of the dinucleotides.

M = 0
for (i in 1:4)
  for (j in 1:4)
    M = M + tpp[i,j]*log(tpp[i,j]/(p[i]*p[j]))
M

##          A 
## 0.01276266

Mutual information $M(XY)$ = 0.01276266

H = 0
for (i in 1:4)
  for (j in 1:4)
    H = H - tpp[i,j]*log(cp[j,i])
H

## [1] 1.373407

Conditional entropy $H(Y|X)$ = 1.373407

e. Using the above definitions in (d), prove that

equation 1: $$ \[\begin{align*} M(XY) & = \sum_{X,Y\in\Omega}{p(XY)\log(\frac{p(XY)}{p(X)p(Y)})} \\ & = \sum_{X,Y\in\Omega}{p(XY)\log(p(XY))} - \sum_{X,Y\in\Omega}{p(XY)\log(p(X))} - \sum_{X,Y\in\Omega}{p(XY)\log(p(Y))} \\ & = \sum_{X,Y\in\Omega}{p(XY)\log(p(XY))} - 2\sum_{X\in\Omega}\sum_{Y\in\Omega}{p(XY)\log(p(X))}\\ & = \sum_{X,Y\in\Omega}{p(XY)\log(p(XY))} - 2\sum_{X\in\Omega}{p(X)\log(p(X))}\\ & = -H(XY) + 2H(X)\\ \end{align*}\]

\[ equation 2: \] \[\begin{align*} H(Y|X) & = -\sum_{X,Y\in \Omega}{p(XY)\log(p(Y|X))}\\ & = -\sum_{X,Y\in \Omega}{p(XY)\log(\frac{p(XY)}{p(X)})}\\ & = -\sum_{X,Y\in \Omega}{p(XY)\log(p(XY))} + \sum_{X,Y\in \Omega}{p(XY)\log(p(X))}\\ & = -\sum_{X,Y\in \Omega}{p(XY)\log(p(XY))} + \sum_{X\in\Omega}\sum_{Y\in\Omega}{p(XY)\log(p(X))}\\ & = -\sum_{X,Y\in\Omega}{p(XY)\log(p(XY))} + \sum_{X\in\Omega}{p(X)\log(p(X))}\\ & = H(XY) - H(X) \end{align*}\] $$

f. Find $M(XY)$ and $H(Y|X)$ for the sequence by using the equations in part (e). Comment on differences if any with those values obtained in part (d).

$2H(X)-H(XY)$ = 0.01276266

H = 0
for (X in 1:4)
  H = H - p[X]*log(p[X])

HH = 0
for (i in 1:4)
  for (j in 1:4)
    HH = HH - pp[i,j]*log(pp[i,j])

2*H-HH

##          A 
## 0.01276266

$H(XY)-H(X)$=1.373407

HH - H

##        A 
## 1.373407

There is no difference found as $2H(X)-H(XY) = M(XY)$ and $H(XY)-H(X) = H(Y|X)$.

Question 2

Download the Worm genome sequences by using: library(BSgenome.Celegans.UCSC.ce2) and select the chrM sequence to answer the following:

library("BSgenome.Celegans.UCSC.ce2")

Model the sequence with (i) an independent model of sequences, (ii) a first-order Markov chain, and (iii) a second order Markov chain.

seq = Celegans$chrM
seq

## 13794-letter DNAString object
## seq: CAGTAAATAGTTTAATAAAAATATAGCATTTGGGTT...TATTTATAGATATATACTTTGTATATATCTATATTA

pC = letterFrequency(seq, c("C"), as.prob = TRUE)
pC

##          C 
## 0.08880673

stateseq = function (s) {
  vec = strsplit(s, "")[[1]]  # split the string into characters
  for (i in 1:length(vec)) {
    if (vec[i] == "A") vec[i] = 1; 
    if (vec[i] == "T") vec[i] = 2;
    if (vec[i] == "G") vec[i] = 3;
    if (vec[i] == "C") vec[i] = 4;
  }
  st = as.numeric(vec)
  return(st)  
}

dna = as.character(seq)
st = stateseq(dna)

a. Compute model parameters for each model

Independent model:

independent_model = function(st, ns) {
  nx = vector(ns, mode='integer')
  n = length(st)
 
  for (i in 1:n) 
    nx[st[i]] = nx[st[i]] + 1

  px = nx/n

  loglik = log(1)
  for (i in 1:ns)
    loglik = loglik + nx[i]*log(px[i])

  return(list(px, loglik))
}
ind_model = independent_model(st, 4)

ind_model

## [[1]]
## [1] 0.31426707 0.44794838 0.14897782 0.08880673
## 
## [[2]]
## [1] -16858.75

A: 0.31426707, T: 0.44794838, G: 0.14897782, C: 0.08880673

Log Likelihood = -16858.75

First-order Markov model:

markov_model = function(st, ns, p1) {
  N = matrix(0, ns, ns)  # counts of transitions
  for (i in 1:length(st)-1) {
    N[st[i], st[i+1]] = N[st[i],st[i+1]] + 1
  }

  sN = rowSums(N)
  A = matrix(0, ns, ns) # state transition matrix
  for (i in 1:ns) 
    for (j in 1:ns)
      A[i,j] = N[i,j]/sN[i]
  
  loglik = log(p1)  # prob of starting nucleotide
    for (i in 1:ns)
      for (j in 1:ns)
        loglik = loglik + N[i,j]*log(A[i,j])
  
  colnames(A) = c("A", "T", "G", "C")
  rownames(A) = c("A", "T", "G", "C")

  return( list(A, loglik))
}

mark_model = markov_model(st, 4, pC)

mark_model

## [[1]]
##           A         T          G          C
## A 0.3101061 0.4074758 0.18689432 0.09552377
## T 0.3147759 0.4999191 0.11522900 0.07007606
## G 0.2968370 0.3956204 0.20583942 0.10170316
## C 0.3559184 0.4171429 0.08979592 0.13714286
## 
## [[2]]
##         C 
## -16708.82

Log Likelihood = -16708.82

The transition probabilities are as follows:

A = as.table(mark_model[[1]])
colnames(A) = c("A", "T", "G", "C")
rownames(A) = c("A", "T", "G", "C")
names(attributes(A)$dimnames) = c("X", "Y")
A

##    Y
## X            A          T          G          C
##   A 0.31010614 0.40747577 0.18689432 0.09552377
##   T 0.31477585 0.49991908 0.11522900 0.07007606
##   G 0.29683698 0.39562044 0.20583942 0.10170316
##   C 0.35591837 0.41714286 0.08979592 0.13714286

Second-order Markov model:

Assume the start probability is the density of CA.

dinucleotideFrequency(seq)

##   AA   AC   AG   AT   CA   CC   CG   CT   GA   GC   GG   GT   TA   TC   TG   TT 
## 1344  414  810 1766  436  168  110  511  610  209  423  813 1945  433  712 3089

pCA = dinucleotideFrequency(seq)["CA"]/sum(dinucleotideFrequency(seq))

pCA

##         CA 
## 0.03161024

Transition probabilities for the second-order markov model:

second_markov_model = function(st, ns, p1) {
  N = array(0, dim=c(ns, ns, ns))  
  dimnames(N) = list(c("A", "T", "G", "C"),c("A", "T", "G", "C"),c("A", "T", "G", "C"))
  for (i in 1:length(st)-2) {
    N[st[i], st[i+1], st[i+2]] = N[st[i],st[i+1], st[i+2]] + 1
  }
  
  sN = apply(N, c(1,2), sum)
  
  A = array(0, dim=c(ns, ns, ns)) 
  dimnames(A) = list(c("A", "T", "G", "C"),c("A", "T", "G", "C"),c("A", "T", "G", "C"))
  for (i in 1:ns) 
    for (j in 1:ns)
      for (k in 1:ns)
        A[i,j, k] = N[i,j,k]/sN[i,j]
  
  loglik = log(p1)
    for (i in 1:ns)
      for (j in 1:ns)
        for (k in 1:ns)
          loglik = loglik + N[i,j,k]*log(A[i,j,k])

  return( list(A, loglik))
}


second_mark_model = second_markov_model(st, 4, pCA)

second_mark_model

## [[1]]
## , , A
## 
##           A         T          G         C
## A 0.3452381 0.2961495 0.35308642 0.3743961
## T 0.2906379 0.3217870 0.28932584 0.3302540
## G 0.3213115 0.3247232 0.25531915 0.3397129
## C 0.2729358 0.3209393 0.09090909 0.3928571
## 
## , , T
## 
##           A         T         G         C
## A 0.4032738 0.5220838 0.3604938 0.3816425
## T 0.4331276 0.4956297 0.3862360 0.4665127
## G 0.3819672 0.4920049 0.4444444 0.4545455
## C 0.3417431 0.4618395 0.5272727 0.3333333
## 
## , , G
## 
##           A         T         G          C
## A 0.1703869 0.1041903 0.1950617 0.08695652
## T 0.1784979 0.1142765 0.2148876 0.09468822
## G 0.2098361 0.1340713 0.1985816 0.08133971
## C 0.2431193 0.1291585 0.2545455 0.09523810
## 
## , , C
## 
##            A          T          G         C
## A 0.08110119 0.07757644 0.09135802 0.1570048
## T 0.09773663 0.06830690 0.10955056 0.1085450
## G 0.08688525 0.04920049 0.10165485 0.1244019
## C 0.14220183 0.08806262 0.12727273 0.1785714
## 
## 
## [[2]]
##        CA 
## -16646.31

b. Determine the most appropriate model for the chromosome

The second order Markov model has the highest probability as calculated above (i.e., log likelihood -16646.31), it’s regarded as the most appropriate model.

c. Determine the statistical significance of choosing

i. the independence model over a random sequence by chance

$H_0$ belongs to random model, $H_1$ belongs to independence model. I use the Chi-sqaure test to compute the significance of choosing between two models.

Compute log generalized likelihood ratio:

log_random = length(st)*log(1/4)

log_order = -2*(log_random - ind_model[[2]])

m_0 = 0
m_1 = 3
m = m_1-m_0

pchisq(log_order, m, lower.tail = FALSE)

## [1] 0

The p-value is very small close to zero, we can reject the null hypothesis strongly.

ii. the second-order Markov model over the first-order Markov model

$H_0$ belongs to first-order Markov model, $H_1$ belongs to second-order Markov model.

log_order = -2*(mark_model[[2]] - second_mark_model[[2]])

m_0 = 15
m_1 = 63
m = m_1-m_0

pchisq(log_order, m, lower.tail = FALSE)

##            C 
## 8.715738e-09

The p-value is very small close to zero, we can reject the null hypothesis strongly.

Question 3

Given an aligned pair of sequences:

x = "ATAGACGTACAGGTCGGAATCTTAAGTGAAATCGCGCGTCCAAACCCAGCTCTATTTTAGTGGT"
y = "ATAGCGGTACAGGTCCGATACTTTAGTTAATTAGCGCGTCCATACCCAGATCTATTGTAGTGGT"

Use the best suitable model from part (a) to answer part (b) and (d).

a. By modeling nucleotide locations independently, determine whether a random model or a match model is better suited to represent the sequence pair. State the statistical significance of your answer.

Random: Independent model

independent_model = function(st, ns) {
  nx = vector(ns, mode='integer')
  n = length(st)
  for (i in 1:n) 
    nx[st[i]] = nx[st[i]] + 1
  
  px = nx/n

  lik = 1
  for (i in 1:ns)
    lik = lik*px[i]^nx[i]
  
  return(list(px, lik))
}

stateseq2 = function (s) {
  vec = strsplit(s, "")[[1]]
  for (i in 1:length(vec)) {
    if (vec[i] == "A") vec[i] = 1; 
    if (vec[i] == "T") vec[i] = 2;
    if (vec[i] == "G") vec[i] = 3;
    if (vec[i] == "C") vec[i] = 4;
  }
  st = as.numeric(vec)
  return(st)  
}

dx = independent_model(stateseq2(x), 4)
dy = independent_model(stateseq2(y), 4)

likelihood = dx[[2]]*dy[[2]]
likelihood_rand = likelihood
likelihood_rand

## [1] 2.617059e-77

Likelihood = 2.617059e-77

pair_stateseq = function(x,y) {
  vx = strsplit(x, "")[[1]]
  vy = strsplit(y, "")[[1]]

  v <- vector(mode="integer", length = length(vx))
  for (i in 1:length(vx)) {
    if (vx[i] == "A") {
      if (vy[i] == "A") v[i] = 1;
      if (vy[i] == "T") v[i] = 2;
      if (vy[i] == "G") v[i] = 3;
      if (vy[i] == "C") v[i] = 4;
    }
    if (vx[i] == "T") {
      if (vy[i] == "A") v[i] = 5;
      if (vy[i] == "T") v[i] = 6;
      if (vy[i] == "G") v[i] = 7;
      if (vy[i] == "C") v[i] = 8;
    }
    if (vx[i] == "G") {
      if (vy[i] == "A") v[i] = 9;
      if (vy[i] == "T") v[i] = 10;
      if (vy[i] == "G") v[i] = 11;
      if (vy[i] == "C") v[i] = 12;
    }
    if (vx[i] == "C") {
      if (vy[i] == "A") v[i] = 13;
      if (vy[i] == "T") v[i] = 14;
      if (vy[i] == "G") v[i] = 15;
      if (vy[i] == "C") v[i] = 16;
    }
  }
 v
}

st2 = pair_stateseq(x, y)
st2[1:25]

##  [1]  1  6  1 11  4 15 11  6  1 16  1 11 11  6 16 12 11  1  2  5 16  6  6  2  1

Match: Independent model

dxy = independent_model(st2, 16)
likelihood_match = dxy[[2]]
dxy

## [[1]]
##  [1] 0.203125 0.062500 0.000000 0.015625 0.015625 0.234375 0.015625 0.000000
##  [9] 0.000000 0.015625 0.203125 0.015625 0.031250 0.000000 0.015625 0.171875
## 
## [[2]]
## [1] 2.978497e-55

Likelihood = 2.978497e-55

The likelihood is maximum for matched model. Hence, most appropriate model is matched independent model.

Calculate the statistical significance:

$H_0$ belongs to random model, $H_1$ belongs to match model.

order = likelihood_rand/likelihood_match
order = -2*log(order)

p_value = pchisq(order, 15-6, lower.tail=FALSE)
p_value

## [1] 7.56213e-18

The p-value is very small close to zero, we can reject the null hypothesis strongly.

b. Find the probability that two sequences match at i. a particular site

\[p = \sum_{X_1=X_2 \in \Omega}{P(X_1,X_2)} = P(AA=1) + P(TT=6) + P(GG=11) + P(CC=16)\]

agg = dxy[[1]]
p = agg[1] + agg[6] + agg[11] + agg[16]
p

## [1] 0.8125

The probability is 0.8125 to find two sequences match at a particular site.

b. Find the probability that two sequences match at ii. at least 3 sites of segments of size 5

comb = function(n, x) {
  factorial(n) / factorial(n-x) / factorial(x)
}

pp = comb(5,3)*p^3*(1-p)^2 + comb(5,4)*p^4*(1-p) + comb(5,5)*p^5
pp

## [1] 0.951231

The probability is 0.951231.

c. Find the length longest matching segments of the two sequences.

max_match_length = function(xx, yy) {
  l = 0
  maxl = 0
  for (i in 1:length(xx)) {
    if (xx[i] == yy[i]) l = l+1 else l = 0
    if (maxl < l) maxl = l
  }
  return(maxl)
}

y_max = max_match_length(stateseq2(x),stateseq2(y))
y_max

## [1] 9

Longest matching size is 9.

d. Determine whether a well-matched segment found by allowing two mismaches in the sequence pair is statistically significant or not.

\[ P(Y_{max} \leq y_{max} -1) = \sum_{j=k}^{y_{max}-1}(^j_k)p^{j-k}(1-p)^{k+1} \]

\[ k = 2, y_{max} = 9 \]

k = 2

pb = 0
for (j in k:(y_max-1)){
  tmp = comb(j,k) * (p^(j-k)) * ((1-p)^(k+1))
  pb = pb + tmp
}

1-pb

## [1] 0.7706658

It’s not statistically significant to reject the hypothesis of two mismaches in this well-matched segment.

Question 4

Sample sequences containing CpG islands and nonCpG islands from human chromosome 8 have been collected and are given in fasta format in ‘CpG.txt’ and ‘NonCpG.txt’ files, respectively.

seqCpG =  readDNAStringSet("CpG.txt")
seqNonCpG =  readDNAStringSet("NonCpG.txt")
seqCpG

## DNAStringSet object of length 1000:
##        width seq                                            names               
##    [1]   100 GCACTCTGTCGTGTGCCCGGTG...CACTTTCAGGGCCGGTAGCGA 
##    [2]   100 GGCTAGGCAGCTGCAGCTGCGC...CCGGCTCCCGCCAGCTCCGTG 
##    [3]   100 TGCTCTGCCTTCCCCTGCCCGG...CTCCCACCTCTCCCTGCCTGA 
##    [4]   100 GACAGAGAGCCTTTGGCTGCTT...TCAGGACAGAAAGCAGGCGTT 
##    [5]   100 TTAGTGGGTGCCTCCTAGGAGC...GTGGTAAGGGGGCTGGAAGAC 
##    ...   ... ...
##  [996]   100 TCTAGTTCTCTCACATGGCACC...GTTCTCTCACATGGTGCCGTG 
##  [997]   100 GGCAGGCGAGCGCGCTGCCCAC...ACCGCAGTCATCGCGCGGCAC 
##  [998]   100 GTGAGGATGCAGGTCACGCTGC...CCTGGGCCTCCCATGGCCCCA 
##  [999]   100 CGGCTGGATGAGATCACGTCTG...GCAGCTCAGCCGGCTCCTGGG 
## [1000]   100 CAAGGCGAACACCAAGGACGCG...CTCCTAGGCGCTTCTTGGAAG

a. Model the CpG islands and nonCPG islands with first-order Markov models. Find the parameters of the two models.

dinucCpG    = sapply(seqCpG, dinucleotideFrequency)
dinucCpG[, 1]

## AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT 
##  0  4  5  0  6 15  8 10  3 14  7  7  1  6 10  3

dinucNonCpG = sapply(seqNonCpG, dinucleotideFrequency)
dinucNonCpG[, 1]

## AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT 
##  9  2 12  5  4  3  2  5 10  5  9 10  5  4 11  3

Transition probabilities of CpG islands:

IslCounts  = rowSums(dinucCpG)
TI  = matrix( IslCounts, ncol = 4, byrow = TRUE)
dimnames(TI) = list(c("A", "C", "G", "T"), c("A", "C", "G", "T"))

MI = TI /rowSums(TI)
MI

##            A         C         G         T
## A 0.19310585 0.2682672 0.4020810 0.1365460
## C 0.19604171 0.3521883 0.2318162 0.2199538
## G 0.18393386 0.3163625 0.3514584 0.1482452
## T 0.09492885 0.3379264 0.3655975 0.2015473

Transition probabilities of nonCpG island:

NonICounts = rowSums(dinucNonCpG)
TnI = matrix(NonICounts, ncol = 4, byrow = TRUE)
dimnames(TI)  = list(c("A", "C", "G", "T"), c("A", "C", "G", "T"))

MN = TnI / rowSums(TnI)
MN

##           [,1]      [,2]       [,3]      [,4]
## [1,] 0.3380813 0.1674228 0.22866769 0.2658283
## [2,] 0.3656171 0.2461656 0.04215338 0.3460640
## [3,] 0.3010176 0.2052853 0.24436794 0.2493292
## [4,] 0.2275263 0.1944961 0.24443017 0.3335474

b. Using the likelihood ratios of the two models, describe a method to decide if a given sequence is a CpG island or not. Determine the maximum and minimum values of likelihood ratios for the given sequences of CpG islands and nonCpG islands.

Using the alpha value we found, we can compute a loglikelihood score for a given sequence. If the loglikelihood score is larger than the threshold, it is regarded as CpG islands. Otherwise, it is nonCpG islands.

beta  = log(MI / MN)
beta

##            A         C         G          T
## A -0.5600478 0.4714611 0.5643838 -0.6661893
## C -0.6232591 0.3581616 1.7046300 -0.4532062
## G -0.4925924 0.4324881 0.3634164 -0.5199065
## T -0.8741382 0.5524159 0.4026033 -0.5037609

Compute the ratio of nucleotide probabilities of the two models:

freqIsl = alphabetFrequency(seqCpG, baseOnly = TRUE, collapse = TRUE)[1:4]
freqIsl / sum(freqIsl)

##       A       C       G       T 
## 0.17352 0.32342 0.32417 0.17889

freqNon = alphabetFrequency(seqNonCpG, baseOnly = TRUE, collapse = TRUE)[1:4]
freqNon / sum(freqNon)

##       A       C       G       T 
## 0.30309 0.19881 0.19939 0.29871

alpha = log((freqIsl/sum(freqIsl)) / (freqNon/sum(freqNon)))
alpha

##          A          C          G          T 
## -0.5577369  0.4866022  0.4860054 -0.5127021

Suppose we have a sequence “ACGTTATACTACG”, the score is -0.4995709.

x = "ACGTTATACTACG"
scorefun = function(x) {
  s = unlist(strsplit(x, ""))
  score = alpha[s[1]]
  if (length(s) >= 2)
    for (j in 2:length(s))
      score = score + beta[s[j-1], s[j]]
  score
}
scorefun(x)

##          A 
## -0.4995709

scores = c()
for (i in 1:length(seqCpG)){
  s = scorefun(as.character(seqCpG[i]))
  scores = c(scores, s)}
scores_CpG = scores/100
max(scores)

## [1] 59.72279

min(scores)

## [1] -21.09602

CpG islands (non-length-normalized) loglikelihood scores: max 59.72279, min -21.09602

scores = c()
for (i in 1:length(seqCpG)){
  s = scorefun(as.character(seqNonCpG[i]))
  scores = c(scores, s)}
scores_NonCpG = scores/100
max(scores)

## [1] 31.36811

min(scores)

## [1] -49.47441

NonCpG islands (non-length-normalized) loglikelihood scores: max 31.36811, min -49.47441

c. Plot the histograms of the length-normalized log-likelihood ratios of the sequences in both files.

br = seq(-0.7, 0.7, length.out = 100)
h1 = hist(scores_CpG,    breaks = br, plot = FALSE)
h2 = hist(scores_NonCpG, breaks = br, plot = FALSE)
plot(h1, col = rgb(0, 0, 1, 1/4), xlim = c(-0.5, 0.5), ylim=c(0,120))
plot(h2, col = rgb(1, 0, 0, 1/4), add = TRUE)

d. By fitting two Gaussian functions to the histograms of CpG islands and nonCpG sequences, determine a data-driven threshold for detecting a CpG island. Evaluate the sensitivity and specificity of your method on given CpG islands or non CpG islands.

I fit two gaussian distributions to CpG and non CpG islands as below:

library("MASS")
cpg = fitdistr(scores_CpG, "normal")
cpg

##       mean           sd     
##   0.181290874   0.141074391 
##  (0.004461164) (0.003154519)

noncpg = fitdistr(scores_NonCpG, "normal")
noncpg

##        mean            sd     
##   -0.160422634    0.119433955 
##  ( 0.003776833) ( 0.002670624)

The threshold for detecting a CpG island ($CutX), which is the intersection between two gaussian distributions:

library(AdaptGauss)
x = Intersect2Mixtures(cpg$estimate[1],cpg$estimate[2],1,noncpg$estimate[1],noncpg$estimate[2],1)
x

## $CutX
## [1] 0.004424052
## 
## $CutY
## [1] 1.288696

Sensitivity = Recall = TP / (TP + FN) = 0.89

thre = x$CutX
TP = sum(scores_CpG>thre)
TN = sum(scores_NonCpG<thre)
FP = sum(scores_NonCpG>thre)
FN = sum(scores_CpG<thre)

Sensitivity = TP / (TP + FN)
Sensitivity

## [1] 0.89

Specificity = TN / (TN + FP) = 0.901

Specificity = TN / (TN + FP) 
Specificity

## [1] 0.901

Question 5

Build a Profile HMM to represent the following multiple amino acid sequence alignment (each row represents a sequence). Consider columns with gaps >4 belonging to insert states. The aligned sequences are also given in ‘sequence_alignment.txt’ file.

a. Find the state transition probabilities and the emission probabilities of the model.

seqs = scan("sequence_alignment.txt",what=character())
seqs

## [1] "AV-A---HAGGEY" "VVE----DVDD--" "VEA----DVDAGH" "DFKG-----AD-Y"
## [5] "EVY-----YGE-S" "EFNAAT--TFPK-" "-TAG--DND-A-Y" "DF-GAADTVD--H"

stateseq3 <- function(seq) {
  vec = strsplit(seq, "")[[1]]
  for (i in 1:length(vec)) {
    if (vec[i] == "A") vec[i]<-1; 
    if (vec[i] == "C") vec[i]<-2;
    if (vec[i] == "D") vec[i]<-3;
    if (vec[i] == "E") vec[i]<-4
    if (vec[i] == "F") vec[i]<-5;
    if (vec[i] == "G") vec[i]<-6; 
    if (vec[i] == "H") vec[i]<-7;
    if (vec[i] == "I") vec[i]<-8;
    if (vec[i] == "K") vec[i]<-9;
    if (vec[i] == "L") vec[i]<-10; 
    if (vec[i] == "M") vec[i]<-11;
    if (vec[i] == "N") vec[i]<-12;
    if (vec[i] == "P") vec[i]<-13;
    if (vec[i] == "Q") vec[i]<-14;
    if (vec[i] == "R") vec[i]<-15; 
    if (vec[i] == "S") vec[i]<-16;
    if (vec[i] == "T") vec[i]<-17;
    if (vec[i] == "V") vec[i]<-18;
    if (vec[i] == "W") vec[i]<-19; 
    if (vec[i] == "Y") vec[i]<-20;
    if (vec[i] == "-") vec[i]<-21;
  }
  st <- as.numeric(vec)
  st
}

t(sapply(seqs, stateseq3))

##               [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## AV-A---HAGGEY    1   18   21    1   21   21   21    7    1     6     6     4
## VVE----DVDD--   18   18    4   21   21   21   21    3   18     3     3    21
## VEA----DVDAGH   18    4    1   21   21   21   21    3   18     3     1     6
## DFKG-----AD-Y    3    5    9    6   21   21   21   21   21     1     3    21
## EVY-----YGE-S    4   18   20   21   21   21   21   21   20     6     4    21
## EFNAAT--TFPK-    4    5   12    1    1   17   21   21   17     5    13     9
## -TAG--DND-A-Y   21   17    1    6   21   21    3   12    3    21     1    21
## DF-GAADTVD--H    3    5   21    6    1    1    3   17   18     3    21    21
##               [,13]
## AV-A---HAGGEY    20
## VVE----DVDD--    21
## VEA----DVDAGH     7
## DFKG-----AD-Y    20
## EVY-----YGE-S    16
## EFNAAT--TFPK-    21
## -TAG--DND-A-Y    20
## DF-GAADTVD--H     7

nseqs = length(seqs)
nchars = nchar(seqs[1])
matseq = matrix(0, nseqs, nchars)
for (i in 1:nseqs){
  stseq = stateseq3(seqs[i])
  for (j in 1:nchars)
    matseq[i,j] = stseq[j]
}
matseq

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,]    1   18   21    1   21   21   21    7    1     6     6     4    20
## [2,]   18   18    4   21   21   21   21    3   18     3     3    21    21
## [3,]   18    4    1   21   21   21   21    3   18     3     1     6     7
## [4,]    3    5    9    6   21   21   21   21   21     1     3    21    20
## [5,]    4   18   20   21   21   21   21   21   20     6     4    21    16
## [6,]    4    5   12    1    1   17   21   21   17     5    13     9    21
## [7,]   21   17    1    6   21   21    3   12    3    21     1    21    20
## [8,]    3    5   21    6    1    1    3   17   18     3    21    21     7

fmatseq = factor(matseq)
levels(fmatseq)

##  [1] "1"  "3"  "4"  "5"  "6"  "7"  "9"  "12" "13" "16" "17" "18" "20" "21"

namino = length(levels(fmatseq))

ngaps <- vector("integer", nchars)
for (j in 1:nchars){
  for (i in 1:nseqs)
    if (matseq[i,j]==21)
      ngaps[j] = ngaps[j] + 1
}

gt = floor(nseqs/2)

number of matched states:

mstate = matrix(0, nseqs, nchars)
nm = 0
for (j in 1:nchars){
  if (ngaps[j] > gt) {
    for (i in 1:nseqs) 
      if (matseq[i, j] < 21)
        mstate [i, j] = 2
      else
        mstate[i, j] = 0 # no state is defined: no character within an insert state
  }
  else {
    for (i in 1:nseqs) {
      if (matseq[i,j] == 21)
        mstate[i, j] = 3
      else
        mstate[i,j] = 1
    }
    nm = nm + 1
  }
}
nm # number of matched states

## [1] 9

Emission probabilities are as below:

emission_probabilities = function(mstate, matseq, nm) {
  
  ee = array(0, c(nm, 2, 20)) 
  
  for (i in 1:nseqs) {
    st = 1
    for (j in 1:nchars) {
      
      if (mstate[i, j] == 1)
        ee[st, mstate[i, j], matseq[i, j]] = ee[st, mstate[i, j],  matseq[i, j]] + 1
      if (mstate[i, j] == 2)
        ee[st-1, mstate[i, j], matseq[i, j]] = ee[st-1, mstate[i, j],  matseq[i, j]] + 1
      
      if (ngaps[j] <= gt) 
        st = st + 1
    }
  }
  
  pe = array(0, c(nm, 2, 20))
  for (i in 1:nm) 
    for (j in 1:2)
      if (sum(ee[i, j,]) != 0) pe[i, j,] = ee[i, j,]/sum(ee[i, j,])
  
  pe
}

pe = emission_probabilities(mstate, matseq, nm)
pem = t(pe[, 1, ]) # at match states
pei = t(pe[, 2, ]) # at insert states

rownames(pem) = rownames(pei) = c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y")
print(round(pem, 2))

##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## A 0.14 0.00 0.33  0.4  0.0 0.14 0.14 0.29 0.00
## C 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.00
## D 0.29 0.00 0.00  0.0  0.4 0.14 0.43 0.29 0.00
## E 0.29 0.12 0.17  0.0  0.0 0.00 0.00 0.14 0.00
## F 0.00 0.38 0.00  0.0  0.0 0.00 0.14 0.00 0.00
## G 0.00 0.00 0.00  0.6  0.0 0.00 0.29 0.14 0.00
## H 0.00 0.00 0.00  0.0  0.2 0.00 0.00 0.00 0.33
## I 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.00
## K 0.00 0.00 0.17  0.0  0.0 0.00 0.00 0.00 0.00
## L 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.00
## M 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.00
## N 0.00 0.00 0.17  0.0  0.2 0.00 0.00 0.00 0.00
## P 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.14 0.00
## Q 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.00
## R 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.00
## S 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.17
## T 0.00 0.12 0.00  0.0  0.2 0.14 0.00 0.00 0.00
## V 0.29 0.38 0.00  0.0  0.0 0.43 0.00 0.00 0.00
## W 0.00 0.00 0.00  0.0  0.0 0.00 0.00 0.00 0.00
## Y 0.00 0.00 0.17  0.0  0.0 0.14 0.00 0.00 0.50

Transition probabilities are as below:

transition_probabilities2 = function(mstate, matseq, nm) {
  
  aa = array(0, c((nm-1), 3, 3)) # state transition matrix: 'm', 'i' and 'd' states
  
  for (i in 1:nseqs) {
    st = pst = ist = 1. # position in sequence; match state; insert state
    for (j in 1:(nchars-1)) {
      
      if (ngaps[j] <= gt) {
        if (mstate[i, j+1] == 1 || mstate[i, j+1] == 3) {
          aa[st, mstate[i, j], mstate[i, j+1]] = aa[st, mstate[i, j], mstate[i, j+1]] + 1
          pst = j + 1
        }
        else if(mstate[i, j+1]!=0){
          aa[st, mstate[i, j], mstate[i, j+1]] = aa[st, mstate[i, j], mstate[i, j+1]] + 1
          ist = j + 1
        }
        st = st + 1
      }
      else { 
        if (mstate[i,j+1] == 1 || mstate[i, j+1] == 3) { # at an end of insert state
          if (ist > pst)
            aa[st-1, mstate[i, ist], mstate[i, j+1]] = aa[st-1, mstate[i, ist], mstate[i, j+1]] + 1
          else
            aa[st-1, mstate[i, pst], mstate[i, j+1]] = aa[st-1, mstate[i, pst], mstate[i, j+1]] + 1
          pst = j+1
        }
        if (mstate[i,j+1] == 2) {
          if (ist > pst)
            aa[st-1, mstate[i, ist], mstate[i, j+1]] = aa[st-1, mstate[i, ist], mstate[i, j+1]] + 1
          else
            aa[st-1, mstate[i, pst], mstate[i, j+1]] = aa[st-1, mstate[i, pst], mstate[i, j+1]] + 1
          ist = j + 1
        }
      }
      
    }
  }
  
  pa = array(0, c((nm-1), 3, 3))
  for (i in 1:(nm-1)) 
    for (j in 1:3)
      if (sum(aa[i, j,])!= 0) pa[i, j,] = aa[i,j,]/sum(aa[i, j,])
  
  pa
}

pa = transition_probabilities2(mstate, matseq, nm)

pmat = matrix(0, 3, 3)

for (i in 1:(nm-1)) {
  print(paste("i = ", i))
  pmat = pa[i, , ]
  rownames(pmat) = colnames(pmat) = c("m", "i", "d")
  print(round(pmat, 2))
}

## [1] "i =  1"
##   m i d
## m 1 0 0
## i 0 0 0
## d 1 0 0
## [1] "i =  2"
##      m i    d
## m 0.75 0 0.25
## i 0.00 0 0.00
## d 0.00 0 0.00
## [1] "i =  3"
##     m i   d
## m 0.5 0 0.5
## i 0.0 0 0.0
## d 1.0 0 0.0
## [1] "i =  4"
##      m   i    d
## m 0.20 0.6 0.20
## i 0.33 0.5 0.17
## d 0.67 0.0 0.33
## [1] "i =  5"
##      m i    d
## m 1.00 0 0.00
## i 0.00 0 0.00
## d 0.67 0 0.33
## [1] "i =  6"
##      m i    d
## m 0.86 0 0.14
## i 0.00 0 0.00
## d 1.00 0 0.00
## [1] "i =  7"
##      m i    d
## m 0.86 0 0.14
## i 0.00 0 0.00
## d 1.00 0 0.00
## [1] "i =  8"
##      m    i    d
## m 0.43 0.43 0.14
## i 0.67 0.00 0.33
## d 1.00 0.00 0.00

b. Determine the alignment and the likelihood, given by the model for the following two sequences:

Based on the HMM profile based above, two sequences, “AVAAGDYDSY” and “TFADYYH”, are aligned as below:

library(aphid)
library(ape)

seqs_a = as.AAbin(seqs)
seqs_a = as.matrix(AAStringSet(seqs))
phmm = derivePHMM(seqs_a)

alignment1 = align(list(strsplit("AVAAGDYDSY","")[[1]]), phmm)
alignment2 = align(list(strsplit("TFADYYH","")[[1]]), phmm)


print(alignment1)

##      1   2   3   4   I   5   6   7   8   9  
## [1,] "A" "V" "A" "A" "G" "D" "Y" "D" "S" "Y"

print(forward(phmm, list(strsplit("AVAAGDYDSY","")[[1]]), odds=FALSE))

## Full (log) probability of sequence given model = -22.54357

The log probability of of sequence given model = -22.54357.

print(alignment2)

##      1   2   3   4   5   6   7   8   9  
## [1,] "T" "F" "A" "-" "D" "Y" "Y" "-" "H"

print(forward(phmm, list(strsplit("TFADYYH","")[[1]]), odds=FALSE))

## Full (log) probability of sequence given model = -17.89012

The log probability of of sequence given model = -17.89012.

The HMM profile is demonstrated as below:

plot(phmm)