The purpose of the following tutorial is to give a short and quick overview of the currenty (as of Sep, 2016) available motif search tools and databases. This is intended for the undergraduate students taking Bioinformatics algorithm course. However, any newcomer in the broad field of Computational Biology can find it useful.

Researchers use motif searching techniques for two main purposes:

  1. given a set of sequences (e.g. upstream regions from a set of genes, or ChIP-Seq locations), can we find a de-novo motif? and

  2. given a set of motifs, can we find the putative occurrences of those motifs in regions of interest (e.g. genomic locations where certain bio-chemical property like methylation has been changed from healthy to disease conditions).

Motif Databases

#read a file from local directory and print
read.table('/Users/msharmin/Box Sync/Mat/M00001.mat', sep='\n')
##                                V1
## 1  AC M00001 ID V$MYOD_01 NA MyoD
## 2                        1 2 2 0 
## 3                        2 1 2 0 
## 4                        3 0 1 1 
## 5                        0 5 0 0 
## 6                        5 0 0 0 
## 7                        0 0 4 1 
## 8                        0 1 4 0 
## 9                        0 0 0 5 
## 10                       0 0 5 0 
## 11                       0 1 2 2 
## 12                       0 2 0 3 
## 13                       1 0 3 1
pfm = read.table('/Users/msharmin/Box Sync/Mat/M00001.mat', skip=1)

To access a motif matrix via R, we can directly use the JASPAR id

library(JASPAR2014)
library(TFBSTools)
data(MA0003.2)
MA0003.2
## An object of class PFMatrix
## ID: MA0003.2
## Name: TFAP2A
## Matrix Class: Zipper-Type
## strand: +
## Tags: 
## $centrality_logp
## [1] "-4343"
## 
## $family
## [1] "Helix-Loop-Helix"
## 
## $medline
## [1] "10497269"
## 
## $pazar_tf_id
## [1] "TF0000002"
## 
## $source
## [1] "ENCODE"
## 
## $tax_group
## [1] "vertebrates"
## 
## $tfbs_shape_id
## [1] "264"
## 
## $type
## [1] "ChIP-seq"
## 
## $collection
## [1] "CORE"
## 
## $species
## [1] "9606"
## 
## $acc
## [1] "P05549"
## 
## Background: 
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## Matrix: 
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## A 1387 2141  727 1517   56    0    0   62  346  3738   460     0   116
## C 1630 1060 1506  519 1199 5098 4762 1736 2729   236     0     0  1443
## G  851  792  884  985 3712    0    0   85 1715   920  4638  5098  3455
## T 1230 1105 1981 2077  131    0  336 3215  308   204     0     0    84
##   [,14] [,15]
## A   451  3146
## C  3672   690
## G   465   168
## T   510  1094

Or, we can specify other options, e.g. species, name, construction type etc.

opts <- list()
opts[["species"]] <- 9606 #species id
opts[["name"]] <- "CEBPB" #TF name
opts[["type"]] <- "ChIP-seq" #what kind of data has been used to construct the TF
PFMatrixList <- getMatrixSet(JASPAR2014, opts)
PFMatrixList[[1]]
## An object of class PFMatrix
## ID: MA0466.1
## Name: CEBPB
## Matrix Class: Zipper-Type
## strand: +
## Tags: 
## $alias
## [1] "LAP,CRP2,NFIL6,IL6DBP,C/EBP-beta"
## 
## $centrality_logp
## [1] "-188131"
## 
## $description
## [1] "CCAAT/enhancer binding protein (C/EBP),beta"
## 
## $family
## [1] "Leucine-Zipper"
## 
## $medline
## [1] "8380454"
## 
## $source
## [1] "ENCODE"
## 
## $symbol
## [1] "CEBPB"
## 
## $tax_group
## [1] "vertebrates"
## 
## $tfbs_shape_id
## [1] "270"
## 
## $type
## [1] "ChIP-seq"
## 
## $collection
## [1] "CORE"
## 
## $species
##           9606 
## "Homo sapiens" 
## 
## $acc
## [1] "P17676"
## 
## Background: 
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## Matrix: 
##    [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11]
## A 13006 75198     0     0  4556     0 74715  8654 60151 99494     0
## C 10026  5868     0     0     0 99494  5478 51954 39343     0 36038
## G 33617 18428     0     0 75531     0 10015     0     0     0  2043
## T 42845     0 99494 99494 19407     0  9286 38886     0     0 61413

From the position frequency matrix, we can canculate PPM, PWM, Information content, consensus, qnd motif logo

pfm = PFMatrixList@listData$MA0466.1#MA0491.1
ppm <- toPWM(pfm, type="prob", pseudocounts=0.8); ppm@profileMatrix
##        [,1]         [,2]         [,3]         [,4]         [,5]
## A 0.1307224 7.558003e-01 2.010155e-06 2.010155e-06 4.579335e-02
## C 0.1007711 5.897997e-02 2.010155e-06 2.010155e-06 2.010155e-06
## G 0.3378790 1.852177e-01 2.010155e-06 2.010155e-06 7.591472e-01
## T 0.4306275 2.010155e-06 9.999940e-01 9.999940e-01 1.950574e-01
##           [,6]       [,7]         [,8]         [,9]        [,10]
## A 2.010155e-06 0.75094578 8.698143e-02 6.045663e-01 9.999940e-01
## C 9.999940e-01 0.05506016 5.221801e-01 3.954297e-01 2.010155e-06
## G 2.010155e-06 0.10066054 2.010155e-06 2.010155e-06 2.010155e-06
## T 2.010155e-06 0.09333352 3.908365e-01 2.010155e-06 2.010155e-06
##          [,11]
## A 2.010155e-06
## C 3.622119e-01
## G 2.053575e-02
## T 6.172503e-01
pwm <- toPWM(pfm, type="log2probratio", pseudocounts=0.8); pwm@profileMatrix
##         [,1]       [,2]       [,3]       [,4]        [,5]       [,6]
## A -0.9354216   1.596077 -16.924262 -16.924262  -2.4487181 -16.924262
## C -1.3108462  -2.083631 -16.924262 -16.924262 -16.9242616   1.999991
## G  0.4345784  -0.432706 -16.924262 -16.924262   1.6024516 -16.924262
## T  0.7845125 -16.924262   1.999991   1.999991  -0.3580291 -16.924262
##        [,7]        [,8]        [,9]      [,10]       [,11]
## A  1.586781  -1.5231488   1.2739724   1.999991 -16.9242616
## C -2.182847   1.0626193   0.6614932 -16.924262   0.5349058
## G -1.312430 -16.9242616 -16.9242616 -16.924262  -3.6057188
## T -1.421461   0.6446371 -16.9242616 -16.924262   1.3039277
icm <- toICM(pfm, pseudocounts=0.8, bg=c(A=0.25, C=0.25, G=0.25, T=0.25)); icm@profileMatrix
##         [,1]         [,2]         [,3]         [,4]         [,5]
## A 0.03010427 7.582522e-01 4.020064e-06 4.020064e-06 4.737282e-02
## C 0.02320674 5.917131e-02 4.020064e-06 4.020064e-06 2.079488e-06
## G 0.07781069 1.858186e-01 4.020064e-06 4.020064e-06 7.853312e-01
## T 0.09916990 2.016677e-06 1.999865e+00 1.999865e+00 2.017852e-01
##           [,6]       [,7]         [,8]         [,9]        [,10]
## A 4.020064e-06 0.60572672 5.865215e-02 6.237351e-01 1.999865e+00
## C 1.999865e+00 0.04441254 3.521095e-01 4.079675e-01 4.020064e-06
## G 4.020064e-06 0.08119465 1.355461e-06 2.073891e-06 4.020064e-06
## T 4.020064e-06 0.07528454 2.635436e-01 2.073891e-06 4.020064e-06
##          [,11]
## A 1.858427e-06
## C 3.348717e-01
## G 1.898569e-02
## T 5.706596e-01
TFBSTools::seqLogo(icm)

Tools and Options?

I took 200 DNA fragments from CEBPB ChIP-seq data of blood cell. Let’s try to find out what de-novo motif we see, and compar the de-novo motif with the CEBPB motif from database.

library(rGADEM)
chipData = read.csv('/Users/msharmin/Downloads/seqbalanced1b.csv', sep=' '); 
chipData = as.character(chipData[chipData$label==1, 'sequence'])[1:200]; 
dnaTexts = DNAStringSet(unlist(sapply(chipData, DNAString)))
result = GADEM(dnaTexts, verbose = F)
## top 3  4, 5-mers: 20 40 60
## top 3  4, 5-mers: 10 40 58
#These are the strings which has been extracted from given sequences, to build the motif-matrix
sapply(result@motifList[[1]]@alignList, function(x) x@seq)[1:10]
##  [1] "ATTTCCTGTC" "GTTTCCTGTA" "GTTTCCTGTC" "GTTTCCTGTA" "CTTTCCTGTT"
##  [6] "CTTTCCTGTT" "TTTTCCTGTC" "ATTTCCTGTA" "TTTTCCTCTT" "GTTTCCTCTC"
par(mfrow=c(1,2))
seqLogo(result@motifList[[1]]@pwm); 

TFBSTools::seqLogo(icm)

Motif scanning: let’s see the most probable candidate motif from 1st dnaText

library(Biostrings)
subject <- DNAString(chipData[1])#
siteset <- searchSeq(pwm, subject, seqname="seq1", min.score="60%", strand="*")
siteset[c(12)]
## An object of class SiteSet with 1 site sequence
##   seqname source feature start end     score strand frame
## 1    seq1   TFBS    TFBS    64  74 -6.333091      +     .
##                                        attributes
## 1 TF=CEBPB;class=Zipper-Type;sequence=AATTGTGCAAT
seqLogo(toPWM(pfm@profileMatrix, 'prob'))

Where do I get the sequences?

chipPeaks = read.table('/Users/msharmin/Downloads/Gm12878Cebpb.narrowPeak', sep='\t')
colnames(chipPeaks) = c('chr', 'start', 'end', 'name', 'score', 'strand', 'value', 'p', 'q', 'point')
head(chipPeaks)
##     chr     start       end name score strand    value  p        q point
## 1 chr11  62608831  62609368    .  1000      . 182.6336 -1 3.821579   282
## 2  chr4 163418781 163419076    .  1000      . 166.0033 -1 3.821579   154
## 3  chr6  86353764  86354075    .   989      . 154.8770 -1 3.821579   154
## 4  chr2  69016741  69017066    .   988      . 154.6890 -1 3.821579   145
## 5 chr17  56708951  56709260    .   947      . 148.2397 -1 3.821579   129
## 6  chr4 105887826 105888119    .   924      . 144.6403 -1 3.821579   153

Where we are right now?