The purpose of the following tutorial is to give a short and quick overview of the currenty (as of Sep, 2016) available motif search tools and databases. This is intended for the undergraduate students taking Bioinformatics algorithm
course. However, any newcomer in the broad field of Computational Biology can find it useful.
Researchers use motif searching techniques for two main purposes:
given a set of sequences (e.g. upstream regions from a set of genes, or ChIP-Seq locations), can we find a de-novo motif? and
given a set of motifs, can we find the putative occurrences of those motifs in regions of interest (e.g. genomic locations where certain bio-chemical property like methylation has been changed from healthy to disease conditions).
Currently, is maintained by GeneExplain (http://genexplain.com/transfac-1)
The usage of an older version of TRANSFAC is free of charge for non-profit users.
Access to the most up-to-date version requires a license.
- [Wikipedia]
Format: positional freqeuncy matrix
#read a file from local directory and print
read.table('/Users/msharmin/Box Sync/Mat/M00001.mat', sep='\n')
## V1
## 1 AC M00001 ID V$MYOD_01 NA MyoD
## 2 1 2 2 0
## 3 2 1 2 0
## 4 3 0 1 1
## 5 0 5 0 0
## 6 5 0 0 0
## 7 0 0 4 1
## 8 0 1 4 0
## 9 0 0 0 5
## 10 0 0 5 0
## 11 0 1 2 2
## 12 0 2 0 3
## 13 1 0 3 1
pfm = read.table('/Users/msharmin/Box Sync/Mat/M00001.mat', skip=1)
To access a motif matrix via R, we can directly use the JASPAR id
library(JASPAR2014)
library(TFBSTools)
data(MA0003.2)
MA0003.2
## An object of class PFMatrix
## ID: MA0003.2
## Name: TFAP2A
## Matrix Class: Zipper-Type
## strand: +
## Tags:
## $centrality_logp
## [1] "-4343"
##
## $family
## [1] "Helix-Loop-Helix"
##
## $medline
## [1] "10497269"
##
## $pazar_tf_id
## [1] "TF0000002"
##
## $source
## [1] "ENCODE"
##
## $tax_group
## [1] "vertebrates"
##
## $tfbs_shape_id
## [1] "264"
##
## $type
## [1] "ChIP-seq"
##
## $collection
## [1] "CORE"
##
## $species
## [1] "9606"
##
## $acc
## [1] "P05549"
##
## Background:
## A C G T
## 0.25 0.25 0.25 0.25
## Matrix:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## A 1387 2141 727 1517 56 0 0 62 346 3738 460 0 116
## C 1630 1060 1506 519 1199 5098 4762 1736 2729 236 0 0 1443
## G 851 792 884 985 3712 0 0 85 1715 920 4638 5098 3455
## T 1230 1105 1981 2077 131 0 336 3215 308 204 0 0 84
## [,14] [,15]
## A 451 3146
## C 3672 690
## G 465 168
## T 510 1094
Or, we can specify other options, e.g. species, name, construction type etc.
opts <- list()
opts[["species"]] <- 9606 #species id
opts[["name"]] <- "CEBPB" #TF name
opts[["type"]] <- "ChIP-seq" #what kind of data has been used to construct the TF
PFMatrixList <- getMatrixSet(JASPAR2014, opts)
PFMatrixList[[1]]
## An object of class PFMatrix
## ID: MA0466.1
## Name: CEBPB
## Matrix Class: Zipper-Type
## strand: +
## Tags:
## $alias
## [1] "LAP,CRP2,NFIL6,IL6DBP,C/EBP-beta"
##
## $centrality_logp
## [1] "-188131"
##
## $description
## [1] "CCAAT/enhancer binding protein (C/EBP),beta"
##
## $family
## [1] "Leucine-Zipper"
##
## $medline
## [1] "8380454"
##
## $source
## [1] "ENCODE"
##
## $symbol
## [1] "CEBPB"
##
## $tax_group
## [1] "vertebrates"
##
## $tfbs_shape_id
## [1] "270"
##
## $type
## [1] "ChIP-seq"
##
## $collection
## [1] "CORE"
##
## $species
## 9606
## "Homo sapiens"
##
## $acc
## [1] "P17676"
##
## Background:
## A C G T
## 0.25 0.25 0.25 0.25
## Matrix:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## A 13006 75198 0 0 4556 0 74715 8654 60151 99494 0
## C 10026 5868 0 0 0 99494 5478 51954 39343 0 36038
## G 33617 18428 0 0 75531 0 10015 0 0 0 2043
## T 42845 0 99494 99494 19407 0 9286 38886 0 0 61413
From the position frequency matrix, we can canculate PPM, PWM, Information content, consensus, qnd motif logo
pfm = PFMatrixList@listData$MA0466.1#MA0491.1
ppm <- toPWM(pfm, type="prob", pseudocounts=0.8); ppm@profileMatrix
## [,1] [,2] [,3] [,4] [,5]
## A 0.1307224 7.558003e-01 2.010155e-06 2.010155e-06 4.579335e-02
## C 0.1007711 5.897997e-02 2.010155e-06 2.010155e-06 2.010155e-06
## G 0.3378790 1.852177e-01 2.010155e-06 2.010155e-06 7.591472e-01
## T 0.4306275 2.010155e-06 9.999940e-01 9.999940e-01 1.950574e-01
## [,6] [,7] [,8] [,9] [,10]
## A 2.010155e-06 0.75094578 8.698143e-02 6.045663e-01 9.999940e-01
## C 9.999940e-01 0.05506016 5.221801e-01 3.954297e-01 2.010155e-06
## G 2.010155e-06 0.10066054 2.010155e-06 2.010155e-06 2.010155e-06
## T 2.010155e-06 0.09333352 3.908365e-01 2.010155e-06 2.010155e-06
## [,11]
## A 2.010155e-06
## C 3.622119e-01
## G 2.053575e-02
## T 6.172503e-01
pwm <- toPWM(pfm, type="log2probratio", pseudocounts=0.8); pwm@profileMatrix
## [,1] [,2] [,3] [,4] [,5] [,6]
## A -0.9354216 1.596077 -16.924262 -16.924262 -2.4487181 -16.924262
## C -1.3108462 -2.083631 -16.924262 -16.924262 -16.9242616 1.999991
## G 0.4345784 -0.432706 -16.924262 -16.924262 1.6024516 -16.924262
## T 0.7845125 -16.924262 1.999991 1.999991 -0.3580291 -16.924262
## [,7] [,8] [,9] [,10] [,11]
## A 1.586781 -1.5231488 1.2739724 1.999991 -16.9242616
## C -2.182847 1.0626193 0.6614932 -16.924262 0.5349058
## G -1.312430 -16.9242616 -16.9242616 -16.924262 -3.6057188
## T -1.421461 0.6446371 -16.9242616 -16.924262 1.3039277
icm <- toICM(pfm, pseudocounts=0.8, bg=c(A=0.25, C=0.25, G=0.25, T=0.25)); icm@profileMatrix
## [,1] [,2] [,3] [,4] [,5]
## A 0.03010427 7.582522e-01 4.020064e-06 4.020064e-06 4.737282e-02
## C 0.02320674 5.917131e-02 4.020064e-06 4.020064e-06 2.079488e-06
## G 0.07781069 1.858186e-01 4.020064e-06 4.020064e-06 7.853312e-01
## T 0.09916990 2.016677e-06 1.999865e+00 1.999865e+00 2.017852e-01
## [,6] [,7] [,8] [,9] [,10]
## A 4.020064e-06 0.60572672 5.865215e-02 6.237351e-01 1.999865e+00
## C 1.999865e+00 0.04441254 3.521095e-01 4.079675e-01 4.020064e-06
## G 4.020064e-06 0.08119465 1.355461e-06 2.073891e-06 4.020064e-06
## T 4.020064e-06 0.07528454 2.635436e-01 2.073891e-06 4.020064e-06
## [,11]
## A 1.858427e-06
## C 3.348717e-01
## G 1.898569e-02
## T 5.706596e-01
TFBSTools::seqLogo(icm)
I took 200 DNA fragments from CEBPB
ChIP-seq data of blood cell. Let’s try to find out what de-novo motif we see, and compar the de-novo motif with the CEBPB
motif from database.
library(rGADEM)
chipData = read.csv('/Users/msharmin/Downloads/seqbalanced1b.csv', sep=' ');
chipData = as.character(chipData[chipData$label==1, 'sequence'])[1:200];
dnaTexts = DNAStringSet(unlist(sapply(chipData, DNAString)))
result = GADEM(dnaTexts, verbose = F)
## top 3 4, 5-mers: 20 40 60
## top 3 4, 5-mers: 10 40 58
#These are the strings which has been extracted from given sequences, to build the motif-matrix
sapply(result@motifList[[1]]@alignList, function(x) x@seq)[1:10]
## [1] "ATTTCCTGTC" "GTTTCCTGTA" "GTTTCCTGTC" "GTTTCCTGTA" "CTTTCCTGTT"
## [6] "CTTTCCTGTT" "TTTTCCTGTC" "ATTTCCTGTA" "TTTTCCTCTT" "GTTTCCTCTC"
par(mfrow=c(1,2))
seqLogo(result@motifList[[1]]@pwm);
TFBSTools::seqLogo(icm)
Motif scanning: let’s see the most probable candidate motif from 1st dnaText
library(Biostrings)
subject <- DNAString(chipData[1])#
siteset <- searchSeq(pwm, subject, seqname="seq1", min.score="60%", strand="*")
siteset[c(12)]
## An object of class SiteSet with 1 site sequence
## seqname source feature start end score strand frame
## 1 seq1 TFBS TFBS 64 74 -6.333091 + .
## attributes
## 1 TF=CEBPB;class=Zipper-Type;sequence=AATTGTGCAAT
seqLogo(toPWM(pfm@profileMatrix, 'prob'))
chipPeaks = read.table('/Users/msharmin/Downloads/Gm12878Cebpb.narrowPeak', sep='\t')
colnames(chipPeaks) = c('chr', 'start', 'end', 'name', 'score', 'strand', 'value', 'p', 'q', 'point')
head(chipPeaks)
## chr start end name score strand value p q point
## 1 chr11 62608831 62609368 . 1000 . 182.6336 -1 3.821579 282
## 2 chr4 163418781 163419076 . 1000 . 166.0033 -1 3.821579 154
## 3 chr6 86353764 86354075 . 989 . 154.8770 -1 3.821579 154
## 4 chr2 69016741 69017066 . 988 . 154.6890 -1 3.821579 145
## 5 chr17 56708951 56709260 . 947 . 148.2397 -1 3.821579 129
## 6 chr4 105887826 105888119 . 924 . 144.6403 -1 3.821579 153