August 6, 2021
segmenter
The goal of the segmenter
package is to
The package can be installed from Bioconductor using BiocManager
or from GitHub using remotes
# install from bioconductor BiocManager::install('segmenter') # install from github remotes::install_github('MahShaaban/segmenter@devel')
# run command obj <- learn_model( inputdir = <input binarized files>, coordsdir = <genomic coordinates of features>, anchorsdir = <genomic location of TSS>, chromsizefile = <chromosome lengths>, numstates = <the number of desired states>, assembly = <the genome assembly>, cells = <names of the biological samples>, annotation = <name of the reference genome type>, binsize = <the size of bins> )
segmenter
ChromHMM requires two types of input files. Those are
# coordinates coordsdir <- 'hg38_coords' list.files(coordsdir) #> [1] "CpGIsland.hg38.bed.gz" "RefSeqExon.hg38.bed.gz" #> [3] "RefSeqGene.hg38.bed.gz" "RefSeqTES.hg38.bed.gz" #> [5] "RefSeqTSS.hg38.bed.gz" "RefSeqTSS2kb.hg38.bed.gz" # anchors anchorsdir <- 'hg38_anchors' list.files(anchorsdir) #> [1] "RefSeqTES.hg38.txt.gz" "RefSeqTSS.hg38.txt.gz" # chromosomes' sizes chromsizefile <- 'hg38_chromsizes.txt' read.delim(chromsizefile, header = FALSE, nrows = 2) #> V1 V2 #> 1 chr1 248956422 #> 2 chr2 242193529
binarize_bam
or bed binarize_bed
# a table to assign marker and cell names to the bam files cell_marks <- 'cell_mark_file.tsv' read.delim(cell_marks, header = FALSE, nrows = 3) #> V1 V2 V3 #> 1 GM12878 H3K27me3 ENCFF796DDM.bam #> 2 GM12878 H3K27me3 ENCFF927XRX.bam #> 3 GM12878 H3K27me3 ENCFF265UBT.bam
# run command binarize_bam('bam/', chromsizefile = chromsizefile, cell_marks = cell_marks, outputdir = 'bins_200')
# show output files example_binaries <- list.files('bins_200', full.names = TRUE)[1] read.delim(example_binaries, nrows = 2, skip = 1) #> H2AFZ H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me2 H3K4me3 H3K79me2 H3K9ac #> 1 0 0 0 0 0 0 0 0 0 #> 2 0 0 0 0 0 0 0 0 0 #> H3K9me3 H4K20me1 #> 1 0 0 #> 2 0 0
learn_model
wraps the the Java module that learns a chromatin segmentation model of a given number of states.
# load library library(segmenter) # run the main command obj <- learn_model( inputdir = 'bins_200', outputdir = 'models/model_10', anchorsdir = 'hg38_anchors/', coordsdir = 'hg38_coords/', chromsizefile = 'hg38_chromsizes.txt', numstates = 10, cells = c('GM12878', 'H1', 'HepG2', 'IMR-90', 'K562'), assembly = 'hg38', annotation = 'RefSeq', binsize = 200 )
# show the object class(obj) #> [1] "segmentation" #> attr(,"package") #> [1] "segmenter"
model
: the initial and final parametersemission
: the probabilities of the mark in a given statetransition
: the probabilities of the states transitionsoverlap
: the enrichment of the states at genomic featuresTSS
: the enrichment of the states around the TSSTES
: the enrichment of the states around the TESsegment
: the assignment of states to the binAn accessor function with the same of every slot access its contents. For example, to access the emission probabilities, call emission
# access object slots emission(obj)
Some accessors have more arguments to subset the object. For example, the segment
method take a cell
name to return only the segments in the corresponding cell.
# subset the segment slot segment(obj, cell = 'K562')
# show the object show(obj) #> # An object of class 'segmentation' #> # Contains a chromatin segmentation model: #> ## States: 1 2 3 4 5 6 7 8 9 10 #> ## Marks: H3K36me3 H4K20me1 H3K79me2 H3K4me1 H3K27ac H3K9ac H3K4me2 H3K4me3 H2AFZ H3K27me3 H3K9me3 #> ## Cells: GM12878 H1 HepG2 IMR-90 K562 #> # Contains nine slots: #> ## model: use 'model(object)' to access #> ## emission: use 'emission(object)' to access #> ## transition: use 'transition(object)' to access #> ## overlap: use 'overlap(object)' to access #> ## TSS: use 'TSS(object)' to access #> ## TES: use 'TES(object)' to access #> ## segment: use 'segment(object)' to access #> ## bins: use 'bins(object)' to access #> ## counts: use 'counts(object)' to access #> # For more info about how to use the object, use ?accessors
# access object slots emission(obj) transition(obj)
plot_heatmap
takes a segmentation
object and visualize the slot in type
. By default, this is emission
.Heatmap
object from the ComplexHeatmap
package. These objects are customized to produce diverse informative figures.# emission and transition plots plot_heatmap(obj, row_labels = paste0('S', 1:10), name = 'Emission') plot_heatmap(obj, type = 'transition', row_labels = paste0('S', 1:10), column_labels = paste0('S', 1:10), name = 'Transition')
overlap
slot contains the fold enrichment of each state in the genomic coordinates.# overlap enrichment plot_heatmap( obj, type = 'overlap', column_labels = c('Genome', 'CpG', 'Exon', 'Gene', 'TES', 'TSS', 'TSS2kb') )
segment
contains the state assingmentGRanges
object with the chromosome name, start and end sites in the ranges part of the object and the name of the state in a metadata columns.# get segments of all cells segment(obj) # get segments of GM12878 segment(obj, 'GM12878')
# get segments segment(obj, 'GM12878') #> $GM12878 #> GRanges object with 748339 ranges and 1 metadata column: #> seqnames ranges strand | state #> <Rle> <IRanges> <Rle> | <character> #> [1] chr10 0-73800 * | E2 #> [2] chr10 73800-74200 * | E5 #> [3] chr10 74200-76200 * | E4 #> [4] chr10 76200-76800 * | E5 #> [5] chr10 76800-79000 * | E4 #> ... ... ... ... . ... #> [748335] chrY 56851800-56855200 * | E2 #> [748336] chrY 56855200-56859200 * | E1 #> [748337] chrY 56859200-56868600 * | E2 #> [748338] chrY 56868600-56874600 * | E1 #> [748339] chrY 56874600-57227400 * | E2 #> ------- #> seqinfo: 179 sequences from an unspecified genome; no seqlengths
Gviz
’s AnnotationTrack
to render the ranges as tracks grouped by the state
column in the GRanges
object for each of the cell lines.# load txdb pacakge library(TxDb.Hsapiens.UCSC.hg18.knownGene) # gene gene coordinates txdb <- TxDb.Hsapiens.UCSC.hg18.knownGene gen <- genes(txdb, filter = list(gene_id = 38)) # extend genomic region prom <- promoters(gen, upstream = 10000, downstream = 10000)
# load gviz library(Gviz) library(GenomicRanges) # annotation track tracks <- lapply(segment(obj), function(x) { ## subset segments to the promoter region seg <- subsetByOverlaps(x, prom) ## make annotation track AnnotationTrack(seg, group = seg$state, name = names(x)) })
# plot the track plotTracks(tracks$GM12878)
Other tracks can be added to the plot to make it more informative. Here, we used
IdeogramTrack
to show a graphic representation of chromosome 11GenomeAxisTrack
to show a scale of the exact location on the chromosomeGeneRegionTrack
to show the exon, intron and transcripts of the target geneThose can be put together in one plot using plotTracks
# ideogram track itrack <- IdeogramTrack(genome = 'hg38', chromosome = 11) # genome axis track gtrack <- GenomeAxisTrack() # gene region track data("geneModels") grtrack <- GeneRegionTrack(geneModels, genom = 'hg38', chromosome = 11, name = 'ACAT1')
# put all tracks together plotTracks( list( itrack, grtrack, tracks$GM12878, tracks$H1 ), from = min(start(prom)), to = max(end(gen)), groupAnnotation = 'group' )
get_frequency
does that and returns the output in tabular or graphic formats.# get segment frequency freq <- get_frequency(segment(obj), tidy = TRUE) head(freq) #> state frequency cell #> 1 E1 39265 GM12878 #> 2 E10 44178 GM12878 #> 3 E2 134265 GM12878 #> 4 E3 110055 GM12878 #> 5 E4 130492 GM12878 #> 6 E5 131189 GM12878
The frequency of the states in each cell can also be normalized by the total number of states.
# frequency plots par(mfrow=c(1,2)) get_frequency(segment(obj), plot = TRUE, ylab = 'State Fraction') get_frequency(segment(obj), normalize = TRUE, plot = TRUE, ylab = 'Normalized State Fraction')
learn_model
several times using lapply
with the same inputs except the number of states (numstates
).segmentation
objects. segmenter
contain functions to do basic comparison between the models.# relearn the models with 3 to 8 states objs <- lapply(5:15, function(x) { learn_model(..., numstates = x) })
compare_models
takes a list of segmentation
objects and returns a vector with the same length.# compare the models max correlation between the states compare_models(objs) #> 5 6 7 8 9 10 11 12 #> 0.9996889 0.9996790 0.9975066 0.9969124 0.9997405 0.9997547 0.9999885 0.9999835 #> 13 14 15 #> 0.9999947 0.9999913 1.0000000
type
argument.# compare the models likelihood compare_models(objs, type = 'likelihood') #> 5 6 7 8 9 10 11 #> -111813816 -108830828 -104771345 -103275667 -101839739 -100997027 -99990825 #> 12 13 14 15 #> -98810943 -98086895 -97544467 -97041000
Setting plot = TRUE
returns a plot with data points corresponding to the models in the list.
# compare models plots par(mfrow=c(1,2)) compare_models(objs, plot = TRUE, xlab = 'Model', ylab = 'State Correlation') compare_models(objs, type = 'likelihood', plot = TRUE, xlab = 'Model', ylab = 'Model Likelihood')
#> integer(0)
Thanks to my collaborators and the bioconductor team for reviewing the pacakge.
To learn more, check our github repo.