August 6, 2021
segmenterThe goal of the segmenter package is to
The package can be installed from Bioconductor using BiocManager or from GitHub using remotes
# install from bioconductor
BiocManager::install('segmenter')
# install from github
remotes::install_github('MahShaaban/segmenter@devel')
# run command
obj <- learn_model(
inputdir = <input binarized files>,
coordsdir = <genomic coordinates of features>,
anchorsdir = <genomic location of TSS>,
chromsizefile = <chromosome lengths>,
numstates = <the number of desired states>,
assembly = <the genome assembly>,
cells = <names of the biological samples>,
annotation = <name of the reference genome type>,
binsize = <the size of bins>
)
segmenterChromHMM requires two types of input files. Those are
# coordinates coordsdir <- 'hg38_coords' list.files(coordsdir) #> [1] "CpGIsland.hg38.bed.gz" "RefSeqExon.hg38.bed.gz" #> [3] "RefSeqGene.hg38.bed.gz" "RefSeqTES.hg38.bed.gz" #> [5] "RefSeqTSS.hg38.bed.gz" "RefSeqTSS2kb.hg38.bed.gz" # anchors anchorsdir <- 'hg38_anchors' list.files(anchorsdir) #> [1] "RefSeqTES.hg38.txt.gz" "RefSeqTSS.hg38.txt.gz" # chromosomes' sizes chromsizefile <- 'hg38_chromsizes.txt' read.delim(chromsizefile, header = FALSE, nrows = 2) #> V1 V2 #> 1 chr1 248956422 #> 2 chr2 242193529
binarize_bam or bed binarize_bed# a table to assign marker and cell names to the bam files cell_marks <- 'cell_mark_file.tsv' read.delim(cell_marks, header = FALSE, nrows = 3) #> V1 V2 V3 #> 1 GM12878 H3K27me3 ENCFF796DDM.bam #> 2 GM12878 H3K27me3 ENCFF927XRX.bam #> 3 GM12878 H3K27me3 ENCFF265UBT.bam
# run command
binarize_bam('bam/',
chromsizefile = chromsizefile,
cell_marks = cell_marks,
outputdir = 'bins_200')
# show output files
example_binaries <- list.files('bins_200', full.names = TRUE)[1]
read.delim(example_binaries, nrows = 2, skip = 1)
#> H2AFZ H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me2 H3K4me3 H3K79me2 H3K9ac
#> 1 0 0 0 0 0 0 0 0 0
#> 2 0 0 0 0 0 0 0 0 0
#> H3K9me3 H4K20me1
#> 1 0 0
#> 2 0 0
learn_model wraps the the Java module that learns a chromatin segmentation model of a given number of states.
# load library
library(segmenter)
# run the main command
obj <- learn_model(
inputdir = 'bins_200',
outputdir = 'models/model_10',
anchorsdir = 'hg38_anchors/',
coordsdir = 'hg38_coords/',
chromsizefile = 'hg38_chromsizes.txt',
numstates = 10,
cells = c('GM12878', 'H1', 'HepG2', 'IMR-90', 'K562'),
assembly = 'hg38',
annotation = 'RefSeq',
binsize = 200
)
# show the object class(obj) #> [1] "segmentation" #> attr(,"package") #> [1] "segmenter"
model: the initial and final parametersemission: the probabilities of the mark in a given statetransition: the probabilities of the states transitionsoverlap: the enrichment of the states at genomic featuresTSS: the enrichment of the states around the TSSTES: the enrichment of the states around the TESsegment: the assignment of states to the binAn accessor function with the same of every slot access its contents. For example, to access the emission probabilities, call emission
# access object slots emission(obj)
Some accessors have more arguments to subset the object. For example, the segment method take a cell name to return only the segments in the corresponding cell.
# subset the segment slot segment(obj, cell = 'K562')
# show the object show(obj) #> # An object of class 'segmentation' #> # Contains a chromatin segmentation model: #> ## States: 1 2 3 4 5 6 7 8 9 10 #> ## Marks: H3K36me3 H4K20me1 H3K79me2 H3K4me1 H3K27ac H3K9ac H3K4me2 H3K4me3 H2AFZ H3K27me3 H3K9me3 #> ## Cells: GM12878 H1 HepG2 IMR-90 K562 #> # Contains nine slots: #> ## model: use 'model(object)' to access #> ## emission: use 'emission(object)' to access #> ## transition: use 'transition(object)' to access #> ## overlap: use 'overlap(object)' to access #> ## TSS: use 'TSS(object)' to access #> ## TES: use 'TES(object)' to access #> ## segment: use 'segment(object)' to access #> ## bins: use 'bins(object)' to access #> ## counts: use 'counts(object)' to access #> # For more info about how to use the object, use ?accessors
# access object slots emission(obj) transition(obj)
plot_heatmap takes a segmentation object and visualize the slot in type. By default, this is emission.Heatmap object from the ComplexHeatmap package. These objects are customized to produce diverse informative figures.# emission and transition plots
plot_heatmap(obj,
row_labels = paste0('S', 1:10),
name = 'Emission')
plot_heatmap(obj,
type = 'transition',
row_labels = paste0('S', 1:10),
column_labels = paste0('S', 1:10),
name = 'Transition')
Emission and Transition
overlap slot contains the fold enrichment of each state in the genomic coordinates.# overlap enrichment
plot_heatmap(
obj,
type = 'overlap',
column_labels = c('Genome', 'CpG', 'Exon', 'Gene',
'TES', 'TSS', 'TSS2kb')
)
Overlap Enrichment
segment contains the state assingmentGRanges object with the chromosome name, start and end sites in the ranges part of the object and the name of the state in a metadata columns.# get segments of all cells segment(obj) # get segments of GM12878 segment(obj, 'GM12878')
# get segments segment(obj, 'GM12878') #> $GM12878 #> GRanges object with 748339 ranges and 1 metadata column: #> seqnames ranges strand | state #> <Rle> <IRanges> <Rle> | <character> #> [1] chr10 0-73800 * | E2 #> [2] chr10 73800-74200 * | E5 #> [3] chr10 74200-76200 * | E4 #> [4] chr10 76200-76800 * | E5 #> [5] chr10 76800-79000 * | E4 #> ... ... ... ... . ... #> [748335] chrY 56851800-56855200 * | E2 #> [748336] chrY 56855200-56859200 * | E1 #> [748337] chrY 56859200-56868600 * | E2 #> [748338] chrY 56868600-56874600 * | E1 #> [748339] chrY 56874600-57227400 * | E2 #> ------- #> seqinfo: 179 sequences from an unspecified genome; no seqlengths
Gviz’s AnnotationTrack to render the ranges as tracks grouped by the state column in the GRanges object for each of the cell lines.# load txdb pacakge library(TxDb.Hsapiens.UCSC.hg18.knownGene) # gene gene coordinates txdb <- TxDb.Hsapiens.UCSC.hg18.knownGene gen <- genes(txdb, filter = list(gene_id = 38)) # extend genomic region prom <- promoters(gen, upstream = 10000, downstream = 10000)
# load gviz
library(Gviz)
library(GenomicRanges)
# annotation track
tracks <- lapply(segment(obj), function(x) {
## subset segments to the promoter region
seg <- subsetByOverlaps(x, prom)
## make annotation track
AnnotationTrack(seg,
group = seg$state,
name = names(x))
})
# plot the track plotTracks(tracks$GM12878)
Segmentation track
Other tracks can be added to the plot to make it more informative. Here, we used
IdeogramTrack to show a graphic representation of chromosome 11GenomeAxisTrack to show a scale of the exact location on the chromosomeGeneRegionTrack to show the exon, intron and transcripts of the target geneThose can be put together in one plot using plotTracks
# ideogram track
itrack <- IdeogramTrack(genome = 'hg38', chromosome = 11)
# genome axis track
gtrack <- GenomeAxisTrack()
# gene region track
data("geneModels")
grtrack <- GeneRegionTrack(geneModels,
genom = 'hg38',
chromosome = 11,
name = 'ACAT1')
# put all tracks together
plotTracks(
list(
itrack,
grtrack,
tracks$GM12878, tracks$H1
),
from = min(start(prom)),
to = max(end(gen)),
groupAnnotation = 'group'
)
Full tracks
get_frequency does that and returns the output in tabular or graphic formats.# get segment frequency freq <- get_frequency(segment(obj), tidy = TRUE) head(freq) #> state frequency cell #> 1 E1 39265 GM12878 #> 2 E10 44178 GM12878 #> 3 E2 134265 GM12878 #> 4 E3 110055 GM12878 #> 5 E4 130492 GM12878 #> 6 E5 131189 GM12878
The frequency of the states in each cell can also be normalized by the total number of states.
# frequency plots
par(mfrow=c(1,2))
get_frequency(segment(obj),
plot = TRUE,
ylab = 'State Fraction')
get_frequency(segment(obj),
normalize = TRUE,
plot = TRUE,
ylab = 'Normalized State Fraction')
State frequency
learn_model several times using lapply with the same inputs except the number of states (numstates).segmentation objects. segmenter contain functions to do basic comparison between the models.# relearn the models with 3 to 8 states
objs <- lapply(5:15,
function(x) {
learn_model(..., numstates = x)
})
compare_models takes a list of segmentation objects and returns a vector with the same length.# compare the models max correlation between the states compare_models(objs) #> 5 6 7 8 9 10 11 12 #> 0.9996889 0.9996790 0.9975066 0.9969124 0.9997405 0.9997547 0.9999885 0.9999835 #> 13 14 15 #> 0.9999947 0.9999913 1.0000000
type argument.# compare the models likelihood compare_models(objs, type = 'likelihood') #> 5 6 7 8 9 10 11 #> -111813816 -108830828 -104771345 -103275667 -101839739 -100997027 -99990825 #> 12 13 14 15 #> -98810943 -98086895 -97544467 -97041000
Setting plot = TRUE returns a plot with data points corresponding to the models in the list.
# compare models plots
par(mfrow=c(1,2))
compare_models(objs,
plot = TRUE,
xlab = 'Model',
ylab = 'State Correlation')
compare_models(objs, type = 'likelihood',
plot = TRUE,
xlab = 'Model',
ylab = 'Model Likelihood')
Comparing models
#> integer(0)
Thanks to my collaborators and the bioconductor team for reviewing the pacakge.
To learn more, check our github repo.