vignettes/workshop_packages.Rmd
workshop_packages.Rmd
This is a brief introduction to Bioconductor. We will cover the basics of the Bioconductor packages and classes that are relevant to the workshop.
Software packages provide implementations of algorithms (e.g., statistical analysis), access to resources (e.g., BioMart or NCBI), or visualizations (e.g., volcano plots, pathway plots).
Annotation packages are database-like packages that provide information linking identifiers (e.g., Entrez gene names or Affymetrix probe ids) to other information (e.g., chromosomal location, Gene Ontology category). Package authors are also encouraged to use AnnotationHub to store and access large raw data files and to convert them to standard R formats.
Experiment data packages provide data sets that are used, often by software packages, to illustrate particular analyses. These packages contain curated data from an experiment, teaching course, or publication and in most cases contain a single data set. Package authors are also encouraged to use ExperimentHub to store and access larger data files.
Summarized Experiment Object
# Schematic pattern only: SomeObject, data and metadata appear to be
# placeholders rather than real names -- the concrete examples follow below.
# constructors: a function call builds the object from its parts
obj <- SomeObject(data, metadata)
# accessors: a function call on the object retrieves one stored component
data(obj)
GenomicRanges
# load library
library(GenomicRanges)
# Two named, overlapping integer ranges (no chromosome information yet)
regions <- IRanges(
  start = c(1, 10),
  end = c(11, 20),
  names = paste0("region_", 1:2)
)
regions
#> IRanges object with 2 ranges and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> region_1 1 11 11
#> region_2 10 20 11
# Place both ranges on chromosome 1; the strand is left unspecified (`*`)
GRanges(
  seqnames = rep("chr1", 2),
  ranges = regions
)
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> region_1 chr1 1-11 *
#> region_2 chr1 10-20 *
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
# Same ranges on chromosome 1, this time both on the forward strand
gr <- GRanges(
  seqnames = rep("chr1", 2),
  ranges = regions,
  strand = rep("+", 2)
)
gr
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> region_1 chr1 1-11 +
#> region_2 chr1 10-20 +
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
# Attach a per-range metadata column (one value for each range)
mcols(gr)$region_name <- c("first", "second")
gr
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | region_name
#> <Rle> <IRanges> <Rle> | <character>
#> region_1 chr1 1-11 + | first
#> region_2 chr1 10-20 + | second
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
# accessors: each call below extracts one component of `gr`
# sequence (chromosome) name of each range, returned as a factor-Rle
seqnames(gr)
#> factor-Rle of length 2 with 1 run
#> Lengths: 2
#> Values : chr1
#> Levels(1): chr1
# start coordinate of each range
start(gr)
#> [1] 1 10
# end coordinate of each range
end(gr)
#> [1] 11 20
# strand of each range, returned as a factor-Rle with levels +, - and *
strand(gr)
#> factor-Rle of length 2 with 1 run
#> Lengths: 2
#> Values : +
#> Levels(3): + - *
# width of each range (both ranges span 11 positions here)
width(gr)
#> [1] 11 11
# metadata columns attached to the ranges, returned as a DataFrame
mcols(gr)
#> DataFrame with 2 rows and 1 column
#> region_name
#> <character>
#> region_1 first
#> region_2 second
SummarizedExperiment
# load library
library(SummarizedExperiment)
# make a data.frame with phenotype data: one row per sample, i.e. per column
# of the matrix below. Four distinct ids are generated explicitly; a length-2
# id vector would be silently recycled into duplicated sample ids.
pd <- data.frame(
  id = paste0("sample_", 1:4),
  group = rep(c("control", "treatment"), each = 2)
)
# make a 2 by 4 matrix with values (2 regions x 4 samples); rnorm() is not
# seeded here, so the values differ from run to run
mat <- matrix(rnorm(8), nrow = 2)
# build a SummarizedExperiment object; genomic ranges are supplied through
# `rowRanges` (`rowData` is meant for a DataFrame of feature annotations)
se <- SummarizedExperiment(
  assays = mat,
  rowRanges = gr,
  colData = pd
)
se
#> class: RangedSummarizedExperiment
#> dim: 2 4
#> metadata(0):
#> assays(1): ''
#> rownames(2): region_1 region_2
#> rowData names(1): region_name
#> colnames: NULL
#> colData names(2): id group
# accessors
# the matrix of values (rows = regions, columns = samples)
assay(se)
#> [,1] [,2] [,3] [,4]
#> region_1 0.4703326 0.03897944 -0.4338181 0.2364776
#> region_2 -0.1196687 -0.63810615 -0.0357938 -0.2908488
# per-row annotation as a DataFrame (the metadata columns of `gr`)
rowData(se)
#> DataFrame with 2 rows and 1 column
#> region_name
#> <character>
#> region_1 first
#> region_2 second
# per-row genomic coordinates as a GRanges
rowRanges(se)
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | region_name
#> <Rle> <IRanges> <Rle> | <character>
#> region_1 chr1 1-11 + | first
#> region_2 chr1 10-20 + | second
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
# per-sample (column) annotation as a DataFrame
colData(se)
#> DataFrame with 4 rows and 2 columns
#> id group
#> <character> <character>
#> 1 sample_1 control
#> 2 sample_2 control
#> 3 sample_3 treatment
#> 4 sample_4 treatment
rtracklayer
# load library
library(rtracklayer)
# export a GRanges object as a bed file; the file is written to a temporary
# path so that running the example leaves nothing in the working directory
bed_file <- tempfile(fileext = ".bed")
export.bed(gr, bed_file)
# import a bed file as GRanges (range names come back as the `name` column)
import.bed(bed_file)
#> GRanges object with 2 ranges and 2 metadata columns:
#> seqnames ranges strand | name score
#> <Rle> <IRanges> <Rle> | <character> <numeric>
#> [1] chr1 1-11 + | region_1 0
#> [2] chr1 10-20 + | region_2 0
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
org.Hs.eg.db
# load library
library(org.Hs.eg.db)
# bind the annotation object exported by the package to a shorter name;
# later calls refer to it as `org`
org <- org.Hs.eg.db
# print a summary of the database and the sources it was built from
org
#> OrgDb object:
#> | DBSCHEMAVERSION: 2.1
#> | Db type: OrgDb
#> | Supporting package: AnnotationDbi
#> | DBSCHEMA: HUMAN_DB
#> | ORGANISM: Homo sapiens
#> | SPECIES: Human
#> | EGSOURCEDATE: 2020-Sep23
#> | EGSOURCENAME: Entrez Gene
#> | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
#> | CENTRALID: EG
#> | TAXID: 9606
#> | GOSOURCENAME: Gene Ontology
#> | GOSOURCEURL: http://current.geneontology.org/ontology/go-basic.obo
#> | GOSOURCEDATE: 2020-09-10
#> | GOEGSOURCEDATE: 2020-Sep23
#> | GOEGSOURCENAME: Entrez Gene
#> | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
#> | KEGGSOURCENAME: KEGG GENOME
#> | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
#> | KEGGSOURCEDATE: 2011-Mar15
#> | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
#> | GPSOURCEURL:
#> | GPSOURCEDATE: 2020-Aug27
#> | ENSOURCEDATE: 2020-Aug18
#> | ENSOURCENAME: Ensembl
#> | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
#> | UPSOURCENAME: Uniprot
#> | UPSOURCEURL: http://www.UniProt.org/
#> | UPSOURCEDATE: Mon Oct 5 00:18:02 2020
# show names of columns: the kinds of identifiers/annotations stored in the
# organism database (e.g. 'SYMBOL' and 'ENTREZID', which are queried later)
columns(org)
#> [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
#> [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
#> [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
#> [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
#> [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" "UNIGENE"
#> [26] "UNIPROT"
TxDb.Hsapiens.UCSC.hg19.knownGene
# load library
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# bind the TxDb object exported by the package to a shorter name;
# later calls refer to it as `txdb`
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
# print a summary of the transcript database (source, genome, table sizes)
txdb
#> TxDb object:
#> # Db type: TxDb
#> # Supporting package: GenomicFeatures
#> # Data source: UCSC
#> # Genome: hg19
#> # Organism: Homo sapiens
#> # Taxonomy ID: 9606
#> # UCSC Table: knownGene
#> # Resource URL: http://genome.ucsc.edu/
#> # Type of Gene ID: Entrez Gene ID
#> # Full dataset: yes
#> # miRBase build ID: GRCh37
#> # transcript_nrow: 82960
#> # exon_nrow: 289969
#> # cds_nrow: 237533
#> # Db created by: GenomicFeatures package from Bioconductor
#> # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
#> # GenomicFeatures version at creation time: 1.21.30
#> # RSQLite version at creation time: 1.0.0
#> # DBSCHEMAVERSION: 1.1
# show names of columns: the CDS-, exon-, gene- and transcript-level fields
# stored in the transcript database
columns(txdb)
#> [1] "CDSCHROM" "CDSEND" "CDSID" "CDSNAME" "CDSSTART"
#> [6] "CDSSTRAND" "EXONCHROM" "EXONEND" "EXONID" "EXONNAME"
#> [11] "EXONRANK" "EXONSTART" "EXONSTRAND" "GENEID" "TXCHROM"
#> [16] "TXEND" "TXID" "TXNAME" "TXSTART" "TXSTRAND"
#> [21] "TXTYPE"
AnnotationDbi
# load library
library(AnnotationDbi)
# Look up the Entrez ids of two gene symbols in the organism package
gene_ids <- AnnotationDbi::select(
  org,
  keys = c("ATG5", "ATG7"),
  keytype = "SYMBOL",
  columns = "ENTREZID"
)
gene_ids
#> SYMBOL ENTREZID
#> 1 ATG5 9474
#> 2 ATG7 10533
# load library
library(GenomicFeatures)
# extract gene coordinates from the transcript database (TxDb) pkg,
# restricted to the Entrez ids stored in `gene_ids`
gene_coordinates <- genes(txdb,
filter = list(gene_id = gene_ids$ENTREZID))
gene_coordinates
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | gene_id
#> <Rle> <IRanges> <Rle> | <character>
#> 10533 chr3 11314010-11599139 + | 10533
#> 9474 chr6 106632352-106773695 - | 9474
#> -------
#> seqinfo: 93 sequences (1 circular) from hg19 genome
# Promoter windows: 3000 bp upstream of each gene's strand-aware start plus
# 200 bp downstream (the function's default, written out explicitly here)
gene_promoters <- promoters(
  gene_coordinates,
  upstream = 3000,
  downstream = 200
)
gene_promoters
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | gene_id
#> <Rle> <IRanges> <Rle> | <character>
#> 10533 chr3 11311010-11314209 + | 10533
#> 9474 chr6 106773496-106776695 - | 9474
#> -------
#> seqinfo: 93 sequences (1 circular) from hg19 genome