Overview

This is a brief introduction to Bioconductor. We will cover the basics of the Bioconductor packages and classes that are relevant to the workshop.

Types of Packages

  • Software
  • Annotation
  • Experiment data

Software Packages

Provide implementations of algorithms (e.g. statistical analysis), access to resources (e.g. BioMart or NCBI) or visualizations (e.g. volcano plots, pathway plots).

Annotation Packages

Database-like packages that provide information linking identifiers (e.g., Entrez gene names or Affymetrix probe ids) to other information (e.g., chromosomal location, Gene Ontology category). It is also encouraged to utilize AnnotationHub for storage and access to large raw data files and their conversion to standard R formats.

Experiment Data Packages

Provide data sets that are used, often by software packages, to illustrate particular analyses. These packages contain curated data from an experiment, teaching course or publication and in most cases contain a single data set. It is also encouraged to utilize ExperimentHub for storage and access to larger data files.

Bioconductor Classes

  • Three-table structures

Summarized Experiment Object

  • Accessors and constructors
# constructors
# NOTE(review): SomeObject() looks like a placeholder for a class
# constructor rather than a real function -- confirm
obj <- SomeObject(data, metadata)

# accessors
# NOTE(review): data() presumably stands in for a class-specific accessor
# here; utils::data() in base R loads datasets instead -- confirm
data(obj)

Packages relevant to the workshop

GenomicRanges

# load library
library(GenomicRanges)

# create two named integer intervals; the names show up as row labels
regions <- IRanges(
  start = c(1, 10),
  end = c(11, 20),
  names = paste0("region_", 1:2)
)

# print the object
regions
#> IRanges object with 2 ranges and 0 metadata columns:
#>                start       end     width
#>            <integer> <integer> <integer>
#>   region_1         1        11        11
#>   region_2        10        20        11

# wrap the ranges in a GRanges object, placing both on chr1
# (no strand is supplied here)
GRanges(
  seqnames = rep("chr1", 2),
  ranges = regions
)
#> GRanges object with 2 ranges and 0 metadata columns:
#>            seqnames    ranges strand
#>               <Rle> <IRanges>  <Rle>
#>   region_1     chr1      1-11      *
#>   region_2     chr1     10-20      *
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths

# same construction, this time putting both ranges on the '+' strand
gr <- GRanges(
  seqnames = rep("chr1", 2),
  ranges = regions,
  strand = rep("+", 2)
)

# print the object
gr
#> GRanges object with 2 ranges and 0 metadata columns:
#>            seqnames    ranges strand
#>               <Rle> <IRanges>  <Rle>
#>   region_1     chr1      1-11      +
#>   region_2     chr1     10-20      +
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths

# attach a per-range metadata column through the mcols() accessor
mcols(gr)$region_name <- c("first", "second")

# print the object
gr
#> GRanges object with 2 ranges and 1 metadata column:
#>            seqnames    ranges strand | region_name
#>               <Rle> <IRanges>  <Rle> | <character>
#>   region_1     chr1      1-11      + |       first
#>   region_2     chr1     10-20      + |      second
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths

# accessors
# sequence (chromosome) name of each range, returned as a factor-Rle
seqnames(gr)
#> factor-Rle of length 2 with 1 run
#>   Lengths:    2
#>   Values : chr1
#> Levels(1): chr1
# start coordinate of each range
start(gr)
#> [1]  1 10
# end coordinate of each range
end(gr)
#> [1] 11 20
# strand of each range, returned as a factor-Rle
strand(gr)
#> factor-Rle of length 2 with 1 run
#>   Lengths: 2
#>   Values : +
#> Levels(3): + - *
# number of positions covered by each range (here 11 for both)
width(gr)
#> [1] 11 11
# metadata columns attached to the ranges
mcols(gr)
#> DataFrame with 2 rows and 1 column
#>          region_name
#>          <character>
#> region_1       first
#> region_2      second

SummarizedExperiment

# load library
library(SummarizedExperiment)

# make a data.frame with phenotype data: one row per sample (assay column)
# ids are built from 1:4 so every sample gets a unique id; with 1:2 the two
# ids were silently recycled to fill the four rows
pd <- data.frame(id = paste('sample', 1:4, sep = '_'),
                 group = rep(c('control', 'treatment'), each = 2))

# make a 2 by 4 matrix with values (rows = regions, columns = samples)
# rnorm() is not seeded, so the values printed below change between runs
mat <- matrix(rnorm(8), nrow = 2)

# build a SummarizedExperiment object
# genomic ranges are passed through 'rowRanges'; 'rowData' is meant for a
# plain table of row annotations
se <- SummarizedExperiment(assays = mat,
                           rowRanges = gr,
                           colData = pd)
se
#> class: RangedSummarizedExperiment 
#> dim: 2 4 
#> metadata(0):
#> assays(1): ''
#> rownames(2): region_1 region_2
#> rowData names(1): region_name
#> colnames: NULL
#> colData names(2): id group

# accessors
# the matrix of values (regions in rows, samples in columns)
assay(se)
#>                [,1]        [,2]       [,3]       [,4]
#> region_1  0.4703326  0.03897944 -0.4338181  0.2364776
#> region_2 -0.1196687 -0.63810615 -0.0357938 -0.2908488
# row annotations only (the metadata columns of the ranges)
rowData(se)
#> DataFrame with 2 rows and 1 column
#>          region_name
#>          <character>
#> region_1       first
#> region_2      second
# the full GRanges attached to the rows
rowRanges(se)
#> GRanges object with 2 ranges and 1 metadata column:
#>            seqnames    ranges strand | region_name
#>               <Rle> <IRanges>  <Rle> | <character>
#>   region_1     chr1      1-11      + |       first
#>   region_2     chr1     10-20      + |      second
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
# sample (column) annotations
colData(se)
#> DataFrame with 4 rows and 2 columns
#>            id       group
#>   <character> <character>
#> 1    sample_1     control
#> 2    sample_2     control
#> 3    sample_3   treatment
#> 4    sample_4   treatment

rtracklayer

# load library
library(rtracklayer)

# export a GRanges object as a bed file
# ('regions.bed' is a relative path, so it is resolved against the current
# working directory)
export.bed(gr, 'regions.bed')

# import a bed file as GRanges
# in the printed result the range names come back as a 'name' metadata
# column next to a 'score' column; 'region_name' is not among the columns
import.bed('regions.bed')
#> GRanges object with 2 ranges and 2 metadata columns:
#>       seqnames    ranges strand |        name     score
#>          <Rle> <IRanges>  <Rle> | <character> <numeric>
#>   [1]     chr1      1-11      + |    region_1         0
#>   [2]     chr1     10-20      + |    region_2         0
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths

org.Hs.eg.db

# load library
library(org.Hs.eg.db)

# shorter name
# bind the OrgDb object exported by the package to 'org', then print its
# metadata (schema, organism, source dates and URLs)
org <- org.Hs.eg.db
org
#> OrgDb object:
#> | DBSCHEMAVERSION: 2.1
#> | Db type: OrgDb
#> | Supporting package: AnnotationDbi
#> | DBSCHEMA: HUMAN_DB
#> | ORGANISM: Homo sapiens
#> | SPECIES: Human
#> | EGSOURCEDATE: 2020-Sep23
#> | EGSOURCENAME: Entrez Gene
#> | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
#> | CENTRALID: EG
#> | TAXID: 9606
#> | GOSOURCENAME: Gene Ontology
#> | GOSOURCEURL: http://current.geneontology.org/ontology/go-basic.obo
#> | GOSOURCEDATE: 2020-09-10
#> | GOEGSOURCEDATE: 2020-Sep23
#> | GOEGSOURCENAME: Entrez Gene
#> | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
#> | KEGGSOURCENAME: KEGG GENOME
#> | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
#> | KEGGSOURCEDATE: 2011-Mar15
#> | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
#> | GPSOURCEURL: 
#> | GPSOURCEDATE: 2020-Aug27
#> | ENSOURCEDATE: 2020-Aug18
#> | ENSOURCENAME: Ensembl
#> | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
#> | UPSOURCENAME: Uniprot
#> | UPSOURCEURL: http://www.UniProt.org/
#> | UPSOURCEDATE: Mon Oct  5 00:18:02 2020

# show names of columns
# (the kinds of data that can be retrieved from the OrgDb object)
columns(org)
#>  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
#>  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
#> [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
#> [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
#> [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
#> [26] "UNIPROT"

TxDb.Hsapiens.UCSC.hg19.knownGene

# load library
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

# shorter name
# bind the TxDb object exported by the package to 'txdb', then print its
# metadata (genome build, data source, table sizes)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
#> TxDb object:
#> # Db type: TxDb
#> # Supporting package: GenomicFeatures
#> # Data source: UCSC
#> # Genome: hg19
#> # Organism: Homo sapiens
#> # Taxonomy ID: 9606
#> # UCSC Table: knownGene
#> # Resource URL: http://genome.ucsc.edu/
#> # Type of Gene ID: Entrez Gene ID
#> # Full dataset: yes
#> # miRBase build ID: GRCh37
#> # transcript_nrow: 82960
#> # exon_nrow: 289969
#> # cds_nrow: 237533
#> # Db created by: GenomicFeatures package from Bioconductor
#> # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
#> # GenomicFeatures version at creation time: 1.21.30
#> # RSQLite version at creation time: 1.0.0
#> # DBSCHEMAVERSION: 1.1

# show names of columns
# (the kinds of data that can be retrieved from the TxDb object)
columns(txdb)
#>  [1] "CDSCHROM"   "CDSEND"     "CDSID"      "CDSNAME"    "CDSSTART"  
#>  [6] "CDSSTRAND"  "EXONCHROM"  "EXONEND"    "EXONID"     "EXONNAME"  
#> [11] "EXONRANK"   "EXONSTART"  "EXONSTRAND" "GENEID"     "TXCHROM"   
#> [16] "TXEND"      "TXID"       "TXNAME"     "TXSTART"    "TXSTRAND"  
#> [21] "TXTYPE"

AnnotationDbi

# load library
library(AnnotationDbi)

# look up the Entrez ids of two gene symbols in the organism pkg
gene_ids <- AnnotationDbi::select(
  org,
  keys = c("ATG5", "ATG7"),
  columns = "ENTREZID",
  keytype = "SYMBOL"
)

# print the lookup table
gene_ids
#>   SYMBOL ENTREZID
#> 1   ATG5     9474
#> 2   ATG7    10533
# load library
library(GenomicFeatures)

# extract gene coordinates from the TxDb (transcript annotation) pkg,
# restricted to the Entrez ids stored in gene_ids
gene_coordinates <- genes(txdb,
                          filter = list(gene_id = gene_ids$ENTREZID))

# print the result (note: the rows are not in the order of gene_ids)
gene_coordinates
#> GRanges object with 2 ranges and 1 metadata column:
#>         seqnames              ranges strand |     gene_id
#>            <Rle>           <IRanges>  <Rle> | <character>
#>   10533     chr3   11314010-11599139      + |       10533
#>    9474     chr6 106632352-106773695      - |        9474
#>   -------
#>   seqinfo: 93 sequences (1 circular) from hg19 genome

# promoter windows: reach 3000 bp upstream of each gene's strand-aware
# start; 'downstream' is left at its default
gene_promoters <- promoters(gene_coordinates, upstream = 3000)

# print the result
gene_promoters
#> GRanges object with 2 ranges and 1 metadata column:
#>         seqnames              ranges strand |     gene_id
#>            <Rle>           <IRanges>  <Rle> | <character>
#>   10533     chr3   11311010-11314209      + |       10533
#>    9474     chr6 106773496-106776695      - |        9474
#>   -------
#>   seqinfo: 93 sequences (1 circular) from hg19 genome

target

# load library
library(target)

# open the package-level and the function-level help pages
help("target")
help("associated_peaks", package = "target")

# list the vignettes shipped with the package, then open two of them
vignette(package = "target")
vignette("target")
vignette("extend-target")