Learning Bioinformatics: November 2013

http://www.fgcz.ch/education/StatMethodsExpression/05_DNAme_R_exercises.txt

# make sure you have these packages installed
# -------------------------------------------
source("http://bioconductor.org/biocLite.R")
biocLite(c("minfi","IlluminaHumanMethylation450kmanifest",
           "charm","GenomicRanges"))

library(minfi)
library(charm)
library(GenomicRanges)


# preprocess 450k array data
# --------------------------

f <- read.450k.sheet(".", "450k_SampleSheet")
raw <- read.450k.exp(targets = f, verbose=TRUE)

ppi <- preprocessIllumina(raw, bg.correct = TRUE, normalize = "controls")

b <- getBeta(ppi)
colnames(b) <- f$Sample_Name  # give descriptive column names


# read probe annotation (for chr22 probes)

anno <- read.csv("450_probes_subset.csv", stringsAsFactors=FALSE, header=TRUE)
probeids <- intersect(rownames(b), anno$Name)

m <- match(probeids, anno$Name)
anno <- anno[m, ]

m <- match(probeids, rownames(b))
b <- b[m, ]

k <- !is.na(anno$MAPINFO) & rowSums( is.na(b) )==0
anno <- anno[k,]
b <- b[k,]

gr450k <- GRanges(seqnames=paste("chr",anno$CHR,sep=""), 
                  ranges=IRanges(start=anno$MAPINFO,width=1))


# load RRBS data
# --------------
rf <- dir(,".bedRrbs")
names(rf) <- gsub("_chr22.bedRrbs","",rf)

rd <- lapply(rf, function(u) {
  d <- read.table(u, header = FALSE, stringsAsFactors = FALSE)
  d[,c(1:3,5:6,11)]
})

# have a look at how RRBS data comes
lapply(rd,head)


# function to convert to methylation levels

mapNeg2PosRRBS <- function(p) {
  xs <- split(p, p$V6)
  pos <- xs[["+"]]
  neg <- xs[["-"]]

  key <- paste(pos$V1, pos$V3-1, sep = ".")
  keyn <- paste(neg$V1, neg$V2, sep = ".")

  # finding the pairs
  m.pos <- match(key, keyn)
  n <- is.na(m.pos)
 
  # positive strand
  x <- data.frame(chr=pos$V1, position=pos$V2+1, n=pos$V5, 
                  nC=round( pos$V5*pos$V11/100 ))

  # negative strand that match with positive strand
  mn <- m.pos[!n]
  x$n[!n] <- x$n[!n] + neg$V5[mn]
  x$nC[!n] <- x$nC[!n] + round( neg$V5[mn]*neg$V11[mn]/100 )
 
  # negative strand only
  m.neg <- match(keyn, key)
  nas.neg <- is.na(m.neg)
  y <- data.frame(chr=neg$V1, position=neg$V2, n=neg$V5,
                  nC=round( neg$V5*neg$V11/100 ))[nas.neg, ]

  res <- rbind(x,y)
  GRanges(seqnames=res$chr, IRanges(start=res$position, width=1),
          n=res$n, nC=res$nC)
}


rdc <- lapply(rd, mapNeg2PosRRBS)

# look at converted data
rdc
names(rdc)
colnames(b)

# do spot checks: match up RRBS and 450k readout for same CpG sites 

v <- values(rdc[[1]])
meth <- v$nC / v$n

fo <- findOverlaps(gr450k, rdc[[1]])

plot(b[fo@queryHits,1],meth[fo@subjectHits],xlab="450k", ylab="RRBS")


# make design matrices for DMR finding
grp <- gsub("_[1-3]","",colnames(b))
d1 <- model.matrix(~grp)
d0 <- d1[,1,drop=FALSE]

b1 <- b*.98+.01
range(b)
range(b1)

o <- order(anno$CHR, anno$MAPINFO)

# find differentially methylated regions
pns <- clusterMaker(anno$CHR, anno$MAPINFO, maxGap=500)
dmrs <- dmrFind(p=b1[o,], mod=d1, mod0=d0, coef=2, pns=pns[o],
                chr=anno$CHR[o], pos=anno$MAPINFO[o], svs=0, 
                use.limma=TRUE, use="swald", Q=0.97)


# Exercise 1. Make histograms of "beta" values (3 LNCaP replicates,
# 3 PrEC replicates).  Are there (global) differences between cancer and 
# normal?  What if you restrict to probes only in CpG islands? 
# (You can get this information for chr22 in the annotation file).

# Exercise 2. Does the concordance between 450k and RRBS improve 
# when you require a higher depth in the RRBS data?

# Exercise 3. For the DMRs found using 450k array data, make a plot
# of the RRBS data in the same regions to verify that they are 
# calling the same diff. methylated regions.

mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg19 -N \

 -e 'select concat("curl http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=",chrom,":",txStart-35,",",txStart-1) from knownGene where strand="+"'  |\

 sh > result.concatenated.xml

#Install the package if you've never done so

install.packages("RMySQL")

#Load the package

library(RMySQL)

# Set up a connection to your database management system.

# I'm using the public MySQL server for the UCSC genome browser (no password)

mychannel <- dbConnect(MySQL(), user="genome", host="genome-mysql.cse.ucsc.edu")

# Function to make it easier to query 

query <- function(...) dbGetQuery(mychannel, ...)

# Get the UCSC gene name, start and end sites for the first 10 genes on Chromosome 12

query("SELECT name, chrom, txStart, txEnd FROM mm9.knownGene WHERE chrom='chr12' LIMIT 10;")

# Results are returned as a data.frame:

#          name chrom txStart   txEnd

# 1  uc007mwj.2 chr12 3235525 3250374

# 2  uc007mwg.2 chr12 3235790 3239112

# 3  uc007mwh.2 chr12 3235790 3239288

# 4  uc007mwi.2 chr12 3235790 3250374

# 5  uc007mwk.1 chr12 3236610 3249997

# 6  uc011yjq.1 chr12 3237284 3241410

# 7  uc007mwl.2 chr12 3247427 3309969

# 8  uc007mwm.1 chr12 3365131 3406494

# 9  uc007mwn.1 chr12 3365131 3406494

# 10 uc007mwp.2 chr12 3403882 3426747

Learning Bioinformatics

Monday, November 18, 2013

DNA methylation R exercises - FGCZ

Tuesday, November 12, 2013

Query a MySQL Database from R using RMySQL