Load the libraries.
library(ape)
library(phangorn)
library(ggtree)
##
## Attaching package: 'ggtree'
##
## The following objects are masked from 'package:IRanges':
##
## collapse, expand
##
## The following object is masked from 'package:phangorn':
##
## getRoot
Load the FASTA data.
myseqs <- read.dna("H5N1.fas",format="fasta")
mytree <- nj(dist.dna(myseqs,model="TN93"))
Starting with the neighbour joining tree, we reconstruct a maximum likelihood tree, as we did before. Note that we get a warning about negative branch lengths in the NJ tree, which aren't allowed in the ML tree.
myseqs.phydat <- as.phyDat(myseqs)
myseqs.gtrig <- pml(mytree,myseqs.phydat,model="GTR+I+G",k=4)
## Warning in pml(mytree, myseqs.phydat, model = "GTR+I+G", k = 4): negative
## edges length changed to 0!
myseqs.gtrig <- optim.pml(myseqs.gtrig,optNni=TRUE,optBf=TRUE,optQ=TRUE,optInv=TRUE,optGamma=TRUE,optEdge=TRUE,optRate=FALSE)
## optimize edge weights: -6050.051 --> -6038.844
## optimize base frequencies: -6038.844 --> -5986.777
## optimize rate matrix: -5986.777 --> -5728.631
## optimize invariant sites: -5728.631 --> -5717.886
## optimize shape parameter: -5717.886 --> -5717.886
## optimize edge weights: -5717.886 --> -5717.684
## optimize topology: -5717.684 --> -5704.969
## optimize topology: -5704.969 --> -5704.969
## 4
## optimize base frequencies: -5704.969 --> -5704.725
## optimize rate matrix: -5704.725 --> -5704.684
## optimize invariant sites: -5704.684 --> -5704.657
## optimize shape parameter: -5704.657 --> -5704.657
## optimize edge weights: -5704.657 --> -5704.657
## optimize topology: -5704.657 --> -5704.657
## 0
## optimize base frequencies: -5704.657 --> -5704.655
## optimize rate matrix: -5704.655 --> -5704.655
## optimize invariant sites: -5704.655 --> -5704.655
## optimize shape parameter: -5704.655 --> -5704.655
## optimize edge weights: -5704.655 --> -5704.655
## optimize base frequencies: -5704.655 --> -5704.655
## optimize rate matrix: -5704.655 --> -5704.655
## optimize invariant sites: -5704.655 --> -5704.655
## optimize shape parameter: -5704.655 --> -5704.654
## optimize edge weights: -5704.654 --> -5704.654
## optimize base frequencies: -5704.654 --> -5704.654
## optimize rate matrix: -5704.654 --> -5704.654
## optimize invariant sites: -5704.654 --> -5704.654
## optimize shape parameter: -5704.654 --> -5704.654
## optimize edge weights: -5704.654 --> -5704.654
## optimize base frequencies: -5704.654 --> -5704.654
## optimize rate matrix: -5704.654 --> -5704.654
## optimize invariant sites: -5704.654 --> -5704.654
## optimize shape parameter: -5704.654 --> -5704.653
## optimize edge weights: -5704.653 --> -5704.653
## optimize base frequencies: -5704.653 --> -5704.653
## optimize rate matrix: -5704.653 --> -5704.653
## optimize invariant sites: -5704.653 --> -5704.653
## optimize shape parameter: -5704.653 --> -5704.653
## optimize edge weights: -5704.653 --> -5704.653
## optimize base frequencies: -5704.653 --> -5704.653
## optimize rate matrix: -5704.653 --> -5704.653
## optimize invariant sites: -5704.653 --> -5704.653
## optimize shape parameter: -5704.653 --> -5704.653
## optimize edge weights: -5704.653 --> -5704.653
## optimize base frequencies: -5704.653 --> -5704.653
## optimize rate matrix: -5704.653 --> -5704.653
## optimize invariant sites: -5704.653 --> -5704.653
## optimize shape parameter: -5704.653 --> -5704.652
## optimize edge weights: -5704.652 --> -5704.652
## optimize base frequencies: -5704.652 --> -5704.652
## optimize rate matrix: -5704.652 --> -5704.652
## optimize invariant sites: -5704.652 --> -5704.652
## optimize shape parameter: -5704.652 --> -5704.652
## optimize edge weights: -5704.652 --> -5704.652
myseqs.mltree <- myseqs.gtrig$tree
We need to root the tree in order to do ancestral reconstruction. We could use rtt or lsd, but in principle, we could use any method we discussed before. We scan the names of the tip labels, to get the tip dates and location.
info <- scan(what=list(character(),character(),character(),character(),integer()),sep="_",quote="\"",text=paste(myseqs.mltree$tip.label,collapse="\n"),quiet=TRUE)
tipnames <- myseqs.mltree$tip.label
tipdates <- as.double(info[[5]])
tipdates
## [1] 2005 2005 2002 2001 2005 2005 2004 2004 2003 2001 2000 1996 2005 2005
## [15] 2004 2004 2004 2004 2004 2004 2004 2004 2004 2001 2001 2004 2003 2002
## [29] 1998 1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 2005 2005 2005
## [43] 2005
We can now root the tree and turn branch lengths into time using LSD.
write.tree(myseqs.mltree,"H5N1.tre")
write.table(rbind(c(length(tipnames),""),cbind(tipnames,tipdates)),"H5N1.td",col.names=FALSE,row.names=FALSE,quote=FALSE)
lsd.cmd <- sprintf("lsd -i %s -d %s -c -n 1 -r -b %s -s %s -v","H5N1.tre","H5N1.td",paste(10),seq.len)
lsd.cmd
## [1] "lsd -i H5N1.tre -d H5N1.td -c -n 1 -r -b 10 -s 6987 -v"
lsd <- system(lsd.cmd,intern=TRUE)
procresult <- function(fn){
result <- readLines(fn)
line <- result[grep("Tree 1 rate ",result)]
line.split <- strsplit(line, " ")[[1]]
list(rate=as.double(line.split[4]),tmrca=as.double(line.split[6]))
}
procresult("H5N1_result.txt")
## $rate
## [1] 0.003233
##
## $tmrca
## [1] 1994.113
lsd.tree <- read.tree("H5N1_result_newick_date.txt")
Now we can extract the location, and reconstruct the changes in state. Here is another R trick to parse sequence headers.
info <- scan(what=list(character(),character(),character(),character(),integer()),sep="_",quote="\"",text=paste(lsd.tree$tip.label,collapse="\n"),quiet=TRUE)
info
## [[1]]
## [1] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [18] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [35] "A" "A" "A" "A" "A" "A" "A" "A" "A"
##
## [[2]]
## [1] "chicken" "Goose" "bird" "bird" "bird"
## [6] "bird" "bird" "chicken" "bird" "bird"
## [11] "bird" "duck" "goose" "goose" "goose"
## [16] "duck" "duck" "chicken" "swine" "chicken"
## [21] "duck" "goose" "goose" "duck" "chicken"
## [26] "duck" "Chicken" "greyheron" "duck" "duck"
## [31] "duck" "duck" "chicken" "chicken" "bird"
## [36] "duck" "chicken" "duck" "duck" "duck"
## [41] "duck" "duck" "Goose"
##
## [[3]]
## [1] "HongKong" "HongKong" "HongKong" "HongKong" "HongKong"
## [6] "HongKong" "HongKong" "HongKong" "HongKong" "HongKong"
## [11] "HongKong" "Guangxi" "Guangxi" "Guangxi" "Guangxi"
## [16] "Guangxi" "Guangxi" "Guangxi" "Fujian" "Fujian"
## [21] "Fujian" "Guangdong" "Guangxi" "Hunan" "Guangxi"
## [26] "Guangxi" "Guangdong" "HongKong" "Guangxi" "Hunan"
## [31] "Hunan" "Hunan" "Guangdong" "Guangdong" "HongKong"
## [36] "Guangxi" "HongKong" "Guangdong" "Guangdong" "Guangxi"
## [41] "Fujian" "Guangdong" "Guangdong"
##
## [[4]]
## [1] "915" "w355" "488" "97" "481" "485" "514" "258" "503" "542"
## [11] "532" "1378" "914" "1097" "2112" "2291" "1681" "2439" "F1" "1042"
## [21] "897" "2216" "345" "1265" "2461" "793" "810" "837" "351" "139"
## [31] "182" "157" "191" "174" "213" "50" "863" "4610" "01" "22"
## [41] "13" "12" "1"
##
## [[5]]
## [1] 1997 1997 1997 1998 1997 1997 1997 1997 1997 1997 1997 2004 2004 2004
## [15] 2004 2004 2004 2004 2001 2005 2005 2005 2005 2005 2004 2005 2005 2004
## [29] 2004 2005 2005 2005 2004 2004 2003 2001 2002 2003 2001 2001 2002 2000
## [43] 1996
The location is the third entry in the list.
mylocation <- as.factor(info[[3]])
mylocation
## [1] HongKong HongKong HongKong HongKong HongKong HongKong HongKong
## [8] HongKong HongKong HongKong HongKong Guangxi Guangxi Guangxi
## [15] Guangxi Guangxi Guangxi Guangxi Fujian Fujian Fujian
## [22] Guangdong Guangxi Hunan Guangxi Guangxi Guangdong HongKong
## [29] Guangxi Hunan Hunan Hunan Guangdong Guangdong HongKong
## [36] Guangxi HongKong Guangdong Guangdong Guangxi Fujian Guangdong
## [43] Guangdong
## Levels: Fujian Guangdong Guangxi HongKong Hunan
We fix any small branch lengths.
lsd.tree$edge.length[lsd.tree$edge.length<0.00000001] <- 0.00000001
Now perform ancestral reconstruction of the location; I use a simple equal rates model (model="ER"), although in principle I could use more complex ones.
lsd.tree.ace <- ace(mylocation,lsd.tree,type="discrete",method="ML",model="ER")
lsd.tree.ace
##
## Ancestral Character Estimation
##
## Call: ace(x = mylocation, phy = lsd.tree, type = "discrete", method = "ML",
## model = "ER")
##
## Log-likelihood: -45.57865
##
## Rate index matrix:
## Fujian Guangdong Guangxi HongKong Hunan
## Fujian . 1 1 1 1
## Guangdong 1 . 1 1 1
## Guangxi 1 1 . 1 1
## HongKong 1 1 1 . 1
## Hunan 1 1 1 1 .
##
## Parameter estimates:
## rate index estimate std-err
## 1 0.0701 0.0151
##
## Scaled likelihoods at the root (type '...$lik.anc' to get them for all nodes):
## Fujian Guangdong Guangxi HongKong Hunan
## 0.02836587 0.20304071 0.02992778 0.71128212 0.02738351
plot(lsd.tree, type="p",label.offset=0.0025,cex=0.75)
co <- c("blue", "yellow","red","green","orange")
tiplabels(pch = 22, bg = co[as.numeric(mylocation)], cex = 1.0)
nodelabels(thermo = lsd.tree.ace$lik.anc, piecol = co, cex = 0.25)

The next steps in this analysis could be:
rcolgem