So for this project, I needed a function that could take a random tree from R and produce and ms string. For my purposes, I want the ability to have ms give gene trees or segregating sites. Here is the code I came up with. It is also available from GitHub version of my R package evobiR:
getMS <- function(tree, samps, report, n.site=NULL){
# add a demes tracking column to the edge matrix
tree$edge <- cbind(tree$edge, rep(NA, nrow(tree$edge)))
output <- vector()
used.demes <- 0
check <- T
while(check == T){
# find tips
tips <- tree$edge[!tree$edge[, 2] %in% tree$edge[, 1], 2]
# determine which rows of #edge and elements of #edge.length these are
branches <- which(tree$edge[,2] %in% tips)
# find the shortest of the branches
shortb <- branches[which.min(tree$edge.length[branches])]
# find the parent node
par.node <- tree$edge[shortb, 1]
# find branches with this parent
hits <- tree$edge[, 1] == par.node
# remove one already stored
hits[shortb] <- F
# store the branch we will join
shortb[2] <- which(hits)
# determine if demes are named branch 1
if(is.na(tree$edge[shortb[1], 3])){
demes1 <- max(used.demes) + 1
used.demes <- c(used.demes, (max(used.demes) + 1))
}else{
demes1 <- tree$edge[shortb[1], 3]
}
# determine if demes are named branch 1
if(is.na(tree$edge[shortb[2], 3])){
demes2 <- max(used.demes) + 1
used.demes <- c(used.demes, (max(used.demes) + 1))
}else{
demes2 <- tree$edge[shortb[2], 3]
}
# store deme name in tree$edge
tree$edge[tree$edge[, 2] == par.node, 3] <- demes2
# record ej statement
output <- paste(output, "-ej", tree$edge.length[shortb[1]], demes1, demes2)
# check and see if we still have more work to do
if(length(tree$edge.length)>2){
#add edge length that will be lost when we remove branches
tree$edge.length[tree$edge[, 2] == par.node] <-
tree$edge.length[tree$edge[, 2] == par.node] +
tree$edge.length[shortb[1]]
#update tree$edge.length
tree$edge.length <- tree$edge.length[-shortb]
#update tree$edge
tree$edge <- tree$edge[-shortb, ]
}else{
# when we are down to 2 edges we are done
check <- F
}
}
z <- paste(rep("1", length(tree$tip)), collapse=" ")
outz <- paste("-", report, sep="")
output <- paste(length(tree$tip),
samps, "-I",
length(tree$tip),
z,
output,
outz,
n.site)
return(output)
}
And here is an example of it running:
> library(ape)
> tree <- rcoal(4)
> plot(tree)
> getMS(tree, samps=2, report="T")
[1] "4 2 -I 4 1 1 1 1 -ej 0.0178240316454321 1 2 -ej 0.65862003858505 2 3 -ej 0.922894212466909 4 3 -T "
> getMS(tree, samps=20, report="s", n.site=50)
[1] "4 20 -I 4 1 1 1 1 -ej 0.0178240316454321 1 2 -ej 0.65862003858505 2 3 -ej 0.922894212466909 4 3 -s 50"
And now we can run these strings through ms:
$ ./ms 4 2 -I 4 1 1 1 1 -ej 0.0178240316454321 1 2 -ej 0.65862003858505 2 3 -ej 0.922894212466909 4 3 -T
./ms 4 2 -I 4 1 1 1 1 -ej 0.0178240316454321 1 2 -ej 0.65862003858505 2 3 -ej 0.922894212466909 4 3 -T
32028 54298 17741
//
(3:1.096,(4:0.950,(1:0.502,2:0.502):0.448):0.146);
//
(3:1.340,(4:1.221,(1:0.314,2:0.314):0.907):0.119);
Add a comment