{- | Module : ELynx.Import.Tree.Newick Description : Import Newick trees Copyright : (c) Dominik Schrempf 2019 License : GPL-3 Maintainer : dominik.schrempf@gmail.com Stability : unstable Portability : portable Creation date: Thu Jan 17 14:56:27 2019. Code partly taken from Biobase.Newick.Import. [Specifications](http://evolution.genetics.washington.edu/phylip/newicktree.html) - In particular, no conversion from _ to (space) is done right now. -} module ELynx.Import.Tree.Newick ( Parser , newick , manyNewick , forest , leaf , node , name , branchLength ) where import qualified Data.ByteString.Lazy as L import Data.Tree import Data.Void import Data.Word import Text.Megaparsec import Text.Megaparsec.Byte import Text.Megaparsec.Byte.Lexer (decimal, float) import ELynx.Data.Tree.PhyloTree import ELynx.Tools.ByteString (c2w) -- | Shortcut. type Parser = Parsec Void L.ByteString -- | Parse many Newick trees. manyNewick :: Parser [Tree PhyloByteStringLabel] manyNewick = some (newick <* space) <* eof "manyNewick" -- | Parse a Newick tree. newick :: Parser (Tree PhyloByteStringLabel) newick = tree <* char (c2w ';') "newick" tree :: Parser (Tree PhyloByteStringLabel) tree = space *> (branched <|> leaf) "tree" branched :: Parser (Tree PhyloByteStringLabel) branched = do f <- forest s <- branchSupport n <- node "branched" let n' = n {pBrSup = s} return $ Node n' f -- | A 'forest' is a set of trees separated by @,@ and enclosed by parentheses. forest :: Parser [Tree PhyloByteStringLabel] forest = do _ <- char (c2w '(') f <- tree `sepBy1` char (c2w ',') _ <- char (c2w ')') "forest" return f branchSupport :: Parser (Maybe Double) branchSupport = optional $ try float <|> try decimalAsDouble -- | A 'leaf' is a 'node' without children. leaf :: Parser (Tree PhyloByteStringLabel) leaf = do n <- node "leaf" return $ Node n [] -- | A 'node' has a name and a 'branchLength'. node :: Parser PhyloByteStringLabel node = do n <- name b <- branchLength "node" return $ PhyloLabel n Nothing b checkNameCharacter :: Word8 -> Bool checkNameCharacter c = c `notElem` map c2w " :;()[]," -- | A name can be any string of printable characters except blanks, colons, -- semicolons, parentheses, and square brackets (and commas). name :: Parser L.ByteString name = L.pack <$> many (satisfy checkNameCharacter) "name" -- | Branch lengths default to 0. branchLength :: Parser Double branchLength = char (c2w ':') *> branchLengthGiven <|> pure 0 "branchLength" branchLengthGiven :: Parser Double branchLengthGiven = try float <|> decimalAsDouble decimalAsDouble :: Parser Double decimalAsDouble = fromIntegral <$> (decimal :: Parser Int)