{- |
Module Radium.Formats.Smiles
Copyright : Copyright (C) 2014 Krzysztof Langner
License : BSD3

Maintainer : Krzysztof Langner <klangner@gmail.com>
Stability : alpha
Portability : portable

SMILES is popular format for describing the structure of chemical molecules.
http://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system

-}

module Radium.Formats.Smiles ( Smiles(..)
                             , readSmiles
                             , writeSmiles ) where
                             

import Text.ParserCombinators.Parsec
import qualified Data.Set as Set
import Data.Maybe


-- | This model describes molecules with valence bounds
data Smiles = SmilesRing Atom       -- Atom symbol
                         Int        -- Number of bounds
                         Smiles     -- Rest of smiles
            | Smiles Atom
            deriving( Eq, Show )

data Atom = Atom  String      -- Symbol
                  Int         -- Isotopes count
                  Int         -- Hydrogen count
                  Int         -- Ion number (charge)
                  Int         -- Atom class
            | Aliphatic String
            | Aromatic String
            | Unknown           -- '*'
              deriving( Eq, Show )


-- | Set of all aliphatic symbols
aliphatics :: Set.Set String
aliphatics = Set.fromList ["B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I" ]

-- | Set of all aromatic symbols
aromatics :: Set.Set Char
aromatics = Set.fromList "bcnosp"

-- | Parses textual representation
readSmiles :: String -> Either String Smiles
readSmiles xs = case parse smiles "" xs of
    Left err -> Left (show err)
    Right val -> Right val

-- Parse SMILES
smiles :: Parser Smiles
smiles = do
    a <- atom
    n <- optionMaybe bound
    xs <- optionMaybe smiles
    return $ if isJust xs then SmilesRing a (fromMaybe 1 n) (fromJust xs) else Smiles a

-- Parse atom bound
bound :: Parser Int
bound = do
    s <- char '-' <|> char '=' <|> char '#' <|> char '$'
    return $ case s of
        '=' -> 2
        '#' -> 3
        '$' -> 4
        _   -> 1

-- Parse atom
atom :: Parser Atom
atom = bracketAtom <|> aliphaticOrganic <|> aromaticOrganic <|> unknown

-- Parse atom
bracketAtom :: Parser Atom
bracketAtom = do
    _ <- char '['
    i <- optionMaybe number
    s <- symbolOrUnknown
    hc <- optionMaybe hcount
    n <- optionMaybe charge
    ac <- optionMaybe atomClass
    _ <- char ']'
    return $ Atom s (fromMaybe 0 i) (fromMaybe 0 hc) (fromMaybe 0 n) (fromMaybe 0 ac)
    
-- Accept symbol or unknown '*' character    
symbolOrUnknown :: Parser String
symbolOrUnknown = symbol <|> string "*"    

-- Parse hydrogen
hcount :: Parser Int
hcount =  do
    _ <- char 'H'
    hc <- optionMaybe number
    let n = fromMaybe 1 hc
    return $ if n == 0 then 1 else n

-- Parse ion number (charge). Ion number starts with '+' or '-'
charge :: Parser Int
charge =  do
    s <- char '-' <|> char '+'
    n <- number
    let m = if n == 0 then 1 else n
    return $ if s == '-' then (-m) else m

-- Parse atom class
atomClass :: Parser Int
atomClass =  do
    _ <- char ':'
    number

-- Parse number of elements. If number not found then return 1
number :: Parser Int
number =  do
    ds <- many digit
    return $ if null ds then 0 else read ds :: Int

-- Parse aliphatic
aliphaticOrganic :: Parser Atom
aliphaticOrganic = do
    ss <- symbol
    if Set.member ss aliphatics then return (Aliphatic ss) else fail "" 

-- Parse aromatic
aromaticOrganic :: Parser Atom
aromaticOrganic = do
    ss <- lower
    if Set.member ss aromatics then return (Aromatic [ss]) else fail "" 

-- Parser for '*' symbol
unknown :: Parser Atom
unknown = do
    _ <- char '*'
    return Unknown

-- Parse element symbol
-- Starts with upper case
-- has 0, 1 or 2 lower letters
symbol :: Parser String
symbol = do 
    s <- upper
    ss <- many lower
    return (s:ss)
    
    
-- | Convert SMILES to the string
writeSmiles :: Smiles -> String
writeSmiles (SmilesRing a n xs) = writeAtom a ++ writeBounds n ++ writeSmiles xs
writeSmiles (Smiles a) = writeAtom a

writeAtom :: Atom -> String
writeAtom (Atom xs ic hc n ac) = "[" ++ showIsotopes ++ xs ++ showHyrdogen ++ showCharge ++ showClass ++ "]"
    where showIsotopes = if ic > 0 then show ic else ""
          showHyrdogen | hc > 1 = "H" ++ show hc
                       | hc == 1 = "H"
                       | otherwise = ""
          showCharge | n < (-1) = show n
                     | n == (-1) = "-"
                     | n == 1 = "+"
                     | n > 1 = "+" ++ show n
                     | otherwise = ""
          showClass = if ac > 0 then ":" ++ show ac else ""

writeAtom (Aliphatic xs) = xs
writeAtom (Aromatic xs) = xs
writeAtom Unknown = "*"

writeBounds :: Int -> String
writeBounds 2 = "="
writeBounds 3 = "#"
writeBounds 4 = "$"
writeBounds _ = ""