Bio/PDB/EventParser/PDBParsingAbstractions.hs

{-# LANGUAGE BangPatterns, PatternGuards, ScopedTypeVariables, OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{-| Standard building blocks for PDB parser.
-}
module Bio.PDB.EventParser.PDBParsingAbstractions(
                              -- * Stage 1 parsing - separating columns
                              trim, parseFields,
                              unstr,
                              -- * Stage 2 parsing - parsing typed values from strings
                              ParsedField(..),
                              -- ** Optional fields
                              pSpc,
                              pInt, pStr, pChr, pDouble,
                              -- ** Optional fields with default values
                              dInt, dStr, dChr, dDouble,
                              -- ** Mandatory fields
                              mKeyword,
                              mKeywords,
                              mSpc, mInt, mStr, mChr, mDouble,
                              -- * Stage 2.5 parsing - grouping fields into compound values
                              -- ** Compound types
                              fgAtom,    maybeFgAtom,
                              fgResidue, maybeFgResidue,
                              -- ** Extracting lists of typed values or error events
                              lefts, rights,
                              liftFgErrs,
                              maybeList
                              -- * Stage 3 is making valid 'PDBEvent's (not included in this module.)
                              )
where

import Prelude hiding (String)
import qualified Data.ByteString.Char8    as BS
import Data.Char (isSpace)

import Bio.PDB.EventParser.PDBEvents
import Bio.PDB.EventParser.FastParse(strtof, trim, trimFront)

--------------- {{{ Parsing abstraction
-- * Parsing abstraction utilities 

--------------- {{{ Stage 1 parsing - dissection by columns
-- ** Stage 1 parsing - dissection by columns 
{-# INLINE splitByColumns #-}
-- | Splits a string into substrings by column numbers.
splitByColumns :: String -> -- -- ^ An input String
                  [Int]  -> -- -- ^ An input list of first column numbers for each record entry
                  [String]  -- -- ^ Output list of record entries as strings
splitByColumns !s !cols = split s cols 0
  where
    split !s []      _ = [s] -- leftover
    split !s (c:cs) !i = let (a, s2) = BS.splitAt (c-i) s in
                             a:split s2 cs c

{-# INLINE unstr #-}
-- | Converts a 'ParsedField' containing 'String' into this string or an empty string if nothing was parsed.
unstr :: ParsedField -> String
unstr (IFStr !s) = s
unstr  IFNone   = ""
--------------- }}} Stage 1 parsing - dissection by columns


--------------- {{{ Stage 2 parsing - type conversions
-- ** Stage 2 parsing - type conversions
-- | A common type for shuttling parsed and typed record values.
data ParsedField = IFInt   {-# UNPACK #-} !Int |
                   IFStr   {-# UNPACK #-} !String  |
                   IFChar  {-# UNPACK #-} !Char    |
                   IFDouble {-# UNPACK #-} !Double   |
                   IFError {-# UNPACK #-} !String  |
                   IFNone
  deriving (Eq, Ord, Show, Read)

-- | Construct an error message by concatenating list of 'Bytestring's
ifError msg = IFError $ BS.concat msg

-- *** Parsers for optional fields
{-# INLINE pChr #-}
-- | Parser for an optional single-character field, first argument is a name of a field, second is an input.
pChr :: String -> String -> ParsedField
pChr fname s | BS.length s == 1 = IFChar (BS.head s)
pChr fname ""  = IFNone
pChr fname s   = ifError ["Char field '", fname, "' should have length of 1 - is '", s, "'."]

{-# INLINE pSpc #-}
-- | Parser for an optional spacing, an argument is an input.
pSpc :: String -> ParsedField
pSpc s | BS.null $ trimFront s = IFNone
pSpc s = ifError ["Spacing expected, but '", s, "' found."]

-- | Old version of pSpc
--   NOTE: also slower than using new trimFront
pSpcO s | BS.dropWhile (==' ') s == "" = IFNone
pSpcO s = ifError ["Spacing expected, but '", s, "' found."]

{-# INLINE pInt #-}
-- | Parser for an optional integer field, first argument is a field name, a second argument is an input.
pInt :: String -> String -> ParsedField
pInt fname s | Just (a, r) <- BS.readInt $ trimFront s = if BS.null $ trimFront r
                                                           then IFInt a
                                                           else pIntErr fname s
pInt fname (trimFront -> "")                           = IFNone
pInt fname s                                           = pIntErr fname s

-- | Reports error from pInt.
pIntErr fname s = ifError ["Int field '", fname,
                           "' cannot be parsed: '", s, "'."]
{-# INLINE pStr #-}
-- | Parser for an optional string field, first argument is a field name, a second argument is an input.
pStr :: String -> String -> ParsedField
pStr fname s = IFStr $ trim s

{-# INLINE pDouble #-}
-- | Parser for an optional floating-point field, first argument is a field name, a second argument is an input.
pDouble :: String -> String -> ParsedField
pDouble fname (strtof -> Just f) = IFDouble f
pDouble fname (trimFront -> "")  = IFNone
pDouble fname s                  = ifError ["Double cannot be parsed: '", s, "'."]

-- *** Parsers with default field values
-- | A helper method for converting a parser of an optional field
-- into a parser of an optional field with a default value.
{-# INLINE dConv #-}
dConv :: (t -> t1 -> ParsedField) -> ParsedField -> t -> t1 -> ParsedField
dConv conv def fname s = case conv fname s of
                              IFNone -> def
                              other  -> other

{-# INLINE dChr #-}
-- | Parser for an optional single character field with a default value, arguments are:
--
-- (1) field name
--
-- (2) default value
--
-- (3) input
dChr   ::  String -> Char    -> String -> ParsedField
dChr   fname def = dConv pChr   (IFChar  def) fname

{-# INLINE dInt #-}
-- | Parser for an optional integer field with a default value, arguments are:
--
-- (1) field name
--
-- (2) default value
--
-- (3) input
dInt   ::  String -> Int -> String -> ParsedField
dInt   fname def = dConv pInt   (IFInt   def) fname

{-# INLINE dStr #-}
-- | Parser for an optional string field with a default value, arguments are:
--
-- (1) field name
--
-- (2) default value
--
-- (3) input
dStr   ::  String -> String  -> String -> ParsedField
dStr   fname def = dConv pStr   (IFStr   def) fname

{-# INLINE dDouble #-}
-- | Parser for an optional floating-point field with a default value, arguments are:
--
-- (1) field name
--
-- (2) default value
--
-- (3) input
dDouble ::  String -> Double   -> String -> ParsedField
dDouble fname def = dConv pDouble (IFDouble def) fname

-- *** Parsers for mandatory fields
{-# INLINE mandField #-}
-- | Converts an optional field parser into a mandatory field parser that bails on missing input.
--
-- Arguments are:
--
-- (1) name of the field type
--
-- (2) a parser function that takes field name, and input and returns 'ParsedField'
--
-- A results is a function that takes:
--
-- (1) field name
--
-- (2) input
--
-- and returns a 'ParsedField'.
mandField ::  String -> (String -> String -> ParsedField) -> String -> String -> ParsedField
mandField typename ftype fname "" = mandFieldErrMsg typename fname
mandField typename ftype fname s  = case ftype fname s of
  IFNone -> mandFieldErrMsg typename fname
  other  -> other 

{-# INLINE mandFieldErrMsg #-}
-- | Return error message when mandatory field is missing.
mandFieldErrMsg typename fname = ifError [typename, " field '", fname, "' is empty or missing!"]

{-# INLINE mChr #-}
-- | Parser for a mandatory character field with a default value, arguments are:
--
-- (1) field name
--
-- (2) input
mChr ::  String -> String -> ParsedField
mChr   = mandField "Char"    pChr
{-# INLINE mDouble #-}
-- | Parser for a mandatory floating-point field with a default value, arguments are:
--
-- (1) field name
--
-- (2) input
mDouble ::  String -> String -> ParsedField
mDouble = mandField "Double"   pDouble
{-# INLINE mStr #-}
-- | Parser for a mandatory string field with a default value, arguments are:
--
-- (1) field name
--
-- (2) input
mStr ::  String -> String -> ParsedField
mStr   = mandField "String"  pStr
{-# INLINE mInt #-}
-- | Parser for a mandatory integer field with a default value, arguments are:
--
-- (1) field name
--
mInt ::  String -> String -> ParsedField
-- (2) input
mInt   = mandField "Int" pInt
{-# INLINE mSpc #-}
-- | Parser for a mandatory spacing field, arguments are:
--
-- (1) number of columns for spacing
--
-- (2) input
mSpc :: Int -> String -> ParsedField
mSpc l s = if BS.length s == l
             then pSpc s
             else ifError ["Spacing has different length ",
                           BS.pack $ show $ BS.length s,
                           " than expected ", BS.pack $ show l, "."]

{-# INLINE mKeywords #-}
-- | Parser for a fixed field that can be filled with one of many keywords, arguments are:
--
-- (1) field name
--
-- (2) a list of valid keywords
--
-- (3) input
mKeywords ::  String -> [String] -> String -> ParsedField
mKeywords fname kwds s | s `elem` kwds = IFStr s
mKeywords fname kwds s                 = ifError ["Keyword field '", fname,
                                                  "' should contain one of strings: '",
                                                  BS.intercalate "', '" kwds,
                                                  "' not '", s, "'."]

{-# INLINE mKeyword #-}
-- | Parser for a fixed single keyword field, arguments are:
--
-- (1) field name
--
-- (2) keyword
--
-- (3) input
mKeyword ::  String -> String -> String -> ParsedField
mKeyword fname kwd = mKeywords fname [kwd] 

-- Dissects columns (stage 1) and applies converters (stage 2)
{-# INLINE convertColumns #-}
-- | Dissects columns from stage 1 parsing, and applies converters from stage 2 parsing
--
-- (1) list of string parsers that return typed 'ParsedField' values
--
-- (2) list of column numbers that indicate a beginning of each field
--
-- (3) input
convertColumns :: [String -> ParsedField] -> [Int] -> String -> [ParsedField]
convertColumns convs cols s = zipWith convert convs content
  where
    convert conv s = conv s
    content = splitByColumns s cols

{-# INLINE findColumnErrors #-}
-- | Finds IFError values in a list of 'ParsedField' values, and returns
-- a list of error events in case there are any.
--
-- (1) list of string parsers that return typed 'ParsedField' values
--
-- (2) list of column numbers that indicate a beginning of each field
--
-- (3) line number to be injected into error events
--
-- (4) input
findColumnErrors :: [ParsedField] -> [Int] -> Int -> [PDBEvent]
findColumnErrors fields cols line_no = concatMap findError (zip fields cols)
  where
    findError (IFError e, c) = [PDBParseError line_no c e]
    findError _              = []

{-# INLINE parseFields #-}
-- | Uses field declarations that are list of (column number, parser to 'ParsedField', ...)
-- tuples and applies it to a given line of input.
--
-- Arguments are:
--
-- (1) field declarations list
--
-- (2) input line
--
-- (3) ordinal number of an input line
parseFields fieldDecls line line_no = (fields, errs)
  where
    fieldTypes  = map snd fieldDecls
    fieldBounds = map fst fieldDecls
    fields :: [ParsedField] = convertColumns fieldTypes fieldBounds line
    errs = findColumnErrors fields fieldBounds line_no

--------------- }}} Stage 2 parsing - type conversions

--------------- {{{ Stage 2.5 parsing - field groups

-- ** Stage 2.5 parsing - field grouping

{-# INLINE nonEmptyIF #-}
-- | Returns if a given 'ParsedField' value _certainly_ represents a missing value.
nonEmptyIF ::  ParsedField -> Bool
nonEmptyIF  IFNone      = False
nonEmptyIF (IFStr  "" ) = False
nonEmptyIF  _           = True

{-# INLINE fullIF #-}
-- | Returns if a given 'ParsedField' value _certainly_ represents a present value.
fullIF IFNone                    = False
fullIF (IFStr s) | BS.all isSpace s = False
fullIF (IFChar ' ')              = False
fullIF _                         = True

-- residue description field group
{-# INLINE fgResidue #-}
-- | Merges a set of values that corresponds to a residue description. 
--
-- Arguments are:
--
-- (1) boolean indicating, if the field group may omit a residue number
--
-- (2) field group name (description)
--
-- (3) column number beginning the residue description entries
--
-- (4) 'ParsedField' containing a three letter residue identifier
--
-- (5) 'ParsedField' containing a single letter chain identifier
--
-- (6) 'ParsedField' containing a residue number
--
-- (7) 'ParsedField' containing a residue insertion code
--
-- A result is a 'Either' of pair with column number and error message,
-- or 'RESID' value with a residue description.
fgResidue :: Bool -> BS.ByteString -> Int -> ParsedField -> ParsedField -> ParsedField -> ParsedField -> Either (Int, BS.ByteString) RESID
fgResidue delible fname col r c d i = case maybeFgResidue delible fname col r c d i of
                                Right Nothing   -> Left (col, BS.concat [fname, " residue description missing!"])
                                Right (Just at) -> Right at
                                Left (col, s)   -> Left (col, s)

{-# INLINE maybeFgResidue #-}
-- | Merges a set of values that corresponds to an optional residue description.
--
-- Arguments are:
--
-- (1) boolean indicating, if the field group may omit a residue number
--
-- (2) field group name (description)
--
-- (3) column number beginning the residue description entries
--
-- (4) 'ParsedField' containing a three letter residue identifier
--
-- (5) 'ParsedField' containing a single letter chain identifier
--
-- (6) 'ParsedField' containing a residue number
--
-- (7) 'ParsedField' containing a residue insertion code
--
-- A result is a 'Either' of pair with column number and error message,
-- or 'Maybe' 'RESID' value that may contain a residue description.
maybeFgResidue :: Bool -> BS.ByteString -> Int -> ParsedField -> ParsedField -> ParsedField -> ParsedField -> Either (Int, BS.ByteString) (Maybe RESID)
maybeFgResidue delible fname col r c d i
  | all nonEmptyIF obligatoryFields =
    Right $ Just $ RESID (unr, unc, und, uni)
  | any fullIF [r, c, d, i] =
    Left
      (col,
       BS.concat
	 [fname, " residue descriptions contains fields: ",
	  BS.pack $ show [r, c, d, i]])
  | otherwise = Right Nothing
  where obligatoryFields = if delible then [c, i] else [c, d, i]
	IFStr unr = r
	IFChar unc = c
	IFInt und = d
	IFChar uni = i

{-# INLINE fgAtom #-}
-- | Merges a set of values that correspond to a mandatory atom description.
--
-- Arguments are:
--
-- (1) field group name (description)
--
-- (2) column number beginning the residue description entries
--
-- (3) 'ParsedField' containing a three letter atom identifier
--
-- (4) 'ParsedField' containing a three letter residue identifier
--
-- (5) 'ParsedField' containing a single letter chain identifier
--
-- (6) 'ParsedField' containing a residue number
--
-- (7) 'ParsedField' containing a residue insertion code
--
-- A result is a 'Either' of pair with column number and error message,
-- or 'ATID' value that may contain an atom description.
fgAtom :: BS.ByteString-> Int -> ParsedField-> ParsedField-> ParsedField-> ParsedField-> ParsedField-> Either (Int, BS.ByteString) ATID
fgAtom fname col a r c d i = case maybeFgAtom fname col a r c d i of
                               Right Nothing   -> Left (col, BS.concat [fname, " atom description missing!"])
                               Right (Just at) -> Right at
                               Left  (col, s)  -> Left (col, s)

{-# INLINE maybeFgAtom #-}
-- | Merges a set of values that correspond to an optional atom description.
--
-- Arguments are:
--
-- (1) field group name (description)
--
-- (2) column number beginning the residue description entries
--
-- (3) 'ParsedField' containing a three letter atom identifier
--
-- (4) 'ParsedField' containing a three letter residue identifier
--
-- (5) 'ParsedField' containing a single letter chain identifier
--
-- (6) 'ParsedField' containing a residue number
--
-- (7) 'ParsedField' containing a residue insertion code
--
-- A result is a 'Either' of pair with column number and error message,
-- or 'Maybe' 'ATID' value that may contain an atom description.
maybeFgAtom :: BS.ByteString-> Int -> ParsedField-> ParsedField-> ParsedField-> ParsedField-> ParsedField-> Either (Int, BS.ByteString) (Maybe ATID)
maybeFgAtom fname col a r c d i
  | all nonEmptyIF [a, r, c, d, i] =
    Right $ Just $ ATID (una, unr, unc, und, uni)
  | any fullIF [a, r, c, d, i] =
    Left
      (col,
       BS.concat
	 [fname, " atom descriptions contains fields: ",
	  BS.pack $ show [a, r, c, d, i]])
  | otherwise = Right Nothing
  where IFStr una = a
	IFStr unr = r
	IFChar unc = c
	IFInt und = d
	IFChar uni = i

-- Stage 3 is generation of events - code is separated for each kind of event.

{-# INLINE lefts #-}
-- | Changes a list of 'Either' values, into a list of all values in 'Left' entries.
lefts ::  [Either a b] -> [a]
lefts (Left  s:ls) = s:lefts ls
lefts (Right _:ls) =   lefts ls
lefts []           = []

{-# INLINE liftFgErrs #-}
-- | Extracts Left (column_number, error_message) values from a list of results in a given line,
-- to form 'PDBParseError' events with a given line number, column number and error message.
--
-- Arguments:
--
-- (1) line number
--
-- (2) list of 'Either' (column_number, error_message_string) result values,
-- where 'Left' entries are used to generate error messages.
--
-- Result is a list of 'PDBEvent' entries that contain 'PDBParseError's (if any.)
liftFgErrs ::  Int -> [Either (Int, String) b] -> [PDBEvent]
liftFgErrs line_no errs = map (uncurry $ PDBParseError line_no) (lefts errs)

{-# INLINE rights #-}
-- | Changes a list of 'Either' values, into a list of all values in 'Right' entries.
rights ::  [Either a b] -> [b]
rights (Left  _:ls) =   rights ls
rights (Right s:ls) = s:rights ls
rights []           = []
--------------- }}} Stage 2.5 parsing - field groups

-- | Utility: Changes a list of 'Maybe's to a list of values hidden in 'Just' _ records.
maybeList :: [Maybe a] -> [a]
maybeList []           = []
maybeList (Nothing:as) = maybeList as
maybeList (Just a :as) = a:maybeList as

--------------- }}} Parsing abstractions