{- |

Properly validating e-mail addresses (or converting EBNF to Parsec)

In recent times there have been several calls for websites to properly validate
email addresses. Invariably, the compiled regex from Perl’s RFC822 is pasted up
as The Way To Do It. The problem with this is (as the source code from the Perl
module notes) is that email addresses cannot be validated by a simple regex
(due to requiring parenthesis-matching). The Perl code addresses this by first
stripping out all comments and then parsing via regex.

With this in mind, I thought that implementing the Addr-Spec specification from
RFC 5322 (only released less than 6 months ago) might be a good test of the
Haskell library Parsec. So, without further ado I went ahead and translated the
EBNF from RFC 5322 directly into Parsec.

Author: Porges
Source: <http://porg.es/blog/properly-validating-e-mail-addresses>

-}

module Text.Email.Validate ( isValid ) where

import Text.Parsec
import Text.Parsec.Char
import Data.Char (chr)

-- | Validate an email address encoded in a String satisifes RFC 5322
isValid :: String -> Bool
isValid x = let result = valid x in
        either (const False) (const True) result

valid :: String -> Either ParseError ()
valid x = parse addrSpec "" x

ignore x = x >> return ()

addrSpec = localPart >> char '@' >> domain >> eof

localPart = dotAtom <|> quotedString <|> obsLocalPart <?> "local part"
domain = dotAtom <|> domainLiteral <|> obsDomain <?> "domain"

domainLiteral = optional cfws >> char '[' >>
                many ( optional fws >> dtext) >>
                optional fws  >> char ']' >> optional cfws
                <?> "domain literal"

ranges = oneOf . map chr . concat
vchar = ranges [[0x21..0x7E]] -- from Backus-Naur RFC
dtext = ranges [[33..90],[94..126]] <|> obsDtext
qtext = ranges [[33],[35..91],[93..126]] <|> obsQtext
atext = alphaNum <|> oneOf "!#$%&'*+-/=?^_`{|}~"
ctext = ranges [[33..39],[42..91],[93..126]] <|> obsCtext
wsp = char ' '
        <|> char '\t'
        <?> "space or tab"

cr = char '\r' <?> "carriage return"
lf = char '\n' <?> "line feed"
crlf = cr >> lf <?> "CRLF line ending"

-- # modification: added try
cfws = try (many1 (optional fws >> comment) >> optional fws) <|> ignore fws
-- # modification from RFC: adding try because of overlap
fws = try (optional (many wsp >> crlf) >> many1 wsp)
        <|> many1 wsp
        <|> obsFws

-- # modification: added try
comment = between (char '(') (char ')') (many (try (optional fws >> ccontent)) >> optional fws)
        <?> "comment"
ccontent = ignore ctext
        <|> ignore quotedPair
        <|> comment

atom = optional cfws >> many1 atext >> optional cfws
dotAtomText = many1 atext >> many (char '.' >> many1 atext)
dotAtom = optional cfws >> dotAtomText >> optional cfws

-- # other change from RFC -- merge prefix
quotedPair = char '\\' >> ((vchar <|> wsp) <|> obsQp)
qcontent = qtext <|> quotedPair
quotedString = optional cfws >> char '\"' >> many (optional fws >> qcontent) >>
        optional fws >> char '\"' >> optional cfws
        <?> "quoted string"

-- # Obsolete syntax
obsNoWsCtl = ranges [[1..8],[11..12],[14..31],[127]]
obsCtext = obsNoWsCtl
obsDtext = obsNoWsCtl <|> quotedPair
obsQtext = obsNoWsCtl
-- # change: see above
obsQp = (char (chr 0) <|> obsNoWsCtl <|> lf <|> cr)
obsLocalPart = word >> many (char '.' >> word) >> return ()
obsDomain = atom >> many (char '.' >> atom) >> return ()
obsFws = many1 wsp >> many (crlf >> many1 wsp) >> return []
word = atom <|> quotedString