{-# LANGUAGE CPP #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} -- |An implementation of the RFC3987 -- [RFC3987]: http://www.ietf.org/rfc/rfc3987.txt module Data.RDF.IRI ( IRI(..), IRIRef(..) , Scheme(..), Authority(..), UserInfo(..), Host(..), Port(..) , Path(..), Query(..), Fragment(..) , IRIError(..), SchemaError(..) , mkIRI , serializeIRI , parseIRI, parseRelIRI , validateIRI, resolveIRI , removeIRIFragment ) where import Data.Semigroup (Semigroup(..)) import Data.Maybe (maybe, isJust) import Data.Functor import Data.List (intersperse) import Control.Applicative import Control.Monad (guard) import Control.Arrow (first, (&&&), (>>>)) import Data.Char (isAlpha, isDigit, isAlphaNum, toUpper, toLower) import Data.Text (Text) import qualified Data.Text as T import Data.Attoparsec.Text (Parser, ()) import qualified Data.Attoparsec.Text as P -- | A serialized IRI representation. newtype IRI = IRI { getIRI :: Text } deriving (Show, Eq) -- | A detailed IRI representation with its components. data IRIRef = IRIRef !(Maybe Scheme) !(Maybe Authority) !Path !(Maybe Query) !(Maybe Fragment) deriving (Show, Eq, Ord) newtype Scheme = Scheme Text deriving (Show, Eq, Ord) data Authority = Authority !(Maybe UserInfo) !Host !(Maybe Port) deriving (Show, Eq, Ord) newtype UserInfo = UserInfo Text deriving (Show, Eq, Ord) newtype Host = Host Text deriving (Show, Eq, Ord) newtype Port = Port Int deriving (Show, Eq, Ord) newtype Path = Path Text deriving (Show, Eq, Semigroup, Monoid, Ord) newtype Query = Query Text deriving (Show, Eq, Semigroup, Ord) instance Monoid Query where mempty = Query mempty #if !(MIN_VERSION_base(4,11,0)) mappend = (<>) #endif newtype Fragment = Fragment Text deriving (Show, Eq, Semigroup, Ord) instance Monoid Fragment where mempty = Fragment mempty #if !(MIN_VERSION_base(4,11,0)) mappend = (<>) #endif data IRIError = InvalidIRI deriving (Show, Eq) data SchemaError = NonAlphaLeading -- ^ Scheme must start with an alphabet character | InvalidChars -- ^ Subsequent characters in the schema were invalid | MissingColon -- ^ Schemas must be followed by a colon deriving (Show, Eq) removeIRIFragment :: IRIRef -> IRIRef removeIRIFragment (IRIRef s a p q _) = IRIRef s a p q Nothing -- [TODO] use Builder serializeIRI :: IRIRef -> Text serializeIRI (IRIRef s a p q f) = mconcat [ maybe mempty scheme s , maybe mempty authority a , path p , maybe mempty query q , maybe mempty fragment f ] where scheme (Scheme s') = s' <> ":" authority (Authority u (Host h) p') = mconcat [ "//" , maybe mempty userInfo u , h , maybe mempty port p' ] userInfo (UserInfo u) = u <> "@" port (Port p') = (":" <>) . T.pack . show $ p' path (Path p') = p' query (Query q') = "?" <> q' fragment (Fragment f') = "#" <> f' mkIRI :: Text -> Either String IRI mkIRI t = IRI . serializeIRI <$> parseIRI t parseIRI :: Text -> Either String IRIRef parseIRI = P.parseOnly $ iriParser <* (P.endOfInput "Unexpected characters at the end") parseRelIRI :: Text -> Either String IRIRef parseRelIRI = P.parseOnly $ irelativeRefParser <* (P.endOfInput "Unexpected characters at the end") validateIRI :: Text -> Either String Text validateIRI t = t <$ parseIRI t -- | IRI parsing and resolution according to algorithm 5.2 from RFC3986 -- See: http://www.ietf.org/rfc/rfc3986.txt -- [FIXME] Currently, this is a correct but naive implementation. resolveIRI :: Text -- ^ Base URI -> Text -- ^ URI to resolve -> Either String Text resolveIRI baseIri iri = serializeIRI <$> resolvedIRI where resolvedIRI = either (const resolvedRelativeIRI) resolveAbsoluteIRI (parseIRI iri) resolveAbsoluteIRI (IRIRef s a (Path p) q f) = return $ IRIRef s a (removeDotSegments p) q f resolvedRelativeIRI = do -- Parse as a relative IRI (IRIRef _ ra rp@(Path rp') rq rf) <- parseRelIRI iri -- Parse base IRI (IRIRef bs ba bp bq _) <- parseIRI baseIri let rIriWithoutAuth = resolveIriWithoutAuth rp rq rf bs ba bp bq rIriWithAuth = return (IRIRef bs ra (removeDotSegments rp') rq rf) maybe rIriWithoutAuth (const rIriWithAuth) ra resolveIriWithoutAuth rp rq rf bs ba bp bq = return $! if (rp == mempty) then maybe (IRIRef bs ba bp bq rf) (const (IRIRef bs ba bp rq rf)) rq else let (Path rp') = rp in if (T.head rp' == '/') then IRIRef bs ba (removeDotSegments rp') rq rf else IRIRef bs ba (removeDotSegments (merge ba bp rp)) rq rf removeDotSegments p = removeDotSegments' (T.split (== '/') p) mempty removeDotSegments' [] os = Path $ mconcat (intersperse "/" os) removeDotSegments' ["."] os = removeDotSegments' mempty (os <> [mempty]) removeDotSegments' [".."] [] = removeDotSegments' mempty mempty removeDotSegments' [".."] os = removeDotSegments' mempty (init os <> [mempty]) removeDotSegments' ss@[_] os = removeDotSegments' mempty (os <> ss) removeDotSegments' (".":ss) os = removeDotSegments' ss os removeDotSegments' ("..":ss) [] = removeDotSegments' ss mempty removeDotSegments' ("..":ss) os@[""] = removeDotSegments' ss os removeDotSegments' ("..":ss) os = removeDotSegments' ss (init os) removeDotSegments' (s:ss) os = removeDotSegments' ss (os <> [s]) merge ba (Path bp) (Path rp) | isJust ba && bp == mempty = "/" <> rp | otherwise = T.dropWhileEnd (/= '/') bp <> rp -- IRI = scheme ":" ihier-part [ "?" iquery ] [ "#" ifragment ] iriParser :: Parser IRIRef iriParser = do scheme <- Just <$> schemeParser _ <- P.string ":" "Missing colon after scheme" (authority, path) <- ihierPartParser query <- optional iqueryParser fragment <- optional ifragmentParser return (IRIRef scheme authority path query fragment) -- ihier-part = "//" iauthority ipath-abempty -- / ipath-absolute -- / ipath-rootless -- / ipath-empty ihierPartParser :: Parser (Maybe Authority, Path) ihierPartParser = iauthWithPathParser <|> ipathAbsoluteParser <|> ipathRootlessParser <|> ipathEmptyParser -- IRI-reference = IRI / irelative-ref -- [TODO] -- absolute-IRI = scheme ":" ihier-part [ "?" iquery ] -- [TODO] -- irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ] irelativeRefParser :: Parser IRIRef irelativeRefParser = do (authority, path) <- irelativePartParser query <- optional iqueryParser fragment <- optional ifragmentParser return (IRIRef Nothing authority path query fragment) -- irelative-part = "//" iauthority ipath-abempty -- / ipath-absolute -- / ipath-noscheme -- / ipath-empty irelativePartParser :: Parser (Maybe Authority, Path) irelativePartParser = iauthWithPathParser <|> ipathAbsoluteParser <|> ipathNoSchemeParser <|> ipathEmptyParser -- iauthority = [ iuserinfo "@" ] ihost [ ":" port ] iauthorityParser :: Parser Authority iauthorityParser = Authority <$> optional (iuserInfoParser <* P.string "@") <*> ihostParser <*> optional (P.string ":" *> portParser) "Authority" -- iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" ) iuserInfoParser :: Parser UserInfo iuserInfoParser = UserInfo . mconcat <$> P.many1 iuserInfoP where iuserInfoP = iunreservedP <|> pctEncodedParser <|> subDelimsP <|> P.string ":" -- ihost = IP-literal / IPv4address / ireg-name ihostParser :: Parser Host ihostParser = Host <$> (ipLiteralParser <|> ipV4AddressParser <|> iregNameParser) "Host" -- ireg-name = *( iunreserved / pct-encoded / sub-delims ) iregNameParser :: Parser Text iregNameParser = mconcat <$> P.many' (iunreservedP <|> pctEncodedParser <|> subDelimsP) {- ipath = ipath-abempty ; begins with "/" or is empty / ipath-absolute ; begins with "/" but not "//" / ipath-noscheme ; begins with a non-colon segment / ipath-rootless ; begins with a segment / ipath-empty ; zero characters -} -- [TODO] -- ipath-abempty = *( "/" isegment ) ipathAbEmptyParser :: Parser Path ipathAbEmptyParser = Path <$> ipathAbEmptyParser' ipathAbEmptyParser' :: Parser Text ipathAbEmptyParser' = mconcat <$> P.many' (mconcat <$> sequence [P.string "/", isegmentParser]) -- ipath-absolute = "/" [ isegment-nz *( "/" isegment ) ] ipathAbsoluteParser :: Parser (Maybe Authority, Path) ipathAbsoluteParser = (Nothing,) <$> (Path <$> ipathAbsoluteParser') ipathAbsoluteParser' :: Parser Text ipathAbsoluteParser' = mconcat <$> sequence [P.string "/", ipathRootlessParser'] -- ipath-noscheme = isegment-nz-nc *( "/" isegment ) ipathNoSchemeParser :: Parser (Maybe Authority, Path) ipathNoSchemeParser = (Nothing,) <$> (Path <$> ipathNoSchemeParser') ipathNoSchemeParser' :: Parser Text ipathNoSchemeParser' = mconcat <$> sequence [isegmentNzNcParser, ipathAbEmptyParser'] -- ipath-rootless = isegment-nz *( "/" isegment ) ipathRootlessParser :: Parser (Maybe Authority, Path) ipathRootlessParser = (Nothing,) <$> (Path <$> ipathRootlessParser') ipathRootlessParser' :: Parser Text ipathRootlessParser' = mconcat <$> sequence [isegmentNzParser, ipathAbEmptyParser'] -- ipath-empty = 0 ipathEmptyParser :: Parser (Maybe Authority, Path) ipathEmptyParser = (Nothing, mempty) <$ ipathEmptyParser' ipathEmptyParser' :: Parser Text ipathEmptyParser' = P.string mempty "Empty path" -- isegment = *ipchar isegmentParser :: Parser Text isegmentParser = mconcat <$> (P.many' ipcharParser) -- isegment-nz = 1*ipchar isegmentNzParser :: Parser Text isegmentNzParser = mconcat <$> (P.many1 ipcharParser) -- isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims / "@" ) -- ; non-zero-length segment without any colon ":" isegmentNzNcParser :: Parser Text isegmentNzNcParser = mconcat <$> (P.many1 _isegmentNzNcParser) where _isegmentNzNcParser = iunreservedP <|> pctEncodedParser <|> subDelimsP <|> P.string "@" -- ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@" ipcharParser :: Parser Text ipcharParser = iunreservedP <|> pctEncodedParser <|> subDelimsP <|> P.string ":" <|> P.string "@" -- iquery = *( ipchar / iprivate / "/" / "?" ) iqueryParser :: Parser Query iqueryParser = Query <$> iqueryParser' iqueryParser' :: Parser Text iqueryParser' = P.char '?' *> (mconcat <$> P.many' (ipcharParser <|> iprivateParser <|> P.string "/" <|> P.string "?")) "Query" -- ifragment = *( ipchar / "/" / "?" ) ifragmentParser :: Parser Fragment ifragmentParser = Fragment <$> ifragmentParser' ifragmentParser' :: Parser Text ifragmentParser' = P.char '#' *> (mconcat <$> P.many' (ipcharParser <|> P.string "/" <|> P.string "?")) "Fragment" -- iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar iunreservedP :: Parser Text iunreservedP = T.singleton <$> P.satisfy isIunreserved isIunreserved :: Char -> Bool isIunreserved c = isUnreserved c || isUcsChar c -- ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF -- / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD -- / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD -- / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD -- / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD -- / %xD0000-DFFFD / %xE1000-EFFFD isUcsChar :: Char -> Bool isUcsChar c = ('\x000A0' <= c && c <= '\x0D7FF') || ('\x0F900' <= c && c <= '\x0FDCF') || ('\x0FDF0' <= c && c <= '\x0FFEF') || ('\x10000' <= c && c <= '\x1FFFD') || ('\x20000' <= c && c <= '\x2FFFD') || ('\x30000' <= c && c <= '\x3FFFD') || ('\x40000' <= c && c <= '\x4FFFD') || ('\x50000' <= c && c <= '\x5FFFD') || ('\x60000' <= c && c <= '\x6FFFD') || ('\x70000' <= c && c <= '\x7FFFD') || ('\x80000' <= c && c <= '\x8FFFD') || ('\x90000' <= c && c <= '\x9FFFD') || ('\xA0000' <= c && c <= '\xAFFFD') || ('\xB0000' <= c && c <= '\xBFFFD') || ('\xC0000' <= c && c <= '\xCFFFD') || ('\xD0000' <= c && c <= '\xDFFFD') || ('\xE1000' <= c && c <= '\xEFFFD') -- iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD iprivateParser :: Parser Text iprivateParser = T.singleton <$> P.satisfy isIPrivate isIPrivate :: Char -> Bool isIPrivate c = ('\x00E000' <= c && c <= '\x00F8FF') || ('\x0F0000' <= c && c <= '\x0FFFFD') || ('\x100000' <= c && c <= '\x10FFFD') -- scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) schemeParser :: Parser Scheme schemeParser = -- Force lower case (RFC page 25) Scheme . T.map toLower <$> (T.cons <$> schemeHead <*> schemeRest) where schemeHead = P.satisfy isAlpha "Scheme head" schemeRest = P.takeWhile isSchemeTailChar "Scheme tail" isSchemeTailChar c = isAlphaNum c || c == '+' || c == '.' || c == '_' || c == '-' -- port = *DIGIT portParser :: Parser Port portParser = Port <$> portParser' portParser' :: Parser Int portParser' = P.decimal "Port" -- IP-literal = "[" ( IPv6address / IPvFuture ) "]" ipLiteralParser :: Parser Text ipLiteralParser = P.string "[" *> (ipV6AddressParser <|> ipFutureParser) <* P.string "]" -- IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) ipFutureParser :: Parser Text ipFutureParser = mconcat <$> sequence [ P.string "v", P.takeWhile1 isHexaDigit, P.string ".", P.takeWhile1 isValidFinalChar] where isValidFinalChar c = isUnreserved c || isSubDelims c || c == ':' -- IPv6address = 6( h16 ":" ) ls32 -- / "::" 5( h16 ":" ) ls32 -- / [ h16 ] "::" 4( h16 ":" ) ls32 -- / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 -- / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 -- / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 -- / [ *4( h16 ":" ) h16 ] "::" ls32 -- / [ *5( h16 ":" ) h16 ] "::" h16 -- / [ *6( h16 ":" ) h16 ] "::" ipV6AddressParser :: Parser Text ipV6AddressParser = do l <- leadingP t <- trailingP l joinParts l t "IPV6" where leadingP = h16 `P.sepBy` ":" trailingP = (id &&& length) >>> \l -> ipNotElided l <|> ipElided l joinParts leading trailing = pure $ (T.intercalate ":" leading) <> trailing h16 = parseBetween 1 4 (P.takeWhile isHexaDigit) ipNotElided (leading, lengthL) = guard (lengthL == 7 && isDecOctet (last leading)) *> partialIpV4 <|> (guard (lengthL == 8) $> mempty) ipElided (_, lengthL) = do guard $ lengthL <= 8 elision <- P.string "::" trailing <- h16 `P.sepBy` ":" let lengthT = length trailing let lengthTotal = lengthL + lengthT guard $ lengthT < 8 embeddedIpV4 <- guard (lengthT > 0 && lengthTotal < 7 && isDecOctet (last trailing)) *> partialIpV4 <|> pure mempty pure $ mconcat [elision, (T.intercalate ":" trailing), embeddedIpV4] partialIpV4 = mconcat <$> sequence [dotP, decOctetP, dotP, decOctetP, dotP, decOctetP] -- h16 = 1*4HEXDIG -- [TODO] -- ls32 = ( h16 ":" h16 ) / IPv4address -- [TODO] -- IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet ipV4AddressParser :: Parser Text ipV4AddressParser = mconcat <$> sequence [decOctetP, dotP, decOctetP, dotP, decOctetP, dotP, decOctetP] -- dec-octet = DIGIT ; 0-9 -- / %x31-39 DIGIT ; 10-99 -- / "1" 2DIGIT ; 100-199 -- / "2" %x30-34 DIGIT ; 200-249 -- / "25" %x30-35 ; 250-255 decOctetP :: Parser Text decOctetP = do -- [TODO] 1-liner ? s <- P.takeWhile1 isDigit guard (isDecOctet s) pure s isDecOctet :: Text -> Bool isDecOctet s = len > 0 && T.all isDigit s && (len < 3 || (len == 3 && s <= "255")) where len = T.length s -- pct-encoded = "%" HEXDIG HEXDIG pctEncodedParser :: Parser Text pctEncodedParser = T.cons <$> P.char '%' <*> (T.pack . fmap toUpper <$> (P.count 2 (P.satisfy isHexaDigit))) "Percent encoding" -- unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" isUnreserved :: Char -> Bool isUnreserved c = isAlphaNum c || c == '-' || c == '.' || c == '_' || c == '~' -- reserved = gen-delims / sub-delims -- [TODO] -- gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" -- sub-delims = "!" / "$" / "&" / "'" / "(" / ")" -- / "*" / "+" / "," / ";" / "=" subDelimsP :: Parser Text subDelimsP = T.singleton <$> P.satisfy isSubDelims isSubDelims :: Char -> Bool isSubDelims c = c `elem` ("!$&'()*+,;=" :: String) -- "//" iauthority ipath-abempty iauthWithPathParser :: Parser (Maybe Authority, Path) iauthWithPathParser = do void (P.string "//") curry (first Just) <$> iauthorityParser <*> ipathAbEmptyParser isHexaDigit :: Char -> Bool isHexaDigit c = (isDigit c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') dotP :: Parser Text dotP = P.string "." parseBetween :: Int -> Int -> Parser Text -> Parser Text parseBetween i j p = do s <- p let len = T.length s guard $ len >= i && len <= j return s