{-# LANGUAGE OverloadedStrings , RecordWildCards #-} -- | URL parser, following RFC 3986 (). module Network.URLb {-# DEPRECATED "Please use Network.URI insteasd." #-} where import Control.Applicative import Control.Monad import Data.Bits import Data.ByteString.Char8 (groupBy, pack) import Data.ByteString hiding (groupBy, pack, take, takeWhile) import Data.Either import Data.Maybe import Data.Monoid import Data.String import Data.Word import Prelude hiding (concatMap, drop, null, take, takeWhile) import Data.Attoparsec.ByteString import Data.Attoparsec.ByteString.Char8 (hexadecimal, decimal, char) -- | URL \"...refers to the subset of URIs that, in addition to identifying a -- resource, provide a means of locating the resource by describing its -- primary access mechanism\". -- -- A breakdown of URLs, per the diagram from RFC 3986: -- -- > foo://example.com:8042/over/there?name=ferret#nose -- > \_/ \______________/\_________/ \_________/ \__/ -- > | | | | | -- > scheme authority path query fragment -- > | _____________________|__ -- > / \ / \ -- > urn:example:animal:ferret:nose -- -- For the most part, URL parts are made of strings with percent encoding -- required of certain characters. The scheme is especially limited in the -- allowable data: -- -- > scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) -- -- Note well that no percent encoding is allowed. -- -- The authority section, nominally denoting @userinfo\@host:port@, is in -- fact quite flexible, allowing percent encoding for the hostname and -- userinfo section; only the port has a byte range restriction, to digits. -- -- Since this datatype represents the /data/ in a URL and not its particular -- encoded form, we use 'ByteString' liberally. data URL = URL { scheme :: Scheme , authority :: Maybe Authority , path :: ByteString , query :: ByteString , fragment :: ByteString } deriving (Eq, Ord, Show) instance IsString URL where fromString = fromRight . fromString' instance Parse URL where parser = url <$> (parser <* string "://") <*> authorityPath <*> option "" (char '?' *> qf) <*> option "" (char '#' *> qf) where url a (b, c) d e = URL a b c d e qf = option "" queryFragmentP instance Encode URL where encode URL{..} = mconcat [ encode scheme, "://" , maybe "" encode authority , "/" `concatNonEmpty` pathEncode path , "?" `concatNonEmpty` selectiveEncode queryFragmentOctet query , "#" `concatNonEmpty` selectiveEncode queryFragmentOctet fragment ] -- | \"Many URI schemes include a hierarchical element for a naming authority -- so that governance of the name space defined by the remainder of the URI -- is delegated to that authority...\" -- > authority = [ userinfo "@" ] host [ ":" port ] -- > userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) -- > host = IP-literal / IPv4address / reg-name -- > reg-name = *( unreserved / pct-encoded / sub-delims ) -- The @reg-name@ production overlaps with the IP-based ones; and in fact -- allows host to contain all bytes. data Authority = Authority { userinfo :: ByteString , host :: ByteString , port :: Maybe Word16 } deriving (Eq, Ord, Show) instance IsString Authority where fromString = fromRight . fromString' instance Parse Authority where parser = Authority <$> option "" (userinfoP <* char '@') <*> regNameP <*> optional (char ':' *> decimal) instance Encode Authority where encode Authority{..} = mconcat [ selectiveEncode userinfoOctet userinfo `concatNonEmpty` "@" , selectiveEncode regNameOctet host , maybe "" (mappend ":" . pack . show) port ] -- | \"Each URI begins with a scheme name that refers to a specification for -- assigning identifiers within that scheme. As such, the URI syntax is a -- federated and extensible naming system wherein each scheme's -- specification may further restrict the syntax and semantics of -- identifiers using that scheme.\" -- > scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) newtype Scheme = Scheme ByteString deriving (Eq, Ord, Show) instance IsString Scheme where fromString = fromRight . fromString' instance Encode Scheme where encode (Scheme b) = b instance Parse Scheme where parser = (Scheme .) . cons <$> satisfy (inClass "a-zA-Z") <*> takeWhile (inClass "a-zA-Z0-9.+-") -- | Class for encoding items from this module as URLs. class Encode t where encode :: t -> ByteString -- | Class for parsing URL-related datatypes. class Parse t where parser :: Parser t -- | > *( unreserved / pct-encoded / sub-delims / ":" ) userinfoOctet :: Word8 -> Bool userinfoOctet = inClass "-a-zA-Z0-9._~!$&'()*+,;=:" userinfoP :: Parser ByteString userinfoP = withPercents userinfoOctet -- | > *( unreserved / pct-encoded / sub-delims ) regNameOctet :: Word8 -> Bool regNameOctet = inClass "-a-zA-Z0-9._~!$&'()*+,;=" regNameP :: Parser ByteString regNameP = withPercents regNameOctet percent :: Parser Word8 percent = char '%' *> usingOnly 2 hexadecimal -- | Paths are quite sophisticated, with 5 productions to handle the different -- URI contexts in which they appear. However, for the purpose of URL -- parsing, we can assume that paths are always separated from the authority -- (even the empty authority) with a @/@ and thus can work with a relatively -- simple subset of the productions in the RFC. -- -- > path-rootless = segment-nz *( "/" segment ) -- > -- > ... -- > -- > segment-nz = 1*pchar -- > -- > ... -- > -- > pchar = unreserved / pct-encoded / sub-delims / ":" / "@" -- -- Although literal slash runs are not permitted by the RFC, equivalent -- content can be encoded with percent encoding. pathRootlessP :: Parser ByteString pathRootlessP = mappend <$> segment <*> option "" next where segment = withPercents segmentOctet next = char '/' *> (mappend "/" <$> pathRootlessP) -- > unreserved / pct-encoded / sub-delims / ":" / "@" segmentOctet :: Word8 -> Bool segmentOctet = inClass "-a-zA-Z0-9._~!$&'()*+,;=:@" -- ^ To parse the authority and path: -- -- * we parse an authority and then optionally a slash and a path or -- -- * we parse a single slash and then optionally a path. authorityPath :: Parser (Maybe Authority, ByteString) authorityPath = (,) <$> (Just <$> parser) <*> option "" (char '/' *> pathP) <|> (,) <$> (char '/' *> pure Nothing) <*> pathP where pathP = option "" pathRootlessP -- | The query and fragment have identical productions in the RFC. -- > *( pchar / "/" / "?" ) queryFragmentOctet :: Word8 -> Bool queryFragmentOctet = inClass "-a-zA-Z0-9._~!$&'()*+,;=:@/?" queryFragmentP :: Parser ByteString queryFragmentP = withPercents queryFragmentOctet usingOnly :: Int -> Parser t -> Parser t usingOnly c p = either (const mzero) return . parseOnly p =<< take c -- | Parse a bytestream, accepting either literal bytes matching the predicate -- or any percent encoded characters. withPercents :: (Word8 -> Bool) -> Parser ByteString withPercents predicate = cons <$> one <*> (option "" more) where more = withPercents predicate one = satisfy predicate <|> percent {- sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" pct-encoded = "%" HEXDIG HEXDIG unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" -} -- | Transform any octet to its percent encoded form. percentEncode :: Word8 -> ByteString percentEncode w = "%" `snoc` mod' (w `shiftR` 4) `snoc` mod' w where mod' w' = "0123456789ABCDEF" `index` fromIntegral (w' `mod` 16) -- | Percent encode a 'ByteString', ignoring octets that match the predicate. selectiveEncode :: (Word8 -> Bool) -> ByteString -> ByteString selectiveEncode predicate bytes = unfoldr f ("", bytes) where f (queued, remaining) = dequeue <|> next where dequeue = do (w, queued') <- uncons queued Just (w, (queued', remaining)) next = do (w, remaining') <- uncons remaining if predicate w then Just (w, (queued, remaining')) else f (percentEncode w, remaining') concatNonEmpty a b | null a || null b = "" | otherwise = mappend a b -- | Slash runs are not allowed in encoded paths. Here, this is interpreted to -- mean that the first slash in path data, which would come after the slash -- separating the path and the scheme or authority, should be escaped. pathEncode :: ByteString -> ByteString pathEncode = mconcat . fmap enc . groupBy neitherSlash where enc s | "/" == s = "/" | isPrefixOf "//" s = "/" `mappend` concatMap percentEncode (drop 1 s) | otherwise = selectiveEncode regNameOctet s neitherSlash a b = (a /= '/' && b /= '/') || (a == '/' && b == '/') fromString' s = parseOnly (parser <* endOfInput) (fromString s) {-# WARNING fromRight "This unsafe function calls error and is only intended for internal use." #-} fromRight (Right x) = x fromRight (Left s) = error s