{-# LANGUAGE GeneralizedNewtypeDeriving, FlexibleInstances, TypeFamilies, PackageImports, FlexibleContexts, UndecidableInstances #-} ----------------------------------------------------------------------------- -- | -- Module : Web.Routes.Base -- Copyright : (c) 2010 Jeremy Shaw -- License : BSD-style (see the file LICENSE) -- -- Maintainer : partners@seereason.com -- Stability : experimental -- Portability : portable -- -- Conversions between raw pathinfos and decoded path segments. ----------------------------------------------------------------------------- module Web.Routes.Base ( encodePathInfo , decodePathInfo ) where import Codec.Binary.UTF8.String (encodeString, decodeString) import Data.List (intercalate, intersperse) import Network.URI {- From RFC1738 - 3.3 The HTTP URL scheme is used to designate Internet resources accessible using HTTP (HyperText Transfer Protocol). The HTTP protocol is specified elsewhere. This specification only describes the syntax of HTTP URLs. An HTTP URL takes the form: http://<host>:<port>/<path>?<searchpart> where <host> and <port> are as described in Section 3.1. If :<port> is omitted, the port defaults to 80. No user name or password is allowed. <path> is an HTTP selector, and <searchpart> is a query string. The <path> is optional, as is the <searchpart> and its preceding "?". If neither <path> nor <searchpart> is present, the "/" may also be omitted. Within the <path> and <searchpart> components, "/", ";", "?" are reserved. The "/" character may be used within HTTP to designate a hierarchical structure. From RFC1808 - 2.1 URL Syntactic Components The URL syntax is dependent upon the scheme. Some schemes use reserved characters like "?" and ";" to indicate special components, while others just consider them to be part of the path. However, there is enough uniformity in the use of URLs to allow a parser to resolve relative URLs based upon a single, generic-RL syntax. 
This generic-RL syntax consists of six components: <scheme>://<net_loc>/<path>;<params>?<query>#<fragment> URL = ( absoluteURL | relativeURL ) [ "#" fragment ] absoluteURL = generic-RL | ( scheme ":" *( uchar | reserved ) ) generic-RL = scheme ":" relativeURL relativeURL = net_path | abs_path | rel_path net_path = "//" net_loc [ abs_path ] abs_path = "/" rel_path rel_path = [ path ] [ ";" params ] [ "?" query ] path = fsegment *( "/" segment ) fsegment = 1*pchar segment = *pchar params = param *( ";" param ) param = *( pchar | "/" ) pchar = uchar | ":" | "@" | "&" | "=" uchar = unreserved | escape unreserved = alpha | digit | safe | extra From RFC2396 - 3.3 path_segments = segment *( "/" segment ) segment = *pchar *( ";" param ) param = *pchar pchar = unreserved | escaped | ":" | "@" | "&" | "=" | "+" | "$" | "," The path may consist of a sequence of path segments separated by a single slash "/" character. Within a path segment, the characters "/", ";", "=", and "?" are reserved. Each path segment may include a sequence of parameters, indicated by the semicolon ";" character. The parameters are not significant to the parsing of relative references. From RFC3986 - 3.3 The path component contains data, usually organized in hierarchical form, that, along with data in the non-hierarchical query component (Section 3.4), serves to identify a resource within the scope of the URI's scheme and naming authority (if any). The path is terminated by the first question mark ("?") or number sign ("#") character, or by the end of the URI. If a URI contains an authority component, then the path component must either be empty or begin with a slash ("/") character. If a URI does not contain an authority component, then the path cannot begin with two slash characters ("//"). In addition, a URI reference (Section 4.1) may be a relative-path reference, in which case the first path segment cannot contain a colon (":") character. 
The ABNF requires five separate rules to disambiguate these cases, only one of which will match the path substring within a given URI reference. We use the generic term "path component" to describe the URI substring matched by the parser to one of these rules. path = path-abempty ; begins with "/" or is empty / path-absolute ; begins with "/" but not "//" / path-noscheme ; begins with a non-colon segment / path-rootless ; begins with a segment / path-empty ; zero characters path-abempty = *( "/" segment ) path-absolute = "/" [ segment-nz *( "/" segment ) ] path-noscheme = segment-nz-nc *( "/" segment ) path-rootless = segment-nz *( "/" segment ) path-empty = 0<pchar> segment = *pchar segment-nz = 1*pchar segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) ; non-zero-length segment without any colon ":" pchar = unreserved / pct-encoded / sub-delims / ":" / "@" A path consists of a sequence of path segments separated by a slash ("/") character. A path is always defined for a URI, though the defined path may be empty (zero length). Use of the slash character to indicate hierarchy is only required when a URI will be used as the context for relative references. For example, the URI <mailto:fred@example.com> has a path of "fred@example.com", whereas the URI <foo://info.example.com?fred> has an empty path. The path segments "." and "..", also known as dot-segments, are defined for relative reference within the path name hierarchy. They are intended for use at the beginning of a relative-path reference (Section 4.2) to indicate relative position within the hierarchical tree of names. This is similar to their role within some operating systems' file directory structures to indicate the current directory and parent directory, respectively. However, unlike in a file system, these dot-segments are only interpreted within the URI path hierarchy and are removed as part of the resolution process (Section 5.2). Aside from dot-segments in hierarchical paths, a path segment is considered opaque by the generic syntax. 
URI producing applications often use the reserved characters allowed in a segment to delimit scheme-specific or dereference-handler-specific subcomponents. For example, the semicolon (";") and equals ("=") reserved characters are often used to delimit parameters and parameter values applicable to that segment. The comma (",") reserved character is often used for similar purposes. For example, one URI producer might use a segment such as "name;v=1.1" to indicate a reference to version 1.1 of "name", whereas another might use a segment such as "name,1.1" to indicate the same. Parameter types may be defined by scheme-specific semantics, but in most cases the syntax of a parameter is specific to the implementation of the URI's dereferencing algorithm. -} {- Reserved characters: If a character is unreserved, then you can include it as the literal character, or percent encode it, and it does not change its meaning. The two urls will be equal to each other. Some characters are explicitly reserved in different url schemes. For example the '/' character in a path component has special meaning, and therefore any occurrence of '/' must be escaped unless it is being used for its reserved purpose. The spec also provides a list of characters that can be reserved in a specific url spec. For example, a url producer can choose to use , as a reserved character. However, it is not obligated to use , as a reserved character. From RFC3986 - 2.2 Characters in the "reserved" set are not reserved in all contexts. The set of characters actually reserved within any given URI component is defined by that component. In general, a character is reserved if the semantics of the URI changes if the character is replaced with its escaped US-ASCII encoding. Some choices we made: The presence of ; and params in a path segment is handled differently in the different RFCs. It does seem clear, though, that ; is supposed to indicate the start of parameters. 
Hence we should escape ; so that if it appears in a url it is not
treated as parameters when it was not meant to be. At present we offer
no way for a user who actually wants to add parameters. That would
probably be done by extending encodePathInfo to be more like:

 encodePathInfo :: [(String, [Param])] -> String

The spec also forbids a path from starting with // if the scheme has no
authority. This library is currently only intended to be used with the
http scheme, so we do not have to worry about that rule, since the http
scheme does have an authority.

-}

{-| Builds the path (and optional query string) part of a URL from a list of
decoded path segments and a list of query parameters.

Each path segment is processed as follows:

 * It is UTF-8 encoded with 'encodeString'.

 * Every character that is neither unreserved nor one of the characters
   @:\@&=+$,@ is percent-encoded.

 * A segment that is exactly @.@ or @..@ (the dot-segments of RFC 3986) is
   emitted as @%2E@ or @%2E%2E@ instead.

The encoded segments are then joined with a slash, and the query string
produced from @qs@ is appended (nothing is appended when @qs@ is empty).

For example:

> encodePathInfo ["foo", "bar", "baz"] []   ==  "foo/bar/baz"
> encodePathInfo ["foo bar", "baz/bin"] []  ==  "foo%20bar/baz%2Fbin"
> encodePathInfo ["שלום"] []                ==  "%D7%A9%D7%9C%D7%95%D7%9D"
> encodePathInfo ["foo"] [("q", "a b")]     ==  "foo?q=a+b"

-}
encodePathInfo :: [String] -> [(String, String)] -> String
encodePathInfo pieces qs =
    intercalate "/" (map encodeSegment pieces) ++ paramsToQueryString qs
  where
    -- Encode one path segment. No delimiters have been added at this point,
    -- so every '/' seen here is data and gets escaped.
    encodeSegment :: String -> String
    encodeSegment = protectDots . escapeURIString isLiteral . encodeString

    -- Characters that may appear unescaped inside a path segment.
    isLiteral :: Char -> Bool
    isLiteral c = isUnreserved c || c `elem` ":@&=+$,"

    -- Only a segment that consists solely of "." or ".." is rewritten.
    protectDots :: String -> String
    protectDots "."  = "%2E"
    protectDots ".." = "%2E%2E"
    protectDots seg  = seg

{-| Performs the inverse of the path encoding done by 'encodePathInfo' (query
strings are not handled here).

In particular, this function:

 * Splits a string at each occurrence of a forward slash.

 * Percent-decodes the individual pieces.

 * UTF-8 decodes the resulting data.

This utilizes 'decodeString' from the utf8-string library, and thus all UTF-8
decoding errors are handled as per that library.

In general, you will want to strip the leading slash from a pathinfo before
passing it to this function. For example:

> decodePathInfo ""   ==  []
> decodePathInfo "/"  ==  [""]

-}
decodePathInfo :: String -> [String]
decodePathInfo = map decodeSegment . splitPaths
  where
    -- Undo the percent-encoding first, then interpret the resulting octets
    -- as UTF-8.
    decodeSegment :: String -> String
    decodeSegment = decodeString . unEscapeString

-- | Breaks a raw path into the pieces found between @\/@ characters.
--
-- The empty string yields no pieces. Otherwise a single leading slash (if
-- present) is dropped, the text up to the next slash becomes one piece, and
-- the remainder is split the same way. Hence @\"\/\"@ yields one empty
-- piece and @\"a\/\"@ yields @\"a\"@ followed by an empty piece.
splitPaths :: String -> [String]
splitPaths [] = []
splitPaths path = segment : splitPaths rest
  where
    (segment, rest) = break (== '/') (dropLeadingSlash path)

    dropLeadingSlash :: String -> String
    dropLeadingSlash ('/' : cs) = cs
    dropLeadingSlash cs         = cs

-- | Renders a list of key\/value pairs as a URL query string.
--
-- An empty list gives the empty string. Otherwise the result is a @?@
-- followed by the @key=value@ pairs separated by @&@. Within keys and
-- values a space is written as @+@; characters that are unreserved or one
-- of @:\@$,@ are kept as they are; every other character is passed to
-- 'escapeURIChar' for escaping.
paramsToQueryString :: [(String, String)] -> String
paramsToQueryString [] = ""
paramsToQueryString ps = '?' : concat (intersperse "&" (map renderPair ps))
  where
    renderPair :: (String, String) -> String
    renderPair (key, val) = escapeComponent key ++ ('=' : escapeComponent val)

    escapeComponent :: String -> String
    escapeComponent = concatMap escapeOne

    escapeOne :: Char -> String
    escapeOne ch = case ch of
      ' ' -> "+"
      _   -> escapeURIChar keepLiteral ch

    keepLiteral :: Char -> Bool
    keepLiteral ch = isUnreserved ch || (ch `elem` ":@$,")