------------------------------------------------------------------------------- -- | -- Module : Network.Api.Arxiv.hs -- Copyright : (c) Tobias Schoofs -- License : LGPL -- Stability : experimental -- Portability: portable -- -- The ArXiv API is split in two parts: -- Request and Response. -- The Request part contains -- a simple language to define queries, -- a query parser and some helpers to navigate -- through the results of a mutlipage query -- (which, in fact, induces a new query). -- -- The Response part contains -- an API to access the fields of the result -- based on TagSoup. -- -- This library does not contain functions -- to actually execute and manage http requests. -- It is intended to be used with existing -- http libraries such as http-conduit. -- An example how to use the ArXiv library -- with http-conduit is included in this documentation. ------------------------------------------------------------------------------- module Network.Api.Arxiv ( -- * Request -- $RequestOv baseUrl, apiUrl, apiQuery, Field(..), Expression(..), (/*/), (/+/), (/-/), -- * Expression Example -- $ExpExample -- * Queries Query(..), nextPage, parseQuery, preprocess, parseIds, mkQuery, exp2str, items2str, ids2str, itemControl, -- * Response -- $ResponseOv totalResults, startIndex, itemsPerPage, getEntry, forEachEntry, forEachEntryM, forEachEntryM_, checkForError, exhausted, getId, getIdUrl, getUpdated, getPublished, getYear, getTitle, getSummary, getComment, getJournal, getDoi, Link(..), getLinks, getPdfLink, getPdf, Category(..), getCategories, getPrimaryCategory, Author(..), getAuthors, getAuthorNames -- * A complete Example using http-conduit -- $CompleteExample ) where import Text.HTML.TagSoup import Text.Parsec import Data.Char (isDigit) import Data.Maybe (fromMaybe) import Data.List (find, intercalate) import qualified Data.List.Split as S import Control.Applicative ((<$>)) import Control.Monad (void) ------------------------------------------------------------------------ -- import Debug.Trace (trace) ------------------------------------------------------------------------ {- $ExpExample Expressions are intended to ease the construction of well-formed queries in application code. A simple example: > let au = Exp $ Au ["Knuth"] > t1 = Exp $ Ti ["The Art of Computer Programming"] > t2 = Exp $ Ti ["Concrete Mathematics"] > ex = au /*/ (t1 /+/ t2) > in ... -} {- $RequestOv Requests are URL parameters, either \"search_query\" or \"id_list\". This module provides functions to build and parse these parameters, to create the full request string and to navigate through a multi-page request with a maximum number of items per page. For details of the Arxiv request format, please refer to the Arxiv documentation. -} {- $ResponseOv Response processing expects [Tag String] as input (see TagSoup). The result produced by your http library (such as http-conduit) must be converted to [Tag String] before the result is passed to the response functions defined here (see also the example below). The response functions extract information from the tag soup, either in 'String', 'Int' or TagSoup format. For details of the Arxiv Feed format, please refer to the Arxiv documentation. -} {- $CompleteExample > module Main > where > > import qualified Network.Api.Arxiv as Ax > import Network.Api.Arxiv (Expression(..), > Field(..), (/*/), (/+/)) > import Network.Socket (withSocketsDo) > import Network.HTTP.Simple as HT > import Network.HTTP.Conduit (parseRequest) > import Network.HTTP.Types.Status > import Data.List (intercalate) > import qualified Data.ByteString as B hiding (unpack) > import qualified Data.ByteString.Char8 as B (unpack) > import Data.Conduit ((.|)) > import qualified Data.Conduit as C > import qualified Data.Conduit.List as CL > import Data.Function ((&)) > import Text.HTML.TagSoup > import Control.Monad.Trans (liftIO) > import Control.Monad.Trans.Resource (MonadResource) > import Control.Applicative ((<$>)) > > main :: IO () > main = withSocketsDo (execQuery makeQuery) > > makeQuery :: Ax.Query > makeQuery = > let au = Exp $ Au ["Aaronson"] > t1 = Exp $ Ti ["quantum"] > t2 = Exp $ Ti ["complexity"] > x = au /*/ (t1 /+/ t2) > in Ax.Query { > Ax.qExp = Just x, > Ax.qIds = [], > Ax.qStart = 0, > Ax.qItems = 25} > > type Soup = Tag String > > execQuery :: Ax.Query -> IO () > execQuery q = C.runConduitRes (searchAxv q .| outSnk) > > ---------------------------------------------------------------------- > -- Execute query and start a source > ---------------------------------------------------------------------- > searchAxv :: MonadResource m => Ax.Query -> C.ConduitT () String m () > searchAxv q = > let s = Ax.mkQuery q > in do rsp <- HT.httpBS =<< liftIO (parseRequest s) > case getResponseStatus rsp of > (Status 200 _) -> getSoup (getResponseBody rsp) > >>= results q > st -> error $ "Error:" ++ show st > > ---------------------------------------------------------------------- > -- Consume page by page > ---------------------------------------------------------------------- > getSoup :: MonadResource m => > B.ByteString -> C.ConduitT () String m [Soup] > getSoup b = concat <$> (C.yield b .| toSoup .| CL.consume) > > ---------------------------------------------------------------------- > -- Receive a ByteString and yield Soup > ---------------------------------------------------------------------- > toSoup :: MonadResource m => C.ConduitT B.ByteString [Soup] m () > toSoup = C.awaitForever (C.yield . parseTags . B.unpack) > > ---------------------------------------------------------------------- > -- Yield all entries and fetch next page > ---------------------------------------------------------------------- > results :: MonadResource m => > Ax.Query -> [Soup] -> C.ConduitT () String m () > results q sp = > if Ax.exhausted sp > then C.yield ("EOT: " ++ show (Ax.totalResults sp) ++ " results") > else Ax.forEachEntryM_ sp (C.yield . mkResult) > >> searchAxv (Ax.nextPage q) > > ---------------------------------------------------------------------- > -- Get data and format > ---------------------------------------------------------------------- > mkResult :: [Soup] -> String > mkResult sp = let aus = Ax.getAuthorNames sp > y = Ax.getYear sp > tmp = Ax.getTitle sp & clean ['\n', '\r', '\t'] > ti = if null tmp then "No title" else tmp > in intercalate ", " aus ++ " (" ++ y ++ "): " ++ ti > where clean _ [] = [] > clean d (c:cs) | c `elem` d = clean d cs > | otherwise = c:clean d cs > > ---------------------------------------------------------------------- > -- Sink results > ---------------------------------------------------------------------- > outSnk :: MonadResource m => C.ConduitT String C.Void m () > outSnk = C.awaitForever (liftIO . putStrLn) -} ------------------------------------------------------------------------ -- | The Arxiv base URL \"arxiv.org\" ------------------------------------------------------------------------ baseUrl :: String baseUrl = "arxiv.org" ------------------------------------------------------------------------ -- | The Arxiv API URL \"export.arxiv.org/api\" ------------------------------------------------------------------------ apiUrl :: String apiUrl = "https://export.arxiv.org/api/query?" ------------------------------------------------------------------------ -- | The query string (\"search_query=\" or \"id_list=\") ------------------------------------------------------------------------ apiQuery,apiIdList :: String apiQuery = "search_query=" apiIdList = "id_list=" ------------------------------------------------------------------------ -- | Field data type; -- a field consist of a field identifier -- (author, title, etc.) -- and a list of search terms. ------------------------------------------------------------------------ data Field = -- | Title Ti [Term] -- | Author | Au [Term] -- | Abstract | Abs [Term] -- | Comment | Co [Term] -- | Journal | Jr [Term] -- | Category | Cat [Term] -- | Report Number | Rn [Term] -- | Article identifier | Id [Term] -- | Any of the above | All [Term] deriving (Eq, Show) ------------------------------------------------------------------------ -- | A term is just a string ------------------------------------------------------------------------ type Term = String ------------------------------------------------------------------------ -- convert a field to a string ------------------------------------------------------------------------ field2str :: Field -> String field2str (Ti s) = "ti:" ++ terms2str s field2str (Au s) = "au:" ++ terms2str s field2str (Abs s) = "abs:" ++ terms2str s field2str (Co s) = "co:" ++ terms2str s field2str (Jr s) = "jr:" ++ terms2str s field2str (Cat s) = "cat:" ++ terms2str s field2str (Rn s) = "rn:" ++ terms2str s field2str (Id s) = "id:" ++ terms2str s field2str (All s) = "all:" ++ terms2str s ------------------------------------------------------------------------ -- convert a term to a string ------------------------------------------------------------------------ terms2str :: [Term] -> String terms2str = intercalate "+" . map term2str where term2str t = let x = intercalate "+" (words t) in if '+' `elem` x then "%22" ++ x ++ "%22" else x ------------------------------------------------------------------------ -- | Expression data type. -- An expression is either a field or a logical connection -- of two expressions using the basic operators -- AND, OR and ANDNOT. ------------------------------------------------------------------------ data Expression = -- | Just a field Exp Field -- | Logical \"and\" | And Expression Expression -- | Logical \"or\" | Or Expression Expression -- | Logical \"and . not\" | AndNot Expression Expression deriving (Eq, Show) ------------------------------------------------------------------------ -- | AND operator. -- The symbol was chosen because -- 0 * 1 = 0. ------------------------------------------------------------------------ infix /*/ (/*/) :: Expression -> Expression -> Expression (/*/) = And ------------------------------------------------------------------------ -- | OR operator. -- The symbol was chosen because -- 0 + 1 = 1. ------------------------------------------------------------------------ infix /+/ (/+/) :: Expression -> Expression -> Expression (/+/) = Or ------------------------------------------------------------------------ -- | ANDNOT operator. -- The symbol was chosen because -- 1 - 1 = 0 and 1 - 0 = 1. ------------------------------------------------------------------------ infix /-/ (/-/) :: Expression -> Expression -> Expression (/-/) = AndNot ------------------------------------------------------------------------ -- | Create a query string from an expression. -- Note that we create redundant parentheses, -- for instance \"a AND b OR c\" will be encoded as -- \"a+AND+%28b+OR+c%29\". -- The rationale is that the API specification is not clear -- on how expressions are parsed. -- The above expression could be understood as -- \"a AND (b OR c)\" or as \"(a AND b) OR c\". -- To avoid confusion, one should always use parentheses -- to group boolean expressions - even if some of these parentheses -- appear to be redundant under one or the other parsing strategy. ------------------------------------------------------------------------ exp2str, innerExp2str :: Expression -> String exp2str (Exp f) = field2str f exp2str (And e1 e2) = innerExp2str e1 ++ "+AND+" ++ innerExp2str e2 exp2str (Or e1 e2) = innerExp2str e1 ++ "+OR+" ++ innerExp2str e2 exp2str (AndNot e1 e2) = innerExp2str e1 ++ "+ANDNOT+" ++ innerExp2str e2 innerExp2str (Exp f) = exp2str (Exp f) innerExp2str e = "%28" ++ exp2str e ++ "%29" type Identifier = String ------------------------------------------------------------------------ -- | Query data type. -- -- You usually want to create a query like: -- -- > let e = (Exp $ Au ["Aaronson"]) /*/ ( -- > (Exp $ Ti ["quantum"]) /+/ -- > (Exp $ Ti ["complexity"])) -- > in Query { -- > qExp = Just e, -- > qIds = ["0902.3175v2","1406.2858v1","0912.3825v1"], -- > qStart = 0, -- > qItems = 10} ------------------------------------------------------------------------ data Query = Query { -- | The query expression qExp :: Maybe Expression, -- | Id List qIds :: [Identifier], -- | The first item we want to see qStart :: Int, -- | The number of items we want to see qItems :: Int} deriving (Eq, Show) ------------------------------------------------------------------------- -- | Prepares the query to fetch -- the next page adding \"items per page\" to \"start index\". ------------------------------------------------------------------------- nextPage :: Query -> Query nextPage q = let s = qStart q i = qItems q in q{qStart = s + i} ------------------------------------------------------------------------- -- | Checks whether the query is exhausted or not, i.e. -- whether all pages have been fetched already. -- The first argument is the entire response (not just a part of it). ------------------------------------------------------------------------- exhausted :: [Tag String] -> Bool exhausted sp = startIndex sp >= totalResults sp ------------------------------------------------------------------------- -- | Parses an expression from a string. -- Please refer to the Arxiv documentation for details -- on query format. -- -- Just a minor remark here: -- The operators OR, AND and ANDNOT are case sensitive. -- \"andnot\" would be interpreted as part of a title, for instance: -- \"ti:functional+andnot+object+oriented\" is just one title; -- \"ti:functional+ANDNOT+object+oriented\" would cause an error, -- because a field identifier (ti, au, etc.) is expected after -- \"+ANDNOT+\". -- -- The other way round: the field content itself -- is not case sensitive, i.e. -- \"ti:complexity\" or \"au:aaronson\" is the same as -- \"ti:Complexity\" and \"au:Aaronson\" respectively. -- This is a feature of the very arXiv API. -- -- You may want to refer to the comments under -- 'preprocess' and 'exp2str' for some more details -- on our interpretation of the Arxiv documentation. ------------------------------------------------------------------------- parseQuery :: String -> Either String Expression parseQuery s = case parse expression "" $ preprocess s of Left e -> Left $ show e Right e -> Right e ------------------------------------------------------------------------- -- | Converts a string containing comma-separated identifiers -- into a list of 'Identifier's. -- As stated already: No whitespace! ------------------------------------------------------------------------- parseIds :: String -> [Identifier] parseIds = S.endBy "," ------------------------------------------------------------------------- -- | This is an internal function used by 'parseQuery'. -- It may be occasionally useful for direct use: -- It replaces \" \", \"(\", \")\" and \"\"\" -- by \"+\", \"%28\", \"%29\" and \"%22\" -- respectively. -- -- Usually, these substitutions are performed -- when transforming a string to an URL, which should be done -- by your http library anyway (e.g. http-conduit). -- But this step is usually after parsing has been performed -- on the input string. (Considering a work flow like: -- parseQuery >>= mkQuery >>= parseUrl >>= execQuery.) -- The parser, however, accepts -- only the URL-encoded characters and, thus, some preprocessing -- may be necessary. -- -- The other way round, this means -- that you may use parentheses, spaces and quotation marks -- instead of the URL encodings. -- But be careful! Do not introduce two successive spaces - -- we do not check for whitespace! ------------------------------------------------------------------------- preprocess :: String -> String preprocess = concatMap s2s . map tos where s2s "(" = "%28" s2s ")" = "%29" s2s "\"" = "%22" s2s " " = "+" s2s c = c tos c = [c] ------------------------------------------------------------------------- -- | Generates the complete query string -- including URL, -- query expression, -- id list and -- item control ------------------------------------------------------------------------- mkQuery :: Query -> String mkQuery q = apiUrl ++ qry ++ x ++ plus ++ apiIdList ++ is ++ itm where x = case qExp q of Nothing -> "" Just e -> exp2str e plus = case qExp q of Nothing -> "" Just _ -> "&" qry = case qExp q of Nothing -> "" Just _ -> apiQuery is = ids2str $ qIds q itm = items2str q ------------------------------------------------------------------------- -- | Converts a list of 'Identifier' -- to a string with comma-separated identifiers ------------------------------------------------------------------------- ids2str :: [Identifier] -> String ids2str = foldr i2s "" where i2s i [] = i i2s i s = i ++ "," ++ s ------------------------------------------------------------------------- -- | Converts the query to a string containing only the item control ------------------------------------------------------------------------- items2str :: Query -> String items2str q = itemControl (qStart q) (qItems q) ------------------------------------------------------------------------- -- | Generates the item control of a query string according -- to first item and results per page: -- -- * 'Int': Start index for this page -- -- * 'Int': Number of results per page. ------------------------------------------------------------------------- itemControl :: Int -> Int -> String itemControl s m = "&start=" ++ show s ++ "&max_results=" ++ show m -- ====================================================================== -- result -- ====================================================================== ------------------------------------------------------------------------ -- | Total results of the query ------------------------------------------------------------------------ totalResults :: [Tag String] -> Int totalResults = getN "opensearch:totalResults" ------------------------------------------------------------------------ -- | Start index of this page of results ------------------------------------------------------------------------ startIndex :: [Tag String] -> Int startIndex = getN "opensearch:startIndex" ------------------------------------------------------------------------ -- | Number of items per page ------------------------------------------------------------------------ itemsPerPage :: [Tag String] -> Int itemsPerPage = getN "opensearch:itemsPerPage" ------------------------------------------------------------------------ -- | Checks if the feed contains an error message, i.e. -- -- * it has only one entry, -- -- * the title of this entry is \"Error\" and -- -- * its id field contains an error message, -- which is returned as 'Left'. -- -- Apparently, this function is not necessary, -- since the Arxiv site returns error feeds -- with status code 400 ("bad request"), -- which should be handled by your http library anyway. ------------------------------------------------------------------------ checkForError :: [Tag String] -> Either String () checkForError ts = case totalResults ts of 1 -> head $ forEachEntry ts $ \e -> if getTitle e == "Error" then Left $ getError e else Right () _ -> Right () ------------------------------------------------------------------------ -- | Get the first entry in the tag soup. -- The function returns a tuple of -- -- * The entry (if any) -- -- * The rest of the tag soup following the first entry. ------------------------------------------------------------------------ getEntry :: [Tag String] -> ([Tag String],[Tag String]) getEntry = element "entry" ------------------------------------------------------------------------ -- | Loop through all entries in the result feed -- applying a function on each one. -- The results are returned as list. -- The function is similar to 'map' -- with the arguments swapped (as in Foldable 'forM'). -- -- Arguments: -- -- * ['Tag' 'String']: The TagSoup through which we are looping -- -- * ['Tag' 'String'] -> r: The function we are applying per entry; -- the TagSoup passed in to the function -- represents the current entry. -- -- Example: -- -- > forEachEntry soup $ \e -> -- > let y = case getYear e of -- > "" -> "s.a." -- > x -> x -- > a = case getAuthorNames e of -- > [] -> "Anonymous" -- > as -> head as ++ -- > in a ++ " (" ++ y ++ ")" -- -- Would retrieve the name of the first author -- and the year of publication (like "Aaronson (2013)") -- from all entries. ------------------------------------------------------------------------ forEachEntry :: [Tag String] -> ([Tag String] -> r) -> [r] forEachEntry = forEach "entry" ------------------------------------------------------------------------ -- | Variant of 'forEachEntry' for monadic actions. ------------------------------------------------------------------------ forEachEntryM :: Monad m => [Tag String] -> ([Tag String] -> m r) -> m [r] forEachEntryM = forEachM "entry" ------------------------------------------------------------------------ -- | Variant of 'forEachEntryM' for actions -- that do not return a result. ------------------------------------------------------------------------ forEachEntryM_ :: Monad m => [Tag String] -> ([Tag String] -> m ()) -> m () forEachEntryM_ = forEachM_ "entry" ------------------------------------------------------------------------ -- | Gets the full contents of the id field -- (which contains an URL before the article identifier). -- The [Tag String] argument is expected to be a single entry. ------------------------------------------------------------------------ getIdUrl :: [Tag String] -> String getIdUrl = getString "id" ------------------------------------------------------------------------ -- | Gets the article identifier as it can be used -- in an \"id_list\" query, i.e. without the URL. -- The [Tag String] argument is expected to be a single entry. ------------------------------------------------------------------------ getId :: [Tag String] -> String getId = pureId . getString "id" ------------------------------------------------------------------------ -- Extract the pure article identifier from the id string ------------------------------------------------------------------------ pureId :: String -> String pureId s = let i = toSlash 2 (reverse s) z = drop 6 i in case z of "" -> reverse i '.':_ -> reverse $ toSlash 1 i _ -> reverse i where toSlash :: Int -> String -> String toSlash i m = let x = takeWhile (/= '/') m in if i == 1 then x else x ++ ('/' : toSlash (i-1) (drop (length x + 1) m)) ------------------------------------------------------------------------ -- Get the error message ------------------------------------------------------------------------ getError :: [Tag String] -> String getError = pureError . getString "id" ------------------------------------------------------------------------ -- Extract the pure error message from the id string ------------------------------------------------------------------------ pureError :: String -> String pureError = drop 1 . dropWhile (/= '#') ------------------------------------------------------------------------ -- | Gets the contents of the \"updated\" field in this entry, i.e. -- the date when the article was last updated. -- Be aware that there is another \"updated\" field -- right below the root node of the result. -- Make sure your are operating on an entry, -- not on the root node! ------------------------------------------------------------------------ getUpdated :: [Tag String] -> String getUpdated = getString "updated" ------------------------------------------------------------------------ -- | Gets the contents of the \"published\" field in this entry, i.e. -- the date when the article was last uploaded. ------------------------------------------------------------------------ getPublished :: [Tag String] -> String getPublished = getString "published" ------------------------------------------------------------------------ -- | Gets the year of the \"published\" field in this entry. ------------------------------------------------------------------------ getYear :: [Tag String] -> String getYear sp = case getPublished sp of "" -> "s.a." s -> takeWhile (/= '-') s ------------------------------------------------------------------------ -- | Gets the title of this entry. ------------------------------------------------------------------------ getTitle :: [Tag String] -> String getTitle = getString "title" ------------------------------------------------------------------------ -- | Gets the summary of this entry. ------------------------------------------------------------------------ getSummary :: [Tag String] -> String getSummary = getString "summary" ------------------------------------------------------------------------ -- | Gets author''s comment (in \"arxiv:comment\") of this entry. ------------------------------------------------------------------------ getComment :: [Tag String] -> String getComment = getString "arxiv:comment" ------------------------------------------------------------------------ -- | Gets the journal information (in \"arxiv:journal_ref\") -- of this entry. ------------------------------------------------------------------------ getJournal :: [Tag String] -> String getJournal = getString "arxiv:journal_ref" ------------------------------------------------------------------------ -- | Gets the digital object identifier (in \"arxiv:doi\") -- of this entry. ------------------------------------------------------------------------ getDoi :: [Tag String] -> String getDoi = getString "arxiv:doi" ------------------------------------------------------------------------ -- | The Link data type ------------------------------------------------------------------------ data Link = Link { -- | The hyperlink lnkHref :: String, -- | The link type (a MIME type) lnkType :: String, -- | The link title (e.g. \"pdf\" would be the link -- where we find the article -- in pdf format) lnkTitle :: String, -- | the link relation (e.g. \"related\" would point -- to the related information, -- such as the pdf document) lnkRel :: String} deriving (Show, Eq) ------------------------------------------------------------------------ -- | Gets all links in the entry. ------------------------------------------------------------------------ getLinks :: [Tag String] -> [Link] getLinks soup = case element "link" soup of ([],_) -> [] (x:_,[]) -> [mkLink x] (x:_,rs) -> mkLink x : getLinks rs where mkLink l = Link { lnkHref = fromMaybe "" $ getAt "href" l, lnkTitle = fromMaybe "" $ getAt "title" l, lnkRel = fromMaybe "" $ getAt "rel" l, lnkType = fromMaybe "" $ getAt "type" l} ------------------------------------------------------------------------ -- | Gets only the pdf link of this entry (if any). ------------------------------------------------------------------------ getPdfLink :: [Tag String] -> Maybe Link getPdfLink soup = case getLinks soup of [] -> Nothing ls -> find (\l -> lnkTitle l == "pdf") ls ------------------------------------------------------------------------ -- | Gets the hyperlink to the pdf document of this entry (if any). ------------------------------------------------------------------------ getPdf :: [Tag String] -> String getPdf soup = case getPdfLink soup of Nothing -> "" Just l -> lnkHref l ------------------------------------------------------------------------ -- | Category data type ------------------------------------------------------------------------ data Category = Category { -- | The category term (e.g. \"math-ph\") catTerm :: String, -- | The category scheme catScheme :: String} deriving (Show, Eq) ------------------------------------------------------------------------ -- Make category from TagSoup ------------------------------------------------------------------------ mkCat :: Tag String -> Category mkCat c = Category { catTerm = fromMaybe "" $ getAt "term" c, catScheme = fromMaybe "" $ getAt "scheme" c} ------------------------------------------------------------------------ -- | Gets the categories of this entry. ------------------------------------------------------------------------ getCategories :: [Tag String] -> [Category] getCategories soup = case element "category" soup of ([],_) -> [] (x:_,[]) -> [mkCat x] (x:_,rs) -> mkCat x : getCategories rs ------------------------------------------------------------------------ -- | Gets the primary category of this entry (if any). ------------------------------------------------------------------------ getPrimaryCategory :: [Tag String] -> Maybe Category getPrimaryCategory soup = case element "arxiv:primary_category" soup of ([],_) -> Nothing (x:_,_) -> Just (mkCat x) ------------------------------------------------------------------------ -- | The Author data type ------------------------------------------------------------------------ data Author = Author { -- | Author name auName :: String, -- | Author Affiliation auFil :: String} deriving (Show, Eq) ------------------------------------------------------------------------ -- | Gets the authors of this entry. ------------------------------------------------------------------------ getAuthors :: [Tag String] -> [Author] getAuthors soup = case element "author" soup of ([],_) -> [] (xs,rs) -> mkAut xs : getAuthors rs where mkAut au = let nm = case element "name" au of ([],_) -> "" (n,_) -> findTxt n fl = case element "arxiv:affiliation" au of ([],_) -> "" (a,_) -> findTxt a in Author { auName = nm, auFil = fl} ------------------------------------------------------------------------ -- | Gets the names of all authors of this entry. ------------------------------------------------------------------------ getAuthorNames :: [Tag String] -> [String] getAuthorNames = go where go s = case element "author" s of ([],[]) -> [] (a,[]) -> [getString "name" a] (a,r) -> getString "name" a : go r ------------------------------------------------------------------------ -- Lookup attribute by name ------------------------------------------------------------------------ getAt :: String -> Tag String -> Maybe String getAt a (TagOpen _ as) = lookup a as getAt _ _ = Nothing ------------------------------------------------------------------------ -- Find a 'TagText' and return the content ------------------------------------------------------------------------ getString :: String -> [Tag String] -> String getString n soup = let (i,_) = element n soup in if null i then "" else findTxt i ------------------------------------------------------------------------ -- Find a 'TagText' and return the contentas a 'Int'. -- If the tag is not found or the content is not a number, -- -1 is returned. ------------------------------------------------------------------------ getN :: String -> [Tag String] -> Int getN key soup = case element key soup of (k,_) -> case findTxt k of "" -> -1 t -> if all isDigit t then read t else -1 ------------------------------------------------------------------------ -- Get the content of a 'TagText' ------------------------------------------------------------------------ findTxt :: [Tag String] -> String findTxt [] = "" findTxt (t:ts) = case t of TagText x -> x _ -> findTxt ts ------------------------------------------------------------------------ -- Map a function to all occurences of an element in the soup ------------------------------------------------------------------------ forEach :: String -> [Tag String] -> ([Tag String] -> r) -> [r] forEach nm soup f = case element nm soup of ([],_) -> [] (e,rs) -> f e : forEach nm rs f ------------------------------------------------------------------------ -- Variant of forEach for monadic actions ------------------------------------------------------------------------ forEachM :: Monad m => String -> [Tag String] -> ([Tag String] -> m r) -> m [r] forEachM nm soup f = case element nm soup of ([],_) -> return [] (e,rs) -> do r <- f e rr <- forEachM nm rs f return (r:rr) ------------------------------------------------------------------------ -- Variant of forEachM for actions that do not return anything ------------------------------------------------------------------------ forEachM_ :: Monad m => String -> [Tag String] -> ([Tag String] -> m ()) -> m () forEachM_ nm soup f = case element nm soup of ([],_) -> return () (e,rs) -> f e >> forEachM_ nm rs f ------------------------------------------------------------------------ -- Find occurrence of an element and -- return this element (open tag to close tag) and -- the rest of the soup behind the element. ------------------------------------------------------------------------ element :: String -> [Tag String] -> ([Tag String], [Tag String]) element _ [] = ([],[]) element nm (t:ts) | isTagOpenName nm t = let (r,rs) = closeEl 0 ts in (t:r,rs) | otherwise = element nm ts where closeEl :: Int -> [Tag String] -> ([Tag String], [Tag String]) closeEl _ [] = ([],[]) closeEl i (x:xs) = go i (isTagCloseName nm x) x xs go i b x xs | b && i == 0 = ([x],xs) | b && i > 0 = let (r,rs) = closeEl (i-1) xs in (x:r,rs) | isTagOpenName nm x = let (r,rs) = closeEl (i+1) xs in (x:r,rs) | otherwise = let (r,rs) = closeEl i xs in (x:r,rs) ------------------------------------------------------------------------ -- Expression Parser ------------------------------------------------------------------------ type Parser a = Parsec String () a ------------------------------------------------------------------------ -- Expression is something in parentheses or something -- that starts with a field ------------------------------------------------------------------------ expression :: Parser Expression expression = try parentheses <|> fieldOperator ------------------------------------------------------------------------ -- A field potentially followed by an operator ------------------------------------------------------------------------ fieldOperator :: Parser Expression fieldOperator = do f <- field c <- try (char '+') <|> return ' ' if c == ' ' then return f else opAndArg f ------------------------------------------------------------------------ -- Find an operator and an expression ------------------------------------------------------------------------ opAndArg :: Expression -> Parser Expression opAndArg f = do o <- op void $ char '+' e <- expression return (o f e) ------------------------------------------------------------------------ -- A field consists of a fieldId and a list of terms ------------------------------------------------------------------------ field :: Parser Expression field = do i <- fieldId ts <- terms return (Exp $ i ts) ------------------------------------------------------------------------ -- The field ids ------------------------------------------------------------------------ fieldId :: Parser ([Term] -> Field) fieldId = try (void (string "au:" ) >> return Au) <|> try (void (string "ti:" ) >> return Ti) <|> try (void (string "abs:") >> return Abs) <|> try (void (string "co:" ) >> return Co) <|> try (void (string "cat:") >> return Cat) <|> try (void (string "jr:" ) >> return Jr) <|> try (void (string "rn:" ) >> return Rn) <|> try (void (string "id:" ) >> return Id) <|> (void (string "all:") >> return All) ------------------------------------------------------------------------ -- A term may be quoted, -- otherwise we build terms as list of strings -- separated by '+' ------------------------------------------------------------------------ terms :: Parser [String] terms = do t <- try quoted <|> term c <- try (lookAhead anyChar) <|> onEof '%' --ugly case c of '%' -> return [t] '+' -> do x <- isOp if x then return [t] else void (char c) >> (t:) <$> terms _ -> fail $ "unexpected symbol: '" ++ [c] ++ "'" ------------------------------------------------------------------------ -- Checks if an operator follows without consuming input ------------------------------------------------------------------------ isOp :: Parser Bool isOp = try (void (lookAhead (string "+ANDNOT+")) >> return True) <|> try (void (lookAhead (string "+AND+")) >> return True) <|> try (void (lookAhead (string "+OR+")) >> return True) <|> return False ------------------------------------------------------------------------ -- A quoted term ------------------------------------------------------------------------ quoted :: Parser String quoted = do void $ string "%22" intercalate "+" <$> go where go = do t <- term s <- try (string "%22") <|> return "" if not (null s) then return [t] else do c <- anyChar let t' = if c == '+' then t else t ++ [c] (t':) <$> go ------------------------------------------------------------------------ -- A single term ------------------------------------------------------------------------ term :: Parser String term = do c <- try (lookAhead anyChar) <|> onEof '%' if c `elem` "%+" then return "" else do x <- char c (x:) <$> term ------------------------------------------------------------------------ -- Signal EOF by returning the specified char ------------------------------------------------------------------------ onEof :: Char -> Parser Char onEof c = eof >> return c ------------------------------------------------------------------------ -- An expression in parentheses, -- which may be followed by another expression ------------------------------------------------------------------------ parentheses :: Parser Expression parentheses = do void $ string "%28" e <- expression void $ string "%29" c <- try (char '+') <|> try (lookAhead (char '%')) <|> onEof '.' case c of '+' -> opAndArg e _ -> return e ------------------------------------------------------------------------ -- Parse operator (note that it is essential to -- to process "ANDNOT" before "AND"! ------------------------------------------------------------------------ op :: Parser (Expression -> Expression -> Expression) op = try (void (string "ANDNOT") >> return AndNot) <|> try (void (string "OR") >> return Or) <|> (void (string "AND") >> return And)