module Text.WikimediaParser where import Text.ParserCombinators.Parsec import Data.List (intercalate) data WikiMarkup = Text String | Link {text::String, target::String} deriving (Eq) instance Show WikiMarkup where show (Text t) = t show (Link t _) = t data Page = Page [WikiMarkup] deriving (Eq) instance Show Page where show (Page xs) = foldl (\x y -> x ++ (show y)) "" xs generalParseLink p = between (string "[[") (string "]]") p isNot c = satisfy (/= c) betweenMatching :: Char -> GenParser Char st [Char] betweenMatching c = do open <- many1 $ char c content <- (many $ isNot c) count (length open) (char c) return content parseLink :: Parser WikiMarkup parseLink = generalParseLink linkContent where linkContent = do target <- optionMaybe (try $ manyTill anyToken (char '|')) text <- many1 (noneOf "[]") return $ Link text (maybe text id target) parseNoWiki :: Parser WikiMarkup parseNoWiki = between (string "") (string "") parseText parseCategory :: Parser WikiMarkup parseCategory = generalParseLink linkContent where linkContent = do option "" (string ":") string "Catégorie" many (noneOf "]") return $ Text "" eol = char '\n' notEol = noneOf "\r\n" parseUnorderedListItem = do char '*' many notEol eol parseUnorderedList :: Parser WikiMarkup parseUnorderedList = do many1 parseUnorderedListItem return $ Text "" parseOrderedListItem = do char '#' many notEol eol parseOrderedList :: Parser WikiMarkup parseOrderedList = do many1 parseUnorderedListItem return $ Text "" parseLinkNameSpace :: Parser WikiMarkup parseLinkNameSpace = generalParseLink linkContent where linkContent = do manyTill anyToken (char ':') text <- many (noneOf "[]") return $ Text text parseLinkOtherLanguage :: Parser WikiMarkup parseLinkOtherLanguage = generalParseLink linkContent where linkContent = do anyToken anyToken option "" (string "mple") char ':' many (noneOf "[]") return $ Text "" parseUnnamedLink :: Parser WikiMarkup parseUnnamedLink = do between (char '[') (char ']') (many $ noneOf "]") return $ Text "" parseHTMLComment :: Parser WikiMarkup parseHTMLComment = do string "") anyToken return $ Text "" parseHTML = do char '<' tagName <- many $ noneOf " >" options <- option "" $ manyTill anyToken (char '>') content <- nest string "" return $ intercalate "," $ [tagName, options, content, endTag] where nest = do text <- many $ isNot '<' case text of "" -> do return "" _ -> do html <- option "" parseHTML case html of "" -> return text _ -> do next <- nest case next of "" -> return $ text ++ html _ -> return $ text ++ html ++ next parseHeading :: Parser WikiMarkup parseHeading = do try $ betweenMatching '=' return $ Text "" parseBrackets :: Parser WikiMarkup parseBrackets = do between (string "{{") (string "}}") (many (noneOf "}")) return $ Text "" symbols :: String symbols = "[{=*<" symbol :: Parser Char symbol = oneOf symbols symbolW :: Parser WikiMarkup symbolW = symbol >>= (return . Text . (\a->[a])) notSymbol = noneOf symbols parseSymbols = try parseCategory <|> try parseLink <|> try parseUnnamedLink <|> try parseBrackets <|> try parseHeading <|> try parseUnorderedList <|> try parseOrderedList <|> try parseHTMLComment <|> symbolW parseText :: Parser WikiMarkup parseText = do text <- many1 notSymbol return $ Text text parseLine :: Parser WikiMarkup parseLine = parseSymbols <|> parseText parseArticle :: Parser [WikiMarkup] parseArticle = do vals <- many parseLine eof return vals concatText :: [WikiMarkup] -> [WikiMarkup] concatText (Text x: Text y:xs) = Text (x ++ y) : concatText xs concatText [] = [] concatText (x:xs) = x : concatText xs