{-| A parser for the medaiiwki grammar -}
module MediaWikiParser where
import Text.ParserCombinators.Parsec
import qualified Data.List as List
import qualified Data.Map as Map
import MediaWikiParseTree
import MagicStrings
import Control.Monad
import Data.String.HT (trim)
import Data.List.Split (splitOn)
import Data.List (isInfixOf)
import Tools
import Codec.Binary.UTF8.String
import Data.Maybe
import Network.URI
import WikiHelper
{-| flattens out the HTML 'a' tags. That is it replaces each 'a' element with its content. That is everything that is between its opening and its closing tag. The only parameter of this function is the parse tree to be processed. This function returns the parse tree with the 'a' HTML elements flattened |-}
reducea :: [Anything Char] -> [Anything Char]
reducea ll = concat (map go ll)
where go :: Anything Char -> [Anything Char]
go (Environment Tag (TagAttr "a" _) l) = l
go (Environment x y l) = [Environment x y (reducea l)]
go x = [x]
{-| flattens out the HTML 'div' tags, which have got a 'class' attributes present with the value 'noresize'. That is it replaces each 'div' element with the properties mentioned above by its content. That is everything that is between its opening and its closing tag. The only parameter of this function is the parse tree to be processed. This function returns the parse tree with the 'div' HTML elements with the properties given above flattened |-}
reducediv :: [Anything Char] -> [Anything Char]
reducediv ll = concat (map go ll)
where go :: Anything Char -> [Anything Char]
go (Environment Tag (TagAttr "div" m) l)
| (Map.lookup "class" m) == (Just "noresize") = l
go (Environment x y l) = [Environment x y (reducea l)]
go x = [x]
{-| a function to get HTML elements out of a parse tree. The first parameter is name of the tag to be looked for. The second parameter is a key in the attributes of that element that has to be present for the element to be considered. The third parameter is a value that has to be found under the given key in the attributes of the element, in order for the element to be part of the returned output list. The fourth parameter is the parse tree. |-}
deepGet ::
[Char] -> String -> [Char] -> [Anything a] -> [Anything a]
deepGet t k v ll = concat $ map go ll
where go (Environment Tag (TagAttr tag m) l)
| (tag == t) && ((Map.lookup k m) == (Just v)) =
[Environment Tag (TagAttr tag m) l]
go (Environment _ _ l) = (deepGet t k v l)
go _ = []
{-| flattens a part of the parse tree, that is takes the characters found in the tree and turns them into a string dropping all other information in the tree -}
deepFlatten :: [Anything t] -> [Anything t]
deepFlatten ll = concat $ map go ll
where go (Environment HtmlChar s l) = [Environment HtmlChar s l]
go (Environment _ _ l) = (deepFlatten l)
go x = [x]
{-| converts a wiki source document to a parse tree to be converted to LaTeX be treeToLaTeX3. The first parameter is that list of parsers. That is the list of environments to be recognized by the parser. This is usually either only plain HTML, or HTML mixed with mediawiki markup. The second parameter is the source code to be parsed. This function returns a parse tree |-}
parseit :: [MyParser Char] -> String -> [Anything Char]
parseit pp x
= (parseit2
(decon2 (remake pp)
(parseAnything2
[MyStackFrame{endparser = pzero, startdata = Str "",
environment = Root, badparser = \ _ -> pzero, parsernumber = 0,
nestingdepth = 0}]
(remake pp)
[]))
('\n' : x))
{-| helper function of parseit, not to be called directly. This function takes the parser for the grammar, in the sense of a parser of the parsec library, (so that is the final combined parser) as first argument. It takes the source code to be parsed (usually HTML of mediawiki markup mixed with HTML) as second and runs the parser on the source code. It returns the resulting parse tree. |-}
parseit2 :: Parser [Anything Char] -> String -> [Anything Char]
parseit2 p input
= case (parse p "" input) of
Left _ -> []
Right x -> x
{-| A parser for one particular element of the mediawiki grammar -}
data MyParser tok = MyParser{bad ::
[Anything tok] -> GenParser tok () (),
start :: MyStack tok -> GenParser tok () StartData,
end :: StartData -> GenParser tok () (), allowed :: [EnvType],
self :: EnvType,
modify :: StartData -> [Anything tok] -> [Anything tok],
reenv :: EnvType -> EnvType}
{-| the stack of the parser. See documentation on MyStackFrame in this module for details. -}
type MyStack tok = [MyStackFrame tok]
{-| A stack frame on the parsers stack. A stack frame represents an environment that was opened. So something like an opening HTML tag. The value endparser. Is a parser that should match exactly the closing bracket of the environment. The value startdata is the return value of the start parser of the enviroment. This stack frame is created immediately after the startparser of an environment has matches and is given the return value of that startparser as startdata. The value environment is the environment this stack frame belongs to in case of an HTML tag this would be Tag. See the type EnvType in the module MediaWikiParseTree for a full list of possible environments. In the parse tree that is finally generated each node with arbitrarily nested children has got an EnvType associated with it. The badparser is a parser that is repeatedly tries while processing the current environment, if it matches the current environment is considered to be invalid. Backtracking occurs an the characters currently under consideration are parser (possible very) different manner. So with badparser you can signal that an environment is invalid if a creating parser (the badparser) matches within the environment. The parserenumber is just a number that uniquely identifies each parsers in the list of parsers active for the whole parsing process. These numbers are usually generated by the remake function. The nestingdepth is a bit of a misnomer. It is a unique number for each stack frame. So each stack frame that is newly created gets a different number. -}
data MyStackFrame tok = MyStackFrame{endparser ::
GenParser tok () (),
startdata :: StartData, environment :: EnvType,
badparser :: [Anything tok] -> GenParser tok () (),
parsernumber :: Int, nestingdepth :: Int}
{-| takes a result returned by parseAnything3 and converts it into a parse tree for further processing. The only purpose of this function is to convert the notation of bracket. The bracket a denoted by the Open and Close parse tree elements of the type Anything be the function parseAnything3. The need to be converted to environments, that is node with children in the parse tree. The environments will be denoted by the 'Environment' data constructors of the type Anything|-}
decon2 ::
(Monad m) =>
[(a1, MyParser a)] -> m (t, [Anything a]) -> m [Anything a]
decon2 l x
= do (_, s) <- x
return (findMatchingBrackets l (reverse s))
{-| Usually bracket can be close in an order different from the reverse one in which they were opened. But for certain environments this is not allowed, and the order has to be strictly followed. This value is the list of those environments. -}
preserving :: [EnvType]
preserving
= [Math, Source, Comment, Gallery, NoWiki, NoInclude, BigMath,
Preformat, TableCap, TableRowSep, TableColSep, TableHeadColSep,
TemplateInside, Wikitable, TableTag]
{-| Helper function for parseAnyClosingBracket. Should not be called directly. The only parameter is the current parser stack. Returns the depth on the stack of the stack frame whose closing bracket matched. -}
parseAnyClosingBracket2 ::
(Show tok, Eq tok, Read tok) =>
MyStack tok -> GenParser tok () Integer
parseAnyClosingBracket2
= (parseAnyClosingBracket3 0) . (List.map (\ x -> endparser x))
{-| Helper function for parseAnyClosingBracket. Should not be called directly. The this will take stack frame by stack frame of the stack. The first parameter is an integer and indicates how many stack frames have allready been take of the stack. Returns the depth on the stack of the stack frame whose closing bracket matched. -}
parseAnyClosingBracket3 ::
Integer -> [GenParser tok () ()] -> GenParser tok () Integer
parseAnyClosingBracket3 i (x : xs)
= try
(do x
return i)
<|> (parseAnyClosingBracket3 (i + 1) xs)
parseAnyClosingBracket3 _ [] = pzero
{-| Remove the n'th elements from a list. n is an integer an given is first parameter. The list to be processed is given as second parameter -}
myremove :: Integer -> [a] -> [a]
myremove _ [] = []
myremove 0 (_ : xs) = myremove (-1) xs
myremove i (x : xs) = x : (myremove (i - 1) xs)
{-| Enumerates a list of parsers. needed to prepare a list of parsers for use with parseAnything2 -}
remake :: [a] -> [(Int, a)]
remake x = zip (iterate (+ 1) 0) x
{-| predicate to test whether the current stack-frame-index is in a stack. The first parameter is the stack-frame-index the second parameter is the stack. Returns true if it could be found -}
isin ::
(Show tok, Eq tok, Read tok) => Int -> (MyStack tok) -> Bool
isin i s = i `elem` (List.map nestingdepth s)
{-| tries to parse exactly one specific opening bracket. The parameters are identical to the ones of parseAnyOpeningBracket, which the exception of the second parameter. The second parameter is that parser for the bracket currently under consideration. This function 'catch' the BBad 'exception' 'thrown' by parseAnything. In this case it returns pzero, causing the parser to backtrack. -}
parseSpecificOpeningBracket ::
(Show tok, Eq tok, Read tok) =>
Int ->
(Int, MyParser tok) ->
(MyStack tok) ->
[(Int, MyParser tok)] ->
[Anything tok] ->
GenParser tok () (Either2 (MyStack tok, [Anything tok]))
parseSpecificOpeningBracket v (n, x) s l i
= do r <- do sd <- try (start x s)
parseAnything (v + 1)
(MyStackFrame{endparser = (end x) sd, startdata = sd,
environment = self x, badparser = bad x, parsernumber = n,
nestingdepth = v}
: s)
l
((Open (length s) (self x) sd n) : i)
case r of
BBad (ss, y) -> if isin v ss then pzero else return (BBad (ss, y))
_ -> return r
{-| tried to parse any of the opening brackets given by the parsers passed as the third parameter. The first parameter is the stack number (see documentation of the parseAnything function for more details on that). The second parameter is the current parser stack (see documentation of the parseAnything2 function for more details on that). The forth parameter is the list of parsers to b taken into account by the general parsing process. In contrast the third parameter contains only a list of parsers that are allowed to match in the current step of the parsing process. The fifth parameter is the current parser output stream. That is the information returned by the parser up to the current step. It is kind of an accumulator for parser results. -}
parseAnyOpeningBracket ::
(Show tok, Eq tok, Read tok) =>
Int ->
MyStack tok ->
[(Int, MyParser tok)] ->
[(Int, MyParser tok)] ->
[Anything tok] ->
GenParser tok () (Either2 (MyStack tok, [Anything tok]))
parseAnyOpeningBracket _ _ [] _ _ = pzero
parseAnyOpeningBracket v s (x : xs) l i
= try (parseSpecificOpeningBracket v x s l i) <|>
parseAnyOpeningBracket v s xs l i
{-| insert a list of closing brackets into the parser output stream. Later on matching opening and closing brackets will be found and parse tree will be generated this way. The first parameter is an integer it is the number of brackets which should be close. The second parameter is the parser stack. It says which kind of brackets should be closed. It returns a parser output stream just containing the opening brackets -}
generateClosingBrackets ::
(Num a, Eq a, Show tok, Eq tok, Read tok) =>
a -> MyStack tok -> [Anything tok]
generateClosingBrackets 0 (s : xs)
= [Close (length xs) (environment s)]
generateClosingBrackets mi (s : xs)
= (Close (length xs) (environment s)) :
(generateClosingBrackets (mi - 1) xs)
generateClosingBrackets _ _ = []
{-| insert a list of opening brackets into the parser output stream. Later on matching opening and closing brackets will be found and parse tree will be generated this way. The first parameter is an integer it is the number of brackets which should be opened. The second parameter is the parser stack. It says which kind of brackets should be opened. It returns a parser output stream just containing the opening brackets -}
generateOpeningBrackets ::
(Num a, Eq a, Show tok, Eq tok, Read tok) =>
a -> MyStack tok -> [Anything tok]
generateOpeningBrackets 0 _ = []
generateOpeningBrackets mi (s : xs)
= (Open (length xs) (environment s) (startdata s) (parsernumber s))
: (generateOpeningBrackets (mi - 1) xs)
generateOpeningBrackets _ _ = []
{-| a version of either with the difference that the left and right types are the same. RRight stands for sucessful parse of a token. BBad stands for parse failure in which the next possiblity is tried. -}
data Either2 b = RRight b
| BBad b
{-| tries to match any of the currently possible closing brackets. Brackets closed in a order different from the reverse to the one in which they were opened are usually possible. And exception are the so called preserving elements, they can only be closed in the correct order. In the general case of this kind of crossbracketing it is necessary to add some opening and closing brackets to the output stream and to take the right stack frame of the stack keeping all others on it in the right order. -}
parseAnyClosingBracket ::
(Show tok, Eq tok, Read tok) =>
Int ->
MyStack tok ->
[(Int, MyParser tok)] ->
[Anything tok] ->
GenParser tok () (Either2 (MyStack tok, [Anything tok]))
parseAnyClosingBracket v s l i
= do mi <- try
(do mmi <- parseAnyClosingBracket2 s
guard
(case s of
(g : _) -> (mmi == 0) ||
((environment g) == Link2) ||
((not ((environment g) `elem` preserving)) &&
(not
((environment (s !! (fromIntegral mmi))) `elem`
preserving)))
[] -> False)
return mmi)
let ss = myremove mi s
parseAnything v ss l
((reverse
((generateClosingBrackets mi s) ++
(reverse (generateOpeningBrackets mi ss))))
++ i)
{-| this function tries to match the bad parser of the current environment. If it matches it returns BBAD, otherwise it returns RRight. See also comment of the parserAnything function. -}
trybadparser ::
(Show tok, Eq tok, Read tok) =>
MyStack tok ->
[Anything tok] -> GenParser tok () (Either2 (MyStack tok, [a2]))
trybadparser s i
= do x <- case s of
(g : _) -> (do _ <- (badparser g) i
return True)
<|> return False
[] -> return False
if x == True then return (BBad (s, [])) else
return (RRight (s, []))
{-| this is the main function of the parser which calls itself recursively. To run the parser you should not call this function directly but rather use parseAnything2. The parameter are the same as the parameters to the parameters to the function parseAnything2. So look at the documentation for their meaning. But there is one additional parameter namely the first one. This is the stack frame number, it is increase for every stack frame and never decreased this way each stack frame has got a unique identifier this way. An other difference is the return type this function returns the always same type as the function parseAnything2, but wrapped in the Either2 monad. The Either2 monad has an additional bit to signal whether the parse was good or bad. The bad bit signals so called bad parser of the current environment has matched signaling that the environment is to be considered invalid, and we have to backtrack. But what we do here is just stop paring and return a successful parse, but return the bad flag as set in the return type. This will propagate through to the parser that was trying to open the environment that caused the current problem. If that recognizes the problem it can flag the environment as failed by returning pzero. So again here we just return BBad. So we kind of throw an exception. And in parseSpecificOpeningBracket we will catch BBad and signal the actual problem by returning pzero and that way kick of backtracking. -}
parseAnything ::
(Show tok, Eq tok, Read tok) =>
Int ->
MyStack tok ->
[(Int, MyParser tok)] ->
[Anything tok] ->
GenParser tok () (Either2 (MyStack tok, [Anything tok]))
parseAnything v s l i
= (do eof
return $ RRight (s, i ++ (generateClosingBrackets (length s) s)))
<|>
do nb <- trybadparser s i
case nb of
RRight _ -> try (parseAnyClosingBracket v s l i) <|>
case s of
(g : _) -> do try
(parseAnyOpeningBracket v s
[x | x <- l,
(environment g) `elem` (allowed (snd x))]
l
i)
[] -> return (RRight ([], i))
<|>
do c <- anyToken
parseAnything v s l ((C c) : i)
BBad _ -> case s of
(g : _) -> try
(parseAnyOpeningBracket v s
[x | x <- l, (environment g) `elem` (allowed (snd x))]
l
i)
<|> return (BBad (s, i))
[] -> pzero
{-| This is the main entry point of the parse. So the function you need to call when you want to convert the source into the parse tree. The first parameter is the stack. I usually should contain only and exactly the root stack frame. The second parameter is an enumerated list of parsers. You usually take a list like the list parsers from this module and enumerate it by running remake on it. So thats the list of environments the parser is able to recognize. The third parameter is the parse results that have been created so far. Since we are just starting the parse this has to be the empty list. The function returns a parser. See the documentation of the parse module for more details on the type GenParser. Roughly is means that this parser takes an input list whose items are of type tok and that the parsers does not have state (hence the void type '()') and return a tuple. The first elements of that tuple is a stack. Where a new stack frame is added to the stack for each new environment that is found to open by the parser, like an opening HTML tag. And the second elements of the tuple is a parse tree, that is a list of parse tree elements, where each parse tree element may contain sublists of parse tree element. This way it is a real tree. -}
parseAnything2 ::
(Show tok, Eq tok, Read tok) =>
MyStack tok ->
[(Int, MyParser tok)] ->
[Anything tok] -> GenParser tok () (MyStack tok, [Anything tok])
parseAnything2 s l i
= do x <- parseAnything 0 s l i
case x of
BBad (_, b) -> return (s, b)
RRight b -> return b
{-| this find the matching closing bracket for an opening bracket. It returns a tuple. Its first element is the environment created form the given opening bracket together with its closing bracket and the content between opening and closing bracket. Its second elements is the remaining list of parsed elements, after the closing bracket. This list does still contain the Open and Close parser tree elements for opening and closing bracket, and those are not yet converted to environments. This function takes the list of parse tree elements after the opening bracket as first input parameter. It takes the index of the parser that created the opening bracket as second input parameter. That is the index created by the remake function in this module. It takes the size of the stack at the time when the opening bracket was found as third input parameter. It takes the EnvType of the environment of the opening bracket as fourth input parameter. It takes the StartData parse result associated with the opening bracket as fifth parameter. The sixth parameter is the accumulator an should be the empty list when calling this function externally. The seventh parameter is the remaining parse tree after the opening bracket without the opening and closing brackets converted to environments -}
findMatchingClosingBracket ::
[(a1, MyParser a)] ->
Int ->
Int ->
EnvType ->
StartData ->
[Anything a] -> [Anything a] -> (Anything a, [Anything a])
findMatchingClosingBracket l n i e s b ((Close i2 e2) : xs)
= if (i, e) == (i2, e2) then
(Environment ((reenv (snd (l !! n))) e) s
((modify (snd (l !! n))) s (findMatchingBrackets l (reverse b))),
xs)
else findMatchingClosingBracket l n i e s ((Close i2 e2) : b) xs
findMatchingClosingBracket l n i e s b (x : xs)
= findMatchingClosingBracket l n i e s (x : b) xs
findMatchingClosingBracket l n _ e s b []
= (Environment ((reenv (snd (l !! n))) e) s
((modify (snd (l !! n))) s (findMatchingBrackets l (reverse b))),
[])
{-| run findMatchingBrackets on the inner part environment given as second parameter. This function takes the enumerated list of parsers created by remake as first input parameter -}
findMatchingBrackets2 ::
[(a1, MyParser a)] -> Anything a -> Anything a
findMatchingBrackets2 l (Environment e s b)
= Environment e s (findMatchingBrackets l b)
findMatchingBrackets2 l xs
= Environment Root (Str "") (findMatchingBrackets l [xs])
{-| the parser (Anything3) creates a list of parser elements, which is not a tree. The environments which will form the nodes with children in the final tree are denoted as opening and closing brackets in this list. This function takes that list as second input parameter, finds matching pairs of opening and closing brackets and converts the to environments. The opening an closing brackets are already balanced because of the way Anything3 works, that means there is exactly one matching closing bracket for each opening one and they open and close in to proper order. This function takes the enumerated list of parsers as first input parameter, that is the same list also given to the function Anything. -}
findMatchingBrackets ::
[(a1, MyParser a)] -> [Anything a] -> [Anything a]
findMatchingBrackets l ((Open i e s n) : xs)
= let (t, xxs) = findMatchingClosingBracket l n i e s [] xs in
(findMatchingBrackets2 l t) : (findMatchingBrackets l xxs)
findMatchingBrackets l (x : xs) = x : (findMatchingBrackets l xs)
findMatchingBrackets _ [] = []
{-| a list of environments. Most parsers use this list as their 'allowed' variable. Meaning that the parser is only allowed to match within the environments given in the 'allowed' list -}
everywhere :: [EnvType]
everywhere = [Wikitable] ++ everywheretbl
{-| list containing the Italic and Bold environments, see documentation on the list 'everywhere' in this module -}
bi :: [EnvType]
bi = [Italic, Bold]
{-| list containing the same environments as the list 'everywhere' except the Wikitable environment, see documentation on the list 'everywhere' in this module -}
everywheretbl :: [EnvType]
everywheretbl = bi ++ everywherebi
{-| list containing the same environments as the list 'everywhere' except the environment Wikitable, Bold and Italic, see documentation on the list 'everywhere' in this module -}
everywherebi :: [EnvType]
everywherebi = basicwhere ++ [Wikilink]
{-| list containing the same environments as the list 'everywhere' except the environment Wikitable, Bold, Italic and Wikilink see documentation on the list 'everywhere' in this module -}
basicwhere :: [EnvType]
basicwhere = [Link] ++ verybasicwhere
{-| list containing the same environments as the list 'everywhere' except the environment Wikitable, Bold, Italic, Wikilink and Link see documentation on the list 'everywhere' in this module -}
verybasicwhere :: [EnvType]
verybasicwhere
= [Itemgroup, Root, Wikiheading, TableCap, Chapter, Tag, TableTag,
TemplateInside, IncludeOnly]
{-| list containing the environments where the parser linkp is allowed to match. Currently this seems to be everywhere. So this possibly can go away -}
everywherel :: [EnvType]
everywherel = basicwhere ++ bi ++ [Wikitable, Wikilink]
{-| list containing the same environments as the list 'everywhere' except the Link environment see documentation on the list 'everywhere' in this module -}
everywherel2 :: [EnvType]
everywherel2 = verybasicwhere ++ bi ++ [Wikitable, Wikilink]
{-| list containing the TableColSep and TableHeadColSep environments, see documentation on the list 'everywhere' in this module. the environments mean table header column separator and table column separator -}
wikilinkwhere :: [EnvType]
wikilinkwhere = [TableColSep, TableHeadColSep]
{-| the list of parsers needed for processing the HTML output created by MediaWiki -}
minparsers :: [MyParser Char]
minparsers
= [doctagparser, metatagparser, supp, subp, dhunurlp, itagparser,
pagebreakp, htmlcharp, p302p, attrp, greekp, brparser, mytablep,
mytrsepp, mytcolsepp, mytcapp, mythcolsepp, annop, tagparser,
tagparserp, tagparser2, tagparser2p, tagparsert, tagparsert,
tagparser2t, tagparsers, stagparser, commentp, numhtmlp,
rtagparser]
{-| the list of parsers for parsing contributor information for images on MediaWiki websites -}
htmlminparsers :: [MyParser Char]
htmlminparsers
= [doctagparser, metatagparser, supp, subp, dhunurlp, itagparser,
pagebreakp, htmlcharp, p302p, attrp, greekp, brparser, htmytablep,
htmytrsepp, htmytcolsepp, htmytcapp, htmythcolsepp, tagparser,
tagparserp, tagparser2, tagparser2p, tagparsert, tagparsert,
tagparser2t, tagparsers, stagparser, commentp, numhtmlp,
rtagparser]
{-| the list of parsers needed for processing the image title description so that is the content a html attibutes -}
imgparsers :: [MyParser Char]
imgparsers = [supp, subp, htmlcharp, p302p, greekp, numhtmlp]
{-| the list of parsers needed for parsing source code in the MediaWiki markup language -}
parsers :: [MyParser Char]
parsers
= [doctagparser, metatagparser, supp, subp, dhunurlp, itagparser,
chapterp, prep, pagebreakp, htmlcharp, p302p, attrp, greekp,
brparser, wikilinkp, wikitablep, mytablep, wikiheadingp,
itempgrouppt, itempgroupp, itemlinep, boldp, italicp, tablecapp,
tablecapp2, tablecapp3, rowsepp, mytrsepp, colsepp, colsepp2,
mytcolsepp, mytcapp, headcolsepp, headcolsepp2, mythcolsepp,
galleryp, imagemapp, nowikip, noincludep, mathp, annop, imagemapp,
ttagparser, ttagparser2, ttagparsert, ttagparser2t, ttagparsers,
tagparser, tagparser2, tagparsert, tagparser2t, tagparsers,
stagparser, commentp, reservedp, templatewikilinkp, wikiparamp,
wikitemplatep, templateinsideverbatimp, templateinsidep,
gallerywlp, imagemapwlp, hdevlinep, linkp, linkp2, presectionp,
presectionpt, numhtmlp, rtagparser]
{-| the parser record, with some fields initialized with default values -}
baseParser :: MyParser tok
baseParser
= MyParser{bad = \ _ -> pzero, start = undefined,
end = \ _ -> return (), allowed = everywhere, self = undefined,
modify = \ _ x -> x, reenv = id}
{-| this function takes a string and returns a parser that matches any of the given strings -}
oneOfTheStrings :: [String] -> Parser String
oneOfTheStrings (x : xs) = try (string x) <|> (oneOfTheStrings xs)
oneOfTheStrings [] = pzero
{-| parses a HTML entity, that is a character escaped with the ampersand notation -}
htmlcharp :: MyParser Char
htmlcharp
= baseParser{start =
\ _ ->
do _ <- char '&'
s <- (oneOfTheStrings [fst x | x <- htmlchars])
_ <- char ';'
return (Str (s)),
allowed = Preformat : SpaceIndent : NoWiki : everywhere,
self = HtmlChar}
{-| parses a HTML entity, escaped with numeric ampersand notation -}
numhtmlp :: MyParser Char
numhtmlp
= baseParser{start =
\ _ ->
do _ <- string ""
s <- try (many1 digit) <|>
do ss <- try (string "x") <|> try (string "X")
sss <- try (many1 hexDigit)
return $ ss ++ sss
_ <- char ';'
return (Str (s)),
allowed = Preformat : SpaceIndent : NoWiki : everywhere,
self = NumHtml}
{-| parses a HTML #302 character. Special parser needed since it acts on the receding character -}
p302p :: MyParser Char
p302p
= baseParser{start =
\ _ ->
do c <- anyChar
_ <- string "̂"
return (Str (c : [])),
self = P302}
{-| parses a HTML &sub entity. -}
subp :: MyParser Char
subp
= baseParser{start =
\ _ ->
do _ <- string "&sub"
c <- anyChar
_ <- string ";"
return (Str (c : [])),
self = Sub}
{-| parses a HTML &sup entity. -}
supp :: MyParser Char
supp
= baseParser{start =
\ _ ->
do _ <- string "&sup"
c <- anyChar
_ <- string ";"
return (Str (c : [])),
self = Sup}
{-| parses the start of a new URL. That is the place where a page begin that was downloaded from an URL different from the previous one -}
dhunurlp :: MyParser Char
dhunurlp
= baseParser{start =
\ _ ->
do _ <- string "\ndhunparserurl "
return (Str ""),
end = \ _ -> string "\n" >> return (), self = DhunUrl,
allowed = [Root, Tag]}
{-| parses a Greek HTML entity. So a Greek letter or something similar -}
greekp :: MyParser Char
greekp
= baseParser{start =
\ _ ->
do _ <- char '&'
s <- (oneOfTheStrings greek)
_ <- char ';'
return (Str (s)),
self = Greek}
{-| parses the mediawiki math tag. That is a latex formula in the wiki -}
mathp :: MyParser Char
mathp
= (maketagparser ["math"]){allowed =
SpaceIndent : everywhere ++ wikilinkwhere,
self = Math}
annop :: MyParser Char
annop
= (maketagparser ["annotation"]){allowed =
SpaceIndent : everywhere ++ wikilinkwhere,
self = Math, reenv = const Tag}
{-| parses a new chapter heading -}
chapterp :: MyParser Char
chapterp
= baseParser{start =
\ _ ->
do _ <- try
(do _ <- string "\n"
many (char ' '))
string "dhunincludechaper" >> return (Str ""),
end = \ _ -> string "/dhunincludechaper" >> return (),
self = Chapter}
{-| parses a horizontal dividing line -}
hdevlinep :: MyParser Char
hdevlinep
= baseParser{start =
\ _ ->
do _ <- string "----"
skipMany (string "-")
return (Str ""),
allowed = [Root], self = HDevLine}
{-| parses the mediawiki 'nowiki' tag -}
nowikip :: MyParser Char
nowikip
= baseParser{start = \ _ -> string "
because of the tailing /. -}
maketagparser2 :: [String] -> MyParser Char
maketagparser2 tags
= baseParser{start =
\ _ ->
do _ <- string "<"
t <- (oneOfTheStrings tags)
atr <- many (try (attr))
_ <- try (many (oneOf " \n")) <|> return []
_ <- try (char '/') <|> (return '/')
_ <- char '>'
return (TagAttr (t) (Map.fromList atr)),
allowed = Wikitable : SpaceIndent : everywheretbl, self = Tag}
{-| Returns a parser that matches all HTML elements, given by the list of strings given as first input parameter. The parser matches only the opening part of the tag. The inside of it is not processed by this parser. The opening tag may not be self closing so not like
because of the tailing /. -}
maketagparser3 :: [String] -> MyParser Char
maketagparser3 tags
= baseParser{start =
\ _ ->
do _ <- string "<"
t <- (oneOfTheStrings tags)
atr <- many (try (attr))
_ <- try (many (oneOf " \n")) <|> return []
_ <- char '>'
return (TagAttr (t) (Map.fromList atr)),
allowed = SpaceIndent : everywheretbl, self = Tag}
{-| tags that can not be nested and are thus allowed to closed by an opening tag instead of a closing one -}
nonNestTags :: [String]
nonNestTags
= ["tt", "pre", "TT", "PRE", "b", "B", "i", "I", "sc", "SC",
"code", "CODE"]
{-| Returns a parser that matches all HTML elements, given by the list of strings given as first input parameter. The parser matches only inside tables. Use maketagparser for that -}
makettagparser :: [String] -> MyParser Char
makettagparser tags
= baseParser{bad =
\ _ ->
do _ <- lookAhead (do try (string "\n|") <|> (string "||"))
return (),
start =
\ _ ->
do _ <- string "<"
t <- (oneOfTheStrings tags)
atr <- many (try (attr))
_ <- try (many (oneOf " \n")) <|> return []
_ <- char '>'
return (TagAttr (t) (Map.fromList atr)),
end =
\ (TagAttr x _) ->
do _ <- char '<'
_ <- try (many (oneOf " ")) <|> return []
_ <- if x `elem` nonNestTags then try (char '/') <|> (return '/')
else (char '/')
_ <- string x
_ <- char '>'
return (),
allowed = [Wikitable], self = Tag}
{-| Returns a parser that matches all HTML elements, given by the list of strings given as first input parameter. The parser matches only the opening part of the tag. The inside of it is not processed by this parser. The opening tag may also self closing like
because of the tailing /. Matches only inside tables -}
makettagparser2 :: [String] -> MyParser Char
makettagparser2 tags
= baseParser{bad =
\ _ ->
do _ <- lookAhead
(do _ <- anyChar
try (string "\n|") <|> (string "||"))
return (),
start =
\ _ ->
do _ <- string "<"
t <- (oneOfTheStrings tags)
atr <- many (try (attr))
_ <- try (many (oneOf " \n")) <|> return []
_ <- char '/'
_ <- char '>'
return (TagAttr (t) (Map.fromList atr)),
allowed = [Wikitable], self = Tag}
{-| maketagparser for all HTML elements see documentation on function maketagparser -}
tagparser :: MyParser Char
tagparser = maketagparser listOfTags
{-| maketagparser for the 'pre' HTML tag see documentation on function maketagparser -}
tagparserp :: MyParser Char
tagparserp = maketagparser ["pre"]
{-| maketagparser for the HTML tags for HTML tables see documentation on function maketagparser -}
tagparsert :: MyParser Char
tagparsert
= (maketagparser listOfTableTags){self = TableTag,
reenv = const Tag}
{-| makettagparser for all HTML elements see documentation on function makettagparser -}
ttagparser :: MyParser Char
ttagparser = makettagparser listOfTags
{-| makettagparser for the 'pre' HTML tag see documentation on function makettagparser -}
ttagparserp :: MyParser Char
ttagparserp = makettagparser ["pre"]
{-| makettagparser for the HTML tags for HTML tables see documentation on function makettagparser -}
ttagparsert :: MyParser Char
ttagparsert
= (makettagparser listOfTableTags){self = TableTag,
reenv = const Tag}
{-| maketagparser2 for all HTML elements see documentation on function maketagparser2 -}
tagparser2 :: MyParser Char
tagparser2 = maketagparser2 listOfTags
{-| maketagparser2 for the 'pre' HTML tag see documentation on function maketagparser2 -}
tagparser2p :: MyParser Char
tagparser2p = maketagparser2 ["pre"]
{-| maketagparser2 for the HTML tags for HTML tables see documentation on function maketagparser2 -}
tagparser2t :: MyParser Char
tagparser2t
= (maketagparser2 listOfTableTags){self = TableTag,
reenv = const Tag}
{-| makettagparser2 for all HTML elements see documentation on function makettagparser2 -}
ttagparser2 :: MyParser Char
ttagparser2 = makettagparser2 listOfTags
{-| makettagparser2 for the 'pre' HTML tag see documentation on function makettagparser2 -}
ttagparser2p :: MyParser Char
ttagparser2p = makettagparser2 ["pre"]
{-| makettagparser2 for the HTML tags for HTML tables see documentation on function makettagparser2 -}
ttagparser2t :: MyParser Char
ttagparser2t
= (makettagparser2 listOfTableTags){self = TableTag,
reenv = const Tag}
{-| a parser for mediawiki source extension tags -}
tagparsers :: MyParser Char
tagparsers
= (maketagparser ["source", "syntaxhighlight"]){self = Source}
{-| a parser for mediawiki source extension tags inside tables -}
ttagparsers :: MyParser Char
ttagparsers
= (maketagparser ["source", "syntaxhighlight"]){self = Source,
allowed = [Wikitable]}
{-| a parser for HTML tables -}
mytablep :: MyParser Char
mytablep
= (maketagparser ["table"]){self = TableTag,
reenv = const Wikitable,
allowed = Wikitable : SpaceIndent : everywheretbl}
{-| a parser for HTML table rows -}
mytrsepp :: MyParser Char
mytrsepp
= (maketagparser3 ["tr"]){reenv = const TableRowSep,
allowed = everywhere}
{-| a parser for normal HTML table cells -}
mytcolsepp :: MyParser Char
mytcolsepp
= (maketagparser3 ["td"]){reenv = const TableColSep,
allowed = everywhere}
{-| a parser for HTML table captions cells -}
mytcapp :: MyParser Char
mytcapp
= (maketagparser ["caption"]){reenv = const TableCap,
allowed = everywhere}
{-| a parser for HTML table header cells, so th tags -}
mythcolsepp :: MyParser Char
mythcolsepp
= (maketagparser3 ["th"]){reenv = const TableHeadColSep,
allowed = everywhere}
{-| a parser for HTML tables for html parse mode only -}
htmytablep :: MyParser Char
htmytablep = (maketagparser ["table"])
{-| a parser for HTML table rows for html parse mode only -}
htmytrsepp :: MyParser Char
htmytrsepp = (maketagparser ["tr"])
{-| a parser for normal HTML table cells for html parse mode only -}
htmytcolsepp :: MyParser Char
htmytcolsepp = (maketagparser ["td"])
{-| a parser for HTML table captions cells for html parse mode only -}
htmytcapp :: MyParser Char
htmytcapp = (maketagparser ["caption"])
{-| a parser for HTML table header cells, so th tags for html parse mode only -}
htmythcolsepp :: MyParser Char
htmythcolsepp = (maketagparser ["th"])
{-| a parser for closing HTML tags which don't have an opening partner. This parser is only allowed to match within itemization enumerations etc. -}
itagparser :: MyParser Char
itagparser
= baseParser{start =
\ _ ->
do _ <- string "<"
skipMany (char '/')
t <- (oneOfTheStrings ["small"])
atr <- many (try (attr))
_ <- try (many (oneOf " \n")) <|> return []
_ <- char '>'
return (TagAttr (t) (Map.fromList atr)),
allowed = [Itemgroup], self = Tag}
{-| a parser for closing HTML tags which don't have an opening partner -}
rtagparser :: MyParser Char
rtagparser
= baseParser{start =
\ _ ->
do _ <- string "<"
skipMany (char '/')
s <- (oneOfTheStrings (listOfTags ++ listOfTableTags))
atr <- many (try (attr))
_ <- try (many (oneOf " \n")) <|> return []
_ <- char '>'
return (TagAttr ("bad" ++ s) (Map.fromList atr)),
allowed = everywhere, self = Tag}
{-| a parser for HTML opening tags which might be self closing but never have a matching closing partner -}
stagparser :: MyParser Char
stagparser
= baseParser{start =
\ _ ->
do _ <- string "<"
t <- (oneOfTheStrings (listOfTags ++ listOfTableTags))
atr <- many (try (attr))
_ <- try (many (oneOf " ")) <|> return []
_ <- try (char '/') <|> return 'f'
_ <- char '>'
return (TagAttr t (Map.fromList atr)),
self = Tag, allowed = []}
{-| a parser for HTML page breaks -}
pagebreakp :: MyParser Char
pagebreakp
= baseParser{start =
\ _ ->
do _ <- string "