module Compiler where import ImperativeState import MyState import Text.ParserCombinators.Parsec import Data.List (isInfixOf) import MediaWikiParseTree import MediaWikiParser import LatexRenderer import Tools import qualified Data.Map as Map import Data.Set () import Data.Maybe import qualified System.Directory as Dir import Data.List.Split import Control.Monad import Network.URI import Codec.Binary.UTF8.String import Control.Monad.State import UrlAnalyse import Data.ByteString.UTF8 (toString) import Data.ByteString hiding (take, reverse, dropWhile, takeWhile, drop, map, concat, elem, length, zip, head, filter, minimum, isInfixOf) compile :: RunMode -> String -> String -> [[ByteString]] -> ImperativeMonad CompileResult compile theRunMode text templates tabs = do st <- get --liftIO $ print (printPrepareTree(parseit minparsers text)) case theRunMode of StandardTemplates -> return (run2 (parseit parsers text) (hostname . fullUrl $ st) templates tabs) UserTemplateFile _ -> return (run2 (parseit parsers text) (hostname . fullUrl $ st) templates tabs) HTML -> return (run2 (printPrepareTree (parseit minparsers text)) (hostname . fullUrl $ st) templates tabs) ExpandedTemplates -> return (run2 (parseit parsers text) (hostname . fullUrl $ st) templates tabs) {-DHUN| pathname of the temporary directory of the compiler |DHUN-} dirpref :: [Char] dirpref = "../tmp/compiler/" {-DHUN| converts a wiki source document received from mediawiki when requesting it for Special:Export to a parse tree to be converted to LaTeX be treeToLaTeX3. It also signals to compiler.py that the source code was read using the temporary compiler directory |DHUN-} shortparse :: String -> IO [Anything Char] shortparse x = do Tools.writeFile (dirpref ++ "done") "" return (parseit parsers x) {-DHUN| a function to get HTML elements out of a parse tree. The first parameter is name of the tag to be looked for. The second parameter is a key in the attributes of that element that has to be present for the element to be considered. The third parameter is a value that has to be found under the given key in the attributes of the element, in order for the element to be part of the returned output list. The fourth parameter is the parse tree. |DHUN-} deepGet :: [Char] -> String -> [Char] -> [Anything a] -> [Anything a] deepGet t k v ll = concat $ map go ll where go (Environment Tag (TagAttr tag m) l) | (tag == t) && ((Map.lookup k m) == (Just v)) = [Environment Tag (TagAttr tag m) l] go (Environment _ _ l) = (deepGet t k v l) go _ = [] {-DHUN| flattens out the HTML 'a' tags. That is it replaces each 'a' element with its content. That is everything that is between its opening and its closing tag. The only parameter of this function is the parse tree to be processed. This function returns the parse tree with the 'a' HTML elements flattened |DHUN-} reducea :: [Anything Char] -> [Anything Char] reducea ll = concat (map go ll) where go :: Anything Char -> [Anything Char] go (Environment Tag (TagAttr "a" _) l) = l go (Environment x y l) = [Environment x y (reducea l)] go x = [x] {-DHUN| flattens out the HTML 'div' tags, which have got a 'class' attributes present with the value 'noresize'. That is it replaces each 'div' element with the properties mentioned above by its content. That is everything that is between its opening and its closing tag. The only parameter of this function is the parse tree to be processed. This function returns the parse tree with the 'div' HTML elements with the properties given above flattened |DHUN-} reducediv :: [Anything Char] -> [Anything Char] reducediv ll = concat (map go ll) where go :: Anything Char -> [Anything Char] go (Environment Tag (TagAttr "div" m) l) | (Map.lookup "class" m) == (Just "noresize") = l go (Environment x y l) = [Environment x y (reducea l)] go x = [x] {-DHUN| takes a parse tree that was created form the HTML returned by MediaWiki when being requested for the print version of a wiki page. And returns a modified version of that parse tree ready for being converted to LaTeX with treeToLaTeX3 |DHUN-} printPrepareTree :: [Anything Char] -> [Anything Char] printPrepareTree ll = concat (map printPrepareNode ll) where printPrepareNode :: Anything Char -> [Anything Char] printPrepareNode (Environment Tag (TagAttr "div" mm) l) | (Map.lookup "class" mm) == (Just "thumbinner") = case do (m, llll) <- case filter (mypred "a") (reducediv l) of [(Environment Tag (TagAttr "a" mmm) lll)] -> return (mmm, lll) _ -> mzero tt <- case filter (mypred "div") (reducediv l) of [(Environment Tag (TagAttr "div" mmm) tt)] | (Map.lookup "class" mmm) == (Just "thumbcaption") -> return . (dropWhile (== (C '\n'))) . (filter magnpred) $ tt _ -> mzero return $ imgfun m llll (Just tt) of Just x -> x _ -> printPrepareTree l printPrepareNode (Environment Wikitable (TagAttr "table" m) _) | (Map.lookup "class" m) == (Just "toc") = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | ((Map.lookup "class" m) == (Just "toc") || (Map.lookup "id" m) == (Just "toc")) = [] printPrepareNode (Environment Wikitable (TagAttr "table" m) _) | (Map.lookup "class" m) == (Just "navbox") = [] printPrepareNode (Environment Tag (TagAttr "title" _) _) = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "id" m) == (Just "siteSub") = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "id" m) == (Just "jump-to-nav") = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "id" m) == (Just "footer") = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "id" m) == (Just "p-lang") = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "id" m) == (Just "contentSub") = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "class" m) == (Just "printfooter") = [] printPrepareNode (Environment Tag (TagAttr "span" m) _) | (Map.lookup "class" m) == (Just "mw-cite-backlink") = [] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "id" m) == (Just "mw-navigation") = [] printPrepareNode (Environment Tag (TagAttr "sup" m) l) | (Map.lookup "class" m) == (Just "reference") = [Environment Tag (TagAttr "sup" m) (reducea l)] printPrepareNode (Environment Tag (TagAttr "li" m) l) | (Map.lookup "class" m) == (Just "gallerybox") = case (do llll <- case deepGet "div" "class" "thumb" l of [Environment Tag (TagAttr "div" _) lll] -> return lll _ -> mzero (mmm, lll) <- case deepGet "a" "class" "image" llll of [Environment Tag (TagAttr "a" mmm) lll] -> return (mmm, lll) _ -> mzero te <- case deepGet "div" "class" "gallerytext" l of [Environment Tag (TagAttr "div" _) te] -> return te _ -> mzero return $ imgfun mmm lll (Just te)) of Just x -> x _ -> printPrepareTree l printPrepareNode (Environment TableHeadColSep (TagAttr t m) l) = [Environment TableHeadColSep (TagAttr t m) (l++ (map (\x->Environment Attribute (Attr x) []) (Map.toList m)))] printPrepareNode (Environment TableColSep (TagAttr t m) l) = [Environment TableColSep (TagAttr t m) (l++ (map (\x->Environment Attribute (Attr x) []) (Map.toList m)))] printPrepareNode (Environment TableRowSep (TagAttr t m) l) = [Environment TableRowSep (TagAttr t m) (l++ (map (\x->Environment Attribute (Attr x) []) (Map.toList m)))] printPrepareNode (Environment Tag (TagAttr "pre" m) l) = [Environment Preformat (TagAttr "pre" m) l] printPrepareNode (Environment Tag (TagAttr "span" m) _) | (Map.lookup "class" m) == (Just "editsection") = [] printPrepareNode (Environment Tag (TagAttr "span" m) _) | (Map.lookup "class" m) == (Just "mw-editsection") = [] printPrepareNode (Environment Tag (TagAttr "a" m) l) = case (Map.lookup "class" m) of (Just "image") -> imgfun m l Nothing _ -> case (Map.lookup "class" m) of (Just "external free") -> [Environment Tag (TagAttr "a" m) []] _ -> [Environment Tag (TagAttr "a" m) l] printPrepareNode (Environment Tag (TagAttr "div" m) _) | (Map.lookup "class" m) == (Just "bodyContent") = [] printPrepareNode (Environment Tag (TagAttr "img" m) l) | (Map.lookup "class" m) == (Just "tex") = case Map.lookup "alt" m of Just x -> [Environment Math (TagAttr "math" Map.empty) (map C (replace2 (replace2 (replace2 x "&" "&") "<" "<") "gt;" ">"))] Nothing -> [(Environment Tag (TagAttr "img" m) (printPrepareTree l))] printPrepareNode (Environment Tag (TagAttr "div" m) l) = case do c <- Map.lookup "class" m guard $ isInfixOf "source" c return $ Environment Source (TagAttr "source" (Map.fromList [("lang", (takeWhile (/= ' ') c))])) (deepFlatten l) of Nothing -> [Environment Tag (TagAttr "div" m) (printPrepareTree l)] Just x -> [x] printPrepareNode (Environment x y l) = [Environment x y (printPrepareTree l)] printPrepareNode x = [x] mypred :: String -> Anything Char -> Bool mypred x y = case y of (Environment Tag (TagAttr z _) _) | z == x -> True _ -> False magnpred :: Anything Char -> Bool magnpred y = case y of (Environment Tag (TagAttr "div" m) _) | (Map.lookup "class" m) == (Just "magnify") -> False _ -> True unEsc x = let z = unEscapeString x in if isUTF8Encoded z then decodeString z else z imgfun m l tt = maybeToList $ do t <- case (fmap (parseit imgparsers) (Map.lookup "title" m)) `mplus` tt of Just x -> return $ [C '|'] ++ x Nothing -> return [] h <- case filter (mypred "img") l of [(Environment Tag (TagAttr "img" mmm) _)] -> case Map.lookup "src" mmm of Just s -> if length (reverse (splitOn "/" s)) >= 2 then return ("File:" ++ (unEsc ((reverse (splitOn "/" s)) !! (if "thumb" `elem` splitOn "/" s then 1 else 0)))) else mzero _ -> mzero _ -> mzero w <- case case filter (mypred "img") l of [(Environment Tag (TagAttr "img" mmm) _)] -> do ma <- Map.lookup "width" mmm guard $ tt == Nothing return ma _ -> Nothing of Just x -> return $ [C '|'] ++ (map C (x ++ "px")) Nothing -> return [] return (Environment Wikilink (Str "") ((map C h) ++ w ++ t)) {-DHUN| return a parse tree of a source file. The first argument is the source file. The second argument is a list of command line parameter. If it contains the keyword print. The source file is understood to be the HTML code returned by mediawiki when being requested for the print version of a wiki page, otherwise it is understood to be the wiki source code of a wiki page, that is what you get when issuing a Special:Export request to mediawiki. This function return a parse tree ready to be turned into a latex document by treeToLaTeX3 |DHUN-} getparse :: String -> [String] -> IO [Anything Char] getparse x args = if ("print" `elem` args) then printparse x else shortparse x {-DHUN| prepares a HTML document received from mediawiki when requesting it for the print version of a wiki page to a parse tree to be converted to LaTeX be treeToLaTeX3. It also signals to compiler.py that the source code was read using the temporary compiler directory |DHUN-} printparse :: String -> IO [Anything Char] printparse x = do Tools.writeFile (dirpref ++ "done") "" return (printPrepareTree (parseit minparsers x)) {-DHUN| converts a wiki source document to a parse tree to be converted to LaTeX be treeToLaTeX3. The first parameter is that list of parsers. That is the list of environments to be recognized by the parser. This is usually either only plain HTML, or HTML mixed with mediawiki markup. The second parameter is the source code to be parsed. This function returns a parse tree |DHUN-} parseit :: [MyParser Char] -> String -> [Anything Char] parseit pp x = (parseit2 (decon2 (remake pp) (parseAnything2 [MyStackFrame{endparser = pzero, startdata = Str "", environment = Root, badparser = \ _ -> pzero, parsernumber = 0, nestingdepth = 0}] (remake pp) [])) ('\n' : x)) {-DHUN| helper function of parseit, not to be called directly. This function takes the parser for the grammar, in the sense of a parser of the parsec library, (so that is the final combined parser) as first argument. It takes the source code to be parsed (usually HTML of mediawiki markup mixed with HTML) as second and runs the parser on the source code. It returns the resulting parse tree. |DHUN-} parseit2 :: Parser [Anything Char] -> String -> [Anything Char] parseit2 p input = case (parse p "" input) of Left _ -> [] Right x -> x {-DHUN| the pathname of the temporary directory |DHUN-} tmppath :: [Char] tmppath = "../tmp/" {-DHUN| the function takes a list of the following format as first input [([table1,column1],width_x),([table1,column2],width_y),...,([table2,column1],width_z),...] parameter. Thats is a list containing the maximum width for each column of each table of the document. The maximum width of the column is the width that the column could have if it was printed on paper of infinite size. So the size without line breaks. The second parameter is the accumulator which should be the empty map when calling this function from outside. The function returns a map mapping tablenumber to a map mapping columnnumbers to maximum columns width. This data structure contains all information needed to make the decisions on the final width of the columns in the document |DHUN-} maketabmap :: [([Int], Double)] -> Map.Map Int (Map.Map Int Double) -> Map.Map Int (Map.Map Int Double) maketabmap (x : xs) m = case x of (t : (s : []), b) -> maketabmap xs (Map.alter (f s b) t m) _ -> m where f s1 b1 xx = case xx of Nothing -> Just (Map.singleton s1 b1) Just m1 -> Just (Map.insert s1 b1 m1) maketabmap _ m = m {-DHUN| prepare the result of maketabmap for further procession. Some indices are offset corrected. Space for the rules of the table is added to the width of the columns DHUN-} postproctabmap :: (Fractional a, Num k1, Ord k1, Ord a) => Map.Map k (Map.Map k1 a) -> Map.Map k (Map.Map k1 a) postproctabmap m = Map.map f m where f m1 = Map.delete 0 (Map.mapKeys (\ k -> k - 1) (Map.map (\ x -> (x + 12.333748 - (minimum (Map.elems m1)))) m1)) data CompileResult = CompileResult{images :: [String], body :: String, tablelist :: [[String]], galleryNumbers :: [Integer], title :: String} {-DHUN| the first parameter is the parse tree created by get parse of the document currently being processed. the second parameter is the URL under which the document was downloaded. the third parameter is the netloc describing the wiki this page belongs to. The fourth parameter is a mapping file defined by the user for the mapping of mediawiki templates to latex commands. the fifth parameter is a possible parse tree created by precious run the the should be added before the begging of the newly created parse tree. This function writes out all results to temporary files that will be further processed by compiler.py DHUN-} run2 :: [Anything Char] -> String -> String -> [[ByteString]] -> CompileResult run2 parsetree netloc tmpl someTables = CompileResult{images = img, body = bdy, tablelist = theTables, galleryNumbers = gals, title = tit} where alldata2 g u = (treeToLaTeX3 ((snd . newtree $ g)) initialState{urld = analyseNetloc netloc}{tabmap = u, templateMap = getUserTemplateMap (read tmpl :: [[String]])}{urls = mUrlState . fst . newtree $ g}) newtree g = makeLables g initialUrlState tm = (postproctabmap (maketabmap theNewSizes Map.empty)) (trda, trst) = (alldata2 parsetree tm) img = (map (\ g -> ((replace '\n' ' ' g))) (getImages trst)) theTables = reverse (tablist trst) bdy = doUnicode trda gals = getGalleryNumbers trst tit = getTitle trst fun :: ByteString -> Double fun x = case reads (toString x) of [(f, _)] -> f _ -> (1000.0) theSizes = zip [1 ..] (map (\ x -> zip [1 ..] (map fun x)) someTables) theNewSizes = concat (map sizeFun theSizes) sizeFun (t, k) = map (\ (s, b) -> ([t, s], b)) k {-DHUN| the first parameter is the parse tree created by get parse of the document currently being processed. the second parameter is the URL under which the document was downloaded. the third parameter is the netloc describing the wiki this page belongs to. The fourth parameter is a mapping file defined by the user for the mapping of mediawiki templates to latex commands. the fifth parameter is a possible parse tree created by precious run the the should be added before the begging of the newly created parse tree. This function writes out all results to temporary files that will be further processed by compiler.py DHUN-} run :: [Anything Char] -> String -> String -> [[Anything Char]] -> IO () run x z tmpl cur = do y <- Prelude.readFile (tmppath ++ "imgnum") d <- Dir.getDirectoryContents tmppath dd <- return $ filter ((".size" ==) . reverse . (take 5) . reverse) d ll <- mapM (\ xx -> Tools.readFile (tmppath ++ xx)) dd lll <- return $ zip (map (\ xx -> (map read) . (splitOn "_") . head . (splitOn ".") . (drop 5) $ xx) dd) (map read ll) return () (trda, trst) <- return ((((alldata2 (jj ++ x) y (postproctabmap (maketabmap lll Map.empty)))))) Tools.writeFile (tmppath ++ "images") (concat (map (\ g -> ((replace '\n' ' ' g) ++ "\n")) (getImages trst))) Tools.writeFile (tmppath ++ "ttt.zzz") (show (reverse (tablist trst))) Tools.writeFile (tmppath ++ "test.tex") (doUnicode trda) _ <- mapM (\ (xx, i) -> mapM (\ (yy, j) -> Tools.writeFile (tmppath ++ "table" ++ (show i) ++ "_" ++ (show j) ++ ".tex") yy) (zip xx ([1 ..] :: [Integer]))) (zip (reverse (tablist trst)) ([1 ..] :: [Integer])) Tools.writeFile (tmppath ++ "final.state") (show trst) Tools.writeFile (tmppath ++ "current2") (show (x)) Tools.writeFile (tmppath ++ "imgnum2") (show ((length (getImages trst)) + (read y))) Tools.writeFile (tmppath ++ "gals") ((concat (map (((++) "\n") . show) (getGalleryNumbers (snd (alldata x y)))))) Tools.writeFile (tmppath ++ "title") (getTitle (snd (alldata x y))) where jj = concat cur alldata g y = (treeToLaTeX3 ((snd . newtree $ g)) initialState{getJ = read y}{urld = analyseNetloc z}{templateMap = getUserTemplateMap (read tmpl :: [[String]])}{urls = mUrlState . fst . newtree $ g}) alldata2 g _ u = (treeToLaTeX3 ((snd . newtree $ g)) initialState{urld = analyseNetloc z}{tabmap = u, templateMap = getUserTemplateMap (read tmpl :: [[String]])}{urls = mUrlState . fst . newtree $ g}) newtree g = makeLables g initialUrlState