Halipeto/SimpleDB.lhs

% % Halipeto -- Haskell static web page generator % Copyright 2004 Andrew Cooke (andrew@acooke.org) % Copyright 2007-2010 Peter Simons (simons@cryp.to) % % This program is free software; you can redistribute it and/or modify % it under the terms of the GNU General Public License as published by % the Free Software Foundation; either version 2 of the License, or % (at your option) any later version. % % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % % You should have received a copy of the GNU General Public License % along with this program; if not, write to the Free Software % Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA % \section{Simple Database} This is a very simple database, implemented on top of the file system using the Dictionary class. The Dictionary namespace reflects the file path to the text (plus, in some cases, an additional name that depends on the file type as described below). There is currently no support for writing data. Some care must be taken with the cases of characters in filenames on Windows systems. It may be wise to force all filenames to lower case via the translate function, or to use a dictionary that is case insensitive. The main disadvantage of the SimpleDB implementation, to my mind, is that there is no formal specification of the structure that verifies that the information in related directories is consistent. Nothing warns the user that the second image directory is missing a details.haldx file, for example. This could be fixed, but it's difficult to find the energy to make a ``quick fix'' better. A full SQL interface should be developed instead. \begin{code}

module Halipeto.SimpleDB (
  Translate, noCVS, allFiles, ReadDB, addDefaultsDB, readDB, readDB'
) where

import Prelude hiding (readList)
import Halipeto.Template
import Halipeto.Dictionary
import Halipeto.Utilities
import Text.Pandoc
import System.IO
import System.Directory
import Control.Monad
import Data.List
import Data.Char

\end{code} \subsection{File Formats} Three different file formats are supported, giving three ways of associating the content with the dictionary namespace. The formats are distinguished by the file extension. \begin{itemize} \item {\bf hal} Files ending in ``.hal'' have all their contents stored under the file's main name (with the associated directory path). For example, the contents of file diary/2004/jul/text.hal would be stored in ``diary.2004.jul.text''. \begin{code}

readHal :: Dictionary d String =>
  Translate -> d String -> FilePath -> [String] -> IO (d String)
readHal _ dct fp ky = do txt <- readFile fp
                         return $ add dct (ky, txt)

\end{code} \item {\bf hals} Each paragraph (separated by one or more blank lines) in a file with extension ``.hals'' is numbered. So if the contents of file months.hals had the contents \begin{verbatim} jan feb mar .. \end{verbatim} then the dictionary would associate, for example, ``months.5'' with ``May'' (this could be used to order the months for display, for example --- a simpler solution might be available via the translation facility described below). More often this format is used to store several paragraphs of text that are iterated over in a template so that each paragraph is enlosed within its own $<$p$>$ element. \begin{code}

readHals :: Dictionary d String =>
  Translate -> d String -> FilePath -> [String] -> IO (d String)
readHals tr dct fp ky = do h <- openFile fp ReadMode
                           dct' <- readParas h tr dct ky 0
                           hClose h
                           return dct'

readParas :: Dictionary d String =>
  Handle -> Translate -> d String -> [String] -> Int -> IO (d String)
readParas h tr dct ky n =
  do done <- hIsEOF h
     if done
       then return $ dct
       else do txt <- hGetPara h
               case txt of
                 Nothing   -> readParas h tr dct ky n
                 Just txt' ->
                   case tr $ ky ++ [(show n)] of
                     Nothing -> readParas h tr dct ky n
                     Just k  -> readParas h tr (add dct (k, txt')) ky (n+1)

hGetPara :: Handle -> IO (Maybe String)
hGetPara h = do done <- hIsEOF h
                if done
                  then return Nothing
                  else do txt <- hGetLine h
                          if "" == dropSpace txt
                            then hGetPara h
                            else do txt' <- collect h txt
                                    return $ Just txt'

collect :: Handle -> String -> IO String
collect h txt = do done <- hIsEOF h
                   if done
                     then return txt
                     else do txt' <- hGetLine h
                             if "" == dropSpace txt'
                               then return txt
                               else collect h $ txt ++ "\n" ++ txt'

\end{code} \item {\bf haldx} The contents of files ending in ``.haldx'' are read as key value pairs. The first word on each line is the key, subsequent text, starting with the first non--space character, is the value. For example, if the file diary/2004/highlights.haldx contains \begin{verbatim} 1.month Jan 1.day 2 1.p 4 2.month July 2.day 20 2.p 1 \end{verbatim} Then the dictionary would associate the value ``July'' with the key ``diary.2004.highlights.2.month''. \begin{code}

readHaldx :: Dictionary d String =>
  Translate -> d String -> FilePath -> [String] -> IO (d String)
readHaldx tr dct fp ky = do h <- openFile fp ReadMode
                            dct' <- readLines h tr dct ky
                            hClose h
                            return dct'

readLines :: Dictionary d String =>
  Handle -> Translate -> d String -> [String] -> IO (d String)
readLines h tr dct ky =
  do done <- hIsEOF h
     if done
       then return $ dct
       else do txt <- hGetLine h
               txt' <- return $ dropWindowsReturn txt
               case splitLine tr ky txt' of
                 Nothing     -> readLines h tr dct ky
                 Just (k, v) -> readLines h tr (add dct (k, v)) ky

dropWindowsReturn :: String -> String
dropWindowsReturn ""                = ""
dropWindowsReturn (c:s) | c == '\r' = dropWindowsReturn s
                        | otherwise = c:(dropWindowsReturn s)

splitLine :: Translate -> [String] -> String -> Maybe ([String], String)
splitLine tr ky txt =
  case keyVal txt of
    Nothing     -> Nothing
    Just (k, v) -> case tr $ ky ++ k of
                     Nothing  -> Nothing
                     Just ky' -> Just (ky', v)

keyVal :: String -> Maybe ([String], String)
keyVal s = keyVal' "" $ dropSpace s
  where
    keyVal' _ ""                = Nothing
    keyVal' k (c:s) | isSpace c = Just (fromDot k, dropSpace s)
                    | otherwise = keyVal' (k ++ [c]) s

\end{code} \end{itemize} Given those formats it is possible to define two different values with the same key. In such cases the final result in the dictionary is undefined (the key will be associated with one of the values, but the choice will depend on implementation details). The functions for parsing these file formats are stored in a dictionary, indexed by file extension. The database can be extended to handle other formats by adding to this dictionary. %%Haddock: Map file extension to file reading function \begin{code}

type ReadDB d = Translate -> d String -> FilePath -> [String] -> IO (d String)

\end{code} %%Haddock: Add the default file functions (hal, hals, haldx) \begin{code}

addDefaultsDB :: (Dictionary d String, Dictionary r (ReadDB d)) =>
  r (ReadDB d) -> r (ReadDB d)
addDefaultsDB d = addAll' d [ (hal,       readHal)
                            , (hal++"s",  readHals)
                            , (hal++"dx", readHaldx)
                            , ("rst",     readRstMsg)
                            , ("mdwn",    readMdwnMsg)
                            , ("lst",     readList)
                            ]

\end{code} \subsection{Translation} Before a file is read, its full path (from the base directory of the database, but excluding the file extension) is passed to a translate function which returns either a (possibly modified) value or Nothing. A valid value is used as the base key for the contents, a value of Nothing will cause that file to be ignored. Directories are also translated --- their names are not used as keys, but a value of Nothing will prevent traversal of that directory. %%Haddock: Type of translation function to select or modify file names \begin{code}

type Translate = [String] -> Maybe [String]

\end{code} %%Haddock: Block CVS directories \begin{code}

noCVS :: Translate
noCVS s | last s == "CVS" = Nothing
        | otherwise       = Just s

\end{code} %%Haddock: Select all files \begin{code}

allFiles :: Translate
allFiles = Just

\end{code} \subsection{HTML} All file contents are read as plain text. Functions called during the generation of the site may parse some of these values as HTML, but that functionality is not part of this database. \subsection{Reading Data} So here's the code... \begin{code}

safety :: Translate
safety []                        = Nothing
safety l | last l        == ""   = Nothing
         | head (last l) == '.'  = Nothing
         | head (last l) == '/'  = Nothing
         | head (last l) == '\\' = Nothing
         | otherwise             = Just l

\end{code} %%Haddock: Read a database from disk using the default file functions \begin{code}

readDB' :: Dictionary d String =>
  Translate -> d String -> FilePath -> IO (d String)
readDB' = readDB (addDefaultsDB empty)

\end{code} %%Haddock: Read a database from disk \begin{code}

readDB :: (Dictionary d String, Dictionary r (ReadDB d)) =>
  r (ReadDB d) -> Translate -> d String -> FilePath -> IO (d String)
readDB d tr dc dr = readFP d (safety `thenMaybe` tr) dc dr []

readFP :: (Dictionary d String, Dictionary r (ReadDB d)) =>
  r (ReadDB d) -> ReadDB d
readFP d tr dct fp ky =
    do isD <- doesDirectoryExist fp
       if isD
          then readDir d tr dct fp ky $ getDirectoryContents fp
          else readHFile d tr dct fp ky

readDir :: (Dictionary d String, Dictionary r (ReadDB d)) =>
  r (ReadDB d) -> Translate -> d String -> FilePath -> [String]
  -> IO [FilePath] -> IO (d String)
readDir d tr dct fp ky l = do l' <- l
                              foldM fn dct l'
  where
    fn dct' f = case tr $ ky ++ [beforeDot f] of
                  Nothing  -> do return dct'
                  Just ky' -> readFP d tr dct' (fp `slash` f) ky'
      where
        beforeDot ""                = ""
        beforeDot (c:s) | c == '.'  = ""
                        | otherwise = c:(beforeDot s)

readHFile :: (Dictionary d String, Dictionary r (ReadDB d)) =>
  r (ReadDB d) -> ReadDB d
readHFile d tr dct fp ky = case search' d (suffix fp) of
                             Nothing -> do putStrLn $ "no match for " ++ fp
                                           return dct
                             Just fn -> do putStrLn $ "reading " ++ fp
                                           fn tr dct fp ky

suffix :: String -> String
suffix = suffix' ""

suffix' :: String -> String -> String
suffix' x ""                = x
suffix' x (c:s) | c == '.'  = suffix' s s
                | otherwise = suffix' x s

\end{code} Support for Internet Message style entries. \begin{code}

readMessage :: FilePath -> IO ([(String,String)], String)
readMessage fp = do
  (header,body) <- fmap (span (not . Data.List.null) . lines) (readFile fp)
  let headerLinesWords = map (words . concat) (groupBy (\l r -> isSpace (head r)) header)
      headerLines      = map (\wrds -> (fixKeyword (head wrds), unwords (tail wrds))) headerLinesWords
      fixKeyword k
        | ":" `isSuffixOf` k = map toLower $ reverse . tail . reverse $ k
        | otherwise          = error $ "unknown header keyword " ++ show k ++ " in " ++ fp
  return (headerLines, unlines . drop 1 $ body)

\end{code} Support for ReStructured Text Messages. \begin{code}

readRstMsg :: Dictionary d String => Translate -> d String -> FilePath -> [String] -> IO (d String)
readRstMsg _ dct fp ky = do
  (header,body) <- readMessage fp
  let st   = defaultParserState
      opt  = defaultWriterOptions { writerStrictMarkdown = True }
      pdoc = readRST st body
      html = writeHtmlString opt pdoc
      dct' = foldl (\d (k,v) -> add d (ky ++ [k], v)) dct header
  return $ add dct' (ky ++ ["body"], html)

\end{code} Support for Markdown Text Messages. \begin{code}

readMdwnMsg :: Dictionary d String => Translate -> d String -> FilePath -> [String] -> IO (d String)
readMdwnMsg _ dct fp ky = do
  (header,body) <- readMessage fp
  let st   = defaultParserState
      opt  = defaultWriterOptions { writerStrictMarkdown = True }
      pdoc = readMarkdown st body
      html = writeHtmlString opt pdoc
      dct' = foldl (\d (k,v) -> add d (ky ++ [k], v)) dct header
  return $ add dct' (ky ++ ["body"], html)

\end{code} Support for simple entry-by-line lists. \begin{code}

readList :: Dictionary d String => Translate -> d String -> FilePath -> [String] -> IO (d String)
readList _ dct fp ky = do
  ls <- fmap lines (readFile fp)
  let dct' = foldl (\d (k,v) -> add d (ky ++ [show k], v)) dct (zip [1..] ls)
  return $ dct'

\end{code}