Text/Tagset/Positional.hs

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}

-- | Parsing and printing positional tags and tagsets.

module Text.Tagset.Positional
( parseTag
, showTag
, parseTagset
) where

import Control.Applicative ((<$>), (<*>), (<*), (*>))
import Text.Parsec
import Data.Char (isSpace)
import Data.Maybe (catMaybes)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T

import Data.Tagset.Positional

-- | Parse the tag given the corresponding tagset.
parseTag :: Tagset -> T.Text -> Tag
parseTag tagset inp =
    Tag _pos . M.fromList $ parseRule (rule tagset _pos) attrVals
  where
    (_pos : attrVals) = T.split (==':') inp
    parseRule ((attr, opt):restAtts) (x:xs)
        | x `S.member` domain tagset attr
            = (attr, x) : parseRule restAtts xs
        | opt == True   = parseRule restAtts (x:xs)
        | otherwise     = error $ "parseRule:"
            ++ " no value for " ++ T.unpack attr
            ++ " attribute in tag " ++ T.unpack inp
    parseRule [] [] = []
    parseRule ((_, True):restAtts) [] = parseRule restAtts []
    parseRule _ [] = error $ "parseRule: unexpected end of input in tag "
        ++ T.unpack inp
    parseRule [] _ = error $ "parseRule: input too long in tag "
        ++ T.unpack inp

-- | Print the tag given the corresponding tagset.
showTag :: Tagset -> Tag -> T.Text
showTag tagset tag =
    T.intercalate ":" (pos tag : catMaybes attrVals)
  where
    attrVals = map showAttr $ rule tagset (pos tag)
    showAttr (attr, opt)
        | Just x <- M.lookup attr (atts tag) = Just x
        | opt == True = Nothing
        | otherwise = error $
            "showTag: no value for mandatory attribute " ++ T.unpack attr

-- | Below we defined the parser for the positional tagset.

type Parser = Parsec String ()

tagsetFile :: Parser Tagset
tagsetFile = spaces *> (Tagset <$> attrSec <*> ruleSec)

attrSec :: Parser (M.Map Attr (S.Set T.Text))
attrSec = do
    secName "ATTR" *> spaces
    defs <- attrLine `endBy` spaces
    return $ M.fromList defs

attrLine :: Parser (Attr, S.Set T.Text)
attrLine = do
    attr <- ident
    _ <- spaces *> char '=' *> lineSpaces
    values <- map T.pack <$> ident `endBy` lineSpaces
    return (T.pack attr, S.fromList values)

ruleSec :: Parser (M.Map POS [(Attr, Optional)])
ruleSec = do
    secName "RULE" *> spaces
    M.fromList <$> ruleLine `endBy` spaces

ruleLine :: Parser (POS, [(Attr, Optional)])
ruleLine = do
    _pos <- ident
    _ <- lineSpaces *> char '=' *> lineSpaces
    actionAtts <- attrName `endBy` lineSpaces
    return $ (T.pack _pos, actionAtts)

attrName :: Parser (Attr, Optional)
attrName = optionalAttrName <|> plainAttrName <?> "attribute name"

optionalAttrName :: Parser (Attr, Optional)
optionalAttrName = do
    name <- char '[' *> ident <*  char ']'
    return (T.pack name, True)

plainAttrName :: Parser (Attr, Optional)
plainAttrName = do
    name <- ident
    return $ (T.pack name, False)

lineSpace :: Parser Char
lineSpace = satisfy $ \c -> (isSpace c) && (not $ c == '\n')

lineSpaces :: Parser String
lineSpaces = many lineSpace

ident :: Parser String
ident = many1 $ oneOf $ ['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9'] ++ "."

secName :: String -> Parser Char
secName name = char '[' *> string name *> char ']'

-- | Parse the textual representation of the tagset.  The first argument
-- should be the name of the source.
parseTagset :: String -> String -> Tagset
parseTagset src contents = do
    case parse tagsetFile src filtered of
        Left e  -> error $ "parseTagset: Error parsing input:\n" ++ show e
        Right r -> r
  where
     filtered = unlines $ map (removeComment '#') $ lines contents

removeComment :: Char -> String -> String
removeComment commChar s = case findComment s of
    Just i -> fst $ splitAt i s
    Nothing -> s
  where
    findComment xs = doFind xs 0 False
    doFind (x:xs) acc inQuot
        | x == commChar && not inQuot = Just acc
        | x == '"' = doFind xs (acc + 1) (not inQuot)
        | otherwise =  doFind xs (acc + 1) inQuot
    doFind [] _ _ = Nothing