{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RecordWildCards #-}

-- | The module provides functionality for manipulating PoliMorf, the
-- morphological dictionary for Polish.

module Data.PoliMorf
( 
-- * Types
  Form
, Base
, POS
, MSD
, Tag
, Cat
, Entry (..)
, split
, pos
, msd
, atomic

-- * Parsing
, readPoliMorf
, parsePoliMorf
) where

import Control.Applicative ((<$>))
import Control.Arrow (second)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L

-- | A word form.
type Form = T.Text

-- | A base form.
type Base = T.Text

-- | A part of speech.
type POS  = T.Text

-- | A morphosyntactic description 
type MSD  = T.Text

-- | A morphosyntactic tag. (Tag = POS + MSD)
type Tag  = T.Text

-- | A semantic category.  It will be set to "" when there is
-- no category assigned to a particular PoliMorf entry.
type Cat  = T.Text

-- | An entry from the PoliMorf dictionary.
data Entry = Entry
    { form :: !Form
    , base :: !Base
    , tag  :: !Tag
    , cat  :: !Cat }
    deriving (Eq, Ord, Show, Read)

-- | Split tag.
split :: Tag -> (POS, MSD)
split = second (T.drop 1) . T.break (==':')

-- | Entry POS.
pos :: Entry -> POS
pos = fst . split . tag

-- | Entry MSD.
msd :: Entry -> MSD
msd = snd . split . tag

-- | Is the entry an atomic one?  More precisely, we treat all negative
-- forms starting with ''nie'' and all superlatives starting with ''naj''
-- as non-atomic entries.
atomic :: Entry -> Bool
atomic x
    | "sup" `T.isInfixOf` tag x && "naj" `T.isPrefixOf` form x = False
    | "neg" `T.isInfixOf` tag x && "nie" `T.isPrefixOf` form x = False
    | otherwise = True

-- | Read the PoliMorf from the file.
readPoliMorf :: FilePath -> IO [Entry]
readPoliMorf path = parsePoliMorf <$> L.readFile path

-- | Parse the PoliMorf into a list of entries.
parsePoliMorf :: L.Text -> [Entry]
parsePoliMorf = map parsePoliRow . L.lines 

-- | Get an entry pair from a PoliMorf row.
parsePoliRow :: L.Text -> Entry
parsePoliRow row = case map L.toStrict (L.split (=='\t') row) of
    _form : _base : _tag : rest -> Entry _form _base _tag $ case rest of
        []       -> ""
        (_cat:_) -> _cat
    _   -> error $ "parsePoliRow: invalid row \"" ++ L.unpack row ++ "\""