-- | Types and functions for reading PoliMorf entries from tab-separated
-- text. Each row supplies a form, a base, a tag and an optional category
-- (see 'parsePoliMorf').
module Data.PoliMorf
(
-- * Types
Form
, Base
, POS
, MSD
, Tag
, Cat
, Entry (..)
-- * Tag and entry inspection
, split
, pos
, msd
, atomic
-- * Reading and parsing
, readPoliMorf
, parsePoliMorf
) where
import Control.Applicative ((<$>))
import Control.Arrow (second)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
-- | Contents of the first column of a row (the 'form' field of an 'Entry').
type Form = T.Text
-- | Contents of the second column of a row (the 'base' field of an 'Entry').
-- NOTE(review): presumably the base form (lemma) of 'Form' -- confirm.
type Base = T.Text
-- | The part of a 'Tag' that precedes its first @':'@ (see 'split').
type POS = T.Text
-- | The part of a 'Tag' that follows its first @':'@ (see 'split').
type MSD = T.Text
-- | Contents of the third column of a row (the 'tag' field of an 'Entry').
type Tag = T.Text
-- | Contents of the fourth column of a row, or empty text when the row has
-- only three columns (the 'cat' field of an 'Entry').
type Cat = T.Text
-- | One parsed row. All four fields are strict, so evaluating an 'Entry' to
-- weak head normal form also evaluates every field.
data Entry = Entry
    -- First column of the row.
    { form :: !Form
    -- Second column of the row.
    , base :: !Base
    -- Third column of the row; 'split' divides it at the first ':'.
    , tag :: !Tag
    -- Fourth column of the row, or empty text when that column is absent.
    , cat :: !Cat }
    deriving (Eq, Ord, Show, Read)
-- | Divide a tag at its first @':'@.
--
-- The first component is everything before that colon; the second is
-- everything after it (the colon itself is dropped). When the tag contains
-- no colon, the first component is the whole tag and the second is empty.
split :: Tag -> (POS, MSD)
split t = (before, T.drop 1 fromColon)
  where
    -- 'fromColon' still starts with the separator (if there was one).
    (before, fromColon) = T.break (== ':') t
-- | The first component of 'split' applied to the entry's 'tag'.
pos :: Entry -> POS
pos entry = fst (split (tag entry))
-- | The second component of 'split' applied to the entry's 'tag'.
msd :: Entry -> MSD
msd entry = snd (split (tag entry))
-- | Decide whether an entry counts as atomic.
--
-- The result is 'False' when either of these holds, and 'True' otherwise:
--
-- * the tag contains @sup@ and the form starts with @naj@;
--
-- * the tag contains @neg@ and the form starts with @nie@.
--
-- NOTE(review): presumably these pick out prefixed superlative and negated
-- forms -- confirm against the data.
atomic :: Entry -> Bool
atomic x = not (marked "sup" "naj" || marked "neg" "nie")
  where
    -- True when the tag mentions 'label' and the form begins with 'prefix'.
    marked label prefix =
        label `T.isInfixOf` tag x && prefix `T.isPrefixOf` form x
-- | Read the file at the given path as lazy text and parse it with
-- 'parsePoliMorf'.
readPoliMorf :: FilePath -> IO [Entry]
readPoliMorf = fmap parsePoliMorf . L.readFile
-- | Parse lazy text into entries, one per line of input.
--
-- Every line is handed to 'parsePoliRow', which calls 'error' for a line
-- with fewer than three tab-separated fields.
parsePoliMorf :: L.Text -> [Entry]
parsePoliMorf contents = [parsePoliRow row | row <- L.lines contents]
-- | Parse one tab-separated row into an 'Entry'.
--
-- The first three fields become 'form', 'base' and 'tag'. The fourth field,
-- when present, becomes 'cat'; otherwise 'cat' is empty. Fields beyond the
-- fourth are ignored. A row with fewer than three fields is rejected with
-- 'error'.
parsePoliRow :: L.Text -> Entry
parsePoliRow row = build fields
  where
    -- Convert each lazy field to strict text to match the 'Entry' fields.
    fields = map L.toStrict (L.split (== '\t') row)

    build (f : b : t : extra) = Entry f b t (category extra)
    build _ = error $ "parsePoliRow: invalid row \"" ++ L.unpack row ++ "\""

    -- The category column is optional.
    category (c : _) = c
    category [] = ""