{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE RecordWildCards #-} module NLP.SyntaxNet.SyntaxNet ( readCnll , readParseTree ) where import Control.Applicative import Control.Concurrent import qualified Control.Exception as E import Control.Lens hiding (at) import Control.Monad import Control.Monad.IO.Class import Data.Aeson import Data.Char import qualified Data.Csv as Csv import Data.Tree import Data.Maybe import Protolude import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as BSC import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BSL import Data.Functor ((<$>)) import qualified Data.Text as T import qualified Data.Text.Encoding as TE import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TEL import qualified Data.Vector as V import Safe as S import Data.String import Data.ConllToken import Data.ParsedSentence import Data.SyntaxTree import Data.TagLabel import NLP.SyntaxNet.Types.CoNLL import NLP.SyntaxNet.Types.ParseTree -------------------------------------------------------------------------------- readCnll :: FilePath -> IO [SnConllToken Text] readCnll fpath = do csvData <- BSL.readFile fpath case TEL.decodeUtf8' csvData of Left err -> do putStrLn $ "error decoding" ++ (show err) return [] Right dat -> do let res = (Csv.decodeWith cnllOptions Csv.NoHeader $ TEL.encodeUtf8 dat) :: Either String (V.Vector (SnConllToken Text)) case res of Left err -> do putStrLn $ "error decoding" ++ (show err) return [] Right vals -> return $ V.toList vals preprocess :: TL.Text -> TL.Text preprocess txt = TL.cons '\"' $ TL.snoc escaped '\"' where escaped = TL.concatMap escaper txt escaper :: Char -> TL.Text escaper c | c == '\t' = "\"\t\"" | c == '\n' = "\"\n\"" | c == '\"' = "\"\"" | otherwise = TL.singleton c -- | Define custom options to read tab-delimeted files cnllOptions = Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (ord '\t') } -------------------------------------------------------------------------------- -- Dealing with trees readParseTree :: FilePath -> IO (Maybe (Tree (SnConllToken Text))) readParseTree fpath = do treeData <- BSC.readFile fpath let ls = BSC.lines treeData let lls = map ( \x -> BSC.split ' ' x) ls lln = map parseNode lls let tree = fromList lln return $ tree -- | Same as readParseTree but for debugging -- readParseTree' :: FilePath -> IO () readParseTree' fpath = do treeData <- BSC.readFile fpath let ls = BSC.lines treeData mapM_ (putStrLn . T.pack . show ) ls let lls = map ( \x -> BSC.split ' ' x) ls lln <- mapM parseNode' ls tree <- fromList' $ lln mapM_ (putStrLn . T.pack . show ) lln putStrLn $ T.pack $ "----\n" putStrLn $ T.pack $ show $ drawTree' $ fromJust tree return $ () parseNode :: [BSC.ByteString] -> SnConllToken Text parseNode llbs = do let llss = map BSC.unpack llbs lenLB = 3 -- useful labels like TOKEN POS ER lenWP = (length $ filter (==" ") llss ) -- each identation takes 2 spaces let lls = map T.pack llss lvl = (length lls) - lenLB - lenWP -- calculate actual level lbls = drop ((length lls) - 3) lls -- extract only lables ConllToken lvl -- reuse id to indicate level, when working with trees (lbls `at` 0) "" UnkCg (parsePosFg $ lbls `at` 1) "" 0 (parseGER $ lbls `at` 2) "" "" parseNode' :: BSC.ByteString -> IO (SnConllToken Text) parseNode' bs = do let llbs = filter (/="|") $ BSC.split ' ' bs llss = map BSC.unpack llbs lenLB = 3 -- useful labels like TOKEN POS ER lenWP = (length $ filter (=="") llss) -- always have 1 in front, each identation takes 2 spaces let llt = map T.pack llss llsf = filter (/="") llt lvl = ((mod lenWP 2))+1 -- calculate actual level lbls = drop ((length llt) - 3) llt -- extract only lables putStrLn $ T.pack $ show $ llss putStrLn $ T.pack $ show $ llsf putStrLn $ T.pack $ show $ lenWP putStrLn $ T.pack $ show $ lvl return $ ConllToken lvl -- reuse id to indicate level, when working with trees (lbls `at` 0) "" UnkCg (parsePosFg $ lbls `at` 1) "" 0 (parseGER $ lbls `at` 2) "" ""