module NLP.SyntaxNet.SyntaxNet
( readCnll
, readParseTree
) where
import Control.Applicative
import Control.Concurrent
import qualified Control.Exception as E
import Control.Lens hiding (at)
import Control.Monad
import Control.Monad.IO.Class
import Data.Aeson
import Data.Char
import qualified Data.Csv as Csv
import Data.Tree
import Data.Maybe
import Protolude
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BSC
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy as BSL
import Data.Functor ((<$>))
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TEL
import qualified Data.Vector as V
import Safe as S
import Data.String
import Data.ConllToken
import Data.ParsedSentence
import Data.SyntaxTree
import Data.TagLabel
import NLP.SyntaxNet.Types.CoNLL
import NLP.SyntaxNet.Types.ParseTree
readCnll :: FilePath -> IO [SnConllToken Text]
readCnll fpath = do
csvData <- BSL.readFile fpath
case TEL.decodeUtf8' csvData of
Left err -> do
putStrLn $ "error decoding" ++ (show err)
return []
Right dat -> do
let res = (Csv.decodeWith cnllOptions Csv.NoHeader $ TEL.encodeUtf8 dat) :: Either String (V.Vector (SnConllToken Text))
case res of
Left err -> do
putStrLn $ "error decoding" ++ (show err)
return []
Right vals ->
return $ V.toList vals
preprocess :: TL.Text -> TL.Text
preprocess txt = TL.cons '\"' $ TL.snoc escaped '\"'
where escaped = TL.concatMap escaper txt
escaper :: Char -> TL.Text
escaper c
| c == '\t' = "\"\t\""
| c == '\n' = "\"\n\""
| c == '\"' = "\"\""
| otherwise = TL.singleton c
cnllOptions =
Csv.defaultDecodeOptions
{ Csv.decDelimiter = fromIntegral (ord '\t')
}
readParseTree :: FilePath -> IO (Maybe (Tree (SnConllToken Text)))
readParseTree fpath = do
treeData <- BSC.readFile fpath
let ls = BSC.lines treeData
let lls = map ( \x -> BSC.split ' ' x) ls
lln = map parseNode lls
let tree = fromList lln
return $ tree
readParseTree' :: FilePath -> IO ()
readParseTree' fpath = do
treeData <- BSC.readFile fpath
let ls = BSC.lines treeData
mapM_ (putStrLn . T.pack . show ) ls
let lls = map ( \x -> BSC.split ' ' x) ls
lln <- mapM parseNode' ls
tree <- fromList' $ lln
mapM_ (putStrLn . T.pack . show ) lln
putStrLn $ T.pack $ "----\n"
putStrLn $ T.pack $ show $ drawTree' $ fromJust tree
return $ ()
parseNode :: [BSC.ByteString] -> SnConllToken Text
parseNode llbs = do
let llss = map BSC.unpack llbs
lenLB = 3
lenWP = (length $ filter (==" ") llss )
let lls = map T.pack llss
lvl = (length lls) lenLB lenWP
lbls = drop ((length lls) 3) lls
ConllToken lvl
(lbls `at` 0)
""
UnkCg
(parsePosFg $ lbls `at` 1)
""
0
(parseGER $ lbls `at` 2)
""
""
parseNode' :: BSC.ByteString -> IO (SnConllToken Text)
parseNode' bs = do
let llbs = filter (/="|") $ BSC.split ' ' bs
llss = map BSC.unpack llbs
lenLB = 3
lenWP = (length $ filter (=="") llss)
let llt = map T.pack llss
llsf = filter (/="") llt
lvl = ((mod lenWP 2))+1
lbls = drop ((length llt) 3) llt
putStrLn $ T.pack $ show $ llss
putStrLn $ T.pack $ show $ llsf
putStrLn $ T.pack $ show $ lenWP
putStrLn $ T.pack $ show $ lvl
return $ ConllToken lvl
(lbls `at` 0)
""
UnkCg
(parsePosFg $ lbls `at` 1)
""
0
(parseGER $ lbls `at` 2)
""
""