nkjp-0.2.0: Manipulating the National Corpus of Polish (NKJP)

Safe Haskell: None

Text.NKJP.Named

Contents

Description

Parsing the NKJP named entity layer.

Synopsis

Data types

data Cert Source

An annotator's level of certainty.

Constructors

High 
Medium 
Low 

Instances

data Ptr t Source

A pointer.

Constructors

Local

Of the form #id.

Fields

target :: t
 
Global

Of the form loc#id.

Fields

target :: t
 
location :: t
 

Instances

Functor Ptr 
Show t => Show (Ptr t) 

data Deriv t Source

A derivation structure.

Constructors

Deriv 

Fields

derivType :: t
 
derivFrom :: t
 

Instances

data Para t Source

A paragraph.

Constructors

Para 

Fields

paraID :: t
 
sentences :: [Sent t]
 

Instances

Functor Para 
Show t => Show (Para t) 

data Sent t Source

A sentence.

Constructors

Sent 

Fields

sentID :: t
 
names :: [NE t]
 

Instances

Functor Sent 
Show t => Show (Sent t) 

data NE t Source

A segment element in a file, representing a named entity (NE).

Constructors

NE 

Fields

neID :: t
 
derived :: Maybe (Deriv t)
 
neType :: t
 
subType :: Maybe t
 
orth :: t
 
base :: Either t t

Either Left (a base value) or Right (a when value).

cert :: Cert
 
certComment :: Maybe t
 
ptrs :: [Ptr t]
 

Instances

Functor NE 
Show t => Show (NE t) 

Parsing

parseNamed :: Text -> [Para Text] Source

Parse textual contents of the ann_named.xml file.

readNamed :: FilePath -> IO [Para Text] Source

Parse the stand-alone ann_named.xml file.

readCorpus :: FilePath -> IO [(FilePath, Maybe [Para Text])] Source

Parse all ann_named.xml files from the NCP .tar.gz file. Directories are processed in ascending order of directory name.

readTrees :: FilePath -> IO [Forest (Either (NE Text) (Seg Text))] Source

Parse the NCP .tar.gz corpus, extract all NEs, and translate them into tree form using the mkForest function.

Utilities

mkForest :: Ord t => [Seg t] -> [NE t] -> Forest (Either (NE t) (Seg t)) Source

Make an NE forest from a segment list and a list of NEs. Both lists must correspond to the same sentence.