-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/

-- | Tools for processing unstructured text data
--
--   Lightweight and easy-to-use functions for text tokenizing and parsing.
--   The package is aimed at parsing mostly unstructured data, but
--   structured formats may be parsed as well. It may be used in different
--   situations, for DSLs, text markups or even for parsing simple grammars
--   more easily and sometimes faster than with mainstream parser
--   combinators or parser generators. See the README.markdown, the examples
--   and the module documentation for more.
@package fuzzy-parse
@version 0.1.2.0

module Data.Text.Fuzzy.Attoparsec.Day
dayDMY :: Parser Day
dayYMD :: Parser Day
dayYYYYMMDD :: Parser Day
dayDMonY :: Parser Day
day :: Parser Day

module Data.Text.Fuzzy.Attoparsec.Month
fuzzyMonth :: Parser Int
fuzzyMonthFromText :: Text -> Maybe Int

-- | Fuzzy date parsing. Supports a number of date formats and tries to
--   recover incomplete dates from text, using some reasonable assumptions.
--   Does not support locales, i.e. it assumes English-only dates for now.
--
--   Examples
--
--   parseMaybeDay "01.01.1979"
--   Just 1979-01-01
--   parseMaybeDay "01.01.01"
--   Just 2001-01-01
--   parseMaybeDay "13/01/2019"
--   Just 2019-01-13
--   parseMaybeDay "2019-12-1"
--   Just 2019-12-01
--   parseMaybeDay "21-feb-79"
--   Just 1979-02-21
--   parseMaybeDay "21-feb-01"
--   Just 2001-02-21
--   parseMaybeDay "29feb04"
--   Just 2004-02-29
--   parseMaybeDay "21feb28"
--   Just 2028-02-21
--   
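--   A minimal, self-contained usage sketch (added for illustration, not
--   part of the original package docs). It assumes OverloadedStrings and
--   that the parsers in Data.Text.Fuzzy.Attoparsec.Day are
--   Data.Attoparsec.Text parsers:
--   
--   {-# LANGUAGE OverloadedStrings #-}
--   import Data.Attoparsec.Text (parseOnly)
--   import Data.Text.Fuzzy.Dates (parseMaybeDay)
--   import Data.Text.Fuzzy.Attoparsec.Day (day)
--   
--   main :: IO ()
--   main = do
--     -- documented result: Just 1979-02-21
--     print (parseMaybeDay "21-feb-79")
--     -- running the raw attoparsec parser yields Either String Day,
--     -- which keeps the error message on failure
--     print (parseOnly day "13/01/2019")
--   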
module Data.Text.Fuzzy.Dates

-- | Tries to parse a date from the text.
parseMaybeDay :: Text -> Maybe Day

module Data.Text.Fuzzy.Section
cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text]
cutSectionOn :: Text -> Text -> [Text] -> [Text]

-- | A lightweight and multi-functional text tokenizer allowing different
--   types of text tokenization depending on its settings.
--
--   It may be used in different situations, for DSLs, text markups or even
--   for parsing simple grammars more easily and sometimes faster than with
--   mainstream parser combinators or parser generators.
--
--   The primary goal of this package is to parse unstructured text data,
--   however it may be used for parsing such data formats as CSV with ease.
--
--   Currently it supports the following types of entities: atoms, string
--   literals (currently with a minimal set of escaped characters),
--   punctuation characters and delimiters.
--
--   Examples
--
--   Simple CSV-like tokenization
--
--   >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
--   ["aaa "," bebeb "," qqq "]
--   
--
--   >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
--   ["aaa "," bebeb "," qqq ","","","",""]
--   
--
--   >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
--   [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--   
--
--   >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
--   [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--   
--
--   >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
--   
--   >>> tokenize spec "   aaa :   'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
--   [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
--   
--
--   >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
--   
--   >>> tokenize spec "  a  b  c  : 'bebeb:colon inside' : qqq ::::"  :: [Maybe Text]
--   [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
--   
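--   The CSV-style specs above compose into a small reusable helper. A
--   hedged sketch (added for illustration, not from the package docs;
--   fields and parseAll are hypothetical names):
--   
--   {-# LANGUAGE OverloadedStrings #-}
--   import Data.Text (Text)
--   import qualified Data.Text as T
--   import Data.Text.Fuzzy.Tokenize (tokenize, delims, sq, emptyFields, uw, noslits)
--   
--   -- split one colon-separated line into trimmed fields;
--   -- Nothing marks an empty column (courtesy of emptyFields)
--   fields :: Text -> [Maybe Text]
--   fields = tokenize (delims ":" <> sq <> emptyFields <> uw <> noslits)
--   
--   -- split the input into lines first (see the note under Other below)
--   parseAll :: Text -> [[Maybe Text]]
--   parseAll = map fields . T.lines
--   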
--
--   Notes
--
--   About the delimiter tokens
--
--   This type of token appears while processing "delimited" formats and
--   disappears from the results. Currently you will never see it unless
--   normalization is turned off by the nn option.
--
--   The delimiter tokens make sense when processing CSV-like formats, but
--   in that case you probably need only the values in the results.
--
--   This behavior may be changed later, but right now delimiter tokens seem
--   pointless in results. If you process some sort of grammar where the
--   delimiter character is important, you may use punctuation instead, i.e.:
--
--   >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
--   
--   >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
--   ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
--   
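--   The same idea as a reusable function. A hedged sketch (added for
--   illustration, not from the package docs; lexCall is a hypothetical
--   name):
--   
--   {-# LANGUAGE OverloadedStrings #-}
--   import Data.Text (Text)
--   import Data.Text.Fuzzy.Tokenize (tokenize, delims, punct, emptyFields, sq)
--   
--   -- spaces and tabs separate tokens; ',', ';', '(' and ')' come back as
--   -- their own tokens; single-quoted literals keep their inner spaces
--   lexCall :: Text -> [Text]
--   lexCall = tokenize (delims " \t" <> punct ",;()" <> emptyFields <> sq)
--   
--   -- applied to the input shown above, this yields
--   -- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
--   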
--
--   Other
--
--   For CSV-like formats it makes sense to split the text into lines first,
--   otherwise newline characters may lead to weird results.
module Data.Text.Fuzzy.Tokenize

-- | Tokenization settings. Use mempty for an empty value and the
--   construction functions below for changing the settings.
data TokenizeSpec

-- | Typeclass for token values. Note that some tokens appear in the results
--   only when the nn option is set; otherwise sequences of characters are
--   turned into text tokens or string literals, and delimiter tokens are
--   simply removed from the results.
class IsToken a

-- | Create a character token
mkChar :: IsToken a => Char -> a

-- | Create a string literal character token
mkSChar :: IsToken a => Char -> a

-- | Create a punctuation token
mkPunct :: IsToken a => Char -> a

-- | Create a text chunk token
mkText :: IsToken a => Text -> a

-- | Create a string literal token
mkStrLit :: IsToken a => Text -> a

-- | Create a keyword token
mkKeyword :: IsToken a => Text -> a

-- | Create an empty field token
mkEmpty :: IsToken a => a

-- | Create a delimiter token
mkDelim :: IsToken a => a

-- | Create an indent token
mkIndent :: IsToken a => Int -> a

-- | Create an EOL token
mkEol :: IsToken a => a

-- | Tokenize a text
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]

-- | Turn on character escaping inside string literals. Currently the
--   following escaped characters are supported:
--   [ \" \' \t \n \r \a \b \f \v ]
esc :: TokenizeSpec

-- | Raise empty field tokens (see the mkEmpty method) when no tokens are
--   found before a delimiter. Useful for processing CSV-like data in order
--   to distinguish empty columns.
addEmptyFields :: TokenizeSpec

-- | Same as addEmptyFields
emptyFields :: TokenizeSpec

-- | Turns off token normalization. Makes the tokenizer generate a character
--   stream. Useful for debugging.
nn :: TokenizeSpec

-- | Turns on single-quoted string literals. The character stream after a
--   '\'' character will be processed as a single-quoted stream, treating all
--   delimiter, comment and other special characters as part of the string
--   literal until the next unescaped single quote character.
sq :: TokenizeSpec

-- | Enable double-quoted string literal support, just as sq does for
--   single-quoted strings.
sqq :: TokenizeSpec

-- | Disable separate string literals.
--
--   Useful when processing delimited data (CSV-like formats). Normally,
--   sequential text chunks are concatenated together, but a consecutive text
--   chunk and string literal produce two different tokens, and this may
--   cause weird results if the data is in a CSV-like format, i.e.:
--
--   >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
--   [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
--   
--
--   Look: "qq" and " aaa" are turned into two separate tokens, which makes
--   the result of CSV processing look improper, as if it had an extra
--   column. This behavior may be avoided by using this option, if you don't
--   need to distinguish text chunks from string literals:
--
--   >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
--   [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
--   
noslits :: TokenizeSpec

-- | Strip spaces on the left side of a token. Does not affect string
--   literals, i.e. strings are processed normally. Useful mostly for
--   processing CSV-like formats, otherwise delims may be used to skip
--   unwanted spaces.
sl :: TokenizeSpec

-- | Strip spaces on the right side of a token. Does not affect string
--   literals, i.e. strings are processed normally. Useful mostly for
--   processing CSV-like formats, otherwise delims may be used to skip
--   unwanted spaces.
sr :: TokenizeSpec

-- | Strips spaces on the right and left sides and collapses multiple spaces
--   into one. The name originates from unwords . words
--
--   Does not affect string literals, i.e. strings are processed normally.
--   Useful mostly for processing CSV-like formats, otherwise delims may be
--   used to skip unwanted spaces.
uw :: TokenizeSpec

-- | Specify the list of delimiters (characters) used to split the character
--   stream into fields. Useful for CSV-like separated formats. Support for
--   empty fields in the token stream may be enabled by the addEmptyFields
--   function.
delims :: String -> TokenizeSpec

-- | Specify the line comment prefix. All text after the line comment prefix
--   will be ignored until a newline character appears. Multiple line
--   comments are supported.
comment :: Text -> TokenizeSpec

-- | Specify the punctuation characters. Any punctuation character is
--   handled as a separate token. Any token will be broken on a punctuation
--   character.
--
--   Useful for handling ... er ... punctuation, like
--
--   > function(a,b)
--   
--
--   or
--
--   > (apply function 1 2 3)
--   
--
--   >>> tokenize spec "(apply function 1 2 3)" :: [Text]
--   ["(","apply","function","1","2","3",")"]
--   
punct :: Text -> TokenizeSpec

-- | Enable indentation support
indent :: TokenizeSpec

-- | Set the tab expansion multiplier, i.e. each tab expands into n spaces
--   before processing. It also turns on indentation. Only the tabs at the
--   beginning of the string are expanded, i.e. before the first non-space
--   character appears.
itabstops :: Int -> TokenizeSpec

-- | Specify the keywords list. Each keyword will be treated as a separate
--   token.
keywords :: [Text] -> TokenizeSpec

-- | Turns on EOL token generation
eol :: TokenizeSpec

instance GHC.Show.Show Data.Text.Fuzzy.Tokenize.Token
instance GHC.Classes.Ord Data.Text.Fuzzy.Tokenize.Token
instance GHC.Classes.Eq Data.Text.Fuzzy.Tokenize.Token
instance GHC.Base.Monoid w => GHC.Base.Monad (Data.Text.Fuzzy.Tokenize.TokenizeM w)
instance GHC.Base.Monoid w => Control.Monad.State.Class.MonadState () (Data.Text.Fuzzy.Tokenize.TokenizeM w)
instance GHC.Base.Monoid w => Control.Monad.Writer.Class.MonadWriter w (Data.Text.Fuzzy.Tokenize.TokenizeM w)
instance GHC.Base.Monoid w => Control.Monad.Reader.Class.MonadReader Data.Text.Fuzzy.Tokenize.TokenizeSpec (Data.Text.Fuzzy.Tokenize.TokenizeM w)
instance GHC.Base.Functor (Data.Text.Fuzzy.Tokenize.TokenizeM w)
instance GHC.Base.Monoid w => GHC.Base.Applicative (Data.Text.Fuzzy.Tokenize.TokenizeM w)
instance GHC.Show.Show Data.Text.Fuzzy.Tokenize.TokenizeSpec
instance GHC.Classes.Ord Data.Text.Fuzzy.Tokenize.TokenizeSpec
instance GHC.Classes.Eq Data.Text.Fuzzy.Tokenize.TokenizeSpec
instance Data.Text.Fuzzy.Tokenize.IsToken (GHC.Maybe.Maybe Data.Text.Internal.Text)
instance Data.Text.Fuzzy.Tokenize.IsToken Data.Text.Internal.Text
instance GHC.Base.Semigroup Data.Text.Fuzzy.Tokenize.TokenizeSpec
instance GHC.Base.Monoid Data.Text.Fuzzy.Tokenize.TokenizeSpec
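
-- A hedged sketch of a custom token type (added for illustration, not part
-- of the original package docs). It assumes that every mk* constructor
-- listed above is a class method of IsToken, as the signatures suggest, so
-- a richer token type can keep keyword, punctuation and indentation
-- information instead of collapsing everything into Text:
--
--   {-# LANGUAGE OverloadedStrings #-}
--   import Data.Text (Text)
--   import Data.Text.Fuzzy.Tokenize
--
--   data Tok = TChar Char | TSChar Char | TPunct Char
--            | TText Text | TStrLit Text | TKeyword Text
--            | TEmpty | TDelim | TIndent Int | TEol
--            deriving (Eq, Ord, Show)
--
--   instance IsToken Tok where
--     mkChar    = TChar
--     mkSChar   = TSChar
--     mkPunct   = TPunct
--     mkText    = TText
--     mkStrLit  = TStrLit
--     mkKeyword = TKeyword
--     mkEmpty   = TEmpty
--     mkDelim   = TDelim
--     mkIndent  = TIndent
--     mkEol     = TEol
--
--   -- usage: tokenize (delims " " <> punct "()" <> keywords ["let"]) input :: [Tok]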