{- This file is part of text-position. - - Written in 2015 by fr33domlover . - - ♡ Copying is an act of love. Please copy, reuse and share. - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along - with this software. If not, see - . -} module Data.Position.Interface ( -- * Special Positions zeroPosition , firstPosition -- * Special Advances , emptyAdvance , defaultAdvance -- * Creating Advances , psymAdvance , symAdvance , linecharAdvance , stringAdvance , newlineAdvance , commonAdvance , (<++>) -- * Applying Advances , tryAdvance , tryAdvanceC , advance , advanceC -- * Utilities Based on Advances , defaultAnnotate , enrichOnce , enrichOnceD , enrich , enrichD , bless , tokens , textInfo ) where import Data.List (isPrefixOf, mapAccumL) import Data.Maybe (fromJust, maybe) import Data.Position.Types import Text.Regex.Applicative ------------------------------------------------------------------------------- -- Special Positions ------------------------------------------------------------------------------- -- | The position before the first character in a file, to be used as an -- initial value before reading actual characters. zeroPosition :: Position zeroPosition = Position 1 0 0 -- | The position of the first character in a file. firstPosition :: Position firstPosition = Position 1 1 1 ------------------------------------------------------------------------------- -- Special Advances ------------------------------------------------------------------------------- -- | The zero advance. It doesn't match any input and doesn't consume any -- characters. Applying it doesn't change the position. emptyAdvance :: Advance s emptyAdvance = empty -- | The default advance when reading a character, e.g. a letter or a digit. -- The new character would have column number higher by 1, and character index -- higher by once (advances by 1 for each character read). The pattern accepts -- any single character. defaultAdvance :: Advance s defaultAdvance = f <$ anySym where f (Position l c ch) = Position l (c + 1) (ch + 1) ------------------------------------------------------------------------------- -- Creating Advances ------------------------------------------------------------------------------- -- | Create an advance for a single character based on a predicate. psymAdvance :: (s -> Bool) -> (Position -> Position) -> Advance s psymAdvance p a = a <$ psym p -- | Create an advance for the given character. symAdvance :: Eq s => s -> (Position -> Position) -> Advance s symAdvance c = psymAdvance (c ==) -- | Create an advance for a line character with the specified width. This is -- mainly useful for tabs and perhaps the various space characters in Unicode. -- Example for tab: -- -- > tabAdv = linecharAdvance '\t' 8 linecharAdvance :: Eq s => s -- ^ The character -> Int -- ^ How many columns the character takes -> Advance s linecharAdvance c width = symAdvance c f where f (Position l c ch) = Position l (c + width) (ch + 1) -- | Create an advance for the given character sequence. stringAdvance :: Eq s => [s] -> (Position -> Position) -> Advance s stringAdvance s a = a <$ string s -- | Create an advance for a character or sequence of characters expressing a -- newline, i.e. starting a new line. As the advance expresses the position -- /after/ the character, applying the advance results with a position at -- column 1. newlineAdvance :: Eq s => [s] -> Advance s newlineAdvance s = stringAdvance s f where f (Position l c ch) = Position (l + 1) 1 (ch + length s) -- | Create a set of common advances supporting tabs and newlines. More -- advances can easily be added by @<|>@ing them to the result. The result -- doesn't include the default advance. commonAdvance :: Int -- ^ Tab width (usually 2, 4 or 8) -> Bool -- ^ Whether carriage return (CR) counts as a newline -> Bool -- ^ Whether linefeed (LF) counts as a newline -> Bool -- ^ Whether the sequence CR LF counts as a newline -> Bool -- ^ Whether formfeed (FF) counts as a newline -> Advance Char commonAdvance tab cr lf crlf ff = foldr (<|>) tabAdv nlAdv where tabAdv = linecharAdvance '\t' tab nlAdv = [ adv | (adv, True) <- zipList ] zipList = zip (map newlineAdvance ["\r\n", "\r", "\n", "\f"]) [crlf, cr, lf, ff] -- | Concatenate two advances into a single advance accepting their patterns -- in order, and applying the advances on top of each other. For example, -- concatenating an advance for @'a'@ and an advance for @'b'@ results with an -- advance accepting @"ab"@ and moving the position 2 columns forward. (<++>) :: Advance s -> Advance s -> Advance s a <++> b = flip (.) <$> a <*> b infixl 4 <++> ------------------------------------------------------------------------------- -- Applying Advances ------------------------------------------------------------------------------- -- | Given a list of remaining characters to read, the next position in the -- file and a set of advance rules, try to consume characters once and -- determine what is the next position after reading them. Example: -- -- >>> tryAdvance defaultAdvance (Position 1 1 1) "abc" -- (Position 1 2 2,"bc") -- -- If there is no match, it returns the input position and the input list, i.e. -- no characters will be consumed. tryAdvance :: Advance s -> Position -> [s] -> (Position, [s]) tryAdvance a p l = case findFirstPrefix a l of Nothing -> (p, l) Just (adv, rest) -> (adv p, rest) -- | Like 'tryAdvance', but reads one character at most. In the general case -- you'll want to use 'tryAdvance', because 'tryAdvanceC' breaks chains. For -- example, while 'tryAdvance' can recognize @"\r\n"@ as a single newline, -- 'tryAdvanceC' will consume only the @'\r'@, splitting the string into 2 -- newlines. -- -- If there is no match, the input position is returned. tryAdvanceC :: Advance s -> Position -> s -> Position tryAdvanceC a p s = fst $ tryAdvance a p [s] -- | Given a list of remaining characters to read, the next position in the -- file and a set of advance rules, consume characters once and determine what -- is the next position after reading them. -- -- The 'defaultAdvance' is appended (using '<|>') to the given advance. -- Therefore, if the given list isn't empty, at leat character will be -- consumed. The intended use is to encode all the special cases (tab, -- newlines, non-spacing marks, etc.) in the given advance, and let the -- 'defaultAdvance' catch the rest. advance :: Advance s -> Position -> [s] -> (Position, [s]) advance a = tryAdvance (a <|> defaultAdvance) -- | Like 'advance', but reads exactly one character. Patterns which require -- more than one character fail to match. Like 'tryAdvanceC', but has the -- 'defaultAdvance' appended, which means is always consumes given a non-empty -- list. advanceC :: Advance s -> Position -> s -> Position advanceC a p s = fst $ advance a p [s] ------------------------------------------------------------------------------- -- Utilities based on Advances ------------------------------------------------------------------------------- -- | Given the next position and a list matched there, annotate the symbols -- with position information. For a single character, it is simply the given -- position. For a sequence, this annotation assigns all the symbols the same -- line and column, incrementing only the character index. -- -- >>> defaultAnnotate (Position 1 1 1) "a" -- [Positioned 'a' (Position 1 1 1)] -- -- >>> defaultAnnotate (Position 1 1 1) "\r\n" -- [Positioned '\r' (Position 1 1 1), Positioned '\n' (Position 1 1 2)] -- -- The last example would give the same positions to any list of the same -- length, e.g. @"ab"@ instead of @"\r\n"@. defaultAnnotate :: Position -> [s] -> [Positioned s] defaultAnnotate p [] = [] defaultAnnotate p (c:cs) = Positioned c p : defaultAnnotate (f p) cs where f (Position l c ch) = Position l c (ch + 1) -- | Given an advance rule, the next available position and a symbol list, -- consume symbols once. Return a list of them, annotated with position -- information, as well as the next position and the rest of the input. -- On empty input, return @[]@, the given position and the input list. -- -- If more than one character is matched, the sequence is annotated with -- consecutive character indices, but with the same line and column. -- -- >>> enrichOnce (newlineAdvance "\r\n") (Position 1 1 1) "\r\nhello" -- ( [ Positioned '\r' (Position 1 1 1) -- , Positioned '\n' (Position 1 1 2) -- ] -- , Position 2 1 3 -- , "hello" -- ) enrichOnce :: Advance s -> Position -> [s] -> ([Positioned s], Position, [s]) enrichOnce = enrichOnceD defaultAnnotate defaultAdvance -- | Given an advance rule, the next available position and a symbol list, try -- to consume symbols once. If consumed, return a list of them, annotated with -- position information, as well as the next position and the rest of the -- input. Otherwise, return @[]@, the given position and the input list. -- -- If more than one character is matched, the sequence is annotated using the -- function passed as the first parameter. -- -- >>> let ann = defaultAnnotate; adv = empty -- >>> enrichOnceD ann adv (newlineAdvance "\r\n") (Position 1 1 1) "\r\nhello" -- ( [ Positioned '\r' (Position 1 1 1) -- , Positioned '\n' (Position 1 1 2) -- ] -- , Position 2 1 3 -- , "hello" -- ) enrichOnceD :: (Position -> [s] -> [Positioned s]) -- ^ annotation function -> Advance s -- ^ default advance -> Advance s -- ^ advance rule -> Position -- ^ initial position -> [s] -- ^ input list -> ([Positioned s], Position, [s]) enrichOnceD ann def adv pos syms = f $ findFirstPrefix re syms where re = g <$> withMatched (adv <|> def) g (apply, l) = (apply pos, ann pos l) f Nothing = ([], firstPosition, syms) f (Just ((p, s), rest)) = (s, p, rest) -- | Given a list of symbols, annotate it with position based on advance rules. -- Each symbol is annotated with its position in the text. In addition to the -- annotated list, the next available position is returned (i.e. the position -- of the next symbol, if another symbol were appended to the list). -- -- >>> enrich defaultAdvance "abc" -- ( [ Positioned 'a' (Position 1 1 1)) -- , Positioned 'b' (Position 1 2 2)) -- ] -- , Position 1 3 3 -- ) -- -- It is implemented using the 'defaultAdvance' as a default, i.e. the entire -- list is always consumed. enrich :: Advance s -> [s] -> ([Positioned s], Position) enrich adv = f . enrichD defaultAnnotate defaultAdvance adv where f (ps, p, _) = (ps, p) -- | Like 'enrich', but takes an annotation function as the first parameter, -- and a default advance as the second parameter. The rest of the parameters -- are the same ones 'enrich' takes. It allows using custom defaults. To have -- no default advance, pass 'empty'. -- -- Since a match of the whole list isn't guaranteed, there is an additional -- list in the return type, containing the rest of the input. If the entire -- input is matched, that list will be @[]@. If no input is matched at all, -- the annotated list is @[]@, the position is 'firstPosition' and the -- additional list (rest of input) is the input list. enrichD :: (Position -> [s] -> [Positioned s]) -> Advance s -> Advance s -> [s] -> ([Positioned s], Position, [s]) enrichD ann def adv syms = f ([], firstPosition, syms) where g = enrichOnceD ann def adv f acc@(ps, p, s) = let (ps', p', s') = g p s in if null ps' then acc else f (ps ++ ps', p', s') -- | Given a regex, create an equivalent position-aware regex. The resulting -- regex reads position-tagged symbols, and returns a position-tagged result. bless :: RE s a -> PosRE s a bless re = g <$> withMatched (comap f re) where f (Positioned c _) = c g (val, []) = Positioned val zeroPosition g (val, Positioned _ p : _) = Positioned val p -- | Tokenize an input list and get list of tokens. If there was an error (no -- regex match), get the text position at which it happened. tokens :: Advance s -- ^ Advance rule for position tagging, e.g. made with -- 'commonAdvance' -> RE s a -- ^ Regex which selects and returns a single token -> [s] -- ^ Input list of symbols -> ( [Positioned a] , Maybe (Positioned s) ) -- ^ List of tokens matched. If the entire input was -- matched, the second element is 'Nothing'. Otherwise, -- it is the (position-tagged) symbol at which matching -- failed. tokens adv re syms = let re' = many $ bless re syms' = fst $ enrich adv syms in case findFirstPrefix re' syms' of Nothing -> ([], Just $ head syms') Just (list, []) -> (list, Nothing) Just (list, x:_) -> (list, Just x) -- | Get some numbers describing the given text (list of symbols): -- -- * The total number of lines -- * The length (number of columns) of the last line -- * The total number of characters -- -- Note that this probably isn't the fastest implementation. It's possible to -- compute directly by counting the lines and the characters. This function is -- here anyway, as a demonstration of using this library. -- -- >>> let adv = commonAdvance 4 True True True True -- >>> textInfo adv "Hello world!\nHow are you?\nWonderful!" -- (3,11,36) textInfo :: Advance s -> [s] -> (Int, Int, Int) textInfo adv syms = g $ f <$> many (adv <|> defaultAdvance) where f flist = h $ foldl (flip (.)) id flist $ firstPosition g re = fromJust $ match re syms h (Position l c ch) = (l, c - 1, ch - 1)