{-# LANGUAGE BangPatterns, RecordWildCards, CPP #-}
-- |
-- Module      : Data.Text.ICU.Break.Pure
-- Copyright   : (c) 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- String breaking functions for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--
-- The text boundary positions are found according to the rules described in
-- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex
-- #14, Line Breaking Properties.  These are available at
-- <http://www.unicode.org/reports/tr14/> and
-- <http://www.unicode.org/reports/tr29/>.

module Data.Text.ICU.Break.Pure
    (
    -- * Types
      Breaker
    , Break
    , brkPrefix
    , brkBreak
    , brkSuffix
    , brkStatus
    , Line(..)
    , Data.Text.ICU.Break.Word(..)
    -- * Breaking functions
    , breakCharacter
    , breakLine
    , breakSentence
    , breakWord
    -- * Iteration
    , breaks
    , breaksRight
    ) where

import Control.DeepSeq (NFData(..))
import Data.Text (Text, empty)
import Data.Text.ICU.Break (Line, Word)
import Data.Text.ICU.Break.Types (BreakIterator(..))
import Data.Text.ICU.Internal (LocaleName, takeWord, dropWord)
import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)
import qualified Data.Text.ICU.Break as IO

-- | A boundary analyser.
newtype Breaker a = B (BreakIterator a)

new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a
new :: forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator a)
act LocaleName
loc = IO (Breaker a) -> Breaker a
forall a. IO a -> a
unsafePerformIO (IO (Breaker a) -> Breaker a) -> IO (Breaker a) -> Breaker a
forall a b. (a -> b) -> a -> b
$ BreakIterator a -> Breaker a
forall a. BreakIterator a -> Breaker a
B (BreakIterator a -> Breaker a)
-> IO (BreakIterator a) -> IO (Breaker a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` LocaleName -> Text -> IO (BreakIterator a)
act LocaleName
loc Text
empty

-- | Break a string on character boundaries.
--
-- Character boundary analysis identifies the boundaries of "Extended
-- Grapheme Clusters", which are groupings of codepoints that should be
-- treated as character-like units for many text operations.  Please see
-- Unicode Standard Annex #29, Unicode Text Segmentation,
-- <http://www.unicode.org/reports/tr29/> for additional information on
-- grapheme clusters and guidelines on their use.
breakCharacter :: LocaleName -> Breaker ()
breakCharacter :: LocaleName -> Breaker ()
breakCharacter = (LocaleName -> Text -> IO (BreakIterator ()))
-> LocaleName -> Breaker ()
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator ())
IO.breakCharacter

-- | Break a string on line boundaries.
--
-- Line boundary analysis determines where a text string can be broken when
-- line wrapping. The mechanism correctly handles punctuation and hyphenated
-- words.
breakLine :: LocaleName -> Breaker Line
breakLine :: LocaleName -> Breaker Line
breakLine = (LocaleName -> Text -> IO (BreakIterator Line))
-> LocaleName -> Breaker Line
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator Line)
IO.breakLine

-- | Break a string on sentence boundaries.
--
-- Sentence boundary analysis allows selection with correct interpretation
-- of periods within numbers and abbreviations, and trailing punctuation
-- marks such as quotation marks and parentheses.
breakSentence :: LocaleName -> Breaker ()
breakSentence :: LocaleName -> Breaker ()
breakSentence = (LocaleName -> Text -> IO (BreakIterator ()))
-> LocaleName -> Breaker ()
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator ())
IO.breakSentence

-- | Break a string on word boundaries.
--
-- Word boundary analysis is used by search and replace functions, as well
-- as within text editing applications that allow the user to select words
-- with a double click. Word selection provides correct interpretation of
-- punctuation marks within and following words. Characters that are not
-- part of a word, such as symbols or punctuation marks, have word breaks on
-- both sides.
breakWord :: LocaleName -> Breaker Data.Text.ICU.Break.Word
breakWord :: LocaleName -> Breaker Word
breakWord = (LocaleName -> Text -> IO (BreakIterator Word))
-> LocaleName -> Breaker Word
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator Word)
IO.breakWord

-- | A break in a string.
data Break a = Break {
      forall a. Break a -> Text
brkPrefix :: {-# UNPACK #-} !Text -- ^ Prefix of the current break.
    , forall a. Break a -> Text
brkBreak :: {-# UNPACK #-} !Text  -- ^ Text of the current break.
    , forall a. Break a -> Text
brkSuffix :: {-# UNPACK #-} !Text -- ^ Suffix of the current break.
    , forall a. Break a -> a
brkStatus :: !a
    -- ^ Status of the current break (only meaningful if 'Line' or 'Word').
    } deriving (Break a -> Break a -> Bool
(Break a -> Break a -> Bool)
-> (Break a -> Break a -> Bool) -> Eq (Break a)
forall a. Eq a => Break a -> Break a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Break a -> Break a -> Bool
$c/= :: forall a. Eq a => Break a -> Break a -> Bool
== :: Break a -> Break a -> Bool
$c== :: forall a. Eq a => Break a -> Break a -> Bool
Eq, Int -> Break a -> ShowS
[Break a] -> ShowS
Break a -> String
(Int -> Break a -> ShowS)
-> (Break a -> String) -> ([Break a] -> ShowS) -> Show (Break a)
forall a. Show a => Int -> Break a -> ShowS
forall a. Show a => [Break a] -> ShowS
forall a. Show a => Break a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Break a] -> ShowS
$cshowList :: forall a. Show a => [Break a] -> ShowS
show :: Break a -> String
$cshow :: forall a. Show a => Break a -> String
showsPrec :: Int -> Break a -> ShowS
$cshowsPrec :: forall a. Show a => Int -> Break a -> ShowS
Show)

instance (NFData a) => NFData (Break a) where
    rnf :: Break a -> ()
rnf Break{a
Text
brkStatus :: a
brkSuffix :: Text
brkBreak :: Text
brkPrefix :: Text
brkStatus :: forall a. Break a -> a
brkSuffix :: forall a. Break a -> Text
brkBreak :: forall a. Break a -> Text
brkPrefix :: forall a. Break a -> Text
..} = a -> ()
forall a. NFData a => a -> ()
rnf a
brkStatus

-- | Return a list of all breaks in a string, from left to right.
breaks :: Breaker a -> Text -> [Break a]
breaks :: forall a. Breaker a -> Text -> [Break a]
breaks (B BreakIterator a
b) Text
t = IO [Break a] -> [Break a]
forall a. IO a -> a
unsafePerformIO (IO [Break a] -> [Break a]) -> IO [Break a] -> [Break a]
forall a b. (a -> b) -> a -> b
$ do
  BreakIterator a
bi <- BreakIterator a -> IO (BreakIterator a)
forall a. BreakIterator a -> IO (BreakIterator a)
IO.clone BreakIterator a
b
  BreakIterator a -> Text -> IO ()
forall a. BreakIterator a -> Text -> IO ()
IO.setText BreakIterator a
bi Text
t
  let go :: TextI -> IO [Break a]
go TextI
p = do
        Maybe TextI
mix <- BreakIterator a -> IO (Maybe TextI)
forall a. BreakIterator a -> IO (Maybe TextI)
IO.next BreakIterator a
bi
        case Maybe TextI
mix of
          Maybe TextI
Nothing -> [Break a] -> IO [Break a]
forall (m :: * -> *) a. Monad m => a -> m a
return []
          Just TextI
n -> do
            a
s <- BreakIterator a -> IO a
forall a. BreakIterator a -> IO a
IO.getStatus BreakIterator a
bi
            let d :: TextI
d = TextI
nTextI -> TextI -> TextI
forall a. Num a => a -> a -> a
-TextI
p
                u :: Text
u = TextI -> Text -> Text
dropWord TextI
p Text
t
            (Text -> Text -> Text -> a -> Break a
forall a. Text -> Text -> Text -> a -> Break a
Break (TextI -> Text -> Text
takeWord TextI
p Text
t) (TextI -> Text -> Text
takeWord TextI
d Text
u) (TextI -> Text -> Text
dropWord TextI
d Text
u) a
s Break a -> [Break a] -> [Break a]
forall a. a -> [a] -> [a]
:) ([Break a] -> [Break a]) -> IO [Break a] -> IO [Break a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` TextI -> IO [Break a]
go TextI
n
  IO [Break a] -> IO [Break a]
forall a. IO a -> IO a
unsafeInterleaveIO (IO [Break a] -> IO [Break a]) -> IO [Break a] -> IO [Break a]
forall a b. (a -> b) -> a -> b
$ TextI -> IO [Break a]
go (TextI -> IO [Break a]) -> IO TextI -> IO [Break a]
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< BreakIterator a -> IO TextI
forall a. BreakIterator a -> IO TextI
IO.first BreakIterator a
bi

-- | Return a list of all breaks in a string, from right to left.
breaksRight :: Breaker a -> Text -> [Break a]
breaksRight :: forall a. Breaker a -> Text -> [Break a]
breaksRight (B BreakIterator a
b) Text
t = IO [Break a] -> [Break a]
forall a. IO a -> a
unsafePerformIO (IO [Break a] -> [Break a]) -> IO [Break a] -> [Break a]
forall a b. (a -> b) -> a -> b
$ do
  BreakIterator a
bi <- BreakIterator a -> IO (BreakIterator a)
forall a. BreakIterator a -> IO (BreakIterator a)
IO.clone BreakIterator a
b
  BreakIterator a -> Text -> IO ()
forall a. BreakIterator a -> Text -> IO ()
IO.setText BreakIterator a
bi Text
t
  let go :: TextI -> IO [Break a]
go TextI
p = do
        Maybe TextI
mix <- BreakIterator a -> IO (Maybe TextI)
forall a. BreakIterator a -> IO (Maybe TextI)
IO.previous BreakIterator a
bi
        case Maybe TextI
mix of
          Maybe TextI
Nothing -> [Break a] -> IO [Break a]
forall (m :: * -> *) a. Monad m => a -> m a
return []
          Just TextI
n -> do
            a
s <- BreakIterator a -> IO a
forall a. BreakIterator a -> IO a
IO.getStatus BreakIterator a
bi
            let d :: TextI
d = TextI
pTextI -> TextI -> TextI
forall a. Num a => a -> a -> a
-TextI
n
                u :: Text
u = TextI -> Text -> Text
dropWord TextI
n Text
t
            (Text -> Text -> Text -> a -> Break a
forall a. Text -> Text -> Text -> a -> Break a
Break (TextI -> Text -> Text
takeWord TextI
n Text
t) (TextI -> Text -> Text
takeWord TextI
d Text
u) (TextI -> Text -> Text
dropWord TextI
d Text
u) a
s Break a -> [Break a] -> [Break a]
forall a. a -> [a] -> [a]
:) ([Break a] -> [Break a]) -> IO [Break a] -> IO [Break a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` TextI -> IO [Break a]
go TextI
n
  IO [Break a] -> IO [Break a]
forall a. IO a -> IO a
unsafeInterleaveIO (IO [Break a] -> IO [Break a]) -> IO [Break a] -> IO [Break a]
forall a b. (a -> b) -> a -> b
$ TextI -> IO [Break a]
go (TextI -> IO [Break a]) -> IO TextI -> IO [Break a]
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< BreakIterator a -> IO TextI
forall a. BreakIterator a -> IO TextI
IO.last BreakIterator a
bi