module Data.Char.Devanagari.Tokenizer
( selectTokenizerByContent,
tokenize,
fromDevanagari,
fromIso,
fromHarvard,
fromIast,
Tokenizer,
)
where
import Control.Monad (join)
import Data.Char.Devanagari.DevanagariTokens
import Data.Char.Devanagari.TokenTables
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Maybe (isJust)
import Data.Sequence (Seq (Empty, (:<|)),
empty, (|>))
import Data.Text (Text)
import qualified Data.Text as T
type Tokenizer = (Text -> Seq DevanagariToken)
type ParseMap = Map Text DevanagariToken
harvardKyotoParseMap :: ParseMap
harvardKyotoParseMap :: ParseMap
harvardKyotoParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
harvardKyotoTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)
iastParseMap :: ParseMap
iastParseMap :: ParseMap
iastParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
iastTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)
isoParseMap :: ParseMap
isoParseMap :: ParseMap
isoParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
isoTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)
devanagariParseMap :: ParseMap
devanagariParseMap :: ParseMap
devanagariParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
devanagariIndependentTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
devanagariDependentTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)
parse :: ParseMap -> Text -> Seq DevanagariToken
parse :: ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
pMap Text
s = Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
s ParseMap
pMap Seq DevanagariToken
forall a. Seq a
empty
where
parse1 :: Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 :: Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
str ParseMap
_ Seq DevanagariToken
tokens
| Text
str Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
T.empty = Seq DevanagariToken
tokens
parse1 Text
str ParseMap
parseMap Seq DevanagariToken
tokens =
case Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
3 ParseMap
parseMap of
Just (DevanagariToken
token, Text
rest) -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
rest ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
token)
Maybe (DevanagariToken, Text)
Nothing ->
case Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
2 ParseMap
parseMap of
Just (DevanagariToken
token, Text
rest) -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
rest ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
token)
Maybe (DevanagariToken, Text)
Nothing ->
case Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
1 ParseMap
parseMap of
Just (DevanagariToken
token, Text
rest) -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
rest ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
token)
Maybe (DevanagariToken, Text)
Nothing -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 (Int -> Text -> Text
T.drop Int
1 Text
str) ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> Char -> DevanagariToken
Unmapped ([Char] -> Char
forall a. HasCallStack => [a] -> a
head ([Char] -> Char) -> [Char] -> Char
forall a b. (a -> b) -> a -> b
$ Text -> [Char]
T.unpack (Text -> [Char]) -> Text -> [Char]
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Text
T.take Int
1 Text
str))
tryMatch :: Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch :: Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
n ParseMap
parseMap =
let tok :: Text
tok = Int -> Text -> Text
T.take Int
n Text
str
rest :: Text
rest = (Text, Text) -> Text
forall a b. (a, b) -> b
snd ((Text, Text) -> Text) -> (Text, Text) -> Text
forall a b. (a -> b) -> a -> b
$ Int -> Text -> (Text, Text)
T.splitAt (Text -> Int
T.length Text
tok) Text
str
maybeToken :: Maybe DevanagariToken
maybeToken = Text -> ParseMap -> Maybe DevanagariToken
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
tok ParseMap
parseMap
in case Maybe DevanagariToken
maybeToken of
Just DevanagariToken
token -> (DevanagariToken, Text) -> Maybe (DevanagariToken, Text)
forall a. a -> Maybe a
Just (DevanagariToken
token, Text
rest)
Maybe DevanagariToken
Nothing -> Maybe (DevanagariToken, Text)
forall a. Maybe a
Nothing
fromIast :: Tokenizer
fromIast :: Text -> Seq DevanagariToken
fromIast = ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
iastParseMap
fromIso :: Tokenizer
fromIso :: Text -> Seq DevanagariToken
fromIso = ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
isoParseMap
fromHarvard :: Tokenizer
fromHarvard :: Text -> Seq DevanagariToken
fromHarvard = ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
harvardKyotoParseMap
fromDevanagari :: Tokenizer
fromDevanagari :: Text -> Seq DevanagariToken
fromDevanagari Text
s = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA Seq DevanagariToken
forall a. Seq a
empty (ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
devanagariParseMap Text
s)
where
addExplicitVowA :: Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA :: Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA Seq DevanagariToken
acc Seq DevanagariToken
Empty = Seq DevanagariToken
acc
addExplicitVowA Seq DevanagariToken
acc (cons :: DevanagariToken
cons@(Cons Consonant
_) :<| DevanagariToken
Virama :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
cons) Seq DevanagariToken
xs
addExplicitVowA Seq DevanagariToken
acc (cons :: DevanagariToken
cons@(Cons Consonant
_) :<| vow :: DevanagariToken
vow@(Vow Vowel
_) :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
cons Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
vow) Seq DevanagariToken
xs
addExplicitVowA Seq DevanagariToken
acc (cons :: DevanagariToken
cons@(Cons Consonant
_) :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
cons Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> Vowel -> DevanagariToken
Vow Vowel
A) Seq DevanagariToken
xs
addExplicitVowA Seq DevanagariToken
acc (DevanagariToken
x :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
x) Seq DevanagariToken
xs
tokenize :: Tokenizer
tokenize :: Text -> Seq DevanagariToken
tokenize = (Text -> Text -> Seq DevanagariToken)
-> Text -> Seq DevanagariToken
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join Text -> Text -> Seq DevanagariToken
selectTokenizerByContent
selectTokenizerByContent :: Text -> Tokenizer
selectTokenizerByContent :: Text -> Text -> Seq DevanagariToken
selectTokenizerByContent Text
str
| Text -> Bool
containsDevanagari Text
str = Text -> Seq DevanagariToken
fromDevanagari
| Text -> Bool
containsIso Text
str = Text -> Seq DevanagariToken
fromIso
| Text -> Bool
containsIast Text
str = Text -> Seq DevanagariToken
fromIast
| Bool
otherwise = Text -> Seq DevanagariToken
fromHarvard
where
containsDevanagari :: Text -> Bool
containsDevanagari = [Char] -> Text -> Bool
containsAnyOf ([Char
'\x900' .. Char
'\x963'] [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char
'\x966' .. Char
'\x97F'])
containsIso :: Text -> Bool
containsIso = [Char] -> Text -> Bool
containsAnyOf ([Char]
"ēōṁ" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char
'\0325', Char
'\0304'])
containsIast :: Text -> Bool
containsIast = [Char] -> Text -> Bool
containsAnyOf ([Char
'\241' .. Char
'\363'] [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char
'\7693' .. Char
'\7789'])
containsAnyOf :: [Char] -> Text -> Bool
containsAnyOf :: [Char] -> Text -> Bool
containsAnyOf [Char]
chars Text
text = (Char -> Bool) -> Text -> Bool
T.any (Char -> Text -> Bool
`isInfixOf` Text
text) (Text -> Bool) -> Text -> Bool
forall a b. (a -> b) -> a -> b
$ [Char] -> Text
T.pack [Char]
chars
isInfixOf :: Char -> Text -> Bool
isInfixOf Char
c Text
text = Maybe Char -> Bool
forall a. Maybe a -> Bool
isJust ((Char -> Bool) -> Text -> Maybe Char
T.find (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c) Text
text)