module GF.Text.Lexing (stringOp,opInEnv) where
import GF.Text.Transliterations
import Data.Char (isSpace,toUpper,toLower)
import Data.List (intersperse)
stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
stringOp good name = case name of
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
"lextext" -> Just $ appLexer (lexText good)
"lexcode" -> Just $ appLexer lexCode
"lexmixed" -> Just $ appLexer (lexMixed good)
"lexgreek" -> Just $ appLexer lexAGreek
"lexgreek2" -> Just $ appLexer lexAGreek2
"words" -> Just $ appLexer words
"bind" -> Just $ appUnlexer (unwords . bindTok)
"unchars" -> Just $ appUnlexer concat
"unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok)
"unlexcode" -> Just $ appUnlexer unlexCode
"unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
"unlexgreek" -> Just $ appUnlexer unlexAGreek
"unlexnone" -> Just id
"unlexid" -> Just id
"unwords" -> Just $ appUnlexer unwords
"to_html" -> Just wrapHTML
_ -> transliterate name
opInEnv :: String -> String -> (String -> String) -> (String -> String)
opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where
chop mk@(lg, mark) s0 s =
let (tag,rest) = splitAt lg s in
if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest
else case s of
c:cs -> chop mk (c:s0) cs
[] -> [reverse s0]
switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg)
(lbeg,lend) = (length beg, length end)
altern m ts = case ts of
t:ws | not m && t==beg -> t : altern True ws
t:ws | m && t==end -> t : altern False ws
t:ws -> (if m then op t else t) : altern m ws
[] -> []
appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f
appUnlexer :: ([String] -> String) -> String -> String
appUnlexer f = f . words
wrapHTML :: String -> String
wrapHTML = unlines . tag . intersperse "<br>" . lines where
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
lexText :: (String -> Bool) -> String -> [String]
lexText good = lexText' (uncapitInit good)
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
w:ws -> uncap1 w:ws
_ -> s
unlexText :: [String] -> String
unlexText = capitInit . unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isPunct c -> w ++ [c]
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
_ -> []
bindTok :: [String] -> [String]
bindTok ws = case ws of
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
"&+":ws -> bindTok ws
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
"&|":ws -> bindTok ws
w:ws -> w:bindTok ws
[] -> []
lexCode :: String -> [String]
lexCode ss = case lex ss of
[(w@(_:_),ws)] -> w : lexCode ws
_ -> []
lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in w : lext cs
[] -> []
lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in case cs of
'.':'.':d:ds | isSpace d
-> (w++['.']) : lext ('.':d:ds)
'.':d:ds | isAGreekPunct d || isSpace d
-> (w++['.']) : lext (d:ds)
'.':d:ds | not (isSpace d)
-> case lext (d:ds) of
e:es -> (w++['.']++e) : es
es -> (w++['.']) : es
'.':[] -> (w++['.']) : []
_ -> w : lext cs
[] -> []
unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isAGreekPunct c -> w ++ [c]
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
[] -> []
isAGreekPunct = flip elem ".,;··"
lexAGreek :: String -> [String]
lexAGreek = fromAGreek . lexTextAGreek
lexAGreek2 :: String -> [String]
lexAGreek2 = fromAGreek . lexTextAGreek2
unlexAGreek :: [String] -> String
unlexAGreek = unlexTextAGreek . toAGreek
normalize :: String -> String
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)
fromAGreek :: [String] -> [String]
fromAGreek s = case s of
w:[]:vs -> w:[]:(fromAGreek vs)
w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
w:v:vs | wasEnclitic v && wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:v:vs | wasEnclitic v && wasProclitic w ->
getProclitic w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && (hasEndCircum w ||
(hasEndAcute w && hasSingleAccent w)) ->
w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && hasEndAcute w ->
dropLastAccent w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:ws -> (toAcute w) : (fromAGreek ws)
ws -> ws
denormalize :: String -> String
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)
toAGreek :: [String] -> [String]
toAGreek s = case s of
w:[]:vs -> w:[]:(toAGreek vs)
w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)
w:v:vs | isEnclitic v && isEnclitic w ->
addAcute w : toAGreek (dropAccent v:vs)
w:v:vs | isEnclitic v && isProclitic w ->
addAcute w: (toAGreek (dropAccent v:vs))
w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
w:(toAGreek (dropAccent v:vs))
w:v:vs | isEnclitic v && hasPrefinalAcute w ->
w:v: toAGreek vs
w:v:vs | isEnclitic v ->
(addAcute w):(toAGreek (dropAccent v:vs))
w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
w:ws -> (toGrave w) : (toAGreek ws)
ws -> ws
toGrave :: String -> String
toGrave = reverse . grave . reverse where
grave s = case s of
'\'':cs -> '`':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: grave cs
_ -> s
toAcute :: String -> String
toAcute = reverse . acute . reverse where
acute s = case s of
'`':cs -> '\'':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: acute cs
_ -> s
isAGreekVowel = flip elem "aeioyhw"
enclitics = [
"moy","moi","me",
"soy","soi","se",
"oy(","oi(","e(",
"tis*","ti","tina'",
"tino's*","tini'",
"tine's*","tina's*",
"tinw~n","tisi'","tisi'n",
"poy","poi",
"pove'n","pws*",
"ph|","pote'",
"ge","te","toi",
"nyn","per","pw"
]
proclitics = [
"o(","h(","oi(","ai(",
"e)n","ei)s*","e)x","e)k",
"ei)","w(s*",
"oy)","oy)k","oy)c"
]
isEnclitic = flip elem enclitics
isProclitic = flip elem proclitics
wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
++ (map dropAccent (filter hasAccent enclitics))
accented = (filter hasAccent enclitics)
++ map addAcute (filter (not . hasAccent) enclitics)
in flip elem (accented ++ unaccented)
wasProclitic = flip elem (map addAcute proclitics)
getEnclitic =
let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
++ (map addAcute (filter (not . hasAccent) enclitics)))
(enclitics ++ (filter hasAccent enclitics)
++ (filter (not . hasAccent) enclitics))
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
getProclitic =
let pairs = zip (map addAcute proclitics) proclitics
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
dropAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
dropLastAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
addAcute :: String -> String
addAcute = reverse . acu . reverse where
acu w = case w of
c:cs | c == '\'' -> c:cs
c:cs | c == '(' -> '\'':c:cs
c:cs | c == ')' -> '\'':c:cs
c:cs | isAGreekVowel c -> '\'':c:cs
c:cs -> c : acu cs
_ -> w
hasEndAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> True
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasEndCircum = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False
'`':cs -> False
'~':cs -> True
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasPrefinalAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False
'`':cs -> False
'~':cs -> False
c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
c:cs | isAGreekVowel c -> findNext cs
_:cs -> find cs where
findNext s = case s of
[] -> False
'\'':cs -> True
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> findNext cs where
hasSingleAccent v =
hasAccent v && not (hasAccent (dropLastAccent v))
hasAccent v = case v of
[] -> False
c:cs -> elem c ['\'','`','~'] || hasAccent cs
enclitics_expls =
"sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":
"sofo's tis*":"sofoi' tine's*":
"ei) tis*":"ei) tine's*":
"a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":
"doy~los* tis*":"doy~loi tine's*":
"lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":
"ei) poy tis* tina' i)'doi":
[]
unlexCode :: [String] -> String
unlexCode s = case s of
w:[] -> w
[c]:cs | isParen c -> [c] ++ unlexCode cs
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
w:ws -> w ++ " " ++ unlexCode ws
_ -> []
lexLatexCode :: String -> [String]
lexLatexCode = restoreBackslash . lexCode where
restoreBackslash ws = case ws of
"\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
w:ww -> w:restoreBackslash ww
_ -> ws
lexMixed :: (String -> Bool) -> String -> [String]
lexMixed good = concat . alternate False [] where
alternate env t s = case s of
'$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
'\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
c:cs -> alternate env (c:t) cs
_ -> [lex env (reverse t)]
lex env = if env then lexLatexCode else lexText good
unlexMixed :: (String -> Bool) -> [String] -> String
unlexMixed good = capitInit . concat . alternate False where
alternate env s = case s of
_:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
(t,[]) -> unlex env t : []
(t,c:m) -> unlex env t : sep env c m : alternate (not env) m
_ -> []
unlex env = if env then unlexCode else (uncapitInit good . unlexText)
sep env c m = case (m,env) of
([p]:_,True) | isPunct p -> c
(_, True) -> c ++ " "
_ -> " " ++ c
capitInit s = case s of
c:cs -> toUpper c : cs
_ -> s
uncapitInit good s =
case s of
c:cs | not (good s) -> toLower c : cs
_ -> s
unquote = map unq where
unq s = case s of
'"':cs@(_:_) | last cs == '"' -> init cs
_ -> s
isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"