module Text.MoeDict where
import Data.String
import Data.Text (Text)
import Data.Text.Encoding as E
import qualified Data.Text as T
import qualified Data.Map.Strict as Map
import Data.Aeson
import Data.Aeson.TH
import Data.Maybe (catMaybes)
import Control.Applicative
import Data.HashMap.Strict ((!))
import qualified Data.ByteString.Lazy as B
import Data.Map.Strict (Map)
import Data.List (sortBy)
import Data.Function (on)
type Str = Text
data Pronunciation = Pronunciation { bopomofo :: Str, bopomofo2 :: Str, pinyin :: Str } deriving (Show, Eq, Ord)
$(deriveJSON defaultOptions ''Pronunciation)
newtype Quote = Quote Str deriving (Show, IsString, FromJSON, ToJSON, Eq, Ord)
data Radical = Radical
{ letter :: Char
, strokeCount :: Count
, nonRadicalStrokeCount :: Count
, shapeDescription :: Maybe Text
} deriving (Show, Ord, Eq)
newtype Example = Example Str deriving (Show, IsString, FromJSON, ToJSON, Eq, Ord)
newtype Title = Title { titleText :: Str } deriving (Show, IsString, FromJSON, ToJSON, Ord, Eq)
newtype Link = Link Str deriving (Show, IsString, FromJSON, ToJSON, Eq, Ord)
newtype Count = Count Int deriving (Show, Ord, Eq, FromJSON, ToJSON, Enum)
data Part = Preposition | Pronoun | Adverb | Particle | Verb | Noun | Adjective | Exclamation | Onomatopoeia | Affix | Conjunction | Note
| Slang | Loanword | Derivative
| Composition | Synonym | Antonym
| Cantonese | PuTongHua | None
deriving (Show, Eq, Ord)
data POS = POS { label :: Text, part :: Part } deriving (Show, Eq, Ord)
data Reference = Reference
{ refType :: Maybe POS
, refText :: Text
} deriving (Show, Eq, Ord)
instance FromJSON POS where
parseJSON (String s) = maybe (fail $ show s) (pure . POS s) $ lookup s
[ ("介", Preposition), ("代", Pronoun), ("副", Adverb)
, ("助", Particle), ("動", Verb), ("名", Noun)
, ("形", Adjective), ("歎", Exclamation), ("狀", Onomatopoeia)
, ("綴", Affix), ("連", Conjunction), ("辨似", Note)
, ("俚", Slang), ("外", Loanword), ("衍", Derivative)
, ("孳", Composition), ("同", Synonym), ("反", Antonym)
, ("廣東話", Cantonese), ("普通話", PuTongHua)
, ("", None)
]
parseJSON x = fail $ show x
data Entry = Entry
{ title :: Title
, radical :: Maybe Radical
, heteronyms :: [Heteronym]
, references :: [Reference]
} deriving (Show, Eq, Ord)
data Heteronym = Heteronym
{ pronunciation :: Pronunciation
, definitions :: [Definition]
} deriving (Show, Eq, Ord)
data Definition = Definition
{ definition :: Text
, pos :: Maybe POS
, examples :: [Example]
, quotes :: [Quote]
, links :: [Link]
, antonyms :: [Title]
, synonyms :: [Title]
} deriving (Show, Eq, Ord)
instance FromJSON (Maybe Entry) where
parseJSON j = Just <$> parseJSON j <|> return Nothing
instance FromJSON (Maybe Heteronym) where
parseJSON j = Just <$> parseJSON j <|> return Nothing
instance FromJSON Entry where
parseJSON (Object o) = do
title <- o .: "title"
references <- maybeList <$> o .:? "references"
heteronyms <- catMaybes <$> o .: "heteronyms"
rv <- o .:? "radical"
radical <- case rv of
Nothing -> return Nothing
Just letter -> Just <$> do
strokeCount <- o .: "stroke_count"
nonRadicalStrokeCount <- o .: "non_radical_stroke_count"
shapeDescription <- o .:? "shape_description"
return Radical{..}
return Entry{..}
instance FromJSON Reference where
parseJSON (Object o) = do
pos <- o .:? "type"
txt <- o .: "text"
return $ Reference pos txt
instance FromJSON Definition where
parseJSON (Object o) = do
definition <- o .: "def"
pos <- o .:? "type"
examples <- maybeList <$> o .:? "example"
quotes <- maybeList <$> o .:? "quote"
links <- maybeList <$> o .:? "link"
antonyms <- maybeTitles <$> o .:? "antonyms"
synonyms <- maybeTitles <$> o .:? "synonyms"
return Definition {..}
where
maybeTitles Nothing = []
maybeTitles (Just xs) = Title <$> (T.splitOn "," xs)
maybeList Nothing = []
maybeList (Just xs) = xs
instance ToJSON Definition where
toJSON Definition {..} = error "NYI"
instance FromJSON Heteronym where
parseJSON json@(Object o) = do
pronunciation <- parseJSON json
definitions <- o .: "definitions"
return $ Heteronym { pronunciation, definitions }
instance ToJSON Heteronym where
toJSON Heteronym {..} = object
[ ("pronunciation" .= pronunciation)
, ("definitions" .= definitions)
]
parseMoeDictFile :: FilePath -> IO [Entry]
parseMoeDictFile fn = do
decoded <- eitherDecode <$> B.readFile fn
either fail (return . catMaybes) decoded
type RadicalLetter = Char
type PinYin = T.Text
data HeadWord = HeadWord
{ headChar :: !Char
, headSound :: !PinYin
} deriving (Show, Eq, Ord)
type EntryMap = Map HeadWord [Entry]
data Cluster = Cluster
{ radicalLetter :: RadicalLetter
, headWord :: !HeadWord
, clusterEntries :: [Entry]
} deriving (Show)
entriesToMap :: [Entry] -> Map RadicalLetter EntryMap
entriesToMap entries = Map.fromList $ ms
where
entriesSplitted = concatMap splitHeteronym entries
entriesByHeteronym = [(entryHead e, [e]) | e <- entriesSplitted]
cs :: [Cluster]
cs = map es2cluster $ Map.toList $ Map.fromListWith (++) entriesByHeteronym
ms = map cs2map $ Map.elems $ Map.fromListWith (++) [ (radicalLetter c, [c]) | c <- cs ]
cs2map :: [Cluster] -> (RadicalLetter, EntryMap)
cs2map clusters@(Cluster{radicalLetter}:_) = (radicalLetter, Map.fromList $ map (\x -> (headWord x, clusterEntries x)) clusters)
cs2map _ = error "impossible"
es2cluster :: (HeadWord, [Entry]) -> Cluster
es2cluster (headWord, es) =
case es' of
Entry{radical=Just Radical{letter}}:_ ->
Cluster { radicalLetter = letter, headWord, clusterEntries = es' }
_ -> Cluster { radicalLetter = '?', headWord, clusterEntries = es' }
where
es' = sortBy (compare `on` title) es
splitHeteronym :: Entry -> [Entry]
splitHeteronym Entry{..} = [ Entry{title, radical, references, heteronyms=[h]} | h <- heteronyms ]
entryHead :: Entry -> HeadWord
entryHead Entry{..} = HeadWord {..}
where
headChar = T.head (titleText title)
headSound =
T.dropWhileEnd (== 'r') $
T.takeWhile (/= ' ') $
T.dropWhile (> '\255') $
pinyin (pronunciation (head heteronyms))