-- | -- Module : Data.Kanji -- Copyright : (c) Colin Woodbury, 2015, 2016 -- License : GPL3 -- Maintainer: Colin Woodbury -- -- A library for analysing the density of Kanji in given texts, -- according to their "Level" classification, as defined by the -- Japan Kanji Aptitude Testing Foundation (日本漢字能力検定協会). module Data.Kanji ( -- * Kanji AsKanji(..) , Kanji(..) , allKanji , isKanji , hasLevel , kanjiDensity , elementaryKanjiDensity , percentSpread -- * Levels , Level(..) , Rank(..) , level , levels , isKanjiInLevel , levelDist , levelFromRank , averageLevel ) where import Data.List (sort, group) import qualified Data.Set as S import Lens.Micro import Data.Kanji.Level.EighthQ import Data.Kanji.Level.FifthQ import Data.Kanji.Level.FourthQ import Data.Kanji.Level.NinthQ import Data.Kanji.Level.PreSecondQ import Data.Kanji.Level.SecondQ import Data.Kanji.Level.SeventhQ import Data.Kanji.Level.SixthQ import Data.Kanji.Level.TenthQ import Data.Kanji.Level.ThirdQ import Data.Kanji.Types --- -- | All Kanji, grouped by their Level (級) in ascending order. -- Here, ascending order means from the lowest to the highest level, -- meaning from 10 to 1. allKanji :: [[Kanji]] allKanji = map asKanji ks where ks = [tenthQ, ninthQ, eighthQ, seventhQ, sixthQ, fifthQ, fourthQ, thirdQ, preSecondQ, secondQ] -- | Is the `Level` of a given `Kanji` known? hasLevel :: Kanji -> Bool hasLevel k = has _Just $ level k -- | What is the density @d@ of Kanji characters in a given String-like -- type, where @0 <= d <= 1@? kanjiDensity :: AsKanji a => a -> [Kanji] -> Float kanjiDensity orig ks = fromIntegral (length ks) / len orig -- | As above, but only Kanji of the first 1006 are counted (those learned -- in elementary school in Japan). elementaryKanjiDensity :: [Kanji] -> Float elementaryKanjiDensity ks = foldl (\acc (_,p) -> acc + p) 0 elementaryQs where elementaryQs = filter (\(qn,_) -> qn `elem` [Five, Six ..]) dists dists = levelDist ks makeLevel :: [Kanji] -> Rank -> Level makeLevel ks n = Level (S.fromDistinctAscList ks) n -- | All `Level`s, with all their `Kanji`, ordered from Level-10 to Level-2. levels :: [Level] levels = map f $ zip allKanji [Ten ..] where f (ks,n) = makeLevel ks n -- | What `Level` does a Kanji belong to? level :: Kanji -> Maybe Level level = level' levels where level' [] _ = Nothing level' (q:qs) k | isKanjiInLevel q k = Just q | otherwise = level' qs k -- | Does a given `Kanji` belong to the given `Level`? isKanjiInLevel :: Level -> Kanji -> Bool isKanjiInLevel q k = S.member k $ _allKanji q -- | Is there a `Level` that corresponds with a given `Rank` value? levelFromRank :: Rank -> Maybe Level levelFromRank = levelFromRank' levels where levelFromRank' [] _ = Nothing levelFromRank' (q:qs) qn | _rank q == qn = Just q | otherwise = levelFromRank' qs qn -- | Find the average `Level` of a given set of `Kanji`. averageLevel :: [Kanji] -> Float averageLevel ks = average ranks where ranks = ks ^.. each . to level . _Just . to _rank . to fromRank average ns = (sum ns) / (fromIntegral $ length ns) -- | How much of each `Level` is represented by a group of Kanji? levelDist :: [Kanji] -> [(Rank,Float)] levelDist ks = map toNumPercentPair $ group sortedRanks where sortedRanks = sort $ ks ^.. each . to level . _Just . to _rank toNumPercentPair qns = (head qns, length' qns / length' ks) length' n = fromIntegral $ length n -- | The distribution of each `Kanji` in a set of them. -- The distribution values must sum to 1. percentSpread :: [Kanji] -> [(Kanji,Float)] percentSpread ks = map getPercent kQuants where getPercent (k,q) = (k, fromIntegral q / totalKanji) kQuants = kanjiQuantities ks totalKanji = fromIntegral $ foldl (\acc (_,q) -> q + acc) 0 kQuants -- | Determines how many times each `Kanji` appears in given set of them. kanjiQuantities :: [Kanji] -> [(Kanji,Int)] kanjiQuantities = map (\ks -> (head ks, length ks)) . group . sort