module Data.Chinese.Frequency
( SubtlexMap
, SubtlexEntry(..)
, subtlex
, Data.Chinese.Frequency.lookup
) where
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as L
import Data.FileEmbed
import Data.Map (Map)
import qualified Data.Map.Strict as M
import Data.Maybe (fromMaybe, listToMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Encoding
import Text.Read (readMaybe)
-- | Frequency table keyed by the UTF-8 encoded word
-- (see 'Data.Chinese.Frequency.lookup', which encodes its 'Text' key).
type SubtlexMap = Map B.ByteString RawEntry

-- | Internal, compact representation of one SUBTLEX row.  The word
-- itself is not stored here; it lives in the map key.
data RawEntry = RawEntry
  { rawEntryIndex :: !Int       -- ^ Row index in the source file (0-based, header excluded).
  , rawEntryWCount :: !Int      -- ^ Raw occurrence count (column 4 of the TSV — TODO confirm column semantics).
  , rawEntryWMillion :: !Double -- ^ Occurrences per million words (column 5 — TODO confirm).
  }
-- | Public view of a frequency entry, pairing the looked-up word with
-- its statistics.  Built on demand by 'Data.Chinese.Frequency.lookup'.
data SubtlexEntry = SubtlexEntry
  { subtlexIndex :: !Int        -- ^ Row index in the source file.
  , subtlexWord :: !T.Text      -- ^ The word that was looked up.
  , subtlexWCount :: !Int       -- ^ Raw occurrence count.
  , subtlexWMillion :: !Double  -- ^ Occurrences per million words.
  } deriving ( Show )
-- | Parse one tab-separated SUBTLEX row into a 'RawEntry'.
--
-- Column 4 holds the raw word count and column 5 the per-million
-- frequency.  A missing or unparseable count falls back to 1 (matching
-- the original parser's default) and a missing/unparseable frequency
-- falls back to 0, so a short or malformed row no longer crashes via
-- the partial '!!' / 'read'.
toEntry :: Int -> B.ByteString -> RawEntry
toEntry idx row = RawEntry
  { rawEntryIndex = idx
  , rawEntryWCount = maybe 1 fst (B8.readInt =<< column 4)
  , rawEntryWMillion = fromMaybe 0 (readMaybe . B8.unpack =<< column 5)
  }
  where
    -- 9 is the ASCII code of '\t'.
    chunks = B.split 9 row
    -- Total replacement for 'chunks !! i'.
    column i = listToMaybe (drop i chunks)
-- | Look up a word in the frequency map.  The stored 'RawEntry' is
-- expanded into a 'SubtlexEntry' carrying the queried word itself.
-- Returns 'Nothing' when the word is not in the table.
lookup :: Text -> SubtlexMap -> Maybe SubtlexEntry
lookup key m =
    toSubtlex <$> M.lookup (encodeUtf8 key) m
  where
    toSubtlex (RawEntry n c f) = SubtlexEntry
      { subtlexIndex = n
      , subtlexWord = key
      , subtlexWCount = c
      , subtlexWMillion = f
      }
-- | Build the frequency map from raw TSV rows, keyed by the first
-- (word) column.  Duplicate words are merged: the smallest row index
-- wins, while counts and per-million frequencies are summed.
mkSubtlexMap :: [B.ByteString] -> SubtlexMap
mkSubtlexMap rows = M.fromListWith merge
    [ (word, toEntry n row)
    | (n, row) <- zip [0..] rows
      -- Pattern-match generator: empty rows ('B.split' yields [] for
      -- an empty ByteString) are skipped without the partial 'head'.
    , word : _ <- [B.split 9 row]
    ]
  where
    merge (RawEntry n1 c1 m1) (RawEntry n2 c2 m2) =
      RawEntry (min n1 n2) (c1 + c2) (m1 + m2)
-- | The SUBTLEX-CH frequency table, embedded into the binary at
-- compile time and parsed into a 'SubtlexMap'.
subtlex :: SubtlexMap
subtlex = mkSubtlexMap dataRows
  where
    utfData = $(embedFile "data/SUBTLEX_CH_131210_CE.utf8")
    -- 0xa is '\n'; the first line (presumably the header row —
    -- TODO confirm) is discarded.
    dataRows = drop 1 (B.split 0xa utfData)