{- | Copyright : (c) 2024 Pierre Le Marre Maintainer: dev@wismill.eu Stability : experimental Parser for [CaseFolding.txt](https://www.unicode.org/reports/tr44/#CaseFolding.txt) @since 0.3.0 -} module Unicode.CharacterDatabase.Parser.CaseFolding ( parse, Entry (..), CaseFoldingType (..), ) where import Data.ByteString qualified as B import Data.ByteString.Char8 qualified as B8 import Data.List qualified as L import GHC.Stack (HasCallStack) import Unicode.CharacterDatabase.Parser.Internal ( parseCodePoint, withParser, pattern HashTag, pattern SemiColon, ) {- | An entry from @CaseFolding.txt@ file @since 0.3.0 -} data Entry = Entry { char ∷ !Char , caseFoldingType ∷ !CaseFoldingType , caseFolding ∷ ![Char] } deriving (Eq, Show) {- | Type of case folding @since 0.3.0 -} data CaseFoldingType = -- | /Common case folding/, common mappings shared by both simple and full mappings. CommonCaseFolding | -- | /Full case folding/, mappings that cause strings to grow in length. FullCaseFolding | -- | /Simple case folding/, mappings to single characters where different from 'FullCaseFolding' SimpleCaseFolding | -- | /Special case/ for uppercase I and dotted uppercase I SpecialCaseFolding deriving (Eq, Ord, Show) {- | A parser for @CaseFolding.txt@ >>> parse "0041; C; 0061; # LATIN CAPITAL LETTER A" [Entry {char = 'A', caseFoldingType = CommonCaseFolding, caseFolding = "a"}] >>> parse "0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE" [Entry {char = '\304', caseFoldingType = FullCaseFolding, caseFolding = "i\775"}] >>> parse "0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE" [Entry {char = '\304', caseFoldingType = SpecialCaseFolding, caseFolding = "i"}] @since 0.3.0 -} parse ∷ (HasCallStack) ⇒ B.ByteString → [Entry] parse = L.unfoldr (withParser parseSpecialCasingLine) {- | File format: @; ; ; # @ -} parseSpecialCasingLine ∷ (HasCallStack) ⇒ B.ByteString → Maybe Entry parseSpecialCasingLine line | B.null line || B.head line == HashTag = Nothing | otherwise = Just Entry{..} where (rawChar, line1) = B.span (/= SemiColon) line char = parseCodePoint rawChar (rawCaseFoldType, line2) = B.span (/= SemiColon) (B.tail line1) caseFoldingType = case B8.strip rawCaseFoldType of "C" → CommonCaseFolding "F" → FullCaseFolding "S" → SimpleCaseFolding "T" → SpecialCaseFolding ty → error ("Unsupported case folding type: " <> show ty) (rawCaseFolding, _) = B.span (/= SemiColon) (B.tail line2) caseFolding = toChars rawCaseFolding toChars = fmap parseCodePoint . B8.words