{- |
Copyright   : (c) 2024 Pierre Le Marre
Maintainer  : dev@wismill.eu
Stability   : experimental

Parser for [DerivedName.txt](https://www.unicode.org/reports/tr44/#Derived_Extracted)

@since 0.3.0
-}
module Unicode.CharacterDatabase.Parser.Extracted.DerivedName (
  parse,
  Entry (..),
) where

import Control.Exception (assert)
import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.ByteString.Short qualified as BS
import Data.List qualified as L
import GHC.Stack (HasCallStack)
import Unicode.CharacterDatabase.Parser.Internal (
  parseCodePointRange,
  withParser,
  pattern Asterisk,
  pattern HashTag,
  pattern SemiColon,
 )
import Unicode.CharacterDatabase.Parser.Internal qualified as I

{- | One record of the @DerivedName.txt@ file.

A record is either a single named character or a span of characters whose
name field ends with an asterisk; for the latter only the text before the
asterisk is kept.

@since 0.3.0
-}
data Entry
  = -- | A single code point together with its name.
    SingleChar
      { char ∷ !Char
      -- ^ The code point of the entry.
      , name ∷ !BS.ShortByteString
      -- ^ The name field, stripped of surrounding whitespace.
      }
  | -- | A span of code points sharing a name prefix.
    CharRange
      { start ∷ !Char
      -- ^ First code point of the span, as written in the file.
      , end ∷ !Char
      -- ^ Last code point of the span, as written in the file.
      , prefix ∷ !BS.ShortByteString
      -- ^ The name field without its final byte (the asterisk).
      }
  deriving (Eq, Show)

{- | Parse the contents of a @DerivedName.txt@ file into its entries.

The input is consumed by repeatedly applying 'withParser' to the per-line
parser until it yields no further entry.

>>> parse "0020 ; SPACE"
[SingleChar {char = ' ', name = "SPACE"}]

>>> parse "3400..4DBF ; CJK UNIFIED IDEOGRAPH-*"
[CharRange {start = '\13312', end = '\19903', prefix = "CJK UNIFIED IDEOGRAPH-"}]

>>> parse "18B00..18CD5 ; KHITAN SMALL SCRIPT CHARACTER-*"
[CharRange {start = '\101120', end = '\101589', prefix = "KHITAN SMALL SCRIPT CHARACTER-"}]

>>> parse "18CFF ; KHITAN SMALL SCRIPT CHARACTER-*"
[CharRange {start = '\101631', end = '\101631', prefix = "KHITAN SMALL SCRIPT CHARACTER-"}]

@since 0.3.0
-}
parse ∷ (HasCallStack) ⇒ B.ByteString → [Entry]
parse = L.unfoldr nextEntry
 where
  nextEntry = withParser parseNameLine

{- | Parse one line of @DerivedName.txt@.

Returns 'Nothing' for an empty line or a line whose first byte is @#@.
Any other line is split at its first @;@: the left part is handed to
'parseCodePointRange', the right part (up to an optional @#@) is the name.
-}
parseNameLine ∷ (HasCallStack) ⇒ B.ByteString → Maybe Entry
parseNameLine line
  | isSkipped = Nothing
  | otherwise = Just entry
 where
  isSkipped = B.null line || B.head line == HashTag

  -- Built lazily: nothing below is evaluated until the entry is demanded.
  entry = case parseCodePointRange (B8.strip codePointsField) of
    I.SingleChar cp
      -- A lone code point whose name ends with an asterisk is reported
      -- as a one-element range.
      | hasWildcard →
          CharRange{start = cp, end = cp, prefix = BS.init fullName}
      | otherwise →
          SingleChar{char = cp, name = fullName}
    I.CharRange lo hi →
      CharRange
        { start = lo
        , end = hi
        , -- The trailing asterisk is only asserted, not branched on; the
          -- last byte is dropped either way.
          prefix = assert hasWildcard (BS.dropEnd 1 fullName)
        }

  -- Everything before the first semicolon / the semicolon and what follows.
  (codePointsField, afterCodePoints) = B.break (== SemiColon) line

  -- 'B.tail' skips the semicolon itself; the name stops at the first '#'.
  nameField = B.takeWhile (/= HashTag) (B.tail afterCodePoints)

  fullName = BS.toShort (B8.strip nameField)

  hasWildcard = BS.last fullName == Asterisk