{- | Copyright : (c) 2024 Pierre Le Marre Maintainer: dev@wismill.eu Stability : experimental Parser for properties files with a /single/ property, such as: * [Scripts.txt](https://www.unicode.org/reports/tr44/#Scripts.txt) * [ScriptExtensions.txt](https://www.unicode.org/reports/tr44/#ScriptExtensions.txt) * [extracted/DerivedCombiningClass.txt](https://www.unicode.org/reports/tr44/#DerivedCombiningClass.txt) @since 0.1.0 -} module Unicode.CharacterDatabase.Parser.Properties.Single ( parse, Entry (..), parseMultipleValues, EntryMultipleValues (..), ) where import Data.ByteString qualified as B import Data.ByteString.Char8 qualified as B8 import Data.ByteString.Short qualified as BS import Data.List qualified as L import Data.List.NonEmpty qualified as NE import GHC.Stack (HasCallStack) import Unicode.CharacterDatabase.Parser.Internal ( CodePointRange, parseCodePointRange, withParser, pattern HashTag, pattern SemiColon, ) {- | An entry from a properties file with one value per entry @since 0.1.0 -} data Entry = Entry { range ∷ !CodePointRange , value ∷ !BS.ShortByteString } deriving (Eq, Show) {- | A parser for properties files with one value per entry >>> parse "102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK" [Entry {range = SingleChar {start = '\66272'}, value = "Arab Copt"}] >>> parse "1CF7 ; Beng # Mc VEDIC SIGN ATIKRAMA" [Entry {range = SingleChar {start = '\7415'}, value = "Beng"}] >>> parse "1CDE..1CDF ; Deva # Mn [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW" [Entry {range = CharRange {start = '\7390', end = '\7391'}, value = "Deva"}] >>> parse "1CD0 ; Beng Deva Gran Knda # Mn VEDIC TONE KARSHANA" [Entry {range = SingleChar {start = '\7376'}, value = "Beng Deva Gran Knda"}] @since 0.1.0 -} parse ∷ (HasCallStack) ⇒ B.ByteString → [Entry] parse = L.unfoldr (withParser parsePropertyLine) parsePropertyLine ∷ (HasCallStack) ⇒ B.ByteString → Maybe Entry parsePropertyLine line | B.null line || B.head line == HashTag = Nothing | otherwise = Just Entry{..} where (rawRange, line1) = B.span (/= SemiColon) line rawValue = B.takeWhile (/= HashTag) (B.tail line1) range = parseCodePointRange (B8.strip rawRange) value = BS.toShort (B8.strip rawValue) {- | An entry from a properties file with multiple values per entry @since 0.1.0 -} data EntryMultipleValues = EntryMultipleValues { range ∷ !CodePointRange , values ∷ !(NE.NonEmpty BS.ShortByteString) } deriving (Eq, Show) {- | A parser for properties files with multiple values per entry @since 0.1.0 -} parseMultipleValues ∷ (HasCallStack) ⇒ B.ByteString → [EntryMultipleValues] parseMultipleValues = L.unfoldr (withParser parsePropertyLine') parsePropertyLine' ∷ (HasCallStack) ⇒ B.ByteString → Maybe EntryMultipleValues parsePropertyLine' line | B.null line || B.head line == HashTag = Nothing | otherwise = Just EntryMultipleValues{..} where (rawRange, line1) = B.span (/= SemiColon) line rawValues = B.takeWhile (/= HashTag) (B.tail line1) range = parseCodePointRange (B8.strip rawRange) values = NE.fromList . fmap (BS.toShort . B8.strip) . B8.split ';' $ rawValues