{- |
Copyright   : (c) 2024 Pierre Le Marre
Maintainer  : dev@wismill.eu
Stability   : experimental

Parser for [Blocks.txt](https://www.unicode.org/reports/tr44/#Blocks.txt)

@since 0.3.0
-}
module Unicode.CharacterDatabase.Parser.Blocks (parse, Entry (..)) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.ByteString.Short qualified as BS
import Data.List qualified as L
import GHC.Stack (HasCallStack)

import Unicode.CharacterDatabase.Parser.Internal (
  parseCodePointRange',
  withParser,
  pattern HashTag,
  pattern SemiColon,
 )

{- | An entry from @Blocks.txt@ file

>>> parse "0000..007F; Basic Latin"
[Entry {start = '\NUL', end = '\DEL', name = "Basic Latin"}]

>>> parse "100000..10FFFF; Supplementary Private Use Area-B"
[Entry {start = '\1048576', end = '\1114111', name = "Supplementary Private Use Area-B"}]

@since 0.3.0
-}
data Entry = Entry
  { start ∷ !Char
  -- ^ First element of the pair returned by 'parseCodePointRange''
  , end ∷ !Char
  -- ^ Second element of the pair returned by 'parseCodePointRange''
  , name ∷ !BS.ShortByteString
  -- ^ Block name, with surrounding whitespace stripped
  }
  deriving (Eq, Show)

{- | A parser for @Blocks.txt@ file

The input is unfolded into a list of 'Entry' by repeatedly applying
@withParser parseBlockLine@. How the input is cut into lines is decided by
@withParser@, which is defined in another module.

Raises an 'error' if a data line has no field separator after its range.

@since 0.3.0
-}
parse ∷ (HasCallStack) ⇒ B.ByteString → [Entry]
parse = L.unfoldr (withParser parseBlockLine)

{- | Parse a single line of @Blocks.txt@.

Returns 'Nothing' for an empty line or a line whose first byte is 'HashTag'.
Otherwise the line is split at the first 'SemiColon':

* the part before it is stripped and handed to 'parseCodePointRange'';
* the part after it, up to the next 'HashTag' (if any), is stripped and
  stored as the block name.

Raises an 'error' mentioning the offending line when the separator is
missing. The fields stay lazy bindings, so the error surfaces when the
resulting 'Entry' is forced, not when the 'Maybe' is inspected.
-}
parseBlockLine ∷ (HasCallStack) ⇒ B.ByteString → Maybe Entry
parseBlockLine line
  | B.null line || B.head line == HashTag = Nothing
  | otherwise = Just Entry{..}
 where
  (rawRange, line1) = B.span (/= SemiColon) line
  -- @line1@ is empty exactly when no separator was found. Match on
  -- 'B.uncons' rather than calling the partial 'B.tail', so that a malformed
  -- line is reported together with its content.
  afterSeparator = case B.uncons line1 of
    Just (_, rest) → rest
    Nothing →
      error
        ( "Unicode.CharacterDatabase.Parser.Blocks.parse: "
            <> "missing field separator in line: "
            <> show line
        )
  rawName = B.takeWhile (/= HashTag) afterSeparator
  (start, end) = parseCodePointRange' (B8.strip rawRange)
  name = BS.toShort (B8.strip rawName)