{-# LANGUAGE OverloadedStrings #-} module PDF.Cmap ( parseCMap ) where import Data.Char (chr) import Numeric (readOct, readHex) import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as BSL import qualified Data.Text as T import Data.Text.Encoding (encodeUtf8) import Text.Parsec hiding (many, (<|>)) import Control.Applicative import Text.Parsec.ByteString.Lazy import Codec.Compression.Zlib (decompress) import Debug.Trace import PDF.Definition parseCMap :: BSL.ByteString -> CMap parseCMap str = case runParser (concat <$> manyTill (try bfchar <|> (concat <$> bfrange)) (try $ string "endcmap")) () "" str of Left err -> error "Can not parse CMap" Right cmap -> cmap bfchar :: Parser CMap bfchar = do spaces manyTill anyChar (try $ string "beginbfchar") spaces ms <- many1 (toCmap <$> hexletters <*> hexletters) spaces string "endbfchar" spaces return ms where toCmap cid ucs = ((fst.head.readHex) cid, ((:[]).chr.fst.head.readHex) ucs) bfrange :: Parser [CMap] bfrange = do spaces manyTill anyChar (try $ string "beginbfrange") spaces ms <- many1 (toCmap <$> (getRange <$> hexletters <*> hexletters) <*> hexletters) spaces string "endbfrange" spaces return ms where getRange cid cid' = [gethex cid .. gethex cid'] toCmap range ucs = zip range (map ((:[]).chr) [gethex ucs ..]) gethex = fst.head.readHex hexletters :: Parser String hexletters = do char '<' lets <- manyTill hexletter (try $ char '>') spaces return $ concat lets hexletter :: Parser String hexletter = (count 4 $ oneOf "0123456789ABCDEFabcdef")