module PDF.Cmap
( parseCMap
) where
import Data.Char (chr)
import Numeric (readOct, readHex)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Text as T
import Data.Text.Encoding (encodeUtf8)
import Text.Parsec hiding (many, (<|>))
import Control.Applicative
import Text.Parsec.ByteString.Lazy
import Codec.Compression.Zlib (decompress)
import Debug.Trace
import PDF.Definition
import PDF.Character (pdfchardict)
-- | Parse a CMap stream and collect the character-code mappings of all of
-- its @bfchar@ sections.
--
-- The input is consumed by running 'cmapParser' repeatedly until the
-- @endcmap@ keyword is reached; the per-section results are concatenated
-- in the order they appear.
--
-- Calls 'error' when the input cannot be parsed.  The message keeps the
-- historical @"Can not parse CMap"@ prefix and appends the Parsec error
-- (position and expectations) so that failures can actually be diagnosed.
parseCMap :: BSL.ByteString -> CMap
parseCMap str =
  case runParser sections () "" str of
    Left err -> error ("Can not parse CMap: " ++ show err)
    Right cmap -> cmap
  where
    -- One list per bfchar section, flattened into a single mapping.
    sections = concat <$> manyTill cmapParser (try $ string "endcmap")
-- | Parse a single @beginbfchar ... endbfchar@ section.
--
-- Everything in front of the next @beginbfchar@ keyword is skipped.  The
-- section must contain at least one @\<src\> \<dst\>@ pair.  Each pair
-- becomes one entry: the source code read as a hexadecimal number, and the
-- destination decoded as a sequence of 16-bit UTF-16BE code units (so a
-- destination such as @\<00660069\>@ yields two characters, and a surrogate
-- pair yields one supplementary-plane character).
--
-- An empty source operand (@\<\>@) is reported as a parse failure instead
-- of crashing later when the entry is forced.
cmapParser :: Parser CMap
cmapParser = do
  spaces
  _ <- manyTill anyChar (try $ string "beginbfchar")
  spaces
  ms <- many1 bfcharEntry
  spaces
  _ <- string "endbfchar"
  spaces
  return ms
  where
    -- One "<src> <dst>" line of the section.
    bfcharEntry = do
      cid <- hexletters
      ucs <- hexletters
      case readHex cid of
        [(code, "")] -> return (code, decodeUtf16 (codeUnits ucs))
        _ -> parserFail "bfchar entry with an empty or malformed source code"

    -- Split the hex digits into groups of four and read each group as one
    -- 16-bit code unit.  'hexletters' only yields whole groups of four hex
    -- digits; a group that does not read cleanly is dropped rather than
    -- crashing.
    codeUnits :: String -> [Int]
    codeUnits [] = []
    codeUnits digits =
      let (unit, rest) = splitAt 4 digits
       in [n | (n, "") <- readHex unit] ++ codeUnits rest

    -- Turn UTF-16 code units into characters, joining a high surrogate
    -- followed by a low surrogate into a single code point.  Unpaired
    -- surrogates are passed through unchanged.
    decodeUtf16 :: [Int] -> String
    decodeUtf16 (hi : lo : rest)
      | isHighSurrogate hi && isLowSurrogate lo =
          chr (0x10000 + (hi - 0xD800) * 0x400 + (lo - 0xDC00))
            : decodeUtf16 rest
    decodeUtf16 (unit : rest) = chr unit : decodeUtf16 rest
    decodeUtf16 [] = []

    isHighSurrogate u = u >= 0xD800 && u <= 0xDBFF
    isLowSurrogate u = u >= 0xDC00 && u <= 0xDFFF
-- | Parse an angle-bracketed hex string such as @\<0041\>@ and return the
-- digits between the brackets as one 'String'.
--
-- The content is read as zero or more 'hexletter' groups up to the closing
-- @>@; whitespace following the closing bracket is consumed as well.
hexletters :: Parser String
hexletters =
  concat <$> (char '<' *> manyTill hexletter (try $ char '>')) <* spaces
-- | Parse exactly four hexadecimal digits (either letter case) and return
-- them as a 'String'.
hexletter :: Parser String
hexletter = count 4 hexChar
  where
    -- Same character set, in the same order, as the literal
    -- "0123456789ABCDEFabcdef".
    hexChar = oneOf (['0' .. '9'] ++ ['A' .. 'F'] ++ ['a' .. 'f'])