{-# LANGUAGE OverloadedStrings #-} module PDF.ContentStream ( deflate , decompressStream ) where import Data.Char (chr) import Numeric (readOct, readHex) import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as BSL import qualified Data.Text as T import Data.Text.Encoding (encodeUtf8) import Text.Parsec hiding (many, (<|>)) import Control.Applicative import Text.Parsec.ByteString.Lazy import Codec.Compression.Zlib (decompress) import Debug.Trace import PDF.Definition import PDF.Character (pdfchardict) type PSParser a = GenParser Char PSR a parseContentStream p st = runParser p st "" parseDeflated :: PSR -> BSL.ByteString -> PDFStream parseDeflated psr pdfstream = case parseContentStream (T.concat <$> many (elems <|> skipOther)) psr pdfstream of Left err -> error "Nothing to be parsed" Right str -> BSL.pack $ BS.unpack $ encodeUtf8 str deflate :: PSR -> PDFStream -> PDFStream deflate = parseDeflated decompressStream :: PDFBS -> PDFStream decompressStream (n,pdfobject) = case parse (BSL.pack <$> (manyTill anyChar (try $ (string "stream" >> oneOf "\n\r")) >> spaces *> manyTill anyChar (try $ string "endstream"))) "" pdfobject of Left err -> "err" Right bs -> decompress bs elems :: PSParser T.Text elems = choice [ try pdfopBT , try pdfopTf , try pdfopTD , try pdfopTd , try pdfopTm , try pdfopTc , try pdfopTw , try pdfopTJ , try pdfopTj , try pdfopTast , try letters <* spaces , try hexletters <* spaces , try array <* spaces , try pdfopGraphics , try xObject , try graphicState , unknowns ] pdfopGraphics :: PSParser T.Text pdfopGraphics = do spaces choice [ try $ T.empty <$ oneOf "qQ" <* spaces , try $ T.empty <$ oneOf "fFbBW" <* (many $ string "*") <* spaces , try $ T.empty <$ oneOf "nsS" <* spaces , try $ T.empty <$ (digitParam <* spaces) <* oneOf "gG" <* spaces , try $ T.empty <$ (digitParam <* spaces) <* oneOf "jJ" <* spaces , try $ T.empty <$ (digitParam <* spaces) <* oneOf "dwi" <* spaces , try $ T.empty <$ (many1 (digitParam <* spaces) <* oneOf "ml" <* spaces) , try $ T.empty <$ (many1 (digitParam <* spaces) <* string "re" <* spaces) , try $ T.empty <$ (many1 (digitParam <* spaces) <* string "rg" <* spaces) , try $ T.empty <$ (many1 (digitParam <* spaces) <* string "RG" <* spaces) , try $ T.empty <$ (many1 (digitParam <* spaces) <* string "cm" <* spaces) , try $ T.empty <$ (many1 (digitParam <* spaces) <* string "c" <* spaces) ] return T.empty graphicState :: PSParser T.Text graphicState = do gs <- (++) <$> string "/" <*> manyTill anyChar (try space) spaces string "gs" spaces return T.empty xObject :: PSParser T.Text xObject = do file <- (++) <$> string "/" <*> manyTill anyChar (try space) spaces string "Do" spaces return T.empty pdfopBT :: PSParser T.Text pdfopBT = do string "BT" spaces t <- manyTill elems (try $ string "ET") spaces return $ T.concat t pdfopTj :: PSParser T.Text pdfopTj = do spaces t <- manyTill (letters <|> hexletters <|> array) (try $ string "Tj") spaces return $ T.concat t pdfopTJ :: PSParser T.Text pdfopTJ = do spaces t <- manyTill (letters <|> hexletters <|> array) (try $ string "TJ") spaces return $ T.concat t unknowns :: PSParser T.Text unknowns = do ps <- manyTill anyChar (try $ oneOf "\r\n") return "" return $ T.pack $ "[[[UNNOKWN STREAM:" ++ take 100 (show ps) ++ "]]]" skipOther :: PSParser T.Text skipOther = do a <- manyTill anyChar (try $ oneOf "\r\n") return $ "" array :: PSParser T.Text array = do char '[' str <- (manyTill (letters <|> hexletters <|> kern) (try $ char ']')) return $ T.concat str letters :: PSParser T.Text letters = do char '(' lets <- manyTill psletter (try $ char ')') spaces return $ T.concat lets hexletters :: PSParser T.Text hexletters = do char '<' lets <- manyTill hexletter (try $ char '>') spaces return $ T.concat lets adobeOneSix :: Int -> T.Text adobeOneSix a = T.pack (show a) toUcs :: CMap -> Int -> T.Text toUcs map h = case lookup h map of Just ucs -> T.pack ucs Nothing -> adobeOneSix h hexletter :: PSParser T.Text hexletter = do st <- getState let cmap = case lookup (curfont st) (cmaps st) of Just m -> m Nothing -> [] (hexToString cmap . readHex) <$> (count 4 $ oneOf "0123456789ABCDEFabcdef") where hexToString map [] = "????" hexToString map [(h,_)] = toUcs map h psletter :: PSParser T.Text psletter = do st <- getState let fontmap = case lookup (curfont st) (fontmaps st) of Just m -> m Nothing -> [] c <- try (char '\\' >> oneOf "\\()") <|> try (octToString . readOct <$> (char '\\' >> (count 3 $ oneOf "01234567"))) <|> noneOf "\\" return $ replaceWithDiff fontmap c where replaceWithDiff m c' = case lookup c' m of Just s -> replaceWithCharDict s Nothing -> T.pack [c'] replaceWithCharDict s = case lookup s pdfchardict of Just cs -> cs Nothing -> T.pack s octToString [] = '?' octToString [(o,_)] = chr o kern :: PSParser T.Text kern = do t <- digitParam return $ if t < -60.0 then " " else "" pdfopTf :: PSParser T.Text pdfopTf = do font <- (++) <$> string "/" <*> manyTill anyChar (try space) spaces t <- digitParam spaces string "Tf" spaces st <- getState let ff = fontfactor st updateState (\s -> s{ curfont = font , fontfactor = ff*t}) return "" pdfopTD :: PSParser T.Text pdfopTD = do t1 <- digitParam spaces t2 <- digitParam spaces string "TD" spaces st <- getState let ax = absolutex st ay = absolutey st lx = linex st ly = liney st lm = leftmargin st ff = fontfactor st needBreak = abs t2 > 0 && abs (ly - t2) > 0 updateState (\s -> s { absolutex = ax + t1 , absolutey = ay + (if needBreak then 2*t2 else 0) , linex = if abs t1 > 0 then t1 else lx , liney = if abs t2 > 0 then 2*t2 else ly }) return $ if needBreak then T.concat ["\n", (desideParagraphBreak t1 t2 lx ly lm ff)] else "" pdfopTd :: PSParser T.Text pdfopTd = do t1 <- digitParam spaces t2 <- digitParam spaces string "Td" spaces st <- getState let ax = absolutex st ay = absolutey st lx = linex st ly = liney st lm = leftmargin st ff = fontfactor st needBreak = abs t2 > 0 && abs (ly - t2) > 0 updateState (\s -> s { absolutex = ax + t1 , absolutey = ay + (if needBreak then t2 else 0) , linex = if abs t1 > 0 then t1 else lx , liney = if abs t2 > 0 then t2 else ly }) return $ if needBreak then T.concat ["\n", (desideParagraphBreak t1 t2 lx ly lm ff)] else "" pdfopTw :: PSParser T.Text pdfopTw = do tw <- digitParam spaces string "Tw" spaces st <- getState let ff = fontfactor st updateState (\s -> s { fontfactor = ff }) return $ "" pdfopTc :: PSParser T.Text pdfopTc = do tc <- digitParam spaces string "Tc" spaces st <- getState let ff = fontfactor st updateState (\s -> s { fontfactor = ff }) return $ "" desideParagraphBreak :: Double -> Double -> Double -> Double -> Double -> Double -> T.Text desideParagraphBreak t1 t2 lx ly lm ff = T.pack $ (if abs ly > abs ff || abs t2 > abs ff || (t1 - lm) > 0.5 then "\n" else "") pdfopTm :: PSParser T.Text pdfopTm = do a <- digitParam spaces b <- digitParam spaces c <- digitParam spaces d <- digitParam spaces e <- digitParam spaces f <- digitParam spaces string "Tm" spaces st <- getState let ax = absolutex st ay = absolutey st lx = linex st ly = liney st lm = leftmargin st ff = fontfactor st needBreak = abs d*f > 0 && ly >= 0 updateState (\s -> s { linex = lx , liney = ly , absolutex = a*e , absolutey = d*f , fontfactor = a*e*d*f*ff }) return $ if needBreak then T.concat ["\n", desideParagraphBreak (d*f) (d*f) lx ly lm ff] else if a*e > lx then " " else "" pdfopTast :: PSParser T.Text pdfopTast = do string "T*" st <- getState let ax = absolutex st ay = absolutey st lx = linex st ly = liney st updateState (\s -> s { linex = lx , liney = ly , absolutex = ax , absolutey = ay }) return "\n" digitParam :: PSParser Double digitParam = do sign <- many $ char '-' num <- ((++) <$> (("0"++) <$> (string ".")) <*> many1 digit) <|> ((++) <$> (many1 digit) <*> ((++) <$> (many $ char '.') <*> many digit)) return $ read $ sign ++ num hexParam :: Parser T.Text hexParam = do char '<' lets <- manyTill (oneOf "0123456789abcdefABCDEF") (try $ char '>') return $ T.pack lets