{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TypeApplications #-}

-- Glossary
--
-- CHK - An encryption and encoding scheme for storing immutable data.
--
-- Data - The plaintext used to construct a CHK.
--
-- Share - One complete unit of encrypted and FEC-encoded data.
--
-- Segment - One piece of ciphertext used to construct a CHK.  All segments
--   belonging to a CHK are the same size except the last one, which may be
--   short.
--
-- k, required - The number of "primary" erasure-encoding outputs.  Equal to
--   the minimum number of erasure-encoding outputs needed to reconstruct
--   the erasure-encoding input.
--
-- n, total - The total number of erasure-encoding outputs.  Always greater
--   than or equal to required.
--
-- Block - One output resulting from erasure-encoding one segment using
--   required, total.  If necessary, the input segment is NUL-padded so its
--   size is a multiple of required.
--
-- Plaintext Hash Tree - Not actually implemented by Tahoe, so the details
--   here are uncertain.  Probably something like a SHA-256d merkle tree
--   where the leaves are hashes of the plaintext corresponding to each
--   ciphertext segment.  Since all shares for a CHK are derived from the
--   same plaintext, every share has the same plaintext hash tree.
--
-- Crypttext Hash Tree - A SHA-256d merkle tree where the leaves are hashes
--   of the ciphertext segments.  Since all shares for a CHK are derived
--   from the same ciphertext, every share has the same crypttext hash tree.
--
-- Crypttext Root Hash - The hash at the root of the Crypttext Hash Tree.
--
-- Block Hash Tree - A SHA-256d merkle tree where the leaves are hashes of
--   the blocks.  Since the erasure-encoding output is different for each
--   share, every share has a different block hash tree.
--
-- Share Hash Tree - A SHA-256d merkle tree where the leaves are the root
--   hashes of the block hash trees for all shares.
--
-- Share Hashes - A list of hashes from the Share Hash Tree which are
--   required to verify one block hash tree.  Each share includes the Share
--   Hashes required to verify the Block Hash Tree contained within that
--   share.  Since every share contains a different Block Hash Tree, every
--   share contains a different list of Share Hashes.  Each Share Hash in
--   this list is accompanied by information about its position in the
--   Share Hash Tree, though this may not be strictly required (it could be
--   inferred from position in the list).
--
-- URI Extension - A collection of metadata describing the encryption and
--   encoding used to create the CHK, (largely) necessary for either
--   decoding or verifying the integrity of the contained data.
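-- For example (illustrative numbers only): with required = 3 and total =
-- 10, each segment is erasure-encoded into 10 blocks, any 3 of which
-- suffice to reconstruct the segment, and each of the 10 shares receives
-- one block from every segment.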
module Tahoe.CHK (
    zfec,
    zunfec,
    encode,
    decode,
    padCiphertext,
    segmentCiphertext,
) where

import qualified Codec.FEC as ZFEC
import Crypto.Cipher.AES128 (
    AESKey128,
 )
import Data.Int (Int64)
import Data.Word (Word64)

-- import Debug.Trace

import Crypto.Hash (
    Context,
    HashAlgorithm (hashDigestSize),
    SHA256 (SHA256),
    hashFinalize,
    hashInit,
    hashUpdate,
 )
import Data.Bifunctor (first, second)
import qualified Data.ByteArray as BA
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LB
import Data.Foldable (foldlM)
import Data.List (sort, transpose)
import Data.List.Extra (
    snoc,
 )
import Data.Maybe (fromJust)
import qualified Tahoe.CHK.Capability as Cap
import Tahoe.CHK.Crypto (
    blockHash,
    ciphertextSegmentHash,
    ciphertextTag,
    sha256,
    uriExtensionHash,
 )
import Tahoe.CHK.Merkle (
    MerkleTree,
    buildTreeOutOfAllTheNodes,
    leafHashes,
    leafNumberToNodeNumber,
    makeTreePartial,
    neededHashes,
    rootHash,
 )
import Tahoe.CHK.Share (Share (..))
import Tahoe.CHK.Types (
    BlockHash,
    CrypttextHash,
    Parameters (..),
    Required,
    requiredToInt,
    totalToInt,
 )
import Tahoe.CHK.URIExtension (
    URIExtension (..),
 )
import Tahoe.Netstring (
    netstring,
 )
import Tahoe.Util (
    ceilDiv,
    chunkedBy,
    nextMultipleOf,
    nextPowerOf,
 )

-- | Erasure encode some bytes using ZFEC.
zfec ::
    -- | The number of outputs that will be required to reverse the
    -- encoding.  Also known as `k`.
    Int ->
    -- | The total number of outputs to produce.  Also known as `n`.
    Int ->
    -- | Application data to divide into encoding inputs.
    B.ByteString ->
    -- | `n` encoding outputs.
    IO [B.ByteString]
zfec k n segment =
    pure $ chunks ++ ZFEC.encode (ZFEC.fec k n) chunks
  where
    chunks_ = chunkedBy (B.length segment `div` k) segment
    _msg =
        "zfec"
            <> " k="
            <> show k
            <> " n="
            <> show n
            <> ", segment len "
            <> show (B.length segment)
            <> ", chunk lengths "
            <> show (map B.length chunks_)
            <> ", segment "
            <> show segment
            <> " -> chunks "
            <> show chunks_
    chunks = {- trace _msg -} chunks_

-- | Version of `zfec` that operates on lazy ByteStrings.
zfecLazy :: Int -> Int -> LB.ByteString -> IO [LB.ByteString]
zfecLazy k n segment = (LB.fromStrict <$>) <$> zfec k n (LB.toStrict segment)

-- | Erasure decode some bytes using ZFEC.
zunfec ::
    -- | The `k` parameter used when encoding the data to decode.
    Int ->
    -- | The `n` parameter used when encoding the data to decode.
    Int ->
    -- | The encoding outputs annotated with their position (or "share
    -- number").
    [(Int, B.ByteString)] ->
    -- | The bytes which were originally encoded.
    IO B.ByteString
zunfec k n blocks = pure $ B.concat (ZFEC.decode (ZFEC.fec k n) blocks)

-- | Version of `zunfec` that operates on lazy ByteStrings.
zunfecLazy :: Int -> Int -> [(Int, LB.ByteString)] -> IO LB.ByteString
zunfecLazy k n blocks = do
    segment_ <- LB.fromStrict <$> zunfec k n (second LB.toStrict <$> blocks)
    let _msg =
            "zunfec"
                <> " k="
                <> show k
                <> " n="
                <> show n
                <> " blocks="
                <> show blocks
                <> " -> segment "
                <> show segment_
    -- pure (trace _msg segment_)
    pure segment_
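-- A minimal round-trip sketch for `zfec` / `zunfec` (illustrative only, not
-- part of the module's API): with k = 2 and n = 3, `zfec` returns the two
-- primary chunks followed by one secondary block, and `zunfec` can
-- reconstruct the input from any two outputs, provided each is paired with
-- its position.  The segment length is a multiple of k here, as `encode`
-- guarantees via `padCiphertext`.
_zfecRoundTripExample :: IO Bool
_zfecRoundTripExample = do
    let k = 2
        n = 3
        segment = "abcdef" :: B.ByteString
    outputs <- zfec k n segment
    -- Discard the first output and decode from the remaining two.
    recovered <- zunfec k n (drop 1 (zip [0 :: Int ..] outputs))
    pure (recovered == segment)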
{- | Represent progress encoding some ciphertext into a CHK share.  This
 carries along intermediate hash values used at the end to build extra
 self-authenticating fields into the share.
-}
data EncodingState = CPState
    { -- A single hash of all crypttext segments encoded so far.
      cpCrypttextHash :: Crypto.Hash.Context Crypto.Hash.SHA256
    , -- A list of hashes of each ciphertext segment encoded so far.
      cpCrypttextHashes :: [CrypttextHash]
    , -- Hashes of blocks encoded so far.
      cpBlockHashes :: [[BlockHash]]
    , -- Blocks encoded so far.
      cpBlocks :: [[LB.ByteString]]
    }

-- | The initial state for CHK encoding.
initEncodingState :: EncodingState
initEncodingState =
    CPState
        { cpCrypttextHash = hashUpdate (hashInit :: Context SHA256) (netstring ciphertextTag)
        , cpCrypttextHashes = mempty
        , cpBlockHashes = mempty
        , cpBlocks = mempty
        }

{- | Split a full ciphertext string into the separate ciphertext segments
 required by most of CHK encoding.
-}
segmentCiphertext ::
    -- | The encoding parameters which determine how to split the
    -- ciphertext.
    Parameters ->
    -- | The ciphertext.
    LB.ByteString ->
    -- | The segments.
    [LB.ByteString]
segmentCiphertext Parameters{paramSegmentSize} ciphertext =
    result
  where
    result = {- trace ("segmentCiphertext: " <> show ciphertext) -} result_
    result_ = LB.fromStrict <$> chunkedBy (fromIntegral paramSegmentSize) (LB.toStrict ciphertext)

{- | Process ciphertext into blocks, carrying hashes computed along the way
 as state.
-}
processCiphertext :: Parameters -> [LB.ByteString] -> IO EncodingState
processCiphertext Parameters{paramRequiredShares, paramTotalShares} =
    foldlM processSegment initEncodingState
  where
    processSegment CPState{..} segment = do
        -- Produce the FEC blocks for this piece of ciphertext.
        blocks <-
            zfecLazy
                (requiredToInt paramRequiredShares)
                (totalToInt paramTotalShares)
                (padCiphertext paramRequiredShares segment)
        pure $
            CPState
                { cpCrypttextHash = hashUpdate cpCrypttextHash (LB.toStrict segment)
                , cpCrypttextHashes = snoc cpCrypttextHashes (ciphertextSegmentHash (LB.toStrict segment))
                , cpBlockHashes = snoc cpBlockHashes (blockHash . LB.toStrict <$> blocks)
                , cpBlocks = snoc cpBlocks blocks
                }

-- Compute the correctly padded ciphertext.  The only ciphertext which is
-- expected to require padding is the final segment, in case the original
-- ciphertext did not have a length that was a multiple of the `required`
-- parameter.
--
-- allmydata.immutable.encode.Encoder._gather_data NUL-pads up to num_chunks
-- times input_chunk_size.  num_chunks is our requiredShares.
-- input_chunk_size is taken from codec.get_block_size() which returns
-- codec.share_size.  share_size is div_ceil(data_size, required_shares).
-- data_size is our segmentSize and required_shares is our requiredShares.
padCiphertext :: Required -> LB.ByteString -> LB.ByteString
padCiphertext requiredShares bs
    | paddingLength > 0 = bs <> LB.replicate paddingLength 0x00
    | otherwise = bs
  where
    desiredLength = nextMultipleOf requiredShares (LB.length bs)
    paddingLength = desiredLength - LB.length bs
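-- A small sketch of the padding rule above (assuming, as the uses of
-- `Required` elsewhere in this module suggest, that numeric literals work
-- for it): a 10-byte segment with 3 required shares gains two NUL bytes,
-- bringing it to 12 bytes, the next multiple of 3.
_padCiphertextExample :: Bool
_padCiphertextExample =
    padCiphertext 3 input == input <> LB.replicate 2 0x00
  where
    input = LB.replicate 10 0x61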
{- | Encode some application data (typically ciphertext, but this function
 only weakly assumes this is the case) into some CHK shares.

 This replaces much of allmydata.immutable.encode.
-}
encode ::
    -- | The encryption/decryption key.
    AESKey128 ->
    -- | The ZFEC parameters for this encoding.  This determines how many
    -- shares will come out of this function.
    Parameters ->
    -- | The data to encode.  This is typically ciphertext.
    LB.ByteString ->
    -- | An IO which can be evaluated to get the encoded share data and the
    -- read capability.  The number of Shares will equal the `total` value
    -- from the given Parameters.
    IO ([Share], Cap.Reader)
encode readKey initParams@(Parameters maximumSegmentSize total _ required) ciphertext =
    processCiphertext p (segmentCiphertext p ciphertext) >>= \CPState{..} ->
        let -- The number of segments encoded in the share.  There are the
            -- same number of plaintext and ciphertext segments and this is
            -- also the number of blocks in each share (though each share
            -- may have a different _value_ for each block).
            --
            -- allmydata.immutable.encode.Encoder._got_all_encoding_parameters
            numSegments = length cpBlocks

            -- Our merkle trees need a number of leaves equal to a power of
            -- 2.  Compute that here so we can pad as necessary.
            --
            -- allmydata.immutable.layout.WriteBucketProxy
            effectiveSegments = nextPowerOf 2 numSegments

            -- XXX Unused by Tahoe so we don't even try for a sensible value
            -- right now.  Just fill it with zeros.
            --
            -- As long as we calculate a valid number of nodes for a tree,
            -- buildTreeOutOfAllTheNodes won't give us a Nothing back ...
            -- cross your fingers.
            Just plaintextHashTree =
                buildTreeOutOfAllTheNodes
                    -- We have to fill the *whole* tree with NULs, not just
                    -- the leaves.  Compute the total number of nodes in a
                    -- tree that can hold our number of segments.
                    . replicate (2 * effectiveSegments - 1)
                    -- And make every node all NULs.
                    $ B.replicate (hashDigestSize SHA256) 0

            -- The merkle tree of ciphertext segment hashes.
            crypttextHashTree = makeTreePartial cpCrypttextHashes

            -- shareTree is a MerkleTree of MerkleTrees.
            shareTree =
                -- trace ("shareTree: " <> show shareTree')
                shareTree'
              where
                shareTree' = makeShareTree . map makeTreePartial . transpose $ cpBlockHashes

            -- A bag of additional metadata about the share and encoded
            -- object.
            uriExtension =
                URIExtension
                    { uriExtCodecName = "crs"
                    , uriExtCodecParams = p -- trace ("Params: " <> show p) p
                    , uriExtSize = fromIntegral $ LB.length ciphertext
                    , uriExtSegmentSize = segmentSize
                    , uriExtNeededShares = required
                    , uriExtTotalShares = total
                    , uriExtNumSegments = numSegments
                    , uriExtTailCodecParams = tailParams p (LB.length ciphertext)
                    , uriExtCrypttextHash = makeCrypttextHash cpCrypttextHash
                    , uriExtCrypttextRootHash = makeCrypttextRootHash cpCrypttextHashes
                    , uriExtShareRootHash = rootHash shareTree
                    }

            -- The read capability for the encoded object.
            cap =
                Cap.makeReader
                    readKey
                    (uriExtensionHash uriExtension)
                    required
                    total
                    (fromIntegral $ LB.length ciphertext)

            toShare sharenum blocks blockHashes =
                Share
                    { shareBlockSize = shareBlockSize p
                    , shareDataSize = fromIntegral $ LB.length ciphertext `ceilDiv` fromIntegral required
                    , shareBlocks = blocks
                    , sharePlaintextHashTree = plaintextHashTree
                    , shareCrypttextHashTree = crypttextHashTree
                    , shareBlockHashTree = makeTreePartial blockHashes
                    , shareNeededHashes = sort . fmap (first fromIntegral) $ computeNeededShares shareTree sharenum
                    , shareURIExtension = uriExtension
                    }

            -- The size in bytes of one erasure-encoded block of data.
            --
            -- allmydata.immutable.encode.Encoder._got_all_encoding_parameters +
            -- allmydata.codec.CRSEncoder.set_params
            shareBlockSize :: Parameters -> Word64
            shareBlockSize Parameters{paramSegmentSize, paramRequiredShares} =
                fromIntegral paramSegmentSize `ceilDiv` fromIntegral paramRequiredShares
         in pure
                ( zipWith3 toShare [0 ..] (transpose cpBlocks) (transpose cpBlockHashes)
                , cap
                )
  where
    -- If we have little enough ciphertext, the maximum configured segment
    -- size may be greater than the length of the single segment we produce.
    -- Segment size is also required to be a multiple of the number of
    -- required shares so that segments can be evenly divided across the
    -- shares.
    p@(Parameters segmentSize _ _ required') =
        initParams
            { paramSegmentSize =
                nextMultipleOf required' $
                    min maximumSegmentSize (fromIntegral $ LB.length ciphertext)
            }
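-- A sketch of how `encode` and `decode` fit together (a hypothetical
-- helper, not exported; the `Parameters` values are illustrative, and the
-- third parameter is ignored by `encode`, per its pattern match): encode
-- with 3-of-10 ZFEC, then recover the ciphertext from the first three
-- shares.
_encodeDecodeExample :: AESKey128 -> LB.ByteString -> IO Bool
_encodeDecodeExample key ciphertext = do
    (shares, cap) <- encode key (Parameters (128 * 1024) 10 7 3) ciphertext
    -- Any 3 of the 10 shares suffice; pair each with its share number.
    recovered <- decode cap (take 3 (zip [0 :: Int ..] shares))
    pure (recovered == Just ciphertext)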
{- | Decode some CHK shares to recover some application data.  This is
 roughly the inverse of `encode`.
-}
decode ::
    -- | The read capability for the application data.
    Cap.Reader ->
    -- | At least as many shares as are required to erasure decode the
    -- ciphertext.
    [(Int, Share)] ->
    -- | An action that results in the ciphertext contained by the shares if
    -- it is possible to recover it, or Nothing.
    IO (Maybe LB.ByteString)
decode Cap.Reader{verifier = Cap.Verifier{required, total, size}} shares
    | size > fromIntegral @Int64 @Integer maxBound = pure Nothing
    | length shares < fromIntegral required = pure Nothing
    | otherwise = do
        let -- Enough shares to satisfy the ZFEC decoder.
            enoughShares = take (fromIntegral required) shares

            -- A list of erasure-encoded blocks annotated with positional
            -- information.  The outer list gives a share number along with
            -- all of the blocks held in that share.
            blocks :: [(Int, [LB.ByteString])]
            blocks = second shareBlocks <$> enoughShares

            -- The outer list corresponds to erasure-encoded segments.  The
            -- order corresponds to the order of the segments from the
            -- original input.  Each inner list contains enough blocks to be
            -- erasure-decoded back to a segment.
            explodedBlocks :: [[(Int, LB.ByteString)]]
            explodedBlocks = transpose $ fixBlocks <$> blocks

            -- Figure out how many bytes are expected to be in each segment.
            -- Depending on the ZFEC encoding parameters, it is possible
            -- that we will end up with blocks that are not completely
            -- "filled" with real data.  When these are decoded, we will get
            -- _extra_ bytes in the result.  By knowing how many bytes were
            -- originally in our segments, we can recognize and discard
            -- these extra bytes.
            segSize = fromIntegral . paramSegmentSize . uriExtCodecParams . shareURIExtension . snd . head $ enoughShares

            -- A helper that knows the correct parameters to do ZFEC
            -- decoding for us.
            zunfec' = (LB.take segSize <$>) . zunfecLazy (fromIntegral required) (fromIntegral total)

        -- Decode every group of blocks back to the original segments.
        segments <- mapM zunfec' explodedBlocks

        -- Combine the segments and perform one more truncation to get the
        -- complete result.  Above, where we computed segSize, we were not
        -- careful to find the tail segment size for use with the tail
        -- segment, so there might still be some extra bytes in the
        -- `segments` list here.  This additional truncation addresses that.
        pure $ Just . LB.take (fromIntegral size) . LB.concat $ segments
  where
    -- Project the share number out across all of that share's blocks.  The
    -- result is something we can transpose into the correct form for ZFEC
    -- decoding.
    fixBlocks :: (Int, [LB.ByteString]) -> [(Int, LB.ByteString)]
    fixBlocks (sharenum, bs) = zip (repeat sharenum) bs
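-- An illustration (with hypothetical two-share, two-segment values) of the
-- reshaping `decode` performs before ZFEC decoding: tagging each share's
-- blocks with the share number and transposing yields one group of tagged
-- blocks per segment.
_explodedBlocksExample :: Bool
_explodedBlocksExample =
    transpose (fixBlocks' <$> [(0, ["a0", "a1"]), (1, ["b0", "b1"])])
        == [[(0, "a0"), (1, "b0")], [(0, "a1"), (1, "b1")]]
  where
    -- The same projection `decode` uses internally.
    fixBlocks' :: (Int, [LB.ByteString]) -> [(Int, LB.ByteString)]
    fixBlocks' (sharenum, bs) = zip (repeat sharenum) bs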
makeShareTree :: [MerkleTree] -> MerkleTree
makeShareTree = makeTreePartial . map rootHash

makeCrypttextHash :: Context SHA256 -> CrypttextHash
makeCrypttextHash = sha256 . toBytes . hashFinalize
  where
    toBytes = B.pack . BA.unpack

makeCrypttextRootHash :: [CrypttextHash] -> CrypttextHash
makeCrypttextRootHash = rootHash . makeTreePartial

-- Construct the encoding parameters for the final segment, which may be
-- smaller than the earlier segments (if the size of the data to be encoded
-- is not a multiple of the segment size).
--
-- allmydata.immutable.encode.Encoder._got_all_encoding_parameters
tailParams :: Integral a => Parameters -> a -> Parameters
tailParams p@Parameters{paramSegmentSize, paramRequiredShares} dataSize =
    p{paramSegmentSize = nextMultipleOf paramRequiredShares tailSize'}
  where
    tailSize' = if tailSize == 0 then paramSegmentSize else tailSize
    tailSize = fromIntegral dataSize `mod` paramSegmentSize

{- | Determine the node numbers of the share tree which are required to
 verify the indicated share number.  The indicated share number is included
 in the result, as are the corresponding hashes from the given tree.
-}
computeNeededShares :: MerkleTree -> Int -> [(Int, B.ByteString)]
computeNeededShares shareTree sharenum =
    -- In addition to what neededHashes computes, we also need to include
    -- this share's own block hash root in the result.  Shove it on the
    -- front of the result here.  This places it out of order; it is fixed
    -- up (with `sort`) where the Share is constructed in `encode`.  We also
    -- have to translate between zero-indexed share numbers and 1-indexed
    -- leaf numbers.
    --
    -- Is fromJust here safe?  neededHashes returns Nothing when it fails to
    -- compute a merkle proof.  Given the way we're using it, that can
    -- probably only happen if there's a bug inside neededHashes (as opposed
    -- to our passing in some value it doesn't want to provide a result
    -- for).
    (leafNumberToNodeNumber shareTree sharenum - 1, blockHashRoot shareTree sharenum) :
    fromJust (neededHashes shareTree sharenum)

-- | Find the nth leaf hash in the given tree.
blockHashRoot :: MerkleTree -> Int -> B.ByteString
blockHashRoot tree n
    | n < 0 = error "Cannot have a negative leaf number"
    | n >= length leafs = error "Leaf number goes past the end of the tree"
    | otherwise = leafs !! n
  where
    leafs = leafHashes tree
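-- A worked example of `tailParams` (illustrative numbers, assuming numeric
-- literals work for the parameter types as their other uses here suggest):
-- with a 131072-byte segment size and 3 required shares, a 300000-byte
-- input leaves a tail of 300000 `mod` 131072 = 37856 bytes, which is
-- rounded up to 37857, the next multiple of 3.
_tailParamsExample :: Bool
_tailParamsExample =
    paramSegmentSize (tailParams (Parameters 131072 10 7 3) (300000 :: Int)) == 37857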