module Smap.Flags
( Hdl(Std, File)
, Descriptor(Separate, UnKeyed, Interleaved)
, FileType(Set, Stream)
, Command(Union, Subtract, Intersect)
, Accuracy(Approximate, Exact)
, command
)
where
import qualified Data.Attoparsec.Text as A
import qualified Data.Text as Text
import qualified Options.Applicative as O
import Control.Applicative (many, (<|>))
import Data.ByteArray.Hash (SipKey(..))
data Hdl = Std
| File FilePath
data FileType = Set
| Stream
data Descriptor (ty :: FileType)
= Separate Hdl Hdl
| Interleaved Hdl
| UnKeyed Hdl
data Accuracy = Approximate SipKey
| Exact
data Command = Union Accuracy [Descriptor 'Set] (Descriptor 'Set)
| Subtract Accuracy (Descriptor 'Stream) [Descriptor 'Set] (Descriptor 'Stream)
| Intersect Accuracy (Descriptor 'Stream) [Descriptor 'Set] (Descriptor 'Stream)
hdl :: A.Parser Hdl
hdl = stdin <|> path
where
stdin = Std <$ A.char '-'
path = File . Text.unpack <$> A.takeWhile (/= ',')
descriptor :: A.Parser (Descriptor ty)
descriptor =
(keyed <|> interleaved <|> unkeyed)
<* (A.endOfInput A.<?> "more filepath characters than expected")
A.<?> "Could not parse filepath"
where
unkeyed = UnKeyed <$> hdl
keyed = do
_ <- A.char '+'
k <- hdl
_ <- A.char ','
v <- hdl
return (Separate k v)
interleaved = Interleaved <$> (A.char '@' *> hdl)
aToO :: A.Parser a -> O.ReadM a
aToO p = O.eitherReader (A.parseOnly p . Text.pack)
argDescription :: String
argDescription =
"Use '-' for stdin/out. Use +keyfile,valfile for separate keys and values. Use @file for keys and values on alternating lines."
stream :: O.Parser (Descriptor 'Stream)
stream = O.argument
(aToO descriptor)
(O.metavar "STREAM" <> O.help ("Stream source file. " ++ argDescription))
output :: O.Parser (Descriptor a)
output =
O.option
(aToO descriptor)
( O.metavar "OUTPUT"
<> O.help ("Output file(s). Defaults to stdout. " ++ argDescription)
<> O.short 'o'
<> O.long "output"
)
<|> pure (UnKeyed Std)
sets :: O.Parser [Descriptor 'Set]
sets = many $ O.argument
(aToO descriptor)
(O.metavar "SET*" <> O.help ("Set/map source files (0 or more). " ++ argDescription))
accuracy :: O.Parser Accuracy
accuracy = approx <|> approxWithKey <|> exact
where
approx = O.flag'
(Approximate (SipKey 0 0))
( O.short 'a'
<> O.long "approximate"
<> O.help
"For deduplication, store a 64-bit siphash rather than the whole line. Can save memory, depending on the set operation"
)
approxWithKey = O.option
(fmap
(\i -> Approximate $ SipKey (fromIntegral $ i `div` (2 ^ (64 :: Integer))) (fromIntegral i))
(O.auto :: O.ReadM Integer)
)
(O.short 'k' <> O.long "approx-with-key" <> O.metavar "KEY" <> O.help
"like --approximate, but takes a 128-bit SipHash key. Example: -k 0x1234..."
)
exact = pure Exact
catCommand :: O.Mod O.CommandFields Command
catCommand = O.command "cat" $ O.info
(value O.<**> O.helper)
(O.fullDesc <> O.progDesc "Set union. Output everything from the input sets, deduplicated.")
where value = Union <$> accuracy <*> sets <*> output
subCommand :: O.Mod O.CommandFields Command
subCommand = O.command "sub" $ O.info
(value O.<**> O.helper)
(O.fullDesc <> O.progDesc
(concat
[ "Set subtraction. "
, "Given stream A and sets B,C,..., output values in A but not in any of B,C,... "
, "Does not deduplicate values from A."
]
)
)
where value = Subtract <$> accuracy <*> stream <*> sets <*> output
intersectCommand :: O.Mod O.CommandFields Command
intersectCommand = O.command "int" $ O.info
(value O.<**> O.helper)
(O.fullDesc <> O.progDesc
(concat
[ "Set intersection. "
, "Given stream A and sets B,C,..., output values in A "
, "only if they are in all of of B,C,... "
, "Does not deduplicate values from A."
]
)
)
where value = Intersect <$> accuracy <*> stream <*> sets <*> output
command :: IO Command
command = O.execParser
(O.info
((O.subparser (catCommand <> subCommand <> intersectCommand)) O.<**> O.helper)
(O.fullDesc <> O.progDesc "Use -h for help. Available commands: cat, sub, int.")
)