#!/usr/bin/env runhaskell {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleContexts #-} import System.Environment(getArgs) import System.Console.GetOpt import System.Exit import System.IO(stderr, hPutStrLn) import System.Directory(doesFileExist) import Data.Attoparsec.ByteString.Char8 import qualified Data.ByteString.Char8 as B import Data.Word import Data.String import Text.Regex.TDFA((=~)) import Data.List(isSuffixOf, isInfixOf, findIndex, findIndices, elem, nub, maximumBy, groupBy, intersect, transpose) import Data.List.Split(splitOn, splitWhen) import Data.Char(isLetter, isNumber, isSymbol, isSeparator, isPunctuation) import Data.Map(fromListWith, toList) import Data.Function(on) import Control.Monad(when, unless) import Data.Either(rights) import Text.Read(readMaybe) -- | Chapter 1. Functions for parsing the rule data Rule = Rule [Char] [Char] deriving Show parseRule :: Parser Rule parseRule = do left <- many' anyChar string "->" right <- many' anyChar return $ Rule left right -- | Guess the delimiter for several known formats knownFormats = [".csv", ".tsv", ".ssv", ".list"] decideF :: FilePath -> Maybe Char decideF flnm | ".csv" `isSuffixOf` flnm = Just ',' | ".tsv" `isSuffixOf` flnm = Just '\t' | ".ssv" `isSuffixOf` flnm = Just ' ' | ".list" `isSuffixOf` flnm = Just ' ' | ".sam" `isSuffixOf` flnm = Just '\t' | ".vcf" `isSuffixOf` flnm = Just '\t' | ".bed" `isSuffixOf` flnm = Just '\t' | ".gff" `isSuffixOf` flnm = Just '\t' | ".gtf" `isSuffixOf` flnm = Just '\t' | ".gff3" `isSuffixOf` flnm = Just '\t' | otherwise = Nothing guessF :: String -> Char guessF fstLine = mostFreqSymbol where mostFreqSymbol = fst $ maximumBy (compare `on` snd) symbolCount symbolCount = filter (\(s,n) -> isSymbol s || isSeparator s || isPunctuation s) freqMap freqMap = toList $ fromListWith (+) [(c, 1) | c <- fstLine] tuplify2 :: [t] -> (t, t) tuplify2 (a:b:_) = (a,b) isEmpty :: Char -> Bool isEmpty x = (x == ' ') || (x == '\t') lstrip :: [Char] -> [Char] lstrip [] = [] lstrip w@(x:xs) = if (isEmpty x) then (lstrip xs) else w rstrip :: [Char] -> [Char] rstrip [] = [] rstrip xs = if (isEmpty $ last xs) then rstrip (init xs) else xs strip :: [Char] -> [Char] strip = lstrip . rstrip isFieldVar :: [Char] -> Bool isFieldVar = all (\x -> isLetter x || isNumber x) isFieldChar :: Char -> Bool isFieldChar x = isLetter x || isNumber x interDivise x = splitWhen (not . isFieldChar) x fi :: Eq a => [a] -> a -> [Int] fi ar el = findIndices (==el) ar findFstInSnd :: Eq a => [a] -> [a] -> [[Int]] findFstInSnd a b = map (fi b) a sole :: Eq a => [a] -> [a] -> Bool sole x xs = length (splitOn x xs) == 2 sole' :: Eq a => [a] -> [a] -> Bool sole' x xs = length (splitOn x xs) <= 2 splitMargins :: Eq a => a -> [a] -> (Int, Int) splitMargins x xs = (pos', length xs - pos' - 1) where Just pos' | pos == Nothing = Just 0 | otherwise = pos pos = findIndex (==x) xs -- | Check the rule correctness -- 1. Every variable on the right must be present on the left checkRule :: (Eq a, IsString [a]) => [a] -> [a] -> Bool checkRule list1 list2 | ell `isInfixOf` list1 = (all (flip elem list1') list2') && (sole ell list1) | otherwise = all (flip elem list1') list2' where ell = "..." list1' = splitOn " " list1 list2' = splitOn " " list2 -- | Chapter 2. Functions for parsing the records recordWithSep sep field = sepBy field sep anyField sep = takeTill (==sep) parseLine line sep = feed (parse (recordWithSep (char sep) (anyField sep)) $ B.pack line) B.empty parseLine' line sep = y where Done x y = case sep of Nothing -> parseLine line (guessF line) Just sep' -> parseLine line sep' packEither oldDelim newDelim line = case newDelim of Nothing -> tryOld oldDelim line Just x -> B.pack [x] tryOld oldDelim line = case oldDelim of Nothing -> B.pack [guessF line] Just y -> B.pack [y] maybeEnumerate enumBool oldDelim newDelim | enumBool = zipWith (++) (map (\x -> show x ++ [delim]) [1..]) | otherwise = id where Just old' = oldDelim Just new' = newDelim delim | oldDelim == Nothing && newDelim == Nothing = error "Provide a new delimiter to enumerate lines." | newDelim == Nothing = old' | otherwise = new' -- | Chapter 3. Parse options and flags data Options = Options { optVersion :: Bool, optHelp :: Bool, optVerbose :: Bool, optDelim :: Maybe Char, optNewDelim :: Maybe Char, optSkip :: Maybe Int, optOmit :: Maybe Int, optEnum :: Bool } -- Handy if only version or help is needed defaultOptions' = defaultOptions "example.ssv" defaultOptions :: FilePath -> Options defaultOptions filename = Options { optVersion = False, optHelp = False, optVerbose = False, optDelim = decideF filename, optNewDelim = decideF filename, optSkip = Just 0, optOmit = Just 0, optEnum = False } options :: [OptDescr (Options -> IO Options)] options = [ Option ['V'] ["version"] (NoArg showVersion) "Show version number", Option ['h'] ["help"] (NoArg showHelp) "Show help message", Option ['v'] ["verbose"] (NoArg setVerbose) "Enable verbose messages", Option ['f'] ["delim"] (ReqArg setDelim " ") "Define delimiter in the input file", Option ['g'] ["newdelim"] (ReqArg setNewDelim " ") "Define delimiter in the output file", Option ['s'] ["skip"] (ReqArg setSkip "0") "Set number of lines to skip", Option ['t'] ["omit"] (ReqArg setOmit "0") "Set number of lines to omit in the end of the file", Option ['n'] ["enum"] (NoArg setEnum) "Treat the line number as the first column of the input." ] setDelim arg opt = return opt { optDelim = if length (readLiteral arg) > 0 then Just (readLiteral arg !! 0) else error "The delimiter should have non-zero length" } setNewDelim arg opt = return opt { optNewDelim = if length (readLiteral arg) > 0 then Just (readLiteral arg !! 0) else error "The new delimiter should have non-zero length" } setSkip arg opt = return opt { optSkip = let argParsed = readMaybe arg in if (argParsed == Nothing) then error "The number of lines to skip should be an integer" else argParsed } setOmit arg opt = return opt { optOmit = let argParsed = readMaybe arg in if (argParsed == Nothing) then error "The number of lines to omit should be an integer" else argParsed } setEnum opt = return opt { optEnum = True } setVerbose opt = return opt { optVerbose = True } elem' :: Eq a => [a] -> a -> Bool elem' = flip elem readLiteral :: [Char] -> String readLiteral s = read ('"' : s ++ "\"") :: String parseArgs argv = case getOpt Permute options argv of (args,fs,[]) -> do let files = if null fs then ["-"] else fs return (args, files) (_,_,errs) -> do hPutStrLn stderr (concat errs ++ usageInfo header options) exitWith (ExitFailure 1) where header = "Usage: rei [options] rule file" showVersion _ = do hPutStrLn stderr "rei: process lists easily. Version 0.3.5 (alpha). March 2016." exitWith ExitSuccess showHelp _ = do let header = "\n\trei is designed to process lists easily\n" hPutStrLn stderr (usageInfo header options) exitWith ExitSuccess -- | Chapter 4. Working with the file unlessM :: Monad m => m Bool -> m () -> m () unlessM b s = b >>= (\t -> unless t s) -- | Chapter 5. Magic functions condense2 :: Eq a => [[a]] -> [[a]] condense2 input = zipWith (:) keys values where grouped = groupBy ((==) `on` fst) . map tuplify2 $ input keys = map (fst . (!! 0)) grouped values = map (map snd) grouped melter :: [t] -> [[t]] melter [] = [] melter (a:b:[]) = [ [a,b] ] melter (a:b:xs) = ( [a,b]:(melter (a:xs)) ) melt2 :: [[t]] -> [[t]] melt2 input = foldr (++) [] . map melter $ input mzip :: [[a]] -> [[a]] -> [[a]] mzip a b = map (\(x,y) -> x++y) ( zip a b ) subtrList :: Eq a => [a] -> [a] -> [a] subtrList xs ys = filter (\x -> notElem x ys) xs filterUnique :: (Eq a) => Int -> [[a]] -> [[a]] filterUnique n xs = foldl (\xs x -> if xs == [] then [x] else if (x!!n)/=(xs!!0!!n) then x:xs else xs) [] xs main = do (opts, args) <- getArgs >>= parseArgs -- putStrLn $ "Flags: " ++ show opts -- Check several special options: help and version optsSp <- foldl (>>=) (return defaultOptions') opts let Options { optVersion = versionOnly, optHelp = helpOnly } = optsSp when helpOnly $ showHelp "" when versionOnly $ showVersion "" -- Check the rule let (rule:files) = args let filename | rule == "-" = error "Provide a rule and a filename\n-- Example: `rei \"a b -> b a\" example.ssv`" | rule == "filter" = files !! 1 | rule == "reduce" = files !! 1 | rule == "distinct" = files !! 1 | length files > 0 = files !! 0 | otherwise = error "Provide a filename\n-- Example: `rei \"a b -> b a\" example.ssv`" -- Parse options opts' <- foldl (>>=) (return $ defaultOptions filename) opts let Options { optVerbose = verbose , optVersion = versionOnly , optDelim = fieldSep , optNewDelim = fieldSep' , optSkip = linesToSkip , optOmit = linesToOmit , optEnum = ifEnumerate } = opts' -- Check if the delimiter provided if the file format is unknown if fieldSep == Nothing then hPutStrLn stderr "Warning: the delimiter is not provided and can not be deduced from the file extension." else return () -- Parse the rule let pat = "(.*[^\\])->(.*)" :: String -- Definitions for magic functions let finalDelim | fieldSep' /= Nothing = B.pack [d2] | fieldSep /= Nothing = B.pack [d1] | otherwise = B.pack " " where Just d1 = fieldSep Just d2 = fieldSep' let open x = if x == "-" then getContents else readFile x let drop' (Just n) = drop n let take' m xs | m == Just 0 = id xs | otherwise = Prelude.take (length xs - m') xs where Just m' = m let getParsed x = parseLine' x fieldSep let rule' = strip rule -- Create a union of several files, column-wise when (rule' == "merge") $ do when verbose $ hPutStrLn stderr "Merging files..." let several_files | length files > 1 = files | otherwise = error "Provide at least two files to merge\n-- Example: `rei merge f1.ssv f2.ssv`" let print' x = B.putStrLn $ B.intercalate finalDelim $ x let parseActions = map getParsed . take' linesToOmit . drop' linesToSkip . lines filesContents <- mapM readFile several_files mapM_ print' $ foldl1 mzip ( map parseActions filesContents ) exitWith ExitSuccess -- Condense the first two columns of the file when (rule' == "condense2") $ do when verbose $ hPutStrLn stderr "Condensing a file..." let print' x = B.putStrLn $ B.intercalate finalDelim $ x mapM_ print' . condense2 . map getParsed . take' linesToOmit . drop' linesToSkip . lines =<< open filename exitWith ExitSuccess -- Melt the file into two-column file when (rule' == "melt2") $ do when verbose $ hPutStrLn stderr "Melting a file..." let print' x = B.putStrLn $ B.intercalate finalDelim $ x mapM_ print' . melt2 . map getParsed . take' linesToOmit . drop' linesToSkip . lines =<< open filename exitWith ExitSuccess -- Concatenate files, row-wise when (rule' == "unite" || rule' == "concatenate" || rule' == "concat") $ do when verbose $ hPutStrLn stderr "Uniting files..." let several_files | length files > 1 = files | otherwise = error "Provide at least two files to unite\n-- Example: `rei unite f1.ssv f2.ssv`" let print' x = B.putStrLn $ B.intercalate finalDelim $ x let parseActions = map getParsed . take' linesToOmit . drop' linesToSkip . lines filesContents <- mapM readFile several_files mapM_ print' $ concatMap parseActions filesContents exitWith ExitSuccess -- Find common rows for several files. when (rule' == "join" || rule' == "intersect") $ do when verbose $ hPutStrLn stderr "Joining files..." let several_files | length files > 1 = files | otherwise = error "Provide at least two files to join\n-- Example: `rei join f1.ssv f2.ssv`" let print' x = B.putStrLn $ B.intercalate finalDelim $ x let parseActions = map getParsed . take' linesToOmit . drop' linesToSkip . lines filesContents <- mapM readFile several_files let joined = foldl1 intersect ( map parseActions filesContents ) when verbose $ hPutStrLn stderr ("There are " ++ show (length joined) ++ " common rows.") mapM_ print' $ joined exitWith ExitSuccess -- Find common rows for several files. when (rule' == "subtract" || rule' == "subtr") $ do when verbose $ hPutStrLn stderr "Subtracting files..." let several_files | length files > 1 = files | otherwise = error "Provide at least two files to subtract\n-- Example: `rei subtr f1.ssv f2.ssv`" let print' x = B.putStrLn $ B.intercalate finalDelim $ x let parseActions = map getParsed . take' linesToOmit . drop' linesToSkip . lines filesContents <- mapM readFile several_files let elemsUniq = foldl1 subtrList ( map parseActions filesContents ) when verbose $ hPutStrLn stderr ("There are " ++ show (length elemsUniq) ++ " common rows.") mapM_ print' $ elemsUniq exitWith ExitSuccess -- Transpose a list. when (rule' == "transpose") $ do when verbose $ hPutStrLn stderr "Transposing a file..." let print' x = B.putStrLn $ B.intercalate finalDelim $ x mapM_ print' . transpose . map getParsed . take' linesToOmit . drop' linesToSkip . lines =<< open filename exitWith ExitSuccess -- Filter a list. when (rule' == "filter" || rule' == "reduce") $ do when verbose $ hPutStrLn stderr "Filtering a file..." let (pattern:several_files) | length files > 1 = files | otherwise = error "Provide a filtering pattern and a file\n -- Example: `rei filter 'chr source type => type ~ gene' f1.bed`" let single_file = several_files !! 0 let regex_filter = "(.*[^\\])=>(.*)~(.*)" :: String -- Filtering pattern let pattern_parts = (pattern =~ regex_filter) :: [[String]] when (length pattern_parts < 1 || length (pattern_parts !! 0) /= 4) $ error "Something's wrong with the filtering pattern. The pattern should consist of a set of column descriptors, the fat arrow, the target field name, and a regular expression.\n-- Example: `rei filter 'chr source type => type ~ gene' f1.bed`" let pattern_parts' = pattern_parts !! 0 let pattern_regex = strip $ pattern_parts' !! 3 let pattern_field = findFstInSnd (words $ pattern_parts' !! 2) (words $ pattern_parts' !! 1) !! 0 !! 0 let print'' x = B.putStrLn $ B.intercalate finalDelim $ x let parseActions = map getParsed . take' linesToOmit . drop' linesToSkip . lines fileContent <- if single_file == "-" then getContents else readFile single_file let filtered | rule' == "filter" = filter (\x -> (x !! pattern_field =~ pattern_regex :: Bool)) (parseActions fileContent) | rule' == "reduce" = filter (\x -> not (x !! pattern_field =~ pattern_regex :: Bool)) (parseActions fileContent) when verbose $ hPutStrLn stderr ("There are " ++ show (length filtered) ++ " filtered rows.") mapM_ print'' $ filtered exitWith ExitSuccess -- Get unique records by field. when (rule' == "distinct") $ do when verbose $ hPutStrLn stderr "Getting distinct records..." let (pattern:several_files) | length files > 1 = files | otherwise = error "Provide a filtering pattern and a file\n -- Example: `rei filter 'chr source type => type ~ gene' f1.bed`" let single_file = several_files !! 0 let regex_filter = "(.*[^\\])=>(.*)" :: String -- Filtering pattern let pattern_parts = (pattern =~ regex_filter) :: [[String]] when (length pattern_parts < 1 || length (pattern_parts !! 0) /= 3) $ error "Something's wrong with the filtering pattern. The pattern should consist of a set of column descriptors, the fat arrow, the target field name, and a regular expression.\n-- Example: `rei distinct 'chr source type => chr' f1.bed`" let pattern_parts' = pattern_parts !! 0 let pattern_regex = strip $ pattern_parts' !! 3 let pattern_field = findFstInSnd (words $ pattern_parts' !! 2) (words $ pattern_parts' !! 1) !! 0 !! 0 let print'' x = B.putStrLn $ B.intercalate finalDelim $ x let parseActions = map getParsed . take' linesToOmit . drop' linesToSkip . lines fileContent <- if single_file == "-" then getContents else readFile single_file let unique = reverse $ filterUnique pattern_field (parseActions fileContent) when verbose $ hPutStrLn stderr ("There are " ++ show (length unique) ++ " distinct rows.") mapM_ print'' $ unique exitWith ExitSuccess let (before, after) | matchResult == [] = error "Something's wrong with the rule. The rule must have two sets of column descriptors separated by the arrow.\n-- Example: `rei \"a b -> b a\" example.ssv`" | otherwise = tuplify2 . map strip . drop 1 $ matchResult !! 0 where matchResult = (rule =~ pat :: [[String]]) -- print $ splitOn " " rule -- print $ strip before -- print $ strip after let ell = "..." let order | checkRule before after = concat $ findFstInSnd (words after) (words before) | sole' ell before && sole' ell after = error "Something's wrong with the rule. There might be a variable which is unique to the right part." | otherwise = error "Something's wrong with the rule. There might be multiple ellipsis signs." let correctOrder ns xs = ifEl where (prex, sufx) = splitMargins ell (words before) ifEl | rightEllipsis = concat $ map (proceed shift) ns | otherwise = map (proceedSkip prex) ns proceed s x = if x > prex then [x+s] else if x == prex then [ x .. x + shift ] else [x] shift = lxs - prex - sufx - 1 proceedSkip a x = if x <= a then x else x + shift lxs = length xs rightEllipsis = ell `isInfixOf` after -- Parse the records let print' x | length order == 0 = error "The right part is empty. That doesn't make sense..." | maximum order >= length (getParsed x) = error "Too many fields requested. There might be extra fields in the rule, or the wrong delimiter might have been used." | otherwise = B.putStrLn $ B.intercalate (packEither fieldSep fieldSep' x) $ getFields x getFields x | ell_pos == Nothing = map (pre_x !!) order | otherwise = map (pre_x !!) $ correctOrder order pre_x where ell_pos = findIndex (=="...") (words before) pre_x = getParsed x -- Read the file and apply the rule let withFile s = mapM_ print' . maybeEnumerate ifEnumerate fieldSep fieldSep' . take' linesToOmit . drop' linesToSkip . lines =<< open s where open x = if x == "-" then getContents else readFile x drop' (Just n) = drop n take' m xs | m == Just 0 = id xs | otherwise = Prelude.take (length xs - m') xs where Just m' = m withFile filename -- Verbose messages when verbose $ hPutStrLn stderr (case fieldSep of Nothing -> "Delimiter of the input file is not defined." Just delim -> "-- Delimiter of the input file: \'" ++ [delim] ++ "\'.") when verbose $ hPutStrLn stderr (case fieldSep' of Nothing -> "-- Delimiter of the output file is not defined." Just delim -> "-- Delimiter of the output file: \'" ++ [delim] ++ "\'.") when verbose $ hPutStrLn stderr (case linesToSkip of Just 0 -> "-- No lines skipped in the beginning of the file." Just x -> "-- Lines skipped in the beginning: \'" ++ show linesToSkip ++ "\'.") when verbose $ hPutStrLn stderr (case linesToOmit of Just 0 -> "-- No lines skipped in the end of the file." Just x -> "-- Lines omitted in the end: \'" ++ show linesToOmit ++ "\'.") when verbose $ hPutStrLn stderr (case ifEnumerate of True -> "-- Line enumeration is enabled." False -> "-- Lines are not enumerated." )