{- TooManyCells.Program.Options
Gregory W. Schwartz

Options for the command line program.
-}

{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeOperators     #-}

module TooManyCells.Program.Options where

-- Remote
import Options.Generic
import qualified Data.Text as T

-- Local

-- | Command line arguments
data Options
    = MakeTree { matrixPath :: [String] <?> "(PATH) The path to the input directory containing the matrix output of cellranger (cellranger < 3 (matrix.mtx, genes.tsv, and barcodes.tsv) or cellranger >= 3 (matrix.mtx.gz, features.tsv.gz, and barcodes.tsv.gz) or an input csv file containing feature row names and cell column names. scATAC-seq is supported if input file contains \"fragments\", ends with \".tsv.gz\" (such as \"fragments.tsv.gz\" or \"sample1_fragments.tsv.gz\"), and is in the SORTED (sort -k1,1 -k2,2n) 10x fragments format (see also --binwidth, --no-binarize). If given as a list (--matrix-path input1 --matrix-path input2 etc.) then will join all matrices together. Assumes the same number and order of features in each matrix, so only cells are added."
               , binwidth :: Maybe Int <?> "(Nothing | BINSIZE) If input data has region features in the format of `chrN:START-END`, BINSIZE input is required to convert ranges to fixed width bins."
               , noBinarize :: Bool <?> "If a fragments.tsv.gz file, do not binarize data."
               , binarize :: Bool <?> "Binarize data. Default for fragments.tsv.gz."
               , excludeMatchFragments :: Maybe String <?> "([Nothing] | STRING) Exclude fragments from fragments.tsv.gz file if STRING is infix of fragment row. For instance, exclude all chrY fragments with \"--exclude-match-fragments chrY\"."
               , blacklistRegionsFile :: Maybe T.Text <?> "([Nothing] | FILE) Bed file containing regions to ignore. Any fragments overlapping these regions are ignored if the input is not a fragments file."
               , customRegion :: [T.Text] <?> "([Nothing] | chrN:START-END) Only look at these regions in the matrix for chromosome region features. A list in the format `chrN:START-END`, will assign values for a cell intersecting this region to this region. If a cell region overlaps two or more regions, will be counting that many times in the new features. Example input: `--custom-regions \"chr1:1003-10064\" --custom-regions \"chr2:3021-5034\"` etc."
               , projectionFile :: Maybe String <?> "([Nothing] | FILE) The input file containing positions of each cell for plotting. Format with header is \"barcode,x,y\". Useful for 10x where a TNSE projection is generated in \"projection.csv\". Cells without projections will not be plotted. If not supplied, no plot will be made."
               , cellWhitelistFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the cells to include. No header, line separated list of barcodes."
               , labelsFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the label for each cell barcode, with \"item,label\" header."
               , customLabel :: [T.Text] <?> "([] | [LABEL]) List of labels to assign each matrix if all cells from each matrix are given the same label per matrix. This argument intends to simplify the process of labeling by bypassing --labels-file if the user just wants each matrix to have its own label (i.e. sample). Must be the same length and order as --matrix-path: for instance, --matrix-path input1 --custom-label sample1 --matrix-path input2 --custom-label sample2 etc. will label all cells from input1 with sample1, input2 with sample2, etc. If there are multiple labels per matrix, you must use --labels-file."
               , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
               , featureColumn :: Maybe Int <?> "([1] | COLUMN) The column (1-indexed) in the features.tsv.gz file to use for feature names. If using matrix market format, cellranger stores multiple columns in the features file, usually the first column for the Ensembl identifier and the second column for the gene symbol. If the Ensembl identifier is not quickly accessible, use --feature-column 2 for the second column, which is usually more ubiquitous. Useful for overlaying gene expression so you can say --draw-leaf \"DrawItem (DrawContinuous \\\"CD4\\\")\") instead of --draw-leaf \"DrawItem (DrawContinuous \\\"ENSG00000010610\\\")\"). Does not affect CSV format (the column names will be the feature names)."
               , normalization :: [String] <?> "([TfIdfNorm] | UQNorm | MedNorm | TotalMedNorm | TotalNorm | LogCPMNorm DOUBLE | QuantileNorm | NoneNorm) Type of normalization before clustering. Can be used as a list to perform one after the other: --normalization QuantileNorm --normalization TfIdfNorm will first apply quantile then tf-idf normalization. TfIdfNorm normalizes based on the prevalence of each feature. UQNorm normalizes each observation by the upper quartile non-zero counts of that observation. MedNorm normalizes each observation by the median non-zero counts of that observation. TotalNorm normalizes each cell by the total count. TotalMedNorm normalized first each observation by total count then by median of non-zero counts across features. (LogCPMNorm DOUBLE) normalizes by logB(CPM + 1) where B is DOUBLE. QuantileNorm normalizes by quantile normalization, ignores zeros, may be slow. NoneNorm does not normalize. Default is TfIdfNorm for clustering and NoneNorm for differential (which instead uses the recommended edgeR single cell preprocessing including normalization and filtering, any normalization provided here will result in edgeR preprocessing on top). Older versions had BothNorm which has been replaced with --normalization TotalMedNorm --normalization TfIdfNorm. This change will also affect other analyses, as TfIdfNorm will now be in non-cluster-related entry points if specified, so --normalization UQNorm from < v2.0.0.0 is now --normalization UQNorm --normalization TfIdfNorm."
               , eigenGroup :: Maybe String <?> "([SignGroup] | KMeansGroup) Whether to group the eigenvector using the sign or kmeans while clustering. While the default is sign, kmeans may be more accurate (but starting points are arbitrary)."
               , numEigen :: Maybe Int <?> "([1] | INT) Number of eigenvectors to use while clustering with kmeans. Takes from the second to last eigenvector. Recommended to start at 1 and work up from there if needed. May help offset the possible instability and inaccuracy of SVDLIBC."
               , numRuns :: Maybe Int <?> "([Nothing] | INT) Number of runs for permutation test at each split for modularity. Defaults to no test."
               , minSize :: Maybe Int <?> "([1] | INT) The minimum size of a cluster. Defaults to 1."
               , maxStep :: Maybe Int <?> "([Nothing] | INT) Only keep clusters that are INT steps from the root. Defaults to all steps."
               , maxProportion :: Maybe Double <?> "([Nothing] | DOUBLE) Stopping criteria to stop at the node immediate after a node with DOUBLE proportion split. So a node N with L and R children will stop with this criteria at 0.5 if |L| / |R| < 0.5 or > 2 (absolute log2 transformed), that is, if one child has over twice as many items as the other child. Includes L and R in the final result."
               , minModularity :: Maybe Double <?> "([Nothing] | DOUBLE) Nearly the same as --min-distance, but for clustering instead of drawing (so the output json tree can be larger). Stopping criteria to stop at the node with DOUBLE modularity. So a node N with L and R children will stop with this criteria the distance at N to L and R is < DOUBLE. Does not include L and R in the final result."
               , minDistance :: Maybe Double <?> "([Nothing] | DOUBLE) Stopping criteria to stop at the node immediate after a node with DOUBLE distance. So a node N with L and R children will stop with this criteria the distance at N to L and R is < DOUBLE. Includes L and R in the final result."
               , minDistanceSearch :: Maybe Double <?> "([Nothing] | DOUBLE) Similar to --min-distance, but searches from the leaves to the root -- if a path from a subtree contains a distance of at least DOUBLE, keep that path, otherwise prune it. This argument assists in finding distant nodes."
               , smartCutoff :: Maybe Double <?> "([Nothing] | DOUBLE) Whether to set the cutoffs for --min-size, --max-proportion, --min-distance, and --min-distance-search based off of the distributions (median + (DOUBLE * MAD)) of all nodes. To use smart cutoffs, use this argument and then set one of the four arguments to an arbitrary number, whichever cutoff type you want to use. --max-proportion and --min-size distributions are log2 transformed."
               , elbowCutoff :: Maybe String <?> "(Max | Min) Whether to set the cutoffs for --min-size, --max-proportion, --min-distance, and --min-distance-search based off of the elbow point of distributions of all nodes. For a distribution in positive x and y on a graph, the top left hump would be Max and the bottom right dip would be Min. To use elbow cutoffs, use this argument and then set one of the three arguments to an arbitrary number, whichever cutoff type you want to use. --max-proportion and --min-size distributions are log2 transformed. Conflicts with --smart-cutoff, so this argument takes precedent."
               , customCut :: [Int] <?> "([Nothing] | NODE) List of nodes to prune (make these nodes leaves). Invoked by --custom-cut 34 --custom-cut 65 etc."
               , rootCut :: Maybe Int <?> "([Nothing] | NODE) Assign a new root to the tree, removing all nodes outside of the subtree."
               , dendrogramOutput :: Maybe String <?> "([dendrogram.svg] | FILE) The filename for the dendrogram. Supported formats are PNG, PS, PDF, and SVG."
               , matrixOutput :: Maybe String <?> "([Nothing] | FOLDER | FILE.csv) Output the filtered and normalized (not including TfIdfNorm) matrix in this folder under the --output directory in matrix market format or, if a csv file is specified, a dense csv format. Like input, features are rows."
               , labelsOutput :: Bool <?> "Whether to write the labels used for each observation as a labels.csv file in the output folder."
               , fragmentsOutput :: Bool <?> "Whether to output fragments_tsv.gz with barcodes altered by --custom-label in the output folder (excludes filtered-out cells). Useful for downstream analysis by the peaks entry point where the cluster barcodes differ from the original fragments.tsv.gz file when using --custom-label. Matches barcodes based on BARCODE-LABEL."
               , matrixTranspose :: Bool <?> "Whether to transpose the matrix before all processing (observations become features and vice-versa). Will be affected by other options (check your filtering thresholds, normalizations, etc!)"
               , drawLeaf :: Maybe String <?> "([DrawText] | DrawItem DrawItemType) How to draw leaves in the dendrogram. DrawText is the number of items in that leaf. DrawItem is the collection of items represented by circles, consisting of: DrawItem DrawLabel, where each item is colored by its label, DrawItem (DrawContinuous [FEATURE]), where each item is colored by the expression of FEATURE (corresponding to a feature name in the input matrix, [FEATURE] is a list, so if more than one FEATURE is listed, uses the average of the feature values), DrawItem (DrawThresholdContinuous [(FEATURE, THRESHOLD)]), where each item is colored by the binary high / low expression of FEATURE based on THRESHOLD (either `Exact DOUBLE` or `MadMedian DOUBLE`, where Exact just uses the DOUBLE as a cutoff value while MadMedian uses the DOUBLE as the number of MADs away from the median value of the feature) and multiple FEATUREs can be used to combinatorically label items (FEATURE1 high / FEATURE2 low, etc.), DrawItem DrawSumContinuous, where each item is colored by the sum of the post-normalized columns (use --normalization NoneNorm for UMI counts, default), and DrawItem DrawDiversity, where each node is colored by the diversity based on the labels of each item and the color is normalized separately for the leaves and the inner nodes. The default is DrawText, unless --labels-file is provided, in which DrawItem DrawLabel is the default. If the label or feature cannot be found, the default color will be black (check your spelling!)."
               , drawCollection :: Maybe String <?> "([PieChart] | PieRing | IndividualItems | Histogram | NoLeaf | CollectionGraph MAXWEIGHT THRESHOLD [NODE]) How to draw item leaves in the dendrogram. PieRing draws a pie chart ring around the items. PieChart only draws a pie chart instead of items. IndividualItems only draws items, no pie rings or charts. Histogram plots a histogram of the features requested. NoLeaf has no leaf, useful if there are so many items the tree takes very long to render. (CollectionGraph MAXWEIGHT THRESHOLD [NODE]) draws the nodes and edges within leaves that are descendents of NODE (empty list [] indicates draw all leaf networks) based on the input matrix, normalizes edges based on the MAXWEIGHT, and removes edges for display less than THRESHOLD (after normalization, so for CollectionGraph 2 0.5 [26], draw the leaf graphs for all leaves under node 26, then a edge of 0.7 would be removed because (0.7 / 2) < 0.5). For CollectionGraph with no colors, use --draw-leaf \"DrawItem DrawLabel\" and all nodes will be black. If you don't specify this option, DrawText from --draw-leaf overrides this argument and only the number of cells will be plotted."
               , drawMark :: Maybe String <?> "([MarkNone] | MarkModularity | MarkSignificance ) How to draw annotations around each inner node in the tree. MarkNone draws nothing and MarkModularity draws a black circle representing the modularity at that node, darker black means higher modularity for that next split. MarkSignificance is for significance, i.e. p-value, darker means higher value."
               , drawNodeNumber :: Bool <?> "Draw the node numbers on top of each node in the graph."
               , drawMaxNodeSize :: Maybe Double <?> "([72] | DOUBLE) The max node size when drawing the graph. 36 is the theoretical default, but here 72 makes for thicker branches."
               , drawMaxLeafNodeSize :: Maybe Double <?> "([--draw-max-node-size] | DOUBLE) The max leaf node size when drawing the graph. Defaults to the value of --draw-max-node-size."
               , drawNoScaleNodes :: Bool <?> "Do not scale inner node size when drawing the graph. Instead, uses draw-max-node-size as the size of each node and is highly recommended to change as the default may be too large for this option."
               , drawLegendSep :: Maybe Double <?> "([1] | DOUBLE) The amount of space between the legend and the tree."
               , drawLegendAllLabels :: Bool <?> "Whether to show all the labels in the label file instead of only showing labels within the current tree. The program generates colors from all labels in the label file first in order to keep consistent colors. By default, this value is false, meaning that only the labels present in the tree are shown (even though the colors are the same). The subset process occurs after --draw-colors, so when using that argument make sure to account for all labels."
               , drawPalette :: Maybe String <?> "([Set1] | Hsv | Ryb | Blues) Palette to use for legend colors. With high saturation in --draw-scale-saturation, consider using Hsv to better differentiate colors."
               , drawColors :: Maybe String <?> "([Nothing] | COLORS) Custom colors for the labels or continuous features. Will repeat if more labels than provided colors. For continuous feature plots, uses first two colors [high, low], defaults to [red, gray]. For instance: --draw-colors \"[\\\"#e41a1c\\\", \\\"#377eb8\\\"]\""
               , drawDiscretize :: Maybe String <?> "([Nothing] | COLORS | INT) Discretize colors by finding the nearest color for each item and node. For instance, --draw-discretize \"[\\\"#e41a1c\\\", \\\"#377eb8\\\"]\" will change all node and item colors to one of those two colors, based on Euclidean distance. If using \"--draw-discretize INT\", will instead take the default map and segment (or interpolate) it into INT colors, rather than a more continuous color scheme. May have unintended results when used with --draw-scale-saturation."
               , drawScaleSaturation :: Maybe Double <?> "([Nothing] | DOUBLE) Multiply the saturation value all nodes by this number in the HSV model. Useful for seeing more visibly the continuous colors by making the colors deeper against a gray scale."
               , drawItemLineWeight :: Maybe Double <?> "([0.1] | DOUBLE) The line weight for items in the leaves if shown. Supplied as if there are too many items, the collection may look like a black box. Set to 0 to disable outlines of items to avoid this."
               , drawFont :: Maybe String <?> "([Arial] | FONT) Specify the font to use for the labels when plotting."
               , drawBarBounds :: Bool <?> "Whether to plot only the minimum and maximum ticks for the color bars."
               , pca :: Maybe Int <?> "([Nothing] | INT) Not recommended, as it makes cosine similarity less meaningful (therefore less accurate -- instead, consider making your own similarity matrix and using cluster-tree, our sister algorithm, to cluster the matrix and plot with birch-beer). The number of dimensions to keep for PCA dimensionality reduction before clustering. Default is no PCA at all in order to keep all information. Should use with --shift-positive to ensure no negative values (as --pca will center and scale). Consider changing the modularity cutoff to a lower value (such as --min-modularity -0.5)."
               , lsa :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for LSA dimensionality reduction. Uses TD-IDF followed by SVD before clustering, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
               , svd :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for SVD dimensionality reduction. Will center and scale, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
               , dropDimension :: Bool <?> "Instead of keeping dimensions for --pca, --lsa, or --svd, drop the number of dimensions (i.e. --lsa 1 drops the first projection which may have a strong batch effect)."
               , shiftPositive :: Bool <?> "Shift features to positive values. Positive values are shifted to allow modularity to work correctly."
               , filterThresholds :: Maybe String <?> "([Nothing] | (DOUBLE, DOUBLE)) The minimum filter thresholds for (MINCELL, MINFEATURE) when filtering cells and features by low read counts. Default changed to Nothing due to additional supported assays. To use the original filter thresholds, use --filter-thresholds \"(250, 1)\"."
               , prior :: Maybe String <?> "([Nothing] | STRING) The input folder containing the output from a previous run. If specified, skips clustering by using the previous clustering files."
               , noUpdateTreeRows :: Bool <?> "Don't update the row indices of a tree if using an identical matrix to the one which generated the tree. Should not be used unless the matrix to make the tree is identical, then can result in speedup."
               , order :: Maybe Double <?> "([1] | DOUBLE) The order of diversity."
               , clumpinessMethod :: Maybe String <?> "([Majority] | Exclusive | AllExclusive) The method used when calculating clumpiness: Majority labels leaves according to the most abundant label, Exclusive only looks at leaves consisting of cells solely from one label, and AllExclusive treats the leaf as containing both labels."
               , dense :: Bool <?> "Whether to use dense matrix algorithms for clustering. Should be faster for dense matrices, so if batch correction, PCA, or other algorithms are applied upstream to the input matrix, consider using this option to speed up the tree generation."
               , output :: Maybe String <?> "([out] | STRING) The folder containing output."}
    | Interactive { matrixPath :: [String] <?> "(PATH) The path to the input directory containing the matrix output of cellranger (cellranger < 3 (matrix.mtx, genes.tsv, and barcodes.tsv) or cellranger >= 3 (matrix.mtx.gz, features.tsv.gz, and barcodes.tsv.gz) or an input csv file containing feature row names and cell column names. scATAC-seq is supported if input file contains \"fragments\", ends with \".tsv.gz\" (such as \"fragments.tsv.gz\" or \"sample1_fragments.tsv.gz\"), and is in the SORTED (sort -k1,1 -k2,2n) 10x fragments format (see also --binwidth, --no-binarize). If given as a list (--matrix-path input1 --matrix-path input2 etc.) then will join all matrices together. Assumes the same number and order of features in each matrix, so only cells are added."
                  , matrixTranspose :: Bool <?> "Whether to transpose the matrix before all processing (observations become features and vice-versa). Will be affected by other options (check your filtering thresholds, normalizations, etc!)"
                  , binwidth :: Maybe Int <?> "(Nothing | BINSIZE) If input data has region features in the format of `chrN:START-END`, BINSIZE input is required to convert ranges to fixed width bins."
                  , noBinarize :: Bool <?> "If a fragments.tsv.gz file, do not binarize data."
                  , binarize :: Bool <?> "Binarize data. Default for fragments.tsv.gz."
                  , excludeMatchFragments :: Maybe String <?> "([Nothing] | STRING) Exclude fragments from fragments.tsv.gz file if STRING is infix of fragment row. For instance, exclude all chrY fragments with \"--exclude-match-fragments chrY\"."
                  , blacklistRegionsFile :: Maybe T.Text <?> "([Nothing] | FILE) Bed file containing regions to ignore. Any fragments overlapping these regions are ignored if the input is not a fragments file."
                  , customRegion :: [T.Text] <?> "([Nothing] | chrN:START-END) Only look at these regions in the matrix for chromosome region features. A list in the format `chrN:START-END`, will assign values for a cell intersecting this region to this region. If a cell region overlaps two or more regions, will be counting that many times in the new features. Example input: `--custom-regions \"chr1:1003-10064\" --custom-regions \"chr2:3021-5034\"` etc."
                  , cellWhitelistFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the cells to include. No header, line separated list of barcodes."
                  , labelsFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the label for each cell barcode, with \"item,label\" header."
                  , customLabel :: [T.Text] <?> "([] | [LABEL]) List of labels to assign each matrix if all cells from each matrix are given the same label per matrix. This argument intends to simplify the process of labeling by bypassing --labels-file if the user just wants each matrix to have its own label (i.e. sample). Must be the same length and order as --matrix-path: for instance, --matrix-path input1 --custom-label sample1 --matrix-path input2 --custom-label sample2 etc. will label all cells from input1 with sample1, input2 with sample2, etc. If there are multiple labels per matrix, you must use --labels-file."
                  , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
                  , featureColumn :: Maybe Int <?> "([1] | COLUMN) The column (1-indexed) in the features.tsv.gz file to use for feature names. If using matrix market format, cellranger stores multiple columns in the features file, usually the first column for the Ensembl identifier and the second column for the gene symbol. If the Ensembl identifier is not quickly accessible, use --feature-column 2 for the second column, which is usually more ubiquitous. Useful for overlaying gene expression so you can say --draw-leaf \"DrawItem (DrawContinuous \\\"CD4\\\")\") instead of --draw-leaf \"DrawItem (DrawContinuous \\\"ENSG00000010610\\\")\"). Does not affect CSV format (the column names will be the feature names)."
                  , normalization :: [String] <?> "([TfIdfNorm] | UQNorm | MedNorm | TotalMedNorm | TotalNorm | LogCPMNorm DOUBLE | QuantileNorm | NoneNorm) Type of normalization before clustering. Can be used as a list to perform one after the other: --normalization QuantileNorm --normalization TfIdfNorm will first apply quantile then tf-idf normalization. TfIdfNorm normalizes based on the prevalence of each feature. UQNorm normalizes each observation by the upper quartile non-zero counts of that observation. MedNorm normalizes each observation by the median non-zero counts of that observation. TotalNorm normalizes each cell by the total count. TotalMedNorm normalized first each observation by total count then by median of non-zero counts across features. (LogCPMNorm DOUBLE) normalizes by logB(CPM + 1) where B is DOUBLE. QuantileNorm normalizes by quantile normalization, ignores zeros, may be slow. NoneNorm does not normalize. Default is TfIdfNorm for clustering and NoneNorm for differential (which instead uses the recommended edgeR single cell preprocessing including normalization and filtering, any normalization provided here will result in edgeR preprocessing on top). Older versions had BothNorm which has been replaced with --normalization TotalMedNorm --normalization TfIdfNorm. This change will also affect other analyses, as TfIdfNorm will now be in non-cluster-related entry points if specified, so --normalization UQNorm from < v2.0.0.0 is now --normalization UQNorm --normalization TfIdfNorm."
                  , pca :: Maybe Int <?> "([Nothing] | INT) Not recommended, as it makes cosine similarity less meaningful (therefore less accurate -- instead, consider making your own similarity matrix and using cluster-tree, our sister algorithm, to cluster the matrix and plot with birch-beer). The number of dimensions to keep for PCA dimensionality reduction before clustering. Default is no PCA at all in order to keep all information. Should use with --shift-positive to ensure no negative values (as --pca will center and scale). Consider changing the modularity cutoff to a lower value (such as --min-modularity -0.5)."
                  , lsa :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for LSA dimensionality reduction. Uses TD-IDF followed by SVD before clustering, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
                  , svd :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for SVD dimensionality reduction. Will center and scale, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
                  , dropDimension :: Bool <?> "Instead of keeping dimensions for --pca, --lsa, or --svd, drop the number of dimensions (i.e. --lsa 1 drops the first projection which may have a strong batch effect)."
                  , shiftPositive :: Bool <?> "Shift features to positive values. Positive values are shifted to allow modularity to work correctly."
                  , filterThresholds :: Maybe String <?> "([Nothing] | (DOUBLE, DOUBLE)) The minimum filter thresholds for (MINCELL, MINFEATURE) when filtering cells and features by low read counts. Default changed to Nothing due to additional supported assays. To use the original filter thresholds, use --filter-thresholds \"(250, 1)\"."
                  , prior :: Maybe String <?> "([Nothing] | STRING) The input folder containing the output from a previous run. If specified, skips clustering by using the previous clustering files."
                  , noUpdateTreeRows :: Bool <?> "Don't update the row indices of a tree if using an identical matrix to the one which generated the tree. Should not be used unless the matrix to make the tree is identical, then can result in speedup."      }
    | Differential { matrixPath :: [String] <?> "(PATH) The path to the input directory containing the matrix output of cellranger (cellranger < 3 (matrix.mtx, genes.tsv, and barcodes.tsv) or cellranger >= 3 (matrix.mtx.gz, features.tsv.gz, and barcodes.tsv.gz) or an input csv file containing feature row names and cell column names. scATAC-seq is supported if input file contains \"fragments\", ends with \".tsv.gz\" (such as \"fragments.tsv.gz\" or \"sample1_fragments.tsv.gz\"), and is in the SORTED (sort -k1,1 -k2,2n) 10x fragments format (see also --binwidth, --no-binarize). If given as a list (--matrix-path input1 --matrix-path input2 etc.) then will join all matrices together. Assumes the same number and order of features in each matrix, so only cells are added."
                   , matrixTranspose :: Bool <?> "Whether to transpose the matrix before all processing (observations become features and vice-versa). Will be affected by other options (check your filtering thresholds, normalizations, etc!)"
                   , binwidth :: Maybe Int <?> "(Nothing | BINSIZE) If input data has region features in the format of `chrN:START-END`, BINSIZE input is required to convert ranges to fixed width bins."
                   , noBinarize :: Bool <?> "If a fragments.tsv.gz file, do not binarize data."
                   , binarize :: Bool <?> "Binarize data. Default for fragments.tsv.gz."
                   , excludeMatchFragments :: Maybe String <?> "([Nothing] | STRING) Exclude fragments from fragments.tsv.gz file if STRING is infix of fragment row. For instance, exclude all chrY fragments with \"--exclude-match-fragments chrY\"."
                   , blacklistRegionsFile :: Maybe T.Text <?> "([Nothing] | FILE) Bed file containing regions to ignore. Any fragments overlapping these regions are ignored if the input is not a fragments file."
                   , customRegion :: [T.Text] <?> "([Nothing] | chrN:START-END) Only look at these regions in the matrix for chromosome region features. A list in the format `chrN:START-END`, will assign values for a cell intersecting this region to this region. If a cell region overlaps two or more regions, will be counting that many times in the new features. Example input: `--custom-regions \"chr1:1003-10064\" --custom-regions \"chr2:3021-5034\"` etc."
                   , cellWhitelistFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the cells to include. No header, line separated list of barcodes."
                   , labelsFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the label for each cell barcode, with \"item,label\" header."
                   , customLabel :: [T.Text] <?> "([] | [LABEL]) List of labels to assign each matrix if all cells from each matrix are given the same label per matrix. This argument intends to simplify the process of labeling by bypassing --labels-file if the user just wants each matrix to have its own label (i.e. sample). Must be the same length and order as --matrix-path: for instance, --matrix-path input1 --custom-label sample1 --matrix-path input2 --custom-label sample2 etc. will label all cells from input1 with sample1, input2 with sample2, etc. If there are multiple labels per matrix, you must use --labels-file."
                   , featureColumn :: Maybe Int <?> "([1] | COLUMN) The column (1-indexed) in the features.tsv.gz file to use for feature names. If using matrix market format, cellranger stores multiple columns in the features file, usually the first column for the Ensembl identifier and the second column for the gene symbol. If the Ensembl identifier is not quickly accessible, use --feature-column 2 for the second column, which is usually more ubiquitous. Useful for overlaying gene expression so you can say --draw-leaf \"DrawItem (DrawContinuous \\\"CD4\\\")\") instead of --draw-leaf \"DrawItem (DrawContinuous \\\"ENSG00000010610\\\")\"). Does not affect CSV format (the column names will be the feature names)."
                   , pca :: Maybe Int <?> "([Nothing] | INT) Not recommended, as it makes cosine similarity less meaningful (therefore less accurate -- instead, consider making your own similarity matrix and using cluster-tree, our sister algorithm, to cluster the matrix and plot with birch-beer). The number of dimensions to keep for PCA dimensionality reduction before clustering. Default is no PCA at all in order to keep all information. Should use with --shift-positive to ensure no negative values (as --pca will center and scale). Consider changing the modularity cutoff to a lower value (such as --min-modularity -0.5)."
                   , lsa :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for LSA dimensionality reduction. Uses TD-IDF followed by SVD before clustering, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
                   , svd :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for SVD dimensionality reduction. Will center and scale, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
                   , dropDimension :: Bool <?> "Instead of keeping dimensions for --pca, --lsa, or --svd, drop the number of dimensions (i.e. --lsa 1 drops the first projection which may have a strong batch effect)."
                   , shiftPositive :: Bool <?> "Shift features to positive values. Positive values are shifted to allow modularity to work correctly."
                   , filterThresholds :: Maybe String <?> "([Nothing] | (DOUBLE, DOUBLE)) The minimum filter thresholds for (MINCELL, MINFEATURE) when filtering cells and features by low read counts. Default changed to Nothing due to additional supported assays. To use the original filter thresholds, use --filter-thresholds \"(250, 1)\"."
                   , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
                   , normalization :: [String] <?> "([TfIdfNorm] | UQNorm | MedNorm | TotalMedNorm | TotalNorm | LogCPMNorm DOUBLE | QuantileNorm | NoneNorm) Type of normalization before clustering. Can be used as a list to perform one after the other: --normalization QuantileNorm --normalization TfIdfNorm will first apply quantile then tf-idf normalization. TfIdfNorm normalizes based on the prevalence of each feature. UQNorm normalizes each observation by the upper quartile non-zero counts of that observation. MedNorm normalizes each observation by the median non-zero counts of that observation. TotalNorm normalizes each cell by the total count. TotalMedNorm normalized first each observation by total count then by median of non-zero counts across features. (LogCPMNorm DOUBLE) normalizes by logB(CPM + 1) where B is DOUBLE. QuantileNorm normalizes by quantile normalization, ignores zeros, may be slow. NoneNorm does not normalize. Default is TfIdfNorm for clustering and NoneNorm for differential (which instead uses the recommended edgeR single cell preprocessing including normalization and filtering, any normalization provided here will result in edgeR preprocessing on top). Older versions had BothNorm which has been replaced with --normalization TotalMedNorm --normalization TfIdfNorm. This change will also affect other analyses, as TfIdfNorm will now be in non-cluster-related entry points if specified, so --normalization UQNorm from < v2.0.0.0 is now --normalization UQNorm --normalization TfIdfNorm."
                   , prior :: Maybe String <?> "([Nothing] | STRING) The input folder containing the output from a previous run. If specified, skips clustering by using the previous clustering files."
                   , noUpdateTreeRows :: Bool <?> "Don't update the row indices of a tree if using an identical matrix to the one which generated the tree. Should not be used unless the matrix to make the tree is identical, then can result in speedup."
                   , noEdger :: Bool <?> "Use Kruskall-Wallis instead of edgeR for differential expression between two sets of nodes (automatically on and required for all to all comparisons)."
                   , nodes :: String <?> "([NODE], [NODE]) Find the differential expression between cells belonging downstream of a list of nodes versus another list of nodes. Directionality is \"([1], [2])\" -> 2 / 1. \"([], [])\" switches the process to instead find the log2 average division between all nodes with all other cells in the provided data set matrix regardless of whether they are present within the tree (node / other cells) using the Kruskal-Wallis Test (--features does not work for this, --labels works, and UQNorm for the normalization is recommended. Only returns nodes where the comparison had both groups containing at least five cells.). If not using --no-update-tree-rows, remember to filter the matrix for cells outside of the tree if you only want to compare against other nodes within the tree."
                   , labels :: Maybe String <?> "([Nothing] | ([LABEL], [LABEL])) Use --labels-file to restrict the differential analysis to cells with these labels. Same format as --nodes, so the first list in --nodes and --labels gets the cells within that list of nodes with this list of labels. The same for the second list. For instance, --nodes \"([1], [2])\" --labels \"([\\\"A\\\"], [\\\"B\\\"])\" will compare cells from node 1 of label \"A\" only with cells from node 2 of label \"B\" only. To use all cells for that set of nodes, use an empty list, i.e. --labels \"([], [\\\"A\\\"])\". When comparing all nodes with all other cells, remember that the notation would be ([Other Cells], [Node]), so to compare cells of label X in Node with cells of label Y in Other Cells, use --labels \"([\\\"Y\\\", \\\"X\\\"])\". Requires both --labels and --labels-file, otherwise will include all labels."
                   , topN :: Maybe Int <?> "([100] | INT ) The top INT differentially expressed features."
                   , seed :: Maybe Int <?> "([0] | INT) The seed to use for subsampling. See --subsample-groups."
                   , subsampleGroups :: Maybe Int <?> "([Nothing] | INT) Whether to subsample each group in the differential comparison. Subsets the specified number of cells. When set to 0, subsamples the larger group to equal the size of the smaller group. When using with --nodes \"([], [])\" to compare all nodes against each other, note that the compared nodes may be resampled. Highly experimental at this stage, use with caution. See --seed."
                   , features :: [T.Text] <?> "([Nothing] | FEATURE) List of features (e.g. genes) to plot for all cells within selected nodes. Invoked by --features CD4 --features CD8 etc. When this argument is supplied, only the plot is outputted and edgeR differential expression is ignored. Outputs to --output."
                   , aggregate :: Bool <?> "([False] | True) Whether to plot the aggregate (mean here) of features for each cell from \"--features\" instead of plotting different distributions for each feature."
                   , plotSeparateNodes :: Bool <?> "([False] | True) Whether to plot each node separately. This will plot each node provided in --nodes from both entries in the tuple (as they may be different from --labels)."
                   , plotSeparateLabels :: Bool <?> "([False] | True) Whether to plot each label separately. This will plot each label provided in --labels from both entries in the tuple (as they may be different from --nodes)."
                   , plotViolin :: Bool <?> "([False] | True) Whether to plot features as a violin plots instead of boxplots."
                   , plotNoOutlier :: Bool <?> "([False] | True) Whether to avoid plotting outliers as there can be too many, making the plot too large."
                   , plotOutput :: Maybe String <?> "([out.pdf] | STRING) The file containing the output plot."}
    | Diversity { priors :: [String] <?> "(PATH) Either input folders containing the output from a run of too-many-cells or a csv files containing the clusters for each cell in the format \"cell,cluster\". Advanced features not available in the latter case. If --labels-file is specified, those labels designate entity type, otherwise the assigned cluster is the entity type."
                , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
                , labelsFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the label for each cell barcode, with \"item,label\" header."
                , start :: Maybe Integer <?> "([0] | INT) For the rarefaction curve, start the curve at this subsampling."
                , interval :: Maybe Integer <?> "([1] | INT) For the rarefaction curve, the amount to increase each subsampling. For instance, starting at 0 with an interval of 4, we would sampling 0, 4, 8, 12, ..."
                , end :: Maybe Integer <?> "([N] | INT) For the rarefaction curve, which subsample to stop at. By default, the curve stops at the observed number of species for each population."
                , order :: Maybe Double <?> "([1] | DOUBLE) The order of diversity."
                , output :: Maybe String <?> "([out] | STRING) The folder containing output."}
    | Paths { prior :: Maybe String <?> "([Nothing] | STRING) The input folder containing the output from a previous run. If specified, skips clustering by using the previous clustering files."
            , labelsFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the label for each cell barcode, with \"item,label\" header."
            , flipDirection :: Bool <?> "Flip the starting node when calculating the distances."
            , shallowStart :: Bool <?> "Choose the shallowest leaf as the starting node (rather than the deepest)."
            , pathDistance :: Maybe String <?> "([PathStep] | PathModularity) How to measure the distance from the starting leaf. PathModularity weighs the steps by the modularity, while PathStep counts the number of steps."
            , bandwidth :: Maybe Double <?> "([1] | DOUBLE) Bandwidth of the density plot."
            , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
            , pathsPalette :: Maybe String <?> "([Set1] | Hsv | Ryb | Blues) Palette to use for legend colors."
            , output :: Maybe String <?> "([out] | STRING) The folder containing output."}
    | Classify { matrixPath :: [String] <?> "(PATH) The path to the input directory containing the matrix output of cellranger (cellranger < 3 (matrix.mtx, genes.tsv, and barcodes.tsv) or cellranger >= 3 (matrix.mtx.gz, features.tsv.gz, and barcodes.tsv.gz) or an input csv file containing feature row names and cell column names. scATAC-seq is supported if input file contains \"fragments\", ends with \".tsv.gz\" (such as \"fragments.tsv.gz\" or \"sample1_fragments.tsv.gz\"), and is in the SORTED (sort -k1,1 -k2,2n) 10x fragments format (see also --binwidth, --no-binarize). If given as a list (--matrix-path input1 --matrix-path input2 etc.) then will join all matrices together. Assumes the same number and order of features in each matrix, so only cells are added."
               , referenceFile :: [String] <?> "(PATH) The path to the reference file to compare each cell to. Every transformation (e.g. filters and normalizations) applied to --matrix-path apply here as well."
               , singleReferenceMatrix :: Bool <?> "Treat the reference file as a single matrix such that each observation (barcode) is an aggregated reference population."
               , matrixTranspose :: Bool <?> "Whether to transpose the matrix before all processing (observations become features and vice-versa). Will be affected by other options (check your filtering thresholds, normalizations, etc!)"
               , binwidth :: Maybe Int <?> "(Nothing | BINSIZE) If input data has region features in the format of `chrN:START-END`, BINSIZE input is required to convert ranges to fixed width bins."
               , noBinarize :: Bool <?> "If a fragments.tsv.gz file, do not binarize data."
               , binarize :: Bool <?> "Binarize data. Default for fragments.tsv.gz."
               , excludeMatchFragments :: Maybe String <?> "([Nothing] | STRING) Exclude fragments from fragments.tsv.gz file if STRING is infix of fragment row. For instance, exclude all chrY fragments with \"--exclude-match-fragments chrY\"."
               , blacklistRegionsFile :: Maybe T.Text <?> "([Nothing] | FILE) Bed file containing regions to ignore. Any fragments overlapping these regions are ignored if the input is not a fragments file."
               , customRegion :: [T.Text] <?> "([Nothing] | chrN:START-END) Only look at these regions in the matrix for chromosome region features. A list in the format `chrN:START-END`, will assign values for a cell intersecting this region to this region. If a cell region overlaps two or more regions, will be counting that many times in the new features. Example input: `--custom-regions \"chr1:1003-10064\" --custom-regions \"chr2:3021-5034\"` etc."
               , cellWhitelistFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the cells to include. No header, line separated list of barcodes."
               , labelsFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the label for each cell barcode, with \"item,label\" header."
               , customLabel :: [T.Text] <?> "([] | [LABEL]) List of labels to assign each matrix if all cells from each matrix are given the same label per matrix. This argument intends to simplify the process of labeling by bypassing --labels-file if the user just wants each matrix to have its own label (i.e. sample). Must be the same length and order as --matrix-path: for instance, --matrix-path input1 --custom-label sample1 --matrix-path input2 --custom-label sample2 etc. will label all cells from input1 with sample1, input2 with sample2, etc. If there are multiple labels per matrix, you must use --labels-file."
               , featureColumn :: Maybe Int <?> "([1] | COLUMN) The column (1-indexed) in the features.tsv.gz file to use for feature names. If using matrix market format, cellranger stores multiple columns in the features file, usually the first column for the Ensembl identifier and the second column for the gene symbol. If the Ensembl identifier is not quickly accessible, use --feature-column 2 for the second column, which is usually more ubiquitous. Useful for overlaying gene expression so you can say --draw-leaf \"DrawItem (DrawContinuous \\\"CD4\\\")\") instead of --draw-leaf \"DrawItem (DrawContinuous \\\"ENSG00000010610\\\")\"). Does not affect CSV format (the column names will be the feature names)."
               , pca :: Maybe Int <?> "([Nothing] | INT) Not recommended, as it makes cosine similarity less meaningful (therefore less accurate -- instead, consider making your own similarity matrix and using cluster-tree, our sister algorithm, to cluster the matrix and plot with birch-beer). The number of dimensions to keep for PCA dimensionality reduction before clustering. Default is no PCA at all in order to keep all information. Should use with --shift-positive to ensure no negative values (as --pca will center and scale). Consider changing the modularity cutoff to a lower value (such as --min-modularity -0.5)."
               , lsa :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for LSA dimensionality reduction. Uses TD-IDF followed by SVD before clustering, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
               , svd :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for SVD dimensionality reduction. Will center and scale, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
               , dropDimension :: Bool <?> "Instead of keeping dimensions for --pca, --lsa, or --svd, drop the number of dimensions (i.e. --lsa 1 drops the first projection which may have a strong batch effect)."
               , shiftPositive :: Bool <?> "Shift features to positive values. Positive values are shifted to allow modularity to work correctly."
               , filterThresholds :: Maybe String <?> "([Nothing] | (DOUBLE, DOUBLE)) The minimum filter thresholds for (MINCELL, MINFEATURE) when filtering cells and features by low read counts. Default changed to Nothing due to additional supported assays. To use the original filter thresholds, use --filter-thresholds \"(250, 1)\"."
               , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
               , normalization :: [String] <?> "([TfIdfNorm] | UQNorm | MedNorm | TotalMedNorm | TotalNorm | LogCPMNorm DOUBLE | QuantileNorm | NoneNorm) Type of normalization before clustering. Can be used as a list to perform one after the other: --normalization QuantileNorm --normalization TfIdfNorm will first apply quantile then tf-idf normalization. TfIdfNorm normalizes based on the prevalence of each feature. UQNorm normalizes each observation by the upper quartile non-zero counts of that observation. MedNorm normalizes each observation by the median non-zero counts of that observation. TotalNorm normalizes each cell by the total count. TotalMedNorm normalized first each observation by total count then by median of non-zero counts across features. (LogCPMNorm DOUBLE) normalizes by logB(CPM + 1) where B is DOUBLE. QuantileNorm normalizes by quantile normalization, ignores zeros, may be slow. NoneNorm does not normalize. Default is TfIdfNorm for clustering and NoneNorm for differential (which instead uses the recommended edgeR single cell preprocessing including normalization and filtering, any normalization provided here will result in edgeR preprocessing on top). Older versions had BothNorm which has been replaced with --normalization TotalMedNorm --normalization TfIdfNorm. This change will also affect other analyses, as TfIdfNorm will now be in non-cluster-related entry points if specified, so --normalization UQNorm from < v2.0.0.0 is now --normalization UQNorm --normalization TfIdfNorm." }
    | Peaks { fragmentsPath :: [String] <?> "(PATH) The path to the input fragments.tsv.gz file. The input file must contain \"fragments\" and end with \".tsv.gz\" (such as \"fragments.tsv.gz\" or \"sample1_fragments.tsv.gz\"), and is in the SORTED (sort -k1,1 -k2,2n) 10x fragments format (CHR\tSTART\tEND\tBARCODE\tVALUE). See --fragments-output from the make-tree and matrix-output entry points for assistance in merging files."
            , prior :: Maybe String <?> "([Nothing] | STRING) The input folder containing the output from a previous run. If specified, skips clustering by using the previous clustering files."
            , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
            , labelsFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the label for each cell barcode, with \"item,label\" header."
            , excludeMatchFragments :: Maybe String <?> "([Nothing] | STRING) Exclude fragments from fragments.tsv.gz file if STRING is infix of fragment row. For instance, exclude all chrY fragments with \"--exclude-match-fragments chrY\"."
            , blacklistRegionsFile :: Maybe T.Text <?> "([Nothing] | FILE) Bed file containing regions to ignore. Any fragments overlapping these regions are ignored if the input is not a fragments file."
            , customRegion :: [T.Text] <?> "([Nothing] | chrN:START-END) Only look at these regions in the matrix for chromosome region features. A list in the format `chrN:START-END`, will assign values for a cell intersecting this region to this region. If a cell region overlaps two or more regions, will be counting that many times in the new features. Example input: `--custom-regions \"chr1:1003-10064\" --custom-regions \"chr2:3021-5034\"` etc."
            , peakCallCommand :: Maybe String <?> "([macs2 callpeak --nomodel --nolambda -p 0.001 -B -t %s -n %s --outdir %s] | STRING) The command to call peaks with. Can be any command that will be run on each generated fragment file per cluster, but the first \"%s\" must be the input argument, second \"%s\" is the name of the sample, and the third \"%s\" should be the output directory. Uses macs2 by default. Must return a .narrowPeak file with each row being \"CHR\tSTART\tEND\t*\tVALUE\n\" at least (* can be anything, after VALUE there can be anything as well. Check macs2 output for guidance)."
            , genomecovCommand :: Maybe String <?> "([bedtools genomecov -i %s -g %s -scale %f -bg -trackline > %s] | STRING) The command to convert to coverage bedgraph output. Can be any command that will be run on each bed per cluster, but the first \"%s\" must be the input argument, the second \"%s\" is the genome file (see https://github.com/arq5x/bedtools2/tree/master/genomes), followed by the \"%f\" scaling argument, with the last \"%s\" as the output argument, in order. Uses bedtools genomecov by default."
            , genome :: Maybe String <?> "([./human.hg38.genome] | PATH) The location of the genome file for the --genomecov-command, see https://github.com/arq5x/bedtools2/tree/master/genomes"
            , skipFragments :: Bool <?> "Whether to skip the generation of the fragments (e.g. if changing only --peak-call-command and fragment separation by cluster already exists)."
            , peakNode :: [Int] <?> "([ALLLEAFNODES] | NODE) List of nodes to peak call, i.e. \"--peak-node 3 --peak-node 5 --peak-node 7\". If the node is not a leaf node, make sure to use --all-nodes in addition."
            , peakNodeLabels :: [String] <?> "([ALLLABELS] | NODE) List of labels to keep in each node when outputting fragments and peaks, i.e. --peak-node-labels \"(3, [\\\"Red\\\"])\" --peak-node-labels \"(5, [\\\"Red\\\", \\\"Blue\\\"]. Nodes not listed will include all labels."
            , allNodes :: Bool <?> "Whether to get fragments and peaks for all nodes, not just the leaves."
            , bedgraph :: Bool <?> "Whether to output cluster normalized per million bedgraph output."
            , output :: Maybe String <?> "([out] | STRING) The folder containing output." }
    | Motifs { diffFile :: T.Text <?> "(FILE) The input file containing the differential features between nodes. Must be in the format `node,feature,log2FC,pVal,FDR`. The node column is optional (if wanting to separate per node)."
             , backgroundDiffFile :: Maybe T.Text <?> "(FILE) The input file containing the differential features between nodes for use as a background in motif finding. Must be in the format `node,feature,log2FC,pVal,FDR`. The node column is optional (if wanting to separate per node). If using this argument, be sure to update the --motif-command appropriately (background file comes last, e.g. with homer use `/path/to/findMotifs.pl %s fasta %s -bgFasta %s`)."
             , motifGenome :: T.Text <?> "(FILE) The location of the genome file in fasta format to convert bed to fasta."
             , motifCommand :: Maybe String <?> "([meme %s -nmotifs 50 -oc %s] | STRING) The command to find motifs in a fasta file. Can be any command that will be run on each fasta file converted from the bed optionally per node, but the first \"%s\" must be the input file, the second \"%s\" is the output. An example of homer: `/path/to/findMotifs.pl %s fasta %s`. Uses meme by default."
             , motifGenomeCommand :: Maybe String <?> "([Nothing] | STRING) The command to find motifs from a bed file instead of --motif-command (replaces that argument), as in homer's findMotifsGenome.pl. Can be any command that will be run on each bed file optionally per node, but the first \"%s\" must be the input file, the second \"%s\" is the genome file, and the last is the output. An example of homer: `/path/to/findMotifsGenome.pl %s %s %s`."
             , topN :: Maybe Int <?> "([100] | INT ) The top INT differentially expressed features."
             , output :: Maybe String <?> "([out] | STRING) The folder containing output." }
    | MatrixOutput { matrixPath :: [String] <?> "(PATH) The path to the input directory containing the matrix output of cellranger (cellranger < 3 (matrix.mtx, genes.tsv, and barcodes.tsv) or cellranger >= 3 (matrix.mtx.gz, features.tsv.gz, and barcodes.tsv.gz) or an input csv file containing feature row names and cell column names. scATAC-seq is supported if input file contains \"fragments\", ends with \".tsv.gz\" (such as \"fragments.tsv.gz\" or \"sample1_fragments.tsv.gz\"), and is in the SORTED (sort -k1,1 -k2,2n) 10x fragments format (see also --binwidth, --no-binarize). If given as a list (--matrix-path input1 --matrix-path input2 etc.) then will join all matrices together. Assumes the same number and order of features in each matrix, so only cells are added."
                   , binwidth :: Maybe Int <?> "(Nothing | BINSIZE) If input data has region features in the format of `chrN:START-END`, BINSIZE input is required to convert ranges to fixed width bins."
                   , noBinarize :: Bool <?> "If a fragments.tsv.gz file, do not binarize data."
                   , binarize :: Bool <?> "Binarize data. Default for fragments.tsv.gz."
                   , excludeMatchFragments :: Maybe String <?> "([Nothing] | STRING) Exclude fragments from fragments.tsv.gz file if STRING is infix of fragment row. For instance, exclude all chrY fragments with \"--exclude-match-fragments chrY\"."
                   , blacklistRegionsFile :: Maybe T.Text <?> "([Nothing] | FILE) Bed file containing regions to ignore. Any fragments overlapping these regions are ignored if the input is not a fragments file."
                   , customRegion :: [T.Text] <?> "([Nothing] | chrN:START-END) Only look at these regions in the matrix for chromosome region features. A list in the format `chrN:START-END`, will assign values for a cell intersecting this region to this region. If a cell region overlaps two or more regions, will be counting that many times in the new features. Example input: `--custom-regions \"chr1:1003-10064\" --custom-regions \"chr2:3021-5034\"` etc."
                   , cellWhitelistFile :: Maybe String <?> "([Nothing] | FILE) The input file containing the cells to include. No header, line separated list of barcodes."
                   , customLabel :: [T.Text] <?> "([] | [LABEL]) List of labels to assign each matrix if all cells from each matrix are given the same label per matrix. This argument intends to simplify the process of labeling by bypassing --labels-file if the user just wants each matrix to have its own label (i.e. sample). Must be the same length and order as --matrix-path: for instance, --matrix-path input1 --custom-label sample1 --matrix-path input2 --custom-label sample2 etc. will label all cells from input1 with sample1, input2 with sample2, etc. If there are multiple labels per matrix, you must use --labels-file."
                   , delimiter :: Maybe Char <?> "([,] | CHAR) The delimiter for the most csv files in the program. For instance, if using a normal csv rather than cellranger output and for --labels-file."
                   , featureColumn :: Maybe Int <?> "([1] | COLUMN) The column (1-indexed) in the features.tsv.gz file to use for feature names. If using matrix market format, cellranger stores multiple columns in the features file, usually the first column for the Ensembl identifier and the second column for the gene symbol. If the Ensembl identifier is not quickly accessible, use --feature-column 2 for the second column, which is usually more ubiquitous. Useful for overlaying gene expression so you can say --draw-leaf \"DrawItem (DrawContinuous \\\"CD4\\\")\") instead of --draw-leaf \"DrawItem (DrawContinuous \\\"ENSG00000010610\\\")\"). Does not affect CSV format (the column names will be the feature names)."
                   , normalization :: [String] <?> "([TfIdfNorm] | UQNorm | MedNorm | TotalMedNorm | TotalNorm | LogCPMNorm DOUBLE | QuantileNorm | NoneNorm) Type of normalization before clustering. Can be used as a list to perform one after the other: --normalization QuantileNorm --normalization TfIdfNorm will first apply quantile then tf-idf normalization. TfIdfNorm normalizes based on the prevalence of each feature. UQNorm normalizes each observation by the upper quartile non-zero counts of that observation. MedNorm normalizes each observation by the median non-zero counts of that observation. TotalNorm normalizes each cell by the total count. TotalMedNorm normalized first each observation by total count then by median of non-zero counts across features. (LogCPMNorm DOUBLE) normalizes by logB(CPM + 1) where B is DOUBLE. QuantileNorm normalizes by quantile normalization, ignores zeros, may be slow. NoneNorm does not normalize. Default is TfIdfNorm for clustering and NoneNorm for differential (which instead uses the recommended edgeR single cell preprocessing including normalization and filtering, any normalization provided here will result in edgeR preprocessing on top). Older versions had BothNorm which has been replaced with --normalization TotalMedNorm --normalization TfIdfNorm. This change will also affect other analyses, as TfIdfNorm will now be in non-cluster-related entry points if specified, so --normalization UQNorm from < v2.0.0.0 is now --normalization UQNorm --normalization TfIdfNorm."
                   , matrixTranspose :: Bool <?> "Whether to transpose the matrix before all processing (observations become features and vice-versa). Will be affected by other options (check your filtering thresholds, normalizations, etc!)"
                   , pca :: Maybe Int <?> "([Nothing] | INT) Not recommended, as it makes cosine similarity less meaningful (therefore less accurate -- instead, consider making your own similarity matrix and using cluster-tree, our sister algorithm, to cluster the matrix and plot with birch-beer). The number of dimensions to keep for PCA dimensionality reduction before clustering. Default is no PCA at all in order to keep all information. Should use with --shift-positive to ensure no negative values (as --pca will center and scale). Consider changing the modularity cutoff to a lower value (such as --min-modularity -0.5)."
                   , lsa :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for LSA dimensionality reduction. Uses TD-IDF followed by SVD before clustering, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
                   , svd :: Maybe Int <?> "([Nothing] | INT) The number of dimensions to keep for SVD dimensionality reduction. Will center and scale, same warnings as --pca apply, including the use of --shift-positive with possible --min-modularity -0.5."
                   , dropDimension :: Bool <?> "Instead of keeping dimensions for --pca, --lsa, or --svd, drop the number of dimensions (i.e. --lsa 1 drops the first projection which may have a strong batch effect)."
                   , filterThresholds :: Maybe String <?> "([Nothing] | (DOUBLE, DOUBLE)) The minimum filter thresholds for (MINCELL, MINFEATURE) when filtering cells and features by low read counts. Default changed to Nothing due to additional supported assays. To use the original filter thresholds, use --filter-thresholds \"(250, 1)\"."
                   , shiftPositive :: Bool <?> "Shift features to positive values. Positive values are shifted to allow modularity to work correctly."
                   , matOutput :: String <?> "([out_matrix] | FOLDER | FILE.csv) Output the filtered and normalized (not including TfIdfNorm) matrix in this folder in matrix market format or, if a csv file is specified, a dense csv format. Like input, features are rows." }
    deriving (Generic)

modifiers :: Modifiers
modifiers = lispCaseModifiers { shortNameModifier = short }
  where
    short "allNodes"              = Nothing
    short "aggregate"             = Nothing
    short "atac"                  = Nothing
    short "binarize"              = Nothing
    short "bedgraph"              = Nothing
    short "blacklistRegionsFile"  = Nothing
    short "customRegion"          = Nothing
    short "clumpinessMethod"      = Just 'u'
    short "clusterNormalization"  = Just 'C'
    short "customCut"             = Nothing
    short "customLabel"           = Just 'Z'
    short "dendrogramOutput"      = Just 'U'
    short "diffFile"              = Nothing
    short "drawCollection"        = Just 'E'
    short "drawColors"            = Just 'R'
    short "drawDendrogram"        = Just 'D'
    short "drawDiscretize"        = Nothing
    short "drawFont"              = Nothing
    short "drawLeaf"              = Just 'L'
    short "drawLegendAllLabels"   = Just 'J'
    short "drawLegendSep"         = Just 'Q'
    short "drawMark"              = Just 'K'
    short "drawMaxLeafNodeSize"   = Nothing
    short "drawMaxNodeSize"       = Just 'A'
    short "drawNoScaleNodes"      = Just 'W'
    short "drawNodeNumber"        = Just 'N'
    short "drawPalette"           = Just 'Y'
    short "drawScaleSaturation"   = Just 'V'
    short "drawBarBounds"         = Nothing
    short "dropDimension"         = Nothing
    short "eigenGroup"            = Just 'B'
    short "elbowCutoff"           = Nothing
    short "featureColumn"         = Nothing
    short "filterThresholds"      = Just 'H'
    short "fragmentsOutput"       = Nothing
    short "fragmentsPath"         = Just 'f'
    short "genome"                = Nothing
    short "genomecovCommand"      = Nothing
    short "labels"                = Nothing
    short "labelsOutput"          = Nothing
    short "lsa"                   = Nothing
    short "matOutput"             = Nothing
    short "matrixOutput"          = Nothing
    short "matrixTranspose"       = Just 'T'
    short "maxDistance"           = Just 't'
    short "maxProportion"         = Just 'X'
    short "maxStep"               = Just 'S'
    short "minDistance"           = Nothing
    short "minDistanceSearch"     = Nothing
    short "minModularity"         = Nothing
    short "minSize"               = Just 'M'
    short "motifCommand"          = Nothing
    short "motifGenome"           = Nothing
    short "noBinarize"            = Nothing
    short "noEdger"               = Nothing
    short "normalization"         = Just 'z'
    short "numEigen"              = Just 'G'
    short "numRuns"               = Nothing
    short "order"                 = Just 'O'
    short "pathsPalette"          = Nothing
    short "pca"                   = Nothing
    short "peakNode"              = Nothing
    short "peakNodeLabels"        = Nothing
    short "plotOutput"            = Nothing
    short "plotSeparateNodes"     = Nothing
    short "plotSeparateLabels"    = Nothing
    short "plotViolin"            = Nothing
    short "priors"                = Just 'P'
    short "projectionFile"        = Just 'j'
    short "rootCut"               = Nothing
    short "seed"                  = Nothing
    short "shallowStart"          = Nothing
    short "shiftPositive"         = Nothing
    short "singleReferenceMatrix" = Nothing
    short "subsampleGroups"       = Nothing
    short "svd"                   = Nothing
    short "noUpdateTreeRows"      = Nothing
    short x                       = firstLetter x

instance ParseRecord Options where
    parseRecord = parseRecordWithModifiers modifiers