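-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | reader for the 2bit file format -- -- A library and command line tool for working with 2bit files. 2bit is a -- compact file format for genomes introduced by Jim Kent with his BLAT -- suite in the early 2000s. @package twobitreader @version 1.0 -- | .2bit format (from the UCSC Genome Browser FAQ) -- -- A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a -- compact randomly-accessible format. The file contains masking -- information as well as the DNA itself. -- -- The file begins with a 16-byte header containing the following fields: -- -- * signature - the number 0x1A412743 in the architecture of the machine -- that created the file -- * version - zero for now; readers should abort if they see a version -- number higher than 0 -- * sequenceCount - the number of sequences in the file -- * reserved - always zero for now -- -- All fields are 32 bits unless noted. If the signature value is not as -- given, the reader program should byte-swap the signature and check if -- the swapped version matches. If so, all multiple-byte entities in the -- file will have to be byte-swapped. This enables these binary files to -- be used unchanged on different architectures. -- -- The header is followed by a file index, which contains one entry for -- each sequence. Each index entry contains three fields: -- -- * nameSize - a byte containing the length of the name field -- * name - the sequence name itself, of variable length depending on -- nameSize -- * offset - the 32-bit offset of the sequence data relative to the start -- of the file -- -- The index is followed by the sequence records, which contain nine -- fields: -- -- * dnaSize - number of bases of DNA in the sequence -- * nBlockCount - the number of blocks of Ns in the sequence -- * nBlockStarts - an array of length nBlockCount of 32-bit integers -- giving the (0-based) starting position of each block of Ns -- * nBlockSizes - an array of length nBlockCount of 32-bit integers -- giving the length of each block of Ns -- * maskBlockCount - the number of masked (lower-case) blocks -- * maskBlockStarts - an array of length maskBlockCount of 32-bit -- integers giving the (0-based) starting position of each masked block -- * maskBlockSizes - an array of length maskBlockCount of 32-bit -- integers giving the length of each masked block -- * reserved - always zero for now -- * packedDna - the DNA packed to two bits per base (T = 00, C = 01, -- A = 10, G = 11), first base in the most significant two bits of a byte -- -- In this format, it is neither possible nor necessary to store Ns in -- the main sequence, and one wouldn't expect them to take up space -- there. However, they do; hard masked sequence is typically stored as -- many Ts. The sensible way to treat these is probably to just say there -- are two kinds of implied annotation (repeats and large gaps for a -- typical genome), which can be interpreted in whatever way fits. 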
module Bio.TwoBit data TwoBitFile TBF :: {-# UNPACK #-} !ForeignPtr Word8 -> {-# UNPACK #-} !Int -> {-# UNPACK #-} !ByteString -> {-# UNPACK #-} !Array TwoBitChromosome -> !HashMap ByteString TwoBitChromosome -> TwoBitFile [tbf_raw] :: TwoBitFile -> {-# UNPACK #-} !ForeignPtr Word8 [tbf_size] :: TwoBitFile -> {-# UNPACK #-} !Int [tbf_path] :: TwoBitFile -> {-# UNPACK #-} !ByteString [tbf_chroms] :: TwoBitFile -> {-# UNPACK #-} !Array TwoBitChromosome [tbf_chrmap] :: TwoBitFile -> !HashMap ByteString TwoBitChromosome -- | Brings a 2bit file into memory. The file is mmap'ed, so it will not -- work on streams that are not actual files. It's also unsafe if the -- file is concurrently modified in any way. openTwoBit :: FilePath -> IO TwoBitFile data TwoBitChromosome TBC :: {-# UNPACK #-} !ForeignPtr Word8 -> {-# UNPACK #-} !ByteString -> {-# UNPACK #-} !Int -> {-# UNPACK #-} !Word32 -> {-# UNPACK #-} !Word32 -> (Int -> TwoBitSequence' Unidrectional) -> (Int -> TwoBitSequence' Bidirectional) -> TwoBitChromosome [tbc_raw] :: TwoBitChromosome -> {-# UNPACK #-} !ForeignPtr Word8 [tbc_name] :: TwoBitChromosome -> {-# UNPACK #-} !ByteString [tbc_index] :: TwoBitChromosome -> {-# UNPACK #-} !Int [tbc_dna_offset] :: TwoBitChromosome -> {-# UNPACK #-} !Word32 [tbc_dna_size] :: TwoBitChromosome -> {-# UNPACK #-} !Word32 -- | Lazily generated sequence in forward direction; the argument is the -- offset of the first base. [tbc_fwd_seq] :: TwoBitChromosome -> Int -> TwoBitSequence' Unidrectional -- | Lazily generated sequence in reverse direction; the argument is the -- offset of the first base to the right of the beginning. (The first -- base generated is the complement of the base found at (offset-1).) [tbc_rev_seq] :: TwoBitChromosome -> Int -> TwoBitSequence' Bidirectional tbf_chrnames :: TwoBitFile -> [ByteString] -- | Finds a named scaffold in the reference. 
If it doesn't find the exact -- name, it will try to compensate for the crazy naming differences -- between NCBI and UCSC. This doesn't work in general, but is good -- enough in the common case. In particular, "1" maps to "chr1" and back, -- "GL000192.1" to "chr1_gl000192_random" and back, and "chrM" to -- MT and back. findChrom :: ByteString -> TwoBitFile -> Maybe TwoBitChromosome -- | This is a (piece of a) reference sequence. It consists of stretches -- with uniform masking. -- -- The offset is stored as a Word. This is done because on a 32 -- bit platform, every bit counts. This limits the genome to -- approximately four gigabases, which would be a file of about one -- gigabyte. That's just about enough to work with the human genome. On a -- 64 bit platform, the file format itself imposes a limit of four -- gigabytes, or about 16 gigabases in total. -- -- If length is zero, the piece is empty and the mask, pointer, and -- offset fields may not be valid. If length is positive, ptr+offset -- points at the first base of the piece. If length is negative, -- ptr+offset points just past the end of the piece, ptr+offset+length -- points to the first base of the piece, and the sequence is meant to be -- reverse complemented. -- -- In a TwoBitSequence, length must not be negative. In a -- TwoBitSequence' Bidirectional, length can be positive or -- negative. data TwoBitSequence' dir SomeSeq :: {-# UNPACK #-} !Masking -> {-# UNPACK #-} !ForeignPtr Word8 -> {-# UNPACK #-} !Word -> {-# UNPACK #-} !Int -> TwoBitSequence' dir -> TwoBitSequence' dir RefEnd :: TwoBitSequence' dir type TwoBitSequence = TwoBitSequence' Unidrectional data Unidrectional data Bidirectional -- | Unpacks a reference sequence into a (very long) list of bytes. Each -- byte contains the nucleotide in bits 0 and 1 with values 0..3 -- corresponding to TCAG, and the soft and hard mask bits in bits -- 2 and 3, respectively. 
unpackRSRaw :: TwoBitSequence' dir -> [Word8] -- | Unpacks a reference sequence into a (very long) list of ASCII -- characters. Hard masked nucleotides become the letter N, -- others become TCAG. unpackRS :: TwoBitSequence' dir -> [Word8] -- | Unpacks a reference sequence into a list of ASCII characters, -- interpreting masking in the customary way. Specifically, hard masking -- produces Ns, soft masking produces lower case letters, and dual -- masking produces lower case Ns. unpackRSMasked :: TwoBitSequence' dir -> [Word8] -- | 2bit supports two kinds of masking, typically rendered as lowercase -- letters (MaskSoft) and Ns (MaskHard). They can -- overlap (MaskBoth), and even the hard masking has underlying -- sequence (which is normally ignored). newtype Masking Masking :: Word8 -> Masking isSoftMasked :: Masking -> Bool isHardMasked :: Masking -> Bool noneMasked :: Masking softMasked :: Masking hardMasked :: Masking bothMasked :: Masking instance GHC.Show.Show Bio.TwoBit.TwoBitError instance GHC.Classes.Ord Bio.TwoBit.Block instance GHC.Classes.Eq Bio.TwoBit.Block instance GHC.Show.Show Bio.TwoBit.Block instance GHC.Classes.Ord Bio.TwoBit.Masking instance GHC.Classes.Eq Bio.TwoBit.Masking instance GHC.Show.Show (Bio.TwoBit.TwoBitSequence' dir) instance GHC.Show.Show Bio.TwoBit.Masking instance GHC.Read.Read Bio.TwoBit.Masking instance GHC.Base.Semigroup Bio.TwoBit.Masking instance GHC.Base.Monoid Bio.TwoBit.Masking instance GHC.Enum.Enum Bio.TwoBit.Masking instance GHC.Enum.Bounded Bio.TwoBit.Masking instance GHC.Exception.Type.Exception Bio.TwoBit.TwoBitError module Bio.TwoBit.Tool data EncodeProgress EncodeProgress :: !ShortByteString -> !Word32 -> !Word32 -> !Word32 -> !Int64 -> EncodeProgress -> EncodeProgress [ep_seqname] :: EncodeProgress -> !ShortByteString [ep_position] :: EncodeProgress -> !Word32 [ep_hardmasked] :: EncodeProgress -> !Word32 [ep_softmasked] :: EncodeProgress -> !Word32 [ep_enclength] :: EncodeProgress -> !Int64 [ep_tail] :: 
EncodeProgress -> EncodeProgress Encoded :: Builder -> EncodeProgress buildFasta :: Int -> [Word8] -> Builder faToTwoBit :: ByteString -> EncodeProgress formatCdna :: TwoBitFile -> Cdna -> Builder -- | Parses annotations in GFF format. We want to turn an annotation and a -- 2bit file into a FastA of the transcriptome (one sequence per -- annotated transcript), that looks like the stuff Lior Pachter feeds -- into Kallisto. Annotations come in two dialects of GFF, either GFF3 or -- GTF. We autodetect and understand both. parseAnno :: String -> ByteString -> [Either GffError Cdna] twoBitToFa :: Int -> TwoBitSequence' dir -> IO () -- | Extracts the reference from a VCF. This assumes the presence of at -- least one record per site. The VCF must be sorted by position. When -- writing out, we try to match the order of the contigs as listed in the -- header. Unlisted contigs follow at the end with their order preserved; -- contigs without data are not written at all. vcfToTwoBit :: [ByteString] -> EncodeProgress instance GHC.Show.Show Bio.TwoBit.Tool.Range instance GHC.Show.Show Bio.TwoBit.Tool.Cdna instance GHC.Show.Show Bio.TwoBit.Tool.GffErrorDetail instance GHC.Show.Show Bio.TwoBit.Tool.GffError instance GHC.Exception.Type.Exception Bio.TwoBit.Tool.GffError