-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | reader for the 2bit file format
--
-- A library and command line tool for working with 2bit files. 2bit is a
-- compact file format for genomes introduced by Jim Kent with his BLAT
-- suite in the early 2000s.
@package twobitreader
@version 1.0
-- | .2bit format (from the UCSC Genome Browser FAQ)
--
-- A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a
-- compact randomly-accessible format. The file contains masking
-- information as well as the DNA itself.
--
-- The file begins with a 16-byte header containing the following fields:
--
--
-- - signature - the number 0x1A412743 in the architecture of the
-- machine that created the file
-- - version - zero for now. Readers should abort if they see a version
-- number higher than 0
-- - sequenceCount - the number of sequences in the file
-- - reserved - always zero for now
--
--
-- All fields are 32 bits unless noted. If the signature value is not as
-- given, the reader program should byte-swap the signature and check if
-- the swapped version matches. If so, all multiple-byte entities in the
-- file will have to be byte-swapped. This enables these binary files to
-- be used unchanged on different architectures.
--
-- The header is followed by a file index, which contains one entry for
-- each sequence. Each index entry contains three fields:
--
--
-- - nameSize - a byte containing the length of the name field
-- - name - the sequence name itself (in ASCII-compatible byte string),
-- of variable length depending on nameSize
-- - offset - the 32-bit offset of the sequence data relative to the
-- start of the file, not aligned to any 4-byte padding boundary
--
--
-- The index is followed by the sequence records, which contain nine
-- fields:
--
--
-- - dnaSize - number of bases of DNA in the sequence
-- - nBlockCount - the number of blocks of Ns in the sequence
-- (representing unknown sequence)
-- - nBlockStarts - an array of length nBlockCount of 32 bit integers
-- indicating the (0-based) starting position of a block of Ns
-- - nBlockSizes - an array of length nBlockCount of 32 bit integers
-- indicating the length of a block of Ns
-- - maskBlockCount - the number of masked (lower-case) blocks
-- - maskBlockStarts - an array of length maskBlockCount of 32 bit
-- integers indicating the (0-based) starting position of a masked
-- block
-- - maskBlockSizes - an array of length maskBlockCount of 32 bit
-- integers indicating the length of a masked block
-- - reserved - always zero for now
-- - packedDna - the DNA packed to two bits per base, represented as
-- so: T - 00, C - 01, A - 10, G - 11. Within each byte, the first base
-- is in the most significant 2 bits; the last base is in the least
-- significant 2 bits. For example, the sequence TCAG is represented as
-- 00011011.
--
--
-- In this format, it is neither possible nor necessary to store Ns in
-- the main sequence, and one wouldn't expect them to take up space
-- there. However, they do; hard masked sequence is typically stored as
-- many Ts. The sensible way to treat these is probably to just say there
-- are two kinds of implied annotation (repeats and large gaps for a
-- typical genome), which can be interpreted in whatever way fits.
module Bio.TwoBit
data TwoBitFile
TBF :: {-# UNPACK #-} !ForeignPtr Word8 -> {-# UNPACK #-} !Int -> {-# UNPACK #-} !ByteString -> {-# UNPACK #-} !Array TwoBitChromosome -> !HashMap ByteString TwoBitChromosome -> TwoBitFile
[tbf_raw] :: TwoBitFile -> {-# UNPACK #-} !ForeignPtr Word8
[tbf_size] :: TwoBitFile -> {-# UNPACK #-} !Int
[tbf_path] :: TwoBitFile -> {-# UNPACK #-} !ByteString
[tbf_chroms] :: TwoBitFile -> {-# UNPACK #-} !Array TwoBitChromosome
[tbf_chrmap] :: TwoBitFile -> !HashMap ByteString TwoBitChromosome
-- | Brings a 2bit file into memory. The file is mmap'ed, so it will not
-- work on streams that are not actual files. It's also unsafe if the
-- file is concurrently modified in any way.
openTwoBit :: FilePath -> IO TwoBitFile
data TwoBitChromosome
TBC :: {-# UNPACK #-} !ForeignPtr Word8 -> {-# UNPACK #-} !ByteString -> {-# UNPACK #-} !Int -> {-# UNPACK #-} !Word32 -> {-# UNPACK #-} !Word32 -> (Int -> TwoBitSequence' Unidrectional) -> (Int -> TwoBitSequence' Bidirectional) -> TwoBitChromosome
[tbc_raw] :: TwoBitChromosome -> {-# UNPACK #-} !ForeignPtr Word8
[tbc_name] :: TwoBitChromosome -> {-# UNPACK #-} !ByteString
[tbc_index] :: TwoBitChromosome -> {-# UNPACK #-} !Int
[tbc_dna_offset] :: TwoBitChromosome -> {-# UNPACK #-} !Word32
[tbc_dna_size] :: TwoBitChromosome -> {-# UNPACK #-} !Word32
-- | Lazily generated sequence in forward direction; the argument is the
-- offset of the first base.
[tbc_fwd_seq] :: TwoBitChromosome -> Int -> TwoBitSequence' Unidrectional
-- | Lazily generated sequence in reverse direction; the argument is the
-- offset of the first base to the right of the beginning. (The first
-- base generated is the complement of the base found at offset-1.)
[tbc_rev_seq] :: TwoBitChromosome -> Int -> TwoBitSequence' Bidirectional
tbf_chrnames :: TwoBitFile -> [ByteString]
-- | Finds a named scaffold in the reference. If it doesn't find the exact
-- name, it will try to compensate for the crazy naming differences
-- between NCBI and UCSC. This doesn't work in general, but is good
-- enough in the common case. In particular, "1" maps to "chr1" and back,
-- "GL000192.1" to "chr1_gl000192_random" and back, and "chrM" to
-- "MT" and back.
findChrom :: ByteString -> TwoBitFile -> Maybe TwoBitChromosome
-- | This is a (piece of a) reference sequence. It consists of stretches
-- with uniform masking.
--
-- The offset is stored as a Word. This is done because on a 32
-- bit platform, every bit counts. This limits the genome to
-- approximately four gigabases, which would be a file of about one
-- gigabyte. That's just about enough to work with the human genome. On a
-- 64 bit platform, the file format itself imposes a limit of four
-- gigabytes, or about 16 gigabases in total.
--
-- If length is zero, the piece is empty and the mask, pointer, and
-- offset fields may not be valid. If length is positive, ptr+offset
-- points at the first base of the piece. If length is negative,
-- ptr+offset points just past the end of the piece, ptr+offset+length
-- points to the first base of the piece, and the sequence is meant to be
-- reverse complemented.
--
-- In a TwoBitSequence, length must not be negative. In a
-- TwoBitSequence' Bidirectional, length can be positive or
-- negative.
data TwoBitSequence' dir
SomeSeq :: {-# UNPACK #-} !Masking -> {-# UNPACK #-} !ForeignPtr Word8 -> {-# UNPACK #-} !Word -> {-# UNPACK #-} !Int -> TwoBitSequence' dir -> TwoBitSequence' dir
RefEnd :: TwoBitSequence' dir
type TwoBitSequence = TwoBitSequence' Unidrectional
data Unidrectional
data Bidirectional
-- | Unpacks a reference sequence into a (very long) list of bytes. Each
-- byte contains the nucleotide in bits 0 and 1 with values 0..3
-- corresponding to TCAG, and the soft and hard mask bits in bits
-- 2 and 3, respectively.
unpackRSRaw :: TwoBitSequence' dir -> [Word8]
-- | Unpacks a reference sequence into a (very long) list of ASCII
-- characters. Hard masked nucleotides become the letter N,
-- others become TCAG.
unpackRS :: TwoBitSequence' dir -> [Word8]
-- | Unpacks a reference sequence into a list of ASCII characters,
-- interpreting masking in the customary way. Specifically, hard masking
-- produces Ns, soft masking produces lower case letters, and dual
-- masking produces lower case Ns.
unpackRSMasked :: TwoBitSequence' dir -> [Word8]
-- | 2bit supports two kinds of masking, typically rendered as lowercase
-- letters (softMasked) and Ns (hardMasked). They can
-- overlap (bothMasked), and even the hard masking has underlying
-- sequence (which is normally ignored).
newtype Masking
Masking :: Word8 -> Masking
isSoftMasked :: Masking -> Bool
isHardMasked :: Masking -> Bool
noneMasked :: Masking
softMasked :: Masking
hardMasked :: Masking
bothMasked :: Masking
instance GHC.Show.Show Bio.TwoBit.TwoBitError
instance GHC.Classes.Ord Bio.TwoBit.Block
instance GHC.Classes.Eq Bio.TwoBit.Block
instance GHC.Show.Show Bio.TwoBit.Block
instance GHC.Classes.Ord Bio.TwoBit.Masking
instance GHC.Classes.Eq Bio.TwoBit.Masking
instance GHC.Show.Show (Bio.TwoBit.TwoBitSequence' dir)
instance GHC.Show.Show Bio.TwoBit.Masking
instance GHC.Read.Read Bio.TwoBit.Masking
instance GHC.Base.Semigroup Bio.TwoBit.Masking
instance GHC.Base.Monoid Bio.TwoBit.Masking
instance GHC.Enum.Enum Bio.TwoBit.Masking
instance GHC.Enum.Bounded Bio.TwoBit.Masking
instance GHC.Exception.Type.Exception Bio.TwoBit.TwoBitError
module Bio.TwoBit.Tool
data EncodeProgress
EncodeProgress :: !ShortByteString -> !Word32 -> !Word32 -> !Word32 -> !Int64 -> EncodeProgress -> EncodeProgress
[ep_seqname] :: EncodeProgress -> !ShortByteString
[ep_position] :: EncodeProgress -> !Word32
[ep_hardmasked] :: EncodeProgress -> !Word32
[ep_softmasked] :: EncodeProgress -> !Word32
[ep_enclength] :: EncodeProgress -> !Int64
[ep_tail] :: EncodeProgress -> EncodeProgress
Encoded :: Builder -> EncodeProgress
buildFasta :: Int -> [Word8] -> Builder
faToTwoBit :: ByteString -> EncodeProgress
formatCdna :: TwoBitFile -> Cdna -> Builder
-- | Parses annotations in GFF format. We want to turn an annotation and a
-- 2bit file into a FastA of the transcriptome (one sequence per
-- annotated transcript), that looks like the stuff Lior Pachter feeds
-- into Kallisto. Annotations come in two dialects of GFF, either GFF3 or
-- GTF. We autodetect and understand both.
parseAnno :: String -> ByteString -> [Either GffError Cdna]
twoBitToFa :: Int -> TwoBitSequence' dir -> IO ()
-- | Extracts the reference from a VCF. This assumes the presence of at
-- least one record per site. The VCF must be sorted by position. When
-- writing out, we try to match the order of the contigs as listed in the
-- header. Unlisted contigs follow at the end with their order preserved;
-- contigs without data are not written at all.
vcfToTwoBit :: [ByteString] -> EncodeProgress
instance GHC.Show.Show Bio.TwoBit.Tool.Range
instance GHC.Show.Show Bio.TwoBit.Tool.Cdna
instance GHC.Show.Show Bio.TwoBit.Tool.GffErrorDetail
instance GHC.Show.Show Bio.TwoBit.Tool.GffError
instance GHC.Exception.Type.Exception Bio.TwoBit.Tool.GffError