-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Library and executables for working with SFF files -- -- The library contains the functionality for reading and writing SFF -- files (sequencing data from 454 and Ion Torrent). It duplicates code -- from (and is incompatible with) the bio library. @package biosff @version 0.2 -- | Read and write the SFF file format used by Roche/454 sequencing to -- store flowgram data. -- -- A flowgram is a series of values (intensities) representing -- homopolymer runs of A,G,C, and T in a fixed cycle, and usually -- displayed as a histogram. -- -- This file is based on information in the Roche FLX manual. Among other -- sources for information about the format, are The Staden Package, -- which contains an io_lib with a C routine for parsing this format. -- According to comments in the sources, the io_lib implementation is -- based on a file called getsff.c, which I've been unable to track down. -- Other software parsing SFFs are QIIME, sff_extract, and Celera's -- sffToCa. -- -- It is believed that all values are stored big endian. module Bio.Sequence.SFF -- | The data structure storing the contents of an SFF file (modulo the -- index) data SFF SFF :: !CommonHeader -> [ReadBlock] -> SFF -- | SFF has a 31-byte common header -- -- The format is open to having the index anywhere between reads, we -- should really keep count and check for each read. In practice, it -- seems to be placed after the reads. -- -- The following two fields are considered part of the header, but as -- they are static, they are not part of the data structure -- --
--      
--   magic   :: Word32   -- 0x2e736666, i.e. the string ".sff"
--   version :: Word32   -- 0x00000001
--   
data CommonHeader CommonHeader :: Int64 -> Int32 -> Int32 -> Int16 -> Int16 -> Word8 -> ByteString -> ByteString -> CommonHeader -- | Points to a text(?) section index_offset :: CommonHeader -> Int64 index_length :: CommonHeader -> Int32 num_reads :: CommonHeader -> Int32 key_length :: CommonHeader -> Int16 flow_length :: CommonHeader -> Int16 flowgram_fmt :: CommonHeader -> Word8 flow :: CommonHeader -> ByteString key :: CommonHeader -> ByteString -- | Each Read has a fixed read header, containing various information. data ReadHeader ReadHeader :: Int16 -> Int32 -> Int16 -> Int16 -> Int16 -> Int16 -> ByteString -> ReadHeader name_length :: ReadHeader -> Int16 num_bases :: ReadHeader -> Int32 clip_qual_left :: ReadHeader -> Int16 clip_qual_right :: ReadHeader -> Int16 clip_adapter_left :: ReadHeader -> Int16 clip_adapter_right :: ReadHeader -> Int16 read_name :: ReadHeader -> ByteString -- | This contains the actual flowgram for a single read. data ReadBlock ReadBlock :: !ReadHeader -> !ByteString -> !ByteString -> !SeqData -> !QualData -> ReadBlock read_header :: ReadBlock -> !ReadHeader flow_data :: ReadBlock -> !ByteString flow_index :: ReadBlock -> !ByteString bases :: ReadBlock -> !SeqData quality :: ReadBlock -> !QualData -- | Read an SFF file. readSFF :: FilePath -> IO SFF -- | Write an SFF to the specified file name writeSFF :: FilePath -> SFF -> IO () -- | Write an SFF to the specified file name, but go back and update -- the read count. Useful if you want to output a lazy stream of -- ReadBlocks. Returns the number of reads written. writeSFF' :: FilePath -> SFF -> IO Int -- | Read an SFF file, but be resilient against errors. recoverSFF :: FilePath -> IO SFF -- | Trim a read according to clipping information trim :: ReadBlock -> ReadBlock -- | Trim a read to specific sequence position, inclusive bounds. 
trimFromTo :: Integral i => i -> i -> ReadBlock -> ReadBlock -- | Convert a sequence position to the corresponding flow position baseToFlowPos :: Integral i => ReadBlock -> i -> Int -- | Convert a flow position to the corresponding sequence position flowToBasePos :: Integral i => ReadBlock -> i -> Int -- | Trim a ReadBlock limiting the number of flows. If writing to an -- SFF file, make sure you update the CommonHeader accordingly. -- See examples/Flx.hs for how to use this. trimFlows :: Integral i => i -> ReadBlock -> ReadBlock -- | Test serialization by outputting the header and first two reads in an -- SFF, and the same after a decode + encode cycle. test :: FilePath -> IO () -- | Convert a file by decoding it and re-encoding it. This will lose the -- index (which isn't really necessary) convert :: FilePath -> IO () -- | Helper function to access the flowgram flowgram :: ReadBlock -> [Flow] -- | Extract the sequence with masked bases in lower case masked_bases :: ReadBlock -> SeqData -- | Extract the index as absolute coordinates, not relative. cumulative_index :: ReadBlock -> [Int] -- | Pack a list of flows into the corresponding binary structure (the -- flow_data field) packFlows :: [Flow] -> ByteString -- | Unpack the flow_data field into a list of flow values unpackFlows :: ByteString -> [Flow] -- | The type of flowgram value type Flow = Int16 -- | A quality value is in the range 0..255. data Qual :: * type Index = Word8 -- | Sequence data are lazy bytestrings of ASCII characters. data SeqData :: * -- | Quality data are lazy bytestrings of Quals. data QualData :: * -- | Read names encode various information, as per this struct. 
data ReadName ReadName :: (Int, Int, Int) -> (Int, Int, Int) -> Int -> Int -> Int -> ReadName date :: ReadName -> (Int, Int, Int) time :: ReadName -> (Int, Int, Int) region :: ReadName -> Int x_loc :: ReadName -> Int y_loc :: ReadName -> Int decodeReadName :: ByteString -> Maybe ReadName encodeReadName :: ReadName -> ByteString -- | A ReadBlock can't be an instance of Binary directly, since it depends -- on information from the CommonHeader. putRB :: Int -> ReadBlock -> Put -- | Helper function for decoding a ReadBlock. getRB :: Int -> ReadHeader -> Get ReadBlock instance Binary PartialReadHeader instance Binary RSFF instance Show ReadBlock instance Binary ReadHeader instance Show ReadHeader instance Binary CommonHeader instance Show CommonHeader instance Binary SFF instance Show SFF instance Binary RBI instance BioSeqQual ReadBlock instance BioSeq ReadBlock