module Bio.Bam.Regions where import Bio.Bam.Header ( Refseq(..) ) import Bio.Prelude import qualified Data.IntMap.Strict as IM data Region = Region { refseq :: !Refseq, start :: !Int, end :: !Int } deriving (Eq, Ord, Show) -- | A subset of a genome. The idea is to map the reference sequence -- (represented by its number) to a 'Subseqeunce'. newtype Regions = Regions (IM.IntMap Subsequence) deriving Show -- | A mostly contiguous subset of a sequence, stored as a set of -- non-overlapping intervals in an 'IntMap' from start position to end -- position (half-open intervals, naturally). newtype Subsequence = Subsequence (IM.IntMap Int) deriving Show toList :: Regions -> [(Refseq, Subsequence)] toList (Regions m) = [ (Refseq $ fromIntegral k, v) | (k,v) <- IM.toList m ] fromList :: [Region] -> Regions fromList = foldl' (flip add) (Regions IM.empty) add :: Region -> Regions -> Regions add (Region (Refseq r) b e) (Regions m) = let single = Just . Subsequence $ IM.singleton b e in Regions $ IM.alter (maybe single (Just . addInt b e)) (fromIntegral r) m addInt :: Int -> Int -> Subsequence -> Subsequence addInt b e (Subsequence m0) = Subsequence $ merge_into b e m0 where merge_into x y m = case IM.lookupLT y m of Just (u,v) | x < u && y <= v -> merge_into x v $ IM.delete u m -- extend to the left | x < u -> merge_into x y $ IM.delete u m -- subsume | y <= v -> m -- subsumed | x <= v -> merge_into u y $ IM.delete u m -- extend to the right _ -> IM.insert x y m -- no overlap overlaps :: Int -> Int -> Subsequence -> Bool overlaps b e (Subsequence m) = case IM.lookupLT e m of Just (_,v) -> b < v Nothing -> False