```
-----------------------------------------------------------------------------
-- |
-- Module      :  Data.List.LCS.HuntSzymanski
-- Copyright   :  (c) Ian Lynagh 2005
-- License     :  BSD or GPL v2
--
-- Maintainer  :  igloo@earth.li
-- Stability   :  provisional
-- Portability :  non-portable (uses STUArray)
--
-- This is an implementation of the Hunt-Szymanski LCS algorithm.
-- Derived from the description in \"String searching algorithms\" by
-- Graham A Stephen, ISBN 981021829X.
-----------------------------------------------------------------------------

module Data.List.LCS.HuntSzymanski (
-- * Algorithm
-- \$algorithm

-- * LCS
lcs
) where

import System.Environment (getArgs)
import Data.Array (listArray, (!))
import Data.Array.MArray (MArray, newArray, newArray_)
import Data.Array.ST (STArray, STUArray)
import Data.List (groupBy, sort)

{- \$algorithm
We take two sequences, @xs@ and @ys@, of length @\#xs@ and @\#ys@.

First we make an array

> matchlist[i=0..(#xs-1)]

such that

> (matchlist[i] = js) => ((j `elem` js) <=> (xs !! i == ys !! j))
>                     && sort js == reverse js

i.e. @matchlist[i]@ is the indices of elements of @ys@ equal to the
ith element of @xs@, in descending order.

Let @\#xys@ be the minimum of @\#xs@ and @\#ys@. Trivially this is the maximum
possible length of the LCS of @xs@ and @ys@. Then we can imagine an array

> k[i=0..#xs][l=0..#xys]

such that @k[i][l] = j@ where @j@ is the smallest value such that the
LCS of @xs[0..i]@ and @ys[0..j]@ has length @l@. We use @\#ys@ to
mean there is no such @j@.

We will not need to whole array at once, though. Instead we use an array

> kk[l=0..#xys]

representing a row of @kk@ for a particular @i@. Initially it is for
@i = -1@, so @kk[0] = -1@ and @kk[l] = \#ys@ otherwise. As the algorithm
progresses we will increase @i@ by one at the outer level and compute
the replacement values for @k@'s elements.

But we want more than just the length of the LCS, we also want the LCS
itself. Another array

> revres[l=0..#xys]

stores the list of @xs@ indices an LCS of length @l@, if one is known,
at @revres[l]@.

Now, suppose @kk@ contains @k[i-1]@. We consider each @j@ in @matchlist[i]@
in turn. We find the @l@ such that @k[l-1] < j <= k[l]@. If @j < k[l]@ then
we updated @k[l]@ to be @j@ and set @revres[l]@ to be @i:revres[l-1]@.

Finding @l@ is basically binary search, but there are some tricks we can
do. First, as the @j@s are decreasing the last @l@ we had for this @i@ is
an upper bound on this @l@. Second, we use another array

> lastl[j=0..#ys-1]

to store the @l@ we got last time for this @j@, initially all @1@. As the
values in @kk[j]@ monotonically decrease this is a lower bound for @l@.
We also test to see whether this old @l@ is still @l@ before we start the
binary search.
-}

-- |The 'lcs' function takes two lists and returns a list with a longest
-- common subsequence of the two.
lcs :: Ord a => [a] -> [a] -> [a]
-- Start off by returning the common prefix
lcs [] _ = []
lcs _ [] = []
lcs (c1:c1s) (c2:c2s)
| c1 == c2 = c1 : lcs c1s c2s
-- Then reverse everything, get the backwards LCS and reverse it
lcs s1 s2 = lcs_tail [] (reverse s1) (reverse s2)

-- To get the backwards LCS, we again start off by returning the common
-- prefix (or suffix, however you want to think of it  :-)  )
lcs_tail :: Ord a => [a] -> [a] -> [a] -> [a]
lcs_tail acc (c1:c1s) (c2:c2s)
| c1 == c2 = lcs_tail (c1:acc) c1s c2s
lcs_tail acc [] _ = acc
lcs_tail acc _ [] = acc
-- Then we begin the real algorithm
lcs_tail acc s1 s2 = runST (lcs' acc s1 s2)

lcs' :: Ord a => [a] -> [a] -> [a] -> ST s [a]
lcs' acc xs ys =
do let max_xs = length xs
max_ys = length ys
minmax = max_xs `min` max_ys
-- Initialise all the arrays
matchlist <- newArray_ (0, max_xs - 1)
mk_matchlist matchlist xs ys
kk <- newArray (0, minmax) max_ys
unsafeWrite kk 0 (-1)
lastl <- newArray (0, max_ys - 1) 1
revres <- newArray_ (0, minmax)
unsafeWrite revres 0 []
-- Pass the buck to lcs'' to finish the job off
is <- lcs'' matchlist lastl kk revres max_xs max_ys minmax
-- Convert the list of i indices into the result sequence
let axs = listArray (0, max_xs - 1) xs
return \$ map (axs !) is ++ acc

eqFst :: Eq a => (a, b) -> (a, b) -> Bool
eqFst (x, _) (y, _) = x == y

-- mk_matchlist fills the matchlist array such that if
-- xs !! i == ys !! j then (j+1) `elem` matchlist ! i
-- and matchlist ! i is decreasing for all i
mk_matchlist :: Ord a => STArray s Int [Int] -> [a] -> [a] -> ST s ()
mk_matchlist matchlist xs ys =
do let -- xs' is a list of (string, ids with that string in xs)
xs' = map (\sns -> (fst (head sns), map snd sns))
\$ groupBy eqFst \$ sort \$ zip xs [0..]
-- ys' is similar, only the ids are reversed
ys' = map (\sns -> (fst (head sns), reverse \$ map snd sns))
\$ groupBy eqFst \$ sort \$ zip ys [0..]
-- add_to_matchlist does all the hardwork
= case compare sx sy of
-- If we have the same string in xs'' and ys'' then all
-- the indices in xs'' must map to the indices in ys''
EQ -> do sequence_ [ unsafeWrite matchlist i idsy
| i <- idsx ]
-- If the string in xs'' is smaller then there are no
-- corresponding indices in ys so we assign all the xs''
-- indices the empty list
LT -> do sequence_ [ unsafeWrite matchlist i []
| i <- idsx ]
-- Otherwise the string appears in ys only, so we ignore it
GT -> do add_to_matchlist all_xs ys''
-- If we run out of ys'' altogether then just go through putting
-- in [] for the list of indices of each index remaining in xs''
= do sequence_ [ unsafeWrite matchlist i [] | i <- idsx ]
-- When we run out of xs'' we are done
add_to_matchlist [] _ = return ()
-- Finally, actually call add_to_matchlist to populate matchlist

lcs'' :: STArray s Int [Int] -- matchlist
-> STUArray s Int Int -- lastl
-> STUArray s Int Int -- kk
-> STArray s Int [Int] -- revres
-> Int -> Int -> Int -> ST s [Int]
lcs'' matchlist lastl kk revres max_xs max_ys minmax =
do let -- Out the outermost level we loop over the indices i of xs
loop_i = sequence_ [ loop_j i | i <- [0..max_xs - 1] ]
-- For each i we loop over the matching indices j of elements of ys
loop_j i = do js <- unsafeRead matchlist i
with_js i js minmax
-- Deal with this i and j
with_js i (j:js) max_bound =
do x0 <- unsafeRead lastl j
l <- find_l j x0 max_bound
unsafeWrite lastl j l
when (j < vl) \$ do
unsafeWrite kk l j
rs <- unsafeRead revres (l - 1)
unsafeWrite revres l (i:rs)
with_js i js l
with_js _ [] _ = return ()
-- find_l returns the l such that kk ! (l-1) < j <= kk ! l
find_l j x0 z0
= let f x z
| x + 1 == z = return z
| otherwise  = let y = (x + z) `div` 2
in do vy <- unsafeRead kk y
if vy < j
then f y z
else f x y
in j `seq` do q1 <- unsafeRead kk x0
if j <= q1
then return x0
else f x0 z0
-- Do the hard work
loop_i
-- Find where the result starts
succ_l <- find_l max_ys 1 (minmax + 1)
-- Get the result