{-# LANGUAGE RecordPuns, ParallelListComp, PatternGuards, PatternSignatures #-}
module Text.PageIO.Infer where
import Text.PageIO.Types
import Data.Char (isAlphaNum, toUpper)
import Data.Maybe (catMaybes)
import Data.Monoid
import Control.Applicative
import Data.Array (elems, (!))
import Text.Regex.TDFA
import Text.Regex.Base.RegexLike
import qualified Data.ByteString.Char8 as S
import qualified Text.PageIO.LabelMap as LM
import Debug.Trace

inferSheet :: FilePath -> [Page] -> Sheet
inferSheet fn pages = MkSheet
    { sheetName = name
    , sheetBox  = MkBox
        { boxTop    = 1
        , boxLeft   = 1
        , boxBottom = rows
        , boxRight  = cols
        }
    , sheetPatterns = mempty
    , sheetFields   = LM.fromList
        ( mkField "b_id" 1 1 4 1
        : case pages of
            []      -> []
            (p:_)   -> inferPageNameField p : inferPageDateFields p
        )
    , sheetFrames   = []
    , sheetOrderBy  = [toLabel "b_id"]
    , sheetGroupBy  = [toLabel "b_id"]
    , sheetUseBlockSortPriority = True
    }
    where
    lns     = pageLines <$> pages   
    rows    = maximum $ length <$> lns
    cols    = maximum $ S.length <$> concat lns
    name    = toLabel $ toUpper <$> takeWhile isAlphaNum fn

mkField :: String -> Col -> Row -> Col -> Row -> (Label, Field)
mkField lbl l t r b = (,) (toLabel lbl) MkField
    { fieldBox        = MkBox
        { boxLeft   = l
        , boxTop    = t
        , boxRight  = r
        , boxBottom = b
        }
    , fieldVariable   = Nothing
    , fieldKeepSpaces = False
    , fieldFormat     = FGeneral
    }

namePattern :: Value
namePattern = S.concat
    [ S.pack "(([^ ]+ {0,2})*[^ ]*("
    , keywords
    , S.pack ")[^< ]*)"
    ]
    where
    keywords = S.intercalate (S.pack "|") $ S.pack <$>
        [ "\xB3\xF8"    -- 報
        , "\xAA\xED"    -- 表
        , "\xB1\x62"    -- 帳
        , "\xC3\xAF"    -- 簿
        , "\xB3\xE6"    -- 單
        , "\xA5\x55"    -- 冊
        , "\xB2\xD3"    -- 細
        , "\xBF\xFD"    -- 錄
        , "\xB8\xEA"    -- 資
        , "\xAE\xC6"    -- 料
        , "\xAE\xD1"    -- 書
        , "\xC0\xC9"    -- 檔
        , "\xC1\x60"    -- 總
        , "\xAD\x70"    -- 計
        ]

datePattern :: Value
datePattern = S.pack $ concat
    [ "( {0,2}[0-9]{2,4} {0,2})"
    , "([^ 0-9]{1,4}|  )"
    , "( {0,3}[0-9]{1,2}) {0,2}"
    , "(([^ 0-9]{1,4}|  )"
    , "( {0,3}[0-9]{1,2} {0,1}))?"
    ]

data DateMatch = MkDateMatch
    { matchLine     :: Int
    , matchIsROC    :: Bool 
    , matchYear     :: (Int, Int)
    , matchMonth    :: (Int, Int)
    , matchDay      :: Maybe (Int, Int)
    }
    deriving (Show, Eq, Ord)

inferPageNameField :: Page -> (Label, Field)
inferPageNameField page = case concatMap tryMatchName (take 6 lns) of
    []      -> mkField "r_name" 1 1 0 0 -- error "No r_name found!"
    (f:_)   -> f
    where
    lns = [1..] `zip` pageLines page

tryMatchName :: (Int, Value) -> [(Label, Field)]
tryMatchName (lineNum, content) =
    [ mkField "r_name" (start+delta+1) lineNum (start+len) lineNum
    | (text, (start, len)) <- (! (0 :: Int)) <$> content =~ namePattern
    , let delta = tryDelta text
    ]
    where
    tryDelta text = case text `matchSubstring` MkMatch commercialBank of
        Nothing     -> 0
        Just off    -> off + S.length commercialBank
    commercialBank = S.pack "\xB0\xD3\xB7\x7E\xBB\xC8\xA6\xE6\xA1\x40"

-- Algorithm: Find the first line with three consecutive integers
--            that looks sufficiently like a date.  We do so by
--            finding each 3-grams of digits close enough together
--            (distance 7 or less), then inspect the actual values,
--            then finally capture the spaces around them too.
--            If the year is >= 1900 then it's "year" else it's "year_roc".
inferPageDateFields :: Page -> [(Label, Field)]
inferPageDateFields page = case concatMap tryMatchDate lns of
    []      -> []
    (m:_)   -> dateMatchToFields m
    where
    lns = [1..] `zip` pageLines page

tryMatchDate :: (Int, Value) -> [DateMatch]
tryMatchDate (lineNum, content) =
    [ dateMatch{ matchLine = lineNum }
    | Just dateMatch <- validateMatch <$> reverse (content =~ datePattern)
    ]

validateMatch :: MatchText Value -> Maybe DateMatch
validateMatch match
    | month >= 1, month <= 12, day >= 1, day <= 31
    = Just MkDateMatch
        { matchLine     = undefined
        , matchIsROC    = year < 1900
        , matchYear     = yearPos
        , matchMonth    = monthPos
        , matchDay      = if S.null dayText then Nothing else Just dayPos
        }
    | otherwise
    = Nothing
    where
    [ _, (yearText,  yearPos)
       , _
       , (monthText, monthPos)
       , _
       , _
       , (dayText,   dayPos) ] = elems match
    [year, month, day] = valToInt <$> [yearText, monthText, dateText']
    dateText' | S.null dayText  = S.singleton '1'
              | otherwise       = dayText

dateMatchToFields :: DateMatch -> [(Label, Field)]
dateMatchToFields MkDateMatch{ matchLine, matchIsROC, matchYear, matchMonth, matchDay } =
    ( dateField yearLabel matchYear
    : dateField "month" matchMonth
    : maybe [] ((:[]) . dateField "day") matchDay
    )
    where
    dateField lbl (start, len) = mkField lbl (start+1) matchLine (start+len) matchLine
    yearLabel = if matchIsROC then "year_roc" else "year"