module Text.PageIO.Infer where
import Text.PageIO.Types
import Data.Char (isAlphaNum, toUpper)
import Data.Maybe (catMaybes)
import Data.Monoid
import Control.Applicative
import Data.Array (elems, (!))
import Text.Regex.TDFA
import Text.Regex.Base.RegexLike
import qualified Data.ByteString.Char8 as S
import qualified Text.PageIO.LabelMap as LM
import Debug.Trace
inferSheet :: FilePath -> [Page] -> Sheet
inferSheet fn pages = MkSheet
{ sheetName = name
, sheetBox = MkBox
{ boxTop = 1
, boxLeft = 1
, boxBottom = rows
, boxRight = cols
}
, sheetPatterns = mempty
, sheetFields = LM.fromList
( mkField "b_id" 1 1 4 1
: case pages of
[] -> []
(p:_) -> inferPageNameField p : inferPageDateFields p
)
, sheetFrames = []
, sheetOrderBy = [toLabel "b_id"]
, sheetGroupBy = [toLabel "b_id"]
, sheetUseBlockSortPriority = True
}
where
lns = pageLines <$> pages
rows = maximum $ length <$> lns
cols = maximum $ S.length <$> concat lns
name = toLabel $ toUpper <$> takeWhile isAlphaNum fn
mkField :: String -> Col -> Row -> Col -> Row -> (Label, Field)
mkField lbl l t r b = (,) (toLabel lbl) MkField
{ fieldBox = MkBox
{ boxLeft = l
, boxTop = t
, boxRight = r
, boxBottom = b
}
, fieldVariable = Nothing
, fieldKeepSpaces = False
, fieldFormat = FGeneral
}
namePattern :: Value
namePattern = S.concat
[ S.pack "(([^ ]+ {0,2})*[^ ]*("
, keywords
, S.pack ")[^< ]*)"
]
where
keywords = S.intercalate (S.pack "|") $ S.pack <$>
[ "\xB3\xF8"
, "\xAA\xED"
, "\xB1\x62"
, "\xC3\xAF"
, "\xB3\xE6"
, "\xA5\x55"
, "\xB2\xD3"
, "\xBF\xFD"
, "\xB8\xEA"
, "\xAE\xC6"
, "\xAE\xD1"
, "\xC0\xC9"
, "\xC1\x60"
, "\xAD\x70"
]
datePattern :: Value
datePattern = S.pack $ concat
[ "( {0,2}[0-9]{2,4} {0,2})"
, "([^ 0-9]{1,4}| )"
, "( {0,3}[0-9]{1,2}) {0,2}"
, "(([^ 0-9]{1,4}| )"
, "( {0,3}[0-9]{1,2} {0,1}))?"
]
data DateMatch = MkDateMatch
{ matchLine :: Int
, matchIsROC :: Bool
, matchYear :: (Int, Int)
, matchMonth :: (Int, Int)
, matchDay :: Maybe (Int, Int)
}
deriving (Show, Eq, Ord)
inferPageNameField :: Page -> (Label, Field)
inferPageNameField page = case concatMap tryMatchName (take 6 lns) of
[] -> mkField "r_name" 1 1 0 0
(f:_) -> f
where
lns = [1..] `zip` pageLines page
tryMatchName :: (Int, Value) -> [(Label, Field)]
tryMatchName (lineNum, content) =
[ mkField "r_name" (start+delta+1) lineNum (start+len) lineNum
| (text, (start, len)) <- (! (0 :: Int)) <$> content =~ namePattern
, let delta = tryDelta text
]
where
tryDelta text = case text `matchSubstring` MkMatch commercialBank of
Nothing -> 0
Just off -> off + S.length commercialBank
commercialBank = S.pack "\xB0\xD3\xB7\x7E\xBB\xC8\xA6\xE6\xA1\x40"
inferPageDateFields :: Page -> [(Label, Field)]
inferPageDateFields page = case concatMap tryMatchDate lns of
[] -> []
(m:_) -> dateMatchToFields m
where
lns = [1..] `zip` pageLines page
tryMatchDate :: (Int, Value) -> [DateMatch]
tryMatchDate (lineNum, content) =
[ dateMatch{ matchLine = lineNum }
| Just dateMatch <- validateMatch <$> reverse (content =~ datePattern)
]
validateMatch :: MatchText Value -> Maybe DateMatch
validateMatch match
| month >= 1, month <= 12, day >= 1, day <= 31
= Just MkDateMatch
{ matchLine = undefined
, matchIsROC = year < 1900
, matchYear = yearPos
, matchMonth = monthPos
, matchDay = if S.null dayText then Nothing else Just dayPos
}
| otherwise
= Nothing
where
[ _, (yearText, yearPos)
, _
, (monthText, monthPos)
, _
, _
, (dayText, dayPos) ] = elems match
[year, month, day] = valToInt <$> [yearText, monthText, dateText']
dateText' | S.null dayText = S.singleton '1'
| otherwise = dayText
dateMatchToFields :: DateMatch -> [(Label, Field)]
dateMatchToFields MkDateMatch{ matchLine, matchIsROC, matchYear, matchMonth, matchDay } =
( dateField yearLabel matchYear
: dateField "month" matchMonth
: maybe [] ((:[]) . dateField "day") matchDay
)
where
dateField lbl (start, len) = mkField lbl (start+1) matchLine (start+len) matchLine
yearLabel = if matchIsROC then "year_roc" else "year"