-----------------------------------------------------------------------------
-- Copyright 2019, Advise-Me project team. This file is distributed under 
-- the terms of the Apache License 2.0. For more information, see the files
-- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution.
-----------------------------------------------------------------------------
-- |
-- Maintainer  :  bastiaan.heeren@ou.nl
-- Stability   :  provisional
-- Portability :  portable (depends on ghc)
--
-- Access to human assessments of the databases.
-- 
-- This module reads spreadsheets containing human assessments of the data we have
-- collected. This will enable us to compare the data, and use human-collected
-- evidence to feed the Bayesian networks.
--
-----------------------------------------------------------------------------

module Database.HumanAssessment
   ( Tally(..)
   , Match(..)
   , prettyMatches
   , summarise
   , summariseTasks
   , score
   , tally
   , tally'
   , getSpreadsheet
   , getEvidence
   , getHumanMachineFiles
   , readEvidence
   ) where

import Data.List ( sortBy )
import Data.Function ( on )
import Data.Maybe ( fromJust, isJust, fromMaybe )
import Data.Char ( isDigit )
import Data.Semigroup ( Semigroup, (<>) )
import Data.Text ( unpack )
import Control.Monad ( forM, when )
import Control.Lens ( (^.) )
import Text.Printf ( printf )
import System.FilePath ( (</>), (<.>), takeBaseName, takeExtension )
import System.Directory ( getDirectoryContents )
import Text.PrettyPrint.Leijen ( pretty, (<+>), (<$$>) )
import qualified Data.ByteString.Lazy as BS
import qualified Data.Map as M
import qualified Codec.Xlsx as XLSX
import qualified Database.HDBC.Sqlite3 as SQL ( connectSqlite3 )
import qualified Text.PrettyPrint.Leijen as PP

import Util.List ( index )
import Util.String ( split, trim, normalize, percentage, (//) )
import Database.Data ( PilotID, StudentID, TaskID, NodeID, allEvidence )
import Bayes.Evidence ( Evidence, hardEvidence, setId )
import Main.Tasks ( findTaskFuzzy, taskNetwork )
import Bayes.Network ( findNodeFuzzy, states, Node, nodeId, state2label, label2state )
import Recognize.Data.MathStoryProblem ( Task(Task), singleNetwork )


-- | Obtain a list of pilots and associated spreadsheet + database files.
getHumanMachineFiles :: FilePath -> FilePath -> IO [(PilotID, FilePath, FilePath)]
getHumanMachineFiles machineDir humanDir = do
   files <- filter ((== ".xlsx") . takeExtension) <$> getDirectoryContents humanDir
   forM files $ \file -> do
      let pilot = takeBaseName file
      let spreadsheet = humanDir </> pilot <.> "xlsx"
      let database = machineDir </> pilot <.> "db"
      return (pilot, spreadsheet, database)


-------------------------------------------------------------------------------
-- Tally

-- | Where did a match or mismatch occur?
data Match = Match
   { locStudent :: StudentID
   , locTask :: TaskID
   , locNode :: NodeID
   , expectation :: Maybe String -- ^ Human assessment
   , observation :: Maybe String -- ^ Machine assessment
   }

-- | Group matches together
--groupMatches :: [Match] -> [((StudentID,TaskID), Match)]
--groupMatches = orderBy (locStudent &&& locTask)


-- | Counter for the number of times we encounter agreements and disagreements
-- between the human and the machine assessment.
data Tally = Tally
   { matchState :: [Match] -- ^ Human & machine assessment match on state
   , matchAbsence :: [Match] -- ^ Neither human nor machine set a state on the node
   , mismatchState :: [Match] -- ^ Human & machine set a state, but not the same
   , expectedPresence :: [Match] -- ^ Machine does not set a state but human does
   , expectedAbsence:: [Match] -- ^ Human does not set a state but machine does
   }

instance Monoid Tally where
   mempty  = Tally [] [] [] [] []
   mappend = (<>)

instance Semigroup Tally where
   x <> y = Tally
      { matchState = matchState x ++ matchState y
      , matchAbsence = matchAbsence x ++ matchAbsence y
      , mismatchState = mismatchState x ++ mismatchState y
      , expectedPresence = expectedPresence x ++ expectedPresence y
      , expectedAbsence = expectedAbsence x ++ expectedAbsence y
      }

instance PP.Pretty Match where
   pretty m =
      PP.fill 35 (pretty $ locNode m) <+>
      PP.fill 10 (pretty $ locStudent m) <+>
      PP.fill 16 (maybe (pretty "empty") (PP.dquotes . pretty) (expectation m)) <+>
      maybe (pretty "empty") (PP.dquotes . pretty) (observation m)


prettyMatches :: [Match] -> PP.Doc
prettyMatches matches =
   PP.fill 35 (pretty "NODE") <+>
   PP.fill 10 (pretty "STUDENT") <+>
   PP.fill 16 (pretty "HUMAN/EXPECTED") <+>
   pretty "MACHINE/OBSERVED" <$$>
   (PP.vsep . map pretty $ matches)


instance PP.Pretty Tally where
   pretty t =
         pretty "Match" <+> perc totalMatch totalTally <$$> (PP.indent 4 $
            pretty "States match" <+> perc (length . matchState) totalMatch <$$>
            pretty "Agree on absence" <+> perc (length . matchAbsence) totalMatch
         ) <$$>
         pretty "Mismatch" <+> perc totalMismatch totalTally <$$> (PP.indent 4 $
            pretty "States mismatch" <+> perc (length . mismatchState) totalMismatch <$$>
            pretty "Expected presence" <+> perc (length . expectedPresence) totalMismatch <$$>
            pretty "Expected absence" <+> perc (length . expectedAbsence) totalMismatch
         ) <$$>
         pretty "Severe mismatches:" <+> PP.text (percentage 1 (score (\t' -> length $ mismatchState t' ++ expectedAbsence t') totalTally t))

         where
         perc f g =
            pretty (f t) <+> pretty "of" <+> pretty (g t) <+>
            PP.parens (PP.text $ percentage 1 (score f g t))


-- | Obtain the total number of times the human agreed with the machine.
totalMatch :: Tally -> Int
totalMatch t = length (matchState t) + length (matchAbsence t)


-- | Obtain the total number of times the human disagreed with the machine.
totalMismatch :: Tally -> Int
totalMismatch t = length (mismatchState t) + length (expectedAbsence t) + length (expectedPresence t)


-- | Obtain the total number of observations.
totalTally :: Tally -> Int
totalTally t = totalMatch t + totalMismatch t

-- | Convert tally fields to a corresponding score.
score :: (Tally -> Int) -> (Tally -> Int) -> Tally -> Double
score field1 field2 t = field1 t // field2 t


-------------------------------------------------------------------------------
-- Tallying the spreadsheet/database combination

type State = Maybe String

getState :: M.Map (StudentID, TaskID) Evidence
         -> StudentID -> TaskID -> NodeID -> State
getState db sID tID nID =
   M.lookup (sID, tID) db >>= lookup nID . hardEvidence >>= return . state2label taskNetwork nID


-- | Get the string value at a particular row/col.
(!) :: XLSX.CellMap -> (Int,Int) -> Maybe String
cells ! rowcol = trim . toString <$> (M.lookup rowcol cells >>= (^. XLSX.cellValue))

   where
   -- | Obtain string from cell.
   toString :: XLSX.CellValue -> String
   toString (XLSX.CellText t) = unpack t
   toString (XLSX.CellDouble d) = printf "%.f" d
   toString t = error $ "not a text cell: " ++ show t


-- | Get the canonical node ID, that is, the node as it appears in the actual
-- network interface (including prefix, correct capitalisation, etc).
canonicalNode :: TaskID -> NodeID -> Node ()
canonicalNode tID nID = either error id $ do
   Task t <- findTaskFuzzy tID
   findNodeFuzzy (singleNetwork t) nID


-- | Obtain task worksheets from a spreadsheet.
getWorksheets :: XLSX.Xlsx -> [(TaskID, XLSX.CellMap)]
getWorksheets spreadsheet =
   [ (tID, cells)
   | (sheetname, worksheet) <- spreadsheet ^. XLSX.xlSheets
   , let cells = worksheet ^. XLSX.wsCells
   , let tID = normalize . unpack $ sheetname
   ]


-- | Get the node names and their rows in the spreadsheet (excluding strategy
-- nodes).
getNodes :: XLSX.CellMap -> [(NodeID, Int)]
getNodes cells =
   [ (nodeID_noncanonical, row)
   | (row, col) <- M.keys cells
   , row >= 3
   , col == 1
   , let value = cells ! (row,col)
   , isJust value
   , let nodeID_noncanonical = fromJust value
   ]

-- | Get the students and their columns in the spreadsheet.
getStudents :: XLSX.CellMap -> [(StudentID, Int)]
getStudents cells =
   [ (sID, col)
   | (row, col) <- M.keys cells
   , row == 1
   , col >= 3
   , let value = cells ! (row,col)
   , isJust value
   , let sID = fromJust value
   ]


-- Get canonical node IDs and state labels of all nodes in a worksheet.
getStates :: TaskID -> XLSX.CellMap -> [((StudentID, NodeID), State)]
getStates tID cells =
   [ ((sID, nodeId node), cells ! (row, col))
   | (sID, col) <- getStudents cells
   , (nodeID_noncanonical, row) <- getNodes cells
   , let node = canonicalNode tID nodeID_noncanonical
   ]


-- | Get expected evidence from a spreadsheet.
getEvidence :: XLSX.Xlsx -> M.Map (StudentID, TaskID) Evidence
getEvidence spreadsheet = M.fromListWith (<>) $
   [ ((sID, tID), maybe mempty (\state -> setId nodeID (label2state taskNetwork nodeID state) mempty) expected )
   | (tID, cells) <- getWorksheets spreadsheet
   , ((sID, nodeID), expected) <- getStates tID cells
   ]


-- | Get evidence from an Excel file.
readEvidence :: FilePath -> IO (M.Map (StudentID, TaskID) Evidence)
readEvidence path = getEvidence . XLSX.toXlsx <$> BS.readFile path


getSpreadsheet :: FilePath -> IO XLSX.Xlsx
getSpreadsheet path = XLSX.toXlsx <$> BS.readFile path

tally :: FilePath -> FilePath -> IO (M.Map (TaskID, NodeID) Tally)
tally spreadsheetFile databaseFile = do
   spreadsheet <- getSpreadsheet spreadsheetFile
   allObserved <- SQL.connectSqlite3 databaseFile >>= allEvidence
   tally' spreadsheet allObserved


-- | Get the tallies for every task/node combination, based on a spreadsheet
-- file with expected, human-assessed values and a database file with observed,
-- IDEAS-assessed values.
tally' :: XLSX.Xlsx -> M.Map (StudentID,TaskID) Evidence -> IO (M.Map (TaskID, NodeID) Tally)
tally' spreadsheet allObserved = do
   return . M.fromListWith (<>) $
      [ (,) (tID, nodeID) $ categorise match
      | (tID, cells) <- getWorksheets spreadsheet
      , ((sID, nodeID), expected) <- getStates tID cells
      , let observed = getState allObserved sID tID nodeID
      , let match = Match { locStudent = sID, locTask = tID, locNode = nodeID, observation = observed, expectation = expected }
      ]

   where

   -- | Compare the expected and observed states of a node.
   categorise :: Match -> Tally
   categorise match = case expectation match of
      Just e -> case observation match of
         Just o -> if e == o then mempty { matchState = [match] } else mempty { mismatchState = [match] }
         Nothing -> mempty { expectedPresence = [match] }
      Nothing -> case observation match of
         Just _ -> mempty { expectedAbsence = [match] }
         Nothing -> mempty { matchAbsence = [match] }




-- | Summarise the tallies per task by combining the tallies for each node.
summariseTasks :: M.Map (TaskID, NodeID) Tally -> M.Map TaskID Tally
summariseTasks = M.mapKeysWith (<>) fst


-- | Combine multiple tallies.
summarise :: Foldable t => t Tally -> Tally
summarise = foldl (<>) mempty