----------------------------------------------------------------------------- -- Copyright 2019, Advise-Me project team. This file is distributed under -- the terms of the Apache License 2.0. For more information, see the files -- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution. ----------------------------------------------------------------------------- -- | -- Maintainer : bastiaan.heeren@ou.nl -- Stability : provisional -- Portability : portable (depends on ghc) -- -- Access to human assessments of the databases. -- -- This module reads spreadsheets containing human assessments of the data we have -- collected. This will enable us to compare the data, and use human-collected -- evidence to feed the Bayesian networks. -- ----------------------------------------------------------------------------- module Database.HumanAssessment ( Tally(..) , Match(..) , prettyMatches , summarise , summariseTasks , score , tally , tally' , getSpreadsheet , getEvidence , getHumanMachineFiles , readEvidence ) where import Data.List ( sortBy ) import Data.Function ( on ) import Data.Maybe ( fromJust, isJust, fromMaybe ) import Data.Char ( isDigit ) import Data.Semigroup ( Semigroup, (<>) ) import Data.Text ( unpack ) import Control.Monad ( forM, when ) import Control.Lens ( (^.) ) import Text.Printf ( printf ) import System.FilePath ( (), (<.>), takeBaseName, takeExtension ) import System.Directory ( getDirectoryContents ) import Text.PrettyPrint.Leijen ( pretty, (<+>), (<$$>) ) import qualified Data.ByteString.Lazy as BS import qualified Data.Map as M import qualified Codec.Xlsx as XLSX import qualified Database.HDBC.Sqlite3 as SQL ( connectSqlite3 ) import qualified Text.PrettyPrint.Leijen as PP import Util.List ( index ) import Util.String ( split, trim, normalize, percentage, (//) ) import Database.Data ( PilotID, StudentID, TaskID, NodeID, allEvidence ) import Bayes.Evidence ( Evidence, hardEvidence, setId ) import Main.Tasks ( findTaskFuzzy, taskNetwork ) import Bayes.Network ( findNodeFuzzy, states, Node, nodeId, state2label, label2state ) import Recognize.Data.MathStoryProblem ( Task(Task), singleNetwork ) -- | Obtain a list of pilots and associated spreadsheet + database files. getHumanMachineFiles :: FilePath -> FilePath -> IO [(PilotID, FilePath, FilePath)] getHumanMachineFiles machineDir humanDir = do files <- filter ((== ".xlsx") . takeExtension) <$> getDirectoryContents humanDir forM files $ \file -> do let pilot = takeBaseName file let spreadsheet = humanDir pilot <.> "xlsx" let database = machineDir pilot <.> "db" return (pilot, spreadsheet, database) ------------------------------------------------------------------------------- -- Tally -- | Where did a match or mismatch occur? data Match = Match { locStudent :: StudentID , locTask :: TaskID , locNode :: NodeID , expectation :: Maybe String -- ^ Human assessment , observation :: Maybe String -- ^ Machine assessment } -- | Group matches together --groupMatches :: [Match] -> [((StudentID,TaskID), Match)] --groupMatches = orderBy (locStudent &&& locTask) -- | Counter for the number of times we encounter agreements and disagreements -- between the human and the machine assessment. data Tally = Tally { matchState :: [Match] -- ^ Human & machine assessment match on state , matchAbsence :: [Match] -- ^ Neither human nor machine set a state on the node , mismatchState :: [Match] -- ^ Human & machine set a state, but not the same , expectedPresence :: [Match] -- ^ Machine does not set a state but human does , expectedAbsence:: [Match] -- ^ Human does not set a state but machine does } instance Monoid Tally where mempty = Tally [] [] [] [] [] mappend = (<>) instance Semigroup Tally where x <> y = Tally { matchState = matchState x ++ matchState y , matchAbsence = matchAbsence x ++ matchAbsence y , mismatchState = mismatchState x ++ mismatchState y , expectedPresence = expectedPresence x ++ expectedPresence y , expectedAbsence = expectedAbsence x ++ expectedAbsence y } instance PP.Pretty Match where pretty m = PP.fill 35 (pretty $ locNode m) <+> PP.fill 10 (pretty $ locStudent m) <+> PP.fill 16 (maybe (pretty "empty") (PP.dquotes . pretty) (expectation m)) <+> maybe (pretty "empty") (PP.dquotes . pretty) (observation m) prettyMatches :: [Match] -> PP.Doc prettyMatches matches = PP.fill 35 (pretty "NODE") <+> PP.fill 10 (pretty "STUDENT") <+> PP.fill 16 (pretty "HUMAN/EXPECTED") <+> pretty "MACHINE/OBSERVED" <$$> (PP.vsep . map pretty $ matches) instance PP.Pretty Tally where pretty t = pretty "Match" <+> perc totalMatch totalTally <$$> (PP.indent 4 $ pretty "States match" <+> perc (length . matchState) totalMatch <$$> pretty "Agree on absence" <+> perc (length . matchAbsence) totalMatch ) <$$> pretty "Mismatch" <+> perc totalMismatch totalTally <$$> (PP.indent 4 $ pretty "States mismatch" <+> perc (length . mismatchState) totalMismatch <$$> pretty "Expected presence" <+> perc (length . expectedPresence) totalMismatch <$$> pretty "Expected absence" <+> perc (length . expectedAbsence) totalMismatch ) <$$> pretty "Severe mismatches:" <+> PP.text (percentage 1 (score (\t' -> length $ mismatchState t' ++ expectedAbsence t') totalTally t)) where perc f g = pretty (f t) <+> pretty "of" <+> pretty (g t) <+> PP.parens (PP.text $ percentage 1 (score f g t)) -- | Obtain the total number of times the human agreed with the machine. totalMatch :: Tally -> Int totalMatch t = length (matchState t) + length (matchAbsence t) -- | Obtain the total number of times the human disagreed with the machine. totalMismatch :: Tally -> Int totalMismatch t = length (mismatchState t) + length (expectedAbsence t) + length (expectedPresence t) -- | Obtain the total number of observations. totalTally :: Tally -> Int totalTally t = totalMatch t + totalMismatch t -- | Convert tally fields to a corresponding score. score :: (Tally -> Int) -> (Tally -> Int) -> Tally -> Double score field1 field2 t = field1 t // field2 t ------------------------------------------------------------------------------- -- Tallying the spreadsheet/database combination type State = Maybe String getState :: M.Map (StudentID, TaskID) Evidence -> StudentID -> TaskID -> NodeID -> State getState db sID tID nID = M.lookup (sID, tID) db >>= lookup nID . hardEvidence >>= return . state2label taskNetwork nID -- | Get the string value at a particular row/col. (!) :: XLSX.CellMap -> (Int,Int) -> Maybe String cells ! rowcol = trim . toString <$> (M.lookup rowcol cells >>= (^. XLSX.cellValue)) where -- | Obtain string from cell. toString :: XLSX.CellValue -> String toString (XLSX.CellText t) = unpack t toString (XLSX.CellDouble d) = printf "%.f" d toString t = error $ "not a text cell: " ++ show t -- | Get the canonical node ID, that is, the node as it appears in the actual -- network interface (including prefix, correct capitalisation, etc). canonicalNode :: TaskID -> NodeID -> Node () canonicalNode tID nID = either error id $ do Task t <- findTaskFuzzy tID findNodeFuzzy (singleNetwork t) nID -- | Obtain task worksheets from a spreadsheet. getWorksheets :: XLSX.Xlsx -> [(TaskID, XLSX.CellMap)] getWorksheets spreadsheet = [ (tID, cells) | (sheetname, worksheet) <- spreadsheet ^. XLSX.xlSheets , let cells = worksheet ^. XLSX.wsCells , let tID = normalize . unpack $ sheetname ] -- | Get the node names and their rows in the spreadsheet (excluding strategy -- nodes). getNodes :: XLSX.CellMap -> [(NodeID, Int)] getNodes cells = [ (nodeID_noncanonical, row) | (row, col) <- M.keys cells , row >= 3 , col == 1 , let value = cells ! (row,col) , isJust value , let nodeID_noncanonical = fromJust value ] -- | Get the students and their columns in the spreadsheet. getStudents :: XLSX.CellMap -> [(StudentID, Int)] getStudents cells = [ (sID, col) | (row, col) <- M.keys cells , row == 1 , col >= 3 , let value = cells ! (row,col) , isJust value , let sID = fromJust value ] -- Get canonical node IDs and state labels of all nodes in a worksheet. getStates :: TaskID -> XLSX.CellMap -> [((StudentID, NodeID), State)] getStates tID cells = [ ((sID, nodeId node), cells ! (row, col)) | (sID, col) <- getStudents cells , (nodeID_noncanonical, row) <- getNodes cells , let node = canonicalNode tID nodeID_noncanonical ] -- | Get expected evidence from a spreadsheet. getEvidence :: XLSX.Xlsx -> M.Map (StudentID, TaskID) Evidence getEvidence spreadsheet = M.fromListWith (<>) $ [ ((sID, tID), maybe mempty (\state -> setId nodeID (label2state taskNetwork nodeID state) mempty) expected ) | (tID, cells) <- getWorksheets spreadsheet , ((sID, nodeID), expected) <- getStates tID cells ] -- | Get evidence from an Excel file. readEvidence :: FilePath -> IO (M.Map (StudentID, TaskID) Evidence) readEvidence path = getEvidence . XLSX.toXlsx <$> BS.readFile path getSpreadsheet :: FilePath -> IO XLSX.Xlsx getSpreadsheet path = XLSX.toXlsx <$> BS.readFile path tally :: FilePath -> FilePath -> IO (M.Map (TaskID, NodeID) Tally) tally spreadsheetFile databaseFile = do spreadsheet <- getSpreadsheet spreadsheetFile allObserved <- SQL.connectSqlite3 databaseFile >>= allEvidence tally' spreadsheet allObserved -- | Get the tallies for every task/node combination, based on a spreadsheet -- file with expected, human-assessed values and a database file with observed, -- IDEAS-assessed values. tally' :: XLSX.Xlsx -> M.Map (StudentID,TaskID) Evidence -> IO (M.Map (TaskID, NodeID) Tally) tally' spreadsheet allObserved = do return . M.fromListWith (<>) $ [ (,) (tID, nodeID) $ categorise match | (tID, cells) <- getWorksheets spreadsheet , ((sID, nodeID), expected) <- getStates tID cells , let observed = getState allObserved sID tID nodeID , let match = Match { locStudent = sID, locTask = tID, locNode = nodeID, observation = observed, expectation = expected } ] where -- | Compare the expected and observed states of a node. categorise :: Match -> Tally categorise match = case expectation match of Just e -> case observation match of Just o -> if e == o then mempty { matchState = [match] } else mempty { mismatchState = [match] } Nothing -> mempty { expectedPresence = [match] } Nothing -> case observation match of Just _ -> mempty { expectedAbsence = [match] } Nothing -> mempty { matchAbsence = [match] } -- | Summarise the tallies per task by combining the tallies for each node. summariseTasks :: M.Map (TaskID, NodeID) Tally -> M.Map TaskID Tally summariseTasks = M.mapKeysWith (<>) fst -- | Combine multiple tallies. summarise :: Foldable t => t Tally -> Tally summarise = foldl (<>) mempty