{-# LANGUAGE NamedFieldPuns, RecordWildCards #-} import Control.Applicative import Control.Monad import Data.List import Data.Maybe import System.Directory import System.FilePath import Makedo import Makedo.Contained import HSH import System.IO.HVFS.Utils import qualified Data.ByteString as B import CorpusFiles -- ---------------------------------------------------------------------- -- customise here -- ---------------------------------------------------------------------- f_DATASET = "dev" -- f_DATASET = "test" d_ORIGINALS = "data" d_SENTENCE_SELECTION = "sentence-selection" d_SENTENCE_SELECTION_SUBSET = d_SENTENCE_SELECTION <.> "subset-" ++ x_REFERENCE d_DATA_SELECTION = "data-selection" d_DATA_SELECTION_SUBSET = d_DATA_SELECTION <.> "subset-" ++ x_REFERENCE d_FRAGMENTS = "fragments-extracted" d_SCORES = "scores" d_WORK_DOC_ALIGNED = "doc-aligned" d_WORK_PREPROCESSED = "preprocessed" d_SRC = d_ORIGINALS "document-pairs" d_LLR_1 = "doc-llr-1" d_LLR_2 = "doc-llr-2" d_REFERENCE = d_ORIGINALS f_DATASET ++ "-set" x_REFERENCE = "reference-" ++ f_DATASET d_FRAGMENTS_REFERENCE = "fragments-reference" d_FRAGMENTS_BASELINE = "fragments-baseline" d_PEERS = "peers" d_SENTENCE_SELECTION_REFERENCE = d_SENTENCE_SELECTION <.> x_REFERENCE d_CALCULATORS = "../calculators" d_BIN = "cabal-dev/bin" d_RASP_HOME = "/opt/rasp" f_DATASET :: String x_REFERENCE :: String d_ORIGINALS :: FilePath d_SENTENCE_SELECTION :: FilePath d_FRAGMENTS :: FilePath d_SENTENCE_SELECTION_REFERENCE :: FilePath d_SENTENCE_SELECTION_SUBSET :: FilePath d_FRAGMENTS_REFERENCE :: FilePath d_FRAGMENTS_BASELINE :: FilePath d_REFERENCE :: FilePath d_DATA_SELECTION :: FilePath d_DATA_SELECTION_SUBSET :: FilePath d_SCORES :: FilePath d_WORK_DOC_ALIGNED :: FilePath d_WORK_PREPROCESSED :: FilePath d_SRC :: FilePath d_LLR_1 :: FilePath d_LLR_2 :: FilePath d_BIN :: FilePath d_CALCULATORS :: FilePath d_RASP_HOME :: FilePath d_PEERS :: FilePath ssMethods :: [ SentSelMethod ] ssMethods = [ SentSelBaselineN 1 , SentSelBaselineA ] ++ map SentSelThreshold sthresholds where sthresholds = [ 60 ] dsMethods :: [ DataSelMethod ] dsMethods = DataSelAll : [ m t | m <- tmethods, t <- dthresholds ] where tmethods = [ DataSelSingle, DataSelDisj ] dthresholds = [ 15 ] llrMethods :: [ LlrMethod ] llrMethods = [ LlrNoRecompute , LlrRecomputeSent , LlrRecomputeDoc ] fbMethods :: [ FragBaselineMethod ] fbMethods = [ FragBaselineAll , FragBaselineCutoff 1 , FragBaselineCutoff 2 ] parallelRTS :: [String] parallelRTS = ["+RTS", "-N", "-RTS"] -- ---------------------------------------------------------------------- main :: IO () main = redoMain $ concatMap allTargets pipelineTargets pipelineTargets :: [Contained] pipelineTargets = concat [ initialiseTargets , docTargets , preprocessedTargets , llr1Targets , sentenceSelectionTargets , dataSelectionTargets , llr2Targets , fragmentExtractionTargets , referenceTargets , scoringTargets ] initialiseTargets :: [Contained] initialiseTargets = singleton $ Contained [] [] [target initialise] initialise :: ExactTarget initialise = ExactTarget "init" "[0] set up initial directories" $ const $ mapM_ (createDirectoryIfMissing True) containerDirs -- | See 'Contained' containerDirs :: [FilePath] containerDirs = concatMap containers pipelineTargets -- ---------------------------------------------------------------------- -- STAGE setup (after doc-alignment) -- ---------------------------------------------------------------------- x_HILL_INPUT :: String x_GEO_INPUT :: String x_WIKI_ORIG :: String x_WIKI_CHOSEN :: String x_PARA :: String x_FRAG :: String x_BLEU :: String x_SENTENCE_SELECTION :: String -- raw inputs x_HILL_INPUT = ".json" x_GEO_INPUT = ".geo-json" x_WIKI_ORIG = ".wiki" x_WIKI_CHOSEN = ".wiki.chosen" -- refs x_PARA = ".para" x_FRAG = ".frag" -- scoring x_BLEU = ".bleu" x_SENTENCE_SELECTION = ".sentence-selection" docTargets :: [ Contained ] docTargets = singleton $ Contained [] [] [ target docAlignedDir , docAlignedDep x_HILL_INPUT , docAlignedDep x_GEO_INPUT , docAlignedWikiChosen ] -- Annoyingly the corpus has different names for Hills DB-derived documents -- and for Wikipedia documents (because we have to align them). So to make -- life a bit simpler, let's assume document alignment is correct and -- give them all the same names docAlignedDir :: ExactTarget docAlignedDir = ExactTarget d_WORK_DOC_ALIGNED "[1] flatten source data" $ \ RedoArgs{rname} -> do createDirectoryIfMissing False rname wikiFiles <- run ("find", [ d_SRC, "-name", "*" ++ x_WIKI_ORIG ]) redoIfChange [ rname takeFileName (takeDirectory w) <.> x | w <- wikiFiles, x <- exts ] stampAll rname exts where exts = [ x_HILL_INPUT, x_GEO_INPUT, x_WIKI_CHOSEN ] docAlignedDep :: String -> Target docAlignedDep x = target $ WildcardTarget d_WORK_DOC_ALIGNED x "" $ \ RedoArgs{rname,rwrite} -> do createDirectoryIfMissing True (takeDirectory rname) let bn = basename rname src = d_SRC dropExtensions bn bn tgt = rwrite runIO ("cp", [src, tgt]) docAlignedWikiChosen :: Target docAlignedWikiChosen = target $ WildcardTarget d_WORK_DOC_ALIGNED x_WIKI_CHOSEN "" $ \ RedoArgs{rname,rwrite} -> do createDirectoryIfMissing True (takeDirectory rname) let bn = basename (dropExtensions rname) tgt = rwrite src <- do cs <- filterExts x_WIKI_ORIG `fmap` getDirectoryContents (d_SRC bn) case cs of [x] -> return (d_SRC bn x) _ -> fail ("Was expecting *exactly* one wiki file chosen for " ++ dropExtensions rname) runIO ("cp", [src, tgt]) -- ---------------------------------------------------------------------- -- STAGE preprocessing -- ---------------------------------------------------------------------- preprocessedTargets :: [Contained] preprocessedTargets = singleton $ Contained [] [] [ target preprocessedWorkDir , nlgInput , preprocessedData , preprocessedEn ] preprocessedWorkDir :: ExactTarget preprocessedWorkDir = ExactTarget d_WORK_PREPROCESSED "[2] preprocess data" $ \RedoArgs{rname} -> do redoIfChange [ d_WORK_DOC_ALIGNED ] createDirectoryIfMissing False rname cs <- filterExts x_HILL_INPUT <$> getDirectoryContents d_WORK_DOC_ALIGNED redoIfChange [ rname dropExtensions c <.> x | c <- cs, x <- exts ] stampAll rname exts where exts = [ extEnChoice, extDataPerDocument ] ppTarget :: String -> String -> (RedoArgs -> IO ()) -> WildcardTarget ppTarget = WildcardTarget d_WORK_PREPROCESSED nlgInput :: Target nlgInput = target $ ppTarget ext "" $ \RedoArgs{rname, rwrite} -> do let mkF x = d_WORK_DOC_ALIGNED dropExtensions (basename rname) <.> x redoIfChange (map mkF [ x_HILL_INPUT, x_GEO_INPUT ]) runMyCmd "create-nlg-input" [ mkF x_HILL_INPUT, "-o", rwrite ] where ext = "" <.> extCleanStructuredData preprocessedData :: Target preprocessedData = target $ ppTarget extDataPerDocument "" $ \RedoArgs{rname, rwrite} -> do let nj = dropExtensions rname <.> extCleanStructuredData redoIfChange [ nj ] runMyCmd "flatten-data" [ nj, rwrite ] preprocessedEn :: Target preprocessedEn = target $ ppTarget extEnChoice "" $ \RedoArgs{rname, rwrite} -> do let mkI x = dropExtensions rname <.> x nchosen = d_WORK_DOC_ALIGNED dropExtensions (basename rname) <.> x_WIKI_CHOSEN nj = mkI extCleanStructuredData nsegmented = mkI "wiki.segmented" -- 1. sentence segmentation redoIfChange [nj, nchosen] runMyCmd "take-sentences" [ nchosen, "-o", nsegmented ] -- 2. tokenisation -- hills db doesn't use accents in its recorded names for hills, so we translit them out -- while we're at it we also add spaces between punctuation let nsegmented2 = mkI "wiki.segmented-ascii" ntokenised = mkI "wiki.tokenised" run ("iconv", [ nsegmented , "-f", "utf-8" , "-t", "ascii//TRANSLIT" ]) >>= B.writeFile nsegmented2 runMyCmd "tokenise-sentences" [ nsegmented2 , "-o", ntokenised ] -- 3. lemmatisation let nprerasp = mkI "wiki.prerasp" -- TODO: replace sed with just plain Haskell run ("sed", ["-e", "s/^\\(..*\\)$/^ \\1/", ntokenised]) >>= B.writeFile nprerasp let raspCmd = setenv [ ("rasp_parse", "cat"), ("rasp_sentence", "cat") ] $ d_RASP_HOME "scripts/rasp.sh" raspChain = ("cat",[nprerasp]) -|- raspCmd -|- ("sed", [ "-e", "s/+[^_]*_/_/g" , "-e", "s/^\\^_\\^ //" , "-e", "s/^/[/" , "-e", "s/$/]/" ]) -|- ("tr", ["[A-Z]", "[a-z]"]) -- ^ yeah I know I could juts use Haskell here run raspChain >>= B.writeFile rwrite -- ---------------------------------------------------------------------- -- STAGE first stage LLR 1 -- ---------------------------------------------------------------------- llr1Targets :: [Contained] llr1Targets = singleton $ Contained [] [] [ target docLLR_1 ] docLLR_1 :: ExactTarget docLLR_1 = ExactTarget d_LLR_1 "[3] G2 (aka LLR) scores on base inputs" $ \RedoArgs { rname } -> do redoIfChange [ d_WORK_PREPROCESSED ] runMyCmd "build-llr-lexicon" $ [ d_WORK_PREPROCESSED, d_WORK_PREPROCESSED, rname ] ++ parallelRTS run $ ("cat", [rname "lexicon"]) -|- "redo-stamp" -- ---------------------------------------------------------------------- -- STAGE sentence selection -- ---------------------------------------------------------------------- data SentSelMethod = SentSelBaselineN Int | SentSelBaselineA | SentSelGtm | SentSelThreshold Float instance ShowPC SentSelMethod where showPC (SentSelBaselineN n) = "BASELINE-s" ++ show n showPC SentSelBaselineA = "BASELINE-sA" showPC SentSelGtm = "gtm" showPC (SentSelThreshold n) = "t" ++ showPaddedF 3 n instance Containable SentSelMethod where czero = SentSelBaselineA sentenceSelectionTargets :: [Contained] sentenceSelectionTargets = allTarget "all-sentence-selection" "[4] do sentence selection" [mkContained [sentenceSelection, sentenceSelectionSubset] ssMethods] sentenceSelection :: SentSelMethod -> ExactTarget sentenceSelection m = ExactTarget path "" $ \RedoArgs { rname } -> do redoIfChange [ d_WORK_PREPROCESSED, d_LLR_1 ] createDirectoryIfMissing True rname let filterSentences ps = runMyCmd "filter-sentences" $ ps ++ [ d_WORK_PREPROCESSED, d_LLR_1 "lexicon", rname ] case m of SentSelBaselineN c -> runMyCmd "create-baseline" [ "--cutoff", show c, "-s", d_WORK_PREPROCESSED, rname ] SentSelBaselineA -> do fs <- filterExts extEnChoice `fmap` getDirectoryContents d_WORK_PREPROCESSED forM_ fs $ \f -> crunIO "cp" [ d_WORK_PREPROCESSED f, rname ] SentSelGtm -> filterSentences [ "--usemeans" ] SentSelThreshold t -> filterSentences [ "--threshold", show t ] -- -- stampAll rname [ extEnChoice ] where path = d_SENTENCE_SELECTION showPC m sentenceSelectionSubset :: SentSelMethod -> ExactTarget sentenceSelectionSubset m = ExactTarget path "" $ \RedoArgs { rname } -> do let src = ePath (sentenceSelection m) tgt = rname redoIfChange [ src ] createDirectoryIfMissing True tgt copySubset extEnChoice src tgt recurseStampExt tgt [extEnChoice] where path = d_SENTENCE_SELECTION_SUBSET showPC m copySubset :: String -> FilePath -> FilePath -> IO () copySubset x src tgt = do createDirectoryIfMissing True tgt fs <- filterExts x_FRAG <$> getDirectoryContents d_REFERENCE forM_ fs $ \f -> do let f2 z = z dropExtensions f <.> x copyFile (f2 src) (f2 tgt) -- ---------------------------------------------------------------------- -- STAGE data selection -- ---------------------------------------------------------------------- data DataSelMethod = DataSelAll | DataSelDisj Float | DataSelConj Float | DataSelSingle Float instance ShowPC DataSelMethod where showPC DataSelAll = "all" showPC (DataSelDisj n) = "di-" ++ showPaddedF 3 n showPC (DataSelConj n) = "zc-" ++ showPaddedF 3 n -- z so it sorts last showPC (DataSelSingle n) = "sg-" ++ showPaddedF 3 n instance Containable DataSelMethod where czero = DataSelAll dataSelectionTargets :: [Contained] dataSelectionTargets = allTarget "all-data-selection" "[5] do data selection" [mkContained options dsMethods] where options = [ f s | f <- [ dataSelection, dataSelectionSubset ] , s <- ssMethods ] dataSelection :: SentSelMethod -> DataSelMethod -> ExactTarget dataSelection sm dm = ExactTarget path "" $ \ RedoArgs { rname } -> do let sentdir = ePath (sentenceSelection sm) redoIfChange [ d_WORK_PREPROCESSED, sentdir, d_LLR_1 ] createDirectoryIfMissing True rname let fdt m mt = runMyCmd "filter-data-tokens" $ [ "--method", m , sentdir , d_WORK_PREPROCESSED , d_LLR_1 "lexicon" , rname ] ++ maybe [] (\t -> ["--threshold", show (t :: Float)]) mt case dm of DataSelAll -> fdt "takeall" Nothing DataSelConj n -> fdt "conj" (Just n) DataSelDisj n -> fdt "disj" (Just n) DataSelSingle n -> fdt "single" (Just n) -- -- stampAll rname [ extDataPerSentence ] where path = d_DATA_SELECTION showPC sm showPC dm dataSelectionSubset :: SentSelMethod -> DataSelMethod -> ExactTarget dataSelectionSubset sm dm = ExactTarget path "" $ \RedoArgs { rname } -> do let src = ePath (dataSelection sm dm) tgt = rname redoIfChange [ src ] createDirectoryIfMissing True tgt copySubset extEnChoice src tgt copySubset extDataPerDocument src tgt copySubset extDataPerSentence src tgt recurseStampExt tgt [ extEnChoice, extDataPerDocument, extDataPerSentence ] where path = d_DATA_SELECTION_SUBSET showPC sm showPC dm -- ---------------------------------------------------------------------- -- STAGE LLR/G2 recomputation -- ---------------------------------------------------------------------- data LlrMethod = LlrNoRecompute | LlrRecomputeDoc | LlrRecomputeSent instance ShowPC LlrMethod where showPC LlrNoRecompute = "old-d" showPC LlrRecomputeDoc = "recomp-d" showPC LlrRecomputeSent = "recomp-s" instance Containable LlrMethod where czero = LlrRecomputeDoc -- not LlrNoRecompute because that just ponits back to llr1 llr2Targets :: [Contained] llr2Targets = allTarget "all-new-llr" "[6] G2 (aka LLR) scores revisited" [ llr2Contained ] where llr2Contained = Contained { containers = [ takeDirectory (ePath t) | Just t <- map (\f -> f czero) fs ] , targets = catMaybes [ f l | f <- fs, l <- llrMethods ] , allTargets = map target (targets llr2Contained) } fs = [ llr2 s d | s <- ssMethods, d <- dsMethods ] llr2 :: SentSelMethod -> DataSelMethod -> LlrMethod -> Maybe ExactTarget llr2 sm dm lm = mkTarget <$> recomp where llrCmd xs s d o = runMyCmd "build-llr-lexicon" $ [ s, d, o ] ++ xs ++ parallelRTS recomp = case lm of LlrNoRecompute -> Nothing LlrRecomputeSent -> Just (llrCmd ["--sentencelevel"]) LlrRecomputeDoc -> Just (llrCmd []) mkTarget llrBuilder = ExactTarget path "" $ \RedoArgs { rname } -> do redoIfChange [ sentDir, dataDir ] createDirectoryIfMissing True rname _ <- llrBuilder sentDir dataDir rname run $ (\() -> readFile (rname "lexicon")) -|- "redo-stamp" sentDir = dataDir -- yes the data dir: data selection results in -- recomputing the sentence selection dataDir = ePath (dataSelection sm dm) path = llr2Dir sm dm lm llr2Dir :: SentSelMethod -> DataSelMethod -> LlrMethod -> FilePath llr2Dir sm dm lm = case lm of LlrNoRecompute -> d_LLR_1 -- not sure if I'll regret this _ -> d_LLR_2 showPC sm showPC dm showPC lm -- ---------------------------------------------------------------------- -- STAGE fragment extraction -- ---------------------------------------------------------------------- data FragBaselineMethod = FragBaselineAll | FragBaselineCutoff Int instance ShowPC FragBaselineMethod where showPC FragBaselineAll = "BASELINE-fA" showPC (FragBaselineCutoff n) = "BASELINE-s" ++ show n instance Containable FragBaselineMethod where czero = FragBaselineAll fragmentExtractionTargets :: [Contained] fragmentExtractionTargets = allTarget "all-fragment-extraction" "[7] extract fragments" $ [ fragmentExtractionContained , fragmentBaselineContained ] fragmentExtractionContained :: Contained fragmentExtractionContained = mkContained [ fragmentExtraction s d | s <- ssMethods, d <- dsMethods ] llrMethods fragmentBaselineContained :: Contained fragmentBaselineContained = mkContained [ fragmentBaseline ] fbMethods fragmentExtraction :: SentSelMethod -> DataSelMethod -> LlrMethod -> ExactTarget fragmentExtraction sm dm lm = ExactTarget path "" $ \ RedoArgs { rname } -> do redoIfChange [ sentDir, dataDir, llrDir ] createDirectoryIfMissing True rname runMyCmd "detect-fragments" $ [ sentDir, dataDir, llrDir "lexicon", rname ] ++ parallelRTS recurseStampExt rname [ extEnChoice, extDataPerSentence ] where path = d_FRAGMENTS showPC sm showPC dm showPC lm sentDir = dataDir -- see llr2 sentDir dataDir = ePath (dataSelection sm dm) llrDir = llr2Dir sm dm lm fragmentBaseline :: FragBaselineMethod -> ExactTarget fragmentBaseline m = ExactTarget path "" $ \ RedoArgs { rname } -> do redoIfChange [ d_WORK_PREPROCESSED ] let cutoff = case m of FragBaselineAll -> [] FragBaselineCutoff n -> [ "--cutoff", show n ] runMyCmd "create-baseline" $ cutoff ++ [ d_WORK_PREPROCESSED, rname ] stampAll rname [ extDataPerDocument, extEnFragments ] where path = d_FRAGMENTS_BASELINE showPC m -- ---------------------------------------------------------------------- -- STAGE reference data -- ---------------------------------------------------------------------- referenceTargets :: [Contained] referenceTargets = [ Contained [] [] $ map target [ sentenceSelectionReference, fragmentsReferenceDir ] ] sentenceSelectionReference :: ExactTarget sentenceSelectionReference = ExactTarget d_SENTENCE_SELECTION_REFERENCE "" $ \ RedoArgs { rname } -> do redoIfChange [ d_WORK_PREPROCESSED ] runMyCmd "from-handwritten" [ d_REFERENCE, rname, "-c", "--sourcetext", d_WORK_PREPROCESSED ] stampAll rname [ extEnChoice ] fragmentsReferenceDir :: ExactTarget fragmentsReferenceDir = ExactTarget d_FRAGMENTS_REFERENCE "[8] convert reference data" $ \ RedoArgs { rname } -> do redoIfChange [ d_WORK_PREPROCESSED ] createDirectoryIfMissing True rname runMyCmd "from-handwritten" [ d_REFERENCE, rname, "--sourcetext", d_WORK_PREPROCESSED ] hasPara <- any (\f -> takeExtensions f == x_PARA) <$> getDirectoryContents d_REFERENCE when hasPara (fromPara d_REFERENCE rname) stampAll rname [ extEnFragments, extDataPerSentence, extDataPerDocument ] fromPara :: FilePath -> FilePath -> IO () fromPara indir outdir = do fs <- filterExts x_PARA <$> getDirectoryContents indir mapM_ convert fs where convert f = do let ofile x = outdir dropExtensions f <.> x oselected = ofile extDataPerSentence osubset = ofile extDataPerDocument h <- doesFileExist oselected when h $ do renameFile oselected (oselected <.> "from-frag") renameFile osubset (osubset <.> "from-frag") (run $ (\() -> readFile (indir f)) -|- filter ("[" `isPrefixOf`) -|- ("sed", ["-e", "s/^\\[\\[//" ,"-e", "s/\\].*//" ,"-e", "s/,/ /g" ])) >>= writeFile oselected writeFile osubset =<< unwords . concatMap words . sort . nub . lines <$> readFile oselected -- ---------------------------------------------------------------------- -- STAGE scoring -- ---------------------------------------------------------------------- scoringTargets :: [Contained] scoringTargets = [ Contained [] [] (map target [ score, peersDir ]) , peersTargets ] ++ sentenceSelectionScoreTargets sentenceSelectionScoreTargets :: [Contained] sentenceSelectionScoreTargets = [ mkContained [ sentenceSelectionScore ] ssMethods , mkContained (map sentenceSelectionScore2 ssMethods) dsMethods ] peersTargets :: Contained peersTargets = mkContained [ peersSubdir s d | s <- ssMethods, d <- dsMethods ] llrMethods peersDir :: ExactTarget peersDir = ExactTarget d_PEERS "" $ \RedoArgs { rname } -> do redoIfChange $ [ d_FRAGMENTS_REFERENCE ] ++ blineTargets runIO $ ("rm", [ "-rf", rname ]) createDirectoryIfMissing False rname fs <- referenceBasenames -- reference data let peerRefDir = rname "REFERENCE" createDirectoryIfMissing False peerRefDir copyReferenceFiles d_FRAGMENTS_REFERENCE peerRefDir fs -- baseline fragments forM_ blineTargets $ \p -> copyReferenceFiles p (rname takeFileName p) fs -- extracted fragments redoIfChange fragTargets recurseStampExt rname [ extEnFragments, extDataPerDocument ] where fragTargets = map ePath (targets peersTargets) blineTargets = map ePath (targets fragmentBaselineContained) peersSubdir :: SentSelMethod -> DataSelMethod -> LlrMethod -> ExactTarget peersSubdir sm dm lm = ExactTarget (path []) "" $ const $ do redoIfChange [ d_FRAGMENTS_REFERENCE, frag ] ds <- subdirectories frag forM_ ds $ \d -> do let src = frag d tgt = path [d] createDirectoryIfMissing True tgt referenceBasenames >>= copyReferenceFiles src tgt -- tricky because we don't generate rname :-( -- recurseStamp rname [ extEnFragments, extDataPerDocument ] where frag = ePath (fragmentExtraction sm dm lm ) path xs = squishPath . joinPath $ [ d_PEERS, showPC sm, showPC dm, showPC lm ] ++ xs squishPath :: FilePath -> FilePath squishPath = intercalate "_" . splitDirectories copyReferenceFiles :: FilePath -- ^ src -> FilePath -- ^ tgt -> [FilePath] -- ^ basenames -> IO () copyReferenceFiles src tgt bns = do createDirectoryIfMissing False tgt forM_ bns $ \bn -> let f = bn <.> extEnFragments in copyFile (src f) (tgt f) referenceBasenames :: IO [FilePath] referenceBasenames = (map dropExtensions . filterExts extEnFragments) <$> getDirectoryContents d_FRAGMENTS_REFERENCE score :: ExactTarget score = ExactTarget d_SCORES "compute BLEU scores" $ const $ do let pdir = ePath peersDir createDirectoryIfMissing False d_SCORES redoIfChange $ map ePath $ concatMap targets sentenceSelectionScoreTargets redoIfChange [ pdir ] crunIO "perl" [ "-I", d_CALCULATORS, d_CALCULATORS "bleu-scores.pl" , pdir "REFERENCE" , pdir, d_SCORES ] stampAll d_SCORES [ x_BLEU, x_SENTENCE_SELECTION ] sentenceSelectionScore :: SentSelMethod -> ExactTarget sentenceSelectionScore sm = scoreSentenceSelection peerDir refDir path where path = d_SCORES showPC sm refDir = ePath sentenceSelectionReference peerDir = ePath (sentenceSelectionSubset sm) sentenceSelectionScore2 :: SentSelMethod -> DataSelMethod -> ExactTarget sentenceSelectionScore2 sm dm = scoreSentenceSelection peerDir refDir path where path = squishPath $ d_SCORES showPC sm showPC dm refDir = ePath sentenceSelectionReference peerDir = ePath (dataSelectionSubset sm dm) scoreSentenceSelection :: FilePath -- ^ peer -> FilePath -- ^ ref -> FilePath -- ^ output -> ExactTarget scoreSentenceSelection p r out_ = ExactTarget out "" $ \RedoArgs {rwrite} -> do redoIfChange [ p, r ] run (d_BIN "score-sentence-selection", [ "-e", extEnChoice, r, p ]) >>= B.writeFile rwrite where out = out_ <.> x_SENTENCE_SELECTION -- ---------------------------------------------------------------------- -- utils -- ---------------------------------------------------------------------- allTarget :: String -> String -> [Contained] -> [Contained] allTarget name description xs = header : xs where header = Contained [] [] [target headerT] headerT = ExactTarget name description $ const $ redoIfChange ("init" : paths) paths = map ePath (concatMap targets xs) crunIO :: FilePath -> [FilePath] -> IO () crunIO = curry run runMyCmd :: String -> [String] -> IO () runMyCmd x args = run (d_BIN x, args) filterExts :: String -> [FilePath] -> [FilePath] filterExts x = filter (\f -> takeExtensions f == ("" <.> x)) subdirectories :: FilePath -> IO [FilePath] subdirectories f = filterM isSubdir =<< filter (not . junk) <$> getDirectoryContents f where isSubdir d = doesDirectoryExist (f d) junk = (`elem` [".", ".."]) recurseStamp :: (FilePath -> Bool) -> FilePath -> IO () recurseStamp c d = run $ (\() -> filter c <$> recurseDir SystemFS d) -|- mapM readFile -|- "redo-stamp" recurseStampExt :: FilePath -> [String] -> IO () recurseStampExt t xs = recurseStamp (\f -> takeExtensions f `elem` xs) t showPaddedF :: Int -> Float -> String showPaddedF n x = if intlength x < n then (replicate (n - intlength x) '0') ++ showNiceF x else showNiceF x where intlength = length . show . floorI showNiceF :: Float -> String showNiceF x = if fromIntegral (floorI x) == x then show (floorI x) else show x floorI :: Float -> Int floorI = floor -- sigh, just making a warning go away singleton :: a -> [a] singleton x = [x]