-- A script for adding missing words to DictEngSwe by taking them
-- from DictSwe, guided by translations in Folkets Lexikon,
-- http://folkets-lexikon.csc.kth.se/folkets/om.html
-- Run it from lib/src/swedish/folketslexikon with +RTS -K32M -M1500M
-- TODO: also try Lexin

import Prelude hiding (lookup)
import Data.List hiding (lookup)
import System.Directory(doesFileExist,renameFile)
import System.Cmd(system)
import System.Exit
import Control.Monad(unless)
import Data.Maybe(fromJust)
import qualified Data.Map as M
import Text.XML.Light
import PGF

main = do prepare
          create

--- Source and target names ----------------------------------------------------

folkets_root  = "http://folkets-lexikon.csc.kth.se/folkets/"
folkets_en_sv = "folkets_en_sv_public.xml"
folkets_sv_en = "folkets_sv_en_public.xml"

name   = "FolketsLexikon"
target = name++".gf"

--- Prepare --------------------------------------------------------------------

-- Download the Folkets Lexikon XML files and build the PGF dictionaries
-- used below, skipping files that already exist.
prepare =
    do writeFile target dummy
       download folkets_root folkets_en_sv
       download folkets_root folkets_sv_en
       makePGF pre "../../english/" "DictEngAbs.pgf" "DictEng.gf"
       gfmake engswe "../" "DictEngSwe.gf"
       makePGF swe "../" "DictSweAbs.pgf" "DictSwe.gf"
  where
    download url file = ifMissing file $ curl (url++file)
    makePGF imps dir pgf src = ifMissing (dir++pgf) (gfmake imps dir src)

    curl url = shell ["curl -OL --compressed",url]
    gfmake imps dir src = shell ["cd",dir,"&&","gf",opts,imp imps,src]

    opts = "-s -make -no-pmcfg -gfo-dir ../../alltenses"
    imp dirs = unwords ["-i ../"++dir|dir<-dirs]

    pre    = ["prelude","abstract","common"]
    swe    = pre++["scandinavian"]
    engswe = swe++["english"]

ifMissing path cmd = do e <- doesFileExist path
                        unless e cmd

shell ws = do let cmd = unwords ws
              putStrLn cmd
              e <- system cmd
              case e of
                ExitSuccess -> return ()
                _           -> fail "command failed"

--- Create ---------------------------------------------------------------------

-- Build FolketsLexikon.gf: for every abstract word missing a Swedish
-- linearization, look up its English translations in Folkets Lexikon and
-- map them to DictSwe entries.
create =
    do lexicon1 <- fromFolketsEnSv `fmap` readFile folkets_en_sv
       lexicon2 <- fromFolketsSvEn `fmap` readFile folkets_sv_en
       let lexicon = M.unionWith (++) lexicon1 lexicon2
       missing <- missingWords `fmap` readPGF "../DictEngAbs.pgf"
       putStrLn $ "Looking for "++show (length missing)++" missing words."
       eng   <- readPGF "../../english/DictEngAbs.pgf"
       saldo <- dict2map `fmap` readPGF "../DictSweAbs.pgf"
       let entries = map (translate lexicon eng saldo) missing
           histo   = M.toList $ M.fromListWith (+) [(length ts,1)|(_,ts)<-entries]
           tmp     = target++".tmp"
       putStrLn $ "Found "++show (sum [n|(a,n)<-histo,a>0])++" words."
       writeFile tmp . concWrap $ map toGF entries
       renameFile tmp target
       mapM_ print histo

lang1 = head . languages

-- Finding functions by parsing is too slow. Linearize all functions instead.
dict2map :: PGF -> M.Map String [CId]
dict2map pgf = fromList [(w,[f])|f<-functions pgf,w<-lins pgf f]

lin dict f  = linearize dict (lang1 dict) (mkApp f [])
lins dict f = nub . filter (/="") . map snd . concat .
              tabularLinearizes dict (lang1 dict) $ mkApp f []

missingWords pgf = missingLins pgf (lang1 pgf)

fromFolketsEnSv :: String -> M.Map String [String]
fromFolketsEnSv = fromList . map entry . findElements (unqual "word")
                  . fromJust . parseXMLDoc
  where
    entry w = (value w,map value (findChildren (unqual "translation") w))

fromFolketsSvEn :: String -> M.Map String [String]
fromFolketsSvEn = fromList . concatMap entry . findElements (unqual "word")
                  . fromJust . parseXMLDoc
  where
    entry w = [(en,[sv])|let sv=value w,
                         en<-map value (findChildren (unqual "translation") w)]

value = fromJust . findAttr (unqual "value")
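-- The two parsing functions above assume Folkets Lexikon entries of roughly
-- this shape (a sketch based only on what the code reads; the real files
-- carry additional attributes and child elements):
--
--   <word value="window">
--     <translation value="fönster" />
--   </word>
--
-- i.e. "word" elements with a "value" attribute and zero or more
-- "translation" children, each with its own "value" attribute.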
lookup m = maybe [] id . M.lookup m

fromList xs = fmap nub $ M.fromListWith (++) xs

-- For an English abstract function: linearize it, look up the Swedish
-- translations of that word in Folkets Lexikon, and keep those DictSwe
-- entries that have the same category suffix.
translate lexicon eng saldo eabs =
    (eabs,nub [sabs|let en=lin eng eabs,
                    sv<-lookup en lexicon,
                    sabs<-lookup sv saldo,
                    sameCat eabs sabs])

-- Two abstract names agree in category if they end in the same suffix
-- after the last '_'.
sameCat f1 f2 = suffix f1==suffix f2
  where suffix = takeWhile (/='_') . reverse . show

-- Render one entry as a GF lin rule. Words without any translation, and
-- alternative translations beyond the first one, are kept as comments.
toGF (eabs,[])   = "--"++show eabs++"\n"
toGF (eabs,s:ss) = " "++en++" = "++show s++";"
                   ++" -- "++show (1+length ss)
                   ++"\n"
                   ++unlines [indent++"--"++show s|s<-{-take 10-} ss]
  where
    en     = show eabs
    indent = replicate (length en+3) ' '

concWrap ws = unlines header++concat ws++unlines footer

header = ["concrete "++name++" of DictEngAbs = CatSwe ** open DictSwe in {",
          "",
          "flags coding=utf8 ;",
          "",
          "lin"]

footer = ["}"]

dummy = unlines (init header++footer)
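-- For reference, the generated FolketsLexikon.gf has roughly this shape
-- (the identifiers below are hypothetical examples, not necessarily actual
-- entries of DictEngAbs or DictSweAbs):
--
--   concrete FolketsLexikon of DictEngAbs = CatSwe ** open DictSwe in {
--
--   flags coding=utf8 ;
--
--   lin
--    window_N = fönster_N; -- 2
--               --glugg_N
--   --untranslated_word_A
--   }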