-- | Character lists that describe where words are /not/ split.
module NLP.RAKE.Resources where

-- | A list of characters at which words are not split.
-- Which characters belong in such a list depends on the language.
type NoSplit = String

-- | The default list targets English: ASCII letters, the digits
-- @0..9@ and a few additional symbols.
--
-- Lists for other languages are defined in this module as well,
-- but they still need review and contributions.
defaultNosplit :: NoSplit
defaultNosplit = enNosplit ++ numNosplit ++ othNosplit

-- | ASCII letters (@a..z@ followed by @A..Z@).
enNosplit :: NoSplit
enNosplit = ['a' .. 'z'] ++ ['A' .. 'Z']

-- | Decimal digits (@0..9@).
numNosplit :: NoSplit
numNosplit = ['0' .. '9']

-- | A few more symbols: plus, minus and slash.
othNosplit :: NoSplit
othNosplit = "+-/"

-- | Letters of the Latin-1 Supplement block (U+00C0..U+00FF), leaving
-- out the multiplication sign (U+00D7) and the division sign (U+00F7).
latin1Nosplit :: NoSplit
latin1Nosplit =
  ['\192' .. '\214'] ++ ['\216' .. '\246'] ++ ['\248' .. '\255']

-- | Latin Extended-A (U+0100..U+017F).
latinExAnosplit :: NoSplit
latinExAnosplit = ['\256' .. '\383']

-- | Latin Extended-B (U+0180..U+024F), leaving out the click letters
-- U+01C0..U+01C3.
latinExBnosplit :: NoSplit
latinExBnosplit = ['\384' .. '\447'] ++ ['\452' .. '\591']

-- | Greek and Coptic (needs revision).
--
-- Covers the letters with tonos that precede capital alpha, then
-- U+0391..U+03A1 and U+03A3..U+03FF. The code point U+03A2 between
-- capital rho and capital sigma is unassigned and therefore skipped.
greekNosplit :: NoSplit
greekNosplit = tonosLetters ++ ['\913' .. '\929'] ++ ['\931' .. '\1023']
  where
    -- U+0386, U+0388..U+038A, U+038C, U+038E..U+0390. The gaps in between
    -- (U+0387 ano teleia, U+038B and U+038D unassigned) are not letters.
    -- Without these, a word containing an accented capital would be split
    -- at that letter.
    tonosLetters =
      ['\902'] ++ ['\904' .. '\906'] ++ ['\908'] ++ ['\910' .. '\912']

-- | Cyrillic (needs revision).
--
-- Covers the Cyrillic block (U+0400..U+04FF) without the combining
-- marks U+0483..U+0489, followed by the Cyrillic Supplement block
-- (U+0500..U+052F).
cyrillicNosplit :: NoSplit
cyrillicNosplit =
  ['\1024' .. '\1154'] ++ ['\1162' .. '\1279'] ++ ['\1280' .. '\1327']