module NLP.RAKE.Resources
where

  -------------------------------------------------------------------------
  -- | List of the characters at which words are /not/ split,
  --   represented as a plain 'String' (one element per character).
  --   Which characters belong in this list depends on the language
  --   of the text being processed.
  -------------------------------------------------------------------------
  type NoSplit = String
  -------------------------------------------------------------------------
  -- | Default no-split list, intended for English text.
  --   It is the concatenation, in this order, of the ASCII letters
  --   ('enNosplit'), the digits 0..9 ('numNosplit') and a few extra
  --   symbols ('othNosplit').
  --
  --   Resources for other languages are available as well,
  --   but they still need review and contributions!
  -------------------------------------------------------------------------
  defaultNosplit :: NoSplit
  defaultNosplit = concat [enNosplit, numNosplit, othNosplit]

  -------------------------------------------------------------------------
  -- | ASCII letters: lower case @a..z@ followed by upper case @A..Z@.
  -------------------------------------------------------------------------
  enNosplit :: NoSplit
  enNosplit = concat [['a' .. 'z'], ['A' .. 'Z']]

  -------------------------------------------------------------------------
  -- | The decimal digits @0..9@.
  -------------------------------------------------------------------------
  numNosplit :: NoSplit
  numNosplit = ['0' .. '9']

  -------------------------------------------------------------------------
  -- | A few more symbols: plus, hyphen-minus and slash (\"+-/\").
  -------------------------------------------------------------------------
  othNosplit :: NoSplit
  othNosplit = ['+', '-', '/']

  -------------------------------------------------------------------------
  -- | Latin-1 letters: code points 192..255, leaving out the
  --   multiplication sign (215) and the division sign (247).
  -------------------------------------------------------------------------
  latin1Nosplit :: NoSplit
  latin1Nosplit =
    concat
      [ ['\192' .. '\214']
      , ['\216' .. '\246']
      , ['\248' .. '\255']
      ]

  -------------------------------------------------------------------------
  -- | Latin Extended-A: code points 256..383 (U+0100..U+017F).
  -------------------------------------------------------------------------
  latinExAnosplit :: NoSplit
  latinExAnosplit = ['\256' .. '\383']

  -------------------------------------------------------------------------
  -- | Latin Extended-B: code points 384..591 (U+0180..U+024F),
  --   leaving out 448..451 (the click letters U+01C0..U+01C3).
  -------------------------------------------------------------------------
  latinExBnosplit :: NoSplit
  latinExBnosplit =
    concat
      [ ['\384' .. '\447']
      , ['\452' .. '\591']
      ]

  -------------------------------------------------------------------------
  -- | Greek and Coptic (needs revision)
  --
  --   Contains, in code point order:
  --
  --   * the tonos-accented capitals and iota with dialytika and tonos
  --     (902, 904..906, 908, 910..912); without these, ordinary modern
  --     Greek words containing one of them would be split at that letter;
  --
  --   * the basic capitals alpha..rho (913..929);
  --
  --   * everything from capital sigma to the end of the block (931..1023),
  --     which includes the lower case letters and their accented forms.
  --
  --   Skipped on purpose: 903 (a punctuation mark) and the unassigned
  --   code points 907, 909 and 930. Code points below 902 are not
  --   included.
  -------------------------------------------------------------------------
  greekNosplit :: NoSplit
  greekNosplit =
    concat
      [ ['\902']             -- U+0386 capital alpha with tonos
      , ['\904' .. '\906']   -- U+0388..U+038A capital epsilon/eta/iota with tonos
      , ['\908']             -- U+038C capital omicron with tonos
      , ['\910' .. '\912']   -- U+038E..U+0390 upsilon/omega with tonos, iota with dialytika and tonos
      , ['\913' .. '\929']   -- U+0391..U+03A1 capital alpha..rho
      , ['\931' .. '\1023']  -- U+03A3..U+03FF capital sigma..end of block
      ]


  -------------------------------------------------------------------------
  -- | Cyrillic (needs revision)
  --
  --   Code points 1024..1327 (U+0400..U+052F, i.e. Cyrillic plus the
  --   Cyrillic Supplement), leaving out 1155..1161 (the combining marks
  --   U+0483..U+0489).
  --
  --   NOTE(review): 1154 (U+0482, the Cyrillic thousands sign) is a
  --   symbol rather than a letter but is part of the first range;
  --   confirm whether that is intended.
  -------------------------------------------------------------------------
  cyrillicNosplit :: NoSplit
  cyrillicNosplit =
    concat
      [ ['\1024' .. '\1154']
      , ['\1162' .. '\1327']
      ]