-- | Character sets at which words are not split ('NoSplit'),
--   grouped by language and script.
module NLP.RAKE.Resources where

  -- | List of the characters at which we do not split words,
  --   kept as a plain 'String'.
  --   Which characters belong in this list depends on the language.
  type NoSplit = String
  -- | Default no-split list: the concatenation of 'enNosplit',
  --   'numNosplit' and 'othNosplit', in that order.
  --   It is meant for English and only covers the ASCII letters,
  --   the digits 0..9 and a few other symbols.
  --   There are resources for other languages,
  --   but they need review and contribution!
  defaultNosplit :: NoSplit
  defaultNosplit = concat [enNosplit, numNosplit, othNosplit]

  -- | ASCII letters: lower case @a@..@z@ followed by upper case @A@..@Z@.
  enNosplit :: NoSplit
  enNosplit = concat [['a'..'z'], ['A'..'Z']]

  -- | Decimal digits @0@..@9@.
  numNosplit :: NoSplit
  numNosplit = enumFromTo '0' '9'

  -- | Some more symbols: plus, hyphen-minus and slash (\"+-/\").
  othNosplit :: NoSplit
  othNosplit = ['+', '-', '/']

  -- | Latin-1 Supplement, U+00C0..U+00FF, leaving out the
  --   multiplication sign (U+00D7) and the division sign (U+00F7).
  latin1Nosplit :: NoSplit
  latin1Nosplit = concat
    [ ['\xC0'..'\xD6']
    , ['\xD8'..'\xF6']
    , ['\xF8'..'\xFF']
    ]

  -- | Latin Extended-A, the whole range U+0100..U+017F.
  latinExAnosplit :: NoSplit
  latinExAnosplit = enumFromTo '\x100' '\x17F'

  -- | Latin Extended-B, U+0180..U+024F, leaving out the four
  --   click letters U+01C0..U+01C3.
  latinExBnosplit :: NoSplit
  latinExBnosplit = concat
    [ ['\x180'..'\x1BF']
    , ['\x1C4'..'\x24F']
    ]

  -- | Greek and Coptic (needs revision).
  --
  --   Contains the capital letters with tonos (U+0386, U+0388..U+038A,
  --   U+038C, U+038E, U+038F), iota with dialytika and tonos (U+0390),
  --   the basic capitals Alpha..Rho (U+0391..U+03A1) and everything
  --   from capital Sigma to the end of the block (U+03A3..U+03FF).
  --   The lower-case letters with tonos live in that last range, so
  --   their capital counterparts have to be listed as well; otherwise
  --   a word starting with an accented capital would be split.
  --
  --   Left out on purpose: U+0387 (ano teleia, a punctuation mark) and
  --   the unassigned code points U+038B, U+038D and U+03A2.
  --   NOTE(review): the archaic letters below U+0386 are still not
  --   covered, and U+03F6 (a mathematical symbol) is still included.
  greekNosplit :: NoSplit
  greekNosplit = concat
    [ ['\x386']             -- capital Alpha with tonos
    , ['\x388'..'\x38A']    -- capital Epsilon, Eta, Iota with tonos
    , ['\x38C']             -- capital Omicron with tonos
    , ['\x38E'..'\x3A1']    -- Upsilon/Omega with tonos, U+0390, Alpha..Rho
    , ['\x3A3'..'\x3FF']    -- capital Sigma up to the end of the block
    ]

  -- | Cyrillic (needs revision)
  cyrillicNosplit :: NoSplit
  cyrillicNosplit  = ['\1024'..'\1154'] ++ ['\1162'..'\1279'] ++ 