{-# LANGUAGE Trustworthy, DeriveDataTypeable, DeriveAnyClass, DeriveGeneric #-}
{-# OPTIONS_HADDOCK show-extensions #-}

{-
Copyright 2014 Daniel Fox Franke

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-}

{-|
Module : Text.CLD2
License : Apache-2.0
Stability : provisional
Portability: portable

This module provides simple Haskell bindings for Compact Language Detector 2, a language-detection library used by Google Chrome. See <https://code.google.com/p/cld2/>.
-}
module Text.CLD2 (
  detectLanguageDebug, detectLanguage, detectLanguageSimple,
  Language(..),
  Hints(..), defaultHints,
  Encoding(..),
  DebugFlags(..), defaultDebugFlags,
  Result(..),
  Chunk(..)
  ) where

import Control.Exception(AsyncException(..),AssertionFailed(..),mask_,throwIO)
import Data.Bits(Bits(..), (.|.))
import Data.ByteString.Unsafe(unsafeUseAsCStringLen)
import Data.Data(Data)
import Data.Functor((<$>))
import Data.Hashable(Hashable)
import Data.Text(Text)
import Data.Text.Encoding(encodeUtf8)
import Data.Typeable(Typeable)
import Foreign.C.Error(Errno(..), eNOMEM, eOK)
import Foreign.C.String(CString, withCString)
import Foreign.C.Types(CShort(..),CInt(..),CChar(..),CDouble(..),CSize(..))
import Foreign.Ptr(Ptr, nullPtr)
import Foreign.Marshal.Alloc(alloca, free)
import Foreign.Marshal.Array(peekArray, allocaArray)
import Foreign.Storable(peek)
import GHC.Generics(Generic)
import System.IO.Unsafe(unsafePerformIO)

-- | An enumeration of all languages recognized by CLD2
data Language =
  Cld2Language_ENGLISH
  | Cld2Language_DANISH
  | Cld2Language_DUTCH
  | Cld2Language_FINNISH
  | Cld2Language_FRENCH
  | Cld2Language_GERMAN
  | Cld2Language_HEBREW
  | Cld2Language_ITALIAN
  | Cld2Language_JAPANESE
  | Cld2Language_KOREAN
  | Cld2Language_NORWEGIAN
  | Cld2Language_POLISH
  | Cld2Language_PORTUGUESE
  | Cld2Language_RUSSIAN
  | Cld2Language_SPANISH
  | Cld2Language_SWEDISH
  | Cld2Language_CHINESE
  | Cld2Language_CZECH
  | Cld2Language_GREEK
  | Cld2Language_ICELANDIC
  | Cld2Language_LATVIAN
  | Cld2Language_LITHUANIAN
  | Cld2Language_ROMANIAN
  | Cld2Language_HUNGARIAN
  | Cld2Language_ESTONIAN
  | Cld2Language_TG_UNKNOWN_LANGUAGE
  | Cld2Language_UNKNOWN_LANGUAGE
  | Cld2Language_BULGARIAN
  | Cld2Language_CROATIAN
  | Cld2Language_SERBIAN
  | Cld2Language_IRISH
  | Cld2Language_GALICIAN
  | Cld2Language_TAGALOG
  | Cld2Language_TURKISH
  | Cld2Language_UKRAINIAN
  | Cld2Language_HINDI
  | Cld2Language_MACEDONIAN
  | Cld2Language_BENGALI
  | Cld2Language_INDONESIAN
  | Cld2Language_LATIN
  | Cld2Language_MALAY
  | Cld2Language_MALAYALAM
  | Cld2Language_WELSH
  | Cld2Language_NEPALI
  | Cld2Language_TELUGU
  | Cld2Language_ALBANIAN
  | Cld2Language_TAMIL
  | Cld2Language_BELARUSIAN
  | Cld2Language_JAVANESE
  | Cld2Language_OCCITAN
  | Cld2Language_URDU
  | Cld2Language_BIHARI
  | Cld2Language_GUJARATI
  | Cld2Language_THAI
  | Cld2Language_ARABIC
  | Cld2Language_CATALAN
  | Cld2Language_ESPERANTO
  | Cld2Language_BASQUE
  | Cld2Language_INTERLINGUA
  | Cld2Language_KANNADA
  | Cld2Language_PUNJABI
  | Cld2Language_SCOTS_GAELIC
  | Cld2Language_SWAHILI
  | Cld2Language_SLOVENIAN
  | Cld2Language_MARATHI
  | Cld2Language_MALTESE
  | Cld2Language_VIETNAMESE
  | Cld2Language_FRISIAN
  | Cld2Language_SLOVAK
  | Cld2Language_CHINESE_T
  | Cld2Language_FAROESE
  | Cld2Language_SUNDANESE
  | Cld2Language_UZBEK
  | Cld2Language_AMHARIC
  | Cld2Language_AZERBAIJANI
  | Cld2Language_GEORGIAN
  | Cld2Language_TIGRINYA
  | Cld2Language_PERSIAN
  | Cld2Language_BOSNIAN
  | Cld2Language_SINHALESE
  | Cld2Language_NORWEGIAN_N
  | Cld2Language_X_81
  | Cld2Language_X_82
  | Cld2Language_XHOSA
  | Cld2Language_ZULU
  | Cld2Language_GUARANI
  | Cld2Language_SESOTHO
  | Cld2Language_TURKMEN
  | Cld2Language_KYRGYZ
  | Cld2Language_BRETON
  | Cld2Language_TWI
  | Cld2Language_YIDDISH
  | Cld2Language_X_92
  | Cld2Language_SOMALI
  | Cld2Language_UIGHUR
  | Cld2Language_KURDISH
  | Cld2Language_MONGOLIAN
  | Cld2Language_ARMENIAN
  | Cld2Language_LAOTHIAN
  | Cld2Language_SINDHI
  | Cld2Language_RHAETO_ROMANCE
  | Cld2Language_AFRIKAANS
  | Cld2Language_LUXEMBOURGISH
  | Cld2Language_BURMESE
  | Cld2Language_KHMER
  | Cld2Language_TIBETAN
  | Cld2Language_DHIVEHI
  | Cld2Language_CHEROKEE
  | Cld2Language_SYRIAC
  | Cld2Language_LIMBU
  | Cld2Language_ORIYA
  | Cld2Language_ASSAMESE
  | Cld2Language_CORSICAN
  | Cld2Language_INTERLINGUE
  | Cld2Language_KAZAKH
  | Cld2Language_LINGALA
  | Cld2Language_X_116
  | Cld2Language_PASHTO
  | Cld2Language_QUECHUA
  | Cld2Language_SHONA
  | Cld2Language_TAJIK
  | Cld2Language_TATAR
  | Cld2Language_TONGA
  | Cld2Language_YORUBA
  | Cld2Language_X_124
  | Cld2Language_X_125
  | Cld2Language_X_126
  | Cld2Language_X_127
  | Cld2Language_MAORI
  | Cld2Language_WOLOF
  | Cld2Language_ABKHAZIAN
  | Cld2Language_AFAR
  | Cld2Language_AYMARA
  | Cld2Language_BASHKIR
  | Cld2Language_BISLAMA
  | Cld2Language_DZONGKHA
  | Cld2Language_FIJIAN
  | Cld2Language_GREENLANDIC
  | Cld2Language_HAUSA
  | Cld2Language_HAITIAN_CREOLE
  | Cld2Language_INUPIAK
  | Cld2Language_INUKTITUT
  | Cld2Language_KASHMIRI
  | Cld2Language_KINYARWANDA
  | Cld2Language_MALAGASY
  | Cld2Language_NAURU
  | Cld2Language_OROMO
  | Cld2Language_RUNDI
  | Cld2Language_SAMOAN
  | Cld2Language_SANGO
  | Cld2Language_SANSKRIT
  | Cld2Language_SISWANT
  | Cld2Language_TSONGA
  | Cld2Language_TSWANA
  | Cld2Language_VOLAPUK
  | Cld2Language_ZHUANG
  | Cld2Language_KHASI
  | Cld2Language_SCOTS
  | Cld2Language_GANDA
  | Cld2Language_MANX
  | Cld2Language_MONTENEGRIN
  | Cld2Language_AKAN
  | Cld2Language_IGBO
  | Cld2Language_MAURITIAN_CREOLE
  | Cld2Language_HAWAIIAN
  | Cld2Language_CEBUANO
  | Cld2Language_EWE
  | Cld2Language_GA
  | Cld2Language_HMONG
  | Cld2Language_KRIO
  | Cld2Language_LOZI
  | Cld2Language_LUBA_LULUA
  | Cld2Language_LUO_KENYA_AND_TANZANIA
  | Cld2Language_NEWARI
  | Cld2Language_NYANJA
  | Cld2Language_OSSETIAN
  | Cld2Language_PAMPANGA
  | Cld2Language_PEDI
  | Cld2Language_RAJASTHANI
  | Cld2Language_SESELWA
  | Cld2Language_TUMBUKA
  | Cld2Language_VENDA
  | Cld2Language_WARAY_PHILIPPINES
  | Cld2Language_X_183
  | Cld2Language_X_184
  | Cld2Language_X_185
  | Cld2Language_X_186
  | Cld2Language_X_187
  | Cld2Language_X_188
  | Cld2Language_X_189
  | Cld2Language_X_190
  | Cld2Language_X_191
  | Cld2Language_X_192
  | Cld2Language_X_193
  | Cld2Language_X_194
  | Cld2Language_X_195
  | Cld2Language_X_196
  | Cld2Language_X_197
  | Cld2Language_X_198
  | Cld2Language_X_199
  | Cld2Language_X_200
  | Cld2Language_X_201
  | Cld2Language_X_202
  | Cld2Language_X_203
  | Cld2Language_X_204
  | Cld2Language_X_205
  | Cld2Language_X_206
  | Cld2Language_X_207
  | Cld2Language_X_208
  | Cld2Language_X_209
  | Cld2Language_X_210
  | Cld2Language_X_211
  | Cld2Language_X_212
  | Cld2Language_X_213
  | Cld2Language_X_214
  | Cld2Language_X_215
  | Cld2Language_X_216
  | Cld2Language_X_217
  | Cld2Language_X_218
  | Cld2Language_X_219
  | Cld2Language_X_220
  | Cld2Language_X_221
  | Cld2Language_X_222
  | Cld2Language_X_223
  | Cld2Language_X_224
  | Cld2Language_X_225
  | Cld2Language_X_226
  | Cld2Language_X_227
  | Cld2Language_X_228
  | Cld2Language_X_229
  | Cld2Language_X_230
  | Cld2Language_X_231
  | Cld2Language_X_232
  | Cld2Language_X_233
  | Cld2Language_X_234
  | Cld2Language_X_235
  | Cld2Language_X_236
  | Cld2Language_X_237
  | Cld2Language_X_238
  | Cld2Language_X_239
  | Cld2Language_X_240
  | Cld2Language_X_241
  | Cld2Language_X_242
  | Cld2Language_X_243
  | Cld2Language_X_244
  | Cld2Language_X_245
  | Cld2Language_X_246
  | Cld2Language_X_247
  | Cld2Language_X_248
  | Cld2Language_X_249
  | Cld2Language_X_250
  | Cld2Language_X_251
  | Cld2Language_X_252
  | Cld2Language_X_253
  | Cld2Language_X_254
  | Cld2Language_X_255
  | Cld2Language_X_256
  | Cld2Language_X_257
  | Cld2Language_X_258
  | Cld2Language_X_259
  | Cld2Language_X_260
  | Cld2Language_X_261
  | Cld2Language_X_262
  | Cld2Language_X_263
  | Cld2Language_X_264
  | Cld2Language_X_265
  | Cld2Language_X_266
  | Cld2Language_X_267
  | Cld2Language_X_268
  | Cld2Language_X_269
  | Cld2Language_X_270
  | Cld2Language_X_271
  | Cld2Language_X_272
  | Cld2Language_X_273
  | Cld2Language_X_274
  | Cld2Language_X_275
  | Cld2Language_X_276
  | Cld2Language_X_277
  | Cld2Language_X_278
  | Cld2Language_X_279
  | Cld2Language_X_280
  | Cld2Language_X_281
  | Cld2Language_X_282
  | Cld2Language_X_283
  | Cld2Language_X_284
  | Cld2Language_X_285
  | Cld2Language_X_286
  | Cld2Language_X_287
  | Cld2Language_X_288
  | Cld2Language_X_289
  | Cld2Language_X_290
  | Cld2Language_X_291
  | Cld2Language_X_292
  | Cld2Language_X_293
  | Cld2Language_X_294
  | Cld2Language_X_295
  | Cld2Language_X_296
  | Cld2Language_X_297
  | Cld2Language_X_298
  | Cld2Language_X_299
  | Cld2Language_X_300
  | Cld2Language_X_301
  | Cld2Language_X_302
  | Cld2Language_X_303
  | Cld2Language_X_304
  | Cld2Language_X_305
  | Cld2Language_X_306
  | Cld2Language_X_307
  | Cld2Language_X_308
  | Cld2Language_X_309
  | Cld2Language_X_310
  | Cld2Language_X_311
  | Cld2Language_X_312
  | Cld2Language_X_313
  | Cld2Language_X_314
  | Cld2Language_X_315
  | Cld2Language_X_316
  | Cld2Language_X_317
  | Cld2Language_X_318
  | Cld2Language_X_319
  | Cld2Language_X_320
  | Cld2Language_X_321
  | Cld2Language_X_322
  | Cld2Language_X_323
  | Cld2Language_X_324
  | Cld2Language_X_325
  | Cld2Language_X_326
  | Cld2Language_X_327
  | Cld2Language_X_328
  | Cld2Language_X_329
  | Cld2Language_X_330
  | Cld2Language_X_331
  | Cld2Language_X_332
  | Cld2Language_X_333
  | Cld2Language_X_334
  | Cld2Language_X_335
  | Cld2Language_X_336
  | Cld2Language_X_337
  | Cld2Language_X_338
  | Cld2Language_X_339
  | Cld2Language_X_340
  | Cld2Language_X_341
  | Cld2Language_X_342
  | Cld2Language_X_343
  | Cld2Language_X_344
  | Cld2Language_X_345
  | Cld2Language_X_346
  | Cld2Language_X_347
  | Cld2Language_X_348
  | Cld2Language_X_349
  | Cld2Language_X_350
  | Cld2Language_X_351
  | Cld2Language_X_352
  | Cld2Language_X_353
  | Cld2Language_X_354
  | Cld2Language_X_355
  | Cld2Language_X_356
  | Cld2Language_X_357
  | Cld2Language_X_358
  | Cld2Language_X_359
  | Cld2Language_X_360
  | Cld2Language_X_361
  | Cld2Language_X_362
  | Cld2Language_X_363
  | Cld2Language_X_364
  | Cld2Language_X_365
  | Cld2Language_X_366
  | Cld2Language_X_367
  | Cld2Language_X_368
  | Cld2Language_X_369
  | Cld2Language_X_370
  | Cld2Language_X_371
  | Cld2Language_X_372
  | Cld2Language_X_373
  | Cld2Language_X_374
  | Cld2Language_X_375
  | Cld2Language_X_376
  | Cld2Language_X_377
  | Cld2Language_X_378
  | Cld2Language_X_379
  | Cld2Language_X_380
  | Cld2Language_X_381
  | Cld2Language_X_382
  | Cld2Language_X_383
  | Cld2Language_X_384
  | Cld2Language_X_385
  | Cld2Language_X_386
  | Cld2Language_X_387
  | Cld2Language_X_388
  | Cld2Language_X_389
  | Cld2Language_X_390
  | Cld2Language_X_391
  | Cld2Language_X_392
  | Cld2Language_X_393
  | Cld2Language_X_394
  | Cld2Language_X_395
  | Cld2Language_X_396
  | Cld2Language_X_397
  | Cld2Language_X_398
  | Cld2Language_X_399
  | Cld2Language_X_400
  | Cld2Language_X_401
  | Cld2Language_X_402
  | Cld2Language_X_403
  | Cld2Language_X_404
  | Cld2Language_X_405
  | Cld2Language_X_406
  | Cld2Language_X_407
  | Cld2Language_X_408
  | Cld2Language_X_409
  | Cld2Language_X_410
  | Cld2Language_X_411
  | Cld2Language_X_412
  | Cld2Language_X_413
  | Cld2Language_X_414
  | Cld2Language_X_415
  | Cld2Language_X_416
  | Cld2Language_X_417
  | Cld2Language_X_418
  | Cld2Language_X_419
  | Cld2Language_X_420
  | Cld2Language_X_421
  | Cld2Language_X_422
  | Cld2Language_X_423
  | Cld2Language_X_424
  | Cld2Language_X_425
  | Cld2Language_X_426
  | Cld2Language_X_427
  | Cld2Language_X_428
  | Cld2Language_X_429
  | Cld2Language_X_430
  | Cld2Language_X_431
  | Cld2Language_X_432
  | Cld2Language_X_433
  | Cld2Language_X_434
  | Cld2Language_X_435
  | Cld2Language_X_436
  | Cld2Language_X_437
  | Cld2Language_X_438
  | Cld2Language_X_439
  | Cld2Language_X_440
  | Cld2Language_X_441
  | Cld2Language_X_442
  | Cld2Language_X_443
  | Cld2Language_X_444
  | Cld2Language_X_445
  | Cld2Language_X_446
  | Cld2Language_X_447
  | Cld2Language_X_448
  | Cld2Language_X_449
  | Cld2Language_X_450
  | Cld2Language_X_451
  | Cld2Language_X_452
  | Cld2Language_X_453
  | Cld2Language_X_454
  | Cld2Language_X_455
  | Cld2Language_X_456
  | Cld2Language_X_457
  | Cld2Language_X_458
  | Cld2Language_X_459
  | Cld2Language_X_460
  | Cld2Language_X_461
  | Cld2Language_X_462
  | Cld2Language_X_463
  | Cld2Language_X_464
  | Cld2Language_X_465
  | Cld2Language_X_466
  | Cld2Language_X_467
  | Cld2Language_X_468
  | Cld2Language_X_469
  | Cld2Language_X_470
  | Cld2Language_X_471
  | Cld2Language_X_472
  | Cld2Language_X_473
  | Cld2Language_X_474
  | Cld2Language_X_475
  | Cld2Language_X_476
  | Cld2Language_X_477
  | Cld2Language_X_478
  | Cld2Language_X_479
  | Cld2Language_X_480
  | Cld2Language_X_481
  | Cld2Language_X_482
  | Cld2Language_X_483
  | Cld2Language_X_484
  | Cld2Language_X_485
  | Cld2Language_X_486
  | Cld2Language_X_487
  | Cld2Language_X_488
  | Cld2Language_X_489
  | Cld2Language_X_490
  | Cld2Language_X_491
  | Cld2Language_X_492
  | Cld2Language_X_493
  | Cld2Language_X_494
  | Cld2Language_X_495
  | Cld2Language_X_496
  | Cld2Language_X_497
  | Cld2Language_X_498
  | Cld2Language_X_499
  | Cld2Language_X_500
  | Cld2Language_X_501
  | Cld2Language_X_502
  | Cld2Language_X_503
  | Cld2Language_X_504
  | Cld2Language_X_505
  | Cld2Language_NDEBELE
  | Cld2Language_X_BORK_BORK_BORK
  | Cld2Language_X_PIG_LATIN
  | Cld2Language_X_HACKER
  | Cld2Language_X_KLINGON
  | Cld2Language_X_ELMER_FUDD
  | Cld2Language_X_Common
  | Cld2Language_X_Latin
  | Cld2Language_X_Greek
  | Cld2Language_X_Cyrillic
  | Cld2Language_X_Armenian
  | Cld2Language_X_Hebrew
  | Cld2Language_X_Arabic
  | Cld2Language_X_Syriac
  | Cld2Language_X_Thaana
  | Cld2Language_X_Devanagari
  | Cld2Language_X_Bengali
  | Cld2Language_X_Gurmukhi
  | Cld2Language_X_Gujarati
  | Cld2Language_X_Oriya
  | Cld2Language_X_Tamil
  | Cld2Language_X_Telugu
  | Cld2Language_X_Kannada
  | Cld2Language_X_Malayalam
  | Cld2Language_X_Sinhala
  | Cld2Language_X_Thai
  | Cld2Language_X_Lao
  | Cld2Language_X_Tibetan
  | Cld2Language_X_Myanmar
  | Cld2Language_X_Georgian
  | Cld2Language_X_Hangul
  | Cld2Language_X_Ethiopic
  | Cld2Language_X_Cherokee
  | Cld2Language_X_Canadian_Aboriginal
  | Cld2Language_X_Ogham
  | Cld2Language_X_Runic
  | Cld2Language_X_Khmer
  | Cld2Language_X_Mongolian
  | Cld2Language_X_Hiragana
  | Cld2Language_X_Katakana
  | Cld2Language_X_Bopomofo
  | Cld2Language_X_Han
  | Cld2Language_X_Yi
  | Cld2Language_X_Old_Italic
  | Cld2Language_X_Gothic
  | Cld2Language_X_Deseret
  | Cld2Language_X_Inherited
  | Cld2Language_X_Tagalog
  | Cld2Language_X_Hanunoo
  | Cld2Language_X_Buhid
  | Cld2Language_X_Tagbanwa
  | Cld2Language_X_Limbu
  | Cld2Language_X_Tai_Le
  | Cld2Language_X_Linear_B
  | Cld2Language_X_Ugaritic
  | Cld2Language_X_Shavian
  | Cld2Language_X_Osmanya
  | Cld2Language_X_Cypriot
  | Cld2Language_X_Braille
  | Cld2Language_X_Buginese
  | Cld2Language_X_Coptic
  | Cld2Language_X_New_Tai_Lue
  | Cld2Language_X_Glagolitic
  | Cld2Language_X_Tifinagh
  | Cld2Language_X_Syloti_Nagri
  | Cld2Language_X_Old_Persian
  | Cld2Language_X_Kharoshthi
  | Cld2Language_X_Balinese
  | Cld2Language_X_Cuneiform
  | Cld2Language_X_Phoenician
  | Cld2Language_X_Phags_Pa
  | Cld2Language_X_Nko
  | Cld2Language_X_Sundanese
  | Cld2Language_X_Lepcha
  | Cld2Language_X_Ol_Chiki
  | Cld2Language_X_Vai
  | Cld2Language_X_Saurashtra
  | Cld2Language_X_Kayah_Li
  | Cld2Language_X_Rejang
  | Cld2Language_X_Lycian
  | Cld2Language_X_Carian
  | Cld2Language_X_Lydian
  | Cld2Language_X_Cham
  | Cld2Language_X_Tai_Tham
  | Cld2Language_X_Tai_Viet
  | Cld2Language_X_Avestan
  | Cld2Language_X_Egyptian_Hieroglyphs
  | Cld2Language_X_Samaritan
  | Cld2Language_X_Lisu
  | Cld2Language_X_Bamum
  | Cld2Language_X_Javanese
  | Cld2Language_X_Meetei_Mayek
  | Cld2Language_X_Imperial_Aramaic
  | Cld2Language_X_Old_South_Arabian
  | Cld2Language_X_Inscriptional_Parthian
  | Cld2Language_X_Inscriptional_Pahlavi
  | Cld2Language_X_Old_Turkic
  | Cld2Language_X_Kaithi
  | Cld2Language_X_Batak
  | Cld2Language_X_Brahmi
  | Cld2Language_X_Mandaic
  | Cld2Language_X_Chakma
  | Cld2Language_X_Meroitic_Cursive
  | Cld2Language_X_Meroitic_Hieroglyphs
  | Cld2Language_X_Miao
  | Cld2Language_X_Sharada
  | Cld2Language_X_Sora_Sompeng
  | Cld2Language_X_Takri
  deriving (Eq,Ord,Show,Bounded,Enum,Typeable,Data,Generic,Hashable)

-- | An enumeration of character encodings which can be included in 'Hints'
data Encoding =
  Cld2Encoding_ISO_8859_1
  | Cld2Encoding_ISO_8859_2
  | Cld2Encoding_ISO_8859_3
  | Cld2Encoding_ISO_8859_4
  | Cld2Encoding_ISO_8859_5
  | Cld2Encoding_ISO_8859_6
  | Cld2Encoding_ISO_8859_7
  | Cld2Encoding_ISO_8859_8
  | Cld2Encoding_ISO_8859_9
  | Cld2Encoding_ISO_8859_10
  | Cld2Encoding_JAPANESE_EUC_JP
  | Cld2Encoding_JAPANESE_SHIFT_JIS
  | Cld2Encoding_JAPANESE_JIS
  | Cld2Encoding_CHINESE_BIG5
  | Cld2Encoding_CHINESE_GB
  | Cld2Encoding_CHINESE_EUC_CN
  | Cld2Encoding_KOREAN_EUC_KR
  | Cld2Encoding_UNICODE_UNUSED
  | Cld2Encoding_CHINESE_EUC_DEC
  | Cld2Encoding_CHINESE_CNS
  | Cld2Encoding_CHINESE_BIG5_CP950
  | Cld2Encoding_JAPANESE_CP932
  | Cld2Encoding_UTF8
  | Cld2Encoding_UNKNOWN_ENCODING
  | Cld2Encoding_ASCII_7BIT
  | Cld2Encoding_RUSSIAN_KOI8_R
  | Cld2Encoding_RUSSIAN_CP1251
  | Cld2Encoding_MSFT_CP1252
  | Cld2Encoding_RUSSIAN_KOI8_RU
  | Cld2Encoding_MSFT_CP1250
  | Cld2Encoding_ISO_8859_15
  | Cld2Encoding_MSFT_CP1254
  | Cld2Encoding_MSFT_CP1257
  | Cld2Encoding_ISO_8859_11
  | Cld2Encoding_MSFT_CP874
  | Cld2Encoding_MSFT_CP1256
  | Cld2Encoding_MSFT_CP1255
  | Cld2Encoding_ISO_8859_8_I
  | Cld2Encoding_HEBREW_VISUAL
  | Cld2Encoding_CZECH_CP852
  | Cld2Encoding_CZECH_CSN_369103
  | Cld2Encoding_MSFT_CP1253
  | Cld2Encoding_RUSSIAN_CP866
  | Cld2Encoding_ISO_8859_13
  | Cld2Encoding_ISO_2022_KR
  | Cld2Encoding_GBK
  | Cld2Encoding_GB18030
  | Cld2Encoding_BIG5_HKSCS
  | Cld2Encoding_ISO_2022_CN
  | Cld2Encoding_TSCII
  | Cld2Encoding_TAMIL_MONO
  | Cld2Encoding_TAMIL_BI
  | Cld2Encoding_JAGRAN
  | Cld2Encoding_MACINTOSH_ROMAN
  | Cld2Encoding_UTF7
  | Cld2Encoding_BHASKAR
  | Cld2Encoding_HTCHANAKYA
  | Cld2Encoding_UTF16BE
  | Cld2Encoding_UTF16LE
  | Cld2Encoding_UTF32BE
  | Cld2Encoding_UTF32LE
  | Cld2Encoding_BINARYENC
  | Cld2Encoding_HZ_GB_2312
  | Cld2Encoding_UTF8UTF8
  | Cld2Encoding_TAM_ELANGO
  | Cld2Encoding_TAM_LTTMBARANI
  | Cld2Encoding_TAM_SHREE
  | Cld2Encoding_TAM_TBOOMIS
  | Cld2Encoding_TAM_TMNEWS
  | Cld2Encoding_TAM_WEBTAMIL
  | Cld2Encoding_KDDI_SHIFT_JIS
  | Cld2Encoding_DOCOMO_SHIFT_JIS
  | Cld2Encoding_SOFTBANK_SHIFT_JIS
  | Cld2Encoding_KDDI_ISO_2022_JP
  | Cld2Encoding_SOFTBANK_ISO_2022_JP
  deriving (Eq,Ord,Show,Bounded,Enum,Typeable,Data,Generic,Hashable)

-- | A collection of contextual clues which can help improve the
-- accuracy of language detection
data Hints =
    Hints { -- | The value of the @Content-Language@ HTTP header
            hintContentLanguage :: Maybe String,
            -- | The TLD of the website which served the corpus being analyzed
            hintTLD :: Maybe String,
            -- | The original character encoding of the corpus
            hintEncoding :: Encoding,
            -- | A hint from any other available context
            hintLanguage :: Language }
             deriving (Eq,Ord,Show,Typeable,Data,Generic,Hashable)

-- | The default set of hints, which is @Hints@ @Nothing@ @Nothing@ @Cld2Encoding_UNKNOWN_ENCODING@ @Cld2Language_UNKNOWN_LANGUAGE@
defaultHints :: Hints
defaultHints = Hints Nothing Nothing Cld2Encoding_UNKNOWN_ENCODING Cld2Language_UNKNOWN_LANGUAGE

-- | Flags which cause CLD2 to dump debugging output to stderr.
data DebugFlags = DebugFlags { debugFlagScoreAsQuads :: Bool,
                               debugFlagHtml :: Bool,
                               debugFlagCr :: Bool,
                               debugFlagVerbose :: Bool,
                               debugFlagQuiet :: Bool,
                               debugFlagEcho :: Bool }
                  deriving (Eq,Ord,Show,Typeable,Data,Generic,Hashable)

-- | The default set of debugging flags, all @False@
defaultDebugFlags :: DebugFlags
defaultDebugFlags = DebugFlags False False False False False False

-- | Represents a range of text and its detected language
data Chunk =
    Chunk { -- | The offset of the start of the chunk, in bytes
            chunkOffset :: Int,
            -- | The size of the chunk, in bytes
            chunkSize :: Int,
            -- | The detected language of this chunk
            chunkLanguage :: Language }
             deriving (Eq,Ord,Show,Typeable,Data,Generic,Hashable)

-- | The result of performing language detection on a corpus
data Result = Result {
      -- | The primary language of the corpus
      resultSimple :: Language,
      -- | The top three most prevalent languages in the corpus
      resultTop3 :: (Language, Language, Language),
      -- | The
      resultTop3Percent :: (Int, Int, Int),
      -- | Confidence scores for the top three most prevalent languages
      resultTop3Score :: (Double, Double, Double),
      -- | Identifies the language of each chunk of the corpus
      resultChunks :: [Chunk],
      -- | The size of the corpus that was analyzed
      resultTextBytes :: Int,
      -- | Whether this result should be considered reliable
      resultIsReliable :: Bool }
              deriving (Eq,Ord,Show,Typeable,Data,Generic,Hashable)

foreign import ccall "cld2_haskell_shim" c_cld2_haskell_shim ::
  Ptr CInt -> Ptr CChar -> CInt -> CInt -> CString -> CString -> CInt -> CInt
  -> CInt -> Ptr CInt -> Ptr CInt -> Ptr CDouble -> Ptr CSize
  -> Ptr (Ptr CInt) -> Ptr (Ptr CShort) -> Ptr (Ptr CShort) -> Ptr CInt
  -> Ptr CInt -> IO CInt

boolToCInt :: Bool -> CInt
boolToCInt False = 0
boolToCInt True = 1

cIntToBool :: CInt -> Bool
cIntToBool 0 = False
cIntToBool _ = True

takeBit :: (Bits a) => a -> Bool -> a
takeBit x True = x
takeBit _x False = zeroBits

flagsToCInt :: DebugFlags -> CInt
flagsToCInt (DebugFlags a b c d e f) =
    takeBit 0x0100 a .|.
    takeBit 0x0200 b .|.
    takeBit 0x0400 c .|.
    takeBit 0x0800 d .|.
    takeBit 0x1000 e .|.
    takeBit 0x2000 f

withMaybeCString :: Maybe String -> (CString -> IO a) -> IO a
withMaybeCString (Just str) f = withCString str f
withMaybeCString Nothing f = f nullPtr

-- | This function is the most general way to invoke CLD2. Since setting
-- debug flags can cause output on stderr, the result is returned in the IO
-- monad.
detectLanguageDebug ::
    Text -- ^ The corpus to be analyzed
    -> Bool -- ^ True for plain text, False for HTML
    -> Hints
    -> DebugFlags
    -> IO Result
detectLanguageDebug text isPlainText hints flags =
  unsafeUseAsCStringLen (encodeUtf8 text) $ \(cStr,cLen) ->
    withMaybeCString (hintContentLanguage hints) $ \cContentLanguage ->
      withMaybeCString (hintTLD hints) $ \cTld ->
        allocaArray 3 $ \cLanguage3 ->
          allocaArray 3 $ \cPercent3 ->
            allocaArray 3 $ \cScore3 ->
              alloca $ \cNumChunksPtr ->
                alloca $ \cChunkOffsetsPtr ->
                  alloca $ \cChunkSizesPtr ->
                    alloca $ \cChunkLangsPtr ->
                      alloca $ \cTextBytesPtr ->
                        alloca $ \cIsReliablePtr ->
                          alloca $ \cResultLangPtr -> do
                            let cIsPlainText = boolToCInt isPlainText
                            let cEncodingHint =
                                    toEnum . fromEnum $ hintEncoding hints
                            let cLanguageHint =
                                    toEnum . fromEnum $ hintLanguage hints
                            let cFlags = flagsToCInt flags
                            let cBufferLen = toEnum cLen
                            (cChunkOffsets,cChunkSizes,cChunkLangs) <-
                                mask_ $ do
                                  cResult <-
                                      c_cld2_haskell_shim
                                      cResultLangPtr cStr cBufferLen
                                      cIsPlainText cContentLanguage cTld
                                      cEncodingHint cLanguageHint cFlags
                                      cLanguage3 cPercent3 cScore3
                                      cNumChunksPtr cChunkOffsetsPtr
                                      cChunkSizesPtr cChunkLangsPtr
                                      cTextBytesPtr cIsReliablePtr
                                  if (Errno cResult) == eNOMEM then
                                    throwIO HeapOverflow
                                  else if (Errno cResult) == eOK then
                                    do cNumChunks <-
                                           fromEnum <$> peek cNumChunksPtr
                                       cChunkOffsetsArray <-
                                           peek cChunkOffsetsPtr
                                       cChunkSizesArray <- peek cChunkSizesPtr
                                       cChunkLangsArray <- peek cChunkLangsPtr
                                       cChunkOffsets <-
                                          peekArray cNumChunks
                                                    cChunkOffsetsArray
                                       cChunkSizes <-
                                           peekArray cNumChunks cChunkSizesArray
                                       cChunkLangs <-
                                           peekArray cNumChunks cChunkLangsArray
                                       free cChunkOffsetsArray
                                       free cChunkSizesArray
                                       free cChunkLangsArray
                                       return (cChunkOffsets, cChunkSizes,
                                               cChunkLangs)
                                  else
                                    throwIO $
                                    AssertionFailed "unknown error in CLD2"
                            cResultLang <- peek cResultLangPtr
                            [cLanguage0,cLanguage1,cLanguage2] <-
                                peekArray 3 cLanguage3
                            [cPercent0, cPercent1, cPercent2] <-
                                peekArray 3 cPercent3
                            [cScore0, cScore1, cScore2] <-
                                peekArray 3 cScore3
                            _cNumChunks <- fromEnum <$> peek cNumChunksPtr
                            cTextBytes <- peek cTextBytesPtr
                            cIsReliable <- peek cIsReliablePtr
                            let theSimple =
                                    toEnum . fromEnum $ cResultLang
                            let theTop3 =
                                    (toEnum . fromEnum $ cLanguage0,
                                     toEnum . fromEnum $ cLanguage1,
                                     toEnum . fromEnum $ cLanguage2)
                            let theTop3Percent =
                                    (fromEnum cPercent0,
                                     fromEnum cPercent1,
                                     fromEnum cPercent2)
                            let (CDouble theScore0) = cScore0
                            let (CDouble theScore1) = cScore1
                            let (CDouble theScore2) = cScore2
                            let theChunks =
                                    (flip map)
                                    (zip3 cChunkOffsets cChunkSizes
                                          cChunkLangs)
                                    (\(offset,size,language) ->
                                         Chunk
                                         (fromEnum offset)
                                         (fromEnum size)
                                         (toEnum . fromEnum $ language))
                            let theTextBytes = fromEnum cTextBytes
                            let theIsReliable = cIntToBool cIsReliable
                            return $ Result
                                   theSimple theTop3 theTop3Percent
                                   (theScore0, theScore1, theScore2)
                                   theChunks theTextBytes theIsReliable


-- | Call 'detectLanguageDebug' with all debug flags disabled and
-- call 'unsafePerformIO' on the result. This is the recommended
-- function for most use cases.
detectLanguage ::
    Text  -- ^ The corpus to be analyzed
    -> Bool -- ^ True for plain text, False for HTML
    -> Hints -> Result
detectLanguage text isPlainText hints =
    unsafePerformIO $
    detectLanguageDebug text isPlainText hints defaultDebugFlags

-- | Call 'detectLanguage' on HTML input with no hints and return the
-- @resultSimple@ field of the result.
detectLanguageSimple :: Text -> Language
detectLanguageSimple text =
    resultSimple $ detectLanguage text False defaultHints