-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Unicode normalization
--   
--   Fast Unicode 8.0 normalization in Haskell (NFC, NFKC, NFD, NFKD).
@package unicode-transforms
@version 0.2.0


-- | Character set normalization functions for Unicode. The documentation
--   and API in this module are largely borrowed from <tt>text-icu</tt>.
module Data.Unicode.Types

-- | Normalization transforms Unicode text into an equivalent composed or
--   decomposed form, allowing for easier sorting and searching of text.
--   Standard normalization forms are described in
--   <a>http://www.unicode.org/unicode/reports/tr15/</a>, Unicode Standard
--   Annex #15: Unicode Normalization Forms.
--   
--   Characters with accents or other adornments can be encoded in several
--   different ways in Unicode. For example, take the character A-acute. In
--   Unicode, this can be encoded as a single character (the "composed"
--   form):
--   
--   <pre>
--   00C1    LATIN CAPITAL LETTER A WITH ACUTE
--   </pre>
--   
--   or as two separate characters (the "decomposed" form):
--   
--   <pre>
--   0041    LATIN CAPITAL LETTER A
--   0301    COMBINING ACUTE ACCENT
--   </pre>
--   
--   To a user of your program, however, both of these sequences should be
--   treated as the same "user-level" character "A with acute accent". When
--   you are searching or comparing text, you must ensure that these two
--   sequences are treated equivalently. In addition, you must handle
--   characters with more than one accent. Sometimes the order of a
--   character's combining accents is significant, while in other cases
--   accent sequences in different orders are really equivalent.
--   
--   Similarly, the string "ffi" can be encoded as three separate letters:
--   
--   <pre>
--   0066    LATIN SMALL LETTER F
--   0066    LATIN SMALL LETTER F
--   0069    LATIN SMALL LETTER I
--   </pre>
--   
--   or as the single character
--   
--   <pre>
--   FB03    LATIN SMALL LIGATURE FFI
--   </pre>
--   
--   The "ffi" ligature is not a distinct semantic character, and strictly
--   speaking it shouldn't be in Unicode at all, but it was included for
--   compatibility with existing character sets that already provided it.
--   The Unicode standard identifies such characters by giving them
--   "compatibility" decompositions into the corresponding semantic
--   characters. When sorting and searching, you will often want to use
--   these mappings.
--   
--   Normalization helps solve these problems by transforming text into the
--   canonical composed and decomposed forms as shown in the first example
--   above. In addition, you can have it perform compatibility
--   decompositions so that you can treat compatibility characters the same
--   as their equivalents. Finally, normalization rearranges accents into
--   the proper canonical order, so that you do not have to worry about
--   accent rearrangement on your own.
--   
--   The W3C generally recommends exchanging text in <a>NFC</a>. Note
--   also that most legacy character encodings use only precomposed forms
--   and often do not encode any combining marks by themselves. To
--   convert to such character encodings, the Unicode text needs to be
--   normalized to <a>NFC</a>. For more usage examples, see the Unicode
--   Standard Annex.
data NormalizationMode

-- | Canonical decomposition.
NFD :: NormalizationMode

-- | Compatibility decomposition.
NFKD :: NormalizationMode

-- | Canonical decomposition followed by canonical composition.
NFC :: NormalizationMode

-- | Compatibility decomposition followed by canonical composition.
NFKC :: NormalizationMode
instance GHC.Enum.Enum Data.Unicode.Types.NormalizationMode
instance GHC.Show.Show Data.Unicode.Types.NormalizationMode
instance GHC.Classes.Eq Data.Unicode.Types.NormalizationMode


-- | Unicode normalization for the <tt>Text</tt> data type.
module Data.Text.Normalize

-- | Normalization transforms Unicode text into an equivalent composed or
--   decomposed form, allowing for easier sorting and searching of text.
--   Standard normalization forms are described in
--   <a>http://www.unicode.org/unicode/reports/tr15/</a>, Unicode Standard
--   Annex #15: Unicode Normalization Forms.
--   
--   Characters with accents or other adornments can be encoded in several
--   different ways in Unicode. For example, take the character A-acute. In
--   Unicode, this can be encoded as a single character (the "composed"
--   form):
--   
--   <pre>
--   00C1    LATIN CAPITAL LETTER A WITH ACUTE
--   </pre>
--   
--   or as two separate characters (the "decomposed" form):
--   
--   <pre>
--   0041    LATIN CAPITAL LETTER A
--   0301    COMBINING ACUTE ACCENT
--   </pre>
--   
--   To a user of your program, however, both of these sequences should be
--   treated as the same "user-level" character "A with acute accent". When
--   you are searching or comparing text, you must ensure that these two
--   sequences are treated equivalently. In addition, you must handle
--   characters with more than one accent. Sometimes the order of a
--   character's combining accents is significant, while in other cases
--   accent sequences in different orders are really equivalent.
--   
--   Similarly, the string "ffi" can be encoded as three separate letters:
--   
--   <pre>
--   0066    LATIN SMALL LETTER F
--   0066    LATIN SMALL LETTER F
--   0069    LATIN SMALL LETTER I
--   </pre>
--   
--   or as the single character
--   
--   <pre>
--   FB03    LATIN SMALL LIGATURE FFI
--   </pre>
--   
--   The "ffi" ligature is not a distinct semantic character, and strictly
--   speaking it shouldn't be in Unicode at all, but it was included for
--   compatibility with existing character sets that already provided it.
--   The Unicode standard identifies such characters by giving them
--   "compatibility" decompositions into the corresponding semantic
--   characters. When sorting and searching, you will often want to use
--   these mappings.
--   
--   Normalization helps solve these problems by transforming text into the
--   canonical composed and decomposed forms as shown in the first example
--   above. In addition, you can have it perform compatibility
--   decompositions so that you can treat compatibility characters the same
--   as their equivalents. Finally, normalization rearranges accents into
--   the proper canonical order, so that you do not have to worry about
--   accent rearrangement on your own.
--   
--   The W3C generally recommends exchanging text in <a>NFC</a>. Note
--   also that most legacy character encodings use only precomposed forms
--   and often do not encode any combining marks by themselves. To
--   convert to such character encodings, the Unicode text needs to be
--   normalized to <a>NFC</a>. For more usage examples, see the Unicode
--   Standard Annex.
data NormalizationMode

-- | Canonical decomposition.
NFD :: NormalizationMode

-- | Compatibility decomposition.
NFKD :: NormalizationMode

-- | Canonical decomposition followed by canonical composition.
NFC :: NormalizationMode

-- | Compatibility decomposition followed by canonical composition.
NFKC :: NormalizationMode

-- | Perform Unicode normalization on <tt>Text</tt> according to the
--   specified normalization mode.
normalize :: NormalizationMode -> Text -> Text


-- | Unicode normalization for the <tt>ByteString</tt> data type.

-- | <i>Deprecated: Convert ByteString to Text and then normalize</i>
module Data.ByteString.UTF8.Normalize

-- | Normalization transforms Unicode text into an equivalent composed or
--   decomposed form, allowing for easier sorting and searching of text.
--   Standard normalization forms are described in
--   <a>http://www.unicode.org/unicode/reports/tr15/</a>, Unicode Standard
--   Annex #15: Unicode Normalization Forms.
--   
--   Characters with accents or other adornments can be encoded in several
--   different ways in Unicode. For example, take the character A-acute. In
--   Unicode, this can be encoded as a single character (the "composed"
--   form):
--   
--   <pre>
--   00C1    LATIN CAPITAL LETTER A WITH ACUTE
--   </pre>
--   
--   or as two separate characters (the "decomposed" form):
--   
--   <pre>
--   0041    LATIN CAPITAL LETTER A
--   0301    COMBINING ACUTE ACCENT
--   </pre>
--   
--   To a user of your program, however, both of these sequences should be
--   treated as the same "user-level" character "A with acute accent". When
--   you are searching or comparing text, you must ensure that these two
--   sequences are treated equivalently. In addition, you must handle
--   characters with more than one accent. Sometimes the order of a
--   character's combining accents is significant, while in other cases
--   accent sequences in different orders are really equivalent.
--   
--   Similarly, the string "ffi" can be encoded as three separate letters:
--   
--   <pre>
--   0066    LATIN SMALL LETTER F
--   0066    LATIN SMALL LETTER F
--   0069    LATIN SMALL LETTER I
--   </pre>
--   
--   or as the single character
--   
--   <pre>
--   FB03    LATIN SMALL LIGATURE FFI
--   </pre>
--   
--   The "ffi" ligature is not a distinct semantic character, and strictly
--   speaking it shouldn't be in Unicode at all, but it was included for
--   compatibility with existing character sets that already provided it.
--   The Unicode standard identifies such characters by giving them
--   "compatibility" decompositions into the corresponding semantic
--   characters. When sorting and searching, you will often want to use
--   these mappings.
--   
--   Normalization helps solve these problems by transforming text into the
--   canonical composed and decomposed forms as shown in the first example
--   above. In addition, you can have it perform compatibility
--   decompositions so that you can treat compatibility characters the same
--   as their equivalents. Finally, normalization rearranges accents into
--   the proper canonical order, so that you do not have to worry about
--   accent rearrangement on your own.
--   
--   The W3C generally recommends exchanging text in <a>NFC</a>. Note
--   also that most legacy character encodings use only precomposed forms
--   and often do not encode any combining marks by themselves. To
--   convert to such character encodings, the Unicode text needs to be
--   normalized to <a>NFC</a>. For more usage examples, see the Unicode
--   Standard Annex.
data NormalizationMode

-- | Canonical decomposition.
NFD :: NormalizationMode

-- | Compatibility decomposition.
NFKD :: NormalizationMode

-- | Canonical decomposition followed by canonical composition.
NFC :: NormalizationMode

-- | Compatibility decomposition followed by canonical composition.
NFKC :: NormalizationMode

-- | Perform Unicode normalization on a UTF-8 encoded <tt>ByteString</tt>
--   according to the specified normalization mode.
normalize :: NormalizationMode -> ByteString -> ByteString