{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE StrictData #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
{-# OPTIONS_GHC -fno-warn-unused-matches #-}

-- Derived from AWS service descriptions, licensed under Apache 2.0.

-- |
-- Module      : Amazonka.MachineLearning.Types.S3DataSpec
-- Copyright   : (c) 2013-2023 Brendan Hay
-- License     : Mozilla Public License, v. 2.0.
-- Maintainer  : Brendan Hay
-- Stability   : auto-generated
-- Portability : non-portable (GHC extensions)
module Amazonka.MachineLearning.Types.S3DataSpec where

import qualified Amazonka.Core as Core
import qualified Amazonka.Core.Lens.Internal as Lens
import qualified Amazonka.Data as Data
import qualified Amazonka.Prelude as Prelude

-- | Describes the data specification of a @DataSource@.
--
-- /See:/ 'newS3DataSpec' smart constructor.
data S3DataSpec = S3DataSpec'
  { -- | A JSON string that represents the splitting and rearrangement processing
    -- to be applied to a @DataSource@. If the @DataRearrangement@ parameter is
    -- not provided, all of the input data is used to create the @Datasource@.
    --
    -- There are multiple parameters that control what data is used to create a
    -- datasource:
    --
    -- -   __@percentBegin@__
    --
    --     Use @percentBegin@ to indicate the beginning of the range of the
    --     data used to create the Datasource. If you do not include
    --     @percentBegin@ and @percentEnd@, Amazon ML includes all of the data
    --     when creating the datasource.
    --
    -- -   __@percentEnd@__
    --
    --     Use @percentEnd@ to indicate the end of the range of the data used
    --     to create the Datasource. If you do not include @percentBegin@ and
    --     @percentEnd@, Amazon ML includes all of the data when creating the
    --     datasource.
    --
    -- -   __@complement@__
    --
    --     The @complement@ parameter instructs Amazon ML to use the data that
    --     is not included in the range of @percentBegin@ to @percentEnd@ to
    --     create a datasource. The @complement@ parameter is useful if you
    --     need to create complementary datasources for training and
    --     evaluation. To create a complementary datasource, use the same
    --     values for @percentBegin@ and @percentEnd@, along with the
    --     @complement@ parameter.
    --
    --     For example, the following two datasources do not share any data,
    --     and can be used to train and evaluate a model. The first datasource
    --     has 25 percent of the data, and the second one has 75 percent of the
    --     data.
    --
    --     Datasource for evaluation:
    --     @{\"splitting\":{\"percentBegin\":0, \"percentEnd\":25}}@
    --
    --     Datasource for training:
    --     @{\"splitting\":{\"percentBegin\":0, \"percentEnd\":25, \"complement\":\"true\"}}@
    --
    -- -   __@strategy@__
    --
    --     To change how Amazon ML splits the data for a datasource, use the
    --     @strategy@ parameter.
    --
    --     The default value for the @strategy@ parameter is @sequential@,
    --     meaning that Amazon ML takes all of the data records between the
    --     @percentBegin@ and @percentEnd@ parameters for the datasource, in
    --     the order that the records appear in the input data.
    --
    --     The following two @DataRearrangement@ lines are examples of
    --     sequentially ordered training and evaluation datasources:
    --
    --     Datasource for evaluation:
    --     @{\"splitting\":{\"percentBegin\":70, \"percentEnd\":100, \"strategy\":\"sequential\"}}@
    --
    --     Datasource for training:
    --     @{\"splitting\":{\"percentBegin\":70, \"percentEnd\":100, \"strategy\":\"sequential\", \"complement\":\"true\"}}@
    --
    --     To randomly split the input data into the proportions indicated by
    --     the @percentBegin@ and @percentEnd@ parameters, set the @strategy@
    --     parameter to @random@ and provide a string that is used as the seed
    --     value for the random data splitting (for example, you can use the S3
    --     path to your data as the random seed string). If you choose the
    --     random split strategy, Amazon ML assigns each row of data a
    --     pseudo-random number between 0 and 100, and then selects the rows
    --     that have an assigned number between @percentBegin@ and
    --     @percentEnd@. Pseudo-random numbers are assigned using both the
    --     input seed string value and the byte offset as a seed, so changing
    --     the data results in a different split. Any existing ordering is
    --     preserved. The random splitting strategy ensures that variables in
    --     the training and evaluation data are distributed similarly. It is
    --     useful in the cases where the input data may have an implicit sort
    --     order, which would otherwise result in training and evaluation
    --     datasources containing non-similar data records.
    --
    --     The following two @DataRearrangement@ lines are examples of
    --     non-sequentially ordered training and evaluation datasources:
    --
    --     Datasource for evaluation:
    --     @{\"splitting\":{\"percentBegin\":70, \"percentEnd\":100, \"strategy\":\"random\", \"randomSeed\":\"s3:\/\/my_s3_path\/bucket\/file.csv\"}}@
    --
    --     Datasource for training:
    --     @{\"splitting\":{\"percentBegin\":70, \"percentEnd\":100, \"strategy\":\"random\", \"randomSeed\":\"s3:\/\/my_s3_path\/bucket\/file.csv\", \"complement\":\"true\"}}@
    dataRearrangement :: Prelude.Maybe Prelude.Text,
    -- | A JSON string that represents the schema for an Amazon S3 @DataSource@.
    -- The @DataSchema@ defines the structure of the observation data in the
    -- data file(s) referenced in the @DataSource@.
    --
    -- You must provide either the @DataSchema@ or the @DataSchemaLocationS3@.
    --
    -- Define your @DataSchema@ as a series of key-value pairs. @attributes@
    -- and @excludedVariableNames@ have an array of key-value pairs for their
    -- value. Use the following format to define your @DataSchema@.
    --
    -- > {
    -- >   "version": "1.0",
    -- >   "recordAnnotationFieldName": "F1",
    -- >   "recordWeightFieldName": "F2",
    -- >   "targetFieldName": "F3",
    -- >   "dataFormat": "CSV",
    -- >   "dataFileContainsHeader": true,
    -- >   "attributes": [
    -- >     { "fieldName": "F1", "fieldType": "TEXT" },
    -- >     { "fieldName": "F2", "fieldType": "NUMERIC" },
    -- >     { "fieldName": "F3", "fieldType": "CATEGORICAL" },
    -- >     { "fieldName": "F4", "fieldType": "NUMERIC" },
    -- >     { "fieldName": "F5", "fieldType": "CATEGORICAL" },
    -- >     { "fieldName": "F6", "fieldType": "TEXT" },
    -- >     { "fieldName": "F7", "fieldType": "WEIGHTED_INT_SEQUENCE" },
    -- >     { "fieldName": "F8", "fieldType": "WEIGHTED_STRING_SEQUENCE" }
    -- >   ],
    -- >   "excludedVariableNames": [ "F6" ]
    -- > }
    dataSchema :: Prelude.Maybe Prelude.Text,
    -- | Describes the schema location in Amazon S3. You must provide either the
    -- @DataSchema@ or the @DataSchemaLocationS3@.
    dataSchemaLocationS3 :: Prelude.Maybe Prelude.Text,
    -- | The location of the data file(s) used by a @DataSource@. The URI
    -- specifies a data file or an Amazon Simple Storage Service (Amazon S3)
    -- directory or bucket containing data files.
    dataLocationS3 :: Prelude.Text
  }
  deriving (Prelude.Eq, Prelude.Read, Prelude.Show, Prelude.Generic)

-- |
-- Create a value of 'S3DataSpec' with all optional fields omitted.
--
-- Only the required 'dataLocationS3' is taken as an argument. The optional
-- fields start out as 'Prelude.Nothing' and can be set afterwards with
-- record update syntax or with the lenses defined in this module.
--
-- The following record fields are available, with the corresponding lenses
-- provided for backwards compatibility (the complete description of each
-- field is attached to the field itself in 'S3DataSpec'):
--
-- -   'dataRearrangement', 's3DataSpec_dataRearrangement' - A JSON string that
--     represents the splitting and rearrangement processing to be applied
--     to a @DataSource@. If it is not provided, all of the input data is
--     used to create the @Datasource@.
--
-- -   'dataSchema', 's3DataSpec_dataSchema' - A JSON string that represents
--     the schema for an Amazon S3 @DataSource@. You must provide either the
--     @DataSchema@ or the @DataSchemaLocationS3@.
--
-- -   'dataSchemaLocationS3', 's3DataSpec_dataSchemaLocationS3' - Describes
--     the schema location in Amazon S3. You must provide either the
--     @DataSchema@ or the @DataSchemaLocationS3@.
--
-- -   'dataLocationS3', 's3DataSpec_dataLocationS3' - The location of the data
--     file(s) used by a @DataSource@. The URI specifies a data file or an
--     Amazon Simple Storage Service (Amazon S3) directory or bucket
--     containing data files.
newS3DataSpec ::
  -- | 'dataLocationS3'
  Prelude.Text ->
  S3DataSpec
newS3DataSpec pDataLocationS3_ =
  S3DataSpec'
    { dataRearrangement = Prelude.Nothing,
      dataSchema = Prelude.Nothing,
      dataSchemaLocationS3 = Prelude.Nothing,
      dataLocationS3 = pDataLocationS3_
    }

-- | Lens onto the optional 'dataRearrangement' field: a JSON string that
-- represents the splitting and rearrangement processing to be applied to a
-- @DataSource@. If the @DataRearrangement@ parameter is not provided, all of
-- the input data is used to create the @Datasource@.
--
-- The supported @splitting@ parameters (@percentBegin@, @percentEnd@,
-- @complement@ and @strategy@) and worked examples are described on the
-- 'dataRearrangement' field of 'S3DataSpec'.
s3DataSpec_dataRearrangement :: Lens.Lens' S3DataSpec (Prelude.Maybe Prelude.Text)
s3DataSpec_dataRearrangement = Lens.lens (\S3DataSpec' {dataRearrangement} -> dataRearrangement) (\s@S3DataSpec' {} a -> s {dataRearrangement = a} :: S3DataSpec)

-- | Lens onto the optional 'dataSchema' field: a JSON string that represents
-- the schema for an Amazon S3 @DataSource@. The @DataSchema@ defines the
-- structure of the observation data in the data file(s) referenced in the
-- @DataSource@.
--
-- You must provide either the @DataSchema@ or the @DataSchemaLocationS3@.
--
-- The expected JSON layout is shown on the 'dataSchema' field of
-- 'S3DataSpec'.
s3DataSpec_dataSchema :: Lens.Lens' S3DataSpec (Prelude.Maybe Prelude.Text)
s3DataSpec_dataSchema = Lens.lens (\S3DataSpec' {dataSchema} -> dataSchema) (\s@S3DataSpec' {} a -> s {dataSchema = a} :: S3DataSpec)

-- | Describes the schema location in Amazon S3. You must provide either the
-- @DataSchema@ or the @DataSchemaLocationS3@.
s3DataSpec_dataSchemaLocationS3 :: Lens.Lens' S3DataSpec (Prelude.Maybe Prelude.Text)
s3DataSpec_dataSchemaLocationS3 = Lens.lens (\S3DataSpec' {dataSchemaLocationS3} -> dataSchemaLocationS3) (\s@S3DataSpec' {} a -> s {dataSchemaLocationS3 = a} :: S3DataSpec)

-- | The location of the data file(s) used by a @DataSource@. The URI
-- specifies a data file or an Amazon Simple Storage Service (Amazon S3)
-- directory or bucket containing data files.
s3DataSpec_dataLocationS3 :: Lens.Lens' S3DataSpec Prelude.Text
s3DataSpec_dataLocationS3 = Lens.lens (\S3DataSpec' {dataLocationS3} -> dataLocationS3) (\s@S3DataSpec' {} a -> s {dataLocationS3 = a} :: S3DataSpec)

-- Feeds every field into the salt, in declaration order.
instance Prelude.Hashable S3DataSpec where
  hashWithSalt _salt S3DataSpec' {..} =
    _salt
      `Prelude.hashWithSalt` dataRearrangement
      `Prelude.hashWithSalt` dataSchema
      `Prelude.hashWithSalt` dataSchemaLocationS3
      `Prelude.hashWithSalt` dataLocationS3

-- Applies 'Prelude.rnf' to each field in turn, chained with 'Prelude.seq'.
instance Prelude.NFData S3DataSpec where
  rnf S3DataSpec' {..} =
    Prelude.rnf dataRearrangement
      `Prelude.seq` Prelude.rnf dataSchema
      `Prelude.seq` Prelude.rnf dataSchemaLocationS3
      `Prelude.seq` Prelude.rnf dataLocationS3

-- Builds a JSON object keyed by the service's member names. Optional fields
-- that are 'Prelude.Nothing' are dropped by 'Prelude.catMaybes', so only the
-- values that were set are emitted; @DataLocationS3@ is always present.
instance Data.ToJSON S3DataSpec where
  toJSON S3DataSpec' {..} =
    Data.object
      ( Prelude.catMaybes
          [ ("DataRearrangement" Data..=)
              Prelude.<$> dataRearrangement,
            ("DataSchema" Data..=) Prelude.<$> dataSchema,
            ("DataSchemaLocationS3" Data..=)
              Prelude.<$> dataSchemaLocationS3,
            Prelude.Just
              ("DataLocationS3" Data..= dataLocationS3)
          ]
      )