My dataset includes both categorical and numerical features, I have tried this:
"categorical_index": TrainingInput(
s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
"categorical_index"
].S3Output.S3Uri,
content_type="text/json",
),
as shown in the code below but it doesn't seem to solve the problem.
anybody has an idea?
#****************************************************************************************************************************TRAIN
# training step for generating model artifacts
from sagemaker import image_uris, model_uris, script_uris
from sagemaker import hyperparameters
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base
train_model_id, train_model_version, train_scope = "catboost-classification-model", "*", "training"
# Retrieve the docker image
train_image_uri = image_uris.retrieve(
region=None,
framework=None,
model_id=train_model_id,
model_version=train_model_version,
image_scope=train_scope,
instance_type=training_instance_type
)
# Retrieve the training script
train_source_uri = script_uris.retrieve(
model_id=train_model_id, model_version=train_model_version, script_scope=train_scope
)
train_model_uri = model_uris.retrieve(
model_id=train_model_id, model_version=train_model_version, model_scope=train_scope
)
# Retrieve the default hyperparameters for training the model
hyperparameters = hyperparameters.retrieve_default(
model_id=train_model_id, model_version=train_model_version
)
# [Optional] Override default hyperparameters with custom values
hyperparameters["iterations"] = "1500"
hyperparameters["early_stopping_rounds"] = "100"
hyperparameters["eval_metric"] = "F1"
hyperparameters["learning_rate"] = "0.2"
print(hyperparameters)
s3_output_location= f"s3://{default_bucket}/{base_job_prefix}/"
# Create SageMaker Estimator instance
tabular_estimator = Estimator(
role=role,
image_uri=train_image_uri,
source_dir=train_source_uri,
model_uri=train_model_uri,
entry_point="transfer_learning.py",
instance_count=1,
instance_type=training_instance_type,
max_run=360000,
hyperparameters=hyperparameters,
output_path=s3_output_location
)
# Launch a SageMaker Training job by passing s3 path of the training data
from sagemaker.workflow.steps import CacheConfig
cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
step_train = TrainingStep(
name="TrainTeesModel",
estimator=tabular_estimator,
inputs={
"training": TrainingInput(
s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
"train"
].S3Output.S3Uri,
content_type="text/csv",
),
"validation": TrainingInput(
s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
"validation"
].S3Output.S3Uri,
content_type="text/csv",
),
"categorical_index": TrainingInput(
s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
"categorical_index"
].S3Output.S3Uri,
content_type="text/json",
),
},
)
Catboost is suposed to handle the categorical features , which also become helpful when running the inference pipeline consuming the model previously built (correct me if I am wrong) Now since I am using Catboost from Jumpstart, was wondering if there could be a clear way to pass those categorical features index to the training step.