ECS Fargate container deployment stuck "in progress" for 6 hours with no feedback

0

I have created an ECS Task Definition using the new EBS support as follows:

{
  "taskDefinitionArn": "arn:aws:ecs:eu-central-1:6634********:task-definition/pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881:3",
  "containerDefinitions": [
    {
      "name": "restate",
      "image": "docker.io/restatedev/restate:0.7.0",
      "cpu": 0,
      "links": [],
      "portMappings": [
        {
          "containerPort": 8080,
          "hostPort": 8080,
          "protocol": "tcp"
        },
        {
          "containerPort": 9070,
          "hostPort": 9070,
          "protocol": "tcp"
        }
      ],
      "essential": true,
      "entryPoint": [],
      "command": [],
      "environment": [
        {
          "name": "LOG_FORMAT",
          "value": "Json"
        }
      ],
      "environmentFiles": [],
      "mountPoints": [
        {
          "sourceVolume": "data",
          "containerPath": "/target"
        }
      ],
      "volumesFrom": [],
      "secrets": [],
      "dnsServers": [],
      "dnsSearchDomains": [],
      "extraHosts": [],
      "dockerSecurityOptions": [],
      "dockerLabels": {},
      "ulimits": [],
      "logConfiguration": {
        "logDriver": "awslogs",
        "options": {
          "awslogs-group": "/restate/restate-server-logs",
          "awslogs-region": "eu-central-1",
          "awslogs-stream-prefix": "restate"
        },
        "secretOptions": []
      },
      "systemControls": []
    }
  ],
  "family": "pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881",
  "taskRoleArn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-DataPlaneCoreTaskDefTaskR-KjAhnSvpZzRU",
  "executionRoleArn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-DataPlaneCoreTaskDefExecu-8AMoiMnipDgs",
  "networkMode": "awsvpc",
  "revision": 3,
  "volumes": [
    {
      "name": "data",
      "configuredAtLaunch": true
    }
  ],
  "status": "ACTIVE",
  "requiresAttributes": [
    {
      "name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
    },
    {
      "name": "ecs.capability.execution-role-awslogs"
    },
    {
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
    },
    {
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.17"
    },
    {
      "name": "com.amazonaws.ecs.capability.task-iam-role"
    },
    {
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
    },
    {
      "name": "ecs.capability.task-eni"
    }
  ],
  "placementConstraints": [],
  "compatibilities": [
    "EC2",
    "FARGATE"
  ],
  "requiresCompatibilities": [
    "FARGATE"
  ],
  "cpu": "256",
  "memory": "512",
  "registeredAt": "2024-01-29T13:42:37.988Z",
  "registeredBy": "arn:aws:sts::6634********:assumed-role/cdk-hnb659fds-cfn-exec-role-6634********-eu-central-1/AWSCloudFormation",
  "tags": []
}

I tried to start it as follows:

await ecsClient.send(
    new ecs.CreateServiceCommand({
        cluster,
        taskDefinition,
        serviceName: "restate-test-3",
        deploymentController: {
            type: "ECS",
        },
        desiredCount: 1,
        launchType: "FARGATE",
        networkConfiguration: {
            awsvpcConfiguration: {
                subnets: ["subnet-0fa274bd75f089910"],
            },
        },
        deploymentConfiguration: {
            minimumHealthyPercent: 0,
            maximumPercent: 100,
        },
        volumeConfigurations: [
            {
                name: "data",
                managedEBSVolume: {
                    volumeType: "gp3",
                    sizeInGiB: 1,
                    roleArn: "arn:aws:iam::663487780041:role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW",
                },
            },
        ],
        tags: [
            {
                key: "restate:environment_id",
                value: "env-123456",
            },
            {
                key: "restate:account_id",
                value: "tenant-012345",
            },
        ],
    }),
)

It's been 6 hours and ECS still says there's a deployment in progress with zero other feedback:

% aws ecs describe-services --cluster core --services test-3 | jq
{
  "services": [
    {
      "serviceArn": "arn:aws:ecs:eu-central-1:6634********:service/core/test-3",
      "serviceName": "test-3",
      "clusterArn": "arn:aws:ecs:eu-central-1:6634********:cluster/core",
      "loadBalancers": [],
      "serviceRegistries": [],
      "status": "ACTIVE",
      "desiredCount": 1,
      "runningCount": 0,
      "pendingCount": 0,
      "launchType": "FARGATE",
      "platformVersion": "LATEST",
      "platformFamily": "Linux",
      "taskDefinition": "arn:aws:ecs:eu-central-1:6634********:task-definition/pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881:3",
      "deploymentConfiguration": {
        "deploymentCircuitBreaker": {
          "enable": false,
          "rollback": false
        },
        "maximumPercent": 100,
        "minimumHealthyPercent": 0
      },
      "deployments": [
        {
          "id": "ecs-svc/5513642900068321699",
          "status": "PRIMARY",
          "taskDefinition": "arn:aws:ecs:eu-central-1:6634********:task-definition/pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881:3",
          "desiredCount": 1,
          "pendingCount": 0,
          "runningCount": 0,
          "failedTasks": 0,
          "createdAt": "2024-01-29T16:55:38.714000+02:00",
          "updatedAt": "2024-01-29T17:25:34.507000+02:00",
          "launchType": "FARGATE",
          "platformVersion": "1.4.0",
          "platformFamily": "Linux",
          "networkConfiguration": {
            "awsvpcConfiguration": {
              "subnets": [
                "subnet-0fa274bd75f089910"
              ],
              "securityGroups": [],
              "assignPublicIp": "DISABLED"
            }
          },
          "rolloutState": "IN_PROGRESS",
          "rolloutStateReason": "ECS deployment ecs-svc/5513642900068321699 in progress.",
          "volumeConfigurations": [
            {
              "name": "data",
              "managedEBSVolume": {
                "volumeType": "gp3",
                "sizeInGiB": 1,
                "tagSpecifications": [
                  {
                    "resourceType": "volume"
                  }
                ],
                "roleArn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW",
                "filesystemType": "xfs"
              }
            }
          ]
        }
      ],
      "roleArn": "arn:aws:iam::6634********:role/aws-service-role/ecs.amazonaws.com/AWSServiceRoleForECS",
      "events": [],
      "createdAt": "2024-01-29T16:55:38.714000+02:00",
      "placementConstraints": [],
      "placementStrategy": [],
      "networkConfiguration": {
        "awsvpcConfiguration": {
          "subnets": [
            "subnet-0fa274bd75f089910"
          ],
          "securityGroups": [],
          "assignPublicIp": "DISABLED"
        }
      },
      "schedulingStrategy": "REPLICA",
      "deploymentController": {
        "type": "ECS"
      },
      "createdBy": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-RestateServiceLauncherRol-fQYDMQMxXRWm",
      "enableECSManagedTags": false,
      "propagateTags": "NONE",
      "enableExecuteCommand": false
    }
  ],
  "failures": []
}

In CloudTrail, I see repeated EBS CreateVolume calls made by the "ECSTaskVolumesForEBS" session, each recorded with the error code Client.DryRunOperation ("Request would have succeeded, but DryRun flag is set."):

{
  "eventVersion": "1.09",
  "userIdentity": {
    "type": "AssumedRole",
    "principalId": "ARO******:ECSTaskVolumesForEBS",
    "arn": "arn:aws:sts::6634********:assumed-role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW/ECSTaskVolumesForEBS",
    "accountId": "6634********",
    "sessionContext": {
      "sessionIssuer": {
        "type": "Role",
        "principalId": "ARO******",
        "arn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW",
        "accountId": "6634********",
        "userName": "pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW"
      },
      "attributes": {
        "creationDate": "2024-01-29T14:53:47Z",
        "mfaAuthenticated": "false"
      }
    },
    "invokedBy": "ecs.amazonaws.com"
  },
  "eventTime": "2024-01-29T14:53:47Z",
  "eventSource": "ec2.amazonaws.com",
  "eventName": "CreateVolume",
  "awsRegion": "eu-central-1",
  "sourceIPAddress": "ecs.amazonaws.com",
  "userAgent": "ecs.amazonaws.com",
  "errorCode": "Client.DryRunOperation",
  "errorMessage": "Request would have succeeded, but DryRun flag is set.",
  "requestParameters": {
    "size": "1",
    "zone": "eu-central-1b",
    "volumeType": "gp3",
    "tagSpecificationSet": {
      "items": [
        {
          "resourceType": "volume",
          "tags": [
            {
              "key": "AmazonECSManaged",
              "value": "true"
            },
            {
              "key": "AmazonECSCreated",
              "value": "arn:aws:ecs:eu-central-1:6634********:task/dev-core/*"
            }
          ]
        }
      ]
    },
    "multiAttachEnabled": false,
    "clientToken": "a45a9892-4aed-4283-acfe-90ea6de041c3"
  },
  "responseElements": null,
  "requestID": "f92328ac-92f1-4554-ab9e-66e8fdbfae1c",
  "eventID": "266b6220-0d47-415f-839d-d5017bd0809e",
  "readOnly": false,
  "eventType": "AwsApiCall",
  "managementEvent": true,
  "recipientAccountId": "6634********",
  "vpcEndpointId": "vpce-0d4186aa25991abed",
  "eventCategory": "Management"
}

Is this an ECS bug?

1 Answer
0

Hello.

Have you attached the IAM policy "AmazonECSInfrastructureRolePolicyForVolumes" to your ECS infrastructure role?
This IAM policy includes actions to create an EBS volume.
https://docs.aws.amazon.com/AmazonECS/latest/userguide/security-iam-awsmanpol.html

profile picture
EXPERT
answered 3 months ago
  • I have! The role pavel-ContainerSandboxSta-EcsInfrastructureRole2786 has AmazonECSInfrastructureRolePolicyForVolumes policy attached and can be assumed by ecs.amazonaws.com. If ECS didn't have appropriate permissions, I shouldn't be seeing the successful dry-run attempts in CloudTrail...

You are not logged in. Log in to post an answer.

A good answer clearly answers the question, provides constructive feedback, and encourages professional growth in the question asker.

Guidelines for Answering Questions