ECS Fargate container deployment stuck "in progress" for 6 hours with no feedback

0

I have created an ECS Task Definition using the new EBS support as follows:

{
  "taskDefinitionArn": "arn:aws:ecs:eu-central-1:6634********:task-definition/pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881:3",
  "containerDefinitions": [
    {
      "name": "restate",
      "image": "docker.io/restatedev/restate:0.7.0",
      "cpu": 0,
      "links": [],
      "portMappings": [
        {
          "containerPort": 8080,
          "hostPort": 8080,
          "protocol": "tcp"
        },
        {
          "containerPort": 9070,
          "hostPort": 9070,
          "protocol": "tcp"
        }
      ],
      "essential": true,
      "entryPoint": [],
      "command": [],
      "environment": [
        {
          "name": "LOG_FORMAT",
          "value": "Json"
        }
      ],
      "environmentFiles": [],
      "mountPoints": [
        {
          "sourceVolume": "data",
          "containerPath": "/target"
        }
      ],
      "volumesFrom": [],
      "secrets": [],
      "dnsServers": [],
      "dnsSearchDomains": [],
      "extraHosts": [],
      "dockerSecurityOptions": [],
      "dockerLabels": {},
      "ulimits": [],
      "logConfiguration": {
        "logDriver": "awslogs",
        "options": {
          "awslogs-group": "/restate/restate-server-logs",
          "awslogs-region": "eu-central-1",
          "awslogs-stream-prefix": "restate"
        },
        "secretOptions": []
      },
      "systemControls": []
    }
  ],
  "family": "pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881",
  "taskRoleArn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-DataPlaneCoreTaskDefTaskR-KjAhnSvpZzRU",
  "executionRoleArn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-DataPlaneCoreTaskDefExecu-8AMoiMnipDgs",
  "networkMode": "awsvpc",
  "revision": 3,
  "volumes": [
    {
      "name": "data",
      "configuredAtLaunch": true
    }
  ],
  "status": "ACTIVE",
  "requiresAttributes": [
    {
      "name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
    },
    {
      "name": "ecs.capability.execution-role-awslogs"
    },
    {
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
    },
    {
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.17"
    },
    {
      "name": "com.amazonaws.ecs.capability.task-iam-role"
    },
    {
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
    },
    {
      "name": "ecs.capability.task-eni"
    }
  ],
  "placementConstraints": [],
  "compatibilities": [
    "EC2",
    "FARGATE"
  ],
  "requiresCompatibilities": [
    "FARGATE"
  ],
  "cpu": "256",
  "memory": "512",
  "registeredAt": "2024-01-29T13:42:37.988Z",
  "registeredBy": "arn:aws:sts::6634********:assumed-role/cdk-hnb659fds-cfn-exec-role-6634********-eu-central-1/AWSCloudFormation",
  "tags": []
}

I tried to start it as follows:

await ecsClient.send(
    new ecs.CreateServiceCommand({
        cluster,
        taskDefinition,
        serviceName: "restate-test-3",
        deploymentController: {
            type: "ECS",
        },
        desiredCount: 1,
        launchType: "FARGATE",
        networkConfiguration: {
            awsvpcConfiguration: {
                subnets: ["subnet-0fa274bd75f089910"],
            },
        },
        deploymentConfiguration: {
            minimumHealthyPercent: 0,
            maximumPercent: 100,
        },
        volumeConfigurations: [
            {
                name: "data",
                managedEBSVolume: {
                    volumeType: "gp3",
                    sizeInGiB: 1,
                    roleArn: "arn:aws:iam::663487780041:role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW",
                },
            },
        ],
        tags: [
            {
                key: "restate:environment_id",
                value: "env-123456",
            },
            {
                key: "restate:account_id",
                value: "tenant-012345",
            },
        ],
    }),
)

It's been 6 hours and ECS still says there's a deployment in progress with zero other feedback:

% aws ecs describe-services --cluster core --services test-3 | jq
{
  "services": [
    {
      "serviceArn": "arn:aws:ecs:eu-central-1:6634********:service/core/test-3",
      "serviceName": "test-3",
      "clusterArn": "arn:aws:ecs:eu-central-1:6634********:cluster/core",
      "loadBalancers": [],
      "serviceRegistries": [],
      "status": "ACTIVE",
      "desiredCount": 1,
      "runningCount": 0,
      "pendingCount": 0,
      "launchType": "FARGATE",
      "platformVersion": "LATEST",
      "platformFamily": "Linux",
      "taskDefinition": "arn:aws:ecs:eu-central-1:6634********:task-definition/pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881:3",
      "deploymentConfiguration": {
        "deploymentCircuitBreaker": {
          "enable": false,
          "rollback": false
        },
        "maximumPercent": 100,
        "minimumHealthyPercent": 0
      },
      "deployments": [
        {
          "id": "ecs-svc/5513642900068321699",
          "status": "PRIMARY",
          "taskDefinition": "arn:aws:ecs:eu-central-1:6634********:task-definition/pavelContainerSandboxStackDataPlaneCoreTaskDef033D5881:3",
          "desiredCount": 1,
          "pendingCount": 0,
          "runningCount": 0,
          "failedTasks": 0,
          "createdAt": "2024-01-29T16:55:38.714000+02:00",
          "updatedAt": "2024-01-29T17:25:34.507000+02:00",
          "launchType": "FARGATE",
          "platformVersion": "1.4.0",
          "platformFamily": "Linux",
          "networkConfiguration": {
            "awsvpcConfiguration": {
              "subnets": [
                "subnet-0fa274bd75f089910"
              ],
              "securityGroups": [],
              "assignPublicIp": "DISABLED"
            }
          },
          "rolloutState": "IN_PROGRESS",
          "rolloutStateReason": "ECS deployment ecs-svc/5513642900068321699 in progress.",
          "volumeConfigurations": [
            {
              "name": "data",
              "managedEBSVolume": {
                "volumeType": "gp3",
                "sizeInGiB": 1,
                "tagSpecifications": [
                  {
                    "resourceType": "volume"
                  }
                ],
                "roleArn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW",
                "filesystemType": "xfs"
              }
            }
          ]
        }
      ],
      "roleArn": "arn:aws:iam::6634********:role/aws-service-role/ecs.amazonaws.com/AWSServiceRoleForECS",
      "events": [],
      "createdAt": "2024-01-29T16:55:38.714000+02:00",
      "placementConstraints": [],
      "placementStrategy": [],
      "networkConfiguration": {
        "awsvpcConfiguration": {
          "subnets": [
            "subnet-0fa274bd75f089910"
          ],
          "securityGroups": [],
          "assignPublicIp": "DISABLED"
        }
      },
      "schedulingStrategy": "REPLICA",
      "deploymentController": {
        "type": "ECS"
      },
      "createdBy": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-RestateServiceLauncherRol-fQYDMQMxXRWm",
      "enableECSManagedTags": false,
      "propagateTags": "NONE",
      "enableExecuteCommand": false
    }
  ],
  "failures": []
}

In CloudTrail, I see repeated attempts to call EBS CreateVolume from "ECSTaskVolumesForEBS" that are recorded as errors only because they are dry-run calls:

{
  "eventVersion": "1.09",
  "userIdentity": {
    "type": "AssumedRole",
    "principalId": "ARO******:ECSTaskVolumesForEBS",
    "arn": "arn:aws:sts::6634********:assumed-role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW/ECSTaskVolumesForEBS",
    "accountId": "6634********",
    "sessionContext": {
      "sessionIssuer": {
        "type": "Role",
        "principalId": "ARO******",
        "arn": "arn:aws:iam::6634********:role/pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW",
        "accountId": "6634********",
        "userName": "pavel-ContainerSandboxSta-EcsInfrastructureRole2786-r6uB2XmsO6mW"
      },
      "attributes": {
        "creationDate": "2024-01-29T14:53:47Z",
        "mfaAuthenticated": "false"
      }
    },
    "invokedBy": "ecs.amazonaws.com"
  },
  "eventTime": "2024-01-29T14:53:47Z",
  "eventSource": "ec2.amazonaws.com",
  "eventName": "CreateVolume",
  "awsRegion": "eu-central-1",
  "sourceIPAddress": "ecs.amazonaws.com",
  "userAgent": "ecs.amazonaws.com",
  "errorCode": "Client.DryRunOperation",
  "errorMessage": "Request would have succeeded, but DryRun flag is set.",
  "requestParameters": {
    "size": "1",
    "zone": "eu-central-1b",
    "volumeType": "gp3",
    "tagSpecificationSet": {
      "items": [
        {
          "resourceType": "volume",
          "tags": [
            {
              "key": "AmazonECSManaged",
              "value": "true"
            },
            {
              "key": "AmazonECSCreated",
              "value": "arn:aws:ecs:eu-central-1:6634********:task/dev-core/*"
            }
          ]
        }
      ]
    },
    "multiAttachEnabled": false,
    "clientToken": "a45a9892-4aed-4283-acfe-90ea6de041c3"
  },
  "responseElements": null,
  "requestID": "f92328ac-92f1-4554-ab9e-66e8fdbfae1c",
  "eventID": "266b6220-0d47-415f-839d-d5017bd0809e",
  "readOnly": false,
  "eventType": "AwsApiCall",
  "managementEvent": true,
  "recipientAccountId": "6634********",
  "vpcEndpointId": "vpce-0d4186aa25991abed",
  "eventCategory": "Management"
}

Is this an ECS bug?

1 Antwort
0

Hello.

Have you attached the IAM policy "AmazonECSInfrastructureRolePolicyForVolumes" to your ECS infrastructure role?
This IAM policy includes actions to create an EBS volume.
https://docs.aws.amazon.com/AmazonECS/latest/userguide/security-iam-awsmanpol.html

profile picture
EXPERTE
beantwortet vor 3 Monaten
  • I have! The role pavel-ContainerSandboxSta-EcsInfrastructureRole2786 has the AmazonECSInfrastructureRolePolicyForVolumes policy attached and can be assumed by ecs.amazonaws.com. If ECS didn't have the appropriate permissions, I wouldn't be seeing the successful dry-run attempts in CloudTrail...

Du bist nicht angemeldet. Anmelden um eine Antwort zu veröffentlichen.

Eine gute Antwort beantwortet die Frage klar, gibt konstruktives Feedback und fördert die berufliche Weiterentwicklung des Fragenstellers.

Richtlinien für die Beantwortung von Fragen