Skip to content

[ILM] Rollover action errors after restart #34465

@colings86

Description

@colings86

Setup: a 4-node cluster with 1 hot, 2 warm, and 1 cold node.

  1. Create a policy:
PUT _ilm/my_lifecycle3
{
  "policy": {
    "phases": {
      "hot": {
        "actions": {
          "rollover": {
            "max_age": "30s"
          }
        }
      },
      "warm": {
        "actions": {
          "forcemerge": {
            "max_num_segments": 1
          },
          "allocate": {
            "number_of_replicas": 1,
            "include": {
              "box_type": "warm"
            },
            "exclude": {},
            "require": {}
          }
        }
      },
      "cold": {
        "minimum_age": "1m",
        "actions": {
          "allocate": {
            "number_of_replicas": 0,
            "include": {
              "box_type": "cold"
            },
            "exclude": {},
            "require": {}
          }
        }
      },
      "delete": {
        "minimum_age": "2m",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}
  2. Create an index template:
PUT _template/my_template
{
  "index_patterns": ["test-*"],
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 0,
    "index.lifecycle.name": "my_lifecycle3",
    "index.lifecycle.rollover_alias": "test-alias",
    "index.routing.allocation.include.box_type": "hot"
  }
}
  3. Create the first index:
PUT test-000001
{
  "aliases": {
    "test-alias":{
      "is_write_index": true
    }
  }
}
  4. Check the index is on the attempt_rollover step using the explain API:
GET test-*/_ilm/explain?human
  5. Shut down all nodes

  6. Restart all nodes

  7. Wait for the 2nd index to appear using the explain API:

GET test-*/_ilm/explain?human
  8. Observe that the first index is in the ERROR state with a response something like the following:
{
  "indices": {
    "test-000001": {
      "index": "test-000001",
      "managed": true,
      "policy": "my_lifecycle3",
      "skip": false,
      "lifecycle_date": "2018-10-15T14:49:32.281Z",
      "phase": "hot",
      "phase_time": "2018-10-15T14:49:32.531Z",
      "action": "rollover",
      "action_time": "2018-10-15T14:49:32.531Z",
      "step": "ERROR",
      "step_time": "2018-10-15T14:50:18.632Z",
      "failed_step": "attempt_rollover",
      "step_info": {
        "type": "resource_already_exists_exception",
        "reason": "index [test-000002/cRr06akcS4mJXfZniAFfGQ] already exists",
        "index_uuid": "cRr06akcS4mJXfZniAFfGQ",
        "index": "test-000002"
      },
      "phase_execution": {
        "policy": "my_lifecycle3",
        "phase_definition": {
          "minimum_age": "0ms",
          "actions": {
            "rollover": {
              "max_age": "30s"
            }
          }
        },
        "version": 1,
        "modified_date": "2018-10-15T14:49:24.641Z",
        "modified_date_in_millis": 1539614964641
      }
    },
    "test-000002": {
      "index": "test-000002",
      "managed": true,
      "policy": "my_lifecycle3",
      "skip": false,
      "lifecycle_date": "2018-10-15T14:50:17.338Z",
      "phase": "new",
      "phase_time": "2018-10-15T14:50:18.725Z",
      "action": "complete",
      "action_time": "2018-10-15T14:50:18.384Z",
      "step": "complete",
      "step_time": "2018-10-15T14:50:18.384Z"
    }
  }
}

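It seems that in this scenario the rollover step might be getting run twice: test-000002 is created, yet attempt_rollover on test-000001 then fails with a resource_already_exists_exception for that same index.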

Metadata

Metadata

Assignees

Labels

:Data Management/ILM+SLM (DO NOT USE. Use ":StorageEngine/ILM" or ":Distributed Coordination/SLM" instead.), >bug, blocker

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions