Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 178 additions & 0 deletions qa/workunits/rbd/cli_generic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1246,6 +1246,44 @@ test_trash_purge_schedule() {
ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it
}

test_trash_purge_schedule_recovery() {
    # Verify that the rbd_support module's trash_purge_schedule handler
    # recovers after the module's RADOS client is blocklisted, and that
    # schedules created before the blocklisting survive the recovery.
    echo "testing recovery of trash_purge_schedule handler after module's RADOS client is blocklisted..."
    remove_images
    ceph osd pool create rbd3 8
    rbd pool init rbd3
    rbd namespace create rbd3/ns1

    rbd trash purge schedule add -p rbd3/ns1 2d
    rbd trash purge schedule ls -p rbd3 -R | grep 'rbd3 *ns1 *every 2d'

    # Fetch and blocklist the rbd_support module's RADOS client.
    # A single jq invocation replaces the original three-stage pipeline.
    CLIENT_ADDR=$(ceph mgr dump |
        jq -r '.active_clients[] | select(.name == "rbd_support") |
               .addrvec[0].addr + "/" + (.addrvec[0].nonce | tostring)')
    ceph osd blocklist add "$CLIENT_ADDR"
    # -F: the address contains '.' and '[' which are regex metacharacters;
    # match it literally to avoid false positives.
    ceph osd blocklist ls | grep -F -- "$CLIENT_ADDR"

    # Check that you can add a trash purge schedule after a few retries.
    # The module needs time to detect the blocklisting and restart its
    # RADOS client, so poll for up to ~4 minutes.
    expect_fail rbd trash purge schedule add -p rbd3 10m
    sleep 10
    for i in {1..24}; do
        rbd trash purge schedule add -p rbd3 10m && break
        sleep 10
    done

    rbd trash purge schedule ls -p rbd3 -R | grep 'every 10m'
    # Verify that the schedule present before client blocklisting is preserved
    rbd trash purge schedule ls -p rbd3 -R | grep 'rbd3 *ns1 *every 2d'

    rbd trash purge schedule remove -p rbd3 10m
    rbd trash purge schedule remove -p rbd3/ns1 2d
    rbd trash purge schedule ls -p rbd3 -R | expect_fail grep 'every 10m'
    rbd trash purge schedule ls -p rbd3 -R | expect_fail grep 'rbd3 *ns1 *every 2d'

    ceph osd pool rm rbd3 rbd3 --yes-i-really-really-mean-it
}

test_mirror_snapshot_schedule() {
echo "testing mirror snapshot schedule..."
remove_images
Expand Down Expand Up @@ -1358,6 +1396,54 @@ test_mirror_snapshot_schedule() {
ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it
}

test_mirror_snapshot_schedule_recovery() {
    # Verify that the rbd_support module's mirror snapshot scheduler
    # recovers after the module's RADOS client is blocklisted, and that
    # schedules created before the blocklisting survive the recovery.
    echo "testing recovery of mirror snapshot scheduler after module's RADOS client is blocklisted..."
    remove_images
    ceph osd pool create rbd3 8
    rbd pool init rbd3
    rbd namespace create rbd3/ns1

    rbd mirror pool enable rbd3 image
    rbd mirror pool enable rbd3/ns1 image
    rbd mirror pool peer add rbd3 cluster1

    rbd create $RBD_CREATE_ARGS -s 1 rbd3/ns1/test1
    rbd mirror image enable rbd3/ns1/test1 snapshot
    test "$(rbd mirror image status rbd3/ns1/test1 |
        grep -c mirror.primary)" = '1'

    rbd mirror snapshot schedule add -p rbd3/ns1 --image test1 1m
    test "$(rbd mirror snapshot schedule ls -p rbd3/ns1 --image test1)" = 'every 1m'

    # Fetch and blocklist the rbd_support module's RADOS client.
    # A single jq invocation replaces the original three-stage pipeline.
    CLIENT_ADDR=$(ceph mgr dump |
        jq -r '.active_clients[] | select(.name == "rbd_support") |
               .addrvec[0].addr + "/" + (.addrvec[0].nonce | tostring)')
    ceph osd blocklist add "$CLIENT_ADDR"
    # -F: match the address literally; it contains regex metacharacters.
    ceph osd blocklist ls | grep -F -- "$CLIENT_ADDR"

    # Check that you can add a mirror snapshot schedule after a few retries.
    # Poll for up to ~4 minutes while the module restarts its RADOS client.
    expect_fail rbd mirror snapshot schedule add -p rbd3/ns1 --image test1 2m
    sleep 10
    for i in {1..24}; do
        rbd mirror snapshot schedule add -p rbd3/ns1 --image test1 2m && break
        sleep 10
    done

    rbd mirror snapshot schedule ls -p rbd3/ns1 --image test1 | grep 'every 2m'
    # Verify that the schedule present before client blocklisting is preserved
    rbd mirror snapshot schedule ls -p rbd3/ns1 --image test1 | grep 'every 1m'

    rbd mirror snapshot schedule rm -p rbd3/ns1 --image test1 2m
    rbd mirror snapshot schedule rm -p rbd3/ns1 --image test1 1m
    rbd mirror snapshot schedule ls -p rbd3/ns1 --image test1 | expect_fail grep 'every 2m'
    rbd mirror snapshot schedule ls -p rbd3/ns1 --image test1 | expect_fail grep 'every 1m'

    rbd snap purge rbd3/ns1/test1
    rbd rm rbd3/ns1/test1
    ceph osd pool rm rbd3 rbd3 --yes-i-really-really-mean-it
}

test_perf_image_iostat() {
echo "testing perf image iostat..."
remove_images
Expand Down Expand Up @@ -1413,6 +1499,55 @@ test_perf_image_iostat() {
ceph osd pool rm rbd1 rbd1 --yes-i-really-really-mean-it
}

test_perf_image_iostat_recovery() {
    # Verify that the rbd_support module's perf handler recovers after the
    # module's RADOS client is blocklisted: iostat queries should start
    # succeeding again once the module restarts its client.
    echo "testing recovery of perf handler after module's RADOS client is blocklisted..."
    remove_images

    ceph osd pool create rbd3 8
    rbd pool init rbd3
    rbd namespace create rbd3/ns

    IMAGE_SPECS=("rbd3/test1" "rbd3/ns/test2")
    for spec in "${IMAGE_SPECS[@]}"; do
        # ensure all images are created without a separate data pool
        # as we filter iostat by specific pool specs below
        # ($RBD_CREATE_ARGS is intentionally unquoted: it may hold
        # multiple arguments)
        rbd create $RBD_CREATE_ARGS --size 10G --rbd-default-data-pool '' \
            "$spec"
    done

    # Generate continuous write I/O so iostat has samples to report.
    BENCH_PIDS=()
    for spec in "${IMAGE_SPECS[@]}"; do
        rbd bench --io-type write --io-pattern rand --io-total 10G --io-threads 1 \
            --rbd-cache false "$spec" >/dev/null 2>&1 &
        BENCH_PIDS+=($!)
    done

    test "$(rbd perf image iostat --format json rbd3 |
        jq -r 'map(.image) | sort | join(" ")')" = 'test1'

    # Fetch and blocklist the rbd_support module's RADOS client.
    # A single jq invocation replaces the original three-stage pipeline.
    CLIENT_ADDR=$(ceph mgr dump |
        jq -r '.active_clients[] | select(.name == "rbd_support") |
               .addrvec[0].addr + "/" + (.addrvec[0].nonce | tostring)')
    ceph osd blocklist add "$CLIENT_ADDR"
    # -F: match the address literally; it contains regex metacharacters.
    ceph osd blocklist ls | grep -F -- "$CLIENT_ADDR"

    # Poll for up to ~4 minutes until iostat works again after recovery.
    expect_fail rbd perf image iostat --format json rbd3/ns
    sleep 10
    for i in {1..24}; do
        test "$(rbd perf image iostat --format json rbd3/ns |
            jq -r 'map(.image) | sort | join(" ")')" = 'test2' && break
        sleep 10
    done

    # Stop the background bench workloads before tearing down the pool.
    for pid in "${BENCH_PIDS[@]}"; do
        kill "$pid"
    done
    wait

    remove_images
    ceph osd pool rm rbd3 rbd3 --yes-i-really-really-mean-it
}

test_mirror_pool_peer_bootstrap_create() {
echo "testing mirror pool peer bootstrap create..."
remove_images
Expand Down Expand Up @@ -1508,6 +1643,45 @@ test_tasks_removed_pool() {
remove_images
}

test_tasks_recovery() {
    # Verify that the rbd_support module's task handler recovers after the
    # module's RADOS client is blocklisted: adding a flatten task should
    # succeed again once the module restarts its client, and the task
    # should then run to completion.
    echo "testing task handler recovery after module's RADOS client is blocklisted..."
    remove_images

    ceph osd pool create rbd2 8
    rbd pool init rbd2

    rbd create $RBD_CREATE_ARGS --size 1G rbd2/img1
    rbd bench --io-type write --io-pattern seq --io-size 1M --io-total 1G rbd2/img1
    rbd snap create rbd2/img1@snap
    rbd snap protect rbd2/img1@snap
    rbd clone rbd2/img1@snap rbd2/clone1

    # Fetch and blocklist the rbd_support module's RADOS client.
    # A single jq invocation replaces the original three-stage pipeline.
    CLIENT_ADDR=$(ceph mgr dump |
        jq -r '.active_clients[] | select(.name == "rbd_support") |
               .addrvec[0].addr + "/" + (.addrvec[0].nonce | tostring)')
    ceph osd blocklist add "$CLIENT_ADDR"
    # -F: match the address literally; it contains regex metacharacters.
    ceph osd blocklist ls | grep -F -- "$CLIENT_ADDR"

    # Poll for up to ~4 minutes until tasks can be added again.
    expect_fail ceph rbd task add flatten rbd2/clone1
    sleep 10
    for i in {1..24}; do
        ceph rbd task add flatten rbd2/clone1 && break
        sleep 10
    done
    test "$(ceph rbd task list)" != "[]"

    # Wait (up to ~2 minutes) for the flatten task to detach the clone.
    for i in {1..12}; do
        rbd info rbd2/clone1 | grep 'parent: ' || break
        sleep 10
    done
    rbd info rbd2/clone1 | expect_fail grep 'parent: '
    rbd snap unprotect rbd2/img1@snap

    test "$(ceph rbd task list)" = "[]"
    ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it
}

test_pool_image_args
test_rename
test_ls
Expand All @@ -1529,9 +1703,13 @@ test_clone_v2
test_thick_provision
test_namespace
test_trash_purge_schedule
test_trash_purge_schedule_recovery
test_mirror_snapshot_schedule
test_mirror_snapshot_schedule_recovery
test_perf_image_iostat
test_perf_image_iostat_recovery
test_mirror_pool_peer_bootstrap_create
test_tasks_removed_pool
test_tasks_recovery

echo OK
1 change: 1 addition & 0 deletions src/pybind/mgr/mgr_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -1342,6 +1342,7 @@ def shutdown(self) -> None:
addrs = self._rados.get_addrs()
self._rados.shutdown()
self._ceph_unregister_client(addrs)
self._rados = None

@API.expose
def get(self, data_name: str) -> Any:
Expand Down
19 changes: 16 additions & 3 deletions src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __del__(self) -> None:
self.wait_for_pending()

def wait_for_pending(self) -> None:
self.log.debug("CreateSnapshotRequests.wait_for_pending")
with self.lock:
while self.pending:
self.condition.wait()
Expand Down Expand Up @@ -288,6 +289,7 @@ def finish(self, image_spec: ImageSpec) -> None:

with self.lock:
self.pending.remove(image_spec)
self.condition.notify()
if not self.queue:
return
image_spec = self.queue.pop(0)
Expand Down Expand Up @@ -329,7 +331,6 @@ class MirrorSnapshotScheduleHandler:

lock = Lock()
condition = Condition(lock)
thread = None

def __init__(self, module: Any) -> None:
self.module = module
Expand All @@ -339,16 +340,23 @@ def __init__(self, module: Any) -> None:

self.init_schedule_queue()

self.stop_thread = False
self.thread = Thread(target=self.run)
self.thread.start()

def _cleanup(self) -> None:
def shutdown(self) -> None:
self.log.info("MirrorSnapshotScheduleHandler: shutting down")
self.stop_thread = True
if self.thread.is_alive():
self.log.debug("MirrorSnapshotScheduleHandler: joining thread")
self.thread.join()
self.create_snapshot_requests.wait_for_pending()
self.log.info("MirrorSnapshotScheduleHandler: shut down")

def run(self) -> None:
try:
self.log.info("MirrorSnapshotScheduleHandler: starting")
while True:
while not self.stop_thread:
refresh_delay = self.refresh_images()
with self.lock:
(image_spec, wait_time) = self.dequeue()
Expand All @@ -360,6 +368,9 @@ def run(self) -> None:
with self.lock:
self.enqueue(datetime.now(), pool_id, namespace, image_id)

except (rados.ConnectionShutdown, rbd.ConnectionShutdown):
self.log.exception("MirrorSnapshotScheduleHandler: client blocklisted")
self.module.client_blocklisted.set()
except Exception as ex:
self.log.fatal("Fatal runtime error: {}\n{}".format(
ex, traceback.format_exc()))
Expand Down Expand Up @@ -450,6 +461,8 @@ def load_pool_images(self,
self.log.debug(
"load_pool_images: adding image {}".format(name))
images[pool_id][namespace][image_id] = name
except rbd.ConnectionShutdown:
raise
except Exception as e:
self.log.error(
"load_pool_images: exception when scanning pool {}: {}".format(
Expand Down
43 changes: 43 additions & 0 deletions src/pybind/mgr/rbd_support/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
import inspect
import rados
import rbd
import traceback
from typing import cast, Any, Callable, Optional, Tuple, TypeVar

from mgr_module import CLIReadCommand, CLIWriteCommand, MgrModule, Option
from threading import Thread, Event

from .common import NotAuthorizedError
from .mirror_snapshot_schedule import image_validator, namespace_validator, \
Expand All @@ -35,6 +37,8 @@ class ImageSortBy(enum.Enum):
def with_latest_osdmap(func: FuncT) -> FuncT:
@functools.wraps(func)
def wrapper(self: 'Module', *args: Any, **kwargs: Any) -> Tuple[int, str, str]:
if not self.module_ready:
return -errno.EAGAIN, "", ""
# ensure we have latest pools available
self.rados.wait_for_latest_osdmap()
try:
Expand All @@ -46,6 +50,10 @@ def wrapper(self: 'Module', *args: Any, **kwargs: Any) -> Tuple[int, str, str]:
# log the full traceback but don't send it to the CLI user
self.log.exception("Fatal runtime error: ")
raise
except (rados.ConnectionShutdown, rbd.ConnectionShutdown) as ex:
self.log.debug("with_latest_osdmap: client blocklisted")
self.client_blocklisted.set()
return -errno.EAGAIN, "", str(ex)
except rados.Error as ex:
return -ex.errno, "", str(ex)
except rbd.OSError as ex:
Expand Down Expand Up @@ -74,11 +82,46 @@ class Module(MgrModule):

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize the rbd_support mgr module.

        Starts the recovery thread *before* running setup() so that a
        blocklisting signalled during setup is not missed.
        """
        super(Module, self).__init__(*args, **kwargs)
        # Set by handlers / with_latest_osdmap when the module's RADOS
        # client is found to be blocklisted; woken up in run().
        self.client_blocklisted = Event()
        self.recovery_thread = Thread(target=self.run)
        self.recovery_thread.start()
        self.setup()

    def setup(self) -> None:
        """Create the module's RADOS client and start all handlers.

        Called from __init__ on module start and from run() after the
        previous client was blocklisted and shut down.  Commands are
        rejected (EAGAIN) until module_ready is set at the end.
        """
        self.log.info("starting setup")
        # new client is created and registed in the MgrMap implicitly
        # as 'rados' is a property attribute.
        self.rados.wait_for_latest_osdmap()
        self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self)
        self.perf = PerfHandler(self)
        self.task = TaskHandler(self)
        self.trash_purge_schedule = TrashPurgeScheduleHandler(self)
        self.log.info("setup complete")
        # Gate checked by the with_latest_osdmap command wrapper.
        self.module_ready = True

    def run(self) -> None:
        """Recovery-thread main loop.

        Sleeps until client_blocklisted is set (by a handler or by
        with_latest_osdmap), then tears down the handlers and the
        blocklisted RADOS client via shutdown() and re-creates everything
        via setup().  Repeats indefinitely; any unexpected exception is
        logged as fatal and ends the thread.
        """
        self.log.info("recovery thread starting")
        try:
            while True:
                # block until rados client is blocklisted
                self.client_blocklisted.wait()
                self.log.info("restarting")
                self.shutdown()
                # Clear before setup() so a blocklisting that happens
                # during/after setup triggers another recovery cycle.
                self.client_blocklisted.clear()
                self.setup()
                self.log.info("restarted")
        except Exception as ex:
            self.log.fatal("Fatal runtime error: {}\n{}".format(
                ex, traceback.format_exc()))

    def shutdown(self) -> None:
        """Stop all handlers and shut down the module's RADOS client.

        module_ready is cleared first so incoming commands fail fast with
        EAGAIN instead of racing the teardown.  The superclass shutdown
        deregisters the client from the MgrMap.
        """
        self.module_ready = False
        self.mirror_snapshot_schedule.shutdown()
        self.trash_purge_schedule.shutdown()
        self.task.shutdown()
        self.perf.shutdown()
        # shut down client and deregister it from MgrMap
        super().shutdown()

@CLIWriteCommand('rbd mirror snapshot schedule add')
@with_latest_osdmap
Expand Down
Loading