Skip to content

Commit eba2919

Browse files
authored
[WB-4607] Add wandb artifact cache cleanup command (#1895)
1 parent 5b476cf commit eba2919

File tree

10 files changed

+180
-29
lines changed

10 files changed

+180
-29
lines changed

tests/test_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ def test_artifact_ls(runner, git_repo, mock_server):
282282
print(result.exception)
283283
print(traceback.print_tb(result.exc_info[2]))
284284
assert result.exit_code == 0
285-
assert "9KB" in result.output
285+
assert "10.0KB" in result.output
286286
assert "mnist:v2" in result.output
287287

288288

tests/test_util.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -234,10 +234,24 @@ def test_parse_sweep_id():
234234
assert parts == {"name": "test", "entity": "test", "project": "test"}
235235

236236

237-
def test_sizeof_fmt():
238-
assert util.sizeof_fmt(1000) == "1000.0B"
239-
assert util.sizeof_fmt(1000000) == "976.6KiB"
240-
assert util.sizeof_fmt(5000000) == "4.8MiB"
237+
def test_from_human_size():
    """Check that human-readable sizes parse to the expected whole byte counts."""
    # Explicit binary (power-of-two) unit table.
    binary_cases = [
        ("1000B", 1000),
        ("976.6KiB", 1000038),
        ("4.8MiB", 5033164),
    ]
    for text, expected in binary_cases:
        assert util.from_human_size(text, units=util.POW_2_BYTES) == expected

    # No unit table supplied: exercises the function's default units.
    default_cases = [
        ("1000.0B", 1000),
        ("1000KB", 1000000),
        ("5.0MB", 5000000),
    ]
    for text, expected in default_cases:
        assert util.from_human_size(text) == expected
245+
246+
247+
def test_to_human_size():
    """Check that byte counts render as the expected human-readable strings."""
    # Explicit binary (power-of-two) unit table.
    binary_cases = [
        (1000, "1000.0B"),
        (1000000, "976.6KiB"),
        (5000000, "4.8MiB"),
    ]
    for size, expected in binary_cases:
        assert util.to_human_size(size, units=util.POW_2_BYTES) == expected

    # No unit table supplied: exercises the function's default units.
    default_cases = [
        (1000, "1000.0B"),
        (1000000, "1000.0KB"),
        (5000000, "5.0MB"),
    ]
    for size, expected in default_cases:
        assert util.to_human_size(size) == expected
241255

242256

243257
def test_matplotlib_contains_images():
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import os
2+
import time
3+
4+
from wandb import wandb_sdk
5+
6+
7+
def test_artifacts_cache_cleanup_empty(runner):
    """Cleaning up a cache directory that contains no files reclaims nothing."""
    with runner.isolated_filesystem():
        cache_dir = "cache"
        os.mkdir(cache_dir)
        empty_cache = wandb_sdk.wandb_artifacts.ArtifactsCache(cache_dir)
        assert empty_cache.cleanup(100000) == 0
13+
14+
15+
def test_artifacts_cache_cleanup(runner):
    """Cleanup evicts the least recently accessed file first.

    Three files totalling 8000 bytes are created and the cache is asked to
    shrink to 5000 bytes. Removing only the oldest file (5000 bytes) is
    enough, so exactly that file must disappear.
    """
    with runner.isolated_filesystem():
        # (directory, file name, size in bytes), listed oldest access first.
        fixtures = [
            ("cache/obj/md5/aa", "aardvark", 5000),
            ("cache/obj/md5/ab", "absolute", 2000),
            ("cache/obj/md5/ac", "accelerate", 1000),
        ]
        paths = []
        for age_rank, (directory, name, size) in enumerate(fixtures):
            os.makedirs(directory)
            path = os.path.join(directory, name)
            with open(path, "w") as f:
                f.truncate(size)
            # Pin the access time explicitly instead of sleeping between
            # writes: sleeping is slow and can still produce identical atimes
            # on filesystems with coarse timestamp granularity.
            timestamp = 1000000000 + age_rank
            os.utime(path, (timestamp, timestamp))
            paths.append(path)

        cache = wandb_sdk.wandb_artifacts.ArtifactsCache("cache")
        reclaimed_bytes = cache.cleanup(5000)

        # We should get rid of "aardvark" in this case, and only that file.
        assert reclaimed_bytes == 5000
        assert not os.path.exists(paths[0])
        assert os.path.exists(paths[1])
        assert os.path.exists(paths[2])

wandb/apis/public.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1702,7 +1702,9 @@ def delete(self):
17021702

17031703
def __repr__(self):
17041704
return "<File {} ({}) {}>".format(
1705-
self.name, self.mimetype, util.sizeof_fmt(self.size)
1705+
self.name,
1706+
self.mimetype,
1707+
util.to_human_size(self.size, units=util.POW_2_BYTES),
17061708
)
17071709

17081710

wandb/cli/cli.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,14 +1232,6 @@ def ls(path, type):
12321232
else:
12331233
types = public_api.artifact_types(path)
12341234

1235-
def human_size(bytes, units=None):
1236-
units = units or ["", "KB", "MB", "GB", "TB", "PB", "EB"]
1237-
return (
1238-
str(bytes) + units[0]
1239-
if bytes < 1024
1240-
else human_size(bytes >> 10, units[1:])
1241-
)
1242-
12431235
for kind in types:
12441236
for collection in kind.collections():
12451237
versions = public_api.artifact_versions(
@@ -1250,11 +1242,32 @@ def human_size(bytes, units=None):
12501242
latest = next(versions)
12511243
print(
12521244
"{:<15s}{:<15s}{:>15s} {:<20s}".format(
1253-
kind.type, latest.updated_at, human_size(latest.size), latest.name
1245+
kind.type,
1246+
latest.updated_at,
1247+
util.to_human_size(latest.size),
1248+
latest.name,
12541249
)
12551250
)
12561251

12571252

1253+
@artifact.group(help="Commands for interacting with the artifact cache")
def cache():
    """Group for the artifact cache subcommands; does nothing by itself."""
1256+
1257+
1258+
@cache.command(
    context_settings=CONTEXT,
    help="Clean up less frequently used files from the artifacts cache",
)
@click.argument("target_size")
@display_error
def cleanup(target_size):
    """Shrink the artifacts cache and report how much space was freed.

    ``target_size`` arrives from the command line as a human-readable size
    string and is converted to a byte count before being handed to the cache.
    """
    size_limit = util.from_human_size(target_size)
    artifacts_cache = wandb_sdk.wandb_artifacts.get_artifacts_cache()
    freed = artifacts_cache.cleanup(size_limit)
    print("Reclaimed {} of space".format(util.to_human_size(freed)))
1269+
1270+
12581271
@cli.command(context_settings=CONTEXT, help="Pull files from Weights & Biases")
12591272
@click.argument("run", envvar=env.RUN_ID)
12601273
@click.option(

wandb/sdk/interface/artifacts.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from wandb.data_types import WBValue
1212

1313
if wandb.TYPE_CHECKING: # type: ignore
14-
from typing import List, Optional, Union
14+
from typing import List, Optional, Union, Dict
1515

1616

1717
def md5_string(string):
@@ -727,11 +727,36 @@ def get_artifact(self, artifact_id):
727727
def store_artifact(self, artifact):
728728
self._artifacts_by_id[artifact.id] = artifact
729729

730+
def cleanup(self, target_size: int) -> int:
    """Delete least-recently-accessed cache files until the cache fits.

    Every file under ``self._cache_dir`` is considered. Files are removed in
    order of ascending access time until the combined size of the remaining
    files is at most ``target_size``.

    Args:
        target_size: Desired upper bound, in bytes, for the cache contents.

    Returns:
        The number of bytes actually freed. Files that could not be removed
        are not counted.
    """
    entries = []  # (path, stat_result) for every file that could be stat'ed
    total_size = 0
    for root, _, files in os.walk(self._cache_dir):
        for name in files:
            path = os.path.join(root, name)
            try:
                stat_res = os.stat(path)
            except OSError:
                # The file vanished since the directory was listed, or it is
                # a dangling symlink; either way it occupies nothing we can
                # reclaim, so skip it instead of aborting the whole cleanup.
                continue
            entries.append((path, stat_res))
            total_size += stat_res.st_size

    bytes_reclaimed = 0
    # Oldest access first; the sort is stable, so ties keep walk order.
    entries.sort(key=lambda entry: entry[1].st_atime)
    for path, stat_res in entries:
        # A cache that is exactly at the target already satisfies it.
        if total_size <= target_size:
            break

        try:
            os.remove(path)
        except OSError:
            # Best effort: leave the file in place, and do not pretend its
            # bytes were freed.
            continue

        total_size -= stat_res.st_size
        bytes_reclaimed += stat_res.st_size
    return bytes_reclaimed
754+
730755

731756
_artifacts_cache = None
732757

733758

734-
def get_artifacts_cache():
759+
def get_artifacts_cache() -> ArtifactsCache:
735760
global _artifacts_cache
736761
if _artifacts_cache is None:
737762
cache_dir = os.path.join(env.get_cache_dir(), "artifacts")

wandb/sdk/wandb_artifacts.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919
from wandb.errors.error import CommError
2020
from wandb.errors.term import termlog, termwarn
2121

22-
from .interface.artifacts import (
22+
from .interface.artifacts import ( # noqa: F401 pylint: disable=unused-import
2323
Artifact as ArtifactInterface,
2424
ArtifactEntry,
2525
ArtifactManifest,
26+
ArtifactsCache,
2627
b64_string_to_hex,
2728
get_artifacts_cache,
2829
md5_file_b64,

wandb/sdk_py27/interface/artifacts.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from wandb.data_types import WBValue
1212

1313
if wandb.TYPE_CHECKING: # type: ignore
14-
from typing import List, Optional, Union
14+
from typing import List, Optional, Union, Dict
1515

1616

1717
def md5_string(string):
@@ -727,6 +727,31 @@ def get_artifact(self, artifact_id):
727727
def store_artifact(self, artifact):
728728
self._artifacts_by_id[artifact.id] = artifact
729729

730+
def cleanup(self, target_size):
    """Delete least-recently-accessed cache files until the cache fits.

    Every file under ``self._cache_dir`` is considered. Files are removed in
    order of ascending access time until the combined size of the remaining
    files is at most ``target_size`` (a byte count).

    Returns the number of bytes actually freed; files that could not be
    removed are not counted.
    """
    entries = []  # (path, stat_result) for every file that could be stat'ed
    total_size = 0
    for root, _, files in os.walk(self._cache_dir):
        for name in files:
            path = os.path.join(root, name)
            try:
                stat_res = os.stat(path)
            except OSError:
                # The file vanished since the directory was listed, or it is
                # a dangling symlink; either way it occupies nothing we can
                # reclaim, so skip it instead of aborting the whole cleanup.
                continue
            entries.append((path, stat_res))
            total_size += stat_res.st_size

    bytes_reclaimed = 0
    # Oldest access first; the sort is stable, so ties keep walk order.
    entries.sort(key=lambda entry: entry[1].st_atime)
    for path, stat_res in entries:
        # A cache that is exactly at the target already satisfies it.
        if total_size <= target_size:
            break

        try:
            os.remove(path)
        except OSError:
            # Best effort: leave the file in place, and do not pretend its
            # bytes were freed.
            continue

        total_size -= stat_res.st_size
        bytes_reclaimed += stat_res.st_size
    return bytes_reclaimed
754+
730755

731756
_artifacts_cache = None
732757

wandb/sdk_py27/wandb_artifacts.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919
from wandb.errors.error import CommError
2020
from wandb.errors.term import termlog, termwarn
2121

22-
from .interface.artifacts import (
22+
from .interface.artifacts import ( # noqa: F401 pylint: disable=unused-import
2323
Artifact as ArtifactInterface,
2424
ArtifactEntry,
2525
ArtifactManifest,
26+
ArtifactsCache,
2627
b64_string_to_hex,
2728
get_artifacts_cache,
2829
md5_file_b64,

wandb/util.py

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,26 @@
7171
environment=SENTRY_ENV,
7272
)
7373

74+
# Decimal byte units as (symbol, bytes-per-unit) pairs, smallest first.
# Each step up is a factor of 1000, from B (1) to EB (10 ** 18).
POW_10_BYTES = list(
    (symbol, 1000 ** rank)
    for rank, symbol in enumerate(("B", "KB", "MB", "GB", "TB", "PB", "EB"))
)

# Binary byte units as (symbol, bytes-per-unit) pairs, smallest first.
# Each step up is a factor of 1024, from B (1) to EiB (2 ** 60).
POW_2_BYTES = list(
    (symbol, 1024 ** rank)
    for rank, symbol in enumerate(("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"))
)
93+
7494

7595
def sentry_message(message):
7696
if error_reporting_enabled():
@@ -1007,15 +1027,30 @@ def isatty(ob):
10071027
return hasattr(ob, "isatty") and ob.isatty()
10081028

10091029

1010-
def sizeof_fmt(num, suffix="B"):
1011-
"""Pretty print file size
1012-
https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
1013-
"""
1014-
for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
1015-
if abs(num) < 1024.0:
1016-
return "%3.1f%s%s" % (num, unit, suffix)
1017-
num /= 1024.0
1018-
return "%.1f%s%s" % (num, "Yi", suffix)
1030+
def to_human_size(bytes, units=None):
    """Render a byte count as a short human-readable string such as ``"4.8MiB"``.

    ``units`` is a sequence of ``(symbol, bytes_per_unit)`` pairs ordered from
    smallest to largest; ``POW_10_BYTES`` is used when it is omitted or empty.
    The value is rounded to one decimal place. The first unit whose rounded
    value is below 1024 wins (the same threshold is applied to every unit
    table), and the largest unit is used when none qualifies.
    """
    scale = units or POW_10_BYTES
    last_index = len(scale) - 1
    for index, (symbol, bytes_per_unit) in enumerate(scale):
        amount = round(float(bytes) / bytes_per_unit, 1)
        if amount < 1024 or index == last_index:
            return "{}{}".format(amount, symbol)
1039+
1040+
1041+
def from_human_size(size, units=None):
    """Parse a human-readable size such as ``"10"``, ``"10B"`` or ``"1.5 MB"``.

    ``units`` is a sequence of ``(symbol, bytes_per_unit)`` pairs;
    ``POW_10_BYTES`` is used when it is omitted or empty. Unit symbols are
    matched case-insensitively and a missing unit means plain bytes.

    Returns the size in bytes as an ``int`` (fractions of a byte are
    truncated).

    Raises:
        ValueError: if ``size`` is not a number optionally followed by one of
            the known unit symbols. The whole string has to match, so a
            suffix from a different unit table (``"10GiB"`` with decimal
            units) or a notation like ``"1e9"`` is rejected instead of being
            silently read as a handful of bytes.
    """
    multipliers = {unit.upper(): value for (unit, value) in units or POW_10_BYTES}
    # Escape the symbols so they are matched literally, and try longer ones
    # first so a symbol is never shadowed by one that is a prefix of it.
    alternatives = "|".join(
        re.escape(symbol) for symbol in sorted(multipliers, key=len, reverse=True)
    )
    pattern = re.compile(
        r"\s*(\d+\.?\d*)\s*({})?\s*\Z".format(alternatives), re.IGNORECASE
    )
    match = pattern.match(size)
    if not match:
        raise ValueError("Size must be of the form `10`, `10B` or `10 B`.")
    number = float(match.group(1))
    symbol = match.group(2)
    multiplier = multipliers[symbol.upper()] if symbol else 1
    return int(number * multiplier)
10191054

10201055

10211056
def auto_project_name(program):

0 commit comments

Comments
 (0)