Skip to content

Commit cdca5af

Browse files
authored
Revert "[Dashboard] Turn on New Dashboard by Default (#11321)" (#11502)
This reverts commit f500292.
1 parent cbc5dac commit cdca5af

15 files changed

Lines changed: 814 additions & 123 deletions

File tree

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# The build output should clearly not be checked in
2-
*test-output.xml
32
/bazel-*
43
/python/ray/core
54
/python/ray/pickle5_files/
@@ -12,7 +11,7 @@
1211
/thirdparty/pkg/
1312
/build/java
1413
.jar
15-
/dashboard/client/build
14+
1615
# Files generated by flatc should be ignored
1716
/src/ray/gcs/format/*_generated.h
1817
/src/ray/object_manager/format/*_generated.h

ci/travis/test-wheels.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ if [ -z "${BUILD_DIR}" ]; then
2222
fi
2323
TEST_DIR="${BUILD_DIR}/python/ray/tests"
2424
TEST_SCRIPTS=("$TEST_DIR/test_microbenchmarks.py" "$TEST_DIR/test_basic.py")
25+
UI_TEST_SCRIPT="${BUILD_DIR}/python/ray/tests/test_webui.py"
2526

2627
function retry {
2728
local n=1
@@ -76,6 +77,9 @@ if [[ "$platform" == "linux" ]]; then
7677
for SCRIPT in "${TEST_SCRIPTS[@]}"; do
7778
retry "$PYTHON_EXE" "$SCRIPT"
7879
done
80+
81+
# Run the UI test to make sure that the packaged UI works.
82+
retry "$PYTHON_EXE" "$UI_TEST_SCRIPT"
7983
done
8084

8185
# Check that the other wheels are present.
@@ -114,6 +118,12 @@ elif [[ "$platform" == "macosx" ]]; then
114118
for SCRIPT in "${TEST_SCRIPTS[@]}"; do
115119
retry "$PYTHON_EXE" "$SCRIPT"
116120
done
121+
122+
if (( $(echo "$PY_MM >= 3.0" | bc) )); then
123+
# Run the UI test to make sure that the packaged UI works.
124+
retry "$PYTHON_EXE" "$UI_TEST_SCRIPT"
125+
fi
126+
117127
done
118128
elif [ "${platform}" = windows ]; then
119129
echo "WARNING: Wheel testing not yet implemented for Windows."

dashboard/agent.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
class DashboardAgent(object):
3939
def __init__(self,
4040
redis_address,
41-
dashboard_agent_port,
4241
redis_password=None,
4342
temp_dir=None,
4443
log_dir=None,
@@ -52,7 +51,6 @@ def __init__(self,
5251
self.redis_password = redis_password
5352
self.temp_dir = temp_dir
5453
self.log_dir = log_dir
55-
self.dashboard_agent_port = dashboard_agent_port
5654
self.metrics_export_port = metrics_export_port
5755
self.node_manager_port = node_manager_port
5856
self.object_store_name = object_store_name
@@ -61,8 +59,7 @@ def __init__(self,
6159
assert self.node_id, "Empty node id (RAY_NODE_ID)."
6260
self.ip = ray._private.services.get_node_ip_address()
6361
self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
64-
self.grpc_port = self.server.add_insecure_port(
65-
f"[::]:{self.dashboard_agent_port}")
62+
self.grpc_port = self.server.add_insecure_port("[::]:0")
6663
logger.info("Dashboard agent grpc address: %s:%s", self.ip,
6764
self.grpc_port)
6865
self.aioredis_client = None
@@ -189,11 +186,6 @@ async def _check_parent():
189186
required=True,
190187
type=int,
191188
help="The port to expose metrics through Prometheus.")
192-
parser.add_argument(
193-
"--dashboard-agent-port",
194-
required=True,
195-
type=int,
196-
help="The port on which the dashboard agent will receive GRPCs.")
197189
parser.add_argument(
198190
"--node-manager-port",
199191
required=True,
@@ -296,7 +288,6 @@ async def _check_parent():
296288

297289
agent = DashboardAgent(
298290
args.redis_address,
299-
args.dashboard_agent_port,
300291
redis_password=args.redis_password,
301292
temp_dir=temp_dir,
302293
log_dir=log_dir,

dashboard/dashboard.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
except ImportError:
44
print("The dashboard requires aiohttp to run.")
55
import sys
6+
67
sys.exit(1)
78

89
import argparse

dashboard/datacenter.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,15 @@ async def get_node_info(cls, node_id):
111111
node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
112112
node_stats = DataSource.node_stats.get(node_id, {})
113113
node = DataSource.nodes.get(node_id, {})
114-
node_ip = DataSource.node_id_to_ip.get(node_id)
114+
115115
# Merge node log count information into the payload
116-
log_info = DataSource.ip_and_pid_to_logs.get(node_ip, {})
116+
log_info = DataSource.ip_and_pid_to_logs.get(node_physical_stats["ip"],
117+
{})
117118
node_log_count = 0
118119
for entries in log_info.values():
119120
node_log_count += len(entries)
120-
error_info = DataSource.ip_and_pid_to_errors.get(node_ip, {})
121+
error_info = DataSource.ip_and_pid_to_errors.get(
122+
node_physical_stats["ip"], {})
121123
node_err_count = 0
122124
for entries in error_info.values():
123125
node_err_count += len(entries)

dashboard/modules/logical_view/test_logical_view_head.py

Lines changed: 2 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ class InfeasibleActor:
3333
foo_actors = [Foo.remote(4), Foo.remote(5)]
3434
infeasible_actor = InfeasibleActor.remote() # noqa
3535
results = [actor.do_task.remote() for actor in foo_actors] # noqa
36+
assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
37+
is True)
3638
webui_url = ray_start_with_dashboard["webui_url"]
37-
assert wait_until_server_available(webui_url)
3839
webui_url = format_web_url(webui_url)
3940

4041
timeout_seconds = 5
@@ -74,66 +75,5 @@ class InfeasibleActor:
7475
raise Exception(f"Timed out while testing, {ex_stack}")
7576

7677

77-
def test_kill_actor(ray_start_with_dashboard):
78-
@ray.remote
79-
class Actor:
80-
def __init__(self):
81-
pass
82-
83-
def f(self):
84-
ray.show_in_dashboard("test")
85-
return os.getpid()
86-
87-
a = Actor.remote()
88-
worker_pid = ray.get(a.f.remote()) # noqa
89-
90-
webui_url = ray_start_with_dashboard["webui_url"]
91-
assert wait_until_server_available(webui_url)
92-
webui_url = format_web_url(webui_url)
93-
94-
def actor_killed(pid):
95-
"""Check For the existence of a unix pid."""
96-
try:
97-
os.kill(pid, 0)
98-
except OSError:
99-
return True
100-
else:
101-
return False
102-
103-
def get_actor():
104-
resp = requests.get(f"{webui_url}/logical/actor_groups")
105-
resp.raise_for_status()
106-
actor_groups_resp = resp.json()
107-
assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
108-
actor_groups = actor_groups_resp["data"]["actorGroups"]
109-
actor = actor_groups["Actor"]["entries"][0]
110-
return actor
111-
112-
def kill_actor_using_dashboard(actor):
113-
resp = requests.get(
114-
webui_url + "/logical/kill_actor",
115-
params={
116-
"actorId": actor["actorId"],
117-
"ipAddress": actor["ipAddress"],
118-
"port": actor["port"]
119-
})
120-
resp.raise_for_status()
121-
resp_json = resp.json()
122-
assert resp_json["result"] is True, "msg" in resp_json
123-
124-
start = time.time()
125-
last_exc = None
126-
while time.time() - start <= 10:
127-
try:
128-
actor = get_actor()
129-
kill_actor_using_dashboard(actor)
130-
last_exc = None
131-
break
132-
except (KeyError, AssertionError) as e:
133-
last_exc = e
134-
time.sleep(.1)
135-
assert last_exc is None
136-
137-
13878
if __name__ == "__main__":
13979
sys.exit(pytest.main(["-v", __file__]))

dashboard/modules/reporter/reporter_agent.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,23 @@ async def GetProfilingStats(self, request, context):
9494
return reporter_pb2.GetProfilingStatsReply(
9595
profiling_stats=profiling_stats, std_out=stdout, std_err=stderr)
9696

97-
async def ReportOCMetrics(self, request, context):
98-
# This function receives a GRPC containing OpenCensus (OC) metrics
99-
# from a Ray process, then exposes those metrics to Prometheus.
97+
async def ReportMetrics(self, request, context):
98+
# NOTE: Exceptions are not propagated properly
99+
# when we don't catch them here.
100100
try:
101-
self._metrics_agent.record_metric_points_from_protobuf(
102-
request.metrics)
103-
except Exception:
101+
metrcs_description_required = (
102+
self._metrics_agent.record_metrics_points(
103+
request.metrics_points))
104+
except Exception as e:
105+
logger.error(e)
104106
logger.error(traceback.format_exc())
105-
return reporter_pb2.ReportOCMetricsReply()
107+
108+
# If metrics description is missing, we should notify cpp processes
109+
# that we need them. Cpp processes will then report them here.
110+
# We need it when (1) a new metric is reported (application metric)
111+
# (2) a reporter goes down and is restarted (currently not implemented).
112+
return reporter_pb2.ReportMetricsReply(
113+
metrcs_description_required=metrcs_description_required)
106114

107115
@staticmethod
108116
def _get_cpu_percent():
@@ -117,7 +125,8 @@ def _get_gpu_usage():
117125
try:
118126
gpus = gpustat.new_query().gpus
119127
except Exception as e:
120-
logger.debug(f"gpustat failed to retrieve GPU information: {e}")
128+
logger.debug(
129+
"gpustat failed to retrieve GPU information: {}".format(e))
121130
for gpu in gpus:
122131
# Note the keys in this dict have periods which throws
123132
# off javascript so we change .s to _s
@@ -224,8 +233,12 @@ def _get_all_stats(self):
224233
"cmdline": self._get_raylet_cmdline(),
225234
}
226235

227-
async def _perform_iteration(self, aioredis_client):
236+
async def _perform_iteration(self):
228237
"""Get any changes to the log files and push updates to Redis."""
238+
aioredis_client = await aioredis.create_redis_pool(
239+
address=self._dashboard_agent.redis_address,
240+
password=self._dashboard_agent.redis_password)
241+
229242
while True:
230243
try:
231244
stats = self._get_all_stats()
@@ -236,8 +249,5 @@ async def _perform_iteration(self, aioredis_client):
236249
reporter_consts.REPORTER_UPDATE_INTERVAL_MS / 1000)
237250

238251
async def run(self, server):
239-
aioredis_client = await aioredis.create_redis_pool(
240-
address=self._dashboard_agent.redis_address,
241-
password=self._dashboard_agent.redis_password)
242252
reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server)
243-
await self._perform_iteration(aioredis_client)
253+
await self._perform_iteration()

dashboard/modules/tune/tune_head.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ async def collect(self):
130130

131131
# search through all the sub_directories in log directory
132132
analysis = Analysis(str(self._logdir))
133-
df = analysis.dataframe(metric=None, mode=None)
133+
df = analysis.dataframe(metric="episode_reward_mean", mode="max")
134134

135135
if len(df) == 0 or "trial_id" not in df.columns:
136136
return

python/build-wheel-macos.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,7 @@ source "$HOME"/.nvm/nvm.sh
3939
nvm use node
4040

4141
# Build the dashboard so its static assets can be included in the wheel.
42-
# TODO(mfitton): switch this back when deleting old dashboard code.
43-
pushd python/ray/new_dashboard/client
42+
pushd python/ray/dashboard/client
4443
npm ci
4544
npm run build
4645
popd

python/build-wheel-manylinux1.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,7 @@ nvm install node
3535
nvm use node
3636

3737
# Build the dashboard so its static assets can be included in the wheel.
38-
# TODO(mfitton): switch this back when deleting old dashboard code.
39-
pushd python/ray/new_dashboard/client
38+
pushd python/ray/dashboard/client
4039
npm ci
4140
npm run build
4241
popd

0 commit comments

Comments
 (0)