Initial test of using gpu_topology#4150
Conversation
|
|
||
| return placements | ||
|
|
||
| def calculate_hosts_per_topo(gpu_topology: str, machine_type: NSDict) -> int: |
There was a problem hiding this comment.
machine_type is MachineType , not an NSDict, please make sure that mypy is happy)
| top_split = [int(x) for x in str(gpu_topology).split("x")] | ||
| except Exception as e: | ||
| log.error("Incorrectly formatted accelerator topology") | ||
| return 0 |
There was a problem hiding this comment.
Returning magic number is a bad taste. Let's raise an exception instead.
| # Look for gpu topology first | ||
| if gpu_topology is not None: | ||
| hosts_per_topo = calculate_hosts_per_topo(gpu_topology, machine_type) | ||
| if hosts_per_topo > 0: |
There was a problem hiding this comment.
Catch exception and log instead of using magic comparison.
|
|
||
| return placements | ||
|
|
||
| def calculate_hosts_per_topo(gpu_topology: str, machine_type: NSDict) -> int: |
There was a problem hiding this comment.
It would be nice to add unit tests
| try: | ||
| top_split = [int(x) for x in str(gpu_topology).split("x")] | ||
| except Exception as e: | ||
| log.error("Incorrectly formatted accelerator topology") |
There was a problem hiding this comment.
All errors / exceptions should contain the accelerator_topology value to be useful.
| def calculate_hosts_per_topo(gpu_topology: str, machine_type: NSDict) -> int: | ||
| # Calculate total number of hosts per topology (Assumes format: '1x72') | ||
| try: | ||
| top_split = [int(x) for x in str(gpu_topology).split("x")] |
There was a problem hiding this comment.
gpu_topology is already str, why cast?
|
|
||
| return placements | ||
|
|
||
| def calculate_hosts_per_topo(gpu_topology: str, machine_type: NSDict) -> int: |
There was a problem hiding this comment.
nit. rename gpu_topology -> topology OR accelerator_topology OR topo
| log.error("Incorrectly formatted accelerator topology") | ||
| return 0 | ||
|
|
||
| gpus_per_machine = machine_type.accelerators[0].count |
There was a problem hiding this comment.
This will throw if len(machine_type.accelerators) == 0
| log.error("Incorrectly formatted accelerator topology") | ||
| elif gpus_per_machine <= 0: | ||
| log.error("Cannot use accelerator topology with machine type that has no accelerators") | ||
| elif top_split[1] % machine_type.accelerators[0].count != 0: |
There was a problem hiding this comment.
s/machine_type.accelerators[0].count/gpus_per_machine/
| elif top_split[1] % machine_type.accelerators[0].count != 0: | ||
| log.error("GPU count per node must be a factor of the gpu topology") | ||
| else: | ||
| return (top_split[0] * top_split[1]) // gpus_per_machine |
There was a problem hiding this comment.
Check that top_split[0] > 0, otherwise we up for surprise.
| else: | ||
| return (top_split[0] * top_split[1]) // gpus_per_machine | ||
|
|
||
| return 0 |
|
|
||
|
|
||
| def create_placement_request(pg_name: str, region: str, max_distance: Optional[int]): | ||
| def create_placement_request(pg_name: str, region: str, max_distance: Optional[int], gpu_topology: Optional[str]): |
There was a problem hiding this comment.
nit. rename gpu_topology -> accelerator_topology where applicable, to make codebase less confusing and more "searchable".
| nodeset = self.node_nodeset(node_name) | ||
| return parse_self_link(nodeset.subnetwork).region | ||
|
|
||
| def nodeset_gpu_topology(self, nodeset_name: str) -> str: |
|
@cdunbar13 I believe this is no longer needed, hence closing this. Please feel free to re-open if it is still needed. |
In anticipation of new machine_types, this PR adds the accelerator_topology field to the slurm controller. It eventually translates to gpuTopology for the placement policy for the slurm cluster.
This can be a template for a similar field for the TPU version of the nodeset and can share the same variable until it hits
resume.py.This will be tested soon, do not merge.