Skip to content

LLaVA model parallelism/fork bug #260

@lockon-n

Description

@lockon-n

Thanks for this wonderful work! I was trying to use parallel inference (`fork`) in sglang with the llava 1.5 model.

Here is my env:

torch 2.1.2+cu118
sglang: built from main branch (b0b722e)

Here is my code:

"""
Usage: python3 srt_example_llava.py
"""
import sglang as sgl

@sgl.function
def image_qa(s, image_path, question):
    """Ask *question* about the image at *image_path*, sampling two answers via fork."""
    # Build the user turn: the image followed by the question text.
    s += sgl.user(sgl.image(image_path) + question)
    # Branch the conversation state into two parallel continuations.
    branches = s.fork(2)
    branches += sgl.assistant(sgl.gen("answer"))
    # Merge the branches back into the parent state.
    branches.join()


def single():
    """Run one request and print each forked answer on its own line."""
    state = image_qa.run(
        image_path="images/cat.png",
        question="What is this?",
        max_new_tokens=64,
    )
    for answer in state["answer"]:
        print(answer, flush=True)


def batch():
    """Run two requests as a batch and print the answers of each request."""
    requests = [
        {"image_path": "images/cat.png", "question": "What is this?"},
        {"image_path": "images/dog.png", "question": "What is this?"},
    ]
    states = image_qa.run_batch(requests, max_new_tokens=64, temperature=1.0)
    for state in states:
        for answer in state["answer"]:
            print(answer, flush=True)
        print('----------------')

if __name__ == "__main__":
    # Start the LLaVA runtime; the tokenizer comes from a separate HF repo.
    rt = sgl.Runtime(
        model_path="liuhaotian/llava-v1.5-7b",
        tokenizer_path="llava-hf/llava-1.5-7b-hf",
        chat_template="vicuna_v1.1",
    )
    # runtime = sgl.Runtime(model_path="lmsys/vicuna-7b-v1.5",chat_template="vicuna_v1.1")
    sgl.set_default_backend(rt)

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()

    rt.shutdown()

The error output:

../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [70,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [71,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [72,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [73,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [74,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [75,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [41,0,0], thread: [76,0,0] Assertion `srcIndex < srcSelectDimSize` failed.

......

Exception in ModelRpcClient:
Traceback (most recent call last):
  File "/home/LeiFeng/weiliu/sglang/python/sglang/srt/managers/router/model_rpc.py", line 176, in exposed_step
    self.forward_step()
  File "/home/LeiFeng/anaconda3/envs/jl_gllava/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/LeiFeng/weiliu/sglang/python/sglang/srt/managers/router/model_rpc.py", line 191, in forward_step
    self.forward_fill_batch(new_batch)
  File "/home/LeiFeng/weiliu/sglang/python/sglang/srt/managers/router/model_rpc.py", line 404, in forward_fill_batch
    ) = self.model_runner.forward(
  File "/home/LeiFeng/weiliu/sglang/python/sglang/srt/managers/router/model_runner.py", line 478, in forward
    return self.forward_extend_multi_modal(**kwargs)
  File "/home/LeiFeng/anaconda3/envs/jl_gllava/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/LeiFeng/weiliu/sglang/python/sglang/srt/managers/router/model_runner.py", line 455, in forward_extend_multi_modal
    return self.model.forward(
  File "/home/LeiFeng/weiliu/sglang/python/sglang/srt/models/llava.py", line 108, in forward
    .cpu()
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

This code works correctly when the image input is removed and vicuna is used for inference, so the failure appears to be specific to the multi-modal (LLaVA) code path — consistent with the traceback going through `forward_extend_multi_modal`.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions