# Environment setup: create the conda environment, then install ms-swift and
# its inference/training dependencies. Run the commands in order.

# Use conda's fuzzy match (`=`) so the newest 3.10.x patch release is selected;
# `==` is an exact match and would pin the bare version `3.10`.
conda create -n gltrack python=3.10
conda activate gltrack

# Install ms-swift from the local checkout in editable mode.
cd ms-swift
conda install -c conda-forge pyarrow sentencepiece
pip install -e .

# Inference backends and pinned framework versions.
pip install "sglang[all]" -U
pip install "vllm>=0.5.1" "transformers<4.55" "trl<0.21" -U
pip install "lmdeploy>=0.5" -U

# Quantization and UI helpers; --no-deps installs autoawq without its dependencies.
pip install autoawq -U --no-deps
pip install auto_gptq optimum bitsandbytes "gradio<5.33" -U

# NOTE(review): this installs ms-swift from upstream GitHub after the editable
# install above and may replace the local checkout — confirm this is intended.
pip install git+https://github.com/modelscope/ms-swift.git

pip install timm -U
pip install "deepspeed" -U

# --no-build-isolation: the build uses packages already present in the
# environment instead of an isolated build environment.
pip install flash-attn==2.7.4.post1 --no-build-isolation

# Multimodal (video/audio) utilities.
conda install av -c conda-forge
pip install qwen_vl_utils qwen_omni_utils decord librosa icecream soundfile -U

# Kernels, monitoring, and developer tooling.
pip install liger_kernel nvitop pre-commit math_verify py-spy -U
|-- data
│   ├── tnl2k
│   │   ├── test
│   │   │   ├── advSamp_Baseball_game_002-Done
│   │   │   └── ...
│   │   └── train
│   │       ├── Arrow_Video_ZZ04_done
│   │       └── ...
│   └── tnllt
│       ├── JE_Assian_ship_v01
│       └── ...
bash data_preparation.sh
bash train.sh
bash infer.sh
You can download it from HuggingFace: VPTracker
This code is developed on top of ms-swift.
Email: jcwang@stu.ecnu.edu.cn. Any kind of discussion is welcome!
If our work is useful for your research, please consider citing:
@misc{wang2025vptrackerglobalvisionlanguagetracking,
title={VPTracker: Global Vision-Language Tracking via Visual Prompt and MLLM},
author={Jingchao Wang and Kaiwen Zhou and Zhijian Wu and Kunhua Ji and Dingjiang Huang and Yefeng Zheng},
year={2025},
eprint={2512.22799},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2512.22799},
}