Run Video-LLMs with as little as 0.07% of the visual tokens using learned discrete neural tokens. Supports both fixed and adaptive token budgets. Works seamlessly with LLaVA-OneVision via lmms-eval.
Goal. Compress a long video-derived token sequence t into a minimal set of tokens t' without sacrificing downstream performance.
Setting. Given a video v and query q, a video-language model (vLLM) first tokenizes the video, t = Tokenize(v), and then uses t together with q to predict an answer a.
Reduction function. A function R maps t → t' with |t'| ≪ |t| such that the vLLM’s accuracy on a remains comparable.
We also introduce the Token Information Density (TokDense) metric to quantify information retained per token and provide a formal definition covering both fixed-length and adaptive-length regimes.
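For intuition, here is a minimal, purely illustrative sketch of the reduction interface R described above, assuming a nearest-code vector-quantization scheme. The function name `reduce_tokens`, the codebook, and all tensor sizes are hypothetical; this is not the actual VQToken implementation (see the paper and the `VQToken/` directory for the real method).

# Minimal sketch of the reduction interface R: t -> t' (illustrative only,
# NOT the VQToken implementation). Assumes tokens are vector-quantized
# against a small learned codebook, so only a few discrete tokens survive.
import torch

def reduce_tokens(t: torch.Tensor, codebook: torch.Tensor) -> torch.Tensor:
    # t: (N, d) video tokens; codebook: (K, d) learned discrete codes, K << N
    dists = torch.cdist(t, codebook)   # (N, K) distance to every code
    assign = dists.argmin(dim=1)       # nearest code index per token
    used = assign.unique()             # codes actually selected
    return codebook[used]              # t': at most K tokens remain

t = torch.randn(32 * 729, 1152)        # e.g. 32 frames x 729 patch tokens (hypothetical sizes)
codebook = torch.randn(64, 1152)       # fixed budget of 64 discrete codes
t_prime = reduce_tokens(t, codebook)
print(t_prime.shape[0] / t.shape[0])   # fraction of tokens kept

In the adaptive-length regime, the number of codes actually selected would vary per video rather than being fixed to a preset budget.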
@inproceedings{zhang2025vqtoken,
  title     = {VQToken: Neural Discrete Token Representation Learning for Extreme Token Reduction in Video Large Language Models},
  author    = {Haichao Zhang and Yun Fu},
  booktitle = {NeurIPS},
  year      = {2025}
}
| Date | Status | Description |
|---|---|---|
| 2025/09/21 | ✅ Released | VQ-Token 0.5B pretrained model on Hugging Face |
| 2025/09 | ✅ Released | Testing & training code (this repo) |
| TBD | ⭕ Planned | Project website enhancements |
| TBD | ⭕ Planned | Updated Hugging Face model card README |
# clone the repo
git clone https://github.com/Hai-chao-Zhang/VQToken.git
cd VQToken
# conda env
conda create -n vqtoken python=3.10 -y
conda activate vqtoken
# install lmms-eval (dev mode)
git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
cd lmms-eval
pip install -e .
cd ..
# install VQToken (train extras)
pip install -e ".[train]"
# Hugging Face cache location and access token
export HF_HOME="/path/to/your/hf/cache"
export HF_TOKEN="your_hf_token_here"
# faster model downloads via hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1
# disable NCCL P2P / InfiniBand if multi-GPU communication hangs on your setup
export NCCL_P2P_DISABLE="1"
export NCCL_IB_DISABLE="1"
# evaluate the VQ-Token 0.5B checkpoint on ActivityNet-QA with lmms-eval
PRETRAIN=haichaozhang/VQ-Token-llava-ov-0.5b
CUDA_VISIBLE_DEVICES=2 accelerate launch --num_processes=1 --main_process_port 29509 \
  -m lmms_eval \
  --model llava_onevision_vqtoken \
  --model_args pretrained=$PRETRAIN,conv_template=qwen_1_5,model_name=llava_qwen \
  --tasks activitynetqa --batch_size 1 \
  --log_samples --log_samples_suffix llava_onevision \
  --output_path ./logs_new/
import copy

import numpy as np
import torch
from decord import VideoReader, cpu

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token
from llava.model.builder import load_pretrained_model
# load the VQ-Token 0.5B checkpoint (LLaVA-OneVision backbone)
tok, model, imgproc, _ = load_pretrained_model(
    "haichaozhang/VQ-Token-llava-ov-0.5b", None, "llava_qwen",
    device_map="auto", attn_implementation="sdpa", multimodal=True
)
model.eval()
# uniformly sample n frames from the video as an (n, H, W, 3) uint8 array
def frames(path, n=16):
    vr = VideoReader(path, ctx=cpu(0))
    idx = np.linspace(0, len(vr) - 1, n, dtype=int).tolist()
    return vr.get_batch(idx).asnumpy()
# sample 16 frames and preprocess them into half-precision pixel values on GPU
vid = frames("sample/demo.mp4", 16)
pix = imgproc.preprocess(vid, return_tensors="pt")["pixel_values"].half().cuda()
images = [pix]
# build the chat prompt with the image placeholder token
conv = copy.deepcopy(conv_templates["qwen_1_5"])
q = f"{DEFAULT_IMAGE_TOKEN}\nDescribe what's happening in this video."
conv.append_message(conv.roles[0], q)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
ids = tokenizer_image_token(prompt, tok, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
# per-frame (H, W) sizes, then greedy decoding over the compressed video tokens
sizes = [f.shape[:2] for f in vid]
out = model.generate(ids, images=images, image_sizes=sizes,
                     do_sample=False, temperature=0, max_new_tokens=512,
                     modalities=["video"], vis=True)
print(tok.batch_decode(out, skip_special_tokens=True)[0])
VQToken
├─ VLMEvalKit/           # VLMEvalKit evaluation
├─ VQToken/              # VQToken core code
├─ llava/                # modified from LLaVA-OneVision
├─ lmms_eval/            # lmms-eval evaluation (preferred)
├─ finetune_ov_all.sh    # training script
└─ test_vqtoken_0.5b.sh  # evaluation script for the 0.5B model