AWQ quantization: accuracy drops by about 6 points; per-request inference latency drops from 0.447 s to 0.4 s.
In the llamafactory environment, install AutoAWQ:
pip install autoawq
Quantization code:
def qu_awq():
    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer
    import json

    model_path = "model_path"
    quant_path = "awq_model_path"
    calib_data = "_quantize.json"
    # 4-bit weights, group size 128, zero-point quantization, GEMM kernels
    quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

    # Load model and tokenizer (safetensors/device_map belong to the model load,
    # not to AutoTokenizer)
    model = AutoAWQForCausalLM.from_pretrained(
        model_path, safetensors=True, device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # The pattern of the calibration data
    """ # Example
    msg = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": "Tell me who you are."},
        {"role": "assistant", "content": "I am a large language model named Qwen..."},
    ]
    data = []
    for msg in dataset:
        text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
        data.append(text.strip())
    return data
    """

    # !!!!!!!!! Customize the code here for calib_data processing !!!!!!!!!!!!!!
    # Option A: calib file is JSONL, one {"messages": [...]} object per line;
    # render each conversation with the chat template.
    def data_gen():
        data = []
        with open(calib_data, "r", encoding="utf-8") as file:
            for line in file:
                msg = json.loads(line)["messages"]
                text = tokenizer.apply_chat_template(
                    msg, tokenize=False, add_generation_prompt=False
                )
                data.append(text.strip())
        return data
    # !!!!!!!!! Customize the code here for calib_data processing !!!!!!!!!!!!!!

    # Option B (used below): calib file is a JSON array of {"text": ...} records.
    # Switch to json_data = data_gen() if your file is chat-format JSONL.
    with open(calib_data, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    json_data = [each["text"] for each in json_data]

    # Quantize
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        calib_data=json_data,
        n_parallel_calib_samples=1,
        max_calib_samples=256,
        max_calib_seq_len=1024,
    )

    # Save quantized model
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)
    print(f'Model is quantized and saved at "{quant_path}"')

qu_awq()
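Before serving with vLLM, the quantized checkpoint can be sanity-checked locally. A minimal sketch, assuming a single CUDA device and the awq_model_path saved above; it uses AutoAWQ's from_quantized loader and a plain generate call:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "awq_model_path"  # path the script above saved to

model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

msgs = [{"role": "user", "content": "Tell me who you are."}]
prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))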
Inference (vLLM OpenAI-compatible server):
#!/bin/bash
# XFORMERS is about 10 ms faster than FLASH_ATTN here
#export VLLM_ATTENTION_BACKEND=XFORMERS # use on older machines
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
source /opt/conda/etc/profile.d/conda.sh
conda activate /opt/conda/envs/vllm085
Model_path="/llm/models/general_knowledge_agent_router/general_knowledge_agent_202250820_v21_01_awq5"
#Model_path="/llm/models/Qwen3-4B-Instruct-2507"
CUDA_VISIBLE_DEVICES=0 nohup python -m vllm.entrypoints.openai.api_server \
--model ${Model_path} \
--served-model-name 'qwen3_4b' \
--host 0.0.0.0 \
--port 9005 \
--max-model-len 9000 \
--trust-remote-code \
--device cuda \
--tensor-parallel-size 1 \
--swap-space 0 \
--quantization awq \
--dtype float16 \
--gpu-memory-utilization 0.7 \
--max-num-seqs 1 > eval_qwen3_quant.log 2>&1 &
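Once the server is up, a single timed request against the OpenAI-compatible endpoint both verifies the deployment and gives a latency number comparable to the ~0.4 s figure above. A minimal sketch, assuming the host/port and served-model-name from the script above (the prompt is a placeholder):

import time
import requests

url = "http://localhost:9005/v1/chat/completions"
payload = {
    "model": "qwen3_4b",  # matches --served-model-name
    "messages": [{"role": "user", "content": "Tell me who you are."}],
    "max_tokens": 64,
    "temperature": 0,
}

t0 = time.perf_counter()
resp = requests.post(url, json=payload, timeout=60)
elapsed = time.perf_counter() - t0

resp.raise_for_status()
print(f"latency: {elapsed:.3f}s")
print(resp.json()["choices"][0]["message"]["content"])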