Single Card Models
1. Run the container and open a shell in it. For HABANA_VISIBLE_DEVICES, choose a single available card from 0 through 7.
# Host paths for the Hugging Face cache and datasets, mounted into the container
HFCACHE="/mnt/hf_cache"
MOUNT="/mnt/data"
VOLUME_OPTS="-v ${HFCACHE}:/hf_cache -v ${MOUNT}:/data"
# Detached, auto-restarting container on the Habana runtime
DOCKER_OPTS="-e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -d --runtime=habana --restart always"
# Forward the Hugging Face token and any proxy settings from the host environment
DOCKER_OPTS="${DOCKER_OPTS} -e HF_TOKEN=$hf_token -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy"
docker run --entrypoint /bin/bash ${DOCKER_OPTS} -e HABANA_VISIBLE_DEVICES=4 ${VOLUME_OPTS} --name oh-1.22.0 oh-1.22.0-gaudi -c "sleep infinity"
docker exec -it oh-1.22.0 bash
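Before continuing, it can help to confirm that the shell inside the container sees exactly the card you selected. hl-smi is the standard Gaudi device tool; assuming it is on the PATH in the image (it is in the stock Gaudi base images), only the exposed device should be listed:
hl-smi
# Expect a single HPU entry; if all eight cards appear, HABANA_VISIBLE_DEVICES
# was not picked up by the Habana container runtime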
2. Build measurement files for single-card models - this needs to be run once per model.
- Change the value of model_name to the model you want to run
- Set world_size to the number of HPUs recommended in the table above (1 for single-card models)
export model_name=meta-llama/Llama-3.1-8B-Instruct
export world_size=1
export PT_HPU_LAZY_MODE=1
export HF_TOKEN=<YOUR_TOKEN_HERE>
export HF_DATASETS_TRUST_REMOTE_CODE=true
export TQDM_DISABLE=1
# Measurement-mode config: records max-abs activation statistics for FP8 calibration
export QUANT_CONFIG=/root/optimum-habana/examples/text-generation/quantization_config/maxabs_measure.json
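QUANT_CONFIG points the quantization hooks at a measurement-mode JSON file. To see what the run will do before launching it, inspect the file; the exact keys differ between releases, so the contents sketched below are illustrative only:
cat ${QUANT_CONFIG}
# Illustrative shape of a maxabs measurement config:
# {
#   "method": "HOOKS",
#   "mode": "MEASURE",
#   "observer": "maxabs",
#   "dump_stats_path": "./hqt_output/measure"
# }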
cd /root/optimum-habana/examples/text-generation/
python3 run_lm_eval.py \
  -o acc_llama_quant.json \
  --model_name_or_path ${model_name} \
  --warmup 0 \
  --flash_attention_causal_mask \
  --attn_softmax_bf16 \
  --use_hpu_graphs \
  --trim_logits \
  --use_kv_cache \
  --bf16 \
  --batch_size 1 \
  --bucket_size=128 \
  --bucket_internal \
  --trust_remote_code \
  --tasks hellaswag lambada_openai piqa winogrande \
  --use_flash_attention \
  --flash_attention_recompute
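Two artifacts should exist when the measurement run completes: the accuracy report requested with -o, and the calibration statistics written to the dump_stats_path from the config (./hqt_output with the shipped file). A quick sanity check before moving on to step 3:
ls -lh acc_llama_quant.json hqt_output/
# hqt_output/ should contain the measurement files; if it is empty, the
# calibration pass did not complete and the quantized benchmark will fail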
3. Run the benchmark for single-card models (world_size=1)
# Quantization-mode config: applies the scales collected during the measurement run
export QUANT_CONFIG=/root/optimum-habana/examples/text-generation/quantization_config/maxabs_quant.json
export input_len=128
export output_len=128
export batch_size=1536
export world_size=1
cd /root/optimum-habana/examples/text-generation/
python3 run_generation.py \
  --model_name_or_path ${model_name} \
  --attn_softmax_bf16 \
  --trim_logits \
  --warmup 2 \
  --use_kv_cache \
  --use_hpu_graphs \
  --limit_hpu_graphs \
  --bucket_size=128 \
  --bucket_internal \
  --attn_batch_split 2 \
  --bf16 \
  --flash_attention_causal_mask \
  --use_flash_attention \
  --flash_attention_recompute \
  --batch_size ${batch_size} \
  --max_new_tokens ${output_len} \
  --max_input_tokens ${input_len}
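The benchmark prints its throughput summary to stdout at the end of the run. For sweeps over input_len, output_len, and batch_size it is handy to capture the output to a log; the exact wording of the summary line can vary between optimum-habana releases, so treat the pattern below as a sketch:
# Append to the run_generation.py command above: 2>&1 | tee benchmark.log
grep -i throughput benchmark.log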