
Commit 778b61c

sywangyi and regisss authored
[gaudi] Remove unnecessary reinitialize to HeterogeneousNextTokenChooser to make sampling output correct (#3284)
Signed-off-by: Wang, Yi A <[email protected]> Co-authored-by: regisss <[email protected]>
1 parent 3d2e7c8 commit 778b61c

File tree

4 files changed: +95, -336 lines


backends/gaudi/examples/docker_commands/docker_commands.md

Lines changed: 12 additions & 183 deletions
````diff
@@ -19,11 +19,7 @@ docker run -p 8080:80 \
  --ipc=host \
  -v $volume:/data \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
  --max-input-tokens 1024 --max-total-tokens 2048 \
  --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -43,60 +39,7 @@ docker run -p 8080:80 \
  --ipc=host \
  -v $volume:/data \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --sharded true --num-shard 8 \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 4096 --max-batch-size 256 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
- ```
-
- ### Llama2-7B on 1 Card (BF16)
-
- ```bash
- model=meta-llama/Llama-2-7b-chat-hf
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 2048 --max-batch-size 32 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
- ```
-
- ### Llama2-70B on 8 cards (BF16)
-
- ```bash
- model=meta-llama/Llama-2-70b-chat-hf
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
  --sharded true --num-shard 8 \
  --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -115,49 +58,20 @@ docker run -p 8080:80 \
  --cap-add=sys_nice \
  --ipc=host \
  -v $volume:/data \
- -e PREFILL_BATCH_BUCKET_SIZE=1 \
- -e BATCH_BUCKET_SIZE=1 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
  --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
  --max-total-tokens 8192 --max-batch-size 4
  ```

  ## FP8 Precision

- Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details. You need to measure the statistics of the model first before running the model in FP8 precision.
-
- ## Llama3.1-8B on 1 Card (FP8)
-
- ```bash
- model=meta-llama/Meta-Llama-3.1-8B-Instruct
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 2048 --max-batch-size 32 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
- ```
+ You could also set kv cache dtype to FP8 when launching the server, fp8_e4m3fn is supported in Gaudi

- ## Llama3.1-70B on 8 cards (FP8)
+ ## Llama3-8B on 1 Card (FP8)

  ```bash
- model=meta-llama/Meta-Llama-3.1-70B-Instruct
+ model=RedHatAI/Meta-Llama-3-8B-Instruct-FP8-KV
  hf_token=YOUR_ACCESS_TOKEN
  volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

@@ -166,53 +80,19 @@ docker run -p 8080:80 \
  --cap-add=sys_nice \
  --ipc=host \
  -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --sharded true --num-shard 8 \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 4096 --max-batch-size 256 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
- ```
-
- ## Llama2-7B on 1 Card (FP8)
-
- ```bash
- model=meta-llama/Llama-2-7b-chat-hf
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
+ --kv-cache-dtype fp8_e4m3fn \
  --max-input-tokens 1024 --max-total-tokens 2048 \
  --max-batch-prefill-tokens 2048 --max-batch-size 32 \
  --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
  ```

- ## Llama2-70B on 8 Cards (FP8)
+ ## Llama3-70B on 8 cards (FP8)

  ```bash
- model=meta-llama/Llama-2-70b-chat-hf
+ model=RedHatAI/Meta-Llama-3-70B-Instruct-FP8
  hf_token=YOUR_ACCESS_TOKEN
  volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

@@ -221,63 +101,12 @@ docker run -p 8080:80 \
  --cap-add=sys_nice \
  --ipc=host \
  -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
+ --kv-cache-dtype fp8_e4m3fn \
  --sharded true --num-shard 8 \
  --max-input-tokens 1024 --max-total-tokens 2048 \
  --max-batch-prefill-tokens 4096 --max-batch-size 256 \
  --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
  ```
-
- ## Llava-v1.6-Mistral-7B on 1 Card (FP8)
-
- ```bash
- model=llava-hf/llava-v1.6-mistral-7b-hf
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e PREFILL_BATCH_BUCKET_SIZE=1 \
- -e BATCH_BUCKET_SIZE=1 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
- --max-total-tokens 8192 --max-batch-size 4
- ```
-
- ## Llava-v1.6-Mistral-7B on 8 Cards (FP8)
-
- ```bash
- model=llava-hf/llava-v1.6-mistral-7b-hf
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e PREFILL_BATCH_BUCKET_SIZE=1 \
- -e BATCH_BUCKET_SIZE=1 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --sharded true --num-shard 8 \
- --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
- --max-total-tokens 8192 --max-batch-size 4
- ```
````
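
The documentation diff above bumps the image tag from `3.1.1-gaudi` to `3.3.4-gaudi`, removes the bucketing/padding environment variables from the examples, and replaces the measurement-based FP8 examples with prequantized `RedHatAI` checkpoints plus `--kv-cache-dtype fp8_e4m3fn`. Once one of these servers is up, a quick way to exercise the sampling path this commit fixes is to post a request with sampling parameters. A minimal sketch, assuming a server started with one of the commands above is listening on `localhost:8080`; the prompt and sampling values are placeholders, and the request shape follows TGI's `/generate` API:

```python
# Minimal sketch: exercise the sampling path of a TGI server started with one of
# the commands above. The URL and prompt are assumptions for illustration; the
# payload follows TGI's /generate API (do_sample, temperature, top_p, seed).
import requests

TGI_URL = "http://localhost:8080/generate"  # default mapping from `-p 8080:80`

payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {
        "max_new_tokens": 64,
        "do_sample": True,   # take the sampling path this commit fixes
        "temperature": 0.7,
        "top_p": 0.9,
        "seed": 42,          # fixed seed so repeated calls are comparable
    },
}

response = requests.post(TGI_URL, json=payload, timeout=120)
response.raise_for_status()
print(response.json()["generated_text"])
```

With a fixed `seed`, repeated calls should return the same text, which makes it easy to compare sampled output before and after this change.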

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 5 additions & 7 deletions
````diff
@@ -140,12 +140,6 @@ def __init__(
          self.hidden_size = config.hidden_size
          self.head_size = self.hidden_size // self.num_heads

-         # Setting defaults for baichuan custom config which doesn't apply them.
-         config.rope_theta = getattr(config, "rope_theta", 10000)
-         config.num_key_value_heads = getattr(
-             config, "num_key_value_heads", config.num_attention_heads
-         )
-
          self.rotary_emb = rotary_emb

          # `config.attention_multiplier` is used in Granite
@@ -476,7 +470,11 @@ def __init__(self, prefix, config, weights):
          # Skip fp8 quant for first and last layers
          self.layers = nn.ModuleList()
          self.cross_attention_layers = getattr(config, "cross_attention_layers", [])
-
+         # Setting defaults for baichuan custom config which doesn't apply them.
+         config.rope_theta = getattr(config, "rope_theta", 10000)
+         config.num_key_value_heads = getattr(
+             config, "num_key_value_heads", config.num_attention_heads
+         )
          rotary_emb = PositionRotaryEmbedding.static(
              config=config,
              dim=config.hidden_size // config.num_attention_heads,
````
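
The modeling change relocates the Baichuan config defaulting from the attention layer's `__init__` to the model's `__init__`, so `rope_theta` and `num_key_value_heads` are filled in once, before the shared `PositionRotaryEmbedding.static(...)` is constructed, rather than per layer after the rotary embedding already exists. The pattern itself is plain `getattr` defaulting on the config object; a minimal, self-contained sketch, where `apply_baichuan_defaults` and `SimpleNamespace` are illustrative stand-ins rather than the real TGI/transformers names:

```python
# Illustrative sketch of the getattr-defaulting pattern moved in this commit.
# SimpleNamespace stands in for the real transformers config object.
from types import SimpleNamespace


def apply_baichuan_defaults(config):
    """Fill in fields that Baichuan-style custom configs omit."""
    config.rope_theta = getattr(config, "rope_theta", 10000)
    config.num_key_value_heads = getattr(
        config, "num_key_value_heads", config.num_attention_heads
    )
    return config


# A config that only defines the attention head count.
cfg = SimpleNamespace(num_attention_heads=32, hidden_size=4096)
apply_baichuan_defaults(cfg)
assert cfg.rope_theta == 10000
assert cfg.num_key_value_heads == 32
```

The values themselves are unchanged by the commit; only where they are applied moves, so every consumer of the config sees the same defaults.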

backends/gaudi/server/text_generation_server/models/flash_causal_lm.py

Lines changed: 36 additions & 33 deletions
````diff
@@ -1076,22 +1076,23 @@ def prepare_for_decode(
          (0, padded_bs - self.cache_lengths_tensor.shape[0]),
          value=0,
      )
-     next_token_chooser_parameters = []
-     next_token_chooser_parameters.extend([r.parameters for r in self.requests])
-     pad_next_token_chooser_parameters(next_token_chooser_parameters, padded_bs)
-     # update past grammar states
-     fsm_grammar_states = [0] * padded_bs
-
-     for i, req in enumerate(self.requests):
-         fsm_grammar_states[i] = self.next_token_chooser.fsm_grammar_states[i]
-
-     self.next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
-         next_token_chooser_parameters,
-         self.next_token_chooser.dtype,
-         self.next_token_chooser.device,
-         self.next_token_chooser.tokenizer,
-         fsm_grammar_states,
-     )
+     if len(self.next_token_chooser.do_sample) != padded_bs:
+         next_token_chooser_parameters = []
+         next_token_chooser_parameters.extend([r.parameters for r in self.requests])
+         pad_next_token_chooser_parameters(next_token_chooser_parameters, padded_bs)
+         # update past grammar states
+         fsm_grammar_states = [0] * padded_bs
+
+         for i, req in enumerate(self.requests):
+             fsm_grammar_states[i] = self.next_token_chooser.fsm_grammar_states[i]
+
+         self.next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
+             next_token_chooser_parameters,
+             self.next_token_chooser.dtype,
+             self.next_token_chooser.device,
+             self.next_token_chooser.tokenizer,
+             fsm_grammar_states,
+         )

  def prepare_for_prefill(
      self, max_padded_input_len, max_padded_bs, max_total_tokens, pad_token_id
@@ -1379,23 +1380,25 @@ def prepare_for_prefill(
              self.all_input_ids_tensor[i]
          )
      self.all_input_ids_tensor = all_input_ids_tensor
-
-     next_token_chooser_parameters = []
-     next_token_chooser_parameters.extend([r.parameters for r in self.requests])
-     pad_next_token_chooser_parameters(next_token_chooser_parameters, max_padded_bs)
-     # update past grammar states
-     fsm_grammar_states = [0] * max_padded_bs
-
-     for i, req in enumerate(self.requests):
-         fsm_grammar_states[i] = self.next_token_chooser.fsm_grammar_states[i]
-
-     self.next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
-         next_token_chooser_parameters,
-         self.next_token_chooser.dtype,
-         self.next_token_chooser.device,
-         self.next_token_chooser.tokenizer,
-         fsm_grammar_states,
-     )
+     if len(self.next_token_chooser.do_sample) != max_padded_bs:
+         next_token_chooser_parameters = []
+         next_token_chooser_parameters.extend([r.parameters for r in self.requests])
+         pad_next_token_chooser_parameters(
+             next_token_chooser_parameters, max_padded_bs
+         )
+         # update past grammar states
+         fsm_grammar_states = [0] * max_padded_bs
+
+         for i, req in enumerate(self.requests):
+             fsm_grammar_states[i] = self.next_token_chooser.fsm_grammar_states[i]
+
+         self.next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
+             next_token_chooser_parameters,
+             self.next_token_chooser.dtype,
+             self.next_token_chooser.device,
+             self.next_token_chooser.tokenizer,
+             fsm_grammar_states,
+         )

  if ADAPTER_TO_INDEX:
      if adapter_set:
````
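
This is the substantive fix behind the commit title: `prepare_for_decode` and `prepare_for_prefill` used to rebuild `HeterogeneousNextTokenChooser` from the request parameters on every call, and the commit guards that rebuild so it only happens when the padded batch size no longer matches `len(self.next_token_chooser.do_sample)`. Presumably the unconditional rebuild was discarding per-request sampling state on every step, which is what made sampling output incorrect. A minimal, self-contained sketch of the guard pattern, with hypothetical names (`SamplingChooser`, `maybe_rebuild_chooser`) standing in for the real TGI classes:

```python
# Minimal sketch of the guard introduced above: only rebuild a stateful
# "chooser" when the padded batch size it was built for no longer matches.
# SamplingChooser and maybe_rebuild_chooser are illustrative, not TGI names.
import random
from dataclasses import dataclass, field
from typing import List


@dataclass
class SamplingChooser:
    do_sample: List[bool]
    # Per-request RNG state; rebuilding the chooser from scratch resets it.
    generators: List[random.Random] = field(default_factory=list)

    def __post_init__(self):
        self.generators = [random.Random(i) for i in range(len(self.do_sample))]


def maybe_rebuild_chooser(chooser: SamplingChooser, padded_bs: int) -> SamplingChooser:
    # Mirrors the guard added in prepare_for_decode / prepare_for_prefill:
    # reuse the existing chooser unless the padded batch size changed.
    if len(chooser.do_sample) != padded_bs:
        chooser = SamplingChooser(do_sample=[True] * padded_bs)
    return chooser


chooser = SamplingChooser(do_sample=[True, True])
assert maybe_rebuild_chooser(chooser, padded_bs=2) is chooser      # reused: state preserved
assert maybe_rebuild_chooser(chooser, padded_bs=4) is not chooser  # batch grew: rebuilt
```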
