Skip to content

Commit a4dcd65

Browse files
[BFCL] Add Support for Fully Offline Model Inference via --local-model-path (ShishirPatil#985)
Resolve ShishirPatil#968 - **New CLI flag**: `--local-model-path` added to generate command - **Validation logic**: Checks for required files (`config.json`, `tokenizer_config.json`) in the model path - Updated model loading: - `AutoTokenizer` and `AutoConfig` now use `local_files_only=True` when loading local models - Server subprocesses (vLLM / SGLang) now reference `model_path_or_id` - Inference calls to vLLM use `model_path_or_id` instead of `model_name_huggingface` to avoid mismatches --------- Co-authored-by: Huanzhi Mao <huanzhimao@gmail.com>
1 parent 9108a65 commit a4dcd65

File tree

5 files changed

+69
-22
lines changed

5 files changed

+69
-22
lines changed

berkeley-function-call-leaderboard/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.
44

5+
- [Apr 14, 2025] [#985](https://github.com/ShishirPatil/gorilla/pull/985): Support fully offline inference via the `--local-model-path` flag. Point it to a directory that already holds the model's files (`config.json`, tokenizer, weights, etc.); use this flag only when the model has been pre-downloaded outside the default `$HF_HOME` cache.
56
- [Apr 13, 2025] [#980](https://github.com/ShishirPatil/gorilla/pull/980): Integrate Novita AI as a third-party inference provider for the following open-source models:
67
- `Llama-4-Maverick-17B-128E-Instruct-FP8` (Prompt & FC)
78
- `Llama-4-Scout-17B-16E-Instruct` (Prompt & FC)

berkeley-function-call-leaderboard/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,18 @@ bfcl generate --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1
127127
#### For Locally-hosted OSS Models
128128

129129
```bash
130-
bfcl generate --model MODEL_NAME --test-category TEST_CATEGORY --backend {vllm|sglang} --num-gpus 1 --gpu-memory-utilization 0.9
130+
bfcl generate \
131+
--model MODEL_NAME \
132+
--test-category TEST_CATEGORY \
133+
--backend {vllm|sglang} \
134+
--num-gpus 1 \
135+
--gpu-memory-utilization 0.9 \
136+
--local-model-path /path/to/local/model # ← optional
131137
```
132138

133139
- Choose your backend using `--backend vllm` or `--backend sglang`. The default backend is `vllm`.
134140
- Control GPU usage by adjusting `--num-gpus` (default `1`, relevant for multi-GPU tensor parallelism) and `--gpu-memory-utilization` (default `0.9`), which can help avoid out-of-memory errors.
141+
- `--local-model-path` (optional): Point this flag at a directory that already contains the model’s files (`config.json`, tokenizer, weights, etc.). Use it only when you’ve pre‑downloaded the model and the weights live somewhere other than the default `$HF_HOME` cache.
135142

136143
##### For Pre-existing OpenAI-compatible Endpoints
137144

berkeley-function-call-leaderboard/bfcl/__main__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import csv
22
from datetime import datetime
3+
import os
34
from types import SimpleNamespace
4-
from typing import List
5+
from typing import List, Optional
56

67
import typer
78
from bfcl._llm_response_generation import main as generation_main
@@ -118,6 +119,11 @@ def generate(
118119
"--skip-server-setup",
119120
help="Skip vLLM/SGLang server setup and use existing endpoint specified by the VLLM_ENDPOINT and VLLM_PORT environment variables.",
120121
),
122+
local_model_path: Optional[str] = typer.Option(
123+
None,
124+
"--local-model-path",
125+
help="Specify the path to a local directory containing the model's config/tokenizer/weights for fully offline inference. Use this only if the model weights are stored in a location other than the default HF_HOME directory.",
126+
),
121127
result_dir: str = typer.Option(
122128
RESULT_PATH,
123129
"--result-dir",
@@ -150,6 +156,7 @@ def generate(
150156
gpu_memory_utilization=gpu_memory_utilization,
151157
backend=backend,
152158
skip_server_setup=skip_server_setup,
159+
local_model_path=local_model_path,
153160
result_dir=result_dir,
154161
allow_overwrite=allow_overwrite,
155162
run_ids=run_ids,

berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,7 @@
1818
from bfcl.eval_checker.eval_runner_helper import load_file
1919
from bfcl.model_handler.handler_map import HANDLER_MAP
2020
from bfcl.model_handler.model_style import ModelStyle
21-
from bfcl.utils import (
22-
is_multi_turn,
23-
parse_test_category_argument,
24-
sort_key,
25-
)
21+
from bfcl.utils import is_multi_turn, parse_test_category_argument, sort_key
2622
from tqdm import tqdm
2723

2824
RETRY_LIMIT = 3
@@ -55,7 +51,15 @@ def get_args():
5551
default=False,
5652
help="Skip vLLM/SGLang server setup and use existing endpoint specified by the VLLM_ENDPOINT and VLLM_PORT environment variables."
5753
)
54+
# Optional local model path
55+
parser.add_argument(
56+
"--local-model-path",
57+
type=str,
58+
default=None,
59+
help="Specify the path to a local directory containing the model's config/tokenizer/weights for fully offline inference. Use this only if the model weights are stored in a location other than the default HF_HOME directory.",
60+
)
5861
args = parser.parse_args()
62+
5963
return args
6064

6165

@@ -226,6 +230,7 @@ def generate_results(args, model_name, test_cases_total):
226230
gpu_memory_utilization=args.gpu_memory_utilization,
227231
backend=args.backend,
228232
skip_server_setup=args.skip_server_setup,
233+
local_model_path=args.local_model_path,
229234
include_input_log=args.include_input_log,
230235
exclude_state_log=args.exclude_state_log,
231236
result_dir=args.result_dir,

berkeley-function-call-leaderboard/bfcl/model_handler/local_inference/base_oss_handler.py

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
1+
import os
12
import subprocess
23
import threading
34
import time
4-
import os
55
from concurrent.futures import ThreadPoolExecutor
6+
from typing import Optional
67

78
import requests
8-
from bfcl.constants.eval_config import (
9-
RESULT_PATH,
10-
VLLM_PORT,
11-
)
9+
from bfcl.constants.eval_config import RESULT_PATH, VLLM_PORT
1210
from bfcl.model_handler.base_handler import BaseHandler
1311
from bfcl.model_handler.model_style import ModelStyle
1412
from bfcl.model_handler.utils import (
@@ -28,6 +26,11 @@ def __init__(self, model_name, temperature, dtype="bfloat16") -> None:
2826
self.model_name_huggingface = model_name
2927
self.model_style = ModelStyle.OSSMODEL
3028
self.dtype = dtype
29+
30+
# Might be overridden in batch_inference method if local_model_path is provided
31+
# Used to indicate where the tokenizer and config should be loaded from
32+
self.model_path_or_id = self.model_name_huggingface
33+
3134
# Read from env vars with fallbacks
3235
self.vllm_host = os.getenv("VLLM_ENDPOINT", "localhost")
3336
self.vllm_port = os.getenv("VLLM_PORT", VLLM_PORT)
@@ -63,6 +66,7 @@ def batch_inference(
6366
gpu_memory_utilization: float,
6467
backend: str,
6568
skip_server_setup: bool,
69+
local_model_path: Optional[str],
6670
include_input_log: bool,
6771
exclude_state_log: bool,
6872
update_mode: bool,
@@ -73,13 +77,36 @@ def batch_inference(
7377
"""
7478
from transformers import AutoConfig, AutoTokenizer
7579

76-
self.tokenizer = AutoTokenizer.from_pretrained(
77-
self.model_name_huggingface, trust_remote_code=True
78-
)
80+
# Determine the model source
81+
if local_model_path is not None:
82+
# Validate the local_model_path
83+
if not os.path.isdir(local_model_path):
84+
raise ValueError(
85+
f"local_model_path '{local_model_path}' does not exist or is not a directory."
86+
)
87+
88+
required_files = ["config.json", "tokenizer_config.json"]
89+
for file_name in required_files:
90+
if not os.path.exists(os.path.join(local_model_path, file_name)):
91+
raise ValueError(
92+
f"Required file '{file_name}' not found in local_model_path '{local_model_path}'."
93+
)
94+
95+
self.model_path_or_id = local_model_path
96+
load_kwargs = {
97+
"pretrained_model_name_or_path": self.model_path_or_id,
98+
"local_files_only": True,
99+
"trust_remote_code": True,
100+
}
101+
else:
102+
load_kwargs = {
103+
"pretrained_model_name_or_path": self.model_path_or_id,
104+
"trust_remote_code": True,
105+
}
106+
107+
self.tokenizer = AutoTokenizer.from_pretrained(**load_kwargs)
108+
config = AutoConfig.from_pretrained(**load_kwargs)
79109

80-
config = AutoConfig.from_pretrained(
81-
self.model_name_huggingface, trust_remote_code=True
82-
)
83110
if hasattr(config, "max_position_embeddings"):
84111
self.max_context_length = config.max_position_embeddings
85112
elif self.tokenizer.model_max_length is not None:
@@ -97,7 +124,7 @@ def batch_inference(
97124
[
98125
"vllm",
99126
"serve",
100-
str(self.model_name_huggingface),
127+
str(self.model_path_or_id),
101128
"--port",
102129
str(self.vllm_port),
103130
"--dtype",
@@ -120,7 +147,7 @@ def batch_inference(
120147
"-m",
121148
"sglang.launch_server",
122149
"--model-path",
123-
str(self.model_name_huggingface),
150+
str(self.model_path_or_id),
124151
"--port",
125152
str(self.vllm_port),
126153
"--dtype",
@@ -311,7 +338,7 @@ def _query_prompting(self, inference_data: dict):
311338
start_time = time.time()
312339
if len(extra_body) > 0:
313340
api_response = self.client.completions.create(
314-
model=self.model_name_huggingface,
341+
model=self.model_path_or_id,
315342
temperature=self.temperature,
316343
prompt=formatted_prompt,
317344
max_tokens=leftover_tokens_count,
@@ -320,7 +347,7 @@ def _query_prompting(self, inference_data: dict):
320347
)
321348
else:
322349
api_response = self.client.completions.create(
323-
model=self.model_name_huggingface,
350+
model=self.model_path_or_id,
324351
temperature=self.temperature,
325352
prompt=formatted_prompt,
326353
max_tokens=leftover_tokens_count,

0 commit comments

Comments
 (0)