From 4be8f2aa60d3c0cec2741bfec88d42fad6af1542 Mon Sep 17 00:00:00 2001 From: Johan Stenberg Date: Mon, 2 Oct 2023 21:43:24 -0700 Subject: [PATCH 01/10] Initial prototype v1 openai support for azure --- src/openai/azure.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/openai/azure.py diff --git a/src/openai/azure.py b/src/openai/azure.py new file mode 100644 index 0000000000..966c9c06ae --- /dev/null +++ b/src/openai/azure.py @@ -0,0 +1,38 @@ +import openai + +class TokenCredential: + + def __init__(self): + import azure.identity + self._credential = azure.identity.DefaultAzureCredential() + + def get_token(self): + return self._credential.get_token('https://cognitiveservices.azure.com/.default').token + + +class AzureClient(openai.Client): + + def __init__(self, *args, deployment: str, credential: TokenCredential | None, api_version: str = '2023-03-15-preview', **kwargs): + default_query = kwargs.get('default_query', {}) + default_query.setdefault('api-version', api_version) + kwargs['default_query'] = default_query + self.credential = credential + if credential: + kwargs['api_key'] = 'Placeholder: AAD' + super().__init__(*args, **kwargs) + self.deployment = deployment + + @property + def auth_headers(self) -> dict[str, str]: + if self.credential: + return { 'Authorization': f'Bearer {self.credential.get_token()}'} + return {"api-key": self.api_key} + + + def request(self, *args, **kwargs): + if self.deployment: + args = list(args) + options = args[1] if len(args) >= 2 else kwargs.get('options') + options.url = f'openai/deployments/{self.deployment}' + options.url + return super().request(*args, **kwargs) + From ec76eed7aef29f458410de30e3123b08c4e0f176 Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Wed, 4 Oct 2023 11:22:29 -0700 Subject: [PATCH 02/10] make AzureClient pass tests (except audio) (#8) * make AzureClient pass tests (except audio) * async client edits --- src/openai/__init__.py | 3 + src/openai/azure.py | 140 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 6 deletions(-) diff --git a/src/openai/__init__.py b/src/openai/__init__.py index d011d416ac..b386c35fee 100644 --- a/src/openai/__init__.py +++ b/src/openai/__init__.py @@ -18,6 +18,7 @@ AsyncStream, RequestOptions, ) +from .azure import AzureClient, AzureAsyncClient from ._version import __title__, __version__ from ._exceptions import ( APIError, @@ -66,6 +67,8 @@ "OpenAI", "AsyncOpenAI", "file_from_path", + "AzureClient", + "AzureAsyncClient" ] from .version import VERSION as VERSION diff --git a/src/openai/azure.py b/src/openai/azure.py index 966c9c06ae..d9d82e2caf 100644 --- a/src/openai/azure.py +++ b/src/openai/azure.py @@ -1,4 +1,8 @@ import openai +import time +import httpx + +TIMEOUT_SECS = 600 class TokenCredential: @@ -12,7 +16,7 @@ def get_token(self): class AzureClient(openai.Client): - def __init__(self, *args, deployment: str, credential: TokenCredential | None, api_version: str = '2023-03-15-preview', **kwargs): + def __init__(self, *args, deployment: str = "", credential: TokenCredential | None = None, api_version: str = '2023-09-01-preview', **kwargs): default_query = kwargs.get('default_query', {}) default_query.setdefault('api-version', api_version) kwargs['default_query'] = default_query @@ -27,12 +31,136 @@ def auth_headers(self) -> dict[str, str]: if self.credential: return { 'Authorization': f'Bearer {self.credential.get_token()}'} return {"api-key": self.api_key} - + + def _check_polling_response(self, 
response, predicate): + if not predicate(response): + return + error_data = response.json()['error'] + message = error_data.get('message', 'Operation failed') + code = error_data.get('code') + raise openai.OpenAIError(message=message, code=code) + + def _poll( + self, + method, + url, + until, + failed, + cast_to, + interval = None, + delay = None, + ): + if delay: + time.sleep(delay) + + opts = openai._models.FinalRequestOptions.construct(method=method, url=url) + response = super().request(cast_to, opts) + self._check_polling_response(response, failed) + start_time = time.time() + while not until(response): + if time.time() - start_time > TIMEOUT_SECS: + raise openai.Timeout("Operation polling timed out.") + + time.sleep(interval or int(response.headers.get("retry-after")) or 10) + response = super().request(cast_to, opts) + self._check_polling_response(response, failed) + + response_json = response.json() + return openai.types.ImagesResponse.construct(**response_json["result"]) def request(self, *args, **kwargs): - if self.deployment: - args = list(args) - options = args[1] if len(args) >= 2 else kwargs.get('options') - options.url = f'openai/deployments/{self.deployment}' + options.url + args = list(args) + options = args[1] if len(args) >= 2 else kwargs.get('options') + if options.url == "/images/generations": + options.url = "openai/images/generations:submit" + response = super().request(*args, **kwargs) + operation_id = response.model_extra['id'] + return self._poll( + "get", f"openai/operations/images/{operation_id}", cast_to=httpx.Response, + until=lambda response: response.json()["status"] in ["succeeded"], + failed=lambda response: response.json()["status"] in ["failed"], + ) + elif options.extra_json and options.extra_json.get("dataSources"): + model = options.json_data["model"] + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + model = options.json_data["model"] + options.url = f'openai/deployments/{model}' + options.url return super().request(*args, **kwargs) + + +class AzureAsyncClient(openai.AsyncClient): + + def __init__(self, *args, deployment: str = "", credential: TokenCredential | None = None, api_version: str = '2023-09-01-preview', **kwargs): + default_query = kwargs.get('default_query', {}) + default_query.setdefault('api-version', api_version) + kwargs['default_query'] = default_query + self.credential = credential + if credential: + kwargs['api_key'] = 'Placeholder: AAD' + super().__init__(*args, **kwargs) + self.deployment = deployment + + @property + def auth_headers(self) -> dict[str, str]: + if self.credential: + return { 'Authorization': f'Bearer {self.credential.get_token()}'} + return {"api-key": self.api_key} + + def _check_polling_response(self, response, predicate): + if not predicate(response): + return + error_data = response.json()['error'] + message = error_data.get('message', 'Operation failed') + code = error_data.get('code') + raise openai.OpenAIError(message=message, code=code) + + async def _poll( + self, + method, + url, + until, + failed, + cast_to, + interval = None, + delay = None, + ): + if delay: + time.sleep(delay) + + opts = openai._models.FinalRequestOptions.construct(method=method, url=url) + response = await super().request(cast_to, opts) + self._check_polling_response(response, failed) + start_time = time.time() + while not until(response): + if time.time() - start_time > TIMEOUT_SECS: + raise openai.Timeout("Operation polling timed out.") + + time.sleep(interval or 
int(response.headers.get("retry-after")) or 10) + response = await super().request(cast_to, opts) + self._check_polling_response(response, failed) + + response_json = response.json() + return openai.types.ImagesResponse.construct(**response_json["result"]) + + async def request(self, *args, **kwargs): + args = list(args) + options = args[1] if len(args) >= 2 else kwargs.get('options') + if options.url == "/images/generations": + options.url = "openai/images/generations:submit" + response = await super().request(*args, **kwargs) + operation_id = response.model_extra['id'] + return await self._poll( + "get", f"openai/operations/images/{operation_id}", cast_to=httpx.Response, + until=lambda response: response.json()["status"] in ["succeeded"], + failed=lambda response: response.json()["status"] in ["failed"], + ) + elif options.extra_json and options.extra_json.get("dataSources"): + model = options.json_data["model"] + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + model = options.json_data["model"] + options.url = f'openai/deployments/{model}' + options.url + return await super().request(*args, **kwargs) + From ba7f7e15c263eb07f4ba2771a8cc5189f5577d68 Mon Sep 17 00:00:00 2001 From: Johan Stenberg Date: Thu, 5 Oct 2023 12:53:09 -0700 Subject: [PATCH 03/10] Split sync and async clients into separate modules --- src/openai/azure.py | 166 ----------- src/openai/azure/__init__.py | 9 + src/openai/azure/_async_client.py | 436 +++++++++++++++++++++++++++++ src/openai/azure/_azuremodels.py | 5 + src/openai/azure/_credential.py | 13 + src/openai/azure/_sync_client.py | 440 ++++++++++++++++++++++++++++++ 6 files changed, 903 insertions(+), 166 deletions(-) delete mode 100644 src/openai/azure.py create mode 100644 src/openai/azure/__init__.py create mode 100644 src/openai/azure/_async_client.py create mode 100644 src/openai/azure/_azuremodels.py create mode 100644 src/openai/azure/_credential.py create mode 100644 src/openai/azure/_sync_client.py diff --git a/src/openai/azure.py b/src/openai/azure.py deleted file mode 100644 index d9d82e2caf..0000000000 --- a/src/openai/azure.py +++ /dev/null @@ -1,166 +0,0 @@ -import openai -import time -import httpx - -TIMEOUT_SECS = 600 - -class TokenCredential: - - def __init__(self): - import azure.identity - self._credential = azure.identity.DefaultAzureCredential() - - def get_token(self): - return self._credential.get_token('https://cognitiveservices.azure.com/.default').token - - -class AzureClient(openai.Client): - - def __init__(self, *args, deployment: str = "", credential: TokenCredential | None = None, api_version: str = '2023-09-01-preview', **kwargs): - default_query = kwargs.get('default_query', {}) - default_query.setdefault('api-version', api_version) - kwargs['default_query'] = default_query - self.credential = credential - if credential: - kwargs['api_key'] = 'Placeholder: AAD' - super().__init__(*args, **kwargs) - self.deployment = deployment - - @property - def auth_headers(self) -> dict[str, str]: - if self.credential: - return { 'Authorization': f'Bearer {self.credential.get_token()}'} - return {"api-key": self.api_key} - - def _check_polling_response(self, response, predicate): - if not predicate(response): - return - error_data = response.json()['error'] - message = error_data.get('message', 'Operation failed') - code = error_data.get('code') - raise openai.OpenAIError(message=message, code=code) - - def _poll( - self, - method, - url, - until, - failed, - cast_to, - interval = None, - delay = None, - 
): - if delay: - time.sleep(delay) - - opts = openai._models.FinalRequestOptions.construct(method=method, url=url) - response = super().request(cast_to, opts) - self._check_polling_response(response, failed) - start_time = time.time() - while not until(response): - if time.time() - start_time > TIMEOUT_SECS: - raise openai.Timeout("Operation polling timed out.") - - time.sleep(interval or int(response.headers.get("retry-after")) or 10) - response = super().request(cast_to, opts) - self._check_polling_response(response, failed) - - response_json = response.json() - return openai.types.ImagesResponse.construct(**response_json["result"]) - - def request(self, *args, **kwargs): - args = list(args) - options = args[1] if len(args) >= 2 else kwargs.get('options') - if options.url == "/images/generations": - options.url = "openai/images/generations:submit" - response = super().request(*args, **kwargs) - operation_id = response.model_extra['id'] - return self._poll( - "get", f"openai/operations/images/{operation_id}", cast_to=httpx.Response, - until=lambda response: response.json()["status"] in ["succeeded"], - failed=lambda response: response.json()["status"] in ["failed"], - ) - elif options.extra_json and options.extra_json.get("dataSources"): - model = options.json_data["model"] - options.url = f'openai/deployments/{model}/extensions' + options.url - else: - model = options.json_data["model"] - options.url = f'openai/deployments/{model}' + options.url - return super().request(*args, **kwargs) - - - -class AzureAsyncClient(openai.AsyncClient): - - def __init__(self, *args, deployment: str = "", credential: TokenCredential | None = None, api_version: str = '2023-09-01-preview', **kwargs): - default_query = kwargs.get('default_query', {}) - default_query.setdefault('api-version', api_version) - kwargs['default_query'] = default_query - self.credential = credential - if credential: - kwargs['api_key'] = 'Placeholder: AAD' - super().__init__(*args, **kwargs) - self.deployment = deployment - - @property - def auth_headers(self) -> dict[str, str]: - if self.credential: - return { 'Authorization': f'Bearer {self.credential.get_token()}'} - return {"api-key": self.api_key} - - def _check_polling_response(self, response, predicate): - if not predicate(response): - return - error_data = response.json()['error'] - message = error_data.get('message', 'Operation failed') - code = error_data.get('code') - raise openai.OpenAIError(message=message, code=code) - - async def _poll( - self, - method, - url, - until, - failed, - cast_to, - interval = None, - delay = None, - ): - if delay: - time.sleep(delay) - - opts = openai._models.FinalRequestOptions.construct(method=method, url=url) - response = await super().request(cast_to, opts) - self._check_polling_response(response, failed) - start_time = time.time() - while not until(response): - if time.time() - start_time > TIMEOUT_SECS: - raise openai.Timeout("Operation polling timed out.") - - time.sleep(interval or int(response.headers.get("retry-after")) or 10) - response = await super().request(cast_to, opts) - self._check_polling_response(response, failed) - - response_json = response.json() - return openai.types.ImagesResponse.construct(**response_json["result"]) - - async def request(self, *args, **kwargs): - args = list(args) - options = args[1] if len(args) >= 2 else kwargs.get('options') - if options.url == "/images/generations": - options.url = "openai/images/generations:submit" - response = await super().request(*args, **kwargs) - operation_id = 
response.model_extra['id'] - return await self._poll( - "get", f"openai/operations/images/{operation_id}", cast_to=httpx.Response, - until=lambda response: response.json()["status"] in ["succeeded"], - failed=lambda response: response.json()["status"] in ["failed"], - ) - elif options.extra_json and options.extra_json.get("dataSources"): - model = options.json_data["model"] - options.url = f'openai/deployments/{model}/extensions' + options.url - else: - model = options.json_data["model"] - options.url = f'openai/deployments/{model}' + options.url - return await super().request(*args, **kwargs) - diff --git a/src/openai/azure/__init__.py b/src/openai/azure/__init__.py new file mode 100644 index 0000000000..805d97a52f --- /dev/null +++ b/src/openai/azure/__init__.py @@ -0,0 +1,9 @@ +from ._sync_client import AzureOpenAIClient +from ._async_client import AsyncAzureOpenAIClient +from ._credential import TokenCredential + +__all__ = [ + "AzureOpenAIClient", + "TokenCredential", + "AsyncAzureOpenAIClient", +] \ No newline at end of file diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py new file mode 100644 index 0000000000..a34c36eb62 --- /dev/null +++ b/src/openai/azure/_async_client.py @@ -0,0 +1,436 @@ +from typing_extensions import Literal, override +from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Type, Union +import time + +import httpx + +from openai import AsyncClient, OpenAIError +from openai.resources.chat import AsyncChat, AsyncCompletions +from openai.types import ImagesResponse +from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk +from openai.types.chat.completion_create_params import FunctionCall, Function + +# These types are needed for correct typing of overrides +from openai._types import NotGiven, NOT_GIVEN, Headers, Query, Body, ResponseT + +# These are types used in the public API surface area that are not exported as public +from openai._models import FinalRequestOptions +from openai._streaming import AsyncStream + +# Azure specific types +from ._credential import TokenCredential +from ._azuremodels import ChatExtensionConfiguration + +TIMEOUT_SECS = 600 + +class AsyncAzureChat(AsyncChat): + + @property + def completions(self) -> "AsyncAzureCompletions": + return self._completions + + def __init__(self, client: "AsyncAzureOpenAIClient"): + self._completions = AsyncAzureCompletions(client) + +class AsyncAzureCompletions(AsyncCompletions): + + @overload + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | 
NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> ChatCompletion: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. 
+ [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. 
+ [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
+ @override + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]: + if data_sources: + if extra_body is None: + extra_body= {} + cast(Dict[str, Any], extra_body)['dataSources'] = data_sources + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic + "stream": True + } if stream else {} + response = await super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + return response + +class AsyncAzureOpenAIClient(AsyncClient): + + @property + @override + def chat(self) -> AsyncAzureChat: + return self._chat + + def __init__(self, *args: Any, credential: "TokenCredential" | None = None, api_version: str = '2023-09-01-preview', **kwargs: Any): + default_query = kwargs.get('default_query', {}) + default_query.setdefault('api-version', api_version) + kwargs['default_query'] = default_query + self.credential = credential + if credential: + kwargs['api_key'] = 'Placeholder: AAD' # TODO: There is an assumption/validation there is always an API key. 
+ super().__init__(*args, **kwargs) + self._chat = AsyncAzureChat(self) + + @property + def auth_headers(self) -> Dict[str, str]: + if self.credential: + return { 'Authorization': f'Bearer {self.credential.get_token()}'} + return {"api-key": self.api_key} + + + def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool: + if not predicate(response): + return False + error_data = response.json()['error'] + message: str = cast(str, error_data.get('message', 'Operation failed')) + code = error_data.get('code') + raise OpenAIError(f'Error: {message} ({code})') + + async def _poll( + self, + method: str, + url: str, + until: Callable[[httpx.Response], bool], + failed: Callable[[httpx.Response], bool], + interval: Optional[float] = None, + delay: Optional[float] = None, + ) -> ImagesResponse: + if delay: + time.sleep(delay) + + opts = FinalRequestOptions.construct(method=method, url=url) + response = await super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + start_time = time.time() + while not until(response): + if time.time() - start_time > TIMEOUT_SECS: + raise Exception("Operation polling timed out.") # TODO: Fix up exception type. + + time.sleep(interval or int(response.headers.get("retry-after")) or 10) + response = await super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + + response_json = response.json() + return ImagesResponse.construct(**response_json["result"]) + + async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, **kwargs: Any) -> Any: + if options.url == "/images/generations": + options.url = "openai/images/generations:submit" + response = await super().request(httpx.Response, **kwargs) + operation_id = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {} + return await self._poll( + "get", f"openai/operations/images/{operation_id}", + until=lambda response: response.json()["status"] in ["succeeded"], + failed=lambda response: response.json()["status"] in ["failed"], + ) + elif options.extra_json and options.extra_json.get("dataSources"): + assert isinstance(options.json_data, Mapping) + model = cast(str, options.json_data["model"]) + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + assert isinstance(options.json_data, Mapping) + model = cast(str, options.json_data["model"]) + options.url = f'openai/deployments/{model}' + options.url + return await super().request(cast_to=cast_to, options=options, **kwargs) + diff --git a/src/openai/azure/_azuremodels.py b/src/openai/azure/_azuremodels.py new file mode 100644 index 0000000000..bfc2f31fd4 --- /dev/null +++ b/src/openai/azure/_azuremodels.py @@ -0,0 +1,5 @@ +from typing import TypedDict + +class ChatExtensionConfiguration(TypedDict): + type: str + parameters: object diff --git a/src/openai/azure/_credential.py b/src/openai/azure/_credential.py new file mode 100644 index 0000000000..075a7116cd --- /dev/null +++ b/src/openai/azure/_credential.py @@ -0,0 +1,13 @@ +class TokenCredential: + """Placeholder/example token credential class + + A real implementation would be compatible with e.g. azure-identity and also should be easily + adaptible to other token credential implementations. 
+ """ + def __init__(self): + import azure.identity + self._credential = azure.identity.DefaultAzureCredential() + + def get_token(self): + return self._credential.get_token('https://cognitiveservices.azure.com/.default').token + diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py new file mode 100644 index 0000000000..499c74e7ce --- /dev/null +++ b/src/openai/azure/_sync_client.py @@ -0,0 +1,440 @@ +from typing_extensions import Literal, override +from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Union +import time + +import httpx + +from openai import Client, OpenAIError +from openai.types import ImagesResponse + +# These are types used in the public API surface area that are not exported as public +from openai._models import FinalRequestOptions + +# These types are needed for correct typing of overrides +from openai._types import NotGiven, NOT_GIVEN, Headers, Query, Body +from openai._streaming import Stream + +from openai.resources.chat import Chat, Completions +from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk +from openai.types.chat.completion_create_params import FunctionCall, Function + +# Azure specific types +from ._credential import TokenCredential +from ._azuremodels import ChatExtensionConfiguration + +TIMEOUT_SECS = 600 + +class AzureChat(Chat): + + @property + def completions(self) -> "AzureCompletions": + return self._completions + + def __init__(self, client: "AzureOpenAIClient"): + self._completions = AzureCompletions(client) + +class AzureCompletions(Completions): + + @overload + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> ChatCompletion: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. 
See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. 
+ [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> Stream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. 
+ + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
+ @override + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> ChatCompletion | Stream[ChatCompletionChunk]: + if data_sources: + if extra_body is None: + extra_body= {} + cast(Dict[str, Any], extra_body)['dataSources'] = data_sources + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic + "stream": True + } if stream else {} + response = super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + return response + + +class AzureOpenAIClient(Client): + + @property + @override + def chat(self) -> AzureChat: + return self._chat + + def __init__(self, *args: Any, credential: "TokenCredential" | None = None, api_version: str = '2023-09-01-preview', **kwargs: Any): + default_query = kwargs.get('default_query', {}) + default_query.setdefault('api-version', api_version) + kwargs['default_query'] = default_query + self.credential = credential + if credential: + kwargs['api_key'] = 'Placeholder: AAD' # TODO: There is an assumption/validation there is always an API key. + super().__init__(*args, **kwargs) + self._chat = AzureChat(self) + + @property + def auth_headers(self) -> Dict[str, str]: + if self.credential: + return { 'Authorization': f'Bearer {self.credential.get_token()}'} + return {"api-key": self.api_key} + + # NOTE: We override the internal method because overriding overloaded methods and keeping typing happy is a pain. Most typing tools are lacking... 
+ def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any: + if options.url == "/images/generations": + options.url = "openai/images/generations:submit" + response = super().request(httpx.Response, **kwargs) + model_extra = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {} + operation_id = cast(str, model_extra['id']) + return self._poll( + "get", f"openai/operations/images/{operation_id}", + until=lambda response: response.json()["status"] in ["succeeded"], + failed=lambda response: response.json()["status"] in ["failed"], + ) + if options.extra_json and options.extra_json.get("dataSources"): + assert isinstance(options.json_data, Mapping) + model = cast(str, options.json_data["model"]) + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + assert isinstance(options.json_data, Mapping) + model = cast(str, options.json_data["model"]) + options.url = f'openai/deployments/{model}' + options.url + return super().request(options=options, **kwargs) + + # Internal azure specific "helper" methods + def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool: + if not predicate(response): + return False + error_data = cast(Dict[str, Any], response.json()['error']) + message = error_data.get('message', 'Operation failed') + code = error_data.get('code') + raise OpenAIError(message, code) + + def _poll( + self, + method: str, + url: str, + until: Callable[[httpx.Response], bool], + failed: Callable[[httpx.Response], bool], + interval: Optional[float] = None, + delay: Optional[float] = None, + ) -> ImagesResponse: + if delay: + time.sleep(delay) + + opts = FinalRequestOptions.construct(method=method, url=url) + response = super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + start_time = time.time() + while not until(response): + if time.time() - start_time > TIMEOUT_SECS: + raise OpenAIError("Operation polling timed out.") # TODO: Find the right exception + + time.sleep(interval or int(response.headers.get("retry-after")) or 10) + response = super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + + response_json = response.json() + return ImagesResponse.construct(**response_json["result"]) + From 357c2ee0e7c167b4dc8abad59c7c5a5495fdcf71 Mon Sep 17 00:00:00 2001 From: Johan Stenberg Date: Thu, 5 Oct 2023 13:43:10 -0700 Subject: [PATCH 04/10] Change resources on client to be read-only properties --- src/openai/_client.py | 14 ++++++++++---- src/openai/resources/chat/chat.py | 16 +++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/openai/_client.py b/src/openai/_client.py index e0e5e37f4c..a1ee81e2ea 100644 --- a/src/openai/_client.py +++ b/src/openai/_client.py @@ -46,7 +46,11 @@ class OpenAI(SyncAPIClient): completions: resources.Completions - chat: resources.Chat + + @property + def chat(self) -> resources.chat.Chat: + return self._chat + # chat: resources.chat.Chat edits: resources.Edits embeddings: resources.Embeddings files: resources.Files @@ -122,7 +126,7 @@ def __init__( self._default_stream_cls = Stream self.completions = resources.Completions(self) - self.chat = resources.Chat(self) + self._chat = resources.Chat(self) self.edits = resources.Edits(self) self.embeddings = resources.Embeddings(self) self.files = resources.Files(self) @@ -244,7 +248,9 @@ def _make_status_error( class AsyncOpenAI(AsyncAPIClient): completions: resources.AsyncCompletions - chat: resources.AsyncChat + 
@property + def chat(self) -> resources.AsyncChat: + return self._chat edits: resources.AsyncEdits embeddings: resources.AsyncEmbeddings files: resources.AsyncFiles @@ -320,7 +326,7 @@ def __init__( self._default_stream_cls = AsyncStream self.completions = resources.AsyncCompletions(self) - self.chat = resources.AsyncChat(self) + self._chat = resources.AsyncChat(self) self.edits = resources.AsyncEdits(self) self.embeddings = resources.AsyncEmbeddings(self) self.files = resources.AsyncFiles(self) diff --git a/src/openai/resources/chat/chat.py b/src/openai/resources/chat/chat.py index 62bb796571..226b3d7add 100644 --- a/src/openai/resources/chat/chat.py +++ b/src/openai/resources/chat/chat.py @@ -14,16 +14,22 @@ class Chat(SyncAPIResource): - completions: Completions + + @property + def completions(self) -> Completions: + return self._completions def __init__(self, client: OpenAI) -> None: super().__init__(client) - self.completions = Completions(client) + self._completions = Completions(client) class AsyncChat(AsyncAPIResource): - completions: AsyncCompletions - + + @property + def completions(self) -> AsyncCompletions: + return self._completions + def __init__(self, client: AsyncOpenAI) -> None: super().__init__(client) - self.completions = AsyncCompletions(client) + self._completions = AsyncCompletions(client) From 016143e57d1ad164edbffceca274ba01667552c3 Mon Sep 17 00:00:00 2001 From: Johan Stenberg Date: Thu, 5 Oct 2023 13:47:33 -0700 Subject: [PATCH 05/10] Missed renames of azure client --- src/openai/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/openai/__init__.py b/src/openai/__init__.py index b386c35fee..ab7324fb1c 100644 --- a/src/openai/__init__.py +++ b/src/openai/__init__.py @@ -18,7 +18,7 @@ AsyncStream, RequestOptions, ) -from .azure import AzureClient, AzureAsyncClient +from .azure import AzureOpenAIClient, AsyncAzureOpenAIClient from ._version import __title__, __version__ from ._exceptions import ( APIError, @@ -67,8 +67,8 @@ "OpenAI", "AsyncOpenAI", "file_from_path", - "AzureClient", - "AzureAsyncClient" + "AzureOpenAIClient", + "AsyncAzureOpenAIClient" ] from .version import VERSION as VERSION From 38929f499fd07f425fcac95590df5184e5d102a4 Mon Sep 17 00:00:00 2001 From: Johan Stenberg Date: Thu, 5 Oct 2023 15:16:32 -0700 Subject: [PATCH 06/10] Fix url rebuilding for retries(?) 
--- src/openai/__init__.py | 5 +---- src/openai/azure/_async_client.py | 18 +++++++++--------- src/openai/azure/_sync_client.py | 21 ++++++++++----------- 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/openai/__init__.py b/src/openai/__init__.py index ab7324fb1c..7d2630f123 100644 --- a/src/openai/__init__.py +++ b/src/openai/__init__.py @@ -18,7 +18,6 @@ AsyncStream, RequestOptions, ) -from .azure import AzureOpenAIClient, AsyncAzureOpenAIClient from ._version import __title__, __version__ from ._exceptions import ( APIError, @@ -66,9 +65,7 @@ "AsyncStream", "OpenAI", "AsyncOpenAI", - "file_from_path", - "AzureOpenAIClient", - "AsyncAzureOpenAIClient" + "file_from_path" ] from .version import VERSION as VERSION diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py index a34c36eb62..62445403a2 100644 --- a/src/openai/azure/_async_client.py +++ b/src/openai/azure/_async_client.py @@ -362,7 +362,7 @@ class AsyncAzureOpenAIClient(AsyncClient): def chat(self) -> AsyncAzureChat: return self._chat - def __init__(self, *args: Any, credential: "TokenCredential" | None = None, api_version: str = '2023-09-01-preview', **kwargs: Any): + def __init__(self, *args: Any, credential: Optional["TokenCredential"] = None, api_version: str = '2023-09-01-preview', **kwargs: Any): default_query = kwargs.get('default_query', {}) default_query.setdefault('api-version', api_version) kwargs['default_query'] = default_query @@ -414,6 +414,7 @@ async def _poll( response_json = response.json() return ImagesResponse.construct(**response_json["result"]) + # NOTE: We override the internal method because `@overrid`ing `@overload`ed methods and keeping typing happy is a pain. Most typing tools are lacking... async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, **kwargs: Any) -> Any: if options.url == "/images/generations": options.url = "openai/images/generations:submit" @@ -424,13 +425,12 @@ async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, until=lambda response: response.json()["status"] in ["succeeded"], failed=lambda response: response.json()["status"] in ["failed"], ) - elif options.extra_json and options.extra_json.get("dataSources"): - assert isinstance(options.json_data, Mapping) - model = cast(str, options.json_data["model"]) - options.url = f'openai/deployments/{model}/extensions' + options.url - else: - assert isinstance(options.json_data, Mapping) - model = cast(str, options.json_data["model"]) - options.url = f'openai/deployments/{model}' + options.url + if isinstance(options.json_data, Mapping): + model = cast(str, options.json_data["model"]) + if not options.url.startswith(f'openai/deployments/{model}'): + if options.extra_json and options.extra_json.get("dataSources"): + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + options.url = f'openai/deployments/{model}' + options.url return await super().request(cast_to=cast_to, options=options, **kwargs) diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py index 499c74e7ce..645f1e630d 100644 --- a/src/openai/azure/_sync_client.py +++ b/src/openai/azure/_sync_client.py @@ -364,14 +364,14 @@ class AzureOpenAIClient(Client): def chat(self) -> AzureChat: return self._chat - def __init__(self, *args: Any, credential: "TokenCredential" | None = None, api_version: str = '2023-09-01-preview', **kwargs: Any): + def __init__(self, *args: Any, base_url: str, credential: Optional["TokenCredential"] = None, 
api_version: str = '2023-09-01-preview', **kwargs: Any): default_query = kwargs.get('default_query', {}) default_query.setdefault('api-version', api_version) kwargs['default_query'] = default_query self.credential = credential if credential: kwargs['api_key'] = 'Placeholder: AAD' # TODO: There is an assumption/validation there is always an API key. - super().__init__(*args, **kwargs) + super().__init__(*args, base_url=base_url, **kwargs) self._chat = AzureChat(self) @property @@ -380,7 +380,7 @@ def auth_headers(self) -> Dict[str, str]: return { 'Authorization': f'Bearer {self.credential.get_token()}'} return {"api-key": self.api_key} - # NOTE: We override the internal method because overriding overloaded methods and keeping typing happy is a pain. Most typing tools are lacking... + # NOTE: We override the internal method because `@overrid`ing `@overload`ed methods and keeping typing happy is a pain. Most typing tools are lacking... def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any: if options.url == "/images/generations": options.url = "openai/images/generations:submit" @@ -392,14 +392,13 @@ def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any: until=lambda response: response.json()["status"] in ["succeeded"], failed=lambda response: response.json()["status"] in ["failed"], ) - if options.extra_json and options.extra_json.get("dataSources"): - assert isinstance(options.json_data, Mapping) - model = cast(str, options.json_data["model"]) - options.url = f'openai/deployments/{model}/extensions' + options.url - else: - assert isinstance(options.json_data, Mapping) - model = cast(str, options.json_data["model"]) - options.url = f'openai/deployments/{model}' + options.url + if isinstance(options.json_data, Mapping): + model = cast(str, options.json_data["model"]) + if not options.url.startswith(f'openai/deployments/{model}'): + if options.extra_json and options.extra_json.get("dataSources"): + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + options.url = f'openai/deployments/{model}' + options.url return super().request(options=options, **kwargs) # Internal azure specific "helper" methods From 7ad129f082f703714356c253c909a96d2c233c89 Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Thu, 5 Oct 2023 16:26:19 -0700 Subject: [PATCH 07/10] few fixes after the refactor (#9) --- src/openai/azure/_async_client.py | 12 ++++++------ src/openai/azure/_sync_client.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py index 62445403a2..77708fa74f 100644 --- a/src/openai/azure/_async_client.py +++ b/src/openai/azure/_async_client.py @@ -418,19 +418,19 @@ async def _poll( async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, **kwargs: Any) -> Any: if options.url == "/images/generations": options.url = "openai/images/generations:submit" - response = await super().request(httpx.Response, **kwargs) - operation_id = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {} + response = await super()._request(cast_to=cast_to, options=options, **kwargs) + model_extra = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {} + operation_id = cast(str, model_extra['id']) return await self._poll( "get", f"openai/operations/images/{operation_id}", until=lambda response: response.json()["status"] in ["succeeded"], failed=lambda response: response.json()["status"] in ["failed"], ) if 
isinstance(options.json_data, Mapping): - model = cast(str, options.json_data["model"]) + model = cast(str, options.json_data["model"]) if not options.url.startswith(f'openai/deployments/{model}'): if options.extra_json and options.extra_json.get("dataSources"): options.url = f'openai/deployments/{model}/extensions' + options.url - else: + else: options.url = f'openai/deployments/{model}' + options.url - return await super().request(cast_to=cast_to, options=options, **kwargs) - + return await super()._request(cast_to=cast_to, options=options, **kwargs) \ No newline at end of file diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py index 645f1e630d..ba7faccf20 100644 --- a/src/openai/azure/_sync_client.py +++ b/src/openai/azure/_sync_client.py @@ -384,7 +384,7 @@ def auth_headers(self) -> Dict[str, str]: def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any: if options.url == "/images/generations": options.url = "openai/images/generations:submit" - response = super().request(httpx.Response, **kwargs) + response = super()._request(options=options, **kwargs) model_extra = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {} operation_id = cast(str, model_extra['id']) return self._poll( @@ -393,13 +393,13 @@ def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any: failed=lambda response: response.json()["status"] in ["failed"], ) if isinstance(options.json_data, Mapping): - model = cast(str, options.json_data["model"]) + model = cast(str, options.json_data["model"]) if not options.url.startswith(f'openai/deployments/{model}'): if options.extra_json and options.extra_json.get("dataSources"): options.url = f'openai/deployments/{model}/extensions' + options.url - else: + else: options.url = f'openai/deployments/{model}' + options.url - return super().request(options=options, **kwargs) + return super()._request(options=options, **kwargs) # Internal azure specific "helper" methods def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool: From 664178c39c716efd771d2663b77cc0360f6de20d Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Fri, 6 Oct 2023 16:46:08 -0700 Subject: [PATCH 08/10] custom auth (#12) Change to use customAuth rather than "hiding" token credentials behind auth_headers. Added async support for token auth. 
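The idea is to hand httpx an Auth implementation that injects the bearer token on every request (including retries and streamed requests) instead of baking it into auth_headers. A simplified, self-contained sketch of that pattern follows; the stand-in credential and class name are illustrative, and this version also stores the fetched token so the five-minute expiry check has something to compare against.

    import time
    import httpx

    class StaticTokenCredential:
        # Illustrative stand-in for azure.identity.DefaultAzureCredential:
        # real credentials return an object with .token and .expires_on.
        class _Token:
            token = "example-access-token"
            expires_on = time.time() + 3600

        def get_token(self, scope: str):
            return self._Token()

    class BearerTokenAuth(httpx.Auth):
        def __init__(self, credential) -> None:
            self._credential = credential
            self._cached = None

        def _token(self) -> str:
            # Refresh when there is no cached token or it expires within 5 minutes.
            if self._cached is None or self._cached.expires_on - time.time() < 300:
                self._cached = self._credential.get_token(
                    "https://cognitiveservices.azure.com/.default"
                )
            return self._cached.token

        def sync_auth_flow(self, request: httpx.Request):
            request.headers["Authorization"] = f"Bearer {self._token()}"
            yield request

With this in place the client only needs to return such an object from custom_auth and httpx applies it on every outgoing request.
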
--- src/openai/azure/_async_client.py | 8 +++++--- src/openai/azure/_credential.py | 33 +++++++++++++++++++++++++++++++ src/openai/azure/_sync_client.py | 9 ++++++--- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py index 77708fa74f..8ec0cc6cae 100644 --- a/src/openai/azure/_async_client.py +++ b/src/openai/azure/_async_client.py @@ -18,7 +18,7 @@ from openai._streaming import AsyncStream # Azure specific types -from ._credential import TokenCredential +from ._credential import TokenCredential, TokenAuth from ._azuremodels import ChatExtensionConfiguration TIMEOUT_SECS = 600 @@ -374,10 +374,12 @@ def __init__(self, *args: Any, credential: Optional["TokenCredential"] = None, a @property def auth_headers(self) -> Dict[str, str]: - if self.credential: - return { 'Authorization': f'Bearer {self.credential.get_token()}'} return {"api-key": self.api_key} + @property + def custom_auth(self) -> httpx.Auth | None: + if self.credential: + return TokenAuth(self.credential) def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool: if not predicate(response): diff --git a/src/openai/azure/_credential.py b/src/openai/azure/_credential.py index 075a7116cd..9d10e14909 100644 --- a/src/openai/azure/_credential.py +++ b/src/openai/azure/_credential.py @@ -1,3 +1,9 @@ +from typing import AsyncGenerator, Generator, Any +import time +import asyncio +import httpx + + class TokenCredential: """Placeholder/example token credential class @@ -11,3 +17,30 @@ def __init__(self): def get_token(self): return self._credential.get_token('https://cognitiveservices.azure.com/.default').token + +class TokenAuth(httpx.Auth): + def __init__(self, credential: "TokenCredential") -> None: + self._credential = credential + self._async_lock = asyncio.Lock() + self.cached_token = None + + def sync_get_token(self) -> str: + if not self.cached_token or self.cached_token.expires_on - time.time() < 300: + return self._credential.get_token("https://cognitiveservices.azure.com/.default").token + return self.cached_token.token + + def sync_auth_flow(self, request: httpx.Request) -> Generator[httpx.Request, Any, Any]: + token = self.sync_get_token() + request.headers["Authorization"] = f"Bearer {token}" + yield request + + async def async_get_token(self) -> str: + async with self._async_lock: + if not self.cached_token or self.cached_token.expires_on - time.time() < 300: + return (await self._credential.get_token("https://cognitiveservices.azure.com/.default")).token + return self.cached_token.token + + async def async_auth_flow(self, request: httpx.Request) -> AsyncGenerator[httpx.Request, Any]: + token = await self.async_get_token() + request.headers["Authorization"] = f"Bearer {token}" + yield request diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py index ba7faccf20..3f904b482f 100644 --- a/src/openai/azure/_sync_client.py +++ b/src/openai/azure/_sync_client.py @@ -19,7 +19,7 @@ from openai.types.chat.completion_create_params import FunctionCall, Function # Azure specific types -from ._credential import TokenCredential +from ._credential import TokenCredential, TokenAuth from ._azuremodels import ChatExtensionConfiguration TIMEOUT_SECS = 600 @@ -376,10 +376,13 @@ def __init__(self, *args: Any, base_url: str, credential: Optional["TokenCredent @property def auth_headers(self) -> Dict[str, str]: - if self.credential: - return { 
'Authorization': f'Bearer {self.credential.get_token()}'} return {"api-key": self.api_key} + @property + def custom_auth(self) -> httpx.Auth | None: + if self.credential: + return TokenAuth(self.credential) + # NOTE: We override the internal method because `@overrid`ing `@overload`ed methods and keeping typing happy is a pain. Most typing tools are lacking... def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any: if options.url == "/images/generations": From b92e77ecb3db21906fe98ff04cda338685358415 Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Fri, 6 Oct 2023 16:46:40 -0700 Subject: [PATCH 09/10] prepend openai to other models, files, fine-tuning, fine-tunes (#13) --- src/openai/azure/_async_client.py | 2 ++ src/openai/azure/_sync_client.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py index 8ec0cc6cae..64e9f232dc 100644 --- a/src/openai/azure/_async_client.py +++ b/src/openai/azure/_async_client.py @@ -435,4 +435,6 @@ async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, options.url = f'openai/deployments/{model}/extensions' + options.url else: options.url = f'openai/deployments/{model}' + options.url + if options.url.startswith(("/models", "/fine_tuning", "/files", "/fine-tunes")): + options.url = f"openai{options.url}" return await super()._request(cast_to=cast_to, options=options, **kwargs) \ No newline at end of file diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py index 3f904b482f..44a48c2ef4 100644 --- a/src/openai/azure/_sync_client.py +++ b/src/openai/azure/_sync_client.py @@ -402,6 +402,8 @@ def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any: options.url = f'openai/deployments/{model}/extensions' + options.url else: options.url = f'openai/deployments/{model}' + options.url + if options.url.startswith(("/models", "/fine_tuning", "/files", "/fine-tunes")): + options.url = f"openai{options.url}" return super()._request(options=options, **kwargs) # Internal azure specific "helper" methods From be65dd4d9349556186e8da36a555b963076e57db Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Fri, 6 Oct 2023 17:48:09 -0700 Subject: [PATCH 10/10] capture azure-only properties (#10) Correctly type/"subclass" operations --- src/openai/azure/_async_client.py | 473 ++++++++++++++++++++++++++++-- src/openai/azure/_azuremodels.py | 81 ++++- src/openai/azure/_sync_client.py | 471 +++++++++++++++++++++++++++-- 3 files changed, 965 insertions(+), 60 deletions(-) diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py index 64e9f232dc..2f9ca5bc80 100644 --- a/src/openai/azure/_async_client.py +++ b/src/openai/azure/_async_client.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing_extensions import Literal, override from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Type, Union import time @@ -6,9 +8,11 @@ from openai import AsyncClient, OpenAIError from openai.resources.chat import AsyncChat, AsyncCompletions +from openai.resources.completions import AsyncCompletions as AsyncCompletionsOperations from openai.types import ImagesResponse from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk from openai.types.chat.completion_create_params import FunctionCall, Function +from openai.types.completion import Completion # These types are needed for correct typing of overrides from openai._types import NotGiven, 
NOT_GIVEN, Headers, Query, Body, ResponseT @@ -19,20 +23,26 @@ # Azure specific types from ._credential import TokenCredential, TokenAuth -from ._azuremodels import ChatExtensionConfiguration +from ._azuremodels import ( + ChatExtensionConfiguration, + AzureChatCompletion, + AzureChatCompletionChunk, + AzureCompletion, +) TIMEOUT_SECS = 600 class AsyncAzureChat(AsyncChat): @property - def completions(self) -> "AsyncAzureCompletions": + def completions(self) -> "AsyncAzureChatCompletions": return self._completions def __init__(self, client: "AsyncAzureOpenAIClient"): - self._completions = AsyncAzureCompletions(client) + self._completions = AsyncAzureChatCompletions(client) + -class AsyncAzureCompletions(AsyncCompletions): +class AsyncAzureChatCompletions(AsyncCompletions): @overload async def create( @@ -74,7 +84,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | None | NotGiven = NOT_GIVEN, - ) -> ChatCompletion: + ) -> AzureChatCompletion: """ Creates a model response for the given chat conversation. @@ -200,7 +210,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | None | NotGiven = NOT_GIVEN, - ) -> AsyncStream[ChatCompletionChunk]: + ) -> AsyncStream[AzureChatCompletionChunk]: """ Creates a model response for the given chat conversation. @@ -325,7 +335,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | None | NotGiven = NOT_GIVEN, - ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]: + ) -> AzureChatCompletion | AsyncStream[AzureChatCompletionChunk]: if data_sources: if extra_body is None: extra_body= {} @@ -333,27 +343,423 @@ async def create( stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic "stream": True } if stream else {} - response = await super().create( - messages=messages, - model=model, - frequency_penalty = frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - max_tokens=max_tokens, - n=n, - presence_penalty=presence_penalty, - stop=stop, - **stream_dict, - temperature=temperature, - top_p=top_p, - user=user, - extra_headers=extra_headers, - extra_query=extra_query, - extra_body=extra_body, - timeout=timeout + response = cast( + Union[ChatCompletion, ChatCompletionChunk], + await super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) ) - return response + if isinstance(response, AsyncStream): + response._cast_to = AzureChatCompletionChunk # or rebuild the stream? 
+ else: + response_json = response.model_dump(mode="json") + response = AzureChatCompletion.construct(**response_json) + return response # type: ignore + + +class AsyncAzureCompletions(AsyncCompletionsOperations): + @overload + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. + + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. 
+ + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). 
+ + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + stream: Literal[True], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[AzureCompletion]: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. + + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). 
+ + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @override + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion | AsyncStream[AzureCompletion]: + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. 
Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[Completion, AsyncStream[Completion]], + await super().create( + model=model, + prompt=prompt, + best_of=best_of, + echo=echo, + frequency_penalty = frequency_penalty, + logit_bias=logit_bias, + logprobs=logprobs, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + suffix=suffix, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + + if isinstance(response, AsyncStream): + response._cast_to = AzureCompletion + else: + response_json = response.model_dump(mode="json") + response = AzureCompletion.construct(**response_json) + return response # type: ignore + class AsyncAzureOpenAIClient(AsyncClient): @@ -361,7 +767,16 @@ class AsyncAzureOpenAIClient(AsyncClient): @override def chat(self) -> AsyncAzureChat: return self._chat - + + @property + @override + def completions(self) -> AsyncAzureCompletions: + return self._completions + + @completions.setter + def completions(self, value: AsyncAzureCompletions) -> None: + self._completions = value + def __init__(self, *args: Any, credential: Optional["TokenCredential"] = None, api_version: str = '2023-09-01-preview', **kwargs: Any): default_query = kwargs.get('default_query', {}) default_query.setdefault('api-version', api_version) @@ -437,4 +852,4 @@ async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, options.url = f'openai/deployments/{model}' + options.url if options.url.startswith(("/models", "/fine_tuning", "/files", "/fine-tunes")): options.url = f"openai{options.url}" - return await super()._request(cast_to=cast_to, options=options, **kwargs) \ No newline at end of file + return await super()._request(cast_to=cast_to, options=options, **kwargs) diff --git a/src/openai/azure/_azuremodels.py b/src/openai/azure/_azuremodels.py index bfc2f31fd4..841bd11d78 100644 --- a/src/openai/azure/_azuremodels.py +++ b/src/openai/azure/_azuremodels.py @@ -1,5 +1,82 @@ -from typing import TypedDict +from typing import List, Optional +from typing_extensions import TypedDict, Literal +from openai._models import BaseModel as BaseModel + +from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage +from openai.types.chat.chat_completion import Choice as ChatChoice +from openai.types.chat.chat_completion_chunk import ChoiceDelta, Choice as ChatChoiceDelta +from openai.types.completion import Completion +from openai.types.completion_choice import CompletionChoice + + +AzureChatCompletionRole = Literal["system", "user", "assistant", "function", "tool"] + class ChatExtensionConfiguration(TypedDict): - type: str + type: Literal["AzureCognitiveSearch"] parameters: object + + +class ContentFilterResult(BaseModel): + severity: Literal["safe", "low", "medium", "high"] + filtered: bool + + +class Error(BaseModel): + code: str + message: str + + +class ContentFilterResults(BaseModel): + hate: Optional[ContentFilterResult] + self_harm: Optional[ContentFilterResult] + violence: Optional[ContentFilterResult] + sexual: Optional[ContentFilterResult] + error: Optional[Error] + + +class PromptFilterResult(BaseModel): + prompt_index: int + content_filter_results: Optional[ContentFilterResults] + + +class AzureChatExtensionsMessageContext(BaseModel): + messages: Optional[List[ChatCompletionMessage]] + + +class 
AzureChatCompletionMessage(ChatCompletionMessage): + context: Optional[AzureChatExtensionsMessageContext] + role: AzureChatCompletionRole # type: ignore + + +class AzureChatCompletionChoice(ChatChoice): + content_filter_results: Optional[ContentFilterResults] + message: AzureChatCompletionMessage # type: ignore + + +class AzureChatCompletion(ChatCompletion): + choices: List[AzureChatCompletionChoice] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] + + +class AzureChoiceDelta(ChoiceDelta): + context: Optional[AzureChatExtensionsMessageContext] + + +class AzureChatCompletionChoiceDelta(ChatChoiceDelta): + delta: AzureChoiceDelta # type: ignore + content_filter_results: Optional[ContentFilterResults] + + +class AzureChatCompletionChunk(ChatCompletionChunk): + choices: List[AzureChatCompletionChoiceDelta] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] + + +class AzureCompletionChoice(CompletionChoice): + content_filter_results: Optional[ContentFilterResults] + + +class AzureCompletion(Completion): + choices: List[AzureCompletionChoice] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py index 44a48c2ef4..677f9e8ac3 100644 --- a/src/openai/azure/_sync_client.py +++ b/src/openai/azure/_sync_client.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing_extensions import Literal, override from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Union import time @@ -15,25 +17,33 @@ from openai._streaming import Stream from openai.resources.chat import Chat, Completions +from openai.resources.completions import Completions as CompletionsOperations from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk from openai.types.chat.completion_create_params import FunctionCall, Function +from openai.types.completion import Completion # Azure specific types from ._credential import TokenCredential, TokenAuth -from ._azuremodels import ChatExtensionConfiguration +from ._azuremodels import ( + ChatExtensionConfiguration, + AzureChatCompletion, + AzureChatCompletionChunk, + AzureCompletion, +) TIMEOUT_SECS = 600 class AzureChat(Chat): @property - def completions(self) -> "AzureCompletions": + def completions(self) -> "AzureChatCompletions": return self._completions def __init__(self, client: "AzureOpenAIClient"): - self._completions = AzureCompletions(client) + self._completions = AzureChatCompletions(client) + -class AzureCompletions(Completions): +class AzureChatCompletions(Completions): @overload def create( @@ -75,7 +85,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | None | NotGiven = NOT_GIVEN, - ) -> ChatCompletion: + ) -> AzureChatCompletion: """ Creates a model response for the given chat conversation. @@ -201,7 +211,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | None | NotGiven = NOT_GIVEN, - ) -> Stream[ChatCompletionChunk]: + ) -> Stream[AzureChatCompletionChunk]: """ Creates a model response for the given chat conversation. 
@@ -326,7 +336,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | None | NotGiven = NOT_GIVEN, - ) -> ChatCompletion | Stream[ChatCompletionChunk]: + ) -> AzureChatCompletion | Stream[AzureChatCompletionChunk]: if data_sources: if extra_body is None: extra_body= {} @@ -334,27 +344,422 @@ def create( stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic "stream": True } if stream else {} - response = super().create( - messages=messages, - model=model, - frequency_penalty = frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - max_tokens=max_tokens, - n=n, - presence_penalty=presence_penalty, - stop=stop, - **stream_dict, - temperature=temperature, - top_p=top_p, - user=user, - extra_headers=extra_headers, - extra_query=extra_query, - extra_body=extra_body, - timeout=timeout + response = cast( + Union[ChatCompletion, Stream[ChatCompletionChunk]], + super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) ) - return response + if isinstance(response, Stream): + response._cast_to = AzureChatCompletionChunk # or rebuild the stream? + else: + response_json = response.model_dump(mode="json") + response = AzureChatCompletion.construct(**response_json) + return response # type: ignore + + +class AzureCompletions(CompletionsOperations): + @overload + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. 
You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. + + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. 
+ + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + stream: Literal[True], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> Stream[AzureCompletion]: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. 
You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. + + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. 
+ + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @override + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion | Stream[AzureCompletion]: + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. 
Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[Completion, Stream[Completion]], + super().create( + model=model, + prompt=prompt, + best_of=best_of, + echo=echo, + frequency_penalty = frequency_penalty, + logit_bias=logit_bias, + logprobs=logprobs, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + suffix=suffix, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + + if isinstance(response, Stream): + response._cast_to = AzureCompletion + else: + response_json = response.model_dump(mode="json") + response = AzureCompletion.construct(**response_json) + return response # type: ignore class AzureOpenAIClient(Client): @@ -363,7 +768,16 @@ class AzureOpenAIClient(Client): @override def chat(self) -> AzureChat: return self._chat - + + @property + @override + def completions(self) -> AzureCompletions: + return self._completions + + @completions.setter + def completions(self, value: AzureCompletions) -> None: + self._completions = value + def __init__(self, *args: Any, base_url: str, credential: Optional["TokenCredential"] = None, api_version: str = '2023-09-01-preview', **kwargs: Any): default_query = kwargs.get('default_query', {}) default_query.setdefault('api-version', api_version) @@ -441,4 +855,3 @@ def _poll( response_json = response.json() return ImagesResponse.construct(**response_json["result"]) -
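
Taken together, the series leaves the Azure clients looking roughly like this from the caller's side. This is a hedged end-to-end sketch using the module paths from these patches; the endpoint and deployment name are placeholders, and running it requires a real Azure OpenAI resource plus azure-identity for TokenCredential.

    from openai.azure._credential import TokenCredential
    from openai.azure._sync_client import AzureOpenAIClient

    client = AzureOpenAIClient(
        base_url="https://my-resource.openai.azure.com/",  # placeholder resource endpoint
        credential=TokenCredential(),                    # or api_key="<azure-openai-key>"
        api_version="2023-09-01-preview",
    )

    # "model" is the Azure *deployment* name; _request prepends
    # openai/deployments/<model> to the request path.
    completion = client.chat.completions.create(
        model="gpt-35-turbo",
        messages=[{"role": "user", "content": "Hello"}],
    )

    # Azure-only fields surfaced by the typed models in _azuremodels.py.
    if completion.prompt_filter_results:
        print(completion.prompt_filter_results[0].content_filter_results)
    for choice in completion.choices:
        print(choice.content_filter_results)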