diff --git a/src/openai/__init__.py b/src/openai/__init__.py index d011d416ac..7d2630f123 100644 --- a/src/openai/__init__.py +++ b/src/openai/__init__.py @@ -65,7 +65,7 @@ "AsyncStream", "OpenAI", "AsyncOpenAI", - "file_from_path", + "file_from_path" ] from .version import VERSION as VERSION diff --git a/src/openai/_client.py b/src/openai/_client.py index e0e5e37f4c..a1ee81e2ea 100644 --- a/src/openai/_client.py +++ b/src/openai/_client.py @@ -46,7 +46,11 @@ class OpenAI(SyncAPIClient): completions: resources.Completions - chat: resources.Chat + + @property + def chat(self) -> resources.chat.Chat: + return self._chat + # chat: resources.chat.Chat edits: resources.Edits embeddings: resources.Embeddings files: resources.Files @@ -122,7 +126,7 @@ def __init__( self._default_stream_cls = Stream self.completions = resources.Completions(self) - self.chat = resources.Chat(self) + self._chat = resources.Chat(self) self.edits = resources.Edits(self) self.embeddings = resources.Embeddings(self) self.files = resources.Files(self) @@ -244,7 +248,9 @@ def _make_status_error( class AsyncOpenAI(AsyncAPIClient): completions: resources.AsyncCompletions - chat: resources.AsyncChat + @property + def chat(self) -> resources.AsyncChat: + return self._chat edits: resources.AsyncEdits embeddings: resources.AsyncEmbeddings files: resources.AsyncFiles @@ -320,7 +326,7 @@ def __init__( self._default_stream_cls = AsyncStream self.completions = resources.AsyncCompletions(self) - self.chat = resources.AsyncChat(self) + self._chat = resources.AsyncChat(self) self.edits = resources.AsyncEdits(self) self.embeddings = resources.AsyncEmbeddings(self) self.files = resources.AsyncFiles(self) diff --git a/src/openai/azure/__init__.py b/src/openai/azure/__init__.py new file mode 100644 index 0000000000..805d97a52f --- /dev/null +++ b/src/openai/azure/__init__.py @@ -0,0 +1,9 @@ +from ._sync_client import AzureOpenAIClient +from ._async_client import AsyncAzureOpenAIClient +from ._credential import TokenCredential + +__all__ = [ + "AzureOpenAIClient", + "TokenCredential", + "AsyncAzureOpenAIClient", +] \ No newline at end of file diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py new file mode 100644 index 0000000000..2f9ca5bc80 --- /dev/null +++ b/src/openai/azure/_async_client.py @@ -0,0 +1,855 @@ +from __future__ import annotations + +from typing_extensions import Literal, override +from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Type, Union +import time + +import httpx + +from openai import AsyncClient, OpenAIError +from openai.resources.chat import AsyncChat, AsyncCompletions +from openai.resources.completions import AsyncCompletions as AsyncCompletionsOperations +from openai.types import ImagesResponse +from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk +from openai.types.chat.completion_create_params import FunctionCall, Function +from openai.types.completion import Completion + +# These types are needed for correct typing of overrides +from openai._types import NotGiven, NOT_GIVEN, Headers, Query, Body, ResponseT + +# These are types used in the public API surface area that are not exported as public +from openai._models import FinalRequestOptions +from openai._streaming import AsyncStream + +# Azure specific types +from ._credential import TokenCredential, TokenAuth +from ._azuremodels import ( + ChatExtensionConfiguration, + AzureChatCompletion, + AzureChatCompletionChunk, + AzureCompletion, +) + +TIMEOUT_SECS 
= 600 + +class AsyncAzureChat(AsyncChat): + + @property + def completions(self) -> "AsyncAzureChatCompletions": + return self._completions + + def __init__(self, client: "AsyncAzureOpenAIClient"): + self._completions = AsyncAzureChatCompletions(client) + + +class AsyncAzureChatCompletions(AsyncCompletions): + + @overload + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. 
The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
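+
+    # Usage sketch (illustration only, not part of the implementation): a non-streaming
+    # call against an Azure deployment using the Azure-specific `data_sources` extension.
+    # The endpoint, key, deployment name and Cognitive Search parameters below are
+    # placeholders:
+    #
+    #     from openai.azure import AsyncAzureOpenAIClient
+    #
+    #     client = AsyncAzureOpenAIClient(
+    #         base_url="https://my-resource.openai.azure.com/",
+    #         api_key="<azure-openai-key>",
+    #         api_version="2023-09-01-preview",
+    #     )
+    #     completion = await client.chat.completions.create(
+    #         model="gpt-35-turbo",  # the Azure deployment name
+    #         messages=[{"role": "user", "content": "What is Azure OpenAI?"}],
+    #         data_sources=[
+    #             {
+    #                 "type": "AzureCognitiveSearch",
+    #                 "parameters": {"endpoint": "...", "key": "...", "indexName": "..."},
+    #             }
+    #         ],
+    #     )
+    #     print(completion.choices[0].message.content)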
+ + @overload + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[AzureChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. 
Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + @override + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion | AsyncStream[AzureChatCompletionChunk]: + if data_sources: + if extra_body is None: + extra_body= {} + cast(Dict[str, Any], extra_body)['dataSources'] = data_sources + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[ChatCompletion, ChatCompletionChunk], + await super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + if isinstance(response, AsyncStream): + response._cast_to = AzureChatCompletionChunk # or rebuild the stream? + else: + response_json = response.model_dump(mode="json") + response = AzureChatCompletion.construct(**response_json) + return response # type: ignore + + +class AsyncAzureCompletions(AsyncCompletionsOperations): + @overload + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream: Whether to stream back partial progress. 
If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + stream: Literal[True], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[AzureCompletion]: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. 
+ + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @override + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion | AsyncStream[AzureCompletion]: + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. 
Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[Completion, AsyncStream[Completion]], + await super().create( + model=model, + prompt=prompt, + best_of=best_of, + echo=echo, + frequency_penalty = frequency_penalty, + logit_bias=logit_bias, + logprobs=logprobs, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + suffix=suffix, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + + if isinstance(response, AsyncStream): + response._cast_to = AzureCompletion + else: + response_json = response.model_dump(mode="json") + response = AzureCompletion.construct(**response_json) + return response # type: ignore + + +class AsyncAzureOpenAIClient(AsyncClient): + + @property + @override + def chat(self) -> AsyncAzureChat: + return self._chat + + @property + @override + def completions(self) -> AsyncAzureCompletions: + return self._completions + + @completions.setter + def completions(self, value: AsyncAzureCompletions) -> None: + self._completions = value + + def __init__(self, *args: Any, credential: Optional["TokenCredential"] = None, api_version: str = '2023-09-01-preview', **kwargs: Any): + default_query = kwargs.get('default_query', {}) + default_query.setdefault('api-version', api_version) + kwargs['default_query'] = default_query + self.credential = credential + if credential: + kwargs['api_key'] = 'Placeholder: AAD' # TODO: There is an assumption/validation there is always an API key. + super().__init__(*args, **kwargs) + self._chat = AsyncAzureChat(self) + + @property + def auth_headers(self) -> Dict[str, str]: + return {"api-key": self.api_key} + + @property + def custom_auth(self) -> httpx.Auth | None: + if self.credential: + return TokenAuth(self.credential) + + def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool: + if not predicate(response): + return False + error_data = response.json()['error'] + message: str = cast(str, error_data.get('message', 'Operation failed')) + code = error_data.get('code') + raise OpenAIError(f'Error: {message} ({code})') + + async def _poll( + self, + method: str, + url: str, + until: Callable[[httpx.Response], bool], + failed: Callable[[httpx.Response], bool], + interval: Optional[float] = None, + delay: Optional[float] = None, + ) -> ImagesResponse: + if delay: + time.sleep(delay) + + opts = FinalRequestOptions.construct(method=method, url=url) + response = await super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + start_time = time.time() + while not until(response): + if time.time() - start_time > TIMEOUT_SECS: + raise Exception("Operation polling timed out.") # TODO: Fix up exception type. + + time.sleep(interval or int(response.headers.get("retry-after")) or 10) + response = await super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + + response_json = response.json() + return ImagesResponse.construct(**response_json["result"]) + + # NOTE: We override the internal method because `@overrid`ing `@overload`ed methods and keeping typing happy is a pain. Most typing tools are lacking... 
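+    # Routing sketch for the override below (the api-version query parameter is supplied
+    # via `default_query`; "gpt-35-turbo" is a placeholder deployment name):
+    #   "/chat/completions"                       -> "openai/deployments/gpt-35-turbo/chat/completions"
+    #   same request with "dataSources" in body   -> "openai/deployments/gpt-35-turbo/extensions/chat/completions"
+    #   "/images/generations"                     -> "openai/images/generations:submit", then poll
+    #                                                "openai/operations/images/{operation_id}"
+    #   "/models", "/files", "/fine_tuning", ...  -> prefixed with "openai"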
+ async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, **kwargs: Any) -> Any: + if options.url == "/images/generations": + options.url = "openai/images/generations:submit" + response = await super()._request(cast_to=cast_to, options=options, **kwargs) + model_extra = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {} + operation_id = cast(str, model_extra['id']) + return await self._poll( + "get", f"openai/operations/images/{operation_id}", + until=lambda response: response.json()["status"] in ["succeeded"], + failed=lambda response: response.json()["status"] in ["failed"], + ) + if isinstance(options.json_data, Mapping): + model = cast(str, options.json_data["model"]) + if not options.url.startswith(f'openai/deployments/{model}'): + if options.extra_json and options.extra_json.get("dataSources"): + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + options.url = f'openai/deployments/{model}' + options.url + if options.url.startswith(("/models", "/fine_tuning", "/files", "/fine-tunes")): + options.url = f"openai{options.url}" + return await super()._request(cast_to=cast_to, options=options, **kwargs) diff --git a/src/openai/azure/_azuremodels.py b/src/openai/azure/_azuremodels.py new file mode 100644 index 0000000000..841bd11d78 --- /dev/null +++ b/src/openai/azure/_azuremodels.py @@ -0,0 +1,82 @@ +from typing import List, Optional +from typing_extensions import TypedDict, Literal +from openai._models import BaseModel as BaseModel + +from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage +from openai.types.chat.chat_completion import Choice as ChatChoice +from openai.types.chat.chat_completion_chunk import ChoiceDelta, Choice as ChatChoiceDelta +from openai.types.completion import Completion +from openai.types.completion_choice import CompletionChoice + + +AzureChatCompletionRole = Literal["system", "user", "assistant", "function", "tool"] + + +class ChatExtensionConfiguration(TypedDict): + type: Literal["AzureCognitiveSearch"] + parameters: object + + +class ContentFilterResult(BaseModel): + severity: Literal["safe", "low", "medium", "high"] + filtered: bool + + +class Error(BaseModel): + code: str + message: str + + +class ContentFilterResults(BaseModel): + hate: Optional[ContentFilterResult] + self_harm: Optional[ContentFilterResult] + violence: Optional[ContentFilterResult] + sexual: Optional[ContentFilterResult] + error: Optional[Error] + + +class PromptFilterResult(BaseModel): + prompt_index: int + content_filter_results: Optional[ContentFilterResults] + + +class AzureChatExtensionsMessageContext(BaseModel): + messages: Optional[List[ChatCompletionMessage]] + + +class AzureChatCompletionMessage(ChatCompletionMessage): + context: Optional[AzureChatExtensionsMessageContext] + role: AzureChatCompletionRole # type: ignore + + +class AzureChatCompletionChoice(ChatChoice): + content_filter_results: Optional[ContentFilterResults] + message: AzureChatCompletionMessage # type: ignore + + +class AzureChatCompletion(ChatCompletion): + choices: List[AzureChatCompletionChoice] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] + + +class AzureChoiceDelta(ChoiceDelta): + context: Optional[AzureChatExtensionsMessageContext] + + +class AzureChatCompletionChoiceDelta(ChatChoiceDelta): + delta: AzureChoiceDelta # type: ignore + content_filter_results: Optional[ContentFilterResults] + + +class AzureChatCompletionChunk(ChatCompletionChunk): + choices: 
List[AzureChatCompletionChoiceDelta] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] + + +class AzureCompletionChoice(CompletionChoice): + content_filter_results: Optional[ContentFilterResults] + + +class AzureCompletion(Completion): + choices: List[AzureCompletionChoice] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] diff --git a/src/openai/azure/_credential.py b/src/openai/azure/_credential.py new file mode 100644 index 0000000000..9d10e14909 --- /dev/null +++ b/src/openai/azure/_credential.py @@ -0,0 +1,46 @@ +from typing import AsyncGenerator, Generator, Any +import time +import asyncio +import httpx + + +class TokenCredential: + """Placeholder/example token credential class + + A real implementation would be compatible with e.g. azure-identity and also should be easily + adaptible to other token credential implementations. + """ + def __init__(self): + import azure.identity + self._credential = azure.identity.DefaultAzureCredential() + + def get_token(self): + return self._credential.get_token('https://cognitiveservices.azure.com/.default').token + + +class TokenAuth(httpx.Auth): + def __init__(self, credential: "TokenCredential") -> None: + self._credential = credential + self._async_lock = asyncio.Lock() + self.cached_token = None + + def sync_get_token(self) -> str: + if not self.cached_token or self.cached_token.expires_on - time.time() < 300: + return self._credential.get_token("https://cognitiveservices.azure.com/.default").token + return self.cached_token.token + + def sync_auth_flow(self, request: httpx.Request) -> Generator[httpx.Request, Any, Any]: + token = self.sync_get_token() + request.headers["Authorization"] = f"Bearer {token}" + yield request + + async def async_get_token(self) -> str: + async with self._async_lock: + if not self.cached_token or self.cached_token.expires_on - time.time() < 300: + return (await self._credential.get_token("https://cognitiveservices.azure.com/.default")).token + return self.cached_token.token + + async def async_auth_flow(self, request: httpx.Request) -> AsyncGenerator[httpx.Request, Any]: + token = await self.async_get_token() + request.headers["Authorization"] = f"Bearer {token}" + yield request diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py new file mode 100644 index 0000000000..677f9e8ac3 --- /dev/null +++ b/src/openai/azure/_sync_client.py @@ -0,0 +1,857 @@ +from __future__ import annotations + +from typing_extensions import Literal, override +from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Union +import time + +import httpx + +from openai import Client, OpenAIError +from openai.types import ImagesResponse + +# These are types used in the public API surface area that are not exported as public +from openai._models import FinalRequestOptions + +# These types are needed for correct typing of overrides +from openai._types import NotGiven, NOT_GIVEN, Headers, Query, Body +from openai._streaming import Stream + +from openai.resources.chat import Chat, Completions +from openai.resources.completions import Completions as CompletionsOperations +from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk +from openai.types.chat.completion_create_params import FunctionCall, Function +from openai.types.completion import Completion + +# Azure specific types +from ._credential import TokenCredential, TokenAuth +from ._azuremodels import ( + 
ChatExtensionConfiguration, + AzureChatCompletion, + AzureChatCompletionChunk, + AzureCompletion, +) + +TIMEOUT_SECS = 600 + +class AzureChat(Chat): + + @property + def completions(self) -> "AzureChatCompletions": + return self._completions + + def __init__(self, client: "AzureOpenAIClient"): + self._completions = AzureChatCompletions(client) + + +class AzureChatCompletions(Completions): + + @overload + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. 
The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
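+
+    # Usage sketch (illustration only, not part of the implementation): a streaming call
+    # through the sync client authenticating with the placeholder `TokenCredential`.
+    # The endpoint and deployment name are placeholders, and the constructor is assumed
+    # to mirror `AsyncAzureOpenAIClient`; the empty-`choices` guard is defensive since
+    # some Azure chunks may carry only prompt filter results:
+    #
+    #     from openai.azure import AzureOpenAIClient, TokenCredential
+    #
+    #     client = AzureOpenAIClient(
+    #         base_url="https://my-resource.openai.azure.com/",
+    #         credential=TokenCredential(),
+    #     )
+    #     stream = client.chat.completions.create(
+    #         model="gpt-35-turbo",  # the Azure deployment name
+    #         messages=[{"role": "user", "content": "Tell me a joke."}],
+    #         stream=True,
+    #     )
+    #     for chunk in stream:
+    #         if chunk.choices:
+    #             print(chunk.choices[0].delta.content or "", end="")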
+ + @overload + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> Stream[AzureChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. 
Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + @override + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion | Stream[AzureChatCompletionChunk]: + if data_sources: + if extra_body is None: + extra_body= {} + cast(Dict[str, Any], extra_body)['dataSources'] = data_sources + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[ChatCompletion, Stream[ChatCompletionChunk]], + super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + if isinstance(response, Stream): + response._cast_to = AzureChatCompletionChunk # or rebuild the stream? + else: + response_json = response.model_dump(mode="json") + response = AzureChatCompletion.construct(**response_json) + return response # type: ignore + + +class AzureCompletions(CompletionsOperations): + @overload + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream: Whether to stream back partial progress. 
If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + stream: Literal[True], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> Stream[AzureCompletion]: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. 
+ + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @override + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion | Stream[AzureCompletion]: + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. 
Overload + override combination is problematic
+            "stream": True
+        } if stream else {}
+        response = cast(
+            Union[Completion, Stream[Completion]],
+            super().create(
+                model=model,
+                prompt=prompt,
+                best_of=best_of,
+                echo=echo,
+                frequency_penalty=frequency_penalty,
+                logit_bias=logit_bias,
+                logprobs=logprobs,
+                max_tokens=max_tokens,
+                n=n,
+                presence_penalty=presence_penalty,
+                stop=stop,
+                **stream_dict,
+                suffix=suffix,
+                temperature=temperature,
+                top_p=top_p,
+                user=user,
+                extra_headers=extra_headers,
+                extra_query=extra_query,
+                extra_body=extra_body,
+                timeout=timeout
+            )
+        )
+
+        if isinstance(response, Stream):
+            response._cast_to = AzureCompletion
+        else:
+            response_json = response.model_dump(mode="json")
+            response = AzureCompletion.construct(**response_json)
+        return response  # type: ignore
+
+
+class AzureOpenAIClient(Client):
+
+    @property
+    @override
+    def chat(self) -> AzureChat:
+        return self._chat
+
+    @property
+    @override
+    def completions(self) -> AzureCompletions:
+        return self._completions
+
+    @completions.setter
+    def completions(self, value: AzureCompletions) -> None:
+        self._completions = value
+
+    def __init__(self, *args: Any, base_url: str, credential: Optional["TokenCredential"] = None, api_version: str = '2023-09-01-preview', **kwargs: Any):
+        # Azure requires an api-version query parameter on every request; default it here.
+        default_query = kwargs.get('default_query', {})
+        default_query.setdefault('api-version', api_version)
+        kwargs['default_query'] = default_query
+        self.credential = credential
+        if credential:
+            kwargs['api_key'] = 'Placeholder: AAD'  # TODO: There is an assumption/validation there is always an API key.
+        super().__init__(*args, base_url=base_url, **kwargs)
+        self._chat = AzureChat(self)
+        self._completions = AzureCompletions(self)  # replace the base Completions resource assigned by the parent __init__
+
+    @property
+    def auth_headers(self) -> Dict[str, str]:
+        return {"api-key": self.api_key}
+
+    @property
+    def custom_auth(self) -> httpx.Auth | None:
+        if self.credential:
+            return TokenAuth(self.credential)
+        return None
+
+    # NOTE: We override the internal method because `@override`-ing `@overload`ed methods and keeping typing happy is a pain. Most typing tools are lacking...
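+    #
+    # A rough usage sketch of how requests are expected to route on Azure (paths are
+    # relative to `base_url`; the resource and deployment names below are hypothetical
+    # and only for illustration):
+    #
+    #   client = AzureOpenAIClient(
+    #       base_url="https://my-resource.openai.azure.com",  # hypothetical Azure resource
+    #       api_key="<api-key>",                              # or credential=TokenCredential(...)
+    #   )
+    #   client.chat.completions.create(model="my-gpt-35-deployment", messages=[...])
+    #   #  -> POST openai/deployments/my-gpt-35-deployment/chat/completions?api-version=...
+    #   client.images.generate(prompt="a lighthouse at dawn")
+    #   #  -> POST openai/images/generations:submit, then GET openai/operations/images/{id} until done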
+    def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any:
+        # Image generation on Azure is an asynchronous operation: submit the job, then
+        # poll the operations endpoint until it reaches a terminal state.
+        if options.url == "/images/generations":
+            options.url = "openai/images/generations:submit"
+            response = super()._request(options=options, **kwargs)
+            model_extra = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {}
+            operation_id = cast(str, model_extra['id'])
+            return self._poll(
+                "get", f"openai/operations/images/{operation_id}",
+                until=lambda response: response.json()["status"] in ["succeeded"],
+                failed=lambda response: response.json()["status"] in ["failed"],
+            )
+        # Requests that carry a model are routed to the deployment-specific path; requests
+        # that include dataSources go through the /extensions route instead.
+        if isinstance(options.json_data, Mapping):
+            model = cast(str, options.json_data["model"])
+            if not options.url.startswith(f'openai/deployments/{model}'):
+                if options.extra_json and options.extra_json.get("dataSources"):
+                    options.url = f'openai/deployments/{model}/extensions' + options.url
+                else:
+                    options.url = f'openai/deployments/{model}' + options.url
+        # Non-deployment endpoints still need the "openai/" prefix on Azure.
+        if options.url.startswith(("/models", "/fine_tuning", "/files", "/fine-tunes")):
+            options.url = f"openai{options.url}"
+        return super()._request(options=options, **kwargs)
+
+    # Internal azure specific "helper" methods
+    def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool:
+        # Raises with the operation's error details if the predicate matches; otherwise returns False.
+        if not predicate(response):
+            return False
+        error_data = cast(Dict[str, Any], response.json()['error'])
+        message = error_data.get('message', 'Operation failed')
+        code = error_data.get('code')
+        raise OpenAIError(message, code)
+
+    def _poll(
+        self,
+        method: str,
+        url: str,
+        until: Callable[[httpx.Response], bool],
+        failed: Callable[[httpx.Response], bool],
+        interval: Optional[float] = None,
+        delay: Optional[float] = None,
+    ) -> ImagesResponse:
+        if delay:
+            time.sleep(delay)
+
+        opts = FinalRequestOptions.construct(method=method, url=url)
+        response = super().request(httpx.Response, opts)
+        self._check_polling_response(response, failed)
+        start_time = time.time()
+        while not until(response):
+            if time.time() - start_time > TIMEOUT_SECS:
+                raise OpenAIError("Operation polling timed out.")  # TODO: Find the right exception
+
+            # Honor the server's retry-after hint when present; otherwise fall back to 10 seconds.
+            time.sleep(interval or int(response.headers.get("retry-after") or 10))
+            response = super().request(httpx.Response, opts)
+            self._check_polling_response(response, failed)
+
+        response_json = response.json()
+        return ImagesResponse.construct(**response_json["result"])
diff --git a/src/openai/resources/chat/chat.py b/src/openai/resources/chat/chat.py
index 62bb796571..226b3d7add 100644
--- a/src/openai/resources/chat/chat.py
+++ b/src/openai/resources/chat/chat.py
@@ -14,16 +14,22 @@
 
 
 class Chat(SyncAPIResource):
-    completions: Completions
+
+    @property
+    def completions(self) -> Completions:
+        return self._completions
 
     def __init__(self, client: OpenAI) -> None:
         super().__init__(client)
-        self.completions = Completions(client)
+        self._completions = Completions(client)
 
 
 class AsyncChat(AsyncAPIResource):
-    completions: AsyncCompletions
-
+
+    @property
+    def completions(self) -> AsyncCompletions:
+        return self._completions
+
     def __init__(self, client: AsyncOpenAI) -> None:
         super().__init__(client)
-        self.completions = AsyncCompletions(client)
+        self._completions = AsyncCompletions(client)