diff --git a/src/openai/__init__.py b/src/openai/__init__.py index d011d416ac..7d2630f123 100644 --- a/src/openai/__init__.py +++ b/src/openai/__init__.py @@ -65,7 +65,7 @@ "AsyncStream", "OpenAI", "AsyncOpenAI", - "file_from_path", + "file_from_path" ] from .version import VERSION as VERSION diff --git a/src/openai/_client.py b/src/openai/_client.py index e0e5e37f4c..a1ee81e2ea 100644 --- a/src/openai/_client.py +++ b/src/openai/_client.py @@ -46,7 +46,11 @@ class OpenAI(SyncAPIClient): completions: resources.Completions - chat: resources.Chat + + @property + def chat(self) -> resources.chat.Chat: + return self._chat + # chat: resources.chat.Chat edits: resources.Edits embeddings: resources.Embeddings files: resources.Files @@ -122,7 +126,7 @@ def __init__( self._default_stream_cls = Stream self.completions = resources.Completions(self) - self.chat = resources.Chat(self) + self._chat = resources.Chat(self) self.edits = resources.Edits(self) self.embeddings = resources.Embeddings(self) self.files = resources.Files(self) @@ -244,7 +248,9 @@ def _make_status_error( class AsyncOpenAI(AsyncAPIClient): completions: resources.AsyncCompletions - chat: resources.AsyncChat + @property + def chat(self) -> resources.AsyncChat: + return self._chat edits: resources.AsyncEdits embeddings: resources.AsyncEmbeddings files: resources.AsyncFiles @@ -320,7 +326,7 @@ def __init__( self._default_stream_cls = AsyncStream self.completions = resources.AsyncCompletions(self) - self.chat = resources.AsyncChat(self) + self._chat = resources.AsyncChat(self) self.edits = resources.AsyncEdits(self) self.embeddings = resources.AsyncEmbeddings(self) self.files = resources.AsyncFiles(self) diff --git a/src/openai/azure/__init__.py b/src/openai/azure/__init__.py new file mode 100644 index 0000000000..805d97a52f --- /dev/null +++ b/src/openai/azure/__init__.py @@ -0,0 +1,9 @@ +from ._sync_client import AzureOpenAIClient +from ._async_client import AsyncAzureOpenAIClient +from ._credential import TokenCredential + +__all__ = [ + "AzureOpenAIClient", + "TokenCredential", + "AsyncAzureOpenAIClient", +] \ No newline at end of file diff --git a/src/openai/azure/_async_client.py b/src/openai/azure/_async_client.py new file mode 100644 index 0000000000..2f9ca5bc80 --- /dev/null +++ b/src/openai/azure/_async_client.py @@ -0,0 +1,855 @@ +from __future__ import annotations + +from typing_extensions import Literal, override +from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Type, Union +import time + +import httpx + +from openai import AsyncClient, OpenAIError +from openai.resources.chat import AsyncChat, AsyncCompletions +from openai.resources.completions import AsyncCompletions as AsyncCompletionsOperations +from openai.types import ImagesResponse +from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk +from openai.types.chat.completion_create_params import FunctionCall, Function +from openai.types.completion import Completion + +# These types are needed for correct typing of overrides +from openai._types import NotGiven, NOT_GIVEN, Headers, Query, Body, ResponseT + +# These are types used in the public API surface area that are not exported as public +from openai._models import FinalRequestOptions +from openai._streaming import AsyncStream + +# Azure specific types +from ._credential import TokenCredential, TokenAuth +from ._azuremodels import ( + ChatExtensionConfiguration, + AzureChatCompletion, + AzureChatCompletionChunk, + AzureCompletion, +) + +TIMEOUT_SECS 
= 600 + +class AsyncAzureChat(AsyncChat): + + @property + def completions(self) -> "AsyncAzureChatCompletions": + return self._completions + + def __init__(self, client: "AsyncAzureOpenAIClient"): + self._completions = AsyncAzureChatCompletions(client) + + +class AsyncAzureChatCompletions(AsyncCompletions): + + @overload + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. 
The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
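+
+    # Usage sketch (illustration only, not part of the implementation): a non-streaming
+    # call against an Azure deployment using the Azure-specific `data_sources` extension.
+    # The endpoint, key, deployment name and Cognitive Search parameters below are
+    # placeholders:
+    #
+    #     from openai.azure import AsyncAzureOpenAIClient
+    #
+    #     client = AsyncAzureOpenAIClient(
+    #         base_url="https://my-resource.openai.azure.com/",
+    #         api_key="<azure-openai-key>",
+    #         api_version="2023-09-01-preview",
+    #     )
+    #     completion = await client.chat.completions.create(
+    #         model="gpt-35-turbo",  # the Azure deployment name
+    #         messages=[{"role": "user", "content": "What is Azure OpenAI?"}],
+    #         data_sources=[
+    #             {
+    #                 "type": "AzureCognitiveSearch",
+    #                 "parameters": {"endpoint": "...", "key": "...", "indexName": "..."},
+    #             }
+    #         ],
+    #     )
+    #     print(completion.choices[0].message.content)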
+ + @overload + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[AzureChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. 
Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + @override + async def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion | AsyncStream[AzureChatCompletionChunk]: + if data_sources: + if extra_body is None: + extra_body= {} + cast(Dict[str, Any], extra_body)['dataSources'] = data_sources + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[ChatCompletion, ChatCompletionChunk], + await super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + if isinstance(response, AsyncStream): + response._cast_to = AzureChatCompletionChunk # or rebuild the stream? + else: + response_json = response.model_dump(mode="json") + response = AzureChatCompletion.construct(**response_json) + return response # type: ignore + + +class AsyncAzureCompletions(AsyncCompletionsOperations): + @overload + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream: Whether to stream back partial progress. 
If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + stream: Literal[True], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[AzureCompletion]: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. 
+ + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @override + async def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion | AsyncStream[AzureCompletion]: + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. 
Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[Completion, AsyncStream[Completion]], + await super().create( + model=model, + prompt=prompt, + best_of=best_of, + echo=echo, + frequency_penalty = frequency_penalty, + logit_bias=logit_bias, + logprobs=logprobs, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + suffix=suffix, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + + if isinstance(response, AsyncStream): + response._cast_to = AzureCompletion + else: + response_json = response.model_dump(mode="json") + response = AzureCompletion.construct(**response_json) + return response # type: ignore + + +class AsyncAzureOpenAIClient(AsyncClient): + + @property + @override + def chat(self) -> AsyncAzureChat: + return self._chat + + @property + @override + def completions(self) -> AsyncAzureCompletions: + return self._completions + + @completions.setter + def completions(self, value: AsyncAzureCompletions) -> None: + self._completions = value + + def __init__(self, *args: Any, credential: Optional["TokenCredential"] = None, api_version: str = '2023-09-01-preview', **kwargs: Any): + default_query = kwargs.get('default_query', {}) + default_query.setdefault('api-version', api_version) + kwargs['default_query'] = default_query + self.credential = credential + if credential: + kwargs['api_key'] = 'Placeholder: AAD' # TODO: There is an assumption/validation there is always an API key. + super().__init__(*args, **kwargs) + self._chat = AsyncAzureChat(self) + + @property + def auth_headers(self) -> Dict[str, str]: + return {"api-key": self.api_key} + + @property + def custom_auth(self) -> httpx.Auth | None: + if self.credential: + return TokenAuth(self.credential) + + def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool: + if not predicate(response): + return False + error_data = response.json()['error'] + message: str = cast(str, error_data.get('message', 'Operation failed')) + code = error_data.get('code') + raise OpenAIError(f'Error: {message} ({code})') + + async def _poll( + self, + method: str, + url: str, + until: Callable[[httpx.Response], bool], + failed: Callable[[httpx.Response], bool], + interval: Optional[float] = None, + delay: Optional[float] = None, + ) -> ImagesResponse: + if delay: + time.sleep(delay) + + opts = FinalRequestOptions.construct(method=method, url=url) + response = await super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + start_time = time.time() + while not until(response): + if time.time() - start_time > TIMEOUT_SECS: + raise Exception("Operation polling timed out.") # TODO: Fix up exception type. + + time.sleep(interval or int(response.headers.get("retry-after")) or 10) + response = await super().request(httpx.Response, opts) + self._check_polling_response(response, failed) + + response_json = response.json() + return ImagesResponse.construct(**response_json["result"]) + + # NOTE: We override the internal method because `@overrid`ing `@overload`ed methods and keeping typing happy is a pain. Most typing tools are lacking... 
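+    # Routing sketch for the override below (the api-version query parameter is supplied
+    # via `default_query`; "gpt-35-turbo" is a placeholder deployment name):
+    #   "/chat/completions"                       -> "openai/deployments/gpt-35-turbo/chat/completions"
+    #   same request with "dataSources" in body   -> "openai/deployments/gpt-35-turbo/extensions/chat/completions"
+    #   "/images/generations"                     -> "openai/images/generations:submit", then poll
+    #                                                "openai/operations/images/{operation_id}"
+    #   "/models", "/files", "/fine_tuning", ...  -> prefixed with "openai"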
+ async def _request(self, cast_to: Type[ResponseT], options: FinalRequestOptions, **kwargs: Any) -> Any: + if options.url == "/images/generations": + options.url = "openai/images/generations:submit" + response = await super()._request(cast_to=cast_to, options=options, **kwargs) + model_extra = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {} + operation_id = cast(str, model_extra['id']) + return await self._poll( + "get", f"openai/operations/images/{operation_id}", + until=lambda response: response.json()["status"] in ["succeeded"], + failed=lambda response: response.json()["status"] in ["failed"], + ) + if isinstance(options.json_data, Mapping): + model = cast(str, options.json_data["model"]) + if not options.url.startswith(f'openai/deployments/{model}'): + if options.extra_json and options.extra_json.get("dataSources"): + options.url = f'openai/deployments/{model}/extensions' + options.url + else: + options.url = f'openai/deployments/{model}' + options.url + if options.url.startswith(("/models", "/fine_tuning", "/files", "/fine-tunes")): + options.url = f"openai{options.url}" + return await super()._request(cast_to=cast_to, options=options, **kwargs) diff --git a/src/openai/azure/_azuremodels.py b/src/openai/azure/_azuremodels.py new file mode 100644 index 0000000000..841bd11d78 --- /dev/null +++ b/src/openai/azure/_azuremodels.py @@ -0,0 +1,82 @@ +from typing import List, Optional +from typing_extensions import TypedDict, Literal +from openai._models import BaseModel as BaseModel + +from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage +from openai.types.chat.chat_completion import Choice as ChatChoice +from openai.types.chat.chat_completion_chunk import ChoiceDelta, Choice as ChatChoiceDelta +from openai.types.completion import Completion +from openai.types.completion_choice import CompletionChoice + + +AzureChatCompletionRole = Literal["system", "user", "assistant", "function", "tool"] + + +class ChatExtensionConfiguration(TypedDict): + type: Literal["AzureCognitiveSearch"] + parameters: object + + +class ContentFilterResult(BaseModel): + severity: Literal["safe", "low", "medium", "high"] + filtered: bool + + +class Error(BaseModel): + code: str + message: str + + +class ContentFilterResults(BaseModel): + hate: Optional[ContentFilterResult] + self_harm: Optional[ContentFilterResult] + violence: Optional[ContentFilterResult] + sexual: Optional[ContentFilterResult] + error: Optional[Error] + + +class PromptFilterResult(BaseModel): + prompt_index: int + content_filter_results: Optional[ContentFilterResults] + + +class AzureChatExtensionsMessageContext(BaseModel): + messages: Optional[List[ChatCompletionMessage]] + + +class AzureChatCompletionMessage(ChatCompletionMessage): + context: Optional[AzureChatExtensionsMessageContext] + role: AzureChatCompletionRole # type: ignore + + +class AzureChatCompletionChoice(ChatChoice): + content_filter_results: Optional[ContentFilterResults] + message: AzureChatCompletionMessage # type: ignore + + +class AzureChatCompletion(ChatCompletion): + choices: List[AzureChatCompletionChoice] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] + + +class AzureChoiceDelta(ChoiceDelta): + context: Optional[AzureChatExtensionsMessageContext] + + +class AzureChatCompletionChoiceDelta(ChatChoiceDelta): + delta: AzureChoiceDelta # type: ignore + content_filter_results: Optional[ContentFilterResults] + + +class AzureChatCompletionChunk(ChatCompletionChunk): + choices: 
List[AzureChatCompletionChoiceDelta] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] + + +class AzureCompletionChoice(CompletionChoice): + content_filter_results: Optional[ContentFilterResults] + + +class AzureCompletion(Completion): + choices: List[AzureCompletionChoice] # type: ignore + prompt_filter_results: Optional[List[PromptFilterResult]] diff --git a/src/openai/azure/_credential.py b/src/openai/azure/_credential.py new file mode 100644 index 0000000000..9d10e14909 --- /dev/null +++ b/src/openai/azure/_credential.py @@ -0,0 +1,46 @@ +from typing import AsyncGenerator, Generator, Any +import time +import asyncio +import httpx + + +class TokenCredential: + """Placeholder/example token credential class + + A real implementation would be compatible with e.g. azure-identity and also should be easily + adaptible to other token credential implementations. + """ + def __init__(self): + import azure.identity + self._credential = azure.identity.DefaultAzureCredential() + + def get_token(self): + return self._credential.get_token('https://cognitiveservices.azure.com/.default').token + + +class TokenAuth(httpx.Auth): + def __init__(self, credential: "TokenCredential") -> None: + self._credential = credential + self._async_lock = asyncio.Lock() + self.cached_token = None + + def sync_get_token(self) -> str: + if not self.cached_token or self.cached_token.expires_on - time.time() < 300: + return self._credential.get_token("https://cognitiveservices.azure.com/.default").token + return self.cached_token.token + + def sync_auth_flow(self, request: httpx.Request) -> Generator[httpx.Request, Any, Any]: + token = self.sync_get_token() + request.headers["Authorization"] = f"Bearer {token}" + yield request + + async def async_get_token(self) -> str: + async with self._async_lock: + if not self.cached_token or self.cached_token.expires_on - time.time() < 300: + return (await self._credential.get_token("https://cognitiveservices.azure.com/.default")).token + return self.cached_token.token + + async def async_auth_flow(self, request: httpx.Request) -> AsyncGenerator[httpx.Request, Any]: + token = await self.async_get_token() + request.headers["Authorization"] = f"Bearer {token}" + yield request diff --git a/src/openai/azure/_sync_client.py b/src/openai/azure/_sync_client.py new file mode 100644 index 0000000000..677f9e8ac3 --- /dev/null +++ b/src/openai/azure/_sync_client.py @@ -0,0 +1,857 @@ +from __future__ import annotations + +from typing_extensions import Literal, override +from typing import Any, Callable, cast, List, Mapping, Dict, Optional, overload, Union +import time + +import httpx + +from openai import Client, OpenAIError +from openai.types import ImagesResponse + +# These are types used in the public API surface area that are not exported as public +from openai._models import FinalRequestOptions + +# These types are needed for correct typing of overrides +from openai._types import NotGiven, NOT_GIVEN, Headers, Query, Body +from openai._streaming import Stream + +from openai.resources.chat import Chat, Completions +from openai.resources.completions import Completions as CompletionsOperations +from openai.types.chat import ChatCompletionMessageParam, ChatCompletion, ChatCompletionChunk +from openai.types.chat.completion_create_params import FunctionCall, Function +from openai.types.completion import Completion + +# Azure specific types +from ._credential import TokenCredential, TokenAuth +from ._azuremodels import ( + 
ChatExtensionConfiguration, + AzureChatCompletion, + AzureChatCompletionChunk, + AzureCompletion, +) + +TIMEOUT_SECS = 600 + +class AzureChat(Chat): + + @property + def completions(self) -> "AzureChatCompletions": + return self._completions + + def __init__(self, client: "AzureOpenAIClient"): + self._completions = AzureChatCompletions(client) + + +class AzureChatCompletions(Completions): + + @overload + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. 
The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
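+
+    # Usage sketch (illustration only, not part of the implementation): a streaming call
+    # through the sync client authenticating with the placeholder `TokenCredential`.
+    # The endpoint and deployment name are placeholders, and the constructor is assumed
+    # to mirror `AsyncAzureOpenAIClient`; the empty-`choices` guard is defensive since
+    # some Azure chunks may carry only prompt filter results:
+    #
+    #     from openai.azure import AzureOpenAIClient, TokenCredential
+    #
+    #     client = AzureOpenAIClient(
+    #         base_url="https://my-resource.openai.azure.com/",
+    #         credential=TokenCredential(),
+    #     )
+    #     stream = client.chat.completions.create(
+    #         model="gpt-35-turbo",  # the Azure deployment name
+    #         messages=[{"role": "user", "content": "Tell me a joke."}],
+    #         stream=True,
+    #     )
+    #     for chunk in stream:
+    #         if chunk.choices:
+    #             print(chunk.choices[0].delta.content or "", end="")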
+ + @overload + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> Stream[AzureChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb). + + model: ID of the model to use. See the + [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) + table for details on which models work with the Chat API. + + stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + function_call: Controls how the model responds to function calls. `none` means the model does + not call a function, and responds to the end-user. `auto` means the model can + pick between an end-user or calling a function. Specifying a particular function + via `{"name": "my_function"}` forces the model to call that function. `none` is + the default when no functions are present. `auto` is the default if functions + are present. + + functions: A list of functions the model may generate JSON inputs for. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. 
Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the chat completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many chat completion choices to generate for each input message. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + @override + def create( + self, + *, + messages: List[ChatCompletionMessageParam], + model: Union[ + str, + Literal[ + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", + ], + ], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + function_call: FunctionCall | NotGiven = NOT_GIVEN, + functions: List[Function] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + data_sources: List[ChatExtensionConfiguration] | NotGiven = NOT_GIVEN, # TODO + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureChatCompletion | Stream[AzureChatCompletionChunk]: + if data_sources: + if extra_body is None: + extra_body= {} + cast(Dict[str, Any], extra_body)['dataSources'] = data_sources + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. Overload + override combination is problematic + "stream": True + } if stream else {} + response = cast( + Union[ChatCompletion, Stream[ChatCompletionChunk]], + super().create( + messages=messages, + model=model, + frequency_penalty = frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + **stream_dict, + temperature=temperature, + top_p=top_p, + user=user, + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout + ) + ) + if isinstance(response, Stream): + response._cast_to = AzureChatCompletionChunk # or rebuild the stream? + else: + response_json = response.model_dump(mode="json") + response = AzureChatCompletion.construct(**response_json) + return response # type: ignore + + +class AzureCompletions(CompletionsOperations): + @overload + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream: Whether to stream back partial progress. 
If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + stream: Literal[True], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> Stream[AzureCompletion]: + """ + Creates a completion for the provided prompt and parameters. + + Args: + model: ID of the model to use. You can use the + [List models](https://platform.openai.com/docs/api-reference/models/list) API to + see all of your available models, or see our + [Model overview](https://platform.openai.com/docs/models/overview) for + descriptions of them. + + prompt: The prompt(s) to generate completions for, encoded as a string, array of + strings, array of tokens, or array of token arrays. 
+ + Note that <|endoftext|> is the document separator that the model sees during + training, so if a prompt is not specified the model will generate as if from the + beginning of a new document. + + stream: Whether to stream back partial progress. If set, tokens will be sent as + data-only + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) + as they become available, with the stream terminated by a `data: [DONE]` + message. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb). + + best_of: Generates `best_of` completions server-side and returns the "best" (the one with + the highest log probability per token). Results cannot be streamed. + + When used with `n`, `best_of` controls the number of candidate completions and + `n` specifies how many to return – `best_of` must be greater than `n`. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + echo: Echo back the prompt in addition to the completion + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a json object that maps tokens (specified by their token ID in the GPT + tokenizer) to an associated bias value from -100 to 100. You can use this + [tokenizer tool](/tokenizer?view=bpe) (which works for both GPT-2 and GPT-3) to + convert text to token IDs. Mathematically, the bias is added to the logits + generated by the model prior to sampling. The exact effect will vary per model, + but values between -1 and 1 should decrease or increase likelihood of selection; + values like -100 or 100 should result in a ban or exclusive selection of the + relevant token. + + As an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token + from being generated. + + logprobs: Include the log probabilities on the `logprobs` most likely tokens, as well the + chosen tokens. For example, if `logprobs` is 5, the API will return a list of + the 5 most likely tokens. The API will always return the `logprob` of the + sampled token, so there may be up to `logprobs+1` elements in the response. + + The maximum value for `logprobs` is 5. + + max_tokens: The maximum number of [tokens](/tokenizer) to generate in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) + for counting tokens. + + n: How many completions to generate for each prompt. + + **Note:** Because this parameter generates many completions, it can quickly + consume your token quota. Use carefully and ensure that you have reasonable + settings for `max_tokens` and `stop`. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. 
+ + [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/gpt/parameter-details) + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + suffix: The suffix that comes after a completion of inserted text. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @override + def create( + self, + *, + model: Union[ + str, + Literal[ + "babbage-002", + "davinci-002", + "gpt-3.5-turbo-instruct", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "code-davinci-002", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + ], + ], + prompt: Union[str, List[str], List[int], List[List[int]], None], + best_of: Optional[int] | NotGiven = NOT_GIVEN, + echo: Optional[bool] | NotGiven = NOT_GIVEN, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + suffix: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | None | NotGiven = NOT_GIVEN, + ) -> AzureCompletion | Stream[AzureCompletion]: + stream_dict: Dict[str, Literal[True]] = { # TODO: pylance is upset if I pass through the parameter value. 
Overload + override combination is problematic
+            "stream": True
+        } if stream else {}
+        response = cast(
+            Union[Completion, Stream[Completion]],
+            super().create(
+                model=model,
+                prompt=prompt,
+                best_of=best_of,
+                echo=echo,
+                frequency_penalty=frequency_penalty,
+                logit_bias=logit_bias,
+                logprobs=logprobs,
+                max_tokens=max_tokens,
+                n=n,
+                presence_penalty=presence_penalty,
+                stop=stop,
+                **stream_dict,
+                suffix=suffix,
+                temperature=temperature,
+                top_p=top_p,
+                user=user,
+                extra_headers=extra_headers,
+                extra_query=extra_query,
+                extra_body=extra_body,
+                timeout=timeout
+            )
+        )
+
+        if isinstance(response, Stream):
+            response._cast_to = AzureCompletion
+        else:
+            response_json = response.model_dump(mode="json")
+            response = AzureCompletion.construct(**response_json)
+        return response  # type: ignore
+
+
+class AzureOpenAIClient(Client):
+
+    @property
+    @override
+    def chat(self) -> AzureChat:
+        return self._chat
+
+    @property
+    @override
+    def completions(self) -> AzureCompletions:
+        return self._completions
+
+    @completions.setter
+    def completions(self, value: AzureCompletions) -> None:
+        self._completions = value
+
+    def __init__(self, *args: Any, base_url: str, credential: Optional["TokenCredential"] = None, api_version: str = '2023-09-01-preview', **kwargs: Any):
+        # Azure requires an api-version query parameter on every request; default it here.
+        default_query = kwargs.get('default_query', {})
+        default_query.setdefault('api-version', api_version)
+        kwargs['default_query'] = default_query
+        self.credential = credential
+        if credential:
+            kwargs['api_key'] = 'Placeholder: AAD'  # TODO: There is an assumption/validation there is always an API key.
+        super().__init__(*args, base_url=base_url, **kwargs)
+        self._chat = AzureChat(self)
+        self._completions = AzureCompletions(self)  # replace the base Completions resource assigned by the parent __init__
+
+    @property
+    def auth_headers(self) -> Dict[str, str]:
+        return {"api-key": self.api_key}
+
+    @property
+    def custom_auth(self) -> httpx.Auth | None:
+        if self.credential:
+            return TokenAuth(self.credential)
+        return None
+
+    # NOTE: We override the internal method because `@override`-ing `@overload`ed methods and keeping typing happy is a pain. Most typing tools are lacking...
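+    #
+    # A rough usage sketch of how requests are expected to route on Azure (paths are
+    # relative to `base_url`; the resource and deployment names below are hypothetical
+    # and only for illustration):
+    #
+    #   client = AzureOpenAIClient(
+    #       base_url="https://my-resource.openai.azure.com",  # hypothetical Azure resource
+    #       api_key="<api-key>",                              # or credential=TokenCredential(...)
+    #   )
+    #   client.chat.completions.create(model="my-gpt-35-deployment", messages=[...])
+    #   #  -> POST openai/deployments/my-gpt-35-deployment/chat/completions?api-version=...
+    #   client.images.generate(prompt="a lighthouse at dawn")
+    #   #  -> POST openai/images/generations:submit, then GET openai/operations/images/{id} until done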
+    def _request(self, *, options: FinalRequestOptions, **kwargs: Any) -> Any:
+        # Image generation on Azure is an asynchronous operation: submit the job, then
+        # poll the operations endpoint until it reaches a terminal state.
+        if options.url == "/images/generations":
+            options.url = "openai/images/generations:submit"
+            response = super()._request(options=options, **kwargs)
+            model_extra = cast(Mapping[str, Any], getattr(response, 'model_extra')) or {}
+            operation_id = cast(str, model_extra['id'])
+            return self._poll(
+                "get", f"openai/operations/images/{operation_id}",
+                until=lambda response: response.json()["status"] in ["succeeded"],
+                failed=lambda response: response.json()["status"] in ["failed"],
+            )
+        # Requests that carry a model are routed to the deployment-specific path; requests
+        # that include dataSources go through the /extensions route instead.
+        if isinstance(options.json_data, Mapping):
+            model = cast(str, options.json_data["model"])
+            if not options.url.startswith(f'openai/deployments/{model}'):
+                if options.extra_json and options.extra_json.get("dataSources"):
+                    options.url = f'openai/deployments/{model}/extensions' + options.url
+                else:
+                    options.url = f'openai/deployments/{model}' + options.url
+        # Non-deployment endpoints still need the "openai/" prefix on Azure.
+        if options.url.startswith(("/models", "/fine_tuning", "/files", "/fine-tunes")):
+            options.url = f"openai{options.url}"
+        return super()._request(options=options, **kwargs)
+
+    # Internal azure specific "helper" methods
+    def _check_polling_response(self, response: httpx.Response, predicate: Callable[[httpx.Response], bool]) -> bool:
+        # Raises with the operation's error details if the predicate matches; otherwise returns False.
+        if not predicate(response):
+            return False
+        error_data = cast(Dict[str, Any], response.json()['error'])
+        message = error_data.get('message', 'Operation failed')
+        code = error_data.get('code')
+        raise OpenAIError(message, code)
+
+    def _poll(
+        self,
+        method: str,
+        url: str,
+        until: Callable[[httpx.Response], bool],
+        failed: Callable[[httpx.Response], bool],
+        interval: Optional[float] = None,
+        delay: Optional[float] = None,
+    ) -> ImagesResponse:
+        if delay:
+            time.sleep(delay)
+
+        opts = FinalRequestOptions.construct(method=method, url=url)
+        response = super().request(httpx.Response, opts)
+        self._check_polling_response(response, failed)
+        start_time = time.time()
+        while not until(response):
+            if time.time() - start_time > TIMEOUT_SECS:
+                raise OpenAIError("Operation polling timed out.")  # TODO: Find the right exception
+
+            # Honor the server's retry-after hint when present; otherwise fall back to 10 seconds.
+            time.sleep(interval or int(response.headers.get("retry-after") or 10))
+            response = super().request(httpx.Response, opts)
+            self._check_polling_response(response, failed)
+
+        response_json = response.json()
+        return ImagesResponse.construct(**response_json["result"])
diff --git a/src/openai/resources/chat/chat.py b/src/openai/resources/chat/chat.py
index 62bb796571..226b3d7add 100644
--- a/src/openai/resources/chat/chat.py
+++ b/src/openai/resources/chat/chat.py
@@ -14,16 +14,22 @@
 
 
 class Chat(SyncAPIResource):
-    completions: Completions
+
+    @property
+    def completions(self) -> Completions:
+        return self._completions
 
     def __init__(self, client: OpenAI) -> None:
         super().__init__(client)
-        self.completions = Completions(client)
+        self._completions = Completions(client)
 
 
 class AsyncChat(AsyncAPIResource):
-    completions: AsyncCompletions
-
+
+    @property
+    def completions(self) -> AsyncCompletions:
+        return self._completions
+
     def __init__(self, client: AsyncOpenAI) -> None:
         super().__init__(client)
-        self.completions = AsyncCompletions(client)
+        self._completions = AsyncCompletions(client)