diff --git a/audio.go b/audio.go
index 46c37112b..9231f9dfa 100644
--- a/audio.go
+++ b/audio.go
@@ -17,9 +17,10 @@ const (
 type AudioResponseFormat string
 
 const (
-	AudioResponseFormatJSON AudioResponseFormat = "json"
-	AudioResponseFormatSRT  AudioResponseFormat = "srt"
-	AudioResponseFormatVTT  AudioResponseFormat = "vtt"
+	AudioResponseFormatJSON        AudioResponseFormat = "json"
+	AudioResponseFormatSRT         AudioResponseFormat = "srt"
+	AudioResponseFormatVTT         AudioResponseFormat = "vtt"
+	AudioResponseFormatVerboseJSON AudioResponseFormat = "verbose_json"
 )
 
 // AudioRequest represents a request structure for audio API.
@@ -33,8 +34,32 @@ type AudioRequest struct {
 	Format AudioResponseFormat
 }
 
+// AudioSegment contains information about a part of the audio, segmented by the model.
+type AudioSegment struct {
+	ID               int     `json:"id"`
+	Seek             int     `json:"seek"`
+	Start            float32 `json:"start"`
+	End              float32 `json:"end"`
+	Text             string  `json:"text"`
+	Tokens           []int   `json:"tokens"`
+	Temperature      float32 `json:"temperature"`
+	AvgLogProb       float64 `json:"avg_logprob"`
+	CompressionRatio float64 `json:"compression_ratio"`
+	NoSpeechProb     float64 `json:"no_speech_prob"`
+	Transient        bool    `json:"transient"`
+}
+
+// AudioResponseVerboseJson groups the task, language, duration and segments fields embedded in AudioResponse.
+type AudioResponseVerboseJson struct {
+	Task     string         `json:"task"`
+	Language string         `json:"language"`
+	Duration float32        `json:"duration"`
+	Segments []AudioSegment `json:"segments"`
+}
+
 // AudioResponse represents a response structure for audio API.
 type AudioResponse struct {
+	AudioResponseVerboseJson
 	Text string `json:"text"`
 }
 
@@ -86,7 +111,9 @@ func (c *Client) callAudioAPI(
 
 // HasJSONResponse returns true if the response format is JSON.
 func (r AudioRequest) HasJSONResponse() bool {
-	return r.Format == "" || r.Format == AudioResponseFormatJSON
+	return r.Format == "" ||
+		r.Format == AudioResponseFormatJSON ||
+		r.Format == AudioResponseFormatVerboseJSON
 }
 
 // audioMultipartForm creates a form with audio file contents and the name of the model to use for