diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index 8343da4..0000000
--- a/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
diff --git a/.github/workflows/build_and_publish_to_pypi.yml b/.github/workflows/build_and_publish_to_pypi.yml
index ce8e4be..bee4a13 100755
--- a/.github/workflows/build_and_publish_to_pypi.yml
+++ b/.github/workflows/build_and_publish_to_pypi.yml
@@ -3,6 +3,9 @@ name: Publish Python distributions to PyPI
on:
push:
workflow_dispatch:
+ create:
+ tags:
+ - "*"
jobs:
build-n-publish:
@@ -10,8 +13,6 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@master
- with:
- lfs: true
- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 6a13242..ffc7c50 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,6 +8,7 @@ on:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
+ workflow_dispatch:
jobs:
unit_tests_linux:
@@ -18,8 +19,6 @@ jobs:
steps:
- uses: actions/checkout@v3
- with:
- lfs: true
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
@@ -42,8 +41,6 @@ jobs:
steps:
- uses: actions/checkout@v3
- with:
- lfs: true
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0484d60..a69c7ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
# Change Log
+## v0.6.0 - 2024/02/11
+
+### Added
+
+* Various bug fixes, and some new functionality in `model.py` to control repeated detections
+
+### Changed
+
+* Models are no longer included in the PyPI package and must be downloaded separately
+
+### Removed
+
## v0.5.0 - 2023/06/15
### Added
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index eef6e33..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,2 +0,0 @@
-recursive-include openwakeword *.onnx
-recursive-include openwakeword *.tflite
\ No newline at end of file
diff --git a/README.md b/README.md
index 7e5f4f5..d6f6566 100644
--- a/README.md
+++ b/README.md
@@ -11,12 +11,21 @@ openWakeWord is an open-source wakeword library that can be used to create voice
# Updates
+**2024/02/11**
+- v0.6.0 of openWakeWord released. See the [changelog](CHANGELOG.md) for a full description of new features and changes.
+
+**2023/11/09**
+- Added example scripts under `examples/web` that demonstrate streaming audio from a web application into openWakeWord.
+
+**2023/10/11**
+- Significant improvements to the process of [training new models](#training-new-models), including an example Google Colab notebook demonstrating how to train a basic wake word model in <1 hour.
+
**2023/06/15**
- v0.5.0 of openWakeWord released. See the [changelog](CHANGELOG.md) for a full descriptions of new features and changes.
# Demo
-You can try an online demo of the included pre-trained models via HuggingFace Spaces [right here!](https://huggingface.co/spaces/davidscripka/openWakeWord).
+You can try an online demo of the included pre-trained models via HuggingFace Spaces [right here](https://huggingface.co/spaces/davidscripka/openWakeWord)!
Note that real-time detection of a microphone stream can occasionally behave strangely in Spaces. For the most reliable testing, perform a local installation as described below.
@@ -41,16 +50,20 @@ Many thanks to [TeaPoly](https://github.com/TeaPoly/speexdsp-ns-python) for thei
# Usage
-For quick local testing, clone this repository and use the included [example script](examples/detect_from_microphone.py) to try streaming detection from a local microphone. **Important note!** The model files are stored in this repo using [git-lfs](https://git-lfs.com/); make sure it is installed on your system and if needed use `git-lfs fetch --all` to make sure the the models download correctly.
+For quick local testing, clone this repository and use the included [example script](examples/detect_from_microphone.py) to try streaming detection from a local microphone. You can individually download pre-trained models from current and past [releases](https://github.com/dscripka/openWakeWord/releases/), or you can download them using Python (see below).
Adding openWakeWord to your own Python code requires just a few lines:
```python
+import openwakeword
from openwakeword.model import Model
-# Instantiate the model
+# One-time download of all pre-trained models (or only select models)
+openwakeword.utils.download_models()
+
+# Instantiate the model(s)
model = Model(
- wakeword_models=["path/to/model.onnx"], # can also leave this argument empty to load all of the included pre-trained models
+ wakeword_models=["path/to/model.tflite"], # can also leave this argument empty to load all of the included pre-trained models
)
# Get audio data containing 16-bit 16khz PCM audio data from a file, microphone, network stream, etc.
@@ -130,7 +143,7 @@ The table below lists each model, examples of the word/phrases it is trained to
| current weather | "what's the weather" | [docs](docs/models/weather.md) |
| timers | "set a 10 minute timer" | [docs](docs/models/timers.md) |
-Based on the methods discussed in [performance testing](#performance-and-evaluation), each included model aims to meet the target performance criteria of <5% false-reject rates and <0.5/hour false-accept rates with appropriate threshold tuning. These levels are subjective, but hopefully are below the annoyance threshold where the average user becomes frustrated with a system that often misses intended activations and/or causes disruption by activating too frequently at undesired times. For example, at these performance levels a user could expect to have the model process continuous mixed content audio of several hours with at most a few false activations, and have a failed intended activation in only 1/20 attempts (and a failed retry in only 1/400 attempts).
+Based on the methods discussed in [performance testing](#performance-and-evaluation), each included model aims to meet the target performance criteria of <5% false-reject rates and <0.5/hour false-accept rates with appropriate threshold tuning. These levels are subjective, but hopefully are below the annoyance threshold where the average user becomes frustrated with a system that often misses intended activations and/or causes disruption by activating too frequently at undesired times. For example, at these performance levels a user could expect to have the model process continuous mixed content audio of several hours with at most a few false activations, and have a failed intended activation in only 1/20 attempts (and a failed retry in only 1/400 attempts).
If you have a new wake word or phrase that you would like to see included in the next release, please open an issue, and we'll do a best to train a model! The focus of these requests and future release will be on words and phrases that have broad general usage versus highly specific application.
@@ -206,7 +219,15 @@ While the models are trained with background noise to increase robustness, in so
# Training New Models
-Training new models is conceptually simple, and the entire process is demonstrated in a [tutorial notebook](notebooks/training_models.ipynb).
+openWakeWord includes an automated utility that greatly simplifies the process of training custom models. This can be used in two ways:
+
+1) A simple [Google Colab](https://colab.research.google.com/drive/1q1oe2zOyZp7UsB3jJiQ1IFn8z5YfjwEb?usp=sharing) notebook with an easy to use interface and simple end-to-end process. This allows anyone to produce a custom model very quickly (<1 hour) and doesn't require any development experience, but the performance of the model may be low in some deployment scenarios.
+
+2) A more detailed [notebook](notebooks/automatic_model_training.ipynb) (also on [Google Colab](https://colab.research.google.com/drive/1yyFH-fpguX2BTAW8wSQxTrJnJTM-0QAd?usp=sharing)) that describes the training process in more detail and enables more customization. This can produce high-quality models, but requires more development experience.
+
+For a collection of models trained using the notebooks above by the Home Assistant Community (and with much gratitude to @fwartner), see the excellent repository [here](https://github.com/fwartner/home-assistant-wakewords-collection).
+
+For users interested in understanding the fundamental concepts behind model training there is a more detailed, educational [tutorial notebook](notebooks/training_models.ipynb) also available. However, this specific notebook is not intended for training production models, and the automated process above is recommended for that purpose.
Fundamentally, a new model requires two data generation and collection steps:
@@ -227,9 +248,11 @@ Future release road maps may have non-english support. In particular, [Mycroft.A
**Can openWakeWord be run in a browser with javascript?**
- While the ONNX runtime [does support javascript](https://onnxruntime.ai/docs/get-started/with-javascript.html), much of the other functionality required for openWakeWord models would need to be ported. This is not currently on the roadmap, but please open an issue/start a discussion if this feature is of particular interest.
+- As a potential work-around for some applications, the example scripts in `examples/web` demonstrate how audio can be captured in a browser and streamed via websockets into openWakeWord running in a Python backend server.
+- Other potential options could include projects like `pyodide` (see [here](https://github.com/pyodide/pyodide/issues/4220) for a related issue).
**Is there a C++ version of openWakeWord?**
-- While the ONNX runtime [also has a C++ API](https://onnxruntime.ai/docs/get-started/with-cpp.html), there isn't an official C++ implementation of the full openWakeWord library. However, [@synesthesiam](https://github.com/synesthesiam) has created a [C++ version](https://github.com/rhasspy/openWakeWord-cpp) of openWakeWord with the essential functionality implemented.
+- While the ONNX runtime [also has a C++ API](https://onnxruntime.ai/docs/get-started/with-cpp.html), there isn't an official C++ implementation of the full openWakeWord library. However, [@synesthesiam](https://github.com/synesthesiam) has created a [C++ version of openWakeWord](https://github.com/rhasspy/openWakeWord-cpp) with basic functionality implemented.
**Why are there three separate models instead of just one?**
- Separating the models was an intentional choice to provide flexibility and optimize the efficiency of the end-to-end prediction process. For example, with separate melspectrogram, embedding, and prediction models, each one can operate on different size inputs of audio to optimize overall latency and share computations between models. It certainly is possible to make a combined model with all of the steps integrated, though, if that was a requirement of a particular use case.
@@ -237,6 +260,16 @@ Future release road maps may have non-english support. In particular, [Mycroft.A
**I still get a large number of false activations when I use the pre-trained models, how can I reduce these?**
- First, review the [recommendations for usage](#recommendations-for-usage) and ensure that these options do not improve overall system accuracy. Second, experiment with [custom verifier models](#user-specific-models), if possible. If neither of these approaches are helping, please open an issue with details of the deployment environment and the types of false activations that you are experiencing. We certainly appreciate feedback & requests on how to improve the base pre-trained models!
+# Acknowledgements
+
+I am very grateful for the encouraging and positive response from the open-source community since the release of openWakeWord in January 2023. In particular, I want to acknowledge and thank the following individuals and groups for their feedback, collaboration, and development support:
+
+- [synesthesiam](https://github.com/synesthesiam)
+- [SecretSauceAI](https://github.com/secretsauceai)
+- [OpenVoiceOS](https://github.com/OpenVoiceOS)
+- [Nabu Casa](https://github.com/NabuCasa)
+- [Home Assistant](https://github.com/home-assistant)
+
# License
-All of the code in this repository is licensed under the **Apache 2.0** license. All of the included pre-trained models are licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) license due to the inclusion of datasets with unknown or restrictive licensing as part of the training data. If you are interested in pre-trained models with more permissive licensing, please raise an issue and we will try to add them to a future release.
\ No newline at end of file
+All of the code in this repository is licensed under the **Apache 2.0** license. All of the included pre-trained models are licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) license due to the inclusion of datasets with unknown or restrictive licensing as part of the training data. If you are interested in pre-trained models with more permissive licensing, please raise an issue and we will try to add them to a future release.
diff --git a/examples/capture_activations.py b/examples/capture_activations.py
index fae900e..8f8e80d 100644
--- a/examples/capture_activations.py
+++ b/examples/capture_activations.py
@@ -68,10 +68,26 @@
default=False,
required=False
)
+parser=argparse.ArgumentParser()
+parser.add_argument(
+ "--chunk_size",
+ help="How much audio (in number of 16khz samples) to predict on at once",
+ type=int,
+ default=1280,
+ required=False
+)
+parser.add_argument(
+ "--model_path",
+ help="The path of a specific model to load",
+ type=str,
+ default="",
+ required=False
+)
parser.add_argument(
- "--model",
- help="The model to use for openWakeWord, leave blank to use all available models",
+ "--inference_framework",
+ help="The inference framework to use (either 'onnx' or 'tflite'",
type=str,
+ default='tflite',
required=False
)
parser.add_argument(
@@ -87,25 +103,26 @@
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
-CHUNK = 1280
+CHUNK = args.chunk_size
audio = pyaudio.PyAudio()
mic_stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
# Load pre-trained openwakeword models
-if args.model:
+if args.model_path:
model_paths = openwakeword.get_pretrained_model_paths()
for path in model_paths:
- if args.model in path:
+ if args.model_path in path:
model_path = path
if model_path:
owwModel = Model(
- wakeword_model_paths=[model_path],
+ wakeword_models=[model_path],
enable_speex_noise_suppression=args.noise_suppression,
- vad_threshold = args.vad_threshold
- )
+ vad_threshold = args.vad_threshold,
+ inference_framework=args.inference_framework
+ )
else:
- print(f'Could not find model \"{args.model}\"')
+ print(f'Could not find model \"{args.model_path}\"')
exit()
else:
owwModel = Model(
diff --git a/examples/custom_model.yml b/examples/custom_model.yml
new file mode 100644
index 0000000..a41d327
--- /dev/null
+++ b/examples/custom_model.yml
@@ -0,0 +1,101 @@
+## Configuration file to be used with `train.py` to create custom wake word/phrase models
+
+# The name of the model (will be used when creating directories and when saving the final .onnx and .tflite files)
+model_name: "my_model"
+
+# The target word/phrase to be detected by the model. Adding multiple unique words/phrases will
+# still only train a binary detection model, but it will activate on any one of the provided words/phrases.
+target_phrase:
+ - "hey jarvis"
+
+# Specific phrases that you do *not* want the model to activate on, outside of those generated automatically via phoneme overlap
+# This can be a good way to reduce false positives if you notice that, in practice, certain words or phrases are problematic
+custom_negative_phrases: []
+
+# The total number of positive samples to generate for training (minimum of 20,000 recommended, often 100,000+ is best)
+n_samples: 10000
+
+# The total number of positive samples to generate for validation and early stopping of model training
+n_samples_val: 2000
+
+# The batch size to use with Piper TTS when generating synthetic training data
+tts_batch_size: 50
+
+# The batch size to use when performing data augmentation on generated clips prior to training
+# It's recommended that this not be too large to ensure that there is enough variety in the augmentation
+augmentation_batch_size: 16
+
+# The path to a fork of the piper-sample-generator repository for TTS (https://github.com/dscripka/piper-sample-generator)
+piper_sample_generator_path: "./piper-sample-generator"
+
+# The output directory for the generated synthetic clips, openwakeword features, and trained models
+# Sub-directories will be automatically created for train and test clips for both positive and negative examples
+output_dir: "./my_custom_model"
+
+# The directories containing Room Impulse Response recordings
+rir_paths:
+ - "./mit_rirs"
+
+# The directories containing background audio files to mix with training data
+background_paths:
+ - "./background_clips"
+
+# The duplication rate for the background audio clips listed above (1 or higher). Can be useful as a way to oversample
+# a particular type of background noise more relevant to a given deployment environment. Values apply in the same
+# order as the background_paths list above. Only useful when multiple directories are provided above.
+background_paths_duplication_rate:
+ - 1
+
+# The location of pre-computed openwakeword features for false-positive validation data
+# If you do not have deployment environment validation data, a good general purpose dataset with
+# a reasonable mix (~11 hours) of speech, noise, and music is available here: https://huggingface.co/datasets/davidscripka/openwakeword_features
+false_positive_validation_data_path: "./validation_set_features.npy"
+
+# The number of times to apply augmentations to the generated training data
+# Values greater than 1 reuse each generation that many times, producing overall unique
+# clips for training due to the randomness intrinsic to the augmentation despite using
+# the same original synthetic generation. Can be a useful way to increase model robustness
+# without having to generate extremely large numbers of synthetic examples.
+augmentation_rounds: 1
+
+# Paths to pre-computed openwakeword features for positive and negative data. Each file must be a saved
+# .npy array (see the example notebook on manually training new models for details on how to create these).
+# There is no limit on the number of files but training speed will decrease as more
+# data will need to be read from disk for each additional file.
+# Also, there is a custom dataloader that uses memory-mapping when loading data, so the total size
+# of the files is not limited by the amount of available system memory (though this will result
+# in decreased training throughput depending on the speed of the underlying storage device). A fast
+# NVME SSD is recommended for optimal performance.
+
+feature_data_files:
+ "ACAV100M_sample": "./openwakeword_features_ACAV100M_2000_hrs_16bit.npy"
+
+# Define the number of examples from each data file per batch. Note that the key names here
+# must correspond to those defined in the `feature_data_files` dictionary above (except for
+# the `positive` and `adversarial_negative` keys, which are automatically defined). The sum
+# of the values across all keys defines the total batch size for training. Initial testing indicates
+# that batch sizes of 1024-4096 work well in practice.
+
+batch_n_per_class:
+ "ACAV100M_sample": 1024
+ "adversarial_negative": 50
+ "positive": 50
+
+# Define the type and size of the openwakeword model to train. Increasing the layer size
+# may result in a more capable model, at the cost of decreased inference speed. The default
+# value (32) seems to work well in practice for most wake words/phrases.
+
+model_type: "dnn"
+layer_size: 32
+
+# Define training parameters. The values below are recommended defaults for most applications,
+# but unique deployment environments will likely require testing to determine which values
+# are the most appropriate.
+
+# The maximum number of steps to train the model
+steps: 50000
+
+# The maximum negative weight and target false positives per hour, used to control the auto training process
+# The target false positive rate may not be achieved, and adjusting the maximum negative weight may be necessary
+max_negative_weight: 1500
+target_false_positives_per_hour: 0.2
\ No newline at end of file
diff --git a/examples/detect_from_microphone.py b/examples/detect_from_microphone.py
index 7c21e10..6e69c92 100644
--- a/examples/detect_from_microphone.py
+++ b/examples/detect_from_microphone.py
@@ -22,10 +22,10 @@
parser=argparse.ArgumentParser()
parser.add_argument(
"--chunk_size",
- help="How much audio (in samples) to predict on at once",
+ help="How much audio (in number of samples) to predict on at once",
type=int,
default=1280,
- required=True
+ required=False
)
parser.add_argument(
"--model_path",
diff --git a/examples/web/README.md b/examples/web/README.md
new file mode 100644
index 0000000..bd4e970
--- /dev/null
+++ b/examples/web/README.md
@@ -0,0 +1,21 @@
+# Examples
+
+This folder contains examples of using openWakeWord with web applications.
+
+## Websocket Streaming
+
+As openWakeWord does not have a native JavaScript port, using it within a web browser is best accomplished with websocket streaming of the audio data from the browser to a simple Python application. To install the requirements for this example:
+
+```
+pip install aiohttp
+pip install resampy
+```
+
+The `streaming_client.html` page shows a simple implementation of microphone audio capture and streaming in a browser, and the `streaming_server.py` file is the corresponding websocket server that passes the audio into openWakeWord.
+
+To run the example, execute `python streaming_server.py` (add the `--help` argument to see options) and navigate to `localhost:9000` in your browser.
+
+Note that this example is illustrative only, and integration of this approach with other web applications may have different requirements. In particular, some key considerations:
+
+- This example captures PCM audio from the web browser and streams full 16-bit integer representations of ~250 ms audio chunks over the websocket connection. In practice, bandwidth efficient streams of compressed audio may be more suitable for some applications.
+- The browser captures audio at the native sampling rate of the capture device, which can require re-sampling prior to passing the audio data to openWakeWord. This example uses the `resampy` library which has a good balance between performance and quality, but other resampling approaches that optimize different aspects may be more suitable for some applications.
\ No newline at end of file
diff --git a/examples/web/streaming_client.html b/examples/web/streaming_client.html
new file mode 100644
index 0000000..c2df273
--- /dev/null
+++ b/examples/web/streaming_client.html
@@ -0,0 +1,197 @@
+
+
+
+
+
+ Websocket Microphone Streaming
+
+
+
+
Streaming Audio to openWakeWord Using Websockets
+
+
+
+
+
Wakeword
+
Detected
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/examples/web/streaming_server.py b/examples/web/streaming_server.py
new file mode 100644
index 0000000..449d251
--- /dev/null
+++ b/examples/web/streaming_server.py
@@ -0,0 +1,112 @@
+# Copyright 2023 David Scripka. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#######################################################################################
+
+# This example script runs openWakeWord in a simple web server receiving audio
+# from a web page using websockets.
+
+#######################################################################################
+
+# Imports
+import aiohttp
+from aiohttp import web
+import numpy as np
+from openwakeword import Model
+import resampy
+import argparse
+import json
+
+# Define websocket handler
+async def websocket_handler(request):
+ ws = web.WebSocketResponse()
+ await ws.prepare(request)
+
+ # Send loaded models
+ await ws.send_str(json.dumps({"loaded_models": list(owwModel.models.keys())}))
+
+ # Start listening for websocket messages
+ async for msg in ws:
+ # Get the sample rate of the microphone from the browser
+ if msg.type == aiohttp.WSMsgType.TEXT:
+ sample_rate = int(msg.data)
+ elif msg.type == aiohttp.WSMsgType.ERROR:
+ print(f"WebSocket error: {ws.exception()}")
+ else:
+ # Get audio data from websocket
+ audio_bytes = msg.data
+
+ # Add extra bytes of silence if needed
+ if len(msg.data) % 2 == 1:
+ audio_bytes += (b'\x00')
+
+ # Convert audio to correct format and sample rate
+ data = np.frombuffer(audio_bytes, dtype=np.int16)
+ if sample_rate != 16000:
+ data = resampy.resample(data, sample_rate, 16000)
+
+ # Get openWakeWord predictions and send to browser client
+ predictions = owwModel.predict(data)
+
+ activations = []
+ for key in predictions:
+ if predictions[key] >= 0.5:
+ activations.append(key)
+
+ if activations != []:
+ await ws.send_str(json.dumps({"activations": activations}))
+
+ return ws
+
+# Define static file handler
+async def static_file_handler(request):
+ return web.FileResponse('./streaming_client.html')
+
+app = web.Application()
+app.add_routes([web.get('/ws', websocket_handler), web.get('/', static_file_handler)])
+
+if __name__ == '__main__':
+ # Parse CLI arguments
+ parser=argparse.ArgumentParser()
+ parser.add_argument(
+ "--chunk_size",
+ help="How much audio (in number of samples) to predict on at once",
+ type=int,
+ default=1280,
+ required=False
+ )
+ parser.add_argument(
+ "--model_path",
+ help="The path of a specific model to load",
+ type=str,
+ default="",
+ required=False
+ )
+ parser.add_argument(
+ "--inference_framework",
+ help="The inference framework to use (either 'onnx' or 'tflite'",
+ type=str,
+ default='tflite',
+ required=False
+ )
+ args=parser.parse_args()
+
+ # Load openWakeWord models
+ if args.model_path != "":
+ owwModel = Model(wakeword_models=[args.model_path], inference_framework=args.inference_framework)
+ else:
+ owwModel = Model(inference_framework=args.inference_framework)
+
+ # Start webapp
+ web.run_app(app, host='localhost', port=9000)
\ No newline at end of file
diff --git a/notebooks/.gitignore b/notebooks/.gitignore
new file mode 100644
index 0000000..b0f0998
--- /dev/null
+++ b/notebooks/.gitignore
@@ -0,0 +1 @@
+cv11_test_clips
diff --git a/notebooks/automatic_model_training.ipynb b/notebooks/automatic_model_training.ipynb
new file mode 100644
index 0000000..51ed036
--- /dev/null
+++ b/notebooks/automatic_model_training.ipynb
@@ -0,0 +1,494 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "c1eab0b3",
+ "metadata": {
+ "id": "c1eab0b3"
+ },
+ "source": [
+ "# Introduction"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "882058c5",
+ "metadata": {
+ "id": "882058c5"
+ },
+ "source": [
+ "This notebook demonstrates how to train custom openWakeWord models using pre-defined datasets and an automated process for dataset generation and training. While not guaranteed to always produce the best performing model, the methods shown in this notebook often produce baseline models with relatively strong performance.\n",
+ "\n",
+ "Manual data preparation and model training (e.g., see the [training models](training_models.ipynb) notebook) remains an option for when full control over the model development process is needed.\n",
+ "\n",
+ "At a high level, the automatic training process takes advantage of several techniques to try and produce a good model, including:\n",
+ "\n",
+ "- Early-stopping and checkpoint averaging (similar to [stochastic weight averaging](https://arxiv.org/abs/1803.05407)) to search for the best models found during training, according to the validation data\n",
+ "- Variable learning rates with cosine decay and multiple cycles\n",
+ "- Adaptive batch construction to focus on only high-loss examples when the model begins to converge, combined with gradient accumulation to ensure that batch sizes are still large enough for stable training\n",
+ "- Cyclical weight schedules for negative examples to help the model reduce false-positive rates\n",
+ "\n",
+ "See the contents of the `train.py` file for more details."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e08d031b",
+ "metadata": {
+ "id": "e08d031b"
+ },
+ "source": [
+ "# Environment Setup"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aee78c37",
+ "metadata": {
+ "id": "aee78c37"
+ },
+ "source": [
+ "To begin, we'll need to install the requirements for training custom models. In particular, we need a relatively recent version of PyTorch and a custom fork of the [piper-sample-generator](https://github.com/dscripka/piper-sample-generator) library for generating synthetic examples for the custom model.\n",
+ "\n",
+ "**Important Note!** Currently, automated model training is only supported on linux systems due to the requirements of the text to speech library used for synthetic sample generation (Piper). It may be possible to use Piper on Windows/Mac systems, but that has not (yet) been tested."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4b1227eb",
+ "metadata": {
+ "id": "4b1227eb"
+ },
+ "outputs": [],
+ "source": [
+ "## Environment setup\n",
+ "\n",
+ "# install piper-sample-generator (currently only supports linux systems)\n",
+ "!git clone https://github.com/rhasspy/piper-sample-generator\n",
+ "!wget -O piper-sample-generator/models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'\n",
+ "!pip install piper-phonemize\n",
+ "!pip install webrtcvad\n",
+ "\n",
+ "# install openwakeword (full installation to support training)\n",
+ "!git clone https://github.com/dscripka/openwakeword\n",
+ "!pip install -e ./openwakeword\n",
+ "!cd openwakeword\n",
+ "\n",
+ "# install other dependencies\n",
+ "!pip install mutagen==1.47.0\n",
+ "!pip install torchinfo==1.8.0\n",
+ "!pip install torchmetrics==1.2.0\n",
+ "!pip install speechbrain==0.5.14\n",
+ "!pip install audiomentations==0.33.0\n",
+ "!pip install torch-audiomentations==0.11.0\n",
+ "!pip install acoustics==0.2.6\n",
+ "!pip install tensorflow-cpu==2.8.1\n",
+ "!pip install tensorflow_probability==0.16.0\n",
+ "!pip install onnx_tf==1.10.0\n",
+ "!pip install pronouncing==0.2.0\n",
+ "!pip install datasets==2.14.6\n",
+ "!pip install deep-phonemizer==0.0.19\n",
+ "\n",
+ "# Download required models (workaround for Colab)\n",
+ "import os\n",
+ "os.makedirs(\"./openwakeword/openwakeword/resources/models\")\n",
+ "!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx -O ./openwakeword/openwakeword/resources/models/embedding_model.onnx\n",
+ "!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite -O ./openwakeword/openwakeword/resources/models/embedding_model.tflite\n",
+ "!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx -O ./openwakeword/openwakeword/resources/models/melspectrogram.onnx\n",
+ "!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite -O ./openwakeword/openwakeword/resources/models/melspectrogram.tflite\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4c1056e",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-04T13:42:01.183840Z",
+ "start_time": "2023-09-04T13:41:59.752153Z"
+ },
+ "id": "d4c1056e"
+ },
+ "outputs": [],
+ "source": [
+ "# Imports\n",
+ "\n",
+ "import os\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import sys\n",
+ "from pathlib import Path\n",
+ "import uuid\n",
+ "import yaml\n",
+ "import datasets\n",
+ "import scipy\n",
+ "from tqdm import tqdm\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9d7a05a",
+ "metadata": {
+ "id": "e9d7a05a"
+ },
+ "source": [
+ "# Download Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c52f75cc",
+ "metadata": {
+ "id": "c52f75cc"
+ },
+ "source": [
+ "When training new openWakeWord models using the automated procedure, five specific types of data are required:\n",
+ "\n",
+ "1) Synthetic examples of the target word/phrase generated with text-to-speech models\n",
+ "\n",
+ "2) Synthetic examples of adversarial words/phrases generated with text-to-speech models\n",
+ "\n",
+ "3) Room impulse responses and noise/background audio data to augment the synthetic examples and make them more realistic\n",
+ "\n",
+ "4) Generic \"negative\" audio data that is very unlikely to contain examples of the target word/phrase in the context where the model should detect it. This data can be the original audio data, or precomputed openWakeWord features ready for model training.\n",
+ "\n",
+ "5) Validation data to use for early-stopping when training the model.\n",
+ "\n",
+ "For the purposes of this notebook, all five of these sources will either be generated manually or can be obtained from HuggingFace thanks to their excellent `datasets` library and extremely generous hosting policy. Also note that while only a portion of some datasets are downloaded, for the best possible performance it is recommended to download the entire dataset and keep a local copy for future training runs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d25a93b1",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-04T01:07:17.746749Z",
+ "start_time": "2023-09-04T01:07:17.740846Z"
+ },
+ "id": "d25a93b1"
+ },
+ "outputs": [],
+ "source": [
+ "# Download room impulse responses collected by MIT\n",
+ "# https://mcdermottlab.mit.edu/Reverb/IR_Survey.html\n",
+ "\n",
+ "output_dir = \"./mit_rirs\"\n",
+ "if not os.path.exists(output_dir):\n",
+ " os.mkdir(output_dir)\n",
+ "rir_dataset = datasets.load_dataset(\"davidscripka/MIT_environmental_impulse_responses\", split=\"train\", streaming=True)\n",
+ "\n",
+ "# Save clips to 16-bit PCM wav files\n",
+ "for row in tqdm(rir_dataset):\n",
+ " name = row['audio']['path'].split('/')[-1]\n",
+ " scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2c0e178b",
+ "metadata": {
+ "id": "2c0e178b"
+ },
+ "outputs": [],
+ "source": [
+ "## Download noise and background audio\n",
+ "\n",
+ "# Audioset Dataset (https://research.google.com/audioset/dataset/index.html)\n",
+ "# Download one part of the audioset .tar files, extract, and convert to 16khz\n",
+ "# For full-scale training, it's recommended to download the entire dataset from\n",
+ "# https://huggingface.co/datasets/agkphysics/AudioSet, and\n",
+ "# even potentially combine it with other background noise datasets (e.g., FSD50k, Freesound, etc.)\n",
+ "\n",
+ "if not os.path.exists(\"audioset\"):\n",
+ " os.mkdir(\"audioset\")\n",
+ "\n",
+ "fname = \"bal_train09.tar\"\n",
+ "out_dir = f\"audioset/{fname}\"\n",
+ "link = \"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/\" + fname\n",
+ "!wget -O {out_dir} {link}\n",
+ "!cd audioset && tar -xvf bal_train09.tar\n",
+ "\n",
+ "output_dir = \"./audioset_16k\"\n",
+ "if not os.path.exists(output_dir):\n",
+ " os.mkdir(output_dir)\n",
+ "\n",
+ "# Convert audioset files to 16khz sample rate\n",
+ "audioset_dataset = datasets.Dataset.from_dict({\"audio\": [str(i) for i in Path(\"audioset/audio\").glob(\"**/*.flac\")]})\n",
+ "audioset_dataset = audioset_dataset.cast_column(\"audio\", datasets.Audio(sampling_rate=16000))\n",
+ "for row in tqdm(audioset_dataset):\n",
+ " name = row['audio']['path'].split('/')[-1].replace(\".flac\", \".wav\")\n",
+ " scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))\n",
+ "\n",
+ "# Free Music Archive dataset (https://github.com/mdeff/fma)\n",
+ "output_dir = \"./fma\"\n",
+ "if not os.path.exists(output_dir):\n",
+ " os.mkdir(output_dir)\n",
+ "fma_dataset = datasets.load_dataset(\"rudraml/fma\", name=\"small\", split=\"train\", streaming=True)\n",
+ "fma_dataset = iter(fma_dataset.cast_column(\"audio\", datasets.Audio(sampling_rate=16000)))\n",
+ "\n",
+ "n_hours = 1 # use only 1 hour of clips for this example notebook, recommend increasing for full-scale training\n",
+ "for i in tqdm(range(n_hours*3600//30)): # this works because the FMA dataset is all 30 second clips\n",
+ " row = next(fma_dataset)\n",
+ " name = row['audio']['path'].split('/')[-1].replace(\".mp3\", \".wav\")\n",
+ " scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))\n",
+ " i += 1\n",
+ " if i == n_hours*3600//30:\n",
+ " break\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d01ec467",
+ "metadata": {
+ "id": "d01ec467"
+ },
+ "outputs": [],
+ "source": [
+ "# Download pre-computed openWakeWord features for training and validation\n",
+ "\n",
+ "# training set (~2,000 hours from the ACAV100M Dataset)\n",
+ "# See https://huggingface.co/datasets/davidscripka/openwakeword_features for more information\n",
+ "!wget https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/openwakeword_features_ACAV100M_2000_hrs_16bit.npy\n",
+ "\n",
+ "# validation set for false positive rate estimation (~11 hours)\n",
+ "!wget https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/validation_set_features.npy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfe82647",
+ "metadata": {
+ "id": "cfe82647"
+ },
+ "source": [
+ "# Define Training Configuration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b2e71329",
+ "metadata": {
+ "id": "b2e71329"
+ },
+ "source": [
+ "For automated model training openWakeWord uses a specially designed training script and a [YAML](https://yaml.org/) configuration file that defines all of the information required for training a new wake word/phrase detection model.\n",
+ "\n",
+ "It is strongly recommended that you review [the example config file](../examples/custom_model.yml), as each value is fully documented there. For the purposes of this notebook, we'll read in the YAML file to modify certain configuration parameters before saving a new YAML file for training our example model. Specifically:\n",
+ "\n",
+ "- We'll train a detection model for the phrase \"hey sebastian\"\n",
+ "- We'll only generate 1,000 positive and negative examples (to save on time for this example)\n",
+ "- We'll only generate 1,000 validation positive and negative examples for early stopping (again to save time)\n",
+ "- The model will only be trained for 10,000 steps (larger datasets will benefit from longer training)\n",
+ "- We'll reduce the target metrics to account for the small dataset size and limited training.\n",
+ "\n",
+ "On the topic of target metrics, there are *no* specific guidelines about what these metrics should be in practice, and you will need to conduct testing in your target deployment environment to establish good thresholds. However, from very limited testing the default values in the config file (accuracy >= 0.7, recall >= 0.5, false-positive rate <= 0.2 per hour) seem to produce models with reasonable performance.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb0b6e4f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-04T18:11:33.893397Z",
+ "start_time": "2023-09-04T18:11:33.878938Z"
+ },
+ "id": "fb0b6e4f"
+ },
+ "outputs": [],
+ "source": [
+ "# Load default YAML config file for training\n",
+ "config = yaml.load(open(\"openwakeword/examples/custom_model.yml\", 'r').read(), yaml.Loader)\n",
+ "config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "482cf2d0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-04T15:07:00.859210Z",
+ "start_time": "2023-09-04T15:07:00.841472Z"
+ },
+ "id": "482cf2d0"
+ },
+ "outputs": [],
+ "source": [
+ "# Modify values in the config and save a new version\n",
+ "\n",
+ "config[\"target_phrase\"] = [\"hey sebastian\"]\n",
+ "config[\"model_name\"] = config[\"target_phrase\"][0].replace(\" \", \"_\")\n",
+ "config[\"n_samples\"] = 1000\n",
+ "config[\"n_samples_val\"] = 1000\n",
+ "config[\"steps\"] = 10000\n",
+ "config[\"target_accuracy\"] = 0.6\n",
+ "config[\"target_recall\"] = 0.25\n",
+ "\n",
+ "config[\"background_paths\"] = ['./audioset_16k', './fma'] # multiple background datasets are supported\n",
+ "config[\"false_positive_validation_data_path\"] = \"validation_set_features.npy\"\n",
+ "config[\"feature_data_files\"] = {\"ACAV100M_sample\": \"openwakeword_features_ACAV100M_2000_hrs_16bit.npy\"}\n",
+ "\n",
+ "with open('my_model.yaml', 'w') as file:\n",
+ " documents = yaml.dump(config, file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aa6b2ab0",
+ "metadata": {
+ "id": "aa6b2ab0"
+ },
+ "source": [
+ "# Train the Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a51202c0",
+ "metadata": {
+ "id": "a51202c0"
+ },
+ "source": [
+ "With the data downloaded and training configuration set, we can now start training the model. We'll do this in parts to better illustrate the sequence, but you can also execute every step at once for a fully automated process."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f01531fa",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-04T13:50:08.803326Z",
+ "start_time": "2023-09-04T13:50:06.790241Z"
+ },
+ "id": "f01531fa"
+ },
+ "outputs": [],
+ "source": [
+ "# Step 1: Generate synthetic clips\n",
+ "# For the number of clips we are using, this should take ~10 minutes on a free Google Colab instance with a T4 GPU\n",
+ "# If generation fails, you can simply run this command again as it will continue generating until the\n",
+ "# number of files meets the targets specified in the config file\n",
+ "\n",
+ "!{sys.executable} openwakeword/openwakeword/train.py --training_config my_model.yaml --generate_clips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "afeedae4",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-04T13:56:08.781018Z",
+ "start_time": "2023-09-04T13:55:40.203515Z"
+ },
+ "id": "afeedae4"
+ },
+ "outputs": [],
+ "source": [
+ "# Step 2: Augment the generated clips\n",
+ "\n",
+ "!{sys.executable} openwakeword/openwakeword/train.py --training_config my_model.yaml --augment_clips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ad81ea0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-04T15:11:14.742260Z",
+ "start_time": "2023-09-04T15:07:03.755159Z"
+ },
+ "id": "9ad81ea0"
+ },
+ "outputs": [],
+ "source": [
+ "# Step 3: Train model\n",
+ "\n",
+ "!{sys.executable} openwakeword/openwakeword/train.py --training_config my_model.yaml --train_model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Step 4 (Optional): On Google Colab, sometimes the .tflite model isn't saved correctly\n",
+ "# If so, run this cell to retry\n",
+ "\n",
+ "# Manually save to tflite as this doesn't work right in colab\n",
+ "def convert_onnx_to_tflite(onnx_model_path, output_path):\n",
+ " \"\"\"Converts an ONNX version of an openwakeword model to the Tensorflow tflite format.\"\"\"\n",
+ " # imports\n",
+ " import onnx\n",
+ " import logging\n",
+ " import tempfile\n",
+ " from onnx_tf.backend import prepare\n",
+ " import tensorflow as tf\n",
+ "\n",
+ " # Convert to tflite from onnx model\n",
+ " onnx_model = onnx.load(onnx_model_path)\n",
+ " tf_rep = prepare(onnx_model, device=\"CPU\")\n",
+ " with tempfile.TemporaryDirectory() as tmp_dir:\n",
+ " tf_rep.export_graph(os.path.join(tmp_dir, \"tf_model\"))\n",
+ " converter = tf.lite.TFLiteConverter.from_saved_model(os.path.join(tmp_dir, \"tf_model\"))\n",
+ " tflite_model = converter.convert()\n",
+ "\n",
+ " logging.info(f\"####\\nSaving tflite mode to '{output_path}'\")\n",
+ " with open(output_path, 'wb') as f:\n",
+ " f.write(tflite_model)\n",
+ "\n",
+ " return None\n",
+ "\n",
+ "convert_onnx_to_tflite(f\"my_custom_model/{config['model_name']}.onnx\", f\"my_custom_model/{config['model_name']}.tflite\")\n"
+ ],
+ "metadata": {
+ "id": "JSKWWLalnYzR"
+ },
+ "id": "JSKWWLalnYzR",
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "After the model finishes training, the auto training script will automatically convert it to ONNX and tflite versions, saving them as `my_custom_model/<model_name>.onnx/tflite` in the present working directory, where `<model_name>` is defined in the YAML training config file. Either version can be used as normal with `openwakeword`. I recommend testing them with the [`detect_from_microphone.py`](https://github.com/dscripka/openWakeWord/blob/main/examples/detect_from_microphone.py) example script to see how the model performs!"
+ ],
+ "metadata": {
+ "id": "f9OyUW3ltOSs"
+ },
+ "id": "f9OyUW3ltOSs"
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/notebooks/converting_google_speech_embedding_model.ipynb b/notebooks/converting_google_speech_embedding_model.ipynb
new file mode 100644
index 0000000..4df8ea3
--- /dev/null
+++ b/notebooks/converting_google_speech_embedding_model.ipynb
@@ -0,0 +1,1113 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "838ffa12",
+ "metadata": {},
+ "source": [
+ "This notebook demonstrates how the speech embedding model from Google (https://www.kaggle.com/models/google/speech-embedding/frameworks/tensorFlow1/variations/speech-embedding/versions/1) is re-implemented in Keras manually, which can then be converted to ONNX and tflite formats for use in openWakeWord.\n",
+ "\n",
+ "Note that Keras was used here, but in theory other deep learning frameworks (e.g., PyTorch) could work as well."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "893d29dc",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:26:11.649261Z",
+ "start_time": "2024-01-18T00:26:10.190666Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-01-17 19:26:10.372628: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:10.372640: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Imports\n",
+ "\n",
+ "import os\n",
+ "import numpy as np\n",
+ "import scipy\n",
+ "import tensorflow as tf\n",
+ "import tensorflow_hub as hub # install with `pip install tensorflow_hub`\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fe3054ae",
+ "metadata": {},
+ "source": [
+ "# Load Original Model from TFHub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "fa2bc0d3",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:26:12.257919Z",
+ "start_time": "2024-01-18T00:26:11.650661Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-01-17 19:26:11.857817: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+ "2024-01-17 19:26:11.858167: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858193: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858215: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858237: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858258: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcurand.so.10'; dlerror: libcurand.so.10: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858278: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858299: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858320: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory\n",
+ "2024-01-17 19:26:11.858325: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n",
+ "Skipping registering GPU devices...\n",
+ "2024-01-17 19:26:11.858458: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the original speech embedding model (now hosted on Kaggle) as a KerasLayer object\n",
+ "\n",
+ "embedding_model_url = \"https://tfhub.dev/google/speech_embedding/1\"\n",
+ "embedding_model = hub.KerasLayer(embedding_model_url, trainable=False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "6769931f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:26:12.375204Z",
+ "start_time": "2024-01-18T00:26:12.259632Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Embedding Output Shape: (1, 1, 1, 96)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get predictions from the embedding model for a chunk of ~775 ms audio data (at 16khz)\n",
+ "# This is the minimum input size for the model per the documentation here: https://www.kaggle.com/models/google/speech-embedding/frameworks/tensorFlow1/variations/speech-embedding/versions/1\n",
+ "\n",
+ "# Load sample clip, and select a 775 ms chunk and normalize between -1 and 1\n",
+ "sr, sample_data = scipy.io.wavfile.read(\"../tests/data/hey_mycroft_test.wav\")\n",
+ "sample_data = (sample_data[0:12400][None,]/32767).astype(np.float32)\n",
+ "embeddings = embedding_model(sample_data)\n",
+ "print(\"Embedding Output Shape:\", embeddings.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8acfba9",
+ "metadata": {},
+ "source": [
+ "# Convert original model to tflite"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "b4ee1693",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:26:14.554068Z",
+ "start_time": "2024-01-18T00:26:12.376427Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2024-01-17 19:26:12.970115: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: google_speech_embedding_fixed_input/assets\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: google_speech_embedding_fixed_input/assets\n",
+ "2024-01-17 19:26:14.061242: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:357] Ignored output_format.\n",
+ "2024-01-17 19:26:14.061261: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:360] Ignored drop_control_dependency.\n",
+ "2024-01-17 19:26:14.061688: I tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: google_speech_embedding_fixed_input\n",
+ "2024-01-17 19:26:14.066816: I tensorflow/cc/saved_model/reader.cc:78] Reading meta graph with tags { serve }\n",
+ "2024-01-17 19:26:14.066828: I tensorflow/cc/saved_model/reader.cc:119] Reading SavedModel debug info (if present) from: google_speech_embedding_fixed_input\n",
+ "2024-01-17 19:26:14.076781: I tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.\n",
+ "2024-01-17 19:26:14.173155: I tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: google_speech_embedding_fixed_input\n",
+ "2024-01-17 19:26:14.236537: I tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 174850 microseconds.\n",
+ "2024-01-17 19:26:14.288965: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:237] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2024-01-17 19:26:14.405541: W tensorflow/compiler/mlir/lite/flatbuffer_export.cc:1881] Graph contains the following resource op(s), that use(s) resource type. Currently, the resource type is not natively supported in TFLite. Please consider not using the resource type if there are issues with either TFLite converter or TFLite runtime:\n",
+ "Resource ops: TensorArrayGatherV3, TensorArrayReadV3, TensorArrayScatterV3, TensorArrayV3, TensorArrayWriteV3\n",
+ "Details:\n",
+ "\ttf.TensorArrayGatherV3(tensor<2x!tf_type.resource>>, tensor, tensor) -> (tensor) : {device = \"\", element_shape = #tf_type.shape}\n",
+ "\ttf.TensorArrayReadV3(tensor<2x!tf_type.resource>>, tensor, tensor) -> (tensor<*xf32>) : {device = \"\"}\n",
+ "\ttf.TensorArrayScatterV3(tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> (tensor) : {device = \"\"}\n",
+ "\ttf.TensorArrayV3(tensor) -> (tensor<2x!tf_type.resource>>, tensor) : {clear_after_read = true, device = \"\", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = \"\"}\n",
+ "\ttf.TensorArrayWriteV3(tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> (tensor) : {device = \"\"}\n",
+ "2024-01-17 19:26:14.405561: W tensorflow/compiler/mlir/lite/flatbuffer_export.cc:1892] TFLite interpreter needs to link Flex delegate in order to run the model since it contains the following Select TFop(s):\n",
+ "Flex ops: FlexTensorArrayGatherV3, FlexTensorArrayReadV3, FlexTensorArrayScatterV3, FlexTensorArrayV3, FlexTensorArrayWriteV3\n",
+ "Details:\n",
+ "\ttf.TensorArrayGatherV3(tensor<2x!tf_type.resource>>, tensor, tensor) -> (tensor) : {device = \"\", element_shape = #tf_type.shape}\n",
+ "\ttf.TensorArrayReadV3(tensor<2x!tf_type.resource>>, tensor, tensor) -> (tensor<*xf32>) : {device = \"\"}\n",
+ "\ttf.TensorArrayScatterV3(tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> (tensor) : {device = \"\"}\n",
+ "\ttf.TensorArrayV3(tensor) -> (tensor<2x!tf_type.resource>>, tensor) : {clear_after_read = true, device = \"\", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = \"\"}\n",
+ "\ttf.TensorArrayWriteV3(tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> (tensor) : {device = \"\"}\n",
+ "See instructions: https://www.tensorflow.org/lite/guide/ops_select\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Build model with specific input size, and save\n",
+ "inputs = tf.keras.Input((12400,))\n",
+ "x = embedding_model(inputs)\n",
+ "model = tf.keras.Model(inputs=inputs, outputs=x)\n",
+ "model.save(\"google_speech_embedding_fixed_input\")\n",
+ "\n",
+ "speech_embedding_dir = \"google_speech_embedding_fixed_input\"\n",
+ "# speech_embedding_dir = \"google_speech_embedding_savedmodel/\"\n",
+ "\n",
+ "converter = tf.lite.TFLiteConverter.from_saved_model(speech_embedding_dir)#, tags=[\"train\"])\n",
+ "# convert = tf.lite.TFLiteConverter.from_keras_model(embedding_model)\n",
+ "converter.target_spec.supported_ops = [\n",
+ " tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS\n",
+ "]\n",
+ "# converter.allow_custom_ops = True\n",
+ "\n",
+ "tflite_model = converter.convert()\n",
+ "with open(speech_embedding_dir + '/speech_embeddings.tflite', 'wb') as f:\n",
+ " f.write(tflite_model)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "927ebfda",
+ "metadata": {},
+ "source": [
+ "# Comparing Log-Mel Features"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "48c7138b",
+ "metadata": {},
+ "source": [
+ "The speech embedding model from Google computes its own input features from raw audio, which is convenient, but not ideal as it combines pre-processing with the model in a way that makes the model less understandable. In particular, this is (to my knowledge) the total information provided about the feature creation:\n",
+ "\n",
+ "From the model page [here:](https://www.kaggle.com/models/google/speech-embedding/frameworks/tensorFlow1/variations/speech-embedding/versions/1)\n",
+ "```\n",
+ "The module computes its own 32 dimensional log-mel features from the provided audio samples using the following parameters:\n",
+ "\n",
+ " stft window size: 25ms\n",
+ " stft window step: 10ms\n",
+ " mel band limits: 60Hz - 3800Hz\n",
+ " mel frequency bins: 32\n",
+ "```\n",
+ "\n",
+ "And then this excerpt from the corresponding [paper](https://arxiv.org/abs/2002.01322):\n",
+ "\n",
+ "```\n",
+ "Our model is designed for deployment in an environment\n",
+ "where both memory and compute power are very limited,\n",
+ "such as on a digital signal processor (DSP). It runs on top of a\n",
+ "low footprint feature extractor that provides a 32 dimensional\n",
+ "log mel feature vector covering the frequency range from\n",
+ "60 Hz to 3800 Hz, quantized to 8 bits every 10 ms\n",
+ "```\n",
+ "\n",
+ "It seems likely that this implementation is simply a [mel spectrogram](https://librosa.org/doc/main/generated/librosa.feature.melspectrogram.html) with [log scaling](https://librosa.org/doc/main/generated/librosa.power_to_db.html), but the investigation below shows that this may not be the case.\n",
+ "\n",
+ "If you have a theory as to what the original model is doing, or why a standard log-mel spectrogram does not match, please open an issue on the [openWakeWord](https://github.com/dscripka/openWakeWord) repository; I would love to learn more about this!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "6599a6a0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:26:14.703602Z",
+ "start_time": "2024-01-18T00:26:14.555114Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO: Created TensorFlow Lite delegate for select TF ops.\n",
+ "2024-01-17 19:26:14.557849: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+ "2024-01-17 19:26:14.558198: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n",
+ "Skipping registering GPU devices...\n",
+ "INFO: TfLiteFlexDelegate delegate: 4 nodes delegated out of 76 nodes with 2 partitions.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Embedding model features shape: (32, 76)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAD+CAYAAACa2mffAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABC4UlEQVR4nO29e5BVV53+/Zz7pS+naaC76XAJuUguCDrEIKKZmGCQ8WclmrKi49SQ0dExA44JTqnUq8bLOESt0egMkhknk2hpRONPdOKUZCIxnTczIRqUl1wMBkKguXQ30PQ53ed+9t7vH5m0dvr7rMkhcBq6n0/VqaLXPmvvtb5r7XW+nLOe/YSCIAgghBBCCNEgwhPdACGEEEJMLZR8CCGEEKKhKPkQQgghRENR8iGEEEKIhqLkQwghhBANRcmHEEIIIRqKkg8hhBBCNBQlH0IIIYRoKEo+hBBCCNFQohPdgJfi+z4OHz6MlpYWhEKhiW6OEEIIIV4GQRBgeHgY3d3dCIf/l+82gtPEP/3TPwXz5s0LEolEcPnllwePPfbYy6rX29sbANBLL7300ksvvc7CV29v7//6WX9avvn4/ve/j3Xr1uGOO+7A0qVLcfvtt2PlypXYvXs3Ojo6nHVbWloAAK/9P/8PIrHk+OP782a94wtb6DljhcAsH57DM7OYfRmEfPtcAFBttr+p8R1RjiwZMsvPbTvBKxHikRo9Fg7Z7fYD/u0SO1asxWidVLRqlqejFVonHvLqbkM0zOtESF89R19ZnaofoXVqgT1/Cl6c1slX7WMJx9hVXBOIkIzY48DmgetYGLxOJlY0y+OO8WmN2nUAYLDSbJb/v4fOo3Vqu+17P5bl493U55vlfoTXGVxsxyHEhw7xQXuOhB118heVzfK3XPxbWqcjPmyWRx33VoUs/x6Z1wCQ9xL2uRz3ydGyPaYAEA3b43Cs2ETr7O+bbpbP7uBr5pEnuszyIOy4H7oLdp1e3rZo3p4/zb38OvFhOwaRkl0OACFiyVZp4eMQH7bngqtOcaZ9LJ4ja2y1hF3/9/Ojn+MuTkvy8ZWvfAUf+MAH8Bd/8RcAgDvuuAP/8R//gX/7t3/DJz7xCWfdF39qicSSiBrJRzRiBzASH//e0TpVO1CRBL/JyNrtTD78hD3xQq7kI23fzLEm/gHGiEV4f05l8lGt8bbFonadmCMGrg+qiU4+Qo5FNUwWaVd8olUy3o6xC04i+YiRD9FTnXzE4/Y4xB2frokoPxav2Iktu08AwE/a936k5BjvuL2whxzJRzhZf/LB1pgwn1YIp+w2xJv5vErE7bjFQnxehU4i+ah65D8ejjkaizruB5J8RMN8vMMpe7yjTY46ZI44k4+03baAnAsAIjV77CJxfp1ojCQfniP5IJ9DfoxPrGjUvldddSJx+5irPwBe1paJU77htFKpYMeOHVixYsXvLxIOY8WKFXj00UfHvb9cLiOXy415CSGEEGLycsqTj2PHjsHzPHR2do4p7+zsRF9f37j3b9iwAZlMZvQ1Z86cU90kIYQQQpxBTLjUdv369chms6Ov3t7eiW6SEEIIIU4jp3zPx4wZMxCJRNDf3z+mvL+/H11d4zf9JBIJJBL8tzohhBBCTC5OefIRj8exZMkSbNu2Dddddx2AF57dsW3bNqxdu/blN6wYIFobv6klP9feaZzI8c051bT9BU/U3swMAIjlySbVSv1ql2oLr/PH5+wzy989fTutEwfZtQy+cWjYtzdJuTaXtUXsACVDZDcugDjscYiF+PgkHTvyyR5eR0+BCNnsVCE7xAGQVgNkrzIAwIN9nYJj891xP22W532egJcCe5Of7xg7j3ypeTKbbmOOHZVsLjSFbcUGAEQcG1ifKp9jlt9fupjWiY/YfarZoQYAsOlY6nRswCaba1v38/FmUyF5jMeg1mwrw65o3U3rsHt1yONBOO7ZKpRjVa5USJCNxAcLbbSOS+mWq6TMcs/n
czuZts+XJEo7APDY5tEUX3uCGlllHJstY2QuOpY/eHG7TmqA96c8zd7E61JRMZjaBgDKbWQdIXtuvTp+Szktapd169Zh9erVuOyyy3D55Zfj9ttvRz6fH1W/CCGEEGLqclqSjxtuuAFHjx7Fpz/9afT19eE1r3kNtm7dOm4TqhBCCCGmHqft8epr166t62cWIYQQQkwNJlztIoQQQoiphZIPIYQQQjSUM87V9kWiJQ/R2vidyLWQvQM5dYTvrvfSdjcLHdyjhO0Ajhb47uhSm61YCPl8B313YsgsnxMZoXUOkx3sLvVDX7XNLGeqCABoDZfMcqaKAPgjuquOtrnULkmiXHFsHj8pPNIlpmhxkXD0hyk9BonyAABKvj1PmQoGAKqBfZ+MsG3qAGKOdtdbJxHmO/Vd9JbazfJanvc1Q5Qj5TY+dokTdruz8/k8jebsmDYd5rMxP8s+n8tViz0G28XDwwvM8lzNVpMAQJlIcYZrDuUV8XVy+Urtzc6gx+a3DprlLvuE4bKt9Ch7PKgh8tjzEFEwAUD4sH2vNB/g8yo9YM+FxBCPT6RstyH6lK2EBIBot71/strO1U2RIvF7OniU1ynNNsvz3URt45IHvvS9L/udQgghhBCnACUfQgghhGgoSj6EEEII0VCUfAghhBCioSj5EEIIIURDUfIhhBBCiIZyxkpt/WgYfmx8bpQYtE2FyjPqd8aNFR1mdM12XsbKAW62kzjB5Uf7CrYM7dfJblrne/2X02OM4YrduFiES82aYxeZ5TWH6ROT2kYdzkrnpIboscGKbSRY9Lj0klFzyH19YrjGyl3EHTFNRWy5244+W9IGANGwHbtKjd++bemiWV6s8jqtSVuuXvG4jd85zVmz3DXe56aP02OHSm32gTC/h4bn2uVtv+N10vvtdgev47LQdJ89F6JEKgkA6QG7vDCTz8Vy2R6jvlqG1okRaeqCdB+vQwwDXfJ7JtX+TW4OrXN+5hg9xmDrFQBkh205adUxTzHdntvBCVsyCgBMLR7P8nkVz9njkHyOz/mQb58vmGHLzgHAT9nrXzXD18XoiP3ZGYrzGPhRe8637LPXl1rNfjyDhb75EEIIIURDUfIhhBBCiIai5EMIIYQQDUXJhxBCCCEaipIPIYQQQjSUM1btEil7iHjjdw6HS/YW5CDMd+wmBu06+XO4Qqaasnf5RhzGObG8fSxS5nWeGuwyy5dmnqN1/mLWI2Z51eFWdbxmG5dNj3IDu7ZwwSx3GZpViKGZC5ep2v4IVx8wkmSbetihwGCmfK46ZWL4dqzK+/NM1jaEel1XL63THs+b5YkwN6uiMSDGdi5ijuu0EPPBlgjf9T5I5iIA9PRfYJaHonwc2HR03avV6baKqtrqsCwM2XOk2M7n/PA8ex2J2UP6Qhuy9rrETAkBYHnT78zy5yodtE6WGFS65m9vcZpZ7lLARUncAGCkavc1W+ZqF3quIW6qFhq0J0mswNVsIc8+FjiUV9ER+74LjdhrKQB459hrXHiEm6VGBu11Oxl2qKg67PjEIzwG8aytkAnnSds83uZx53jZ7xRCCCGEOAUo+RBCCCFEQ1HyIYQQQoiGouRDCCGEEA1FyYcQQgghGoqSDyGEEEI0lFMutf3MZz6Dz372s2PKFixYgGeeeaau88SPFRA1DLoK81rN94e4txPivbapT27+ObROaQYxkXJIs5hvWcThgXZiyJa1tUW4Fu+86KBZXnBIbZ8u2n2dE+OGR5fEh83yGOo3WxvyuYTxV2UeoMXpA3VfixpmOYzlXDLlunGoBKdFbcndiRqXCcbI5HaZ3g0TA7B8jcvLfTKunuM6zCgvHbYlegDwHDFTBIBsIWWWh49yKX0sR9oXcEmkl7LlsV4rX0i8ZnsO19Jcaus12efzEg5zxqJ9vgRzOgPwX/lXmeXZmh1PAJibsNeRGTEuv2fy7l1Zx1rqMIHMV/m4MvyKHR+XHNtP28f8Gh+7mO09CM/hYRpE2YcA
v041Y9+r8ZpD9t1/1CwOD+VoFf8Ntoy9PIPPkUiZxG2aHYRarQS8zI/60/Kcj0svvRQ///nPf3+R6Bn7OBEhhBBCNJjTkhVEo1F0ddkPz3op5XIZ5fLvH0ySy/HMTQghhBBnP6dlz8ezzz6L7u5unHfeeXjve9+LAwf4V+cbNmxAJpMZfc2ZM+d0NEkIIYQQZwinPPlYunQp7r77bmzduhWbNm3Cvn378KY3vQnDw/b+gfXr1yObzY6+env5Y6aFEEIIcfZzyn92WbVq1ei/Fy1ahKVLl2LevHn4wQ9+gPe///3j3p9IJJBIOHbwCCGEEGJScdp3gra1teFVr3oV9uzZU1e9ckcTvOj4XcCpQ/ZO7PJMrhbwm+1jiRzf2R6u2l8KDc/lXxal++zd9elj3JgrF7Xb4DLf+vzRt5nlJY8P50ChxSzf2Tyb1pmTOmGWR8N8FzYzNHMpTZ4vTKfH+ot2u6se3z3OTK7CIa5+yBbtHeeW4upF4mTsOtNcLTA7PWSWb3veVisAQKapaJZXPR7Tprg9Dq0JbvjWnrAVVu1xbop1ftLede+i7PN5ujtiG6FVqlxxw4Q1qaNcHTJ4kf0fnlgLV5nNnGZ/e9t3NEPr4MRJqDmIQsalgHtN636zPOmQAR71bHO9XxfPpXWKRLkyN22vFYB7vJ8u2UaLxQpXyDARUyzG+1qJEoWMQ+2S7rcv5MX5XCzNsNsdrnKDPyYe9FM8BhFi1hdKc+VKtGDHJ3ac399BzI5PrdW+f8Iuhc5L3/uy33mSjIyMYO/evZg1a9bpvpQQQgghzgJOefLxt3/7t+jp6cHzzz+P//7v/8Y73vEORCIRvOc97znVlxJCCCHEWcgp/9nl4MGDeM973oPjx49j5syZeOMb34jt27dj5syZp/pSQgghhDgLOeXJx+bNm0/1KYUQQggxiZC3ixBCCCEaipIPIYQQQjSUM9Z0xUuEEYqNz42YcU+0wOWsoaO2gVLpsmmO69vl8SyXaw7PtzVT8RGe45WH7Qs9mecSWCYZHa5yR7NE1I7Ps8f5Xhy/3e6PS7LKTKTSUW40VqpxSdnBwTaz3PO43C2ZJHJfhzS1TMztvCK/RcI5+1h/hBunXXHVA2b5BZf00zqMauAwqyLyxnSkbJYDQDJkx60lwuW5rM7Rmi2RBoD9hXZ6LLvflq02H+PjXbG9JlHo4vPKJ4eiDrnm/FZ7HckRmTYAFPvsY8ykDgAiOXtc4w7Z7MywPa4PFmwzMYCbSl7V9FtaZyhlSzm35S6ldX43wmWm6Zg9fw4f4nME5N73HLLZ6HF7wJsO8nmV77bLmQQXAJLH7f74cb72lKaTtjk+02oXzzXLIwUuLw/V7HbnLmqjdWLD9pxLHbYfJxDy+PryUvTNhxBCCCEaipIPIYQQQjQUJR9CCCGEaChKPoQQQgjRUJR8CCGEEKKhnLFqFz8Wgh8bvxOZGd2EfL4DGTV717DD6wzlaUS54lC7kA3nqCX5jurEIXun83PncbO1c9JZs7wzZRtfAcCeIVuBkT3ATbGGm+0dzZ3pHK3TRFQt2QpXBAyVuRkSIxLh4xAEdrxZOQD4NTIZiMEgAPgJW7EQyfAd5wlivLdrhKubfDJRwyGumIiRY9EwV0zw6/O4HS7a8+eZo7ZhGAAURriLdSxn95WpzwAgecwuT/fxcSh02icsFbgR3GPPn2uW+77D9I6EO9LOVQG1nN2GZ8s8pkOebZ75liZu6Dk3aptX7qtyZdreqq2OW5TupXU64ny9+P+G55jlB1vbaJ1ykSjTylztEo7Z60X2UocRGhk7l3IlRCYque0BAPERuw3FLr5mMq++sMfreMbnKeD+fEr325+dlRn2fKuxddRA33wIIYQQoqEo+RBCCCFEQ1HyIYQQQoiGouRDCCGEEA1FyYcQQgghGoqSDyGEEEI0lDNWapsYrCBqmMjVmmyZlW+Y0L1IdOG8uq/P/LeqzVyWxPDivE7rc3b5bzu49PK3vn0s
5JBEBmFbapbq5/K0Z8l19jpkY36cSGAdbXMoRpE4bo9r8jiX2vpR+1rJEYchFJFyegne7ioxNKu2caliE9FjL299ltZZlDhklncTs0AASIfsca0GPNhDvn2s17MlmQDwWOF8s7xQ45LV354gjl0A4lk73oVZvN1BxJ4jsWE+DjUy4IHLsLDFPl/lSS5XjxZIf0a46V2EGFHuL3LDwvMyR83y59gkBXC4Zt/IQz6X+XdFbZn/r4vn0jrHqtxkMB625/D5M4l+GsDxoi3znJnO0zpDJVvOX6rxj8CjA3bsKlG+Zg7DPhYbdsj8yRrX1Mdl8dW0XacW4ddJHbVj3dTnMLBL2v0pt9lxq1VfvpRf33wIIYQQoqEo+RBCCCFEQ1HyIYQQQoiGouRDCCGEEA1FyYcQQgghGkrdapeHH34YX/7yl7Fjxw4cOXIEW7ZswXXXXTd6PAgC3HrrrfjmN7+JoaEhLF++HJs2bcKFF15Y13W8dBSh6Pjmhav2rvfE0QI9V3mmvTvatcs3WrR3+dZSDvUDUcIkcg45x0nAjMvCEcfu6LytPohwQYBThcIIiIFTqIlLZAKHMVc5ZKsCksd4nWjJbkP6GI/P8Gx7vEszuUKm2mYHaHorn4vDnr3r/pLkQVonEbLbnXeYKfaRQ89VO2id58r2sT1FXufpE11m+YEj7bRO/DBXepRm2A2PFvl4R4p2eXj3flonuGah3bY0n6flkt3uRM4xf6eR+6HM/98XP2Efe/yobcIGAANlW5HUkbDNIQFgYZM9586LD9A6T5TsNgxUuKpmWoyrUGJkbhc9PkeY0WGUnAsA5racMMtzVW7EVq3Za4LLaHHYZ3Fw/T/fPl/aFjABAGIFe14lhviC7iXIGtfOlWnNz9rqpnC1ySyvEQWVeY6X/c7/IZ/PY/Hixdi4caN5/Etf+hK+/vWv44477sBjjz2GpqYmrFy5EqVSqd5LCSGEEGISUvc3H6tWrcKqVavMY0EQ4Pbbb8cnP/lJXHvttQCAb3/72+js7MSPf/xjvPvd735lrRVCCCHEWc8p3fOxb98+9PX1YcWKFaNlmUwGS5cuxaOPPmrWKZfLyOVyY15CCCGEmLyc0uSjr68PANDZ2TmmvLOzc/TYS9mwYQMymczoa84c/rumEEIIIc5+Jlztsn79emSz2dFXb2/vRDdJCCGEEKeRU5p8dHW9sPO9v79/THl/f//osZeSSCTQ2to65iWEEEKIycspNZabP38+urq6sG3bNrzmNa8BAORyOTz22GO46aab6jpX8tAwooYONJSzZVtBijiDAfBm27IghzILkSrRKjp85cpt9sFakldixmXRIX4dr0rOV+HXiZfIdbgKjpq6RYm0EQBKM0kMKjzPjZR5uxOD9rEK9/JC4oQ9diOzuCFUaYZ9HS/J9cZRYgB2rI8n0N8Nv84sj4SX0Dqd6WGz3A94TCu+3ddijUsYwyE7blWPx+1ozpZ4xnod92OKS4TZXJj1CJfFs/9ChdK2rBkAArLyVQo8PqGCXSnKlYqUxFEe0/SAHZ+WBHG7BJCM2BLHZ3MzaZ2yb/fn6Qg3/kuR6wxW7DUWAHYOcZPMkYo9T9i8AoBYzJ4LNSKNBYBalRgt5vl4x5pt2erMNi5fHiZT2/VogDBRxxZm8v60P2XL+cvT+X0XkNORaQAACFXtWEeKdnlQc9ynL6Hu5GNkZAR79uwZ/Xvfvn3YuXMn2tvbMXfuXNx88834u7/7O1x44YWYP38+PvWpT6G7u3vMs0CEEEIIMXWpO/l4/PHH8eY3v3n073Xr1gEAVq9ejbvvvhsf+9jHkM/n8cEPfhBDQ0N44xvfiK1btyKZ5A9zEUIIIcTUoe7k48orr0QQ8K9NQ6EQPve5z+Fzn/vcK2qYEEIIISYnE652EUIIIcTUQsmHEEIIIRrKKVW7nEpChSJC4TqczWK8K/GcvQO33OYwuJpm52XlDN+17JNd78UOnuPVyFYYL8X7Hp5u73r3a/w6
XpUcC3gMmCKgOJvLhMLEQC4a5f2pZvkO7ShRGHhx/tNfsdM+5rti2mK3u6mZexKxmdAe5+ZK52YGzfLnhqbTOtmKrdrwfD7eJwp2HWaW5aJa4fdWbdieP6EWPj6xYYcpYME+RoQ4AIBEP5FfJfm8Ykq3lmncFLCQsM8XOmQbVwJAdZp9ofAAjylTwEUcTo+zkvaToWfGuTIjV7PnSNHj8p3WqH0/uMzWmmNcpdM33GKWl4b52JXIojnvXO7ENtBjK3gCxzxtmmnHbijPVVRMxRTL8+t4cTt25WkOA7tzyf3d5FDVECFKmCk7ARTOn2aWRwv2vPYjL3990TcfQgghhGgoSj6EEEII0VCUfAghhBCioSj5EEIIIURDUfIhhBBCiIai5EMIIYQQDeWMldrWDhwCQuNlfKGo3eRwy3x6rpBvS4mYBBcAailbQth8iNcZ6SZtc3jtxIjStWibAAMA/KO21MyhxKOGXfGsQzrMZgdzKALgD9vHAkfbHMpUarrkcvhjsswKV0TCJ1LkkazDnKxs97XcxuW5C2cfNstnxB0Of4REmAdupM2WKmarvD+5ij2vTpR4nVKTPYGLZS7hrgbchKy51y6PZelEQK3FloaGXANOpk8swmXkrc22pLeQ5PFh16kQCS4AVIhB5eXNtkwbAFoi9pyLRfl1Xt100Czvr3LXxiq59zsTttT3BbjR4uIO+37Yl+TS8+N5e1xbE/y+O3SpLaGOJ/g9lM3a1wlHuDQ1RZTaI3NoFaSP2OcLMwNRAJVWYuCZdsnY7eswaTcAVFrsD4GWQ+T61ZefUuibDyGEEEI0FCUfQgghhGgoSj6EEEII0VCUfAghhBCioSj5EEIIIURDOWPVLtH5cxENGzv2q7Z0JBg4Ts8Vq9k7vguv4juqoyV7Z3DTM9y8KFKyz+fH+G7iaou9ezx9hCtKSjNs6YhDhILkoN2G5HG+c7vUTgyuiHLmf1phlzqqeNxDCrUm0tfpXP2QSNk72DubiAEZgGTUnldVh3lblpi3tTdxc7ILEv1meUeMqwV2ka3yzBgMAA6M2IZQB4e4kqFGTOcqDuM/MtwIF/hkTA7wmMZz9njn5/C+tuwdtg84FFbsXknGuDQtRowuy5fysQvvt43T/Ay/TqhkN264SlwoAaTT9v3gUkSFSYBizHXPcb5morYBgAJz3AQQIW3YB742l4r2+Q5m+dxmqpayQ5UVEKPOIMzjUyNCLpeqMFK2b6IKFwkhdYwpOF3rud0fqmoEkBwk6zm5f1yqxpeibz6EEEII0VCUfAghhBCioSj5EEIIIURDUfIhhBBCiIai5EMIIYQQDUXJhxBCCCEaSt1S24cffhhf/vKXsWPHDhw5cgRbtmzBddddN3r8xhtvxLe+9a0xdVauXImtW7fWdZ3afttYDr4tc4p2ddJzBbkR+4DP5VwMv4VL/mInbLlZEOM5XrRkD8GJCx3XGWGSKS6zYgZtiSzXRsXytjysNI3Lxrykfcyh3kNppqvddl8rDvleOWHL5wZa6pfVgVz/hUp2cSnD27arw5bNLml6ntZ5Y+vvzPLtI+fTOhEiC41G+HhXK/ZcDMV5naBc//9fXJJwZsRWnM7nXBC25awtz3PJM7tXmmNcwj0taZ8vX+Hj3bTAfgRAzeNxO9Fvayx9h5kik7N2x07QOkxSO+yQ586M2rLiw1Vb2g0AzZEyPfa7kQ6zvD9rjykA1Ar2PB0abKN1osN2vEOORwBEyDyND/Hx9pLEJK7ikNpW7DrRAq/DZLOZ57iEuzSNzDnHLVzsIJ8B0+0x8Mqn0Vgun89j8eLF2LhxI33PW9/6Vhw5cmT09b3vfa/eywghhBBiklL3Nx+rVq3CqlWrnO9JJBLo6nJ4wgshhBBiynJa9nw89NBD6OjowIIFC3DTTTfh+HH+9NFyuYxcLjfmJYQQQojJyylPPt761rfi29/+NrZt24YvfvGL6OnpwapVq+B59u+LGzZsQCaTGX3NmWP/Ji6E
EEKIycEp93Z597vfPfrvV7/61Vi0aBHOP/98PPTQQ7j66qvHvX/9+vVYt27d6N+5XE4JiBBCCDGJOe3Gcueddx5mzJiBPXv2mMlHIpFAIjHeuCoUiyIUGt+8SMcs8zr+0WO0DeFZthImdYgYUgEozCOuPh5XZoxc0GyWM+MgAAjX7GMOMQe8BDmfY+c228AeuMQc5JjLKC9MNlsXOnkMXHgpu56XdjgYEXVGJMbr+KxLEd7ukEM5wugv2/Mq0szPVQpslY7LAKw9YSszRlLcJI4pMKp5rhIKF21JQKjmmIyOqcAMHdnufgAI1+zYhUe4ciXk2w5gmQQ3H5yXHrSv4+hQKmLfeIkIVyX8bGChWR4N1T/fnivbahIASJJFoeqQIz1XnGmWu0wOnxzk+/+OniCqlkP8fCli0uY01iR+oGUu0qHnSw/w8c6dZ5e7zDM9sp66FIJV8vE0dD7/SK8Q373KND6vwlW7bYnjdrmjyePPXcd7T4qDBw/i+PHjmDXLThqEEEIIMbWo+5uPkZER7NmzZ/Tvffv2YefOnWhvb0d7ezs++9nP4vrrr0dXVxf27t2Lj33sY7jggguwcuXKU9pwIYQQQpyd1J18PP7443jzm988+veL+zVWr16NTZs2YdeuXfjWt76FoaEhdHd345prrsHnP/9586cVIYQQQkw96k4+rrzySgQB/83r/vvvf0UNEkIIIcTkRt4uQgghhGgoSj6EEEII0VBOu9T2ZIl0dyISHr9PJIjZTWZyWgBA1Za1Bcw5CEDqiC1V9JsdpkJxW35UbuU5XttzthldpOwwQSPN9h2jWUvb5YWZPAY+aYJLOlwiBmCVbm5WBY/LMkNEyumUFcdtwVciwdsQIdLdZIxLIgtley7kT3CZ4PMj7Wb5LxNEowfgmtYnzfK21jyt80BgyzX7i9ywqylpS1ML4P1hOLzJkDzO50+liRhm7eUnDKLkvuu2pe8AN/o6XrIluADQlbSfvjwrmaV1EkR7/nSOq/+Ykd9gmdzEAPpito4y4RgIJqmtOTSrAyU7pkfyRPsJYOAYPxbpTZrlsRHHmkD0nC4hMnt0gUuey0JXbeJtSw3Y5ZGiY8632edjZqAAqFy9yqc8qq0kQjO48V/QZ48PU33XowbXNx9CCCGEaChKPoQQQgjRUJR8CCGEEKKhKPkQQgghRENR8iGEEEKIhqLkQwghhBAN5YyV2gbJBILIeKltdZot+wtXuJ9eiMiSCt1cQsjkpNXm+vO11CBv20i3/dj5SoZLs5jbbBDldagTrUOyyuS5PpEUA0B5JtFahR3usExOC4cbr8/HwQ/b07rk6Cuj4Gg3gvpjmq/Ymr/Hjp9L64TJBP4/rTtpnbdmnjDLBytcrskotHN5eT5iS/FAnHgBoJLhY5fI2fMniPCgFmbY86flENf9MSfPWJjfq8fKto6x5PFldFrcdsl98mA3rRMesOO9LzWd1kkS99yuFHfuLpN2D9e4Fcaxoh2DYoWPN47x80VO4qZsOmzfD5UMPxdbz+NcJU3PV253rGXEHtuzzYABAAFZY1yRCZEnADAXWoDLy6slPn9DHbYMN5+2x9svvnxfW33zIYQQQoiGouRDCCGEEA1FyYcQQgghGoqSDyGEEEI0FCUfQgghhGgoZ6zapdTdjGh0/E766LC9qzuStXeVA4DfYu/IrziUK4bQBgAXOACg25Nd12F1qhmHeidtHwvH+O7+QrvdBu8A34lezdjn85P8OuEme3xmto/QOkcPTKPHQkTV4sf4jvPwiK1+cHoeEaVQOObYvc3mQo1PEqYKCDFJFoDnC7bK4ZkkNydri9jGiH/UeoDW+W3EPp9LAZJN2/fWYDM3aKsQ1QgAoNcuHul2KKK49x/Fa7MrzWsepHUqxLmx4vO2HSV9dY23lyLqB0edaNie3XFHcNixKpPTAWhN2EaYw8RkEQAiZX4/MIVeNOeoQ8Jd4Z6J8GfY52vez2MaG7aP1VIO0ztm+MZvB27G
5hDaMVVLpY2vcmxYw0k+R8IREoOEvSYEvtQuQgghhDhDUfIhhBBCiIai5EMIIYQQDUXJhxBCCCEaipIPIYQQQjQUJR9CCCGEaCh1SW03bNiAH/3oR3jmmWeQSqXwhje8AV/84hexYMGC0feUSiV89KMfxebNm1Eul7Fy5Up84xvfQGdnZ10Ni5Q9RLzxsp3IsG10g4jDaCxhd7PaxCVTNaJY8hymasy8yEWU1XGkhdGELY0KuczbiAas3EWc2wCEUkQ25fEYxEjbStWTU3VH8+RaDt0sk/Y5PJeo3C1U5YZZTPLHhZdAuc2OQ4RIJQFgsGybwe3Mz6V15iZsyWh7lEueF6T7zfIUMS0DuFFdOsbr7M1zWWahn0u/GTXiD9nU5xhwEu6jJS4DZgZ/h0YytE5n2o531CHh9prt2Lmktj55BkDCIbWdEbNN59i5AC43LjvuExf0HirxvqaP2n3y4nyNYTLT+IjDfJA8IiFqq40BcMO3eNTxWZMmcmO29gHwkuTRAC5ZM3k8gVdxrFhxMk/Z51MdX2fU9c1HT08P1qxZg+3bt+OBBx5AtVrFNddcg3w+P/qeW265Bffddx/uvfde9PT04PDhw3jnO99Zz2WEEEIIMYmp67+jW7duHfP33XffjY6ODuzYsQNXXHEFstks7rzzTtxzzz246qqrAAB33XUXLr74Ymzfvh2vf/3rx52zXC6jXP79txm5XO5k+iGEEEKIs4RXtOcjm80CANrb2wEAO3bsQLVaxYoVK0bfc9FFF2Hu3Ll49NFHzXNs2LABmUxm9DVnzpxX0iQhhBBCnOGcdPLh+z5uvvlmLF++HAsXLgQA9PX1IR6Po62tbcx7Ozs70dfXZ55n/fr1yGazo6/eXvJ8ZSGEEEJMCk7a22XNmjV48skn8cgjj7yiBiQSCSQS9W8yE0IIIcTZyUklH2vXrsVPf/pTPPzww5g9e/ZoeVdXFyqVCoaGhsZ8+9Hf34+urq66rlFLRoDo+F241Tm2e1A8x3fXV1vsndguQ6pYnqhD+MZ2lNvJTmOHCKZATIrCTUTVA6ApbR9z7lInapNajE+BFGlDyCEimNGcN8uPjXBnpUiefwFXbSa7uh3+RX7crsNMrAAgVCLGe2mHiV7FDoSf4NepVsg4xPlkLFRtdcjhIp+MNWJ2FiOmhADQHTthls+LH6N1DldtU8DHgvm0zsGmNnqsPM3ua+KEYxc/mT4j3XxuR3L2GB12KFdyBdtEL+xQmeWJ4VqtyhUGibS9lvnEZBEASp69xj2V5eaDF7TY8WmO8LWnFrfb3d5kGxkCwKEkd3xj91CkQqug2mS3Icw/AuCR/9/6DhWKTwQ8Icfa09RvrxfMIPN/jtZVDACRkn3QtV75M+ygRo5x9ZmXstudPmjPHa/stO8cQ10/uwRBgLVr12LLli148MEHMX/+2AVmyZIliMVi2LZt22jZ7t27ceDAASxbtqyeSwkhhBBiklLXNx9r1qzBPffcg5/85CdoaWkZ3ceRyWSQSqWQyWTw/ve/H+vWrUN7eztaW1vx4Q9/GMuWLTOVLkIIIYSYetSVfGzatAkAcOWVV44pv+uuu3DjjTcCAL761a8iHA7j+uuvH/OQMSGEEEIIoM7kIwj+9yd4JpNJbNy4ERs3bjzpRgkhhBBi8iJvFyGEEEI0FCUfQgghhGgoJ/2cj9NNpOwj4o2X7RS6bFlQrMCliuU2W5rF5FcAMDKb5GUO+RPDd0g8faJy6pjBHzM/q8k+lq/yDjFTqqNJLoGd05o1y2tM2wjggpajZvme+Exa5+kRx0D4dsBDA1weFkTqN/jzU7ZELIg5pGssdydtBrgss1Tk/elnMmmHfC9btt3WysQYDACWZ541y5MODSOT517SbMtSAeBgpo0e6/dtWabHw4O4PU2pWSAAxAft2A2NEJc6AD4Z18AxDjOm2eZtiSjXa44U7fuhto+b3j0fsTtbq/G27RmYYZbPmsbXnpa4LcNtTXC3tYPT
uG42dMjuq2OJQWGmfdBlRheE7bGLOqShlVay9jikthViVhqp8LZRGbnjs6ZKpkKk4JAOZ23tMHs0AQAgbsen1GGX+6XTJLUVQgghhHilKPkQQgghRENR8iGEEEKIhqLkQwghhBANRcmHEEIIIRrKGat28eNh+NHxuVEtae/mrTbxrhQ67BxrZA7f5eu12Tv8k218V7dHdpZXS44wF20lzsL2Plrl4qYj9nUCblbVW2o3y5tj3ETqdW377ToRHoMycWN6bsTeWQ8AIYcxl+/Z411rcWw5Z6dLvvyd2C8H9tC9kMMUy6vacyTsUNWEiMqiSFQwAFfCHIm00jo7o3PN8nMStqIFANJhu7MjDilZvsKlK2xqOU0gR+pXNzHvtOKwQ3lFxi6W4fdQpUaM2NJFWufYwTb7Oo4pXzhkyx9C5P4BAOZDub8/zS/Uaq+Lrns4cKi/mHLEpVRqPuwIBKE43R67cgv//3dy0G5Ebh5fZwvExy/V7xgHchsnTjjmNRs8h7moH7f7WsvweIZH7MYxQ0CwcuscL/udQgghhBCnACUfQgghhGgoSj6EEEII0VCUfAghhBCioSj5EEIIIURDUfIhhBBCiIZyxkpta8kwEBufG/lE5cSMg1x407lhVqLZls9lmrhELh6xJUt9g1zeGG2xr7Oo+SCt86b078zyMDGPA4D/jpxvll/cxPPPtzb9lh5j/Ko0xywfrnAJY1smT4+d2D/NLA87zKq8it2npgyXCFer9sTyPR6fIGHPOZ9IMgEgREzvXHLEeMrWmc5s4nGrkhslGeFznsHk0wAQI1pJz/H/mkSU62azbUS+7Gh2bMQubznoMJucZvcp5JA8szGKRHmdwSMZszzf5pD0Joj00SGlTxy3jxElNAAuc2VmlwCAfvs6TC4KuI01I2U7phW+ZMIjklHXZ0C4Zreh2sLreEkiTeX+fpSyvYwBAKLkI2V4nkM2G7P7QyWwALykXSfS4ri58vY8DTN1OVedjz/Hy3+rEEIIIcQrR8mHEEIIIRqKkg8hhBBCNBQlH0IIIYRoKEo+hBBCCNFQ6lK7bNiwAT/60Y/wzDPPIJVK4Q1veAO++MUvYsGCBaPvufLKK9HT0zOm3l/91V/hjjvuqKth4UqAsGHclRyyd5ZXHAZBbFc3iCoCAMoj9i7fEw7jHrYbPupwhGpN2wqMwVoTrXOgZm+djtOOAsdqLWb5QMUuB4DXpZ4zy5OO6wx5tinVzBSRJMBtNBZqs7frp5scBn9kl3rcobIIEaUQU8EAQI2YhgU1l/KK7NR31PFT9rHAMRcTEbuvLkUUg5nHAUCSyFAi4AoQVxtqTXa9+Al+r3pEdVRN8zoR0qWgzMc7FLfnfbnI1UAg6qby8RS/DjGDS5xwGLSRcMezPNbRkn2sRuL5woXs4liBX8dlChgQQ7pSG29DlNz6XoK3gS1ZTX18LYuUiTqkxOdIpdVud3k6rYIqUc8wdQoABGRqs3gCoHORqQMBIERimhyw63hEvWRR1zcfPT09WLNmDbZv344HHngA1WoV11xzDfL5sZK/D3zgAzhy5Mjo60tf+lI9lxFCCCHEJKaubz62bt065u+7774bHR0d2LFjB6644orR8nQ6ja6urlPTQiGEEEJMKl7Rno9sNgsAaG9vH1P+3e9+FzNmzMDChQuxfv16FAoFeo5yuYxcLjfmJYQQQojJy0k/4dT3fdx8881Yvnw5Fi5cOFr+p3/6p5g3bx66u7uxa9cufPzjH8fu3bvxox/9yDzPhg0b8NnPfvZkmyGEEEKIs4yTTj7WrFmDJ598Eo888siY8g9+8IOj/371q1+NWbNm4eqrr8bevXtx/vnjH/G9fv16rFu3bvTvXC6HOXPsR3QLIYQQ4uznpJKPtWvX4qc//SkefvhhzJ492/nepUuXAgD27NljJh+JRAKJhMPnQAghhBCTirqSjyAI8OEPfxhbtmzBQw89hPnz5/+vdXbu3AkAmDVrVl0N8xMh+LHx
sh2rzFUOAPFhWy4UO8ElUz4xL6o4ZILxtC07LBW4lDSVsDV/PtO0AagSFycmewSAjpi9l6bqMKt6unwOPcb4Vc6eE0/28fEvDjuSz6LdvrxDUeYX7fgUiNTspCHy2Mgwj6nXbGsiw0W+/crP2MeKNS7x7EgP02MMbhLH52IpsNtwoNhulgPAQNbhzEXk6i6ZKZvCI918HCIVIqPM8TqxYbuvtRSfV/45ti40cMzFgBgThjzetqYj9rxi/QSAKjGVjBXrl+dWHAaV8RyXsyaO205k8SxfMyMVu6/lNv5xxqSp5Vbe7tSg3e6WXi49r2TsNgQRPnbMdC7s8oAkQ1Sd53B2Y+aVjmUx3mHv1yzn7XvYJ/PDoq7kY82aNbjnnnvwk5/8BC0tLejr6wMAZDIZpFIp7N27F/fccw/+5E/+BNOnT8euXbtwyy234IorrsCiRYvquZQQQgghJil1JR+bNm0C8MKDxP6Qu+66CzfeeCPi8Th+/vOf4/bbb0c+n8ecOXNw/fXX45Of/OQpa7AQQgghzm7q/tnFxZw5c8Y93VQIIYQQ4g+Rt4sQQgghGoqSDyGEEEI0lJN+zsfpJjbiIRodv9vYj9o7dsNVh1lVkqgSSjz3qrUQpyaivgAAL27XmdUxROtEiHrGpRZgZCJFeuyytG0Sd368n9Z5ZGSBWf7UMFeu7BmcYZYXj9mGcwCQPuDYpc5EDiHH1CV1/JjjZ0NyIVcdl/ETI1Ql87eTG+W1EPPBGQ6zPobvMKNjyqdnRvh4n6jYBmm7DnKllD/I1U2pPqLAGHGNnV3sxXlfqdkZUwQAqKWJEVszV3OwFSZwmJMxtZSXpFWooZkf5WscUwkFYYeBHelqzCE/qzgUJX7M7hQzCwSACDFCY4oWAMieZ3c2McTbXcrYdVyGhSw+Ve4TinInMYFs4nKXSJR8PpUdaykxLAyxc4GbVwaziYqrwNexl6JvPoQQQgjRUJR8CCGEEKKhKPkQQgghRENR8iGEEEKIhqLkQwghhBANRcmHEEIIIRrKGSu19aNh+LHxuVH6iG2c4yW5dC3UanczxOR2AJL9RGbVzKVZoSG7zpHj3CQpNM02KTo+wqWpv2uaaZYz2S4APJIa7ygMAFkilQSAwycyZnkyziVgQwMtZnnUYdgVtb2LAACRst0nj8inAQBMOeaQEDITp4hDOcZk38Uufp0KkbWFI/XL3QYKdqwBIBq2z5eI8El/ILAdrti5nNdJ8DlSKfA5l9lnn6/SXL/8k5mgAdxwLUyk0IBDWp3g8aGmXTFeJ0wM5EKOy0SIn1jUYRIXHyEGbS0OKSlpQ3qAj3ehgxsgxvJEZkrM9QBuIhoj/QGA9mfsOAQRx3gTqbZL0hshj3yIOtaRUIXIpMHjFhqxGxGe5TCWI6drbeEL8EXTB8zy57LTzXIvXsZ+3oIx6JsPIYQQQjQUJR9CCCGEaChKPoQQQgjRUJR8CCGEEKKhKPkQQgghREM5Y9Uu8WwZUUNNEDlum2mFpnHnnsSRYbO8luTmbcUZdl6WOMF3R5fb7PJ4lis9ykXbWKnQwnc6+zPttvkOU6xDR+3G+Tl+HaRsGUE54tiFXbbbFs3ztjGDKwCI5e3yiEPJ4BNxETMYBIBogZzP4WdWnmmfr+YwnAuIuiniULu0Ju2t8tOTJDgACjU7CMUaH7uKby8HtYD3pzlq766f1sRNDvtCrbwNLQ4VE6tDTNUS2fqVHtGiY0kk/1ULXMZc5J4MOYzlwmXSn0Hen2oTU4DwOrER+/4utTnM6MiharPDcJP7CKLcZsc7fZjLQwJDBfnCAX6dWoi0z2FIylR41SYeHxY7l2oudcRum2tdrLTZc84b5vf3zDknzPJqjV+oRga8o8n+HK7CXt8s9M2HEEIIIRqKkg8hhBBCNBQlH0IIIYRoKEo+hBBCCNFQlHwIIYQQoqEo+RBCCCFEQ6lLartp0yZs2rQJzz//PADg0ksvxac//WmsWrUKAFAqlfDRj34U
mzdvRrlcxsqVK/GNb3wDnZ2ddTcsXKwiHBmfG9U6bJmeZUL3IsxvzWU8xSRqnq2MBcBNqVxSs/gQMRVy9IdJaj3PUYdIakPEtAwAgjKRYKW4OVmIxoDH2mXmVSX+emGHKSAzhHKNN5MQltt422K22gwBMZwDuBSuluAd8ny7cdkyN2iLEMO3sMN8kBFlzm3gRnWu67jGrpq2Y9dyiLfBi9vzNJbnElhmChh2KAWjIPfqUW4c6bXbnY1l+b3KZJmlGY65aD9NADXuT8mvT8wcAaCcIYZmNd4f571KlKG58/jcTp5gEmEuGWVybJcUmU3hkO+oQ6ap63ODXYfc9gCASJHMxQ4e7OGC3Yi2Zm4sl6vYdQaL9sTyCg5ju5dQ1zcfs2fPxm233YYdO3bg8ccfx1VXXYVrr70WTz31FADglltuwX333Yd7770XPT09OHz4MN75znfWcwkhhBBCTHLq+ubj7W9/+5i/v/CFL2DTpk3Yvn07Zs+ejTvvvBP33HMPrrrqKgDAXXfdhYsvvhjbt2/H61//evOc5XIZ5fLvs6VcLldvH4QQQghxFnHSez48z8PmzZuRz+exbNky7NixA9VqFStWrBh9z0UXXYS5c+fi0UcfpefZsGEDMpnM6GvOnDkn2yQhhBBCnAXUnXw88cQTaG5uRiKRwIc+9CFs2bIFl1xyCfr6+hCPx9HW1jbm/Z2dnejr66PnW79+PbLZ7Oirt7e37k4IIYQQ4uyhbm+XBQsWYOfOnchms/jhD3+I1atXo6en56QbkEgkkEg4dmQKIYQQYlJRd/IRj8dxwQUXAACWLFmCX/3qV/ja176GG264AZVKBUNDQ2O+/ejv70dXV1fdDfOaEwhFjZ22ZKexl+Q7neMHbUOdYB7fCh4lO779GN9xHiKb68NVWoXW8dN8pz4jleJb9YOk3YhyiRsR1bJkF79LIRMjcXN0Jzzo2MVPNmKHaw5DKKJqiTiUDB7pavgYr8OuE3KojryEfawS5wn4QW+afR2HooQdicW4aoSZ283K1L8Pa7jE+xMpuRREdstrCV4nTMzBXAZgiazd13iufjVQtdlhhkfmfa2V3xDhKlPv8MuwdcRzxK3Sal/Hae5HDrkUfawOAISIci+R5fOUGUQyxRoAxIfrN45kSiF2D7vawNYXgKuBfLKWAkC13R7woMA/0qsR+3yZBHe9K3v2+ZhyxuPCmXG84ud8+L6PcrmMJUuWIBaLYdu2baPHdu/ejQMHDmDZsmWv9DJCCCGEmCTU9c3H+vXrsWrVKsydOxfDw8O455578NBDD+H+++9HJpPB+9//fqxbtw7t7e1obW3Fhz/8YSxbtowqXYQQQggx9agr+RgYGMCf//mf48iRI8hkMli0aBHuv/9+vOUtbwEAfPWrX0U4HMb1118/5iFjQgghhBAvUlfyceeddzqPJ5NJbNy4ERs3bnxFjRJCCCHE5EXeLkIIIYRoKHWrXU43QfDCjtxajTwjnqhdalW+e7zm2eeqVfkuXz9EPEoqDt+OMnnePt+4zdUuRf6Mfp/sTvZq/EIBUaj4JV7HL9qNC8HRoSKZUs64OdQhFXu8A8+h9CCXChyqIzZ7mI8OAICoLLyKw2OHKGRc4x0K2cdORu3iOdQuILvha1Hu11CN2RIil8eDV+L3HRvvECkHAI8p0BzqJrZeuMaO4fPuwC/aky5U4uo8jxwLO2wzQqSvLpVZrWrPBVcMggiLtUM24riF2BixtgGAR9fm+v29XGoXIvSA5+gQVbs4plXA1C4OLyq2NgfkHgaAUMSei7U8n1g1j8yRgj3p/eIL53rxc9xFKHg572ogBw8e1FNOhRBCiLOU3t5ezJ492/meMy758H0fhw8fRktLC0KhEHK5HObMmYPe3l60ttqOtpMdxUAxABSDF1EcFANAMQDOvBgEQYDh4WF0d3cjHHZ/i3jG/ewSDofNjKm1tfWMCO5EohgoBoBi8CKKg2IAKAbAmRWDTCbzst6nDadC
CCGEaChKPoQQQgjRUM745CORSODWW2+d0uZzioFiACgGL6I4KAaAYgCc3TE44zacCiGEEGJyc8Z/8yGEEEKIyYWSDyGEEEI0FCUfQgghhGgoSj6EEEII0VCUfAghhBCioZzRycfGjRtx7rnnIplMYunSpfjlL3850U06rTz88MN4+9vfju7uboRCIfz4xz8eczwIAnz605/GrFmzkEqlsGLFCjz77LMT09jTwIYNG/C6170OLS0t6OjowHXXXYfdu3ePeU+pVMKaNWswffp0NDc34/rrr0d/f/8Etfj0sGnTJixatGj0qYXLli3Dz372s9HjUyEGL+W2225DKBTCzTffPFo22ePwmc98BqFQaMzroosuGj0+2fv/IocOHcKf/dmfYfr06UilUnj1q1+Nxx9/fPT4ZF8XAeDcc88dNxdCoRDWrFkD4OycC2ds8vH9738f69atw6233opf//rXWLx4MVauXImBgYGJbtppI5/PY/Hixdi4caN5/Etf+hK+/vWv44477sBjjz2GpqYmrFy5EiWHS+jZRE9PD9asWYPt27fjgQceQLVaxTXXXIN8Pj/6nltuuQX33Xcf7r33XvT09ODw4cN45zvfOYGtPvXMnj0bt912G3bs2IHHH38cV111Fa699lo89dRTAKZGDP6QX/3qV/jnf/5nLFq0aEz5VIjDpZdeiiNHjoy+HnnkkdFjU6H/J06cwPLlyxGLxfCzn/0MTz/9NP7hH/4B06ZNG33PZF8XgRfugT+cBw888AAA4F3veheAs3QuBGcol19+ebBmzZrRvz3PC7q7u4MNGzZMYKsaB4Bgy5Yto3/7vh90dXUFX/7yl0fLhoaGgkQiEXzve9+bgBaefgYGBgIAQU9PTxAEL/Q3FosF99577+h7fvvb3wYAgkcffXSimtkQpk2bFvzrv/7rlIvB8PBwcOGFFwYPPPBA8Md//MfBRz7ykSAIpsZcuPXWW4PFixebx6ZC/4MgCD7+8Y8Hb3zjG+nxqbguBkEQfOQjHwnOP//8wPf9s3YunJHffFQqFezYsQMrVqwYLQuHw1ixYgUeffTRCWzZxLFv3z709fWNiUkmk8HSpUsnbUyy2SwAoL29HQCwY8cOVKvVMTG46KKLMHfu3EkbA8/zsHnzZuTzeSxbtmzKxWDNmjV429veNqa/wNSZC88++yy6u7tx3nnn4b3vfS8OHDgAYOr0/9///d9x2WWX4V3vehc6Ojrw2te+Ft/85jdHj0/FdbFSqeA73/kO3ve+9yEUCp21c+GMTD6OHTsGz/PQ2dk5pryzsxN9fX0T1KqJ5cV+T5WY+L6Pm2++GcuXL8fChQsBvBCDeDyOtra2Me+djDF44okn0NzcjEQigQ996EPYsmULLrnkkikVg82bN+PXv/41NmzYMO7YVIjD0qVLcffdd2Pr1q3YtGkT9u3bhze96U0YHh6eEv0HgOeeew6bNm3ChRdeiPvvvx833XQT/uZv/gbf+ta3AEy9dREAfvzjH2NoaAg33ngjgLP3XohOdAOEsFizZg2efPLJMb9xTyUWLFiAnTt3IpvN4oc//CFWr16Nnp6eiW5Ww+jt7cVHPvIRPPDAA0gmkxPdnAlh1apVo/9etGgRli5dinnz5uEHP/gBUqnUBLascfi+j8suuwx///d/DwB47WtfiyeffBJ33HEHVq9ePcGtmxjuvPNOrFq1Ct3d3RPdlFfEGfnNx4wZMxCJRMbt1u3v70dXV9cEtWpiebHfUyEma9euxU9/+lP84he/wOzZs0fLu7q6UKlUMDQ0NOb9kzEG8XgcF1xwAZYsWYINGzZg8eLF+NrXvjZlYrBjxw4MDAzgj/7ojxCNRhGNRtHT04Ovf/3riEaj6OzsnBJx+EPa2trwqle9Cnv27Jky82DWrFm45JJLxpRdfPHFoz8/TaV1EQD279+Pn//85/jLv/zL0bKzdS6ckclHPB7HkiVLsG3bttEy3/exbds2LFu2bAJbNnHMnz8f
XV1dY2KSy+Xw2GOPTZqYBEGAtWvXYsuWLXjwwQcxf/78MceXLFmCWCw2Jga7d+/GgQMHJk0MGL7vo1wuT5kYXH311XjiiSewc+fO0ddll12G9773vaP/ngpx+ENGRkawd+9ezJo1a8rMg+XLl4+T2//ud7/DvHnzAEyNdfEPueuuu9DR0YG3ve1to2Vn7VyY6B2vjM2bNweJRCK4++67g6effjr44Ac/GLS1tQV9fX0T3bTTxvDwcPCb3/wm+M1vfhMACL7yla8Ev/nNb4L9+/cHQRAEt912W9DW1hb85Cc/CXbt2hVce+21wfz584NisTjBLT813HTTTUEmkwkeeuih4MiRI6OvQqEw+p4PfehDwdy5c4MHH3wwePzxx4Nly5YFy5Ytm8BWn3o+8YlPBD09PcG+ffuCXbt2BZ/4xCeCUCgU/Od//mcQBFMjBhZ/qHYJgskfh49+9KPBQw89FOzbty/4r//6r2DFihXBjBkzgoGBgSAIJn//gyAIfvnLXwbRaDT4whe+EDz77LPBd7/73SCdTgff+c53Rt8z2dfFF/E8L5g7d27w8Y9/fNyxs3EunLHJRxAEwT/+4z8Gc+fODeLxeHD55ZcH27dvn+gmnVZ+8YtfBADGvVavXh0EwQuysk996lNBZ2dnkEgkgquvvjrYvXv3xDb6FGL1HUBw1113jb6nWCwGf/3Xfx1MmzYtSKfTwTve8Y7gyJEjE9fo08D73ve+YN68eUE8Hg9mzpwZXH311aOJRxBMjRhYvDT5mOxxuOGGG4JZs2YF8Xg8OOecc4Ibbrgh2LNnz+jxyd7/F7nvvvuChQsXBolEIrjooouCf/mXfxlzfLKviy9y//33BwDMvp2NcyEUBEEwIV+5CCGEEGJKckbu+RBCCCHE5EXJhxBCCCEaipIPIYQQQjQUJR9CCCGEaChKPoQQQgjRUJR8CCGEEKKhKPkQQgghRENR8iGEEEKIhqLkQwghhBANRcmHEEIIIRqKkg8hhBBCNJT/H6hZm5PgOBo8AAAAAElFTkSuQmCC",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Use the converted tflite model and get intermediate outputs to extract the log-mel features\n",
+ "\n",
+ "interpreter = tf.lite.Interpreter(\n",
+ " model_path=os.path.join(speech_embedding_dir, \"speech_embeddings.tflite\"),\n",
+ " num_threads=1,\n",
+ " experimental_preserve_all_tensors=True\n",
+ ")\n",
+ "interpreter.allocate_tensors()\n",
+ "\n",
+ "# Get input and output tensors\n",
+ "input_details = interpreter.get_input_details()\n",
+ "output_details = interpreter.get_output_details()\n",
+ "interpreter.set_tensor(input_details[0]['index'], sample_data)\n",
+ "interpreter.invoke()\n",
+ "\n",
+ "spec = interpreter.get_tensor(65) # This index is the log-mel features, to my knowledge\n",
+ "spec = spec.squeeze().T # transform for visualization\n",
+ "print(\"Embedding model features shape:\", spec.shape)\n",
+ "\n",
+ "_ = plt.imshow(spec)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc88eeaa",
+ "metadata": {},
+ "source": [
+ "This certainly *looks* like a log-mel spectrogram, and we can compute the same from the reference Librosa implementation for comparison."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "5363ba83",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:26:26.195376Z",
+ "start_time": "2024-01-18T00:26:25.418886Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Librosa features shape: (32, 76)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import librosa\n",
+ "\n",
+ "S = librosa.feature.melspectrogram(y=sample_data, win_length=int(0.025*16000), \n",
+ " hop_length=int(0.010*16000), n_fft=512, center=True,\n",
+ " sr=16000, n_mels=32, fmin=60, fmax=3800, power=2)#, norm=None)\n",
+ "\n",
+ "S = librosa.power_to_db(S).squeeze()[:, 1:-1] # convert to logmel and remove edge columns from center=True\n",
+ "\n",
+ "print(\"Librosa features shape:\", spec.shape)\n",
+ "_ = plt.imshow(S)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f03308c2",
+ "metadata": {},
+ "source": [
+ "Visually, these mel-spectrograms are very similar, but on closer inspection there are differences. Plotting at a single time slice better shows the difference:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "d80091c0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:26:30.675702Z",
+ "start_time": "2024-01-18T00:26:30.579560Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plot single time slice from both melspectrograms\n",
+ "_ = plt.plot(spec[:, 33])\n",
+ "_ = plt.plot(S[:, 33]/10 + 2) # apply simple scalar transformation to better align the points\n",
+ "_ = plt.xlabel(\"Frequency Bins\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98dce06c",
+ "metadata": {},
+ "source": [
+ "While the overall trend and specific frequency features are very similar between the two, they are not exact. After some time investigating this difference, eventually I moved on to other tasks with openWakeWord, and assumed that the similarity of the spectrograms would mean the downstream model performance would be relatively unaffected. This assumption seems to have been largely true, and typically the performance difference between the openWakeWord implementation and the original Google embedding model is small.\n",
+ "\n",
+ "For completeness, below is the implementation of a melspectrogram using just PyTorch, so that it can be converted to ONNX/tflite for more efficient computation on a wide range of devices. This code was based on the implementation from [torchlibrosa](https://github.com/qiuqiangkong/torchlibrosa) and is identical to the librosa reference implementation to within rounding error."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "07e24374",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# torchlibrosa version of melspectrogram\n",
+ "\n",
+ "import torch\n",
+ "import torchlibrosa as tl\n",
+ "import numpy as np\n",
+ "\n",
+ "batch_size = 1\n",
+ "sample_rate = 16000\n",
+ "win_length = 400\n",
+ "hop_length = 160\n",
+ "n_mels = 32\n",
+ "nfft=512\n",
+ "\n",
+ "batch_audio = torch.empty(batch_size, 32000).uniform_(-1, 1) # (batch_size, n_samples): 2 s of audio at 16 kHz\n",
+ "\n",
+ "def f(self, input):\n",
+ " r\"\"\"Power to db, this function is the pytorch implementation of \n",
+ " librosa.power_to_db.\n",
+ " \"\"\"\n",
+ " ref_value = self.ref\n",
+ " log_spec = 10.0 * torch.log(torch.clamp(input, min=self.amin, max=np.inf))/torch.log(torch.tensor(10))\n",
+ " log_spec -= 10.0 * torch.log(torch.maximum(torch.tensor(self.amin), torch.tensor(ref_value)))/torch.log(torch.tensor(10))\n",
+ "\n",
+ " if self.top_db is not None:\n",
+ " if self.top_db < 0:\n",
+ " raise librosa.util.exceptions.ParameterError('top_db must be non-negative')\n",
+ " log_spec = torch.clamp(log_spec, min=log_spec.max() - self.top_db, max=np.inf)\n",
+ "\n",
+ " return log_spec\n",
+ "\n",
+ "tl.stft.LogmelFilterBank.power_to_db = f\n",
+ "\n",
+ "# TorchLibrosa feature extractor the same as librosa.feature.melspectrogram()\n",
+ "feature_extractor = torch.nn.Sequential(\n",
+ " tl.Spectrogram(\n",
+ " center=False,\n",
+ " n_fft=nfft,\n",
+ " hop_length=hop_length,\n",
+ " win_length=win_length,\n",
+ " ), tl.LogmelFilterBank(\n",
+ " n_fft=nfft,\n",
+ " sr=sample_rate,\n",
+ " n_mels=n_mels,\n",
+ " fmin=60,\n",
+ " fmax=3800,\n",
+ " is_log=True, # Default is true\n",
+ " ))\n",
+ "\n",
+ "# export to onnx\n",
+ "torch.onnx.export(feature_extractor, batch_audio, \"torchlibrosa_onnx_melspectrogram.onnx\",\n",
+ " opset_version=12, input_names = ['input'], output_names = ['output'], \n",
+ " dynamic_axes={\"input\": {0: 'batch_size', 1: 'samples'}, \"output\": {0: 'time'}})\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bbf87b19",
+ "metadata": {},
+ "source": [
+ "# Create New Model with Keras"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6bda8bd3",
+ "metadata": {},
+ "source": [
+ "After separating the log-mel feature calculation from the embedding model, we can now reproduce the rest of the model manually in Keras.\n",
+ "\n",
+ "Note that for many of the layers below, the hard-coded values and parameters were obtained by inspecting the tflite version of the original embedding model, using a tool like [Netron](http://www.netron.app)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "4fca0fa3",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:27:56.152072Z",
+ "start_time": "2024-01-18T00:27:55.876595Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: \"model_1\"\n",
+ "__________________________________________________________________________________________________\n",
+ " Layer (type) Output Shape Param # Connected to \n",
+ "==================================================================================================\n",
+ " input_2 (InputLayer) [(None, 76, 32, 1)] 0 [] \n",
+ " \n",
+ " zero_padding2d (ZeroPadding2D) (None, 76, 34, 1) 0 ['input_2[0][0]'] \n",
+ " \n",
+ " conv2d (Conv2D) (None, 74, 32, 24) 216 ['zero_padding2d[0][0]'] \n",
+ " \n",
+ " batch_normalization (BatchNorm (None, 74, 32, 24) 96 ['conv2d[0][0]'] \n",
+ " alization) \n",
+ " \n",
+ " tf.math.multiply (TFOpLambda) (None, 74, 32, 24) 0 ['batch_normalization[0][0]'] \n",
+ " \n",
+ " tf.math.truediv (TFOpLambda) (None, 74, 32, 24) 0 ['tf.math.multiply[0][0]'] \n",
+ " \n",
+ " tf.math.maximum (TFOpLambda) (None, 74, 32, 24) 0 ['tf.math.truediv[0][0]', \n",
+ " 'batch_normalization[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_1 (TFOpLambda) (None, 74, 32, 24) 0 ['tf.math.maximum[0][0]'] \n",
+ " \n",
+ " conv2d_1 (Conv2D) (None, 74, 32, 24) 1728 ['tf.math.maximum_1[0][0]'] \n",
+ " \n",
+ " batch_normalization_1 (BatchNo (None, 74, 32, 24) 96 ['conv2d_1[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_1 (TFOpLambda (None, 74, 32, 24) 0 ['batch_normalization_1[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_1 (TFOpLambda) (None, 74, 32, 24) 0 ['tf.math.multiply_1[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_2 (TFOpLambda) (None, 74, 32, 24) 0 ['tf.math.truediv_1[0][0]', \n",
+ " 'batch_normalization_1[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_3 (TFOpLambda) (None, 74, 32, 24) 0 ['tf.math.maximum_2[0][0]'] \n",
+ " \n",
+ " conv2d_2 (Conv2D) (None, 72, 32, 24) 1728 ['tf.math.maximum_3[0][0]'] \n",
+ " \n",
+ " batch_normalization_2 (BatchNo (None, 72, 32, 24) 96 ['conv2d_2[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_2 (TFOpLambda (None, 72, 32, 24) 0 ['batch_normalization_2[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_2 (TFOpLambda) (None, 72, 32, 24) 0 ['tf.math.multiply_2[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_4 (TFOpLambda) (None, 72, 32, 24) 0 ['tf.math.truediv_2[0][0]', \n",
+ " 'batch_normalization_2[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_5 (TFOpLambda) (None, 72, 32, 24) 0 ['tf.math.maximum_4[0][0]'] \n",
+ " \n",
+ " max_pooling2d (MaxPooling2D) (None, 36, 16, 24) 0 ['tf.math.maximum_5[0][0]'] \n",
+ " \n",
+ " conv2d_3 (Conv2D) (None, 36, 16, 48) 3456 ['max_pooling2d[0][0]'] \n",
+ " \n",
+ " batch_normalization_3 (BatchNo (None, 36, 16, 48) 192 ['conv2d_3[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_3 (TFOpLambda (None, 36, 16, 48) 0 ['batch_normalization_3[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_3 (TFOpLambda) (None, 36, 16, 48) 0 ['tf.math.multiply_3[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_6 (TFOpLambda) (None, 36, 16, 48) 0 ['tf.math.truediv_3[0][0]', \n",
+ " 'batch_normalization_3[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_7 (TFOpLambda) (None, 36, 16, 48) 0 ['tf.math.maximum_6[0][0]'] \n",
+ " \n",
+ " conv2d_4 (Conv2D) (None, 34, 16, 48) 6912 ['tf.math.maximum_7[0][0]'] \n",
+ " \n",
+ " batch_normalization_4 (BatchNo (None, 34, 16, 48) 192 ['conv2d_4[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_4 (TFOpLambda (None, 34, 16, 48) 0 ['batch_normalization_4[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_4 (TFOpLambda) (None, 34, 16, 48) 0 ['tf.math.multiply_4[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_8 (TFOpLambda) (None, 34, 16, 48) 0 ['tf.math.truediv_4[0][0]', \n",
+ " 'batch_normalization_4[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_9 (TFOpLambda) (None, 34, 16, 48) 0 ['tf.math.maximum_8[0][0]'] \n",
+ " \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " conv2d_5 (Conv2D) (None, 34, 16, 48) 6912 ['tf.math.maximum_9[0][0]'] \n",
+ " \n",
+ " batch_normalization_5 (BatchNo (None, 34, 16, 48) 192 ['conv2d_5[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_5 (TFOpLambda (None, 34, 16, 48) 0 ['batch_normalization_5[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_5 (TFOpLambda) (None, 34, 16, 48) 0 ['tf.math.multiply_5[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_10 (TFOpLambda (None, 34, 16, 48) 0 ['tf.math.truediv_5[0][0]', \n",
+ " ) 'batch_normalization_5[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_11 (TFOpLambda (None, 34, 16, 48) 0 ['tf.math.maximum_10[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_6 (Conv2D) (None, 32, 16, 48) 6912 ['tf.math.maximum_11[0][0]'] \n",
+ " \n",
+ " batch_normalization_6 (BatchNo (None, 32, 16, 48) 192 ['conv2d_6[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_6 (TFOpLambda (None, 32, 16, 48) 0 ['batch_normalization_6[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_6 (TFOpLambda) (None, 32, 16, 48) 0 ['tf.math.multiply_6[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_12 (TFOpLambda (None, 32, 16, 48) 0 ['tf.math.truediv_6[0][0]', \n",
+ " ) 'batch_normalization_6[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_13 (TFOpLambda (None, 32, 16, 48) 0 ['tf.math.maximum_12[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " max_pooling2d_1 (MaxPooling2D) (None, 32, 8, 48) 0 ['tf.math.maximum_13[0][0]'] \n",
+ " \n",
+ " conv2d_7 (Conv2D) (None, 32, 8, 72) 10368 ['max_pooling2d_1[0][0]'] \n",
+ " \n",
+ " batch_normalization_7 (BatchNo (None, 32, 8, 72) 288 ['conv2d_7[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_7 (TFOpLambda (None, 32, 8, 72) 0 ['batch_normalization_7[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_7 (TFOpLambda) (None, 32, 8, 72) 0 ['tf.math.multiply_7[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_14 (TFOpLambda (None, 32, 8, 72) 0 ['tf.math.truediv_7[0][0]', \n",
+ " ) 'batch_normalization_7[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_15 (TFOpLambda (None, 32, 8, 72) 0 ['tf.math.maximum_14[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_8 (Conv2D) (None, 30, 8, 72) 15552 ['tf.math.maximum_15[0][0]'] \n",
+ " \n",
+ " batch_normalization_8 (BatchNo (None, 30, 8, 72) 288 ['conv2d_8[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_8 (TFOpLambda (None, 30, 8, 72) 0 ['batch_normalization_8[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_8 (TFOpLambda) (None, 30, 8, 72) 0 ['tf.math.multiply_8[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_16 (TFOpLambda (None, 30, 8, 72) 0 ['tf.math.truediv_8[0][0]', \n",
+ " ) 'batch_normalization_8[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_17 (TFOpLambda (None, 30, 8, 72) 0 ['tf.math.maximum_16[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_9 (Conv2D) (None, 30, 8, 72) 15552 ['tf.math.maximum_17[0][0]'] \n",
+ " \n",
+ " batch_normalization_9 (BatchNo (None, 30, 8, 72) 288 ['conv2d_9[0][0]'] \n",
+ " rmalization) \n",
+ " \n",
+ " tf.math.multiply_9 (TFOpLambda (None, 30, 8, 72) 0 ['batch_normalization_9[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.truediv_9 (TFOpLambda) (None, 30, 8, 72) 0 ['tf.math.multiply_9[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_18 (TFOpLambda (None, 30, 8, 72) 0 ['tf.math.truediv_9[0][0]', \n",
+ " ) 'batch_normalization_9[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_19 (TFOpLambda (None, 30, 8, 72) 0 ['tf.math.maximum_18[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_10 (Conv2D) (None, 28, 8, 72) 15552 ['tf.math.maximum_19[0][0]'] \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " \n",
+ " batch_normalization_10 (BatchN (None, 28, 8, 72) 288 ['conv2d_10[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_10 (TFOpLambd (None, 28, 8, 72) 0 ['batch_normalization_10[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_10 (TFOpLambda (None, 28, 8, 72) 0 ['tf.math.multiply_10[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_20 (TFOpLambda (None, 28, 8, 72) 0 ['tf.math.truediv_10[0][0]', \n",
+ " ) 'batch_normalization_10[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_21 (TFOpLambda (None, 28, 8, 72) 0 ['tf.math.maximum_20[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " max_pooling2d_2 (MaxPooling2D) (None, 14, 4, 72) 0 ['tf.math.maximum_21[0][0]'] \n",
+ " \n",
+ " conv2d_11 (Conv2D) (None, 14, 4, 96) 20736 ['max_pooling2d_2[0][0]'] \n",
+ " \n",
+ " batch_normalization_11 (BatchN (None, 14, 4, 96) 384 ['conv2d_11[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_11 (TFOpLambd (None, 14, 4, 96) 0 ['batch_normalization_11[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_11 (TFOpLambda (None, 14, 4, 96) 0 ['tf.math.multiply_11[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_22 (TFOpLambda (None, 14, 4, 96) 0 ['tf.math.truediv_11[0][0]', \n",
+ " ) 'batch_normalization_11[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_23 (TFOpLambda (None, 14, 4, 96) 0 ['tf.math.maximum_22[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_12 (Conv2D) (None, 12, 4, 96) 27648 ['tf.math.maximum_23[0][0]'] \n",
+ " \n",
+ " batch_normalization_12 (BatchN (None, 12, 4, 96) 384 ['conv2d_12[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_12 (TFOpLambd (None, 12, 4, 96) 0 ['batch_normalization_12[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_12 (TFOpLambda (None, 12, 4, 96) 0 ['tf.math.multiply_12[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_24 (TFOpLambda (None, 12, 4, 96) 0 ['tf.math.truediv_12[0][0]', \n",
+ " ) 'batch_normalization_12[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_25 (TFOpLambda (None, 12, 4, 96) 0 ['tf.math.maximum_24[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_13 (Conv2D) (None, 12, 4, 96) 27648 ['tf.math.maximum_25[0][0]'] \n",
+ " \n",
+ " batch_normalization_13 (BatchN (None, 12, 4, 96) 384 ['conv2d_13[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_13 (TFOpLambd (None, 12, 4, 96) 0 ['batch_normalization_13[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_13 (TFOpLambda (None, 12, 4, 96) 0 ['tf.math.multiply_13[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_26 (TFOpLambda (None, 12, 4, 96) 0 ['tf.math.truediv_13[0][0]', \n",
+ " ) 'batch_normalization_13[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_27 (TFOpLambda (None, 12, 4, 96) 0 ['tf.math.maximum_26[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_14 (Conv2D) (None, 10, 4, 96) 27648 ['tf.math.maximum_27[0][0]'] \n",
+ " \n",
+ " batch_normalization_14 (BatchN (None, 10, 4, 96) 384 ['conv2d_14[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_14 (TFOpLambd (None, 10, 4, 96) 0 ['batch_normalization_14[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_14 (TFOpLambda (None, 10, 4, 96) 0 ['tf.math.multiply_14[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_28 (TFOpLambda (None, 10, 4, 96) 0 ['tf.math.truediv_14[0][0]', \n",
+ " ) 'batch_normalization_14[0][0]'] \n",
+ " \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " tf.math.maximum_29 (TFOpLambda (None, 10, 4, 96) 0 ['tf.math.maximum_28[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " max_pooling2d_3 (MaxPooling2D) (None, 10, 2, 96) 0 ['tf.math.maximum_29[0][0]'] \n",
+ " \n",
+ " conv2d_15 (Conv2D) (None, 10, 2, 96) 27648 ['max_pooling2d_3[0][0]'] \n",
+ " \n",
+ " batch_normalization_15 (BatchN (None, 10, 2, 96) 384 ['conv2d_15[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_15 (TFOpLambd (None, 10, 2, 96) 0 ['batch_normalization_15[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_15 (TFOpLambda (None, 10, 2, 96) 0 ['tf.math.multiply_15[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_30 (TFOpLambda (None, 10, 2, 96) 0 ['tf.math.truediv_15[0][0]', \n",
+ " ) 'batch_normalization_15[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_31 (TFOpLambda (None, 10, 2, 96) 0 ['tf.math.maximum_30[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_16 (Conv2D) (None, 8, 2, 96) 27648 ['tf.math.maximum_31[0][0]'] \n",
+ " \n",
+ " batch_normalization_16 (BatchN (None, 8, 2, 96) 384 ['conv2d_16[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_16 (TFOpLambd (None, 8, 2, 96) 0 ['batch_normalization_16[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_16 (TFOpLambda (None, 8, 2, 96) 0 ['tf.math.multiply_16[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_32 (TFOpLambda (None, 8, 2, 96) 0 ['tf.math.truediv_16[0][0]', \n",
+ " ) 'batch_normalization_16[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_33 (TFOpLambda (None, 8, 2, 96) 0 ['tf.math.maximum_32[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_17 (Conv2D) (None, 8, 2, 96) 27648 ['tf.math.maximum_33[0][0]'] \n",
+ " \n",
+ " batch_normalization_17 (BatchN (None, 8, 2, 96) 384 ['conv2d_17[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_17 (TFOpLambd (None, 8, 2, 96) 0 ['batch_normalization_17[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_17 (TFOpLambda (None, 8, 2, 96) 0 ['tf.math.multiply_17[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_34 (TFOpLambda (None, 8, 2, 96) 0 ['tf.math.truediv_17[0][0]', \n",
+ " ) 'batch_normalization_17[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_35 (TFOpLambda (None, 8, 2, 96) 0 ['tf.math.maximum_34[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " conv2d_18 (Conv2D) (None, 6, 2, 96) 27648 ['tf.math.maximum_35[0][0]'] \n",
+ " \n",
+ " batch_normalization_18 (BatchN (None, 6, 2, 96) 384 ['conv2d_18[0][0]'] \n",
+ " ormalization) \n",
+ " \n",
+ " tf.math.multiply_18 (TFOpLambd (None, 6, 2, 96) 0 ['batch_normalization_18[0][0]'] \n",
+ " a) \n",
+ " \n",
+ " tf.math.truediv_18 (TFOpLambda (None, 6, 2, 96) 0 ['tf.math.multiply_18[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " tf.math.maximum_36 (TFOpLambda (None, 6, 2, 96) 0 ['tf.math.truediv_18[0][0]', \n",
+ " ) 'batch_normalization_18[0][0]'] \n",
+ " \n",
+ " tf.math.maximum_37 (TFOpLambda (None, 6, 2, 96) 0 ['tf.math.maximum_36[0][0]'] \n",
+ " ) \n",
+ " \n",
+ " max_pooling2d_4 (MaxPooling2D) (None, 3, 1, 96) 0 ['tf.math.maximum_37[0][0]'] \n",
+ " \n",
+ " conv2d_19 (Conv2D) (None, 1, 1, 96) 27648 ['max_pooling2d_4[0][0]'] \n",
+ " \n",
+ "==================================================================================================\n",
+ "Total params: 332,088\n",
+ "Trainable params: 329,448\n",
+ "Non-trainable params: 2,640\n",
+ "__________________________________________________________________________________________________\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Recreate the embedding model after the melspectrogram layers\n",
+ "# That is, have the melspectrogram of the audio as the input instead of the raw audio\n",
+ "\n",
+ "# A custom function for the leaky relu activation function, to make exporting to ONNX/tflite easier\n",
+ "def MyLeakyReLU(alpha = 0.20000000298023224*2):\n",
+ " return lambda x : tf.keras.backend.maximum(alpha * x/2, x)\n",
+ "\n",
+ "# Define convolutional block helper functions\n",
+ "def batch_norm_and_activation(x):\n",
+ " x = tf.keras.layers.BatchNormalization()(x)\n",
+ " x = MyLeakyReLU()(x)\n",
+ " x = tf.maximum(x, -0.4000000059604645)\n",
+ " return x\n",
+ "\n",
+ "# Define constraint for zero-mean conv2d layer\n",
+ "class CenterAround(tf.keras.constraints.Constraint):\n",
+ " \"\"\"Constrains weight tensors to be centered around `ref_value`.\"\"\"\n",
+ " def __init__(self, ref_value):\n",
+ " self.ref_value = ref_value\n",
+ "\n",
+ " def __call__(self, w):\n",
+ " mean = tf.reduce_mean(w, axis=(0,1))\n",
+ " return w - mean + self.ref_value\n",
+ "\n",
+ "\n",
+ "# Construct inputs\n",
+ "inputs = tf.keras.Input((76, 32, 1)) # melspectrogram shape when provided with 12400 samples at 16 kHz\n",
+ "\n",
+ "# Input conv block\n",
+ "x = tf.keras.layers.ZeroPadding2D((0,1))(inputs)\n",
+ "x = tf.keras.layers.Conv2D(24, (3,3), use_bias=False, kernel_constraint=CenterAround(0.0),\n",
+ " activation='relu', padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "\n",
+ "# Conv block #1\n",
+ "x = tf.keras.layers.Conv2D(24, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(24, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.MaxPool2D((2,2), (2,2), padding='valid')(x)\n",
+ "x = tf.keras.layers.Conv2D(48, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(48, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "\n",
+ "# Conv block #2\n",
+ "x = tf.keras.layers.Conv2D(48, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(48, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.MaxPool2D((1,2), (1,2), padding='same')(x)\n",
+ "x = tf.keras.layers.Conv2D(72, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(72, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "\n",
+ "# Conv block #3\n",
+ "x = tf.keras.layers.Conv2D(72, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(72, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.MaxPool2D((2,2), (2,2), padding='valid')(x)\n",
+ "x = tf.keras.layers.Conv2D(96, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(96, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "\n",
+ "# Conv block #4\n",
+ "x = tf.keras.layers.Conv2D(96, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(96, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.MaxPool2D((1,2), (1,2), padding='valid')(x)\n",
+ "x = tf.keras.layers.Conv2D(96, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(96, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "\n",
+ "# Conv block #5\n",
+ "x = tf.keras.layers.Conv2D(96, (1,3), use_bias=False, padding='same')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.Conv2D(96, (3,1), use_bias=False, padding='valid')(x)\n",
+ "x = batch_norm_and_activation(x)\n",
+ "x = tf.keras.layers.MaxPool2D((2,2), (2,2), padding=\"valid\")(x)\n",
+ "x = tf.keras.layers.Conv2D(96, (3,1), use_bias=False, padding='valid')(x)\n",
+ "\n",
+ "# Build the keras model\n",
+ "reimplemented_model = tf.keras.Model(inputs=inputs, outputs=x)\n",
+ "reimplemented_model.summary()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "3a83d707",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:28:05.030126Z",
+ "start_time": "2024-01-18T00:28:05.008907Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Manually set the weights of the new Keras model with those from the original embedding model\n",
+ "\n",
+ "# Set weights for all layers\n",
+ "reimplemented_model.set_weights(embedding_model.get_weights())\n",
+ "\n",
+ "# Adjust weights of specific layer that needs to be centered around 0.0\n",
+ "reimplemented_model.layers[2].set_weights([CenterAround(0.0)(reimplemented_model.layers[2].weights[0])])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eba224b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Convert the new keras model to tflite format (optional for this notebook)\n",
+ "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+ "tflite_model = converter.convert()\n",
+ "\n",
+ "# Save the model.\n",
+ "with open('embedding_model.tflite', 'wb') as f:\n",
+ " f.write(tflite_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7cfc246b",
+ "metadata": {},
+ "source": [
+ "# Compare Predictions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4907f757",
+ "metadata": {},
+ "source": [
+ "Now that we have a re-implemented embedding model, we can verify that the predictions are the same as the original. Note that as discussed previously, the log-mel feature calculation is different, so we will start from the original audio features obtained via tflite and calculate the final embeddings from there."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "6d6ba4b5",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:29:27.288771Z",
+ "start_time": "2024-01-18T00:29:27.165734Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Get original embedding model prediction\n",
+ "original_embeddings = embedding_model(sample_data)\n",
+ "\n",
+ "# Reshape original log-mel inputs from tflite model above and pass to re-implemented model\n",
+ "reimplemented_embeddings = reimplemented_model(spec.T[None, ..., None])\n",
+ "\n",
+ "# Plot final output embeddings for the sample data\n",
+ "_ = plt.plot(original_embeddings.numpy().flatten(), label=\"Original Google Model\")\n",
+ "_ = plt.plot(reimplemented_embeddings.numpy().flatten(), label=\"Reimplemented Model\")\n",
+ "_ = plt.legend()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "ea6f9936",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-18T00:29:29.894151Z",
+ "start_time": "2024-01-18T00:29:29.882876Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.00010585785"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check maximum absolute difference in the output embeddings to confirm practical equivalence\n",
+ "np.abs(original_embeddings.numpy().flatten() - reimplemented_embeddings.numpy().flatten()).max()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "openwakeword_dev",
+ "language": "python",
+ "name": "openwakeword_dev"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {
+ "height": "calc(100% - 180px)",
+ "left": "10px",
+ "top": "150px",
+ "width": "384px"
+ },
+ "toc_section_display": true,
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/training_models.ipynb b/notebooks/training_models.ipynb
index 6f118fd..43fed78 100644
--- a/notebooks/training_models.ipynb
+++ b/notebooks/training_models.ipynb
@@ -43,16 +43,7 @@
"start_time": "2023-02-18T03:26:24.785801Z"
}
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/dscripka/anaconda3/envs/torch_gpu/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Imports\n",
"\n",
@@ -128,10 +119,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
- " 0%| | 0/5000 [00:00, ?it/s]\n",
- "Reading metadata...: 0it [00:00, ?it/s]\u001b[A\n",
- "Reading metadata...: 16354it [00:00, 108226.87it/s]A\n",
- "100%|██████████| 5000/5000 [01:06<00:00, 74.81it/s]\n"
+ "Reading metadata...: 16354it [00:00, 26183.62it/s]\n",
+ "100%|██████████| 5000/5000 [00:44<00:00, 112.28it/s]\n"
]
}
],
@@ -143,14 +132,14 @@
"cv_11 = iter(cv_11)\n",
"\n",
"# Convert and save clips (only first 5000)\n",
- "if not os.path.exists(\"cv11_test_clips\"):\n",
- " os.mkdir(\"cv11_test_clips\")\n",
- " \n",
"limit = 5000\n",
"for i in tqdm(range(limit)):\n",
" example = next(cv_11)\n",
+ " output = os.path.join(\"cv11_test_clips\", example[\"path\"][0:-4] + \".wav\")\n",
+ " os.makedirs(os.path.dirname(output), exist_ok=True)\n",
+ "\n",
" wav_data = (example[\"audio\"][\"array\"]*32767).astype(np.int16) # convert to 16-bit PCM format\n",
- " scipy.io.wavfile.write(os.path.join(\"cv11_test_clips\", example[\"path\"][0:-4] + \".wav\"), 16000, wav_data)\n"
+ " scipy.io.wavfile.write(output, 16000, wav_data)\n"
]
},
{
@@ -719,7 +708,7 @@
},
{
"data": {
- "image/png": "\n",
+ "image/png": "",
"text/plain": [
""
]
@@ -776,7 +765,7 @@
},
{
"data": {
- "image/png": "\n",
+ "image/png": "",
"text/plain": [
""
]
@@ -872,7 +861,7 @@
},
{
"data": {
- "image/png": "\n",
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGiCAYAAAA1LsZRAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAApJ0lEQVR4nO3df3RV1Z338c+5N8kNRRMVMD8UYrTSaqPMNKk2Uez4o3HQOuNqpzLjPAUtuMwoIkRdFllT1OWaOD5rKP0F2hF0XGMr04qVeaRK+oz8UOwzNSYWgcfyFDTRJmSCYxJAE3Lvfv5Izs39GbiBnHPuue/XWndBzj2X7M1B81l7f/feljHGCAAAwCUBtxsAAAByG2EEAAC4ijACAABcRRgBAACuIowAAABXEUYAAICrCCMAAMBVhBEAAOAqwggAAHAVYQQAALgq4zCybds23XDDDSovL5dlWfrlL395zM9s3bpV1dXVKiws1LnnnqvHH398PG0FAAA+lHEYOXz4sGbNmqUf/ehHx3X//v37dd1112n27NlqbW3VAw88oMWLF+v555/PuLEAAMB/rBM5KM+yLL3wwgu68cYb095z//33a+PGjdqzZ0/0WkNDg95++2298cYb4/3WAADAJ/Im+hu88cYbqq+vj7t27bXXau3atTp69Kjy8/OTPjMwMKCBgYHo15FIRB999JGmTJkiy7ImuskAAOAkMMaov79f5eXlCgTST8ZMeBjp6upSSUlJ3LWSkhINDQ2pp6dHZWVlSZ9pamrSQw89NNFNAwAADujo6NDZZ5+d9v0JDyOSkkYz7JmhdKMcy5YtU2NjY/Tr3t5ezZgxQx0dHSoqKpq4hgIAgJOmr69P06dP16mnnjrmfRMeRkpLS9XV1RV3rbu7W3l5eZoyZUrKz4RCIYVCoaTrRUVFhBEAALLMsUosJnyfkdraWjU3N8dd27x5s2pqalLWiwAAgNyScRg5dOiQ2tra1NbWJml46W5bW5va29slDU+xzJs3L3p/Q0OD3n//fTU2NmrPnj1at26d1q5dq3vvvffk9AAAAGS1jKdp3nzzTV155ZXRr+3ajvnz5+vpp59WZ2dnNJhIUmVlpTZt2qSlS5fqxz/+scrLy/WDH/xA3/jGN05C8wEAQLY7oX1GnNLX16fi4mL19vZSMwIAQJY43p/fnE0DAABcRRgBAACuIowAAABXEUYAAICrCCMAAMBVhBEAAOAqwggAAHAVYQQAALiKMAIAAFxFGAEAAK4ijAAAAFcRRgAAgKsIIwAAwFWEEQAA4CrCCAAAcBVhBAAAuIowAgAAXEUYAQAAriKMAAAAVxFGAACAqwgjAADAVYQRAADgKsIIAABwFWEEAAC4ijACAABcRRgBAACuIowAAABXEUYAAICrCCMAAMBVhBEAAOAqwggAAHAVYQQAALiKMAIAAFxFGAEAAK4ijAAAAFcRRgAAgKsIIwAAwFWEEQAA4CrCCAAAcBVhBAAAuIowAgAAXEUYAQAAriKMAAAAVxFGAACAqwgjAADAVYQRAADgKsIIAABwFWEEAAC4ijACAABcRRgBAACuIowAAABXEUYAAICrCCMAAMBVhBEAAOAqwggAAHAVYQQAALiKMAIAAFxFGAEAAK4ijAAAAFcRRgAAgKsIIwAAwFWEEQAA4CrCCAAAHhSJGBlj3G6GI8YVRlavXq3KykoVFhaqurpa27dvH/P+Z599VrNmzdJnPvMZlZWV6dZbb9XBgwfH1WAAAPxuYCisa763Vbc90+J2UxyRcRhZv369lixZouXLl6u1tVWzZ8/WnDlz1N7envL+1157TfPmzdOCBQu0a9cu/fznP9dvf/tbLVy48IQbDwCAH+34fwe1778O69d7DrjdFEdkHEZWrlypBQsWaOHChbrgggu0atUqTZ8+XWvWrEl5/29+8xudc845Wrx4sSorK3X55Zfr9ttv15tvvnnCjQcAwI8ODQy53QRHZRRG
BgcH1dLSovr6+rjr9fX12rFjR8rP1NXV6YMPPtCmTZtkjNGBAwf0i1/8Qtdff33a7zMwMKC+vr64FwAAueIwYSS9np4ehcNhlZSUxF0vKSlRV1dXys/U1dXp2Wef1dy5c1VQUKDS0lKddtpp+uEPf5j2+zQ1Nam4uDj6mj59eibNBAAgqx0eDLvdBEeNq4DVsqy4r40xSddsu3fv1uLFi/Xd735XLS0tevnll7V//341NDSk/fOXLVum3t7e6Kujo2M8zQQAICsdybGRkbxMbp46daqCwWDSKEh3d3fSaImtqalJl112me677z5J0sUXX6zJkydr9uzZeuSRR1RWVpb0mVAopFAolEnTAADwjUODuRVGMhoZKSgoUHV1tZqbm+OuNzc3q66uLuVnjhw5okAg/tsEg0FJypn10wAAZOLIANM0Y2psbNSTTz6pdevWac+ePVq6dKna29uj0y7Lli3TvHnzovffcMMN2rBhg9asWaN9+/bp9ddf1+LFi3XJJZeovLz85PUEAACfOJxjIyMZTdNI0ty5c3Xw4EE9/PDD6uzsVFVVlTZt2qSKigpJUmdnZ9yeI7fccov6+/v1ox/9SPfcc49OO+00XXXVVfrHf/zHk9cLAAB8JNdW01gmC+ZK+vr6VFxcrN7eXhUVFbndHAAAJtT/ePL/6LX/1yNJeu/R9FtheN3x/vzmbBoAADwmdpomC8YMThhhBAAAj4ktYI34P4sQRgAA8JrY7eAZGQEAAI47EjtN42I7nEIYAQDAYw7HTdP4P44QRgAA8JjBcCT6+xzIIoQRAAC8ZHAocuybfIYwAgCAhyRueMY0DQAAcNShhDCSA1mEMAIAgJf0f5oQRlxqh5MIIwAAeEjyyIj/4whhBAAADzkymFgz4lJDHEQYAQDAQ5IKVgkjAADASeGElb0mB9IIYQQAAA8JJ8zLME0DAAAclThNQwErAABwVOLIiP+jCGEEAABPSR4ZcakhDiKMAADgIUkjIzmQRggjAAB4CNM0AADAVUzTAAAAVyXuM8KpvQAAwFHhxJERl9rhJMIIAAAeEqGAFQAAuCl5NY1LDXEQYQQAAA+hgBUAALgqeWmv/9MIYQQAAA9JPBiPkREAAOCoxGkalvYCAABHsQMrAABwFatpAACAq5JX0/g/jRBGAADwEKZpAACAq5K2g8+BNEIYAQDAQ5K2g8+BsRHCCAAAHpJ0am8k9X1+QhgBAMBDkgpYGRkBAABOYmkvAABwFQWsAADAVRSwAgAAVzFNAwAAXJU4TcNBeQAAwFGJ2cP/UYQwAgCApzBNAwAAXJU4TZMLYyOEEQAAPCRxNU3E/1mEMAIAgJcwTQMAAFyVtB18DqQRwggAAB6SODLCNA0AAHBUOGlpr//TCGEEAAAPSSxgzYEsQhgBAMBLkgpYXWqHkwgjAAB4CNvBAwAAVyWd2uv/LEIYAQDASxJHRnIgixBGAADwkuQdWP0fRwgjAAB4SNLZNP7PIoQRAAC8JBKJ/5p9RgAAgKMSp2USw4kfEUYAAPAQ9hkBAACuSlpNQwErAABwUtI+Iy61w0mEEQAAPMQeGQkGLEmMjKS1evVqVVZWqrCwUNXV1dq+ffuY9w8MDGj58uWqqKhQKBTSeeedp3Xr1o2rwQAA+JldsDoaRlxsjEPyMv3A+vXrtWTJEq1evVqXXXaZnnjiCc2ZM0e7d+/WjBkzUn7mpptu0oEDB7R27Vp99rOfVXd3t4aGhk648QAA+I1dwJoXsDSo3JimyTiMrFy5UgsWLNDChQslSatWrdIrr7yiNWvWqKmpKen+l19+WVu3btW+fft0xhlnSJLOOeecE2s1AAA+ZU/T5I2MjLADa4LBwUG1tLSovr4+7np9fb127NiR8jMbN25UTU2NHnvsMZ111lmaOXOm7r33Xn3yySdpv8/AwID6+vriXgAA5AK7gDUvOPwjOgeySGYjIz09PQqHwyopKYm7XlJSoq6urpSf2bdvn1577TUVFhbq
hRdeUE9Pj+644w599NFHaetGmpqa9NBDD2XSNAAAfCGpgNXNxjhkXAWslmXFfW2MSbpmi0QisixLzz77rC655BJdd911WrlypZ5++um0oyPLli1Tb29v9NXR0TGeZgIAkHVia0ak3FhNk9HIyNSpUxUMBpNGQbq7u5NGS2xlZWU666yzVFxcHL12wQUXyBijDz74QOeff37SZ0KhkEKhUCZNAwDAF0anaXJnNU1GIyMFBQWqrq5Wc3Nz3PXm5mbV1dWl/Mxll12mP/7xjzp06FD02u9//3sFAgGdffbZ42gyAAD+NVrAOlIzkgMTNRlP0zQ2NurJJ5/UunXrtGfPHi1dulTt7e1qaGiQNDzFMm/evOj9N998s6ZMmaJbb71Vu3fv1rZt23Tffffp29/+tiZNmnTyegIAgA/YG7Cyz8gY5s6dq4MHD+rhhx9WZ2enqqqqtGnTJlVUVEiSOjs71d7eHr3/lFNOUXNzs+666y7V1NRoypQpuummm/TII4+cvF4AAOATkUji0l43W+MMy2RBZUxfX5+Ki4vV29uroqIit5sDAMCEqVz2koyRqs4q0jsf9ul//tXF+mbNdLebNS7H+/Obs2kAAPAIY0x0WiYYrRnxP8IIAAAeEY6Zk8nPoaW9hBEAADwiHBM8cqmAlTACAIBH2Cf2SjH7jLjUFicRRgAA8Ij4kZHcOZuGMAIAgEekqhnh1F4AAOCYSEwYCXBQHgAAcFrsNI296VkuzNMQRgAA8Ah7ZMSypICVOzuwEkYAAPAIe2QkaFlSdGDE/2mEMAIAgEfYBayBgGVnEWpGAACAc+x9RoKWFZ2myYGBEcIIAABeYS/jDQYsjWQRlvYCAADn2DUjgdGSkZxAGAEAwCPs1TTBANM0AADABeGYaRoxTQMAAJwWXU1jWbLX0/g/ihBGAADwjOhqmoClHNqAlTACAIBXjBawspoGAAC4IBxTwGrl0HoawggAAB6Rap8RtoMHAACOGS1glSyW9gIAAKfF7jMyWjPiYoMcQhgBAMAj4gpYR66ZHFjcSxgBAMAjwuzACgAA3BRJsbSXAlYAAOCY8MimZ4FA7DSN/xFGAADwiOjSXlbTAAAAN6ReTeP/NEIYAQDAI+JX03BQHgAAcFg4xchIDgyMEEYAAPCK2O3go6f25sDYCGEEAACPiK6msSwKWAEAgPPiClhHrrHPCAAAcExcASsjIwAAwGmjBazioDwAAOC82AJWDsoDAACOs0dG4s+mcbFBDiGMAADgEfaUTOypvbmAMAIAgEdEYkdG7Gs5MDRCGAEAwCMiMatpxGoaAADgtNGlvWIHVgAA4Lz4Tc+G0whLewEAgGPs4BHgoDwAAOCG0aW90uhaGv+nEcIIAAAeEd30zLIUGCkaiUTcbJEzCCMAAHhEdDVNIHZchJERAADgkPDIKAg7sAIAAFfEnk1j78CaA1mEMAIAgFfYS3utmAJWdmAFAACOCccUsFqjx/b6HmEEAACPMCkOysuBLEIYAQDAK8LRaZrR1TRM0wAAAMfET9NwUB4AAHCYia6mGS1gzYEsQhgBAMArYqdpoqf25sDQCGEEAACPsDc9CwaYpgEAAC6wR0EClkZ3YM2BiRrCCAAAHhGOhhFGRgAAgAsiMfuMsAMrAABwnL0dPAflAQAAV9iraQIBS/bYSA5kEcIIAABeEYnZ9IylvcewevVqVVZWqrCwUNXV1dq+fftxfe71119XXl6e/uRP/mQ83xYAAF+LpFpN4/8sknkYWb9+vZYsWaLly5ertbVVs2fP1pw5c9Te3j7m53p7ezVv3jxdffXV424sAAB+xjTNcVq5cqUWLFighQsX6oILLtCqVas0ffp0rVmzZszP3X777br55ptVW1t7zO8xMDCgvr6+uBcAAH4XXU0TV8Dq/ziSURgZHBxUS0uL6uvr467X19drx44daT/31FNP6Q9/+INWrFhxXN+nqalJxcXF0df06dMzaSYAAFkpOk0TGD25N+L/LJJZ
GOnp6VE4HFZJSUnc9ZKSEnV1daX8zN69e/Wd73xHzz77rPLy8o7r+yxbtky9vb3RV0dHRybNBAAgK4Vjl/aOXMuBLKLjSwcJ7LRmM8YkXZOkcDism2++WQ899JBmzpx53H9+KBRSKBQaT9MAAMhakZgdWAOB4d/nwjRNRmFk6tSpCgaDSaMg3d3dSaMlktTf368333xTra2tWrRokSQpEonIGKO8vDxt3rxZV1111Qk0HwAA/4jEHJQXjrAdfEoFBQWqrq5Wc3Nz3PXm5mbV1dUl3V9UVKSdO3eqra0t+mpoaNDnPvc5tbW16dJLLz2x1gMA4CMRk2IH1hyYqMl4mqaxsVHf+ta3VFNTo9raWv3kJz9Re3u7GhoaJA3Xe3z44Yd65plnFAgEVFVVFff5M888U4WFhUnXAQDIdeGYfUZsuTAyknEYmTt3rg4ePKiHH35YnZ2dqqqq0qZNm1RRUSFJ6uzsPOaeIwAAIJl9Nk0wYCmQQ6f2WiYLKmP6+vpUXFys3t5eFRUVud0cAAAmxA0/fE07P+zVU7d+SYcHhrTop626tPIMrb/92Ht0edHx/vzmbBoAADwifmkvO7ACAACHpTooLxfSCGEEAACPSHVQXsT71RQnjDACAIBHxB6UJ6ZpAACA0+xBkGCAg/IAAIALYvcZCXBQHgAAcFquHpRHGAEAwCNSTdPkwq5nhBEAADwidmQkugOrmw1yCGEEAACPCMcclCeW9gIAAKfZK2eCgZiaEf9nEcIIAABeMTpNo5w6KI8wAgCAR9jLeAMxBaxM0wAAAMdEUhyUlwsIIwAAeEQ45qC80R1YXWyQQwgjAAB4RPSgvMDoQXkmBxb3EkYAAPCISGT419hpGraDBwAAjgnHLu3loDwAAOC0iGEHVgAA4BJjTLRYNWCJAlYAAOCscExxSPwOrP5PI4QRAAA8ILZQ1bIsWUzTAAAAJ8XutBpkB1YAAOC0uDBicVAeAABwWGzNiGVpdJqGMAIAAJxgb3gmDU/TBHLnaBrCCAAAXpA8TWMlXfcrwggAAB4QNonTNMO/z4EsQhgBAMALIhF791V7ae/wdQ7KAwAAjrDrV4MjxSIclAcAABxlT9PYq2iYpgEAAI6yp2mCCWEkF/ZgJYwAAOABoyf2auRX9hkBAAAOsjc9C0RrRoaxtBcAADgiqYA1uprG/wgjAAB4wOg0jR1GmKYBAAAOik7TWEzTAAAAF9ihIzjyk9nKoXkawggAAB5gH5Rnj4wEcieLEEYAAPCCcGLNCAflAQAAJ0ULWKPTNMO/5kAWIYwAAOAFiTuw2jgoDwAAOMLeZ8Te9Mz+lZERAADgiHRLewkjAADAEdGlvYmn9jJNAwAAnDBawGov7WWaBgAAOGh0mmb4a3ZgBQAAjhrdgTU+jfg/ihBGAADwhMQdWO1Nz3JgYIQwAgCAF4zuwKq4XyXJ+DyREEYAAPAAkzBNY8VsfubzLEIYAQDAC8Ij0zRWwj4jkv/rRggjAAB4QDhhn5FA3MiIv+MIYQQAAA9InKaJHRqJ+DuLEEYAAPACe58Re0Ak9rw8v+/CShgBAMAD7DASDKSapnGlSY4hjAAA4AF24AimKmAljAAAgIlmF7BaCQflSUzTAAAAB4xO0wx/bYlpGgAA4CAT3YE11ciIvxFGAADwgOipvYHkMOL3k3sJIwAAeEAkqYCVaRoAAOCgyBgH5fl9nmZcYWT16tWqrKxUYWGhqqurtX379rT3btiwQV/96lc1bdo0FRUVqba2Vq+88sq4GwwAgB8lT9OMphGmaRKsX79eS5Ys0fLly9Xa2qrZs2drzpw5am9vT3n/tm3b9NWvflWbNm1SS0uLrrzySt1www1qbW094cYDAOAXydM0o/wdRaS8TD+wcuVKLViwQAsXLpQkrVq1Sq+88orWrFmjpqampPtXrVoV9/U//MM/6MUXX9S///u/60//9E9Tfo+BgQENDAxEv+7r68u0mQAAZJXIWKtpGBkZNTg4qJaW
FtXX18ddr6+v144dO47rz4hEIurv79cZZ5yR9p6mpiYVFxdHX9OnT8+kmQAAZJ2xpmn8HUUyDCM9PT0Kh8MqKSmJu15SUqKurq7j+jP+6Z/+SYcPH9ZNN92U9p5ly5apt7c3+uro6MikmQAAZJ2Iid/0TBodHfF7zUjG0zRSfFqThoePEq+l8rOf/UwPPvigXnzxRZ155plp7wuFQgqFQuNpGgAAWSkSiZ+mkYbrRozk+6GRjEZGpk6dqmAwmDQK0t3dnTRakmj9+vVasGCB/u3f/k3XXHNN5i0FAMDH7ALW2DBin+Ab9vnISEZhpKCgQNXV1Wpubo673tzcrLq6urSf+9nPfqZbbrlFP/3pT3X99dePr6UAAPhY2CSPjNhhZCjs7zCS8TRNY2OjvvWtb6mmpka1tbX6yU9+ovb2djU0NEgarvf48MMP9cwzz0gaDiLz5s3T97//fX35y1+OjqpMmjRJxcXFJ7ErAABkr0gkuWYkPxDQp4pEi1v9KuMwMnfuXB08eFAPP/ywOjs7VVVVpU2bNqmiokKS1NnZGbfnyBNPPKGhoSHdeeeduvPOO6PX58+fr6effvrEewAAgA9El/bGbL0aDI6MjBBGkt1xxx264447Ur6XGDC2bNkynm8BAEBOCUeGf42dpsmza0Z8HkY4mwYAAA+ILu1NUTNy1E4qPkUYAQDAA4Yiw4EjGIgdGRn+Mc3ICAAAmHD2ipn8YEwYyZGaEcIIAAAecHQkjOTFLKcZXdrLNA0AAJhg9jRNXoACVgAA4ILRaZrRH812zQjTNAAAYMLZK2byUtSMMDICAAAmnD36kR9IrhlhaS8AAJhwduDIz6NmBAAAuCA6TROgZgQAALhgrH1GGBkBAAAT7uhI4MijZgQAALhhKNVqGmpGAACAU9hnBAAAuOpoih1Yg0G2gwcAAA4ZSnE2jR1MGBkBAAATzh79KEhxUB41IwAAYMINRkdGRqdp8qkZAQAATrFP7Y3dZ2S0ZoQwAgAAJli0ZiSQXDMSjlDACgAAJljKU3uZpgEAAE6Jntobu5qG7eABAIATjDHRwBG3z0h0O3jCCAAAmECxYSPVPiPUjAAAgAk1FBM2CtgOHgAAOC1+ZMRK+j1LewEAwIQ6GnP2TKqaEUZGAADAhBrdY8SSZcUu7aVmBAAAOCDVHiMSB+UBAACHRPcYCcT/WA6OFLNSMwIAACbUECMjAADATUejJ/YmjIxQMwIAAJxg7zNSkBBG7BN8GRkBAAATanRkJH6aJhigZgQAADggWjMSSF0zwkF5AABgQtkjI/lpakaGqBkBAAAT6Wgk9WoaakYAAIAjRndgTRwZoWYEAAA4wK4ZyU+zzwg1IwAAYEIdjaQeGcmjZgQAADgh7Q6s1IwAAAAn2DUhiZueUTMCAAAckW41DTUjAADAEUNpzqZhmgYAADjiqL2aJs0OrBSwAgCACZX+1N7hr8PUjAAAgIl0rH1GmKYBAAATKu0+I0GmaQAAgAPS7TMSZGQEAAA4wQ4bifuM2CMlxkgRHwcSwggAAC47eoyREcnfoyOEEQAAXJbu1N7YglY/140QRgAAcNnRNKtpGBkBAACOSLfPSOxIiZ/3GiGMAADgMnsKJi9hB9bYLxkZAQAAE8auGclPGBmxLCs6dUPNCAAAmDDpVtNIMXuNME0DAAAmij0FkzgyIo3WjYSZpgEAABOl75OjkqRJ+cGk93JhS3jCCAAALjLGaG/3IUnSudMmJ72fC4flEUYAAHBRd/+Aej85qoAlnTftlKT3qRkBAAAT6vcH+iVJ50yZrMJU0zTUjAAAgIn0btdwGJlZcmrK96kZSWP16tWqrKxUYWGhqqurtX379jHv37p1q6qrq1VYWKhzzz1Xjz/++Lgai+y3+499evr1/To8MOR2UwB4zNFwRP/VP+B2Mxy398BwvcjM0jRhZGSa5vcj9/lRXqYfWL9+vZYsWaLVq1frsssu0xNPPKE5c+Zo9+7dmjFjRtL9+/fv
13XXXafbbrtN//qv/6rXX39dd9xxh6ZNm6ZvfOMbJ6UT47V5V5f2dh+SZUmWrJFfFf+1ZcmSFDFGQxGjcMRoKGwUMaPDZfb99u8lyYp5Txo5/tko7nOx7w9/xkpxLfnPMdE/z8iMXIwYyWjk15HfywzPNeYFLAUDgZE+RBSOSMGAFLCs6CsYkAKBkd9bVlwbMmHSjCIaGe3vOaKfv9mhoYjRL976QNdeWKr8vIDygwFZsvtlYvpp4vprf62R+1K9Z/992M0Yfp6WggFLgZHnGYjp3Oidqdse357RewLW8N+X/ecGEv7CDg+E9b//7wFJ0gWlRTr79EkKBJL/UjN9/rH3pfoz4ts+dh9NymujXwQs+9+OFXc+RtJnEv7iTsZAcib//Kzx/mOdYLF/LybuulJeT/xM/PXYz5gx3kt9PVG6/09F3z/Of2OJ7Yj7bzTVtTH+uw1HItrw1ofq7P1U50z5jGafP01TTwmpIC+gUF5AASv2c7F/Vvz/94yJ/z6Whv9btSz7/3nDv1ojv3eL3b5IxGjb3v+SJM0sSa4XkaT6L5RqzZY/6LsvvqN3u/pVflph3M+UobCJ/rtI+lkW8/DS/ayz37vy82emrFlxgmXS/etP49JLL9UXv/hFrVmzJnrtggsu0I033qimpqak+++//35t3LhRe/bsiV5raGjQ22+/rTfeeCPl9xgYGNDAwGg67u3t1YwZM9TR0aGioqJMmjum+37+tn71TtdJ+/NwfPKDgegGPwAAKZQf0P9adLnKTpuU9F44YnT/L97Wy7sOTGgbHvuri3XdRWUn9c/s6+vT9OnT9fHHH6u4uDj9jSYDAwMDJhgMmg0bNsRdX7x4sbniiitSfmb27Nlm8eLFcdc2bNhg8vLyzODgYMrPrFixwmgk/PLixYsXL168svvV0dExZr7IaJqmp6dH4XBYJSUlcddLSkrU1ZV6hKGrqyvl/UNDQ+rp6VFZWXIKW7ZsmRobG6NfRyIRffTRR5oyZcpJHZK1E9vJHnHxmlzoJ330j1zoZy70UcqNftLHsRlj1N/fr/Ly8jHvy7hmREqeozXGjBkSUt2f6rotFAopFArFXTvttNPG0dLjU1RU5Nt/RLFyoZ/00T9yoZ+50EcpN/pJH9Mbc3pmREaraaZOnapgMJg0CtLd3Z00+mErLS1NeX9eXp6mTJmSybcHAAA+lFEYKSgoUHV1tZqbm+OuNzc3q66uLuVnamtrk+7fvHmzampqlJ+fn2FzAQCA32S8z0hjY6OefPJJrVu3Tnv27NHSpUvV3t6uhoYGScP1HvPmzYve39DQoPfff1+NjY3as2eP1q1bp7Vr1+ree+89eb0Yp1AopBUrViRNCflNLvSTPvpHLvQzF/oo5UY/6ePJkfHSXml407PHHntMnZ2dqqqq0ve+9z1dccUVkqRbbrlF7733nrZs2RK9f+vWrVq6dKl27dql8vJy3X///dHwAgAActu4wggAAMDJwtk0AADAVYQRAADgKsIIAABwFWEEAAC4KqfDyOrVq1VZWanCwkJVV1dr+/btbjdp3B588MHhE4ZjXqWlpdH3jTF68MEHVV5erkmTJunP/uzPtGvXLhdbfGzbtm3TDTfcoPLyclmWpV/+8pdx7x9PnwYGBnTXXXdp6tSpmjx5sv7iL/5CH3zwgYO9OLZj9fOWW25JerZf/vKX4+7xej+bmpr0pS99SaeeeqrOPPNM3XjjjXr33Xfj7sn253k8fcz2Z7lmzRpdfPHF0Z04a2tr9atf/Sr6frY/Q9ux+pntzzGVpqYmWZalJUuWRK85+jyPcTaebz333HMmPz/f/PM//7PZvXu3ufvuu83kyZPN+++/73bTxmXFihXmC1/4guns7Iy+uru7o+8/+uij5tRTTzXPP/+82blzp5k7d64pKyszfX19LrZ6bJs2bTLLly83zz//vJFkXnjhhbj3j6dPDQ0N5qyzzjLNzc3mrbfeMldeeaWZNWuWGRoacrg36R2r
n/Pnzzd//ud/HvdsDx48GHeP1/t57bXXmqeeesq88847pq2tzVx//fVmxowZ5tChQ9F7sv15Hk8fs/1Zbty40bz00kvm3XffNe+++6554IEHTH5+vnnnnXeMMdn/DG3H6me2P8dE//mf/2nOOeccc/HFF5u77747et3J55mzYeSSSy4xDQ0Ncdc+//nPm+985zsutejErFixwsyaNSvle5FIxJSWlppHH300eu3TTz81xcXF5vHHH3eohScm8Yf08fTp448/Nvn5+ea5556L3vPhhx+aQCBgXn75Zcfanol0YeQv//Iv034mG/vZ3d1tJJmtW7caY/z5PBP7aIw/n+Xpp59unnzySV8+w1h2P43x13Ps7+83559/vmlubjZf+cpXomHE6eeZk9M0g4ODamlpUX19fdz1+vp67dixw6VWnbi9e/eqvLxclZWV+uu//mvt27dPkrR//351dXXF9TcUCukrX/lK1vb3ePrU0tKio0ePxt1TXl6uqqqqrOv3li1bdOaZZ2rmzJm67bbb1N3dHX0vG/vZ29srSTrjjDMk+fN5JvbR5pdnGQ6H9dxzz+nw4cOqra315TOUkvtp88tzvPPOO3X99dfrmmuuibvu9PMc16m92a6np0fhcDjpcL+SkpKkQ/2yxaWXXqpnnnlGM2fO1IEDB/TII4+orq5Ou3btivYpVX/ff/99N5p7wo6nT11dXSooKNDpp5+edE82Pec5c+bom9/8pioqKrR//379/d//va666iq1tLQoFAplXT+NMWpsbNTll1+uqqoqSf57nqn6KPnjWe7cuVO1tbX69NNPdcopp+iFF17QhRdeGP3h45dnmK6fkj+eoyQ999xzeuutt/Tb3/426T2n/5vMyTBisywr7mtjTNK1bDFnzpzo7y+66CLV1tbqvPPO07/8y79EC6v81F/bePqUbf2eO3du9PdVVVWqqalRRUWFXnrpJX39619P+zmv9nPRokX63e9+p9deey3pPb88z3R99MOz/NznPqe2tjZ9/PHHev755zV//nxt3bo1+r5fnmG6fl544YW+eI4dHR26++67tXnzZhUWFqa9z6nnmZPTNFOnTlUwGExKbt3d3UkpMFtNnjxZF110kfbu3RtdVeOn/h5Pn0pLSzU4OKj//u//TntPNiorK1NFRYX27t0rKbv6edddd2njxo169dVXdfbZZ0ev++l5putjKtn4LAsKCvTZz35WNTU1ampq0qxZs/T973/fV89QSt/PVLLxOba0tKi7u1vV1dXKy8tTXl6etm7dqh/84AfKy8uLttOp55mTYaSgoEDV1dVqbm6Ou97c3Ky6ujqXWnVyDQwMaM+ePSorK1NlZaVKS0vj+js4OKitW7dmbX+Pp0/V1dXKz8+Pu6ezs1PvvPNO1vZbkg4ePKiOjg6VlZVJyo5+GmO0aNEibdiwQf/xH/+hysrKuPf98DyP1cdUsvFZJjLGaGBgwBfPcCx2P1PJxud49dVXa+fOnWpra4u+ampq9Ld/+7dqa2vTueee6+zzzLDw1jfspb1r1641u3fvNkuWLDGTJ0827733nttNG5d77rnHbNmyxezbt8/85je/MV/72tfMqaeeGu3Po48+aoqLi82GDRvMzp07zd/8zd94fmlvf3+/aW1tNa2trUaSWblypWltbY0uvz6ePjU0NJizzz7b/PrXvzZvvfWWueqqqzy3vG6sfvb395t77rnH7Nixw+zfv9+8+uqrpra21px11llZ1c+/+7u/M8XFxWbLli1xyyGPHDkSvSfbn+ex+uiHZ7ls2TKzbds2s3//fvO73/3OPPDAAyYQCJjNmzcbY7L/GdrG6qcfnmM6satpjHH2eeZsGDHGmB//+MemoqLCFBQUmC9+8YtxS/Cyjb3+Oz8/35SXl5uvf/3rZteuXdH3I5GIWbFihSktLTWhUMhcccUVZufOnS62+NheffVVIynpNX/+fGPM8fXpk08+MYsWLTJnnHGGmTRpkvna
175m2tvbXehNemP188iRI6a+vt5MmzbN5OfnmxkzZpj58+cn9cHr/UzVP0nmqaeeit6T7c/zWH30w7P89re/Hf1/5rRp08zVV18dDSLGZP8ztI3VTz88x3QSw4iTz9MyxpjMxlIAAABOnpysGQEAAN5BGAEAAK4ijAAAAFcRRgAAgKsIIwAAwFWEEQAA4CrCCAAAcBVhBAAAuIowAgAAXEUYAQAAriKMAAAAV/1/NEoHiB/UT8gAAAAASUVORK5CYII=",
"text/plain": [
""
]
@@ -1003,7 +992,7 @@
"outputs": [
{
"data": {
- "image/png": "\n",
+ "image/png": "",
"text/plain": [
""
]
@@ -1046,7 +1035,7 @@
"outputs": [
{
"data": {
- "image/png": "\n",
+ "image/png": "",
"text/plain": [
""
]
@@ -1279,7 +1268,7 @@
"outputs": [
{
"data": {
- "image/png": "\n",
+ "image/png": "",
"text/plain": [
""
]
diff --git a/openwakeword/__init__.py b/openwakeword/__init__.py
index d49e9e2..6ad8f3f 100755
--- a/openwakeword/__init__.py
+++ b/openwakeword/__init__.py
@@ -5,24 +5,48 @@
__all__ = ['Model', 'VAD', 'train_custom_verifier']
-models = {
+FEATURE_MODELS = {
+ "embedding": {
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/embedding_model.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite"
+ },
+ "melspectrogram": {
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/melspectrogram.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite"
+ }
+}
+
+VAD_MODELS = {
+ "silero_vad": {
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/silero_vad.onnx"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/silero_vad.onnx"
+ }
+}
+
+MODELS = {
"alexa": {
- "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/alexa_v0.1.tflite")
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/alexa_v0.1.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/alexa_v0.1.tflite"
},
"hey_mycroft": {
- "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/hey_mycroft_v0.1.tflite")
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/hey_mycroft_v0.1.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/hey_mycroft_v0.1.tflite"
},
"hey_jarvis": {
- "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/hey_jarvis_v0.1.tflite")
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/hey_jarvis_v0.1.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/hey_jarvis_v0.1.tflite"
},
"hey_rhasspy": {
- "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/hey_rhasspy_v0.1.tflite")
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/hey_rhasspy_v0.1.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/hey_rhasspy_v0.1.tflite"
},
"timer": {
- "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/timer_v0.1.tflite")
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/timer_v0.1.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/timer_v0.1.tflite"
},
"weather": {
- "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/weather_v0.1.tflite")
+ "model_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/models/weather_v0.1.tflite"),
+ "download_url": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/weather_v0.1.tflite"
}
}
@@ -40,6 +64,6 @@
def get_pretrained_model_paths(inference_framework="tflite"):
if inference_framework == "tflite":
- return [models[i]["model_path"] for i in models.keys()]
+ return [MODELS[i]["model_path"] for i in MODELS.keys()]
elif inference_framework == "onnx":
- return [models[i]["model_path"].replace(".tflite", ".onnx") for i in models.keys()]
+ return [MODELS[i]["model_path"].replace(".tflite", ".onnx") for i in MODELS.keys()]
diff --git a/openwakeword/data.py b/openwakeword/data.py
index 7c34549..c43da5d 100755
--- a/openwakeword/data.py
+++ b/openwakeword/data.py
@@ -15,13 +15,19 @@
# imports
from multiprocessing.pool import ThreadPool
import os
+import re
+import logging
from functools import partial
from pathlib import Path
import random
from tqdm import tqdm
from typing import List, Tuple
import numpy as np
+import itertools
+import pronouncing
import torch
+import audiomentations
+import torch_audiomentations
from numpy.lib.format import open_memmap
from speechbrain.dataio.dataio import read_audio
from speechbrain.processing.signal_processing import reverberate
@@ -445,7 +451,7 @@ def mix_clips_batch(
# Apply volume augmentation
if volume_augmentation:
volume_levels = np.random.uniform(0.02, 1.0, mixed_clips_batch.shape[0])
- mixed_clips_batch = (volume_levels/mixed_clips_batch.max(axis=1)[0])[..., None]*mixed_clips_batch
+ mixed_clips_batch = (volume_levels/mixed_clips_batch.max(dim=1)[0])[..., None]*mixed_clips_batch
else:
# Normalize clips only if max value is outside of [-1, 1]
abs_max, _ = torch.max(
@@ -457,7 +463,7 @@ def mix_clips_batch(
mixed_clips_batch = (mixed_clips_batch.numpy()*32767).astype(np.int16)
# Remove any clips that are silent (happens rarely when mixing/reverberating)
- error_index = np.where(mixed_clips_batch.max(axis=1) != 0)[0]
+ error_index = torch.from_numpy(np.where(mixed_clips_batch.max(dim=1) != 0)[0])
mixed_clips_batch = mixed_clips_batch[error_index]
labels_batch = labels_batch[error_index]
sequence_labels_batch = sequence_labels_batch[error_index]
@@ -548,6 +554,181 @@ def apply_reverb(x, rir_files):
return reverbed.numpy()
+# Alternate data augmentation method using audiomentations library (https://pypi.org/project/audiomentations/)
+def augment_clips(
+ clip_paths: List[str],
+ total_length: int,
+ sr: int = 16000,
+ batch_size: int = 128,
+ augmentation_probabilities: dict = {
+ "SevenBandParametricEQ": 0.25,
+ "TanhDistortion": 0.25,
+ "PitchShift": 0.25,
+ "BandStopFilter": 0.25,
+ "AddColoredNoise": 0.25,
+ "AddBackgroundNoise": 0.75,
+ "Gain": 1.0,
+ "RIR": 0.5
+ },
+ background_clip_paths: List[str] = [],
+ RIR_paths: List[str] = []
+ ):
+ """
+ Applies audio augmentations to the specified audio clips, returning a generator that applies
+ the augmentations in batches to support very large quantities of input audio files.
+
+ The augmentations (and probabilities) are chosen from experience based on training openWakeWord models, as well
+ as for the efficiency of the augmentation. The individual probabilities of each augmentation may be adjusted
+ with the "augmentation_probabilities" argument.
+
+ Args:
+ clip_paths (List[str]) = The input audio files (as paths) to augment. Note that these should be shorter
+ than the "total_length" argument, else they will be truncated.
+ total_length (int): The total length of audio files (in samples) after augmentation. All input clips
+ will be left-padded with silence to reach this size, with between 0 and 200 ms
+ of other audio after the end of the original input clip.
+ sr (int): The sample rate of the input audio files
+ batch_size (int): The number of audio files to augment at once.
+ augmentation_probabilities (dict): The individual probabilities of each augmentation. If all probabilities
+ are zero, the input audio files will simply be padded with silence. The
+ default values are:
+
+ {
+ "SevenBandParametricEQ": 0.25,
+ "TanhDistortion": 0.25,
+ "PitchShift": 0.25,
+ "BandStopFilter": 0.25,
+ "AddColoredNoise": 0.25,
+ "AddBackgroundNoise": 0.75,
+ "Gain": 1.0,
+ "RIR": 0.5
+ }
+
+ background_clip_paths (List[str]) = The paths to background audio files to mix with the input files
+ RIR_paths (List[str]) = The paths to room impulse response functions (RIRs) to convolve with the input files,
+ producing a version of the input clip with different acoustic characteristics.
+
+ Returns:
+ ndarray: A batch of augmented audio clips of size (batch_size, total_length)
+ """
+ # Define augmentations
+
+ # First pass augmentations that can't be done as a batch
+ augment1 = audiomentations.Compose([
+ audiomentations.SevenBandParametricEQ(min_gain_db=-6, max_gain_db=6, p=augmentation_probabilities["SevenBandParametricEQ"]),
+ audiomentations.TanhDistortion(
+ min_distortion=0.0001,
+ max_distortion=0.10,
+ p=augmentation_probabilities["TanhDistortion"]
+ ),
+ ])
+
+ # Augmentations that can be done as a batch
+ if background_clip_paths != []:
+ augment2 = torch_audiomentations.Compose([
+ torch_audiomentations.PitchShift(
+ min_transpose_semitones=-3,
+ max_transpose_semitones=3,
+ p=augmentation_probabilities["PitchShift"],
+ sample_rate=16000,
+ mode="per_batch"
+ ),
+ torch_audiomentations.BandStopFilter(p=augmentation_probabilities["BandStopFilter"], mode="per_batch"),
+ torch_audiomentations.AddColoredNoise(
+ min_snr_in_db=10, max_snr_in_db=30,
+ min_f_decay=-1, max_f_decay=2, p=augmentation_probabilities["AddColoredNoise"],
+ mode="per_batch"
+ ),
+ torch_audiomentations.AddBackgroundNoise(
+ p=augmentation_probabilities["AddBackgroundNoise"],
+ background_paths=background_clip_paths,
+ min_snr_in_db=-10,
+ max_snr_in_db=15,
+ mode="per_batch"
+ ),
+ torch_audiomentations.Gain(max_gain_in_db=0, p=augmentation_probabilities["Gain"]),
+ ])
+ else:
+ augment2 = torch_audiomentations.Compose([
+ torch_audiomentations.PitchShift(
+ min_transpose_semitones=-3,
+ max_transpose_semitones=3,
+ p=augmentation_probabilities["PitchShift"],
+ sample_rate=16000,
+ mode="per_batch"
+ ),
+ torch_audiomentations.BandStopFilter(p=augmentation_probabilities["BandStopFilter"], mode="per_batch"),
+ torch_audiomentations.AddColoredNoise(
+ min_snr_in_db=10, max_snr_in_db=30,
+ min_f_decay=-1, max_f_decay=2, p=augmentation_probabilities["AddColoredNoise"],
+ mode="per_batch"
+ ),
+ torch_audiomentations.Gain(max_gain_in_db=0, p=augmentation_probabilities["Gain"]),
+ ])
+
+ # Iterate through all clips and augment them
+ for i in range(0, len(clip_paths), batch_size):
+ batch = clip_paths[i:i+batch_size]
+ augmented_clips = []
+ for clip in batch:
+ clip_data, clip_sr = torchaudio.load(clip)
+ clip_data = clip_data[0]
+ if clip_data.shape[0] > total_length:
+ clip_data = clip_data[0:total_length]
+
+ if clip_sr != sr:
+ raise ValueError("Error! Clip does not have the correct sample rate!")
+
+ clip_data = create_fixed_size_clip(clip_data, total_length, clip_sr)
+
+ # Do first pass augmentations
+ augmented_clips.append(torch.from_numpy(augment1(samples=clip_data, sample_rate=sr)))
+
+ # Do second pass augmentations
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+ augmented_batch = augment2(samples=torch.vstack(augmented_clips).unsqueeze(dim=1).to(device), sample_rate=sr).squeeze(axis=1)
+
+ # Do reverberation
+ if augmentation_probabilities["RIR"] >= np.random.random() and RIR_paths != []:
+ rir_waveform, sr = torchaudio.load(random.choice(RIR_paths))
+ augmented_batch = reverberate(augmented_batch.cpu(), rir_waveform, rescale_amp="avg")
+
+ # yield batch of 16-bit PCM audio data
+ yield (augmented_batch.cpu().numpy()*32767).astype(np.int16)
+
+
+def create_fixed_size_clip(x, n_samples, sr=16000, start=None, end_jitter=.200):
+ """
+ Create a fixed-length clip of the specified size by padding an input clip with zeros
+ Optionally specify the start/end position of the input clip, or let it be chosen randomly.
+
+ Args:
+ x (ndarray): The input audio to pad to a fixed size
+ n_samples (int): The total number of samples for the fixed length clip
+ sr (int): The sample rate of the audio
+ start (int): The start position of the clip in the fixed length output, in samples (default: None)
+ end_jitter (float): The time (in seconds) from the end of the fixed length output
+ that the input clip should end, if `start` is None.
+
+ Returns:
+ ndarray: A new array of audio data of the specified length
+ """
+ dat = np.zeros(n_samples)
+ end_jitter = int(np.random.uniform(0, end_jitter)*sr)
+ if start is None:
+ start = max(0, n_samples - (int(len(x))+end_jitter))
+
+ if len(x) > n_samples:
+ if np.random.random() >= 0.5:
+ dat = x[0:n_samples].numpy()
+ else:
+ dat = x[-n_samples:].numpy()
+ else:
+ dat[start:start+len(x)] = x
+
+ return dat
+
+
# Load batches of data from mmaped numpy arrays
class mmap_batch_generator:
"""
@@ -645,7 +826,6 @@ def __next__(self):
# Restart at zeroth index if an array reaches the end
if self.data_counter[label] >= self.shapes[label][0]:
self.data_counter[label] = 0
- # self.data[label] = np.load(self.data_files[label], mmap_mode='r')
# Get data from mmaped file
x = self.data[label][self.data_counter[label]:self.data_counter[label]+n]
@@ -697,7 +877,7 @@ def trim_mmap(mmap_path):
mmap_file2 = open_memmap(output_file2, mode='w+', dtype=np.float32,
shape=(N_new, mmap_file1.shape[1], mmap_file1.shape[2]))
- for i in tqdm(range(0, mmap_file1.shape[0], 1024), total=mmap_file1.shape[0]//1024):
+ for i in tqdm(range(0, mmap_file1.shape[0], 1024), total=mmap_file1.shape[0]//1024, desc="Trimming empty rows"):
if i + 1024 > N_new:
mmap_file2[i:N_new] = mmap_file1[i:N_new].copy()
mmap_file2.flush()
@@ -710,3 +890,126 @@ def trim_mmap(mmap_path):
# Rename new mmap file to match original
os.rename(output_file2, mmap_path)
+
+
+# Generate words that sound similar ("adversarial") to the input phrase using phoneme overlap
+def generate_adversarial_texts(input_text: str, N: int, include_partial_phrase: float = 0, include_input_words: float = 0):
+ """
+ Generate adversarial words and phrases based on phoneme overlap.
+ Currently only works for English texts.
+ Note that homophones are excluded, as this wouldn't actually be an adversarial example for the input text.
+
+ Args:
+ input_text (str): The target text for adversarial phrases
+ N (int): The total number of adversarial texts to return. Uses sampling,
+ so not all possible combinations will be included and some duplicates
+ may be present.
+ include_partial_phrase (float): The probability of returning a number of words less than the input
+ text (but always between 1 and the number of input words)
+ include_input_words (float): The probability of including individual input words in the adversarial
+ texts when the input text consists of multiple words. For example,
+ if the `input_text` was "ok google", then setting this value > 0.0
+ will allow for adversarial texts like "ok noodle", versus the word "ok"
+ never being present in the adversarial texts.
+
+ Returns:
+ list: A list of strings corresponding to words and phrases that are phonetically similar (but not identical)
+ to the input text.
+ """
+ # Get phonemes for english vowels (CMUDICT labels)
+ vowel_phones = ["AA", "AE", "AH", "AO", "AW", "AX", "AXR", "AY", "EH", "ER", "EY", "IH", "IX", "IY", "OW", "OY", "UH", "UW", "UX"]
+
+ word_phones = []
+ input_text_phones = [pronouncing.phones_for_word(i) for i in input_text.split()]
+
+ # Download phonemizer model for OOV words, if needed
+ if [] in input_text_phones:
+ phonemizer_mdl_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "en_us_cmudict_forward.pt")
+ if not os.path.exists(os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources")):
+ os.mkdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources"))
+ if not os.path.exists(phonemizer_mdl_path):
+ logging.warning("Downloading phonemizer model from DeepPhonemizer library...")
+ import requests
+ file_url = "https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_forward.pt"
+ r = requests.get(file_url, stream=True)
+ with open(phonemizer_mdl_path, "wb") as f:
+ for chunk in r.iter_content(chunk_size=2048):
+ if chunk:
+ f.write(chunk)
+
+ # Create phonemizer object
+ from dp.phonemizer import Phonemizer
+ phonemizer = Phonemizer.from_checkpoint(phonemizer_mdl_path)
+
+ for phones, word in zip(input_text_phones, input_text.split()):
+ if phones != []:
+ word_phones.extend(phones)
+ elif phones == []:
+ logging.warning(f"The word '{word}' was not found in the pronunciation dictionary! "
+ "Using the DeepPhonemizer library to predict the phonemes.")
+ phones = phonemizer(word, lang='en_us')
+ logging.warning(f"Phones for '{word}': {phones}")
+ word_phones.append(re.sub(r"[\]|\[]", "", re.sub(r"\]\[", " ", phones)))
+ elif isinstance(phones[0], list):
+ logging.warning(f"There are multiple pronunciations for the word '{word}'.")
+ word_phones.append(phones[0])
+
+ # add all possible lexical stresses to vowels
+ word_phones = [re.sub('|'.join(vowel_phones), lambda x: str(x.group(0)) + '[0|1|2]', re.sub(r'\d+', '', i)) for i in word_phones]
+
+ adversarial_phrases = []
+ for phones, word in zip(word_phones, input_text.split()):
+ query_exps = []
+ phones = phones.split()
+ adversarial_words = []
+ if len(phones) <= 2:
+ query_exps.append(" ".join(phones))
+ else:
+ query_exps.extend(phoneme_replacement(phones, max_replace=max(0, len(phones)-2), replace_char="(.){1,3}"))
+
+ for query in query_exps:
+ matches = pronouncing.search(query)
+ matches_phones = [pronouncing.phones_for_word(i)[0] for i in matches]
+ allowed_matches = [i for i, j in zip(matches, matches_phones) if j != phones]
+ adversarial_words.extend([i for i in allowed_matches if word.lower() != i])
+
+ if adversarial_words != []:
+ adversarial_phrases.append(adversarial_words)
+
+ # Build combinations for final output
+ adversarial_texts = []
+ for i in range(N):
+ txts = []
+ for j, k in zip(adversarial_phrases, input_text.split()):
+ if np.random.random() > (1 - include_input_words):
+ txts.append(k)
+ else:
+ txts.append(np.random.choice(j))
+
+ if include_partial_phrase is not None and len(input_text.split()) > 1 and np.random.random() <= include_partial_phrase:
+ n_words = np.random.randint(1, len(input_text.split())+1)
+ adversarial_texts.append(" ".join(np.random.choice(txts, size=n_words, replace=False)))
+ else:
+ adversarial_texts.append(" ".join(txts))
+
+ # Remove any exact matches to input phrase
+ adversarial_texts = [i for i in adversarial_texts if i != input_text]
+
+ return adversarial_texts
+
+
+def phoneme_replacement(input_chars, max_replace, replace_char='"(.){1,3}"'):
+ results = []
+ chars = list(input_chars)
+
+ # iterate over the number of characters to replace (1 to max_replace)
+ for r in range(1, max_replace+1):
+ # get all combinations for a fixed r
+ comb = itertools.combinations(range(len(chars)), r)
+ for indices in comb:
+ chars_copy = chars.copy()
+ for i in indices:
+ chars_copy[i] = replace_char
+ results.append(' '.join(chars_copy))
+
+ return results
diff --git a/openwakeword/model.py b/openwakeword/model.py
index 46f603a..6029963 100755
--- a/openwakeword/model.py
+++ b/openwakeword/model.py
@@ -67,7 +67,7 @@ def __init__(
with VAD scores above the threshold will be returned. The default value (0),
disables voice activity detection entirely.
custom_verifier_models (dict): A dictionary of paths to custom verifier models, where
- the keys are the model names (corresponding to the openwakeword.models
+ the keys are the model names (corresponding to the openwakeword.MODELS
attribute) and the values are the filepaths of the
custom verifier models.
custom_verifier_threshold (float): The score threshold to use a custom verifier model. If the score
@@ -85,7 +85,7 @@ def __init__(
wakeword_model_names = []
if wakeword_models == []:
wakeword_models = pretrained_model_paths
- wakeword_model_names = list(openwakeword.models.keys())
+ wakeword_model_names = list(openwakeword.MODELS.keys())
elif len(wakeword_models) >= 1:
for ndx, i in enumerate(wakeword_models):
if os.path.exists(i):
@@ -224,10 +224,13 @@ def get_parent_model_from_label(self, label):
return parent_model
def reset(self):
- """Reset the prediction buffer"""
+ """Reset the prediction and audio feature buffers. Useful for re-initializing the model, though may not be efficient
+ when called too frequently."""
self.prediction_buffer = defaultdict(partial(deque, maxlen=30))
+ self.preprocessor.reset()
- def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timing: bool = False):
+ def predict(self, x: np.ndarray, patience: dict = {},
+ threshold: dict = {}, debounce_time: float = 0.0, timing: bool = False):
"""Predict with all of the wakeword models on the input audio frames
Args:
@@ -242,9 +245,11 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
model names and the values are the number of frames. Can reduce false-positive
detections at the cost of a lower true-positive rate.
By default, this behavior is disabled.
- threshold (dict): The threshold values to use when the `patience` behavior is enabled.
+ threshold (dict): The threshold values to use when the `patience` or `debounce_time` behavior is enabled.
Must be provided as an a dictionary where the keys are the
model names and the values are the thresholds.
+ debounce_time (float): The time (in seconds) to wait before returning another non-zero prediction
+ after a non-zero prediction. Can prevent multiple detections of the same wake-word.
timing (bool): Whether to return timing information of the models. Can be useful to debug and
assess how efficiently models are running on the current hardware.
@@ -322,27 +327,40 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
)[0][-1]
predictions[cls] = verifier_prediction
- # Update prediction buffer, and zero predictions for first 5 frames during model initialization
+ # Zero predictions for first 5 frames during model initialization
for cls in predictions.keys():
if len(self.prediction_buffer[cls]) < 5:
predictions[cls] = 0.0
- self.prediction_buffer[cls].append(predictions[cls])
# Get timing information
if timing:
timing_dict["models"][mdl] = time.time() - model_start
# Update scores based on thresholds or patience arguments
- if patience != {}:
+ if patience != {} or debounce_time > 0:
if threshold == {}:
raise ValueError("Error! When using the `patience` argument, threshold "
"values must be provided via the `threshold` argument!")
+ if patience != {} and debounce_time > 0:
+ raise ValueError("Error! The `patience` and `debounce_time` arguments cannot be used together!")
for mdl in predictions.keys():
parent_model = self.get_parent_model_from_label(mdl)
- if parent_model in patience.keys():
- scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
- if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
- predictions[mdl] = 0.0
+ if predictions[mdl] != 0.0:
+ if parent_model in patience.keys():
+ scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
+ if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
+ predictions[mdl] = 0.0
+ elif debounce_time > 0:
+ if parent_model in threshold.keys():
+ n_frames = int(np.ceil(debounce_time/(n_prepared_samples/16000)))
+ recent_predictions = np.array(self.prediction_buffer[mdl])[-n_frames:]
+ if predictions[mdl] >= threshold[parent_model] and \
+ (recent_predictions >= threshold[parent_model]).sum() > 0:
+ predictions[mdl] = 0.0
+
+ # Update prediction buffer
+ for mdl in predictions.keys():
+ self.prediction_buffer[mdl].append(predictions[mdl])
# (optionally) get voice activity detection scores and update model scores
if self.vad_threshold > 0:
diff --git a/openwakeword/resources/models/alexa_v0.1.onnx b/openwakeword/resources/models/alexa_v0.1.onnx
deleted file mode 100644
index f52240e..0000000
--- a/openwakeword/resources/models/alexa_v0.1.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6ff566a01d12670e8d9e3c59da32651db1575d17272a601b7f8a39283dfbae3e
-size 854246
diff --git a/openwakeword/resources/models/alexa_v0.1.tflite b/openwakeword/resources/models/alexa_v0.1.tflite
deleted file mode 100644
index 5d516e2..0000000
--- a/openwakeword/resources/models/alexa_v0.1.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7333a317a790070a7f3432b81d9439c779481cc4ebd67c73da7174ea3cf48397
-size 855312
diff --git a/openwakeword/resources/models/embedding_model.onnx b/openwakeword/resources/models/embedding_model.onnx
deleted file mode 100644
index 2c928ee..0000000
--- a/openwakeword/resources/models/embedding_model.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:70d164290c1d095d1d4ee149bc5e00543250a7316b59f31d056cff7bd3075c1f
-size 1326578
diff --git a/openwakeword/resources/models/embedding_model.tflite b/openwakeword/resources/models/embedding_model.tflite
deleted file mode 100644
index 52a5336..0000000
--- a/openwakeword/resources/models/embedding_model.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c0aea21eb84a4ce90a08c870da41b7a7173b45269e6a3207c71d67c40f3a59d8
-size 1330312
diff --git a/openwakeword/resources/models/hey_jarvis_v0.1.onnx b/openwakeword/resources/models/hey_jarvis_v0.1.onnx
deleted file mode 100644
index a45f1de..0000000
--- a/openwakeword/resources/models/hey_jarvis_v0.1.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:94a13cfe60075b132f6a472e7e462e8123ee70861bc3fb58434a73712ee0d2cb
-size 1271370
diff --git a/openwakeword/resources/models/hey_jarvis_v0.1.tflite b/openwakeword/resources/models/hey_jarvis_v0.1.tflite
deleted file mode 100644
index d155242..0000000
--- a/openwakeword/resources/models/hey_jarvis_v0.1.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:14bff778604985e1b5c19f0f7bbe477a69cf281d8db34b232b3b972411f710e2
-size 1278912
diff --git a/openwakeword/resources/models/hey_mycroft_v0.1.onnx b/openwakeword/resources/models/hey_mycroft_v0.1.onnx
deleted file mode 100644
index b9952b3..0000000
--- a/openwakeword/resources/models/hey_mycroft_v0.1.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c2a311e8fa1338de89c31b3b46dc4dffd4af2f9a8d6ddead48893c2d301b1f18
-size 857691
diff --git a/openwakeword/resources/models/hey_mycroft_v0.1.tflite b/openwakeword/resources/models/hey_mycroft_v0.1.tflite
deleted file mode 100644
index 53b373c..0000000
--- a/openwakeword/resources/models/hey_mycroft_v0.1.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bf9e43136afd3ca323698820a6e32a47f885ef4c30a3b8b577ec71688a9d64d8
-size 860300
diff --git a/openwakeword/resources/models/hey_rhasspy_v0.1.onnx b/openwakeword/resources/models/hey_rhasspy_v0.1.onnx
deleted file mode 100644
index dea9d9d..0000000
--- a/openwakeword/resources/models/hey_rhasspy_v0.1.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5a9b3ed3be2910e35780e097905aa9f35a9c10038df47914cf2b3ec4d670f6ea
-size 204081
diff --git a/openwakeword/resources/models/hey_rhasspy_v0.1.tflite b/openwakeword/resources/models/hey_rhasspy_v0.1.tflite
deleted file mode 100644
index 4fb9b6c..0000000
--- a/openwakeword/resources/models/hey_rhasspy_v0.1.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:01d2526b45068f565aa3849d6ec2b7abae099154fc1b496f9ef20de9ef241fe9
-size 416140
diff --git a/openwakeword/resources/models/melspectrogram.onnx b/openwakeword/resources/models/melspectrogram.onnx
deleted file mode 100644
index be0643d..0000000
--- a/openwakeword/resources/models/melspectrogram.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ba2b0e0f8b7b875369a2c89cb13360ff53bac436f2895cced9f479fa65eb176f
-size 1087958
diff --git a/openwakeword/resources/models/melspectrogram.tflite b/openwakeword/resources/models/melspectrogram.tflite
deleted file mode 100644
index c0f0ab8..0000000
--- a/openwakeword/resources/models/melspectrogram.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:96fa0adccb6e8cf95cb14465409a1a2898ee4a96a85bb9ed3c7eb0e68bf163e8
-size 1092516
diff --git a/openwakeword/resources/models/silero_vad.onnx b/openwakeword/resources/models/silero_vad.onnx
deleted file mode 100755
index 664012e..0000000
--- a/openwakeword/resources/models/silero_vad.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a35ebf52fd3ce5f1469b2a36158dba761bc47b973ea3382b3186ca15b1f5af28
-size 1807522
diff --git a/openwakeword/resources/models/timer_v0.1.onnx b/openwakeword/resources/models/timer_v0.1.onnx
deleted file mode 100644
index 5603f7d..0000000
--- a/openwakeword/resources/models/timer_v0.1.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:371e44535470a29248b3b8f1bbbbaf2525c86417fd8f75c67fcf02ae0b9626df
-size 1742475
diff --git a/openwakeword/resources/models/timer_v0.1.tflite b/openwakeword/resources/models/timer_v0.1.tflite
deleted file mode 100644
index 11a7d50..0000000
--- a/openwakeword/resources/models/timer_v0.1.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:21d5b0267e97df64870b7aca312e2043ebed248d365698926a115a3694ff9626
-size 1743316
diff --git a/openwakeword/resources/models/weather_v0.1.onnx b/openwakeword/resources/models/weather_v0.1.onnx
deleted file mode 100644
index 6c5599e..0000000
--- a/openwakeword/resources/models/weather_v0.1.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8441da8e746899e8d969528d5bad5651cdd563079c05962788f77753041f60e7
-size 1149158
diff --git a/openwakeword/resources/models/weather_v0.1.tflite b/openwakeword/resources/models/weather_v0.1.tflite
deleted file mode 100644
index 95dab6e..0000000
--- a/openwakeword/resources/models/weather_v0.1.tflite
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4178991c7aeb76670f5a56559eb4129a6f3ae6207886db8bd8094fea7d362c3f
-size 1150224
diff --git a/openwakeword/train.py b/openwakeword/train.py
new file mode 100755
index 0000000..7e468bf
--- /dev/null
+++ b/openwakeword/train.py
@@ -0,0 +1,902 @@
+import torch
+from torch import optim, nn
+import torchinfo
+import torchmetrics
+import copy
+import os
+import sys
+import tempfile
+import uuid
+import numpy as np
+import scipy
+import collections
+import argparse
+import logging
+from tqdm import tqdm
+import yaml
+from pathlib import Path
+import openwakeword
+from openwakeword.data import generate_adversarial_texts, augment_clips, mmap_batch_generator
+from openwakeword.utils import compute_features_from_generator
+from openwakeword.utils import AudioFeatures
+
+
+# Base model class for an openwakeword model
+class Model(nn.Module):
+    def __init__(self, n_classes=1, input_shape=(16, 96), model_type="dnn",
+                 layer_dim=128, n_blocks=1, seconds_per_example=None):
+        """
+        Build the network, the metrics, the optimizer and the bookkeeping state used for training.
+
+        Args:
+            n_classes (int): Number of outputs. 1 builds a binary classifier with a sigmoid output;
+                             any other value builds a multi-class head with softmax-based metrics.
+            input_shape (tuple): Shape of a single (un-batched) example. The "dnn" model flattens it,
+                                 the "rnn" model uses the last dimension as the LSTM input size.
+            model_type (str): "dnn" (fully-connected blocks) or "rnn" (2-layer bidirectional LSTM).
+            layer_dim (int): Width of the hidden layers of the "dnn" model.
+            n_blocks (int): Number of hidden blocks after the first layer of the "dnn" model.
+            seconds_per_example (float): Stored as an attribute only; not used in this method.
+
+        Raises:
+            ValueError: If `model_type` is neither "dnn" nor "rnn".
+        """
+        super().__init__()
+
+        # Store inputs as attributes
+        self.n_classes = n_classes
+        self.input_shape = input_shape
+        self.seconds_per_example = seconds_per_example
+        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        self.best_models = []
+        self.best_model_scores = []
+        self.best_val_fp = 1000
+        self.best_val_accuracy = 0
+        self.best_val_recall = 0
+        self.best_train_recall = 0
+
+        # Define model
+        if model_type == "dnn":
+            class FCNBlock(nn.Module):
+                def __init__(self, layer_dim):
+                    super().__init__()
+                    self.fcn_layer = nn.Linear(layer_dim, layer_dim)
+                    self.relu = nn.ReLU()
+                    self.layer_norm = nn.LayerNorm(layer_dim)
+
+                def forward(self, x):
+                    return self.relu(self.layer_norm(self.fcn_layer(x)))
+
+            class Net(nn.Module):
+                def __init__(self, input_shape, layer_dim, n_blocks=1, n_classes=1):
+                    super().__init__()
+                    self.flatten = nn.Flatten()
+                    self.layer1 = nn.Linear(input_shape[0]*input_shape[1], layer_dim)
+                    self.relu1 = nn.ReLU()
+                    self.layernorm1 = nn.LayerNorm(layer_dim)
+                    self.blocks = nn.ModuleList([FCNBlock(layer_dim) for i in range(n_blocks)])
+                    self.last_layer = nn.Linear(layer_dim, n_classes)
+                    self.last_act = nn.Sigmoid() if n_classes == 1 else nn.ReLU()
+
+                def forward(self, x):
+                    x = self.relu1(self.layernorm1(self.layer1(self.flatten(x))))
+                    for block in self.blocks:
+                        x = block(x)
+                    x = self.last_act(self.last_layer(x))
+                    return x
+            self.model = Net(input_shape, layer_dim, n_blocks=n_blocks, n_classes=n_classes)
+        elif model_type == "rnn":
+            class Net(nn.Module):
+                def __init__(self, input_shape, n_classes=1):
+                    super().__init__()
+                    self.layer1 = nn.LSTM(input_shape[-1], 64, num_layers=2, bidirectional=True,
+                                          batch_first=True, dropout=0.0)
+                    self.layer2 = nn.Linear(64*2, n_classes)
+                    self.layer3 = nn.Sigmoid() if n_classes == 1 else nn.ReLU()
+
+                def forward(self, x):
+                    out, h = self.layer1(x)
+                    # Classify from the LSTM output at the last time step
+                    return self.layer3(self.layer2(out[:, -1]))
+            self.model = Net(input_shape, n_classes)
+        else:
+            # Fail early with a clear message instead of an AttributeError on `self.model` below
+            raise ValueError(f"Unsupported model_type '{model_type}', expected 'dnn' or 'rnn'")
+
+        # Define metrics
+        if n_classes == 1:
+            # A false positive is a prediction that exceeds its label by at least 0.5
+            self.fp = lambda pred, y: (y-pred <= -0.5).sum()
+            self.recall = torchmetrics.Recall(task='binary')
+            self.accuracy = torchmetrics.Accuracy(task='binary')
+        else:
+            def multiclass_fp(p, y, threshold=0.5):
+                """Count negative (label 0) examples predicted as a non-zero class above `threshold`."""
+                probs = torch.nn.functional.softmax(p, dim=1)
+                neg_probs = probs[y == 0]
+                # Both comparisons must be parenthesized: `&` binds tighter than `!=` and `>`
+                fp = ((neg_probs.argmax(axis=1) != 0) & (neg_probs.max(axis=1)[0] > threshold)).sum()
+                return fp
+
+            def positive_class_recall(p, y, negative_class_label=0, threshold=0.5):
+                """Fraction of non-negative examples predicted as a non-negative class with prob >= `threshold`."""
+                probs = torch.nn.functional.softmax(p, dim=1)
+                pos_ndcs = y != negative_class_label
+                pos_probs = probs[pos_ndcs]
+                rcll = ((pos_probs.argmax(axis=1) != negative_class_label)
+                        & (pos_probs.max(axis=1)[0] >= threshold)).sum()/pos_ndcs.sum()
+                return rcll
+
+            def positive_class_accuracy(p, y, negative_class_label=0):
+                """Accuracy computed only over examples predicted as a non-negative class."""
+                probs = torch.nn.functional.softmax(p, dim=1)
+                pos_preds = probs.argmax(axis=1) != negative_class_label
+                acc = (probs[pos_preds].argmax(axis=1) == y[pos_preds]).sum()/pos_preds.sum()
+                return acc
+
+            self.fp = multiclass_fp
+            # `accuracy` is the name the rest of the class reads; `acc` is kept for backward compatibility
+            self.accuracy = positive_class_accuracy
+            self.acc = positive_class_accuracy
+            self.recall = positive_class_recall
+
+        self.n_fp = 0
+        self.val_fp = 0
+
+        # Define logging dict (in-memory)
+        self.history = collections.defaultdict(list)
+
+        # Define optimizer and loss
+        self.loss = torch.nn.functional.binary_cross_entropy if n_classes == 1 else nn.functional.cross_entropy
+        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0001)
+
+    def save_model(self, output_path):
+        """
+        Saves the weights (state dict) of a trained Pytorch model
+
+        The network classes are defined locally inside `__init__`, so the module object itself
+        cannot be pickled by `torch.save`; only the weights are written. Reload them with
+        `Model(...).model.load_state_dict(torch.load(output_path))`.
+
+        Args:
+            output_path (str): The location of the file to write
+        """
+        torch.save(self.model.state_dict(), output_path)
+
+    def export_to_onnx(self, output_path, class_mapping=""):
+        """
+        Export the network to an ONNX file, tracing it with a random example of `self.input_shape`.
+
+        Binary models are exported as-is. Multi-class models are wrapped so that the exported
+        graph applies a softmax over the class dimension. The network is moved to the CPU.
+
+        Args:
+            output_path (str): The location of the ONNX file to write
+            class_mapping (str): Used as the name of the single ONNX output
+        """
+        if self.n_classes == 1:
+            cpu_model = self.model.to("cpu")
+            example = torch.rand(self.input_shape)[None, ]
+            torch.onnx.export(cpu_model, example, output_path, output_names=[class_mapping])
+            return
+
+        if self.n_classes > 1:
+            outer = self
+
+            class M(nn.Module):
+                """Thin wrapper that appends a softmax to the trained network."""
+                def __init__(self):
+                    super().__init__()
+                    self.model = outer.model.to("cpu")
+
+                def forward(self, x):
+                    logits = self.model(x)
+                    return torch.nn.functional.softmax(logits, dim=1)
+
+            wrapped = M()
+            example = torch.rand(self.input_shape)[None, ]
+            torch.onnx.export(wrapped, example, output_path, output_names=[class_mapping])
+
+    def lr_warmup_cosine_decay(self,
+                               global_step,
+                               warmup_steps=0,
+                               hold=0,
+                               total_steps=0,
+                               start_lr=0.0,
+                               target_lr=1e-3
+                               ):
+        """
+        Learning rate schedule with linear warmup, an optional hold phase, and cosine decay.
+
+        Args:
+            global_step (int): The current training step
+            warmup_steps (int): Steps over which the rate rises linearly from 0 to `target_lr`
+            hold (int): Steps after warmup during which the rate stays at `target_lr`
+            total_steps (int): Total number of steps; the cosine decay ends here
+            start_lr (float): Unused, kept for interface compatibility
+            target_lr (float): The peak learning rate
+
+        Returns:
+            np.ndarray: The learning rate for `global_step` (0-d array for scalar input)
+        """
+        # Cosine decay (skipped when there are no decay steps, to avoid dividing by zero)
+        decay_steps = float(total_steps - warmup_steps - hold)
+        if decay_steps != 0:
+            learning_rate = 0.5 * target_lr * (1 + np.cos(np.pi * (global_step - warmup_steps - hold)
+                                                          / decay_steps))
+        else:
+            learning_rate = target_lr
+
+        # Target LR * progress of warmup (=1 at the final warmup step).
+        # Without a warmup phase this value is never selected below, so any placeholder works.
+        warmup_lr = target_lr * (global_step / warmup_steps) if warmup_steps > 0 else target_lr
+
+        # Hold `target_lr` until the hold phase is over, then switch to the cosine decayed rate
+        if hold > 0:
+            learning_rate = np.where(global_step > warmup_steps + hold,
+                                     learning_rate, target_lr)
+
+        # Use the warmup rate while still warming up
+        learning_rate = np.where(global_step < warmup_steps, warmup_lr, learning_rate)
+        return learning_rate
+
+    def forward(self, x):
+        """Run `x` through the wrapped network and return its output."""
+        output = self.model(x)
+        return output
+
+    def summary(self):
+        """Return a torchinfo summary of the network for a batch of one example, computed on the CPU."""
+        batched_input_size = (1,) + self.input_shape
+        return torchinfo.summary(self.model, input_size=batched_input_size, device='cpu')
+
+    def average_models(self, models=None):
+        """
+        Averages the weights of the provided models together to make a new model
+
+        Args:
+            models (list): Models sharing the same architecture (default None, which
+                           will use `self.best_models`)
+
+        Returns:
+            A deep copy of the first model, holding the element-wise mean of all state dicts
+        """
+        source_models = self.best_models if models is None else models
+
+        # The first model only provides the structure; its weights are zeroed in place
+        merged = copy.deepcopy(source_models[0])
+        merged_state = merged.state_dict()
+        for name in merged_state:
+            merged_state[name].mul_(0)
+
+        # Sum every model's weights into the running total ...
+        for candidate in source_models:
+            for name, tensor in candidate.state_dict().items():
+                merged_state[name].add_(tensor)
+
+        # ... and divide by the number of models to get the mean
+        n_models = len(source_models)
+        for name in merged_state:
+            merged_state[name].div_(n_models)
+
+        merged.load_state_dict(merged_state)
+        return merged
+
+    def _select_best_model(self, false_positive_validate_data, val_set_hrs=11.3, max_fp_per_hour=0.5, min_recall=0.20):
+        """
+        Select the single best checkpoint from `self.best_models`: the one with the highest
+        validation recall among those whose false positive rate is within `max_fp_per_hour`.
+
+        Args:
+            false_positive_validate_data (torch.DataLoader): A dataloader with validation data
+            val_set_hrs (float): Duration of the validation data in hours, used to turn
+                                 false positive counts into a rate
+            max_fp_per_hour (float): Checkpoints above this false positive rate are discarded
+            min_recall (float): The best candidate must have a recall above this value
+
+        Returns:
+            torch.nn.Module: The selected checkpoint, or None when no checkpoint qualifies
+        """
+        # Count false positives of every checkpoint over the whole validation set
+        false_positive_counts = [0]*len(self.best_models)
+        for batch in false_positive_validate_data:
+            x_val, y_val = batch[0].to(self.device), batch[1].to(self.device)
+            for mdl_ndx, model in tqdm(enumerate(self.best_models), total=len(self.best_models),
+                                       desc="Find best checkpoints by false positive rate"):
+                with torch.no_grad():
+                    val_ps = model(x_val)
+                    false_positive_counts[mdl_ndx] += self.fp(val_ps, y_val[..., None]).detach().cpu().numpy()
+        false_positive_rates = [fp/val_set_hrs for fp in false_positive_counts]
+
+        candidate_model_ndx = [ndx for ndx, fp in enumerate(false_positive_rates) if fp <= max_fp_per_hour]
+        if not candidate_model_ndx:
+            # Without this guard `max()` below would raise a ValueError on an empty list
+            logging.warning(f"No models with a false positive rate <= {max_fp_per_hour} per hour found!")
+            return None
+
+        candidate_model_recall = [self.best_model_scores[ndx]["val_recall"] for ndx in candidate_model_ndx]
+        best_candidate = int(np.argmax(candidate_model_recall))
+        if candidate_model_recall[best_candidate] <= min_recall:
+            logging.warning(f"No models with recall >= {min_recall} found!")
+            return None
+
+        best_ndx = candidate_model_ndx[best_candidate]
+        best_model_training_step = self.best_model_scores[best_ndx]["training_step_ndx"]
+        logging.info(f"Best model from training step {best_model_training_step} out of {len(candidate_model_ndx)} "
+                     f"models has recall of {candidate_model_recall[best_candidate]} and false positive rate of"
+                     f" {false_positive_rates[best_ndx]}")
+
+        return self.best_models[best_ndx]
+
+    def auto_train(self, X_train, X_val, false_positive_val_data, steps=50000, max_negative_weight=1000,
+                   target_fp_per_hour=0.2):
+        """A sequence of training steps that produce relatively strong models
+        automatically, based on validation data and performance targets provided.
+        After training merges the best checkpoints and returns a single model.
+
+        Three sequences are run: the first with `steps` steps and a learning rate of 1e-4,
+        the second and third with a tenth of the steps and a learning rate divided by 10 each time.
+
+        Args:
+            X_train: Iterable of training batches, passed to `train_model` as `X`
+            X_val: Iterable of validation batches of (inputs, labels)
+            false_positive_val_data: Iterable of batches used to count false positives
+            steps (int): Number of training steps of the first sequence
+            max_negative_weight (float): Final weight of negative examples in the first sequence
+            target_fp_per_hour (float): When `self.best_val_fp` is above this value the
+                                        negative weight is doubled before the next sequence
+
+        Returns:
+            torch.nn.Module: The average of the best checkpoints, or `self.model` when
+                             no checkpoint passes the percentile filters
+        """
+        # Duration (in hours) of the false positive validation data (hard-coded)
+        val_set_hrs = 11.3
+
+        # Step counts must be integers: they are compared to step indices and used to index schedules
+        steps = int(steps)
+
+        def run_sequence(n_steps, lr, max_weight, val_steps):
+            # The weight of negative examples ramps linearly from 1 to `max_weight`
+            weights = np.linspace(1, max_weight, n_steps).tolist()
+            self.train_model(
+                X=X_train,
+                X_val=X_val,
+                false_positive_val_data=false_positive_val_data,
+                max_steps=n_steps,
+                negative_weight_schedule=weights,
+                val_steps=val_steps, warmup_steps=n_steps//5,
+                hold_steps=n_steps//3, lr=lr, val_set_hrs=val_set_hrs)
+
+        # Sequence 1 (validates 20 times during the last quarter of the steps)
+        logging.info("#"*50 + "\nStarting training sequence 1...\n" + "#"*50)
+        lr = 0.0001
+        run_sequence(steps, lr, max_negative_weight,
+                     np.linspace(steps-int(steps*0.25), steps, 20).astype(np.int64))
+
+        # Sequences 2 and 3 (a tenth of the steps, validating 20 times over the whole sequence)
+        steps = max(steps // 10, 1)
+        for sequence_ndx in (2, 3):
+            logging.info("#"*50 + f"\nStarting training sequence {sequence_ndx}...\n" + "#"*50)
+            lr = lr/10
+
+            # Adjust weights as needed based on false positive per hour performance from the previous sequence
+            # NOTE(review): nothing in this class updates `self.best_val_fp` after `__init__`
+            # (where it is 1000), so unless a caller sets it this condition is always true - confirm intent
+            if self.best_val_fp > target_fp_per_hour:
+                max_negative_weight = max_negative_weight*2
+                logging.info("Increasing weight on negative examples to reduce false positives...")
+
+            # int64 (not int16) so that large step counts do not overflow
+            run_sequence(steps, lr, max_negative_weight, np.linspace(1, steps, 20).astype(np.int64))
+
+        # Merge best models
+        logging.info("Merging checkpoints above the 90th percentile into single model...")
+        accuracy_percentile = np.percentile(self.history["val_accuracy"], 90)
+        recall_percentile = np.percentile(self.history["val_recall"], 90)
+        fp_percentile = np.percentile(self.history["val_fp_per_hr"], 10)
+
+        # Get models above the 90th percentile (below the 10th percentile for false positives)
+        models = []
+        for model, score in zip(self.best_models, self.best_model_scores):
+            if score["val_accuracy"] >= accuracy_percentile and \
+                    score["val_recall"] >= recall_percentile and \
+                    score["val_fp_per_hr"] <= fp_percentile:
+                models.append(model)
+
+        if len(models) > 0:
+            combined_model = self.average_models(models=models)
+        else:
+            combined_model = self.model
+
+        # Report validation metrics for combined model
+        with torch.no_grad():
+            # NOTE(review): only the predictions of the last batch of `X_val` are scored below
+            for batch in X_val:
+                x, y = batch[0].to(self.device), batch[1].to(self.device)
+                val_ps = combined_model(x)
+
+            combined_model_recall = self.recall(val_ps, y[..., None]).detach().cpu().numpy()
+            combined_model_accuracy = self.accuracy(val_ps, y[..., None].to(torch.int64)).detach().cpu().numpy()
+
+            combined_model_fp = 0
+            for batch in false_positive_val_data:
+                x_val, y_val = batch[0].to(self.device), batch[1].to(self.device)
+                val_ps = combined_model(x_val)
+                combined_model_fp += self.fp(val_ps, y_val[..., None])
+
+            combined_model_fp_per_hr = (combined_model_fp/val_set_hrs).detach().cpu().numpy()
+
+        logging.info(f"\n################\nFinal Model Accuracy: {combined_model_accuracy}"
+                     f"\nFinal Model Recall: {combined_model_recall}\nFinal Model False Positives per Hour: {combined_model_fp_per_hr}"
+                     "\n################\n")
+
+        return combined_model
+
+    def predict_on_features(self, features, model=None):
+        """
+        Predict on Tensors of openWakeWord features corresponding to single audio clips
+
+        Every clip is split into overlapping windows of 16 frames (step size of 1 frame) and
+        the model is run on all windows of the clip at once.
+
+        Args:
+            features (torch.Tensor): A Tensor of openWakeWord features with shape (batch, frames, features),
+                                     or (frames, features) for a single clip
+            model (torch.nn.Module): A Pytorch model to use for prediction (default None, which will use self.model)
+
+        Returns:
+            np.ndarray: An array of predictions with one row of per-window predictions per clip
+        """
+        # Add the batch dimension for a single clip (check the rank, not the batch size)
+        if features.ndim < 3:
+            features = features[None, ]
+
+        predictor = self.model if model is None else model
+        features = features.to(self.device)
+        predictions = []
+        with torch.no_grad():
+            for x in tqdm(features, desc="Predicting on clips"):
+                x = x[None, ]
+                # step size of 1 (80 ms)
+                windows = [x[:, i:i+16, :] for i in range(0, x.shape[1]-16, 1)]
+                preds = predictor(torch.vstack(windows))
+                predictions.append(preds.detach().cpu().numpy()[None, ])
+
+        return np.vstack(predictions)
+
+    def predict_on_clips(self, clips, model=None):
+        """
+        Predict on arrays of 16-bit 16 khz audio data
+
+        Args:
+            clips (np.ndarray): A Numpy array of audio clips with shape (batch, samples)
+            model (torch.nn.Module): A Pytorch model to use for prediction (default None, which will use self.model)
+
+        Returns:
+            np.ndarray: The predictions returned by `predict_on_features` for the clips
+        """
+        # Turn the raw audio into openWakeWord features on the CPU
+        feature_extractor = AudioFeatures(device='cpu', ncpu=4)
+        clip_features = torch.from_numpy(feature_extractor.embed_clips(clips, batch_size=16))
+
+        # Score the features with the requested model
+        return self.predict_on_features(clip_features, model=model)
+
+    def export_model(self, model, model_name, output_dir):
+        """
+        Saves the trained openwakeword model in the ONNX format as `<output_dir>/<model_name>.onnx`
+
+        Only the ONNX file is written by this method; no tflite file is produced here.
+
+        Args:
+            model (torch.nn.Module): The model to export (a copy is moved to the CPU, the original is untouched)
+            model_name (str): The file name of the exported model, without extension
+            output_dir (str): The directory to write the file to
+
+        Raises:
+            ValueError: If the model has more than one class
+        """
+
+        if self.n_classes != 1:
+            raise ValueError("Exporting models to both onnx and tflite with more than one class is currently not supported! "
+                             "Use the `export_to_onnx` function instead.")
+
+        # Save ONNX model
+        onnx_path = os.path.join(output_dir, model_name + ".onnx")
+        logging.info(f"####\nSaving ONNX model as '{onnx_path}'")
+        model_to_save = copy.deepcopy(model)
+        torch.onnx.export(model_to_save.to("cpu"), torch.rand(self.input_shape)[None, ],
+                          onnx_path, opset_version=13)
+
+        return None
+
+ def train_model(self, X, max_steps, warmup_steps, hold_steps, X_val=None,
+ false_positive_val_data=None, positive_test_clips=None,
+ negative_weight_schedule=[1],
+ val_steps=[250], lr=0.0001, val_set_hrs=1):
+ """
+ Train `self.model` on batches drawn from `X`, validating and saving checkpoints at the steps in `val_steps`.
+
+ Args:
+ X: Iterable of training batches; each batch is indexed as [inputs, labels]
+ max_steps: Training stops after the step with index `max_steps - 1`; also the length of the learning rate schedule
+ warmup_steps: Warmup length passed to `lr_warmup_cosine_decay`
+ hold_steps: Hold length passed to `lr_warmup_cosine_decay`
+ X_val: Optional iterable of (inputs, labels) batches used for validation recall, accuracy and false positive count
+ false_positive_val_data: Optional iterable of (inputs, labels) batches used to count false positives per hour
+ positive_test_clips: Optional iterable whose items hold clip features in position 0, scored with sliding windows of 16 frames
+ negative_weight_schedule: Loss weight of negative examples; a single value is used for every step, otherwise it is indexed by step
+ val_steps: Step indices at which validation is run (indices <= 1 are ignored)
+ lr: Target learning rate passed to `lr_warmup_cosine_decay`
+ val_set_hrs: Divisor that turns the false positive count of `false_positive_val_data` into a per-hour rate
+
+ Results are appended to `self.history`, `self.best_models` and `self.best_model_scores`; nothing is returned.
+
+ NOTE(review): the list defaults are shared between calls; they are only read here, never mutated.
+ """
+ # Move models and main class to target device
+ self.to(self.device)
+ self.model.to(self.device)
+
+ # Train model
+ accumulation_steps = 1
+ accumulated_samples = 0
+ accumulated_predictions = torch.Tensor([]).to(self.device)
+ accumulated_labels = torch.Tensor([]).to(self.device)
+ for step_ndx, data in tqdm(enumerate(X, 0), total=max_steps, desc="Training"):
+ # get the inputs; data is a list of [inputs, labels]
+ x, y = data[0].to(self.device), data[1].to(self.device)
+ y_ = y[..., None].to(torch.float32)
+
+ # Update learning rates
+ for g in self.optimizer.param_groups:
+ g['lr'] = self.lr_warmup_cosine_decay(step_ndx, warmup_steps=warmup_steps, hold=hold_steps,
+ total_steps=max_steps, target_lr=lr)
+
+ # zero the parameter gradients
+ self.optimizer.zero_grad()
+
+ # Get predictions for batch
+ predictions = self.model(x)
+
+ # Construct batch with only samples that have high loss
+ # (negatives predicted >= 0.001 and positives predicted < 0.999; labels are filtered the same way)
+ neg_high_loss = predictions[(y == 0) & (predictions.squeeze() >= 0.001)] # thresholds were chosen arbitrarily but work well
+ pos_high_loss = predictions[(y == 1) & (predictions.squeeze() < 0.999)]
+ y = torch.cat((y[(y == 0) & (predictions.squeeze() >= 0.001)], y[(y == 1) & (predictions.squeeze() < 0.999)]))
+ y_ = y[..., None].to(torch.float32)
+ predictions = torch.cat((neg_high_loss, pos_high_loss))
+
+ # Set weights for batch
+ # (negatives get the scheduled weight, positives always get a weight of 1)
+ # NOTE(review): with a multi-step schedule `w` is only assigned when n_classes == 1 - confirm the multi-class path
+ if len(negative_weight_schedule) == 1:
+ w = torch.ones(y.shape[0])*negative_weight_schedule[0]
+ pos_ndcs = y == 1
+ w[pos_ndcs] = 1
+ w = w[..., None]
+ else:
+ if self.n_classes == 1:
+ w = torch.ones(y.shape[0])*negative_weight_schedule[step_ndx]
+ pos_ndcs = y == 1
+ w[pos_ndcs] = 1
+ w = w[..., None]
+
+ if predictions.shape[0] != 0:
+ # Do backpropagation, with gradient accumulation if the batch-size after selecting high loss examples is too small
+ # NOTE(review): `backward()` is only called in the `else` branch below and the gradients are zeroed every step,
+ # so the loss of batches that are merely accumulated does not appear to reach the optimizer - confirm
+ loss = self.loss(predictions, y_ if self.n_classes == 1 else y, w.to(self.device))
+ loss = loss/accumulation_steps
+ accumulated_samples += predictions.shape[0]
+
+ if predictions.shape[0] >= 128:
+ accumulated_predictions = predictions
+ accumulated_labels = y_
+ if accumulated_samples < 128:
+ accumulation_steps += 1
+ accumulated_predictions = torch.cat((accumulated_predictions, predictions))
+ accumulated_labels = torch.cat((accumulated_labels, y_))
+ else:
+ loss.backward()
+ self.optimizer.step()
+ accumulation_steps = 1
+ accumulated_samples = 0
+
+ self.history["loss"].append(loss.detach().cpu().numpy())
+
+ # Compute training metrics and log them
+ fp = self.fp(accumulated_predictions, accumulated_labels if self.n_classes == 1 else y)
+ self.n_fp += fp
+ self.history["recall"].append(self.recall(accumulated_predictions, accumulated_labels).detach().cpu().numpy())
+
+ # Reset the accumulators for the next optimizer step
+ accumulated_predictions = torch.Tensor([]).to(self.device)
+ accumulated_labels = torch.Tensor([]).to(self.device)
+
+ # Run validation and log validation metrics
+ if step_ndx in val_steps and step_ndx > 1 and false_positive_val_data is not None:
+ # Get false positives per hour with false positive data
+ val_fp = 0
+ for val_step_ndx, data in enumerate(false_positive_val_data):
+ with torch.no_grad():
+ x_val, y_val = data[0].to(self.device), data[1].to(self.device)
+ val_predictions = self.model(x_val)
+ val_fp += self.fp(val_predictions, y_val[..., None])
+ val_fp_per_hr = (val_fp/val_set_hrs).detach().cpu().numpy()
+ self.history["val_fp_per_hr"].append(val_fp_per_hr)
+
+ # Get recall on test clips
+ # (a clip counts as detected when any of its 16-frame windows scores >= 0.5)
+ if step_ndx in val_steps and step_ndx > 1 and positive_test_clips is not None:
+ tp = 0
+ fn = 0
+ for val_step_ndx, data in enumerate(positive_test_clips):
+ with torch.no_grad():
+ x_val = data[0].to(self.device)
+ batch = []
+ for i in range(0, x_val.shape[1]-16, 1):
+ batch.append(x_val[:, i:i+16, :])
+ batch = torch.vstack(batch)
+ preds = self.model(batch)
+ if any(preds >= 0.5):
+ tp += 1
+ else:
+ fn += 1
+ self.history["positive_test_clips_recall"].append(tp/(tp + fn))
+
+ if step_ndx in val_steps and step_ndx > 1 and X_val is not None:
+ # Get metrics for balanced test examples of positive and negative clips
+ # NOTE(review): the metrics are overwritten for every batch, so the values logged
+ # below appear to come from the last batch of `X_val` only - confirm
+ for val_step_ndx, data in enumerate(X_val):
+ with torch.no_grad():
+ x_val, y_val = data[0].to(self.device), data[1].to(self.device)
+ val_predictions = self.model(x_val)
+ val_recall = self.recall(val_predictions, y_val[..., None]).detach().cpu().numpy()
+ val_acc = self.accuracy(val_predictions, y_val[..., None].to(torch.int64))
+ val_fp = self.fp(val_predictions, y_val[..., None])
+ self.history["val_accuracy"].append(val_acc.detach().cpu().numpy())
+ self.history["val_recall"].append(val_recall)
+ self.history["val_n_fp"].append(val_fp.detach().cpu().numpy())
+
+ # Save a checkpoint when the latest validation false positive count is at or below the
+ # 50th percentile, and the latest validation recall at or above the 5th percentile,
+ # of the validation scores up to that point
+ # NOTE(review): this reads `val_n_fp`/`val_recall`, which are only filled when `X_val` is given - confirm callers
+ if step_ndx in val_steps and step_ndx > 1:
+ if self.history["val_n_fp"][-1] <= np.percentile(self.history["val_n_fp"], 50) and \
+ self.history["val_recall"][-1] >= np.percentile(self.history["val_recall"], 5):
+ # logging.info("Saving checkpoint with metrics >= to targets!")
+ self.best_models.append(copy.deepcopy(self.model))
+ self.best_model_scores.append({"training_step_ndx": step_ndx, "val_n_fp": self.history["val_n_fp"][-1],
+ "val_recall": self.history["val_recall"][-1],
+ "val_accuracy": self.history["val_accuracy"][-1],
+ "val_fp_per_hr": self.history.get("val_fp_per_hr", [0])[-1]})
+ self.best_val_recall = self.history["val_recall"][-1]
+ self.best_val_accuracy = self.history["val_accuracy"][-1]
+
+ # Stop once `max_steps` steps have been processed, even if `X` yields more batches
+ if step_ndx == max_steps-1:
+ break
+
+
+# Separate function to convert onnx models to tflite format
+def convert_onnx_to_tflite(onnx_model_path, output_path):
+    """
+    Converts an ONNX version of an openwakeword model to the Tensorflow tflite format.
+
+    The ONNX model is first exported as a Tensorflow saved model into a temporary
+    directory, which is then converted to tflite and written to `output_path`.
+
+    Args:
+        onnx_model_path (str): The location of the ONNX model to convert
+        output_path (str): The location of the tflite file to write
+    """
+    # Imported here so that the heavy conversion dependencies are only needed when converting
+    import onnx
+    from onnx_tf.backend import prepare
+    import tensorflow as tf
+
+    # Convert to tflite from onnx model
+    onnx_model = onnx.load(onnx_model_path)
+    tf_rep = prepare(onnx_model, device="CPU")
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        saved_model_dir = os.path.join(tmp_dir, "tf_model")
+        tf_rep.export_graph(saved_model_dir)
+        converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+        tflite_model = converter.convert()
+
+    logging.info(f"####\nSaving tflite model to '{output_path}'")
+    with open(output_path, 'wb') as f:
+        f.write(tflite_model)
+
+    return None
+
+
+if __name__ == '__main__':
+ # Get training config file
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--training_config",
+ help="The path to the training config file (required)",
+ type=str,
+ required=True
+ )
+ parser.add_argument(
+ "--generate_clips",
+ help="Execute the synthetic data generation process",
+ action="https://github.com/store_true",
+ default="False",
+ required=False
+ )
+ parser.add_argument(
+ "--augment_clips",
+ help="Execute the synthetic data augmentation process",
+ action="https://github.com/store_true",
+ default="False",
+ required=False
+ )
+ parser.add_argument(
+ "--overwrite",
+ help="Overwrite existing openwakeword features when the --augment_clips flag is used",
+ action="https://github.com/store_true",
+ default="False",
+ required=False
+ )
+ parser.add_argument(
+ "--train_model",
+ help="Execute the model training process",
+ action="https://github.com/store_true",
+ default="False",
+ required=False
+ )
+
+ args = parser.parse_args()
+ config = yaml.load(open(args.training_config, 'r').read(), yaml.Loader)
+
+ # imports Piper for synthetic sample generation
+ sys.path.insert(0, os.path.abspath(config["piper_sample_generator_path"]))
+ from generate_samples import generate_samples
+
+ # Define output locations
+ config["output_dir"] = os.path.abspath(config["output_dir"])
+ if not os.path.exists(config["output_dir"]):
+ os.mkdir(config["output_dir"])
+ if not os.path.exists(os.path.join(config["output_dir"], config["model_name"])):
+ os.mkdir(os.path.join(config["output_dir"], config["model_name"]))
+
+ positive_train_output_dir = os.path.join(config["output_dir"], config["model_name"], "positive_train")
+ positive_test_output_dir = os.path.join(config["output_dir"], config["model_name"], "positive_test")
+ negative_train_output_dir = os.path.join(config["output_dir"], config["model_name"], "negative_train")
+ negative_test_output_dir = os.path.join(config["output_dir"], config["model_name"], "negative_test")
+ feature_save_dir = os.path.join(config["output_dir"], config["model_name"])
+
+ # Get paths for impulse response and background audio files
+ rir_paths = [i.path for j in config["rir_paths"] for i in os.scandir(j)]
+ background_paths = []
+ if len(config["background_paths_duplication_rate"]) != len(config["background_paths"]):
+ config["background_paths_duplication_rate"] = [1]*len(config["background_paths"])
+ for background_path, duplication_rate in zip(config["background_paths"], config["background_paths_duplication_rate"]):
+ background_paths.extend([i.path for i in os.scandir(background_path)]*duplication_rate)
+
+ if args.generate_clips is True:
+ # Generate positive clips for training
+ logging.info("#"*50 + "\nGenerating positive clips for training\n" + "#"*50)
+ if not os.path.exists(positive_train_output_dir):
+ os.mkdir(positive_train_output_dir)
+ n_current_samples = len(os.listdir(positive_train_output_dir))
+ if n_current_samples <= 0.95*config["n_samples"]:
+ generate_samples(
+ text=config["target_phrase"], max_samples=config["n_samples"]-n_current_samples,
+ batch_size=config["tts_batch_size"],
+ noise_scales=[0.98], noise_scale_ws=[0.98], length_scales=[0.75, 1.0, 1.25],
+ output_dir=positive_train_output_dir, auto_reduce_batch_size=True,
+ file_names=[uuid.uuid4().hex + ".wav" for i in range(config["n_samples"])]
+ )
+ torch.cuda.empty_cache()
+ else:
+ logging.warning(f"Skipping generation of positive clips for training, as ~{config['n_samples']} already exist")
+
+ # Generate positive clips for testing
+ logging.info("#"*50 + "\nGenerating positive clips for testing\n" + "#"*50)
+ if not os.path.exists(positive_test_output_dir):
+ os.mkdir(positive_test_output_dir)
+ n_current_samples = len(os.listdir(positive_test_output_dir))
+ if n_current_samples <= 0.95*config["n_samples_val"]:
+ generate_samples(text=config["target_phrase"], max_samples=config["n_samples_val"]-n_current_samples,
+ batch_size=config["tts_batch_size"],
+ noise_scales=[1.0], noise_scale_ws=[1.0], length_scales=[0.75, 1.0, 1.25],
+ output_dir=positive_test_output_dir, auto_reduce_batch_size=True)
+ torch.cuda.empty_cache()
+ else:
+ logging.warning(f"Skipping generation of positive clips testing, as ~{config['n_samples_val']} already exist")
+
+        # Generate adversarial negative clips for training
+        logging.info("#"*50 + "\nGenerating negative clips for training\n" + "#"*50)
+        if not os.path.exists(negative_train_output_dir):
+            os.mkdir(negative_train_output_dir)
+        n_current_samples = len(os.listdir(negative_train_output_dir))
+        if n_current_samples <= 0.95*config["n_samples"]:
+            # Copy the user-supplied phrases: extending config["custom_negative_phrases"] in place would
+            # carry every generated training phrase over into the testing phrases built further down
+            adversarial_texts = list(config["custom_negative_phrases"])
+            for target_phrase in config["target_phrase"]:
+                adversarial_texts.extend(generate_adversarial_texts(
+                    input_text=target_phrase,
+                    N=config["n_samples"]//len(config["target_phrase"]),
+                    include_partial_phrase=1.0,
+                    include_input_words=0.2))
+            generate_samples(text=adversarial_texts, max_samples=config["n_samples"]-n_current_samples,
+                             batch_size=config["tts_batch_size"]//7,
+                             noise_scales=[0.98], noise_scale_ws=[0.98], length_scales=[0.75, 1.0, 1.25],
+                             output_dir=negative_train_output_dir, auto_reduce_batch_size=True,
+                             file_names=[uuid.uuid4().hex + ".wav" for _ in range(config["n_samples"])]
+                             )
+            torch.cuda.empty_cache()
+        else:
+            logging.warning(f"Skipping generation of negative clips for training, as ~{config['n_samples']} already exist")
+
+        # Generate adversarial negative clips for testing
+        logging.info("#"*50 + "\nGenerating negative clips for testing\n" + "#"*50)
+        if not os.path.exists(negative_test_output_dir):
+            os.mkdir(negative_test_output_dir)
+        n_current_samples = len(os.listdir(negative_test_output_dir))
+        if n_current_samples <= 0.95*config["n_samples_val"]:
+            # Start again from a fresh copy of the user-supplied phrases, so that the testing phrases are
+            # generated independently of the training phrases and config is left untouched
+            adversarial_texts = list(config["custom_negative_phrases"])
+            for target_phrase in config["target_phrase"]:
+                adversarial_texts.extend(generate_adversarial_texts(
+                    input_text=target_phrase,
+                    N=config["n_samples_val"]//len(config["target_phrase"]),
+                    include_partial_phrase=1.0,
+                    include_input_words=0.2))
+            generate_samples(text=adversarial_texts, max_samples=config["n_samples_val"]-n_current_samples,
+                             batch_size=config["tts_batch_size"]//7,
+                             noise_scales=[1.0], noise_scale_ws=[1.0], length_scales=[0.75, 1.0, 1.25],
+                             output_dir=negative_test_output_dir, auto_reduce_batch_size=True)
+            torch.cuda.empty_cache()
+        else:
+            logging.warning(f"Skipping generation of negative clips for testing, as ~{config['n_samples_val']} already exist")
+
+        # Pick the total training clip length (in samples) from the generated positive test clips: take the
+        # median length of a random sample of them, round it to the nearest 1000 samples and add a buffer.
+        # Anything that ends up at or below 36000 samples is set to 32000, as it's a good default value
+        n = 50  # sample size
+        positive_clips = list(map(str, Path(positive_test_output_dir).glob("*.wav")))
+        duration_in_samples = []
+        for _ in range(n):
+            clip_path = positive_clips[np.random.randint(0, len(positive_clips))]  # sampled with replacement
+            _, samples = scipy.io.wavfile.read(clip_path)
+            duration_in_samples.append(len(samples))
+
+        median_length = int(round(np.median(duration_in_samples)/1000)*1000)
+        padded_length = median_length + 12000  # add 750 ms to clip duration as buffer
+        # 32000 samples (2 seconds) is the minimum, and also the value used when within 4000 samples of it
+        config["total_length"] = 32000 if padded_length <= 36000 else padded_length
+
+    # Do Data Augmentation
+    if args.augment_clips is True:
+        if not os.path.exists(os.path.join(feature_save_dir, "positive_features_train.npy")) or args.overwrite is True:
+            # Build one augmentation generator per data split. Each list of clip paths is repeated
+            # config["augmentation_rounds"] times, so every clip is handed to augment_clips that many times
+            positive_clips_train = [str(i) for i in Path(positive_train_output_dir).glob("*.wav")]*config["augmentation_rounds"]
+            positive_clips_train_generator = augment_clips(positive_clips_train, total_length=config["total_length"],
+                                                           batch_size=config["augmentation_batch_size"],
+                                                           background_clip_paths=background_paths,
+                                                           RIR_paths=rir_paths)
+
+            positive_clips_test = [str(i) for i in Path(positive_test_output_dir).glob("*.wav")]*config["augmentation_rounds"]
+            positive_clips_test_generator = augment_clips(positive_clips_test, total_length=config["total_length"],
+                                                          batch_size=config["augmentation_batch_size"],
+                                                          background_clip_paths=background_paths,
+                                                          RIR_paths=rir_paths)
+
+            negative_clips_train = [str(i) for i in Path(negative_train_output_dir).glob("*.wav")]*config["augmentation_rounds"]
+            negative_clips_train_generator = augment_clips(negative_clips_train, total_length=config["total_length"],
+                                                           batch_size=config["augmentation_batch_size"],
+                                                           background_clip_paths=background_paths,
+                                                           RIR_paths=rir_paths)
+
+            negative_clips_test = [str(i) for i in Path(negative_test_output_dir).glob("*.wav")]*config["augmentation_rounds"]
+            negative_clips_test_generator = augment_clips(negative_clips_test, total_length=config["total_length"],
+                                                          batch_size=config["augmentation_batch_size"],
+                                                          background_clip_paths=background_paths,
+                                                          RIR_paths=rir_paths)
+
+            # Compute features and save to disk via memmapped arrays
+            logging.info("#"*50 + "\nComputing openwakeword features for generated samples\n" + "#"*50)
+
+            # Use half of the available cores, but never fewer than one (os.cpu_count() may return None or 1)
+            n_cpus = max(1, (os.cpu_count() or 1)//2)
+            feature_device = "gpu" if torch.cuda.is_available() else "cpu"
+            feature_ncpu = n_cpus if feature_device == "cpu" else 1  # multiple cores are only used on CPU
+
+            # n_total is the length of the (repeated) clip list given to each generator rather than the number
+            # of files on disk, because compute_features_from_generator stops once n_total rows are written and
+            # would otherwise discard every augmentation round after the first.
+            # NOTE(review): assumes augment_clips yields one row per input path; if it yields fewer, the unused
+            # rows are trimmed from the output file by compute_features_from_generator
+            compute_features_from_generator(positive_clips_train_generator, n_total=len(positive_clips_train),
+                                            clip_duration=config["total_length"],
+                                            output_file=os.path.join(feature_save_dir, "positive_features_train.npy"),
+                                            device=feature_device, ncpu=feature_ncpu)
+
+            compute_features_from_generator(negative_clips_train_generator, n_total=len(negative_clips_train),
+                                            clip_duration=config["total_length"],
+                                            output_file=os.path.join(feature_save_dir, "negative_features_train.npy"),
+                                            device=feature_device, ncpu=feature_ncpu)
+
+            compute_features_from_generator(positive_clips_test_generator, n_total=len(positive_clips_test),
+                                            clip_duration=config["total_length"],
+                                            output_file=os.path.join(feature_save_dir, "positive_features_test.npy"),
+                                            device=feature_device, ncpu=feature_ncpu)
+
+            compute_features_from_generator(negative_clips_test_generator, n_total=len(negative_clips_test),
+                                            clip_duration=config["total_length"],
+                                            output_file=os.path.join(feature_save_dir, "negative_features_test.npy"),
+                                            device=feature_device, ncpu=feature_ncpu)
+        else:
+            logging.warning("Openwakeword features already exist, skipping data augmentation and feature generation")
+
+    # Create openwakeword model
+    if args.train_model is True:
+        F = openwakeword.utils.AudioFeatures(device='cpu')
+        # Use true division here: the features are computed by compute_features_from_generator with
+        # clip_duration/16000, so flooring the duration (e.g., 40000 samples -> 2 s instead of 2.5 s) would
+        # give the model an input shape that does not match the saved features
+        input_shape = F.get_embedding_shape(config["total_length"]/16000)  # training data is always 16 khz
+
+        # seconds_per_example is derived from the number of embedding rows: 1280 samples per row at 16 khz
+        oww = Model(n_classes=1, input_shape=input_shape, model_type=config["model_type"],
+                    layer_dim=config["layer_size"], seconds_per_example=1280*input_shape[0]/16000)
+
+        # Create data transform function for batch generation to handle different clip lengths (todo: write tests for this)
+        def f(x, n=16):
+            """
+            Re-window a batch of features so that every example has `n` rows.
+
+            A batch whose second dimension already equals `n` is returned unchanged. Otherwise the batch is
+            stacked along its first axis and cut into consecutive, non-overlapping windows of `n` rows.
+            """
+            if x.shape[1] == n:
+                return x
+
+            stacked = np.vstack(x)
+            # Start offsets stop before `stacked.shape[0] - n`, so a trailing partial window is never produced
+            # (and the final window is also left out when the row count is an exact multiple of `n`)
+            window_starts = range(0, stacked.shape[0]-n, n)
+            return np.array([stacked[start:start+n, :] for start in window_starts])
+
+        # Create data and label transforms as needed for model (currently only supports binary classification models).
+        # Only the feature files listed in the config get the data transform, so this has to happen before the
+        # generated "positive" and "adversarial_negative" files are added to the dictionary
+        data_transforms = dict.fromkeys(config["feature_data_files"], f)
+        label_keys = ["positive", *config["feature_data_files"], "adversarial_negative"]
+        label_transforms = {
+            label: (lambda x: [1 for _ in x]) if label == "positive" else (lambda x: [0 for _ in x])
+            for label in label_keys
+        }
+
+        # Add generated positive and adversarial negative clips to the feature data files dictionary
+        config["feature_data_files"].update(
+            positive=os.path.join(feature_save_dir, "positive_features_train.npy"),
+            adversarial_negative=os.path.join(feature_save_dir, "negative_features_train.npy"),
+        )
+
+        # Make the batch generator that feeds the PyTorch data loader for the training data
+        batch_generator = mmap_batch_generator(
+            config["feature_data_files"],
+            n_per_class=config["batch_n_per_class"],
+            data_transform_funcs=data_transforms,
+            label_transform_funcs=label_transforms,
+        )
+
+        class IterDataset(torch.utils.data.IterableDataset):
+            """Minimal IterableDataset that exposes an existing generator to a PyTorch DataLoader"""
+            def __init__(self, generator):
+                # The generator is stored as-is; nothing is consumed from it here
+                self.generator = generator
+
+            def __iter__(self):
+                # Return the wrapped generator itself rather than creating a new iterator
+                return self.generator
+
+        # Use half of the available cores for data loading, but always at least one worker: DataLoader
+        # rejects a prefetch_factor when num_workers is 0, which the plain `cpu_count()//2` produced on a
+        # single core machine (os.cpu_count() may also return None)
+        n_cpus = max(1, (os.cpu_count() or 1)//2)
+        X_train = torch.utils.data.DataLoader(IterDataset(batch_generator),
+                                              batch_size=None, num_workers=n_cpus, prefetch_factor=16)
+
+        # False-positive validation data: slide a window of input_shape[0] rows (step of 1) over the loaded
+        # features to match the model input, label every window 0, and serve everything as a single batch
+        window_size = input_shape[0]
+        fp_features = np.load(config["false_positive_validation_data_path"])
+        fp_windows = np.array([fp_features[start:start+window_size]
+                               for start in range(fp_features.shape[0]-window_size)])
+        fp_labels = np.zeros(fp_windows.shape[0]).astype(np.float32)
+        fp_dataset = torch.utils.data.TensorDataset(torch.from_numpy(fp_windows), torch.from_numpy(fp_labels))
+        X_val_fp = torch.utils.data.DataLoader(fp_dataset, batch_size=len(fp_labels))
+
+        # Validation data: the generated positive test features (label 1) followed by the adversarial
+        # negative test features (label 0), again served as a single batch
+        X_val_pos = np.load(os.path.join(feature_save_dir, "positive_features_test.npy"))
+        X_val_neg = np.load(os.path.join(feature_save_dir, "negative_features_test.npy"))
+        labels = np.concatenate((np.ones(X_val_pos.shape[0]), np.zeros(X_val_neg.shape[0]))).astype(np.float32)
+        val_features = np.vstack((X_val_pos, X_val_neg))
+
+        val_dataset = torch.utils.data.TensorDataset(torch.from_numpy(val_features), torch.from_numpy(labels))
+        X_val = torch.utils.data.DataLoader(val_dataset, batch_size=len(labels))
+
+        # Run auto training with the training batches, the labelled validation clips, and the
+        # false-positive validation data; the step count, maximum negative weight and target
+        # false positives per hour all come from the config
+        best_model = oww.auto_train(
+            X_train=X_train,
+            X_val=X_val,
+            false_positive_val_data=X_val_fp,
+            steps=config["steps"],
+            max_negative_weight=config["max_negative_weight"],
+            target_fp_per_hour=config["target_false_positives_per_hour"],
+        )
+
+        # Export the model returned by auto_train to onnx, named config["model_name"] in config["output_dir"]
+        oww.export_model(model=best_model, model_name=config["model_name"], output_dir=config["output_dir"])
+
+        # Convert the model from onnx to tflite format (<output_dir>/<model_name>.onnx -> <output_dir>/<model_name>.tflite)
+        convert_onnx_to_tflite(os.path.join(config["output_dir"], config["model_name"] + ".onnx"),
+                               os.path.join(config["output_dir"], config["model_name"] + ".tflite"))
diff --git a/openwakeword/utils.py b/openwakeword/utils.py
index c4f9b15..4964706 100644
--- a/openwakeword/utils.py
+++ b/openwakeword/utils.py
@@ -21,8 +21,11 @@
from multiprocessing import Process, Queue
import time
import logging
+from tqdm import tqdm
import openwakeword
+from numpy.lib.format import open_memmap
from typing import Union, List, Callable, Deque
+import requests
# Base class for computing audio features using Google's speech_embedding
@@ -157,7 +160,7 @@ def tflite_embedding_predict(x):
self.embedding_model_predict = tflite_embedding_predict
- # Create databuffers
+ # Create databuffers with empty/random data
self.raw_data_buffer: Deque = deque(maxlen=sr*10)
self.melspectrogram_buffer = np.ones((76, 32)) # n_frames x num_features
self.melspectrogram_max_len = 10*97 # 97 is the number of frames in 1 second of 16hz audio
@@ -166,6 +169,14 @@ def tflite_embedding_predict(x):
self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16))
self.feature_buffer_max_len = 120 # ~10 seconds of feature buffer history
+    def reset(self):
+        """
+        Reset the internal buffers.
+
+        Clears the raw audio buffer, sets the melspectrogram buffer back to an array of ones with shape
+        (76, 32), zeroes the accumulated sample count, empties the raw data remainder, and replaces the
+        feature buffer with the embeddings of 4*16000 random 16-bit samples.
+        """
+        self.raw_data_buffer.clear()
+        self.melspectrogram_buffer = np.ones((76, 32))
+        self.accumulated_samples = 0
+        self.raw_data_remainder = np.empty(0)
+        # Refill the feature buffer from random audio rather than leaving it empty
+        self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16))
+
+
def _get_melspectrogram(self, x: Union[np.ndarray, List], melspec_transform: Callable = lambda x: x/10 + 2):
"""
Function to compute the mel-spectrogram of the provided audio samples.
@@ -266,8 +277,9 @@ def _get_melspectrogram_batch(self, x, batch_size=128, ncpu=1):
result = self._get_melspectrogram(batch)
elif pool:
+ chunksize = batch.shape[0]//ncpu if batch.shape[0] >= ncpu else 1
result = np.array(pool.map(self._get_melspectrogram,
- batch, chunksize=batch.shape[0]//ncpu))
+ batch, chunksize=chunksize))
melspecs[i:i+batch_size, :, :] = result.squeeze()
@@ -327,8 +339,9 @@ def _get_embeddings_batch(self, x, batch_size=128, ncpu=1):
result = self.embedding_model_predict(batch)
elif pool:
+ chunksize = batch.shape[0]//ncpu if batch.shape[0] >= ncpu else 1
result = np.array(pool.map(self._get_embeddings_from_melspec,
- batch, chunksize=batch.shape[0]//ncpu))
+ batch, chunksize=chunksize))
for j, ndx2 in zip(range(0, result.shape[0], n_frames), ndcs):
embeddings[ndx2, :, :] = result[j:j+n_frames]
@@ -526,6 +539,140 @@ def f(clips):
return {list(i.keys())[0]: list(i.values())[0] for i in results}
+def compute_features_from_generator(generator, n_total, clip_duration, output_file, device="cpu", ncpu=1):
+    """
+    Computes audio features from a generator that produces Numpy arrays of shape (batch_size, samples)
+    containing 16-bit PCM audio data.
+
+    Args:
+        generator (Generator): The generator that produces the arrays of audio data
+        n_total (int): The total number of rows (audio clips) that the generator will produce.
+                       Ideally this is precise, but it can be approximate as well as the output
+                       .npy file will be automatically trimmed to remove empty values. Rows produced
+                       beyond `n_total` are discarded.
+        clip_duration (float): The duration (in samples) of the audio produced by the generator
+        output_file (str): The output file (.npy) containing the audio features. Note that this file
+                           will be written to using memmap arrays, so it can be substantially larger
+                           than the available system memory.
+        device (str): The device ("cpu" or "gpu") to use for computing features.
+        ncpu (int): The number of cores to use when processing the audio features (if computing on CPU)
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If the first batch from the generator has more rows than `n_total`.
+    """
+    # Function specific imports
+    import itertools
+    from openwakeword.data import trim_mmap
+
+    # Create audio features object
+    F = AudioFeatures(device=device)
+
+    # Determine the output shape and create output file
+    n_feature_cols = F.get_embedding_shape(clip_duration/16000)
+    output_shape = (n_total, n_feature_cols[0], n_feature_cols[1])
+    fp = open_memmap(output_file, mode='w+', dtype=np.float32, shape=output_shape)
+
+    # Get batch size by pulling one value from the generator
+    first_batch = next(generator)
+    batch_size = first_batch.shape[0]
+
+    if batch_size > n_total:
+        raise ValueError(f"The value of 'n_total' ({n_total}) is less than the batch size ({batch_size})."
+                         " Please increase 'n_total' to be >= batch size.")
+
+    # Compute features and add data to output file. The batch pulled above is put back in front of the
+    # generator so that every batch (including the first) goes through the same code path and uses `ncpu`
+    row_counter = 0
+    n_batches = (n_total + batch_size - 1)//batch_size
+    all_batches = itertools.chain([first_batch], generator)
+    for audio_data in tqdm(all_batches, total=n_batches, desc="Computing features"):
+        if row_counter >= n_total:
+            break
+
+        features = F.embed_clips(audio_data, batch_size=batch_size, ncpu=ncpu)
+        features = features[0:n_total-row_counter]  # never write past the end of the output array
+
+        fp[row_counter:row_counter+features.shape[0], :, :] = features
+        row_counter += features.shape[0]
+        fp.flush()
+
+    # Drop the memmap so that the file is no longer held open when it is trimmed
+    del fp
+
+    # Trim empty rows from the mmapped array
+    trim_mmap(output_file)
+
+
+# Function to download files from a URL with a progress bar
+def download_file(url, target_directory, file_size=None):
+    """
+    Download a file from a URL with a progress bar, using only the requests library.
+
+    Args:
+        url (str): The URL to download. The text after the last "/" is used as the local file name.
+        target_directory (str): The directory in which to save the file (it is not created here).
+        file_size (int): The total size (in bytes) to show in the progress bar. If None (the default),
+                         the "content-length" response header is used (or 0 if it is missing).
+
+    Returns:
+        None
+
+    Raises:
+        requests.HTTPError: If the server responds with a 4xx/5xx status code.
+    """
+    local_filename = url.split('/')[-1]
+
+    with requests.get(url, stream=True) as r:
+        # Fail before anything is written, so that an error page is never saved in place of the file
+        r.raise_for_status()
+
+        total_size = file_size if file_size is not None else int(r.headers.get('content-length', 0))
+
+        # Using the progress bar as a context manager closes it even if the download is interrupted
+        with tqdm(total=total_size, unit='iB', unit_scale=True, desc=f"{local_filename}") as progress_bar:
+            with open(os.path.join(target_directory, local_filename), 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+                    progress_bar.update(len(chunk))
+
+
+# Helper to download a single file only when it is not already present locally
+def _download_if_missing(url, target_directory):
+    """Download `url` into `target_directory` unless a file with the same name already exists there"""
+    if not os.path.exists(os.path.join(target_directory, url.split("/")[-1])):
+        download_file(url, target_directory)
+
+
+# Function to download models from GitHub release assets
+def download_models(
+        model_names: List[str] = [],
+        target_directory: str = os.path.join(pathlib.Path(__file__).parent.resolve(), "resources", "models")
+        ):
+    """
+    Download the specified models from the release assets in the openWakeWord GitHub repository.
+    Uses the official urls in the MODELS dictionary in openwakeword/__init__.py.
+
+    The melspectrogram/embedding models and the VAD models are always downloaded as well. Every file is
+    checked individually, so a missing ONNX file is downloaded even when its tflite counterpart already exists.
+
+    Args:
+        model_names (List[str]): The names of the models to download (e.g., hey_jarvis_v0.1). Both ONNX and
+                                 tflite models will be downloaded. If not provided (the default),
+                                 all of the models in the MODELS dictionary will be downloaded. A name
+                                 matches the first official model whose file name contains it; names without
+                                 a match are skipped with a warning. The list is never modified.
+        target_directory (str): The directory to save the models to. Defaults to the install location
+                                of openWakeWord (i.e., the `resources/models` directory).
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `model_names` is not a list.
+    """
+    if not isinstance(model_names, list):
+        raise ValueError("The model_names argument must be a list of strings")
+
+    os.makedirs(target_directory, exist_ok=True)
+
+    # Always download melspectrogram and embedding models (tflite and ONNX), if they don't already exist
+    for feature_model in openwakeword.FEATURE_MODELS.values():
+        _download_if_missing(feature_model["download_url"], target_directory)
+        _download_if_missing(feature_model["download_url"].replace(".tflite", ".onnx"), target_directory)
+
+    # Always download VAD models, if they don't already exist
+    for vad_model in openwakeword.VAD_MODELS.values():
+        _download_if_missing(vad_model["download_url"], target_directory)
+
+    # Get all model urls
+    official_model_urls = [i["download_url"] for i in openwakeword.MODELS.values()]
+
+    # Select the urls to download: the first match for each requested name, or everything by default
+    if model_names:
+        selected_urls = []
+        for model_name in model_names:
+            matching_urls = [url for url in official_model_urls if model_name in url.split("/")[-1]]
+            if matching_urls:
+                selected_urls.append(matching_urls[0])
+            else:
+                logging.warning(f"No official model matches '{model_name}', skipping it")
+    else:
+        selected_urls = official_model_urls
+
+    # Download both the tflite and ONNX versions of each selected model
+    for model_url in selected_urls:
+        _download_if_missing(model_url, target_directory)
+        _download_if_missing(model_url.replace(".tflite", ".onnx"), target_directory)
+
+
+
+
# Handle deprecated arguments and naming (thanks to https://stackoverflow.com/a/74564394)
def re_arg(kwarg_map):
def decorator(func):
diff --git a/openwakeword/vad.py b/openwakeword/vad.py
index 18bf5e3..b332ee6 100755
--- a/openwakeword/vad.py
+++ b/openwakeword/vad.py
@@ -63,18 +63,20 @@ def __init__(self,
"resources",
"models",
"silero_vad.onnx"
- )
+ ),
+ n_threads: int = 1
):
"""Initialize the VAD model object.
Args:
model_path (str): The path to the Silero VAD ONNX model.
+ n_threads (int): The number of threads to use for the VAD model.
"""
# Initialize the ONNX model
sessionOptions = ort.SessionOptions()
- sessionOptions.inter_op_num_threads = 1
- sessionOptions.intra_op_num_threads = 1
+ sessionOptions.inter_op_num_threads = n_threads
+ sessionOptions.intra_op_num_threads = n_threads
self.model = ort.InferenceSession(model_path, sess_options=sessionOptions,
providers=["CPUExecutionProvider"])
diff --git a/pyproject.toml b/pyproject.toml
index 23c373c..d420f04 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ testpaths = [
[project]
name = "openwakeword"
-version = "0.5.1"
+version = "0.6.0"
authors = [
{ name="David Scripka", email="david.scripka@gmail.com" },
]
@@ -24,6 +24,7 @@ classifiers = [
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
]
+dynamic = ["dependencies", "optional-dependencies"]
[project.urls]
"Homepage" = "https://github.com/dscripka/openWakeWord"
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1c7aecb..df0a666 100644
--- a/setup.py
+++ b/setup.py
@@ -26,13 +26,14 @@ def build_additional_requires():
setuptools.setup(
name="openwakeword",
- version="0.5.1",
+ version="0.6.0",
install_requires=[
'onnxruntime>=1.10.0,<2',
'tflite-runtime>=2.8.0,<3; platform_system == "Linux"',
'tqdm>=4.0,<5.0',
'scipy>=1.3,<2',
- 'scikit-learn>=1,<2'
+ 'scikit-learn>=1,<2',
+ 'requests>=2.0,<3',
],
extras_require={
'test': [
@@ -41,18 +42,36 @@ def build_additional_requires():
'pytest-flake8>=1.1.1,<2',
'flake8>=4.0,<4.1',
'pytest-mypy>=0.10.0,<1',
+ 'types-requests',
+ 'types-PyYAML',
'mock>=5.1,<6',
- 'types-mock>=5.1,<6'
+ 'types-mock>=5.1,<6',
+ 'types-requests>=2.0,<3'
],
'full': [
'mutagen>=1.46.0,<2',
- 'speechbrain>=0.5.13,<1',
+ 'torch>=1.13.1,<3',
+ 'torchaudio>=0.13.1,<1',
+ 'torchinfo>=1.8.0,<2',
+ 'torchmetrics>=0.11.4,<1',
+ 'speechbrain>=0.5.14,<1',
+ 'audiomentations>=0.30.0,<1',
+ 'torch-audiomentations>=0.11.0,<1',
+ 'tqdm>=4.64.0,<5',
'pytest>=7.2.0,<8',
'pytest-cov>=2.10.1,<3',
'pytest-flake8>=1.1.1,<2',
'pytest-mypy>=0.10.0,<1',
- 'plotext>=5.2.7,<6',
- 'sounddevice>=0.4.1,<1'
+ 'acoustics>=0.2.6,<1',
+ 'pyyaml>=6.0,<7',
+ 'tensorflow-cpu==2.8.1',
+ 'tensorflow_probability==0.16.0',
+ 'protobuf>=3.20,<4',
+ 'onnx_tf==1.10.0',
+ 'onnx==1.14.0',
+ 'pronouncing>=0.2.0,<1',
+ 'datasets>=2.14.4,<3',
+ 'deep-phonemizer==0.0.19'
]
},
author="David Scripka",
diff --git a/tests/test_custom_verifier_model.py b/tests/test_custom_verifier_model.py
index d5665e5..53f02ce 100644
--- a/tests/test_custom_verifier_model.py
+++ b/tests/test_custom_verifier_model.py
@@ -34,6 +34,9 @@
import tempfile
import pytest
+# Download models needed for tests
+openwakeword.utils.download_models(model_names=["alexa_v0.1", "hey_mycroft_v0.1"])
+
# Tests
class TestModels:
diff --git a/tests/test_models.py b/tests/test_models.py
index c38ecb8..b3907ff 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -39,6 +39,10 @@
import pickle
import tempfile
import mock
+import wave
+
+# Download models needed for tests
+openwakeword.utils.download_models()
# Tests
@@ -205,6 +209,48 @@ def test_models_with_speex_noise_cancellation(self):
)
assert 1 == 1
+    def test_models_with_debounce(self):
+        """With a debounce time set, the test clip should reach the threshold in exactly one frame"""
+        # Load model with defaults
+        owwModel = openwakeword.Model()
+        clip_path = os.path.join("tests", "data", "alexa_test.wav")
+
+        # Score the same clip first without and then with debounce
+        raw_predictions = owwModel.predict_clip(clip_path, debounce_time=0, threshold={"alexa_v0.1": 0.5})
+        scores = np.array([frame['alexa'] for frame in raw_predictions])
+
+        debounced_predictions = owwModel.predict_clip(clip_path, debounce_time=1.25, threshold={"alexa": 0.5})
+        scores_with_debounce = np.array([frame['alexa'] for frame in debounced_predictions])
+        print(scores, scores_with_debounce)
+
+        # Several frames reach the threshold without debounce, but only one does with it
+        assert (scores >= 0.5).sum() > 1
+        assert (scores_with_debounce >= 0.5).sum() == 1
+
+
+    def test_model_reset(self):
+        """After reset(), a frame that previously scored above 0.5 should score below 0.5 again"""
+        # Load the model
+        owwModel = openwakeword.Model()
+
+        # Get test clip and load it
+        clip_path = os.path.join("tests", "data", "alexa_test.wav")
+        with wave.open(clip_path, mode='rb') as wav_file:
+            data = np.frombuffer(wav_file.readframes(wav_file.getnframes()), dtype=np.int16)
+
+        # Predict frame by frame (1280 samples each) until the score first exceeds 0.5
+        for i in range(0, len(data), 1280):
+            frame = data[i:i+1280]
+            if owwModel.predict(frame)['alexa'] > 0.5:
+                break
+
+        # Assert that next prediction (on the same frame) is still > 0.5
+        assert owwModel.predict(frame)['alexa'] > 0.5
+
+        # Reset the model
+        owwModel.reset()
+
+        # Assert that next prediction is < 0.5
+        assert owwModel.predict(frame)['alexa'] < 0.5
+
+
def test_models_with_vad(self):
# Load model with defaults
owwModel = openwakeword.Model(vad_threshold=0.5)