blob: ff23059b273508a20c30543590263890aed205d5 [file] [log] [blame]
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Interface to asynchronously capture continuous audio from PyAudio.
This module requires pyaudio. See here for installation instructions:
http://people.csail.mit.edu/hubert/pyaudio/
This module provides one class, AudioRecorder, which buffers chunks of audio
from PyAudio.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import math
import time
import numpy as np
import pyaudio
import queue
logger = logging.getLogger(__name__)
class TimeoutError(Exception):
"""A timeout while waiting for pyaudio to buffer samples."""
pass
class AudioRecorder(object):
"""Asynchronously record and buffer audio using pyaudio.
This class wraps the pyaudio interface. It contains a queue.Queue object to
hold chunks of raw audio, and a callback function _enqueue_audio() which
places raw audio into this queue. This allows the pyaudio.Stream object to
record asynchronously at low latency.
The class acts as a context manager. When entering the context it creates a
pyaudio.Stream object and starts recording; it stops recording on exit. The
Stream saves all of its audio to the Queue as two-tuples of
(timestamp, raw_audio). The raw_audio is available from the queue as a numpy
array using the get_audio() function.
This class uses the term "frame" in the same sense that PortAudio does, so
"frame" means something different here than elsewhere in the daredevil stack.
A frame in PortAudio is one audio sample across all channels, so one frame of
16-bit stereo audio is four bytes of data as two 16-bit integers.
"""
pyaudio_format = pyaudio.paInt16
numpy_format = np.int16
num_channels = 1
# How many frames of audio PyAudio will fetch at once.
# Higher numbers will increase the latancy.
frames_per_chunk = 2**9
# Limit queue to this number of audio chunks.
max_queue_chunks = 1200
# Timeout if we can't get a chunk from the queue for timeout_factor times the
# chunk duration.
timeout_factor = 4
def __init__(self, raw_audio_sample_rate_hz=48000,
downsample_factor=3,
device_index=None):
self._downsample_factor = downsample_factor
self._raw_audio_sample_rate_hz = raw_audio_sample_rate_hz
self.audio_sample_rate_hz = self._raw_audio_sample_rate_hz // self._downsample_factor
self._raw_audio_queue = queue.Queue(self.max_queue_chunks)
self._audio = pyaudio.PyAudio()
self._print_input_devices()
self._device_index = device_index
def __enter__(self):
if self._device_index is None:
self._device_index = self._audio.get_default_input_device_info()["index"]
kwargs = {
"input_device_index": self._device_index
}
device_info = self._audio.get_device_info_by_host_api_device_index(
0, self._device_index)
if device_info.get("maxInputChannels") <= 0:
raise ValueError("Audio device has insufficient input channels.")
print("Using audio device '%s' for index %d" % (
device_info["name"], device_info["index"]))
self._stream = self._audio.open(
format=self.pyaudio_format,
channels=self.num_channels,
rate=self._raw_audio_sample_rate_hz,
input=True,
output=False,
frames_per_buffer=self.frames_per_chunk,
start=True,
stream_callback=self._enqueue_raw_audio,
**kwargs)
logger.info("Started audio stream.")
return self
def __exit__(self, exception_type, exception_value, traceback):
self._stream.stop_stream()
self._stream.close()
logger.info("Stopped and closed audio stream.")
def __del__(self):
self._audio.terminate()
logger.info("Terminated PyAudio/PortAudio.")
@property
def is_active(self):
return self._stream.is_active()
@property
def bytes_per_sample(self):
return pyaudio.get_sample_size(self.pyaudio_format)
@property
def _chunk_duration_seconds(self):
return self.frames_per_chunk / self._raw_audio_sample_rate_hz
def _print_input_devices(self):
info = self._audio.get_host_api_info_by_index(0)
print("\nInput microphone devices:")
for i in range(0, info.get("deviceCount")):
device_info = self._audio.get_device_info_by_host_api_device_index(0, i)
if device_info.get("maxInputChannels") <= 0: continue
print(" ID: ", i, " - ", device_info.get("name"))
def _enqueue_raw_audio(self, in_data, *_): # unused args to match expected
try:
self._raw_audio_queue.put((in_data, time.time()), block=False)
return None, pyaudio.paContinue
except queue.Full:
error_message = "Raw audio buffer full."
logger.critical(error_message)
raise TimeoutError(error_message)
def _get_chunk(self, timeout=None):
raw_data, timestamp = self._raw_audio_queue.get(timeout=timeout)
array_data = np.fromstring(raw_data, self.numpy_format).reshape(
-1, self.num_channels)
return array_data, timestamp
def get_audio_device_info(self):
if self._device_index is None:
return self._audio.get_default_input_device_info()
else:
return self._audio.get_device_info_by_index(self._device_index)
def sample_duration_seconds(self, num_samples):
return num_samples / self.audio_sample_rate_hz / self.num_channels
def clear_queue(self):
logger.debug("Purging %d chunks from queue.", self._raw_audio_queue.qsize())
while not self._raw_audio_queue.empty():
self._raw_audio_queue.get()
def get_audio(self, num_audio_frames):
"""Grab at least num_audio_frames frames of audio.
Record at least num_audio_frames of audio and transform it into a
numpy array. The term "frame" is in the sense used by PortAudio; see the
note in the class docstring for details.
Audio returned will be the earliest audio in the queue; it could be from
before this function was called.
Args:
num_audio_frames: minimum number of samples of audio to grab.
Returns:
A tuple of (audio, first_timestamp, last_timestamp).
"""
num_audio_chunks = int(math.ceil(num_audio_frames *
self._downsample_factor / self.frames_per_chunk))
logger.debug("Capturing %d chunks to get at least %d frames.",
num_audio_chunks, num_audio_frames)
if num_audio_chunks < 1:
num_audio_chunks = 1
try:
timeout = self.timeout_factor * self._chunk_duration_seconds
chunks, timestamps = zip(
*[self._get_chunk(timeout=timeout) for _ in range(num_audio_chunks)])
except queue.Empty:
error_message = "Audio capture timed out after %.1f seconds." % timeout
logger.critical(error_message)
raise TimeoutError(error_message)
assert len(chunks) == num_audio_chunks
logger.debug("Got %d chunks. Chunk 0 has shape %s and dtype %s.",
len(chunks), chunks[0].shape, chunks[0].dtype)
if self._raw_audio_queue.qsize() > (0.8 * self.max_queue_chunks):
logger.warning("%d chunks remain in the queue.",
self._raw_audio_queue.qsize())
else:
logger.debug("%d chunks remain in the queue.",
self._raw_audio_queue.qsize())
audio = np.concatenate(chunks)
if self._downsample_factor != 1:
audio = audio[::self._downsample_factor]
logging.debug("Audio array has shape %s and dtype %s.", audio.shape,
audio.dtype)
return audio * 0.5, timestamps[0], timestamps[-1]