Source code for librosa.core.audio

# -*- coding: utf-8 -*-
"""Core IO, DSP and utility functions."""

import os
import warnings
import six

import audioread
import numpy as np
import scipy.signal
import scipy.fftpack as fft


from .time_frequency import frames_to_samples, time_to_samples
from .. import cache
from .. import util
from ..util.exceptions import ParameterError

__all__ = ['load', 'to_mono', 'resample', 'get_duration',
           'autocorrelate', 'zero_crossings', 'clicks',
           # Deprecated functions
           'peak_pick', 'localmax']

# Resampling bandwidths as percentage of Nyquist
# http://www.mega-nerd.com/SRC/api_misc.html#Converters
BW_BEST = 0.97
BW_MEDIUM = 0.9
BW_FASTEST = 0.8

# Do we have scikits.samplerate?
try:
    # Pylint won't handle dynamic imports, so we suppress this warning
    import scikits.samplerate as samplerate  # pylint: disable=import-error
    _HAS_SAMPLERATE = True
except ImportError:
    warnings.warn('Could not import scikits.samplerate. '
                  'Falling back to scipy.signal')
    _HAS_SAMPLERATE = False


# -- CORE ROUTINES --#
# Load should never be cached, since we cannot verify that the contents of
# 'path' are unchanged across calls.
def load(path, sr=22050, mono=True, offset=0.0, duration=None,
[docs] dtype=np.float32): """Load an audio file as a floating point time series. Parameters ---------- path : string path to the input file. Any format supported by `audioread` will work. sr : number > 0 [scalar] target sampling rate 'None' uses the native sampling rate mono : bool convert signal to mono offset : float start reading after this time (in seconds) duration : float only load up to this much audio (in seconds) dtype : numeric type data type of `y` Returns ------- y : np.ndarray [shape=(n,) or (2, n)] audio time series sr : number > 0 [scalar] sampling rate of `y` Examples -------- >>> # Load a wav file >>> filename = librosa.util.example_audio_file() >>> y, sr = librosa.load(filename) >>> y array([ -4.756e-06, -6.020e-06, ..., -1.040e-06, 0.000e+00], dtype=float32) >>> sr 22050 >>> # Load a wav file and resample to 11 KHz >>> filename = librosa.util.example_audio_file() >>> y, sr = librosa.load(filename, sr=11025) >>> y array([ -2.077e-06, -2.928e-06, ..., -4.395e-06, 0.000e+00], dtype=float32) >>> sr 11025 >>> # Load 5 seconds of a wav file, starting 15 seconds in >>> filename = librosa.util.example_audio_file() >>> y, sr = librosa.load(filename, offset=15.0, duration=5.0) >>> y array([ 0.069, 0.1 , ..., -0.101, 0. ], dtype=float32) >>> sr 22050 """ y = [] with audioread.audio_open(os.path.realpath(path)) as input_file: sr_native = input_file.samplerate n_channels = input_file.channels s_start = int(np.round(sr_native * offset)) * n_channels if duration is None: s_end = np.inf else: s_end = s_start + (int(np.round(sr_native * duration)) * n_channels) n = 0 for frame in input_file: frame = util.buf_to_float(frame, dtype=dtype) n_prev = n n = n + len(frame) if n < s_start: # offset is after the current frame # keep reading continue if s_end < n_prev: # we're off the end. stop reading break if s_end < n: # the end is in this frame. crop. frame = frame[:s_end - n_prev] if n_prev <= s_start <= n: # beginning is in this frame frame = frame[(s_start - n_prev):] # tack on the current frame y.append(frame) if y: y = np.concatenate(y) if n_channels > 1: y = y.reshape((-1, 2)).T if mono: y = to_mono(y) if sr is not None: y = resample(y, sr_native, sr) else: sr = sr_native # Final cleanup for dtype and contiguity y = np.ascontiguousarray(y, dtype=dtype) return (y, sr) @cache
def to_mono(y):
[docs] '''Force an audio signal down to mono. Parameters ---------- y : np.ndarray [shape=(2,n) or shape=(n,)] audio time series, either stereo or mono Returns ------- y_mono : np.ndarray [shape=(n,)] `y` as a monophonic time-series Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file(), mono=False) >>> y.shape (2, 1355168) >>> y_mono = librosa.to_mono(y) >>> y_mono.shape (1355168,) ''' # Validate the buffer. Stereo is ok here. util.valid_audio(y, mono=False) if y.ndim > 1: y = np.mean(y, axis=0) return y @cache
def resample(y, orig_sr, target_sr, res_type='sinc_best', fix=True, scale=False, **kwargs):
[docs] """Resample a time series from orig_sr to target_sr Parameters ---------- y : np.ndarray [shape=(n,) or shape=(2, n)] audio time series. Can be mono or stereo. orig_sr : number > 0 [scalar] original sampling rate of `y` target_sr : number > 0 [scalar] target sampling rate res_type : str resample type (see note) .. note:: If `scikits.samplerate` is installed, `resample` will use `res_type`. Otherwise, fall back on `scipy.signal.resample`. To force use of `scipy.signal.resample`, set `res_type='scipy'`. fix : bool adjust the length of the resampled signal to be of size exactly `ceil(target_sr * len(y) / orig_sr)` scale : bool Scale the resampled signal so that `y` and `y_hat` have approximately equal total energy. kwargs : additional keyword arguments If `fix==True`, additional keyword arguments to pass to `librosa.util.fix_length`. Returns ------- y_hat : np.ndarray [shape=(n * target_sr / orig_sr,)] `y` resampled from `orig_sr` to `target_sr` See Also -------- librosa.util.fix_length scipy.signal.resample Examples -------- Downsample from 22 KHz to 8 KHz >>> y, sr = librosa.load(librosa.util.example_audio_file(), sr=22050) >>> y_8k = librosa.resample(y, sr, 8000) >>> y.shape, y_8k.shape ((1355168,), (491671,)) """ if y.ndim > 1: return np.vstack([resample(yi, orig_sr, target_sr, res_type=res_type, fix=fix, **kwargs) for yi in y]) # First, validate the audio buffer util.valid_audio(y, mono=True) if orig_sr == target_sr: return y ratio = float(target_sr) / orig_sr n_samples = int(np.ceil(y.shape[-1] * ratio)) scipy_resample = (res_type == 'scipy') if _HAS_SAMPLERATE and not scipy_resample: y_hat = samplerate.resample(y.T, ratio, res_type).T else: y_hat = scipy.signal.resample(y, n_samples, axis=-1) if fix: y_hat = util.fix_length(y_hat, n_samples, **kwargs) if scale: y_hat /= np.sqrt(ratio) return np.ascontiguousarray(y_hat, dtype=y.dtype) def get_duration(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
[docs] center=True): """Compute the duration (in seconds) of an audio time series or STFT matrix. Examples -------- >>> # Load the example audio file >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.get_duration(y=y, sr=sr) 61.44 >>> # Or compute duration from an STFT matrix >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> S = librosa.stft(y) >>> librosa.get_duration(S=S, sr=sr) 61.44 >>> # Or a non-centered STFT matrix >>> S_left = librosa.stft(y, center=False) >>> librosa.get_duration(S=S_left, sr=sr) 61.3471201814059 Parameters ---------- y : np.ndarray [shape=(n,), (2, n)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None STFT matrix, or any STFT-derived matrix (e.g., chromagram or mel spectrogram). n_fft : int > 0 [scalar] FFT window size for `S` hop_length : int > 0 [ scalar] number of audio samples between columns of `S` center : boolean - If `True`, `S[:, t]` is centered at `y[t * hop_length]` - If `False`, then `S[:, t]` begins at `y[t * hop_length]` Returns ------- d : float >= 0 Duration (in seconds) of the input time series or spectrogram. """ if y is None: assert S is not None n_frames = S.shape[1] n_samples = n_fft + hop_length * (n_frames - 1) # If centered, we lose half a window from each end of S if center: n_samples = n_samples - 2 * int(n_fft / 2) else: # Validate the audio buffer. Stereo is okay here. util.valid_audio(y, mono=False) if y.ndim == 1: n_samples = len(y) else: n_samples = y.shape[-1] return float(n_samples) / sr @cache
def autocorrelate(y, max_size=None, axis=-1):
[docs] """Bounded auto-correlation Parameters ---------- y : np.ndarray array to autocorrelate max_size : int > 0 or None maximum correlation lag. If unspecified, defaults to `y.shape[axis]` (unbounded) axis : int The axis along which to autocorrelate. By default, the last axis (-1) is taken. Returns ------- z : np.ndarray truncated autocorrelation `y*y` along the specified axis. If `max_size` is specified, then `z.shape[axis]` is bounded to `max_size`. Examples -------- Compute full autocorrelation of y >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.autocorrelate(y) array([ 1.584e+04, 1.580e+04, ..., -1.154e-10, -2.725e-13]) Compute onset strength auto-correlation up to 4 seconds >>> import matplotlib.pyplot as plt >>> odf = librosa.onset.onset_strength(y=y, sr=sr, hop_length=512) >>> ac = librosa.autocorrelate(odf, max_size=4* sr / 512) >>> plt.plot(ac) >>> plt.title('Auto-correlation') >>> plt.xlabel('Lag (frames)') """ if max_size is None: max_size = y.shape[axis] max_size = int(min(max_size, y.shape[axis])) # Compute the power spectrum along the chosen axis # Pad out the signal to support full-length auto-correlation. powspec = np.abs(fft.fft(y, n=2 * y.shape[axis] + 1, axis=axis))**2 # Convert back to time domain autocorr = fft.ifft(powspec, axis=axis, overwrite_x=True) # Slice down to max_size subslice = [Ellipsis] * autocorr.ndim subslice[axis] = slice(max_size) autocorr = autocorr[subslice] if not np.iscomplexobj(y): autocorr = autocorr.real return autocorr @cache
def zero_crossings(y, threshold=1e-10, ref_magnitude=None, pad=True,
[docs] zero_pos=True, axis=-1): '''Find the zero-crossings of a signal `y`: indices `i` such that `sign(y[i]) != sign(y[j])`. If `y` is multi-dimensional, then zero-crossings are computed along the specified `axis`. Examples -------- >>> # Generate a time-series >>> y = np.sin(np.linspace(0, 4 * 2 * np.pi, 20)) >>> y array([ 0.000e+00, 9.694e-01, 4.759e-01, -7.357e-01, -8.372e-01, 3.247e-01, 9.966e-01, 1.646e-01, -9.158e-01, -6.142e-01, 6.142e-01, 9.158e-01, -1.646e-01, -9.966e-01, -3.247e-01, 8.372e-01, 7.357e-01, -4.759e-01, -9.694e-01, -9.797e-16]) >>> # Compute zero-crossings >>> z = librosa.zero_crossings(y) >>> z array([ True, False, False, True, False, True, False, False, True, False, True, False, True, False, False, True, False, True, False, True], dtype=bool) >>> # Stack y against the zero-crossing indicator >>> np.vstack([y, z]).T array([[ 0.000e+00, 1.000e+00], [ 9.694e-01, 0.000e+00], [ 4.759e-01, 0.000e+00], [ -7.357e-01, 1.000e+00], [ -8.372e-01, 0.000e+00], [ 3.247e-01, 1.000e+00], [ 9.966e-01, 0.000e+00], [ 1.646e-01, 0.000e+00], [ -9.158e-01, 1.000e+00], [ -6.142e-01, 0.000e+00], [ 6.142e-01, 1.000e+00], [ 9.158e-01, 0.000e+00], [ -1.646e-01, 1.000e+00], [ -9.966e-01, 0.000e+00], [ -3.247e-01, 0.000e+00], [ 8.372e-01, 1.000e+00], [ 7.357e-01, 0.000e+00], [ -4.759e-01, 1.000e+00], [ -9.694e-01, 0.000e+00], [ -9.797e-16, 1.000e+00]]) >>> # Find the indices of zero-crossings >>> np.nonzero(z) (array([ 0, 3, 5, 8, 10, 12, 15, 17, 19]),) Parameters ---------- y : np.ndarray The input array threshold : float > 0 or None If specified, values where `-threshold <= y <= threshold` are clipped to 0. ref_magnitude : float > 0 or callable If numeric, the threshold is scaled relative to `ref_magnitude`. If callable, the threshold is scaled relative to `ref_magnitude(np.abs(y))`. pad : boolean If `True`, then `y[0]` is considered a valid zero-crossing. zero_pos : boolean If `True` then the value 0 is interpreted as having positive sign. If `False`, then 0, -1, and +1 all have distinct signs. axis : int Axis along which to compute zero-crossings. Returns ------- zero_crossings : np.ndarray [shape=y.shape, dtype=boolean] Indicator array of zero-crossings in `y` along the selected axis. ''' # Clip within the threshold if threshold is None: threshold = 0.0 if six.callable(ref_magnitude): threshold = threshold * ref_magnitude(np.abs(y)) elif ref_magnitude is not None: threshold = threshold * ref_magnitude if threshold > 0: y = y.copy() y[np.abs(y) <= threshold] = 0 # Extract the sign bit if zero_pos: y_sign = np.signbit(y) else: y_sign = np.sign(y) # Find the change-points by slicing slice_pre = [slice(None)] * y.ndim slice_pre[axis] = slice(1, None) slice_post = [slice(None)] * y.ndim slice_post[axis] = slice(-1) # Since we've offset the input by one, pad back onto the front padding = [(0, 0)] * y.ndim padding[axis] = (1, 0) return np.pad((y_sign[slice_post] != y_sign[slice_pre]), padding, mode='constant', constant_values=pad) @cache
def clicks(times=None, frames=None, sr=22050, hop_length=512,
[docs] click_freq=1000.0, click_duration=0.1, click=None, length=None): """Returns a signal with the signal `click` placed at each specified time Parameters ---------- times : np.ndarray or None times to place clicks, in seconds frames : np.ndarray or None frame indices to place clicks sr : number > 0 desired sampling rate of the output signal hop_length : int > 0 if positions are specified by `frames`, the number of samples between frames. click_freq : float > 0 frequency (in Hz) of the default click signal. Default is 1KHz. click_duration : float > 0 duration (in seconds) of the default click signal. Default is 100ms. click : np.ndarray or None optional click signal sample to use instead of the default blip. length : int > 0 desired number of samples in the output signal Returns ------- click_signal : np.ndarray Synthesized click signal Raises ------ ParameterError - If neither `times` nor `frames` are provided. - If any of `click_freq`, `click_duration`, or `length` are out of range. Examples -------- >>> # Sonify detected beat events >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr) >>> y_beats = librosa.clicks(frames=beats, sr=sr) >>> # Or generate a signal of the same length as y >>> y_beats = librosa.clicks(frames=beats, sr=sr, length=len(y)) >>> # Or use timing instead of frame indices >>> times = librosa.frames_to_time(beats, sr=sr) >>> y_beat_times = librosa.clicks(times=times, sr=sr) >>> # Or with a click frequency of 880Hz and a 500ms sample >>> y_beat_times880 = librosa.clicks(times=times, sr=sr, ... click_freq=880, click_duration=0.5) """ # Compute sample positions from time or frames if times is None: if frames is None: raise ParameterError('either "times" or "frames" must be provided') positions = frames_to_samples(frames, hop_length=hop_length) else: # Convert times to positions positions = time_to_samples(times, sr=sr) if click is not None: # Check that we have a well-formed audio buffer util.valid_audio(click, mono=True) else: # Create default click signal if click_duration <= 0: raise ParameterError('click_duration must be strictly positive') if click_freq <= 0: raise ParameterError('click_freq must be strictly positive') angular_freq = 2 * np.pi * click_freq / float(sr) click = np.logspace(0, -10, num=int(np.round(sr * click_duration)), base=2.0) click *= np.sin(angular_freq * np.arange(len(click))) # Set default length if length is None: length = positions.max() + click.shape[0] else: if length < 1: raise ParameterError('length must be a positive integer') # Filter out any positions past the length boundary positions = positions[positions < length] # Pre-allocate click signal click_signal = np.zeros(length, dtype=np.float32) # Place clicks for start in positions: # Compute the end-point of this click end = start + click.shape[0] if end >= length: click_signal[start:] += click[:length - start] else: # Normally, just add a click here click_signal[start:end] += click return click_signal # Moved/deprecated functions peak_pick = util.decorators.moved('librosa.core.peak_pick',
'0.4', '0.5')(util.peak_pick) localmax = util.decorators.moved('librosa.core.localmax', '0.4', '0.5')(util.localmax)