signal.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode

from .fft import fft_c2c, fft_c2r, fft_r2c
from .fluid.data_feeder import check_variable_and_dtype
from .fluid.framework import _non_static_mode
from .fluid.layer_helper import LayerHelper
from .tensor.attribute import is_complex

__all__ = [
    'stft',
    'istft',
]


def frame(x, frame_length, hop_length, axis=-1, name=None):
    """
    Slice the N-dimensional (where N >= 1) input into (overlapping) frames.

    Args:
        x (Tensor): The input data which is a N-dimensional (where N >= 1) Tensor
            with shape `[..., seq_length]` or `[seq_length, ...]`.
        frame_length (int): Length of the frame and `0 < frame_length <= x.shape[axis]`.
        hop_length (int): Number of steps to advance between adjacent frames
            and `0 < hop_length`.
        axis (int, optional): Specify the axis to operate on the input Tensors. Its
            value should be 0(the first dimension) or -1(the last dimension). If not
            specified, the last axis is used by default.

    Returns:
        The output frames tensor with shape `[..., frame_length, num_frames]` if `axis==-1`,
            otherwise `[num_frames, frame_length, ...]` where

            `num_frames = 1 + (x.shape[axis] - frame_length) // hop_length`

    Examples:

    .. code-block:: python

        import paddle
        from paddle.signal import frame

        # 1D
        x = paddle.arange(8)
        y0 = frame(x, frame_length=4, hop_length=2, axis=-1)  # [4, 3]
        # [[0, 2, 4],
        #  [1, 3, 5],
        #  [2, 4, 6],
        #  [3, 5, 7]]

        y1 = frame(x, frame_length=4, hop_length=2, axis=0)   # [3, 4]
        # [[0, 1, 2, 3],
        #  [2, 3, 4, 5],
        #  [4, 5, 6, 7]]

        # 2D
        x0 = paddle.arange(16).reshape([2, 8])
        y0 = frame(x0, frame_length=4, hop_length=2, axis=-1)  # [2, 4, 3]
        # [[[0, 2, 4],
        #   [1, 3, 5],
        #   [2, 4, 6],
        #   [3, 5, 7]],
        #
        #  [[8 , 10, 12],
        #   [9 , 11, 13],
        #   [10, 12, 14],
        #   [11, 13, 15]]]

        x1 = paddle.arange(16).reshape([8, 2])
        y1 = frame(x1, frame_length=4, hop_length=2, axis=0)   # [3, 4, 2]
        # [[[0 , 1 ],
        #   [2 , 3 ],
        #   [4 , 5 ],
        #   [6 , 7 ]],
        #
        #   [4 , 5 ],
        #   [6 , 7 ],
        #   [8 , 9 ],
        #   [10, 11]],
        #
        #   [8 , 9 ],
        #   [10, 11],
        #   [12, 13],
        #   [14, 15]]]

        # > 2D
        x0 = paddle.arange(32).reshape([2, 2, 8])
        y0 = frame(x0, frame_length=4, hop_length=2, axis=-1)  # [2, 2, 4, 3]

        x1 = paddle.arange(32).reshape([8, 2, 2])
        y1 = frame(x1, frame_length=4, hop_length=2, axis=0)   # [3, 4, 2, 2]
    """
    if axis not in [0, -1]:
        raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.')

    if not isinstance(frame_length, int) or frame_length <= 0:
        raise ValueError(
            f'Unexpected frame_length: {frame_length}. It should be an positive integer.'
        )

    if not isinstance(hop_length, int) or hop_length <= 0:
        raise ValueError(
            f'Unexpected hop_length: {hop_length}. It should be an positive integer.'
        )

    if _non_static_mode():
        if frame_length > x.shape[axis]:
            raise ValueError(
                f'Attribute frame_length should be less equal than sequence length, '
                f'but got ({frame_length}) > ({x.shape[axis]}).'
            )

    if in_dygraph_mode():
        return _C_ops.frame(x, frame_length, hop_length, axis)
    else:
        op_type = 'frame'
        check_variable_and_dtype(
            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
        )
        helper = LayerHelper(op_type, **locals())
        dtype = helper.input_dtype(input_param_name='x')
        out = helper.create_variable_for_type_inference(dtype=dtype)
        helper.append_op(
            type=op_type,
            inputs={'X': x},
            attrs={
                'frame_length': frame_length,
                'hop_length': hop_length,
                'axis': axis,
            },
            outputs={'Out': out},
        )
    return out


def overlap_add(x, hop_length, axis=-1, name=None):
    """
    Reconstructs a tensor consisted of overlap added sequences from input frames.

    Args:
        x (Tensor): The input data which is a N-dimensional (where N >= 2) Tensor
            with shape `[..., frame_length, num_frames]` or
            `[num_frames, frame_length ...]`.
        hop_length (int): Number of steps to advance between adjacent frames and
            `0 < hop_length <= frame_length`.
        axis (int, optional): Specify the axis to operate on the input Tensors. Its
            value should be 0(the first dimension) or -1(the last dimension). If not
            specified, the last axis is used by default.

    Returns:
        The output frames tensor with shape `[..., seq_length]` if `axis==-1`,
            otherwise `[seq_length, ...]` where

            `seq_length = (n_frames - 1) * hop_length + frame_length`

    Examples:

    .. code-block:: python

        import paddle
        from paddle.signal import overlap_add

        # 2D
        x0 = paddle.arange(16).reshape([8, 2])
        # [[0 , 1 ],
        #  [2 , 3 ],
        #  [4 , 5 ],
        #  [6 , 7 ],
        #  [8 , 9 ],
        #  [10, 11],
        #  [12, 13],
        #  [14, 15]]
        y0 = overlap_add(x0, hop_length=2, axis=-1)  # [10]
        # [0 , 2 , 5 , 9 , 13, 17, 21, 25, 13, 15]

        x1 = paddle.arange(16).reshape([2, 8])
        # [[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ],
        #  [8 , 9 , 10, 11, 12, 13, 14, 15]]
        y1 = overlap_add(x1, hop_length=2, axis=0)   # [10]
        # [0 , 1 , 10, 12, 14, 16, 18, 20, 14, 15]

        # > 2D
        x0 = paddle.arange(32).reshape([2, 1, 8, 2])
        y0 = overlap_add(x0, hop_length=2, axis=-1)  # [2, 1, 10]

        x1 = paddle.arange(32).reshape([2, 8, 1, 2])
        y1 = overlap_add(x1, hop_length=2, axis=0)   # [10, 1, 2]
    """
    if axis not in [0, -1]:
        raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.')

    if not isinstance(hop_length, int) or hop_length <= 0:
        raise ValueError(
            f'Unexpected hop_length: {hop_length}. It should be an positive integer.'
        )

    op_type = 'overlap_add'

    if in_dygraph_mode():
        out = _C_ops.overlap_add(x, hop_length, axis)
    elif paddle.in_dynamic_mode():
        attrs = ('hop_length', hop_length, 'axis', axis)
        op = getattr(_legacy_C_ops, op_type)
        out = op(x, *attrs)
    else:
        check_variable_and_dtype(
            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
        )
        helper = LayerHelper(op_type, **locals())
        dtype = helper.input_dtype(input_param_name='x')
        out = helper.create_variable_for_type_inference(dtype=dtype)
        helper.append_op(
            type=op_type,
            inputs={'X': x},
            attrs={'hop_length': hop_length, 'axis': axis},
            outputs={'Out': out},
        )
    return out


def stft(
    x,
    n_fft,
    hop_length=None,
    win_length=None,
    window=None,
    center=True,
    pad_mode='reflect',
    normalized=False,
    onesided=True,
    name=None,
):
    r"""

    Short-time Fourier transform (STFT).

    The STFT computes the discrete Fourier transforms (DFT) of short overlapping
    windows of the input using this formula:

    .. math::
        X_t[f] = \sum_{n = 0}^{N-1} \text{window}[n]\ x[t \times H + n]\ e^{-{2 \pi j f n}/{N}}

    Where:
    - :math:`t`: The :math:`t`-th input window.
    - :math:`f`: Frequency :math:`0 \leq f < \text{n_fft}` for `onesided=False`,
    or :math:`0 \leq f < \lfloor \text{n_fft} / 2 \rfloor + 1` for `onesided=True`.
    - :math:`N`: Value of `n_fft`.
    - :math:`H`: Value of `hop_length`.

    Args:
        x (Tensor): The input data which is a 1-dimensional or 2-dimensional Tensor with
            shape `[..., seq_length]`. It can be a real-valued or a complex Tensor.
        n_fft (int): The number of input samples to perform Fourier transform.
        hop_length (int, optional): Number of steps to advance between adjacent windows
            and `0 < hop_length`. Default: `None` (treated as equal to `n_fft//4`)
        win_length (int, optional): The size of window. Default: `None` (treated as equal
            to `n_fft`)
        window (Tensor, optional): A 1-dimensional tensor of size `win_length`. It will
            be center padded to length `n_fft` if `win_length < n_fft`. Default: `None` (
            treated as a rectangle window with value equal to 1 of size `win_length`).
        center (bool, optional): Whether to pad `x` to make that the
            :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. See
            `paddle.nn.functional.pad` for all padding options. Default: `"reflect"`
        normalized (bool, optional): Control whether to scale the output by `1/sqrt(n_fft)`.
            Default: `False`
        onesided (bool, optional): Control whether to return half of the Fourier transform
            output that satisfies the conjugate symmetry condition when input is a real-valued
            tensor. It can not be `True` if input is a complex tensor. Default: `True`
        name (str, optional): The default value is None. Normally there is no need for user
            to set this property. For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`
        (real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`
        (`onesided` is `False`)

    Examples:
        .. code-block:: python

            import paddle
            from paddle.signal import stft

            # real-valued input
            x = paddle.randn([8, 48000], dtype=paddle.float64)
            y1 = stft(x, n_fft=512)  # [8, 257, 376]
            y2 = stft(x, n_fft=512, onesided=False)  # [8, 512, 376]

            # complex input
            x = paddle.randn([8, 48000], dtype=paddle.float64) + \
                    paddle.randn([8, 48000], dtype=paddle.float64)*1j  # [8, 48000] complex128
            y1 = stft(x, n_fft=512, center=False, onesided=False)  # [8, 512, 372]

    """

    x_rank = len(x.shape)
    assert x_rank in [
        1,
        2,
    ], f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}'

    if x_rank == 1:  # (batch, seq_length)
        x = x.unsqueeze(0)

    if hop_length is None:
        hop_length = int(n_fft // 4)

    assert hop_length > 0, f'hop_length should be > 0, but got {hop_length}.'

    if win_length is None:
        win_length = n_fft

    if _non_static_mode():
        assert (
            0 < n_fft <= x.shape[-1]
        ), f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'

    assert (
        0 < win_length <= n_fft
    ), f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'

    if window is not None:
        assert (
            len(window.shape) == 1 and len(window) == win_length
        ), f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.'
    else:
        window = paddle.ones(shape=(win_length,), dtype=x.dtype)

    if win_length < n_fft:
        pad_left = (n_fft - win_length) // 2
        pad_right = n_fft - win_length - pad_left
        window = paddle.nn.functional.pad(
            window, pad=[pad_left, pad_right], mode='constant'
        )

    if center:
        assert pad_mode in [
            'constant',
            'reflect',
        ], 'pad_mode should be "reflect" or "constant", but got "{}".'.format(
            pad_mode
        )

        pad_length = n_fft // 2
        # FIXME: Input `x` can be a complex tensor but pad does not support complex input.
        x = paddle.nn.functional.pad(
            x.unsqueeze(-1),
            pad=[pad_length, pad_length],
            mode=pad_mode,
            data_format="NLC",
        ).squeeze(-1)

    x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1)
    x_frames = x_frames.transpose(
        perm=[0, 2, 1]
    )  # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
    x_frames = paddle.multiply(x_frames, window)

    norm = 'ortho' if normalized else 'backward'
    if is_complex(x_frames):
        assert (
            not onesided
        ), 'onesided should be False when input or window is a complex Tensor.'

    if not is_complex(x):
        out = fft_r2c(
            x=x_frames,
            n=None,
            axis=-1,
            norm=norm,
            forward=True,
            onesided=onesided,
            name=name,
        )
    else:
        out = fft_c2c(
            x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name
        )

    out = out.transpose(perm=[0, 2, 1])  # (batch, n_fft, num_frames)

    if x_rank == 1:
        out.squeeze_(0)

    return out


def istft(
    x,
    n_fft,
    hop_length=None,
    win_length=None,
    window=None,
    center=True,
    normalized=False,
    onesided=True,
    length=None,
    return_complex=False,
    name=None,
):
    r"""
    Inverse short-time Fourier transform (ISTFT).

    Reconstruct time-domain signal from the giving complex input and window tensor when
    nonzero overlap-add (NOLA) condition is met:

    .. math::
        \sum_{t = -\infty}^{\infty} \text{window}^2[n - t \times H]\ \neq \ 0, \ \text{for } all \ n

    Where:
    - :math:`t`: The :math:`t`-th input window.
    - :math:`N`: Value of `n_fft`.
    - :math:`H`: Value of `hop_length`.

        Result of `istft` expected to be the inverse of `paddle.signal.stft`, but it is
        not guaranteed to reconstruct a exactly realizable time-domain signal from a STFT
        complex tensor which has been modified (via masking or otherwise). Therefore, `istft`
        gives the `[Griffin-Lim optimal estimate] <https://ieeexplore.ieee.org/document/1164317>`_
        (optimal in a least-squares sense) for the corresponding signal.

    Args:
        x (Tensor): The input data which is a 2-dimensional or 3-dimensional **complex**
            Tensor with shape `[..., n_fft, num_frames]`.
        n_fft (int): The size of Fourier transform.
        hop_length (int, optional): Number of steps to advance between adjacent windows
            from time-domain signal and `0 < hop_length < win_length`. Default: `None` (
            treated as equal to `n_fft//4`)
        win_length (int, optional): The size of window. Default: `None` (treated as equal
            to `n_fft`)
        window (Tensor, optional): A 1-dimensional tensor of size `win_length`. It will
            be center padded to length `n_fft` if `win_length < n_fft`. It should be a
            real-valued tensor if `return_complex` is False. Default: `None`(treated as
            a rectangle window with value equal to 1 of size `win_length`).
        center (bool, optional): It means that whether the time-domain signal has been
            center padded. Default: `True`.
        normalized (bool, optional): Control whether to scale the output by :math:`1/sqrt(n_{fft})`.
            Default: `False`
        onesided (bool, optional): It means that whether the input STFT tensor is a half
            of the conjugate symmetry STFT tensor transformed from a real-valued signal
            and `istft` will return a real-valued tensor when it is set to `True`.
            Default: `True`.
        length (int, optional): Specify the length of time-domain signal. Default: `None`(
            treated as the whole length of signal).
        return_complex (bool, optional): It means that whether the time-domain signal is
            real-valued. If `return_complex` is set to `True`, `onesided` should be set to
            `False` cause the output is complex.
        name (str, optional): The default value is None. Normally there is no need for user
            to set this property. For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        A tensor of least squares estimation of the reconstructed signal(s) with shape
        `[..., seq_length]`

    Examples:
        .. code-block:: python

            import numpy as np
            import paddle
            from paddle.signal import stft, istft

            paddle.seed(0)

            # STFT
            x = paddle.randn([8, 48000], dtype=paddle.float64)
            y = stft(x, n_fft=512)  # [8, 257, 376]

            # ISTFT
            x_ = istft(y, n_fft=512)  # [8, 48000]

            np.allclose(x, x_)  # True
    """
    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft')

    x_rank = len(x.shape)
    assert x_rank in [
        2,
        3,
    ], 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(
        x_rank
    )

    if x_rank == 2:  # (batch, n_fft, n_frames)
        x = x.unsqueeze(0)

    if hop_length is None:
        hop_length = int(n_fft // 4)

    if win_length is None:
        win_length = n_fft

    # Assure no gaps between frames.
    assert (
        0 < hop_length <= win_length
    ), 'hop_length should be in (0, win_length({})], but got {}.'.format(
        win_length, hop_length
    )

    assert (
        0 < win_length <= n_fft
    ), 'win_length should be in (0, n_fft({})], but got {}.'.format(
        n_fft, win_length
    )

    n_frames = x.shape[-1]
    fft_size = x.shape[-2]

    if _non_static_mode():
        if onesided:
            assert (
                fft_size == n_fft // 2 + 1
            ), 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(
                n_fft // 2 + 1, fft_size
            )
        else:
            assert (
                fft_size == n_fft
            ), 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(
                n_fft, fft_size
            )

    if window is not None:
        assert (
            len(window.shape) == 1 and len(window) == win_length
        ), 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(
            win_length, window.shape
        )
    else:
        window_dtype = (
            paddle.float32
            if x.dtype in [paddle.float32, paddle.complex64]
            else paddle.float64
        )
        window = paddle.ones(shape=(win_length,), dtype=window_dtype)

    if win_length < n_fft:
        pad_left = (n_fft - win_length) // 2
        pad_right = n_fft - win_length - pad_left
        # FIXME: Input `window` can be a complex tensor but pad does not support complex input.
        window = paddle.nn.functional.pad(
            window, pad=[pad_left, pad_right], mode='constant'
        )

    x = x.transpose(
        perm=[0, 2, 1]
    )  # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
    norm = 'ortho' if normalized else 'backward'

    if return_complex:
        assert (
            not onesided
        ), 'onesided should be False when input(output of istft) or window is a complex Tensor.'

        out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
    else:
        assert not is_complex(
            window
        ), 'Data type of window should not be complex when return_complex is False.'

        if onesided is False:
            x = x[:, :, : n_fft // 2 + 1]
        out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)

    out = paddle.multiply(out, window).transpose(
        perm=[0, 2, 1]
    )  # (batch, n_fft, num_frames)
    out = overlap_add(
        x=out, hop_length=hop_length, axis=-1
    )  # (batch, seq_length)

    window_envelop = overlap_add(
        x=paddle.tile(
            x=paddle.multiply(window, window).unsqueeze(0),
            repeat_times=[n_frames, 1],
        ).transpose(
            perm=[1, 0]
        ),  # (n_fft, num_frames)
        hop_length=hop_length,
        axis=-1,
    )  # (seq_length, )

    if length is None:
        if center:
            out = out[:, (n_fft // 2) : -(n_fft // 2)]
            window_envelop = window_envelop[(n_fft // 2) : -(n_fft // 2)]
    else:
        if center:
            start = n_fft // 2
        else:
            start = 0

        out = out[:, start : start + length]
        window_envelop = window_envelop[start : start + length]

    # Check whether the Nonzero Overlap Add (NOLA) constraint is met.
    if _non_static_mode() and window_envelop.abs().min().item() < 1e-11:
        raise ValueError(
            'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA`(https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).'
        )

    out = out / window_envelop

    if x_rank == 2:
        out.squeeze_(0)

    return out