Source code for mlfinpy.labeling.fixed_time_horizon

"""
Fixed-Time-Horizon Labeling Method

The article "Classification-based Financial Markets Prediction using Deep Neural Networks"
by Dixon et al. (2016) describes how labeling data this way can be used in training deep
neural networks to predict price movements.
"""

import warnings
from typing import Optional, Union

import pandas as pd



[docs]
def fixed_time_horizon(
    prices: Union[pd.Series, pd.DataFrame],
    threshold: Optional[Union[float, pd.Series]] = 0,
    resample_by: Optional[str] = None,
    lag: Optional[bool] = True,
    standardized: Optional[bool] = False,
    window: Optional[int] = None,
) -> Union[pd.Series, pd.DataFrame]:
    """
    Fixed-Time Horizon Labeling Method.

    This method is originally described in the book Advances in Financial Machine Learning,
    Chapter 3.2, p.43-44.

    Returns 1 if return is greater than the threshold, -1 if less, and 0 if in between.
    If no threshold is provided then it will simply take the sign of the return.

    Parameters
    ----------
    prices : pd.Series or pd.DataFrame
        Time-indexed stock prices used to calculate returns.
    threshold : float or pd.Series, optional
        When the absolute value of return exceeds the threshold, the observation is
        labeled with 1 or -1, depending on the sign of the return. If return is less, it's labeled as 0.
        Can be dynamic if threshold is inputted as a pd.Series, and threshold.index must match prices.index.
        If resampling is used, the index of threshold must match the index of prices after resampling.
        If threshold is negative, then the directionality of the labels will be reversed. If no threshold
        is provided, it is assumed to be 0 and the sign of the return is returned.
    resample_by : str, optional
        If not None, the resampling period for price data prior to calculating returns. 'B' = per
        business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period.
        For full details see `here.
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`_
    lag : bool, optional
        If True, returns will be lagged to make them forward-looking.
    standardized : bool, optional
        Whether returns are scaled by mean and standard deviation.
    window : int, optional
        If standardized is True, the rolling window period for calculating the mean and standard
        deviation of returns.

    Returns
    -------
    pd.Series or pd.DataFrame
        -1, 0, or 1 denoting whether the return for each observation is
        less/between/greater than the threshold at each corresponding time index. First or last row will be
        NaN, depending on lag.
    """
    # Apply resample period, if applicable.
    if resample_by is not None:
        prices = prices.resample(resample_by).last()

    # Calculate returns.
    if lag:
        returns = prices.pct_change(1).shift(-1)
    else:
        returns = prices.pct_change(1)

    # If threshold is pd.Series, its index must patch prices.index; otherwise labels will fail to return.
    if isinstance(threshold, pd.Series):
        assert threshold.index.equals(prices.index), (
            "prices.index and threshold.index must match! If prices are "
            "resampled, the threshold index must match the resampled prices "
            "index."
        )

    # Adjust by mean and stdev, if desired. Assert that window must exist if standardization is on. Warning if window
    # is too large.
    if standardized:
        assert isinstance(window, int), "When standardized is True, window must be int."
        if window >= len(returns):
            warnings.warn("The window is greater than the length of the Series. All labels will be NaN.", UserWarning)

        # Apply standardization.
        mean = returns.rolling(window=window).mean()
        stdev = returns.rolling(window=window).std()
        returns -= mean
        returns /= stdev

    # Apply labeling.
    labels = returns.copy()  # Copy returns so labels aren't all 0 when threshold => 1.
    labels[returns.lt(-threshold, axis=0)] = -1
    labels[returns.gt(threshold, axis=0)] = 1
    labels[(returns.ge(-threshold, axis=0)) & (returns.le(threshold, axis=0))] = 0

    return labels