Source code for mlfinpy.data_structure.imbalance_bars

"""
Advances in Financial Machine Learning, Marcos Lopez de Prado

Chapter 2: Financial Data Structures: Imbalance Bars

This module contains the functions to help users create structured financial data from raw unstructured data,
in the form of tick, volume, and dollar imbalance bars.

These bars are used throughout the text book (Advances in Financial Machine Learning, By Marcos Lopez de Prado, 2018,
pg 29) to build the more interesting features for predicting financial time series data.

These financial data structures have better statistical properties when compared to those based on fixed time
interval sampling. A great paper to read more about this is titled: The Volume Clock: Insights into the high
frequency paradigm, Lopez de Prado, et al. These ideas are then extended in another paper: Flow toxicity and liquidity
in a high-frequency world.

We have introduced two types of imbalance bars: with the expected number of ticks defined through an
EMA (book implementation) and with a constant expected number of ticks.

A good blog post to read, which helped us a lot in the implementation here, is written by Maksim Ivanov:
https://towardsdatascience.com/financial-machine-learning-part-0-bars-745897d4e4ba
"""

# Imports
from typing import Iterable, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from mlfinpy.data_structure.base_bars import BaseImbalanceBars
from mlfinpy.util.fast_ewma import ewma


class EMAImbalanceBars(BaseImbalanceBars):
    """
    Imbalance bar generator (chapter 2 of "Advances in Financial Machine Learning" by
    Marcos Lopez de Prado) whose expected number of ticks per bar is re-estimated with
    an EWMA over the tick counts of the most recent bars.

    This class is not meant to be instantiated directly; use the package level helpers
    such as `get_ema_dollar_imbalance_bars`, which build an instance and run it.
    """

    def __init__(
        self,
        inform_bar_type: str,
        num_prev_bars: int,
        expected_imbalance_window: int,
        exp_num_ticks_init: int,
        exp_num_ticks_constraints: Optional[List[int]] = None,
        batch_size: int = 2000000,
        analyse_thresholds: bool = False,
    ) -> None:
        """
        Set up the EMA imbalance bar generator.

        Parameters
        ----------
        inform_bar_type : str
            Type of imbalance bar to create. Example: "dollar_imbalance".
        num_prev_bars : int
            Number of previous bars whose tick counts feed the expected number of ticks (E[T]) estimate.
        expected_imbalance_window : int
            EMA window used to estimate expected imbalance.
        exp_num_ticks_init : int
            Initial number of expected ticks.
        exp_num_ticks_constraints : list or None, optional (default=None)
            Two-element sequence [minimum, maximum] bounding the expected number of ticks. Used to control
            bars sampling convergence. When None, the bounds are 0 and +infinity.
        batch_size : int, optional (default=2000000)
            Number of rows to read in from the csv, per batch.
        analyse_thresholds : bool, optional (default=False)
            Flag to return thresholds values (theta, exp_num_ticks, exp_imbalance) in a form of Pandas DataFrame.
        """
        # The base constructor takes batch_size second, so the order differs from this signature.
        super().__init__(
            inform_bar_type,
            batch_size,
            expected_imbalance_window,
            exp_num_ticks_init,
            analyse_thresholds,
        )

        # Hyper parameters specific to the EMA flavour of imbalance bars.
        self.num_prev_bars = num_prev_bars

        # Start from "unconstrained" bounds and narrow them only if the caller supplied constraints.
        lower_bound, upper_bound = 0, np.inf
        if exp_num_ticks_constraints is not None:
            lower_bound = exp_num_ticks_constraints[0]
            upper_bound = exp_num_ticks_constraints[1]
        self.min_exp_num_ticks = lower_bound
        self.max_exp_num_ticks = upper_bound

    def _get_exp_num_ticks(self):
        """
        Estimate the expected number of ticks for the next bar.

        Takes the last `num_prev_bars` entries of `imbalance_tick_statistics["num_ticks_bar"]`, runs `ewma`
        over them with window `num_prev_bars`, and clamps the final EWMA value to
        [`min_exp_num_ticks`, `max_exp_num_ticks`].

        Returns
        -------
        Clamped expected number of ticks.
        """
        # NOTE(review): imbalance_tick_statistics is not set in this class; presumably
        # BaseImbalanceBars maintains it - confirm.
        recent_tick_counts = self.imbalance_tick_statistics["num_ticks_bar"][-self.num_prev_bars :]
        window_values = np.array(recent_tick_counts, dtype=float)
        latest_ewma = ewma(window_values, self.num_prev_bars)[-1]

        # Apply the lower bound first, then the upper bound.
        floored = max(latest_ewma, self.min_exp_num_ticks)
        return min(floored, self.max_exp_num_ticks)


class ConstImbalanceBars(BaseImbalanceBars):
    """
    Imbalance bar generator (chapter 2 of "Advances in Financial Machine Learning" by
    Marcos Lopez de Prado) that keeps the expected number of ticks per bar constant
    instead of re-estimating it after every bar.

    This class is not meant to be instantiated directly; use the package level helpers
    such as `get_const_dollar_imbalance_bars`, which build an instance and run it.
    """

    def __init__(
        self,
        inform_bar_type: str,
        expected_imbalance_window: int,
        exp_num_ticks_init: int,
        batch_size: int,
        analyse_thresholds: bool,
    ) -> None:
        """
        Set up the constant imbalance bar generator.

        Parameters
        ----------
        inform_bar_type : str
            Type of imbalance bar to create. Example: "dollar_imbalance".
        expected_imbalance_window : int
            EMA window used to estimate expected imbalance.
        exp_num_ticks_init : int
            Initial number of expected ticks.
        batch_size : int
            Number of rows to read in from the csv, per batch.
        analyse_thresholds : bool
            Flag to save and return thresholds used to sample imbalance bars.
        """
        # The base constructor takes batch_size second, so the order differs from this signature.
        base_args = (
            inform_bar_type,
            batch_size,
            expected_imbalance_window,
            exp_num_ticks_init,
            analyse_thresholds,
        )
        super().__init__(*base_args)

    def _get_exp_num_ticks(self) -> int:
        """
        Return the expected number of ticks currently stored in `thresholds`.

        The stored value is handed back as-is; this class never recomputes it.
        """
        current_expectation = self.thresholds["exp_num_ticks"]
        return current_expectation


def get_ema_dollar_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    num_prev_bars: int = 3,
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    exp_num_ticks_constraints: Optional[List[float]] = None,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of EMA dollar imbalance bars with columns: date_time, open, high, low, close, volume,
    cum_buy_volume, cum_ticks, cum_dollar_value.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    num_prev_bars : int, optional (default=3)
        Window size for E[T]s (number of previous bars to use for expected number of ticks estimation).
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    exp_num_ticks_constraints : list or None, optional (default=None)
        Minimum and maximum possible number of expected ticks. Used to control bars sampling convergence.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of dollar imbalance bars, exactly as returned by ``batch_run``.
        NOTE(review): presumably None when to_csv=True (bars go to ``output_path`` instead) - confirm
        against ``batch_run``.
    thresholds : pd.DataFrame
        DataFrame built from ``bars.bars_thresholds``; always a DataFrame because of the explicit wrapping.
    """
    # batch_size default is an int literal (not 2e7) so it honours the ``int`` annotation.
    bars = EMAImbalanceBars(
        inform_bar_type="dollar_imbalance",
        num_prev_bars=num_prev_bars,
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        exp_num_ticks_constraints=exp_num_ticks_constraints,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )

    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)


def get_ema_volume_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    num_prev_bars: int = 3,
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    exp_num_ticks_constraints: Optional[List[float]] = None,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of EMA volume imbalance bars with columns: date_time, open, high, low, close, volume,
    cum_buy_volume, cum_ticks, cum_dollar_value.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    num_prev_bars : int, optional (default=3)
        Window size for E[T]s (number of previous bars to use for expected number of ticks estimation).
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    exp_num_ticks_constraints : list or None, optional (default=None)
        Minimum and maximum possible number of expected ticks. Used to control bars sampling convergence.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of volume imbalance bars, exactly as returned by ``batch_run``.
        NOTE(review): presumably None when to_csv=True (bars go to ``output_path`` instead) - confirm
        against ``batch_run``.
    thresholds : pd.DataFrame
        DataFrame built from ``bars.bars_thresholds``; always a DataFrame because of the explicit wrapping.
    """
    # batch_size default is an int literal (not 2e7) so it honours the ``int`` annotation.
    bars = EMAImbalanceBars(
        inform_bar_type="volume_imbalance",
        num_prev_bars=num_prev_bars,
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        exp_num_ticks_constraints=exp_num_ticks_constraints,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )

    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)


def get_ema_tick_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    num_prev_bars: int = 3,
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    exp_num_ticks_constraints: Optional[List[float]] = None,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of EMA tick imbalance bars with columns: date_time, open, high, low, close, volume,
    cum_buy_volume, cum_ticks, cum_dollar_value.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    num_prev_bars : int, optional (default=3)
        Window size for E[T]s (number of previous bars to use for expected number of ticks estimation).
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    exp_num_ticks_constraints : list or None, optional (default=None)
        Minimum and maximum possible number of expected ticks. Used to control bars sampling convergence.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of tick imbalance bars, exactly as returned by ``batch_run``.
        NOTE(review): presumably None when to_csv=True (bars go to ``output_path`` instead) - confirm
        against ``batch_run``.
    thresholds : pd.DataFrame
        DataFrame built from ``bars.bars_thresholds``; always a DataFrame because of the explicit wrapping.
    """
    # batch_size default is an int literal (not 2e7) so it honours the ``int`` annotation.
    bars = EMAImbalanceBars(
        inform_bar_type="tick_imbalance",
        num_prev_bars=num_prev_bars,
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        exp_num_ticks_constraints=exp_num_ticks_constraints,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )

    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)


def get_const_dollar_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of Const dollar imbalance bars with columns: date_time, open, high, low, close, volume,
    cum_buy_volume, cum_ticks, cum_dollar_value.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of dollar imbalance bars, exactly as returned by ``batch_run``.
        NOTE(review): presumably None when to_csv=True (bars go to ``output_path`` instead) - confirm
        against ``batch_run``.
    thresholds : pd.DataFrame
        DataFrame built from ``bars.bars_thresholds``; always a DataFrame because of the explicit wrapping.
    """
    # batch_size default is an int literal (not 2e7) so it honours the ``int`` annotation.
    bars = ConstImbalanceBars(
        inform_bar_type="dollar_imbalance",
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )

    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)


def get_const_volume_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of Const volume imbalance bars with columns: date_time, open, high, low, close, volume,
    cum_buy_volume, cum_ticks, cum_dollar_value.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of volume imbalance bars, exactly as returned by ``batch_run``.
        NOTE(review): presumably None when to_csv=True (bars go to ``output_path`` instead) - confirm
        against ``batch_run``.
    thresholds : pd.DataFrame
        DataFrame built from ``bars.bars_thresholds``; always a DataFrame because of the explicit wrapping.
    """
    # batch_size default is an int literal (not 2e7) so it honours the ``int`` annotation.
    bars = ConstImbalanceBars(
        inform_bar_type="volume_imbalance",
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )

    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)


def get_const_tick_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of Const tick imbalance bars with columns: date_time, open, high, low, close, volume,
    cum_buy_volume, cum_ticks, cum_dollar_value.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of tick imbalance bars, exactly as returned by ``batch_run``.
        NOTE(review): presumably None when to_csv=True (bars go to ``output_path`` instead) - confirm
        against ``batch_run``.
    thresholds : pd.DataFrame
        DataFrame built from ``bars.bars_thresholds``; always a DataFrame because of the explicit wrapping.
    """
    # batch_size default is an int literal (not 2e7) so it honours the ``int`` annotation.
    bars = ConstImbalanceBars(
        inform_bar_type="tick_imbalance",
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )

    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)