"""
Advances in Financial Machine Learning, Marcos Lopez de Prado
Chapter 2: Financial Data Structures: Imbalance Bars
This module contains the functions to help users create structured financial data from raw unstructured data,
in the form of tick, volume, and dollar imbalance bars.
These bars are used throughout the text book (Advances in Financial Machine Learning, By Marcos Lopez de Prado, 2018,
pg 29) to build the more interesting features for predicting financial time series data.
These financial data structures have better statistical properties when compared to those based on fixed time
interval sampling. A great paper to read more about this is titled: The Volume Clock: Insights into the high
frequency paradigm, Lopez de Prado, et al. These ideas are then extended in another paper: Flow toxicity and liquidity
in a high-frequency world.
We have introduced two types of imbalance bars: one where the expected number of ticks is estimated through an
EMA (book implementation), and one where the expected number of ticks is held constant.
A good blog post to read, which helped us a lot with the implementation here, was written by Maksim Ivanov:
https://towardsdatascience.com/financial-machine-learning-part-0-bars-745897d4e4ba
"""
# Imports
from typing import Iterable, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
from mlfinpy.data_structure.base_bars import BaseImbalanceBars
from mlfinpy.util.fast_ewma import ewma
class EMAImbalanceBars(BaseImbalanceBars):
    """
    Imbalance bars (chapter 2 of "Advances in Financial Machine Learning", Marcos Lopez de Prado)
    whose expected number of ticks per bar is re-estimated with an EMA.

    `_get_exp_num_ticks` smooths the tick counts of the most recent `num_prev_bars` bars with
    an exponentially weighted moving average and clamps the result to
    [`min_exp_num_ticks`, `max_exp_num_ticks`].

    This class is not meant to be instantiated directly; use the package-level helpers such as
    `get_ema_dollar_imbalance_bars`, which build an instance and run it.
    """

    def __init__(
        self,
        inform_bar_type: str,
        num_prev_bars: int,
        expected_imbalance_window: int,
        exp_num_ticks_init: int,
        exp_num_ticks_constraints: Optional[List[int]] = None,
        batch_size: int = 2000000,
        analyse_thresholds: bool = False,
    ) -> None:
        """
        Set up the base imbalance-bar state and the EMA-specific hyper parameters.

        Parameters
        ----------
        inform_bar_type : str
            Type of imbalance bar to create. Example: "dollar_imbalance".
        num_prev_bars : int
            Window size for E[T]s (how many previous bars feed the expected-number-of-ticks estimate).
        expected_imbalance_window : int
            EMA window used to estimate the expected imbalance.
        exp_num_ticks_init : int
            Initial number of expected ticks.
        exp_num_ticks_constraints : list or None, optional (default=None)
            Two-element sequence `[minimum, maximum]` bounding the expected number of ticks; used to
            control convergence of the bar sampling. When None the bounds are 0 and +infinity.
        batch_size : int, optional (default=2000000)
            Number of rows to read in from the csv, per batch.
        analyse_thresholds : bool, optional (default=False)
            Flag to return threshold values (theta, exp_num_ticks, exp_imbalance) as a Pandas DataFrame.
        """
        # The base constructor takes batch_size second, ahead of the window arguments.
        super().__init__(
            inform_bar_type,
            batch_size,
            expected_imbalance_window,
            exp_num_ticks_init,
            analyse_thresholds,
        )

        # EMA imbalance specific hyper parameters.
        self.num_prev_bars = num_prev_bars

        # Start from "unconstrained" bounds and narrow them only when the caller supplied limits.
        lower_bound, upper_bound = 0, np.inf
        if exp_num_ticks_constraints is not None:
            lower_bound = exp_num_ticks_constraints[0]
            upper_bound = exp_num_ticks_constraints[1]
        self.min_exp_num_ticks = lower_bound
        self.max_exp_num_ticks = upper_bound

    def _get_exp_num_ticks(self):
        """
        Estimate the expected number of ticks for the next bar.

        Takes the last `num_prev_bars` entries of
        `self.imbalance_tick_statistics["num_ticks_bar"]`, runs `ewma` over them with a window of
        `num_prev_bars`, and keeps the final smoothed value.

        Returns
        -------
        The smoothed value, raised to at least `min_exp_num_ticks` and then capped at
        `max_exp_num_ticks`.
        """
        window = self.num_prev_bars
        ticks_per_bar = self.imbalance_tick_statistics["num_ticks_bar"]
        recent_counts = np.array(ticks_per_bar[-window:], dtype=float)
        smoothed = ewma(recent_counts, window)[-1]

        # Apply the lower bound first, then the upper bound (same order as a nested min(max(...))).
        floored = max(smoothed, self.min_exp_num_ticks)
        return min(floored, self.max_exp_num_ticks)
class ConstImbalanceBars(BaseImbalanceBars):
    """
    Imbalance bars (chapter 2 of "Advances in Financial Machine Learning", Marcos Lopez de Prado)
    that do not re-estimate the expected number of ticks per bar.

    `_get_exp_num_ticks` simply hands back the value already stored under
    `self.thresholds["exp_num_ticks"]`.

    This class is not meant to be instantiated directly; use the package-level helpers such as
    `get_const_dollar_imbalance_bars`, which build an instance and run it.
    """

    def __init__(
        self,
        inform_bar_type: str,
        expected_imbalance_window: int,
        exp_num_ticks_init: int,
        batch_size: int,
        analyse_thresholds: bool,
    ) -> None:
        """
        Forward all settings to the base imbalance-bar constructor.

        Parameters
        ----------
        inform_bar_type : str
            Type of imbalance bar to create. Example: "dollar_imbalance".
        expected_imbalance_window : int
            EMA window used to estimate the expected imbalance.
        exp_num_ticks_init : int
            Initial number of expected ticks.
        batch_size : int
            Number of rows to read in from the csv, per batch.
        analyse_thresholds : bool
            Flag to save and return the thresholds used to sample imbalance bars.
        """
        # The base constructor takes batch_size second, ahead of the window arguments.
        base_args = (
            inform_bar_type,
            batch_size,
            expected_imbalance_window,
            exp_num_ticks_init,
            analyse_thresholds,
        )
        super().__init__(*base_args)

    def _get_exp_num_ticks(self) -> int:
        """
        Return the stored expected number of ticks without re-estimating it.

        NOTE(review): presumably this is the `exp_num_ticks_init` value kept by the base class -
        confirm against BaseImbalanceBars.
        """
        current_thresholds = self.thresholds
        return current_thresholds["exp_num_ticks"]
def get_ema_dollar_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    num_prev_bars: int = 3,
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    exp_num_ticks_constraints: Optional[List[float]] = None,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of EMA dollar imbalance bars with columns: date_time, open, high, low,
    close, volume, cum_buy_volume, cum_ticks, cum_dollar_value.

    Builds an `EMAImbalanceBars` instance for the "dollar_imbalance" metric and runs it over the
    supplied tick data via `batch_run`.

    Parameters
    ----------
    file_path_or_df : str or iterable of str or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    num_prev_bars : int, optional (default=3)
        Window size for E[T]s (number of previous bars to use for expected number of ticks estimation).
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    exp_num_ticks_constraints : list or None, optional (default=None)
        Minimum and maximum possible number of expected ticks. Used to control bars sampling convergence.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of dollar imbalance bars, exactly as returned by `batch_run`.
        NOTE(review): presumably None when to_csv=True, because the bars are written to
        `output_path` instead - confirm against `batch_run`.
    thresholds : pd.DataFrame
        DataFrame built from `bars.bars_thresholds`. This element is always a DataFrame object,
        regardless of `to_csv`.
    """
    bars = EMAImbalanceBars(
        inform_bar_type="dollar_imbalance",
        num_prev_bars=num_prev_bars,
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        exp_num_ticks_constraints=exp_num_ticks_constraints,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )
    # Thresholds are read only after the run, once the bars object has processed the data.
    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)
def get_ema_volume_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    num_prev_bars: int = 3,
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    exp_num_ticks_constraints: Optional[List[float]] = None,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of EMA volume imbalance bars with columns: date_time, open, high, low,
    close, volume, cum_buy_volume, cum_ticks, cum_dollar_value.

    Builds an `EMAImbalanceBars` instance for the "volume_imbalance" metric and runs it over the
    supplied tick data via `batch_run`.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    num_prev_bars : int, optional (default=3)
        Window size for E[T]s (number of previous bars to use for expected number of ticks estimation).
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    exp_num_ticks_constraints : list or None, optional (default=None)
        Minimum and maximum possible number of expected ticks. Used to control bars sampling convergence.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of volume imbalance bars, exactly as returned by `batch_run`.
        NOTE(review): presumably None when to_csv=True, because the bars are written to
        `output_path` instead - confirm against `batch_run`.
    thresholds : pd.DataFrame
        DataFrame built from `bars.bars_thresholds`. This element is always a DataFrame object,
        regardless of `to_csv`.
    """
    bars = EMAImbalanceBars(
        inform_bar_type="volume_imbalance",
        num_prev_bars=num_prev_bars,
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        exp_num_ticks_constraints=exp_num_ticks_constraints,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )
    # Thresholds are read only after the run, once the bars object has processed the data.
    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)
def get_ema_tick_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    num_prev_bars: int = 3,
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    exp_num_ticks_constraints: Optional[List[float]] = None,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of EMA tick imbalance bars with columns: date_time, open, high, low,
    close, volume, cum_buy_volume, cum_ticks, cum_dollar_value.

    Builds an `EMAImbalanceBars` instance for the "tick_imbalance" metric and runs it over the
    supplied tick data via `batch_run`.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    num_prev_bars : int, optional (default=3)
        Window size for E[T]s (number of previous bars to use for expected number of ticks estimation).
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    exp_num_ticks_constraints : list or None, optional (default=None)
        Minimum and maximum possible number of expected ticks. Used to control bars sampling convergence.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of tick imbalance bars, exactly as returned by `batch_run`.
        NOTE(review): presumably None when to_csv=True, because the bars are written to
        `output_path` instead - confirm against `batch_run`.
    thresholds : pd.DataFrame
        DataFrame built from `bars.bars_thresholds`. This element is always a DataFrame object,
        regardless of `to_csv`.
    """
    bars = EMAImbalanceBars(
        inform_bar_type="tick_imbalance",
        num_prev_bars=num_prev_bars,
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        exp_num_ticks_constraints=exp_num_ticks_constraints,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )
    # Thresholds are read only after the run, once the bars object has processed the data.
    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)
def get_const_dollar_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of Const dollar imbalance bars with columns: date_time, open, high, low,
    close, volume, cum_buy_volume, cum_ticks, cum_dollar_value.

    Builds a `ConstImbalanceBars` instance for the "dollar_imbalance" metric and runs it over the
    supplied tick data via `batch_run`.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of dollar imbalance bars, exactly as returned by `batch_run`.
        NOTE(review): presumably None when to_csv=True, because the bars are written to
        `output_path` instead - confirm against `batch_run`.
    thresholds : pd.DataFrame
        DataFrame built from `bars.bars_thresholds`. This element is always a DataFrame object,
        regardless of `to_csv`.
    """
    bars = ConstImbalanceBars(
        inform_bar_type="dollar_imbalance",
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )
    # Thresholds are read only after the run, once the bars object has processed the data.
    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)
def get_const_volume_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of Const volume imbalance bars with columns: date_time, open, high, low,
    close, volume, cum_buy_volume, cum_ticks, cum_dollar_value.

    Builds a `ConstImbalanceBars` instance for the "volume_imbalance" metric and runs it over the
    supplied tick data via `batch_run`.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of volume imbalance bars, exactly as returned by `batch_run`.
        NOTE(review): presumably None when to_csv=True, because the bars are written to
        `output_path` instead - confirm against `batch_run`.
    thresholds : pd.DataFrame
        DataFrame built from `bars.bars_thresholds`. This element is always a DataFrame object,
        regardless of `to_csv`.
    """
    bars = ConstImbalanceBars(
        inform_bar_type="volume_imbalance",
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )
    # Thresholds are read only after the run, once the bars object has processed the data.
    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)
def get_const_tick_imbalance_bars(
    file_path_or_df: Union[str, Iterable[str], pd.DataFrame],
    expected_imbalance_window: int = 10000,
    exp_num_ticks_init: int = 20000,
    batch_size: int = 20000000,
    analyse_thresholds: bool = False,
    verbose: bool = True,
    to_csv: bool = False,
    output_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a DataFrame of Const tick imbalance bars with columns: date_time, open, high, low,
    close, volume, cum_buy_volume, cum_ticks, cum_dollar_value.

    Builds a `ConstImbalanceBars` instance for the "tick_imbalance" metric and runs it over the
    supplied tick data via `batch_run`.

    Parameters
    ----------
    file_path_or_df : str, iterable of str, or pd.DataFrame
        Path to the csv file(s) or Pandas Data Frame containing raw tick data in the format[date_time, price, volume].
    expected_imbalance_window : int, optional (default=10000)
        EMA window used to estimate expected imbalance.
    exp_num_ticks_init : int, optional (default=20000)
        Initial expected number of ticks per bar.
    batch_size : int, optional (default=20000000)
        The number of rows per batch. Less RAM = smaller batch size.
    analyse_thresholds : bool, optional (default=False)
        Flag to save and return thresholds used to sample imbalance bars.
    verbose : bool, optional (default=True)
        Print out batch numbers (True or False).
    to_csv : bool, optional (default=False)
        Save bars to csv after every batch run (True or False).
    output_path : str or None, optional (default=None)
        Path to csv file, if to_csv is True.

    Returns
    -------
    imbalance_bars : pd.DataFrame
        DataFrame of tick imbalance bars, exactly as returned by `batch_run`.
        NOTE(review): presumably None when to_csv=True, because the bars are written to
        `output_path` instead - confirm against `batch_run`.
    thresholds : pd.DataFrame
        DataFrame built from `bars.bars_thresholds`. This element is always a DataFrame object,
        regardless of `to_csv`.
    """
    bars = ConstImbalanceBars(
        inform_bar_type="tick_imbalance",
        expected_imbalance_window=expected_imbalance_window,
        exp_num_ticks_init=exp_num_ticks_init,
        batch_size=batch_size,
        analyse_thresholds=analyse_thresholds,
    )
    imbalance_bars = bars.batch_run(
        file_path_or_df=file_path_or_df,
        verbose=verbose,
        to_csv=to_csv,
        output_path=output_path,
    )
    # Thresholds are read only after the run, once the bars object has processed the data.
    return imbalance_bars, pd.DataFrame(bars.bars_thresholds)