Source code for zipline.pipeline.factors.statistical

from numexpr import evaluate
import numpy as np
from numpy import broadcast_arrays
from scipy.stats import (
    linregress,
    spearmanr,
)

from zipline.assets import Asset
from zipline.errors import IncompatibleTerms
from zipline.pipeline.factors import CustomFactor
from zipline.pipeline.filters import SingleAsset
from zipline.pipeline.mixins import StandardOutputs
from zipline.pipeline.sentinels import NotSpecified
from zipline.pipeline.term import AssetExists
from zipline.utils.input_validation import (
    expect_bounded,
    expect_dtypes,
    expect_types,
)
from zipline.utils.math_utils import nanmean
from zipline.utils.numpy_utils import (
    float64_dtype,
    int64_dtype,
)


from .basic import Returns


ALLOWED_DTYPES = (float64_dtype, int64_dtype)


class _RollingCorrelation(CustomFactor):
    @expect_dtypes(base_factor=ALLOWED_DTYPES, target=ALLOWED_DTYPES)
    @expect_bounded(correlation_length=(2, None))
    def __new__(cls, base_factor, target, correlation_length, mask=NotSpecified):
        if target.ndim == 2 and base_factor.mask is not target.mask:
            raise IncompatibleTerms(term_1=base_factor, term_2=target)

        return super(_RollingCorrelation, cls).__new__(
            cls,
            inputs=[base_factor, target],
            window_length=correlation_length,
            mask=mask,
        )


[docs]class RollingPearson(_RollingCorrelation):
    """
    A Factor that computes pearson correlation coefficients between the columns
    of a given Factor and either the columns of another Factor/BoundColumn or a
    slice/single column of data.

    Parameters
    ----------
    base_factor : zipline.pipeline.Factor
        The factor for which to compute correlations of each of its columns
        with `target`.
    target : zipline.pipeline.Term with a numeric dtype
        The term with which to compute correlations against each column of data
        produced by `base_factor`. This term may be a Factor, a BoundColumn or
        a Slice. If `target` is two-dimensional, correlations are computed
        asset-wise.
    correlation_length : int
        Length of the lookback window over which to compute each correlation
        coefficient.
    mask : zipline.pipeline.Filter, optional
        A Filter describing which assets (columns) of `base_factor` should have
        their correlation with `target` computed each day.

    See Also
    --------
    :func:`scipy.stats.pearsonr`
    :meth:`Factor.pearsonr`
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`

    Notes
    -----
    Most users should call Factor.pearsonr rather than directly construct an
    instance of this class.
    """

    window_safe = True

[docs]    def compute(self, today, assets, out, base_data, target_data):
        vectorized_pearson_r(
            base_data,
            target_data,
            allowed_missing=0,
            out=out,
        )


[docs]class RollingSpearman(_RollingCorrelation):
    """
    A Factor that computes spearman rank correlation coefficients between the
    columns of a given Factor and either the columns of another
    Factor/BoundColumn or a slice/single column of data.

    Parameters
    ----------
    base_factor : zipline.pipeline.Factor
        The factor for which to compute correlations of each of its columns
        with `target`.
    target : zipline.pipeline.Term with a numeric dtype
        The term with which to compute correlations against each column of data
        produced by `base_factor`. This term may be a Factor, a BoundColumn or
        a Slice. If `target` is two-dimensional, correlations are computed
        asset-wise.
    correlation_length : int
        Length of the lookback window over which to compute each correlation
        coefficient.
    mask : zipline.pipeline.Filter, optional
        A Filter describing which assets (columns) of `base_factor` should have
        their correlation with `target` computed each day.

    See Also
    --------
    :func:`scipy.stats.spearmanr`
    :meth:`Factor.spearmanr`
    :class:`zipline.pipeline.factors.RollingSpearmanOfReturns`

    Notes
    -----
    Most users should call Factor.spearmanr rather than directly construct an
    instance of this class.
    """

    window_safe = True

[docs]    def compute(self, today, assets, out, base_data, target_data):
        # If `target_data` is a Slice or single column of data, broadcast it
        # out to the same shape as `base_data`, then compute column-wise. This
        # is efficient because each column of the broadcasted array only refers
        # to a single memory location.
        target_data = broadcast_arrays(target_data, base_data)[0]
        for i in range(len(out)):
            out[i] = spearmanr(base_data[:, i], target_data[:, i])[0]


class RollingLinearRegression(CustomFactor):
    """
    A Factor that performs an ordinary least-squares regression predicting the
    columns of a given Factor from either the columns of another
    Factor/BoundColumn or a slice/single column of data.

    Parameters
    ----------
    dependent : zipline.pipeline.Factor
        The factor whose columns are the predicted/dependent variable of each
        regression with `independent`.
    independent : zipline.pipeline.slice.Slice or zipline.pipeline.Factor
        The factor/slice whose columns are the predictor/independent variable
        of each regression with `dependent`. If `independent` is a Factor,
        regressions are computed asset-wise.
    regression_length : int
        Length of the lookback window over which to compute each regression.
    mask : zipline.pipeline.Filter, optional
        A Filter describing which assets (columns) of `dependent` should be
        regressed against `independent` each day.

    See Also
    --------
    :func:`scipy.stats.linregress`
    :meth:`Factor.linear_regression`
    :class:`zipline.pipeline.factors.RollingLinearRegressionOfReturns`

    Notes
    -----
    Most users should call Factor.linear_regression rather than directly
    construct an instance of this class.
    """

    outputs = ["alpha", "beta", "r_value", "p_value", "stderr"]

    @expect_dtypes(dependent=ALLOWED_DTYPES, independent=ALLOWED_DTYPES)
    @expect_bounded(regression_length=(2, None))
    def __new__(cls, dependent, independent, regression_length, mask=NotSpecified):
        if independent.ndim == 2 and dependent.mask is not independent.mask:
            raise IncompatibleTerms(term_1=dependent, term_2=independent)

        return super(RollingLinearRegression, cls).__new__(
            cls,
            inputs=[dependent, independent],
            window_length=regression_length,
            mask=mask,
        )

    def compute(self, today, assets, out, dependent, independent):
        alpha = out.alpha
        beta = out.beta
        r_value = out.r_value
        p_value = out.p_value
        stderr = out.stderr

        def regress(y, x):
            regr_results = linregress(y=y, x=x)
            # `linregress` returns its results in the following order:
            # slope, intercept, r-value, p-value, stderr
            alpha[i] = regr_results[1]
            beta[i] = regr_results[0]
            r_value[i] = regr_results[2]
            p_value[i] = regr_results[3]
            stderr[i] = regr_results[4]

        # If `independent` is a Slice or single column of data, broadcast it
        # out to the same shape as `dependent`, then compute column-wise. This
        # is efficient because each column of the broadcasted array only refers
        # to a single memory location.
        independent = broadcast_arrays(independent, dependent)[0]
        for i in range(len(out)):
            regress(y=dependent[:, i], x=independent[:, i])


[docs]class RollingPearsonOfReturns(RollingPearson):
    """
    Calculates the Pearson product-moment correlation coefficient of the
    returns of the given asset with the returns of all other assets.

    Pearson correlation is what most people mean when they say "correlation
    coefficient" or "R-value".

    Parameters
    ----------
    target : zipline.assets.Asset
        The asset to correlate with all other assets.
    returns_length : int >= 2
        Length of the lookback window over which to compute returns. Daily
        returns require a window length of 2.
    correlation_length : int >= 1
        Length of the lookback window over which to compute each correlation
        coefficient.
    mask : zipline.pipeline.Filter, optional
        A Filter describing which assets should have their correlation with the
        target asset computed each day.

    Notes
    -----
    Computing this factor over many assets can be time consuming. It is
    recommended that a mask be used in order to limit the number of assets over
    which correlations are computed.

    Examples
    --------
    Let the following be example 10-day returns for three different assets::

                       SPY    MSFT     FB
        2017-03-13    -.03     .03    .04
        2017-03-14    -.02    -.03    .02
        2017-03-15    -.01     .02    .01
        2017-03-16       0    -.02    .01
        2017-03-17     .01     .04   -.01
        2017-03-20     .02    -.03   -.02
        2017-03-21     .03     .01   -.02
        2017-03-22     .04    -.02   -.02

    Suppose we are interested in SPY's rolling returns correlation with each
    stock from 2017-03-17 to 2017-03-22, using a 5-day look back window (that
    is, we calculate each correlation coefficient over 5 days of data). We can
    achieve this by doing::

        rolling_correlations = RollingPearsonOfReturns(
            target=sid(8554),
            returns_length=10,
            correlation_length=5,
        )

    The result of computing ``rolling_correlations`` from 2017-03-17 to
    2017-03-22 gives::

                       SPY   MSFT     FB
        2017-03-17       1    .15   -.96
        2017-03-20       1    .10   -.96
        2017-03-21       1   -.16   -.94
        2017-03-22       1   -.16   -.85

    Note that the column for SPY is all 1's, as the correlation of any data
    series with itself is always 1. To understand how each of the other values
    were calculated, take for example the .15 in MSFT's column. This is the
    correlation coefficient between SPY's returns looking back from 2017-03-17
    (-.03, -.02, -.01, 0, .01) and MSFT's returns (.03, -.03, .02, -.02, .04).

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingSpearmanOfReturns`
    :class:`zipline.pipeline.factors.RollingLinearRegressionOfReturns`
    """

    def __new__(cls, target, returns_length, correlation_length, mask=NotSpecified):
        # Use the `SingleAsset` filter here because it protects against
        # inputting a non-existent target asset.
        returns = Returns(
            window_length=returns_length,
            mask=(AssetExists() | SingleAsset(asset=target)),
        )
        return super(RollingPearsonOfReturns, cls).__new__(
            cls,
            base_factor=returns,
            target=returns[target],
            correlation_length=correlation_length,
            mask=mask,
        )


[docs]class RollingSpearmanOfReturns(RollingSpearman):
    """
    Calculates the Spearman rank correlation coefficient of the returns of the
    given asset with the returns of all other assets.

    Parameters
    ----------
    target : zipline.assets.Asset
        The asset to correlate with all other assets.
    returns_length : int >= 2
        Length of the lookback window over which to compute returns. Daily
        returns require a window length of 2.
    correlation_length : int >= 1
        Length of the lookback window over which to compute each correlation
        coefficient.
    mask : zipline.pipeline.Filter, optional
        A Filter describing which assets should have their correlation with the
        target asset computed each day.

    Notes
    -----
    Computing this factor over many assets can be time consuming. It is
    recommended that a mask be used in order to limit the number of assets over
    which correlations are computed.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    :class:`zipline.pipeline.factors.RollingLinearRegressionOfReturns`
    """

    def __new__(cls, target, returns_length, correlation_length, mask=NotSpecified):
        # Use the `SingleAsset` filter here because it protects against
        # inputting a non-existent target asset.
        returns = Returns(
            window_length=returns_length,
            mask=(AssetExists() | SingleAsset(asset=target)),
        )
        return super(RollingSpearmanOfReturns, cls).__new__(
            cls,
            base_factor=returns,
            target=returns[target],
            correlation_length=correlation_length,
            mask=mask,
        )


[docs]class RollingLinearRegressionOfReturns(RollingLinearRegression):
    """Perform an ordinary least-squares regression predicting the returns of all
    other assets on the given asset.

    Parameters
    ----------
    target : zipline.assets.Asset
        The asset to regress against all other assets.
    returns_length : int >= 2
        Length of the lookback window over which to compute returns. Daily
        returns require a window length of 2.
    regression_length : int >= 1
        Length of the lookback window over which to compute each regression.
    mask : zipline.pipeline.Filter, optional
        A Filter describing which assets should be regressed against the target
        asset each day.

    Notes
    -----
    Computing this factor over many assets can be time consuming. It is
    recommended that a mask be used in order to limit the number of assets over
    which regressions are computed.

    This factor is designed to return five outputs:

    - alpha, a factor that computes the intercepts of each regression.
    - beta, a factor that computes the slopes of each regression.
    - r_value, a factor that computes the correlation coefficient of each
      regression.
    - p_value, a factor that computes, for each regression, the two-sided
      p-value for a hypothesis test whose null hypothesis is that the slope is
      zero.
    - stderr, a factor that computes the standard error of the estimate of each
      regression.

    For more help on factors with multiple outputs, see
    :class:`zipline.pipeline.CustomFactor`.

    Examples
    --------
    Let the following be example 10-day returns for three different assets::

                       SPY    MSFT     FB
        2017-03-13    -.03     .03    .04
        2017-03-14    -.02    -.03    .02
        2017-03-15    -.01     .02    .01
        2017-03-16       0    -.02    .01
        2017-03-17     .01     .04   -.01
        2017-03-20     .02    -.03   -.02
        2017-03-21     .03     .01   -.02
        2017-03-22     .04    -.02   -.02

    Suppose we are interested in predicting each stock's returns from SPY's
    over rolling 5-day look back windows. We can compute rolling regression
    coefficients (alpha and beta) from 2017-03-17 to 2017-03-22 by doing::

        regression_factor = RollingRegressionOfReturns(
            target=sid(8554),
            returns_length=10,
            regression_length=5,
        )
        alpha = regression_factor.alpha
        beta = regression_factor.beta

    The result of computing ``alpha`` from 2017-03-17 to 2017-03-22 gives::

                       SPY    MSFT     FB
        2017-03-17       0    .011   .003
        2017-03-20       0   -.004   .004
        2017-03-21       0    .007   .006
        2017-03-22       0    .002   .008

    And the result of computing ``beta`` from 2017-03-17 to 2017-03-22 gives::

                       SPY    MSFT     FB
        2017-03-17       1      .3   -1.1
        2017-03-20       1      .2     -1
        2017-03-21       1     -.3     -1
        2017-03-22       1     -.3    -.9

    Note that SPY's column for alpha is all 0's and for beta is all 1's, as the
    regression line of SPY with itself is simply the function y = x.

    To understand how each of the other values were calculated, take for
    example MSFT's ``alpha`` and ``beta`` values on 2017-03-17 (.011 and .3,
    respectively). These values are the result of running a linear regression
    predicting MSFT's returns from SPY's returns, using values starting at
    2017-03-17 and looking back 5 days. That is, the regression was run with
    x = [-.03, -.02, -.01, 0, .01] and y = [.03, -.03, .02, -.02, .04], and it
    produced a slope of .3 and an intercept of .011.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    :class:`zipline.pipeline.factors.RollingSpearmanOfReturns`
    """

    window_safe = True

    def __new__(cls, target, returns_length, regression_length, mask=NotSpecified):
        # Use the `SingleAsset` filter here because it protects against
        # inputting a non-existent target asset.
        returns = Returns(
            window_length=returns_length,
            mask=(AssetExists() | SingleAsset(asset=target)),
        )
        return super(RollingLinearRegressionOfReturns, cls).__new__(
            cls,
            dependent=returns,
            independent=returns[target],
            regression_length=regression_length,
            mask=mask,
        )


[docs]class SimpleBeta(CustomFactor, StandardOutputs):
    """Factor producing the slope of a regression line between each asset's daily
    returns to the daily returns of a single "target" asset.

    Parameters
    ----------
    target : zipline.Asset
        Asset against which other assets should be regressed.
    regression_length : int
        Number of days of daily returns to use for the regression.
    allowed_missing_percentage : float, optional
        Percentage of returns observations (between 0 and 1) that are allowed
        to be missing when calculating betas. Assets with more than this
        percentage of returns observations missing will produce values of
        NaN. Default behavior is that 25% of inputs can be missing.
    """

    window_safe = True
    dtype = float64_dtype
    params = ("allowed_missing_count",)

    @expect_types(
        target=Asset,
        regression_length=int,
        allowed_missing_percentage=(int, float),
        __funcname="SimpleBeta",
    )
    @expect_bounded(
        regression_length=(3, None),
        allowed_missing_percentage=(0.0, 1.0),
        __funcname="SimpleBeta",
    )
    def __new__(cls, target, regression_length, allowed_missing_percentage=0.25):
        daily_returns = Returns(
            window_length=2,
            mask=(AssetExists() | SingleAsset(asset=target)),
        )
        allowed_missing_count = int(allowed_missing_percentage * regression_length)
        return super(SimpleBeta, cls).__new__(
            cls,
            inputs=[daily_returns, daily_returns[target]],
            window_length=regression_length,
            allowed_missing_count=allowed_missing_count,
        )

[docs]    def compute(
        self, today, assets, out, all_returns, target_returns, allowed_missing_count
    ):
        vectorized_beta(
            dependents=all_returns,
            independent=target_returns,
            allowed_missing=allowed_missing_count,
            out=out,
        )

[docs]    def graph_repr(self):
        return "{}({!r}, {}, {})".format(
            type(self).__name__,
            str(self.target.symbol),  # coerce from unicode to str in py2.
            self.window_length,
            self.params["allowed_missing_count"],
        )

    @property
    def target(self):
        """Get the target of the beta calculation."""
        return self.inputs[1].asset

    def __repr__(self):
        return "{}({}, length={}, allowed_missing={})".format(
            type(self).__name__,
            self.target,
            self.window_length,
            self.params["allowed_missing_count"],
        )


def vectorized_beta(dependents, independent, allowed_missing, out=None):
    """Compute slopes of linear regressions between columns of ``dependents`` and
    ``independent``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be regressed against ``independent``.
    independent : np.array[N, 1]
        Independent variable of the regression
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many non-nan observations in either ``dependents`` or
        ``independents`` will output NaN as the regression coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results.  If None, a new array is
        created and returned.

    Returns
    -------
    slopes : np.array[M]
        Linear regression coefficients for each column of ``dependents``.
    """
    # Cache these as locals since we're going to call them multiple times.
    nan = np.nan
    isnan = np.isnan
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    # Copy N times as a column vector and fill with nans to have the same
    # missing value pattern as the dependent variable.
    #
    # PERF_TODO: We could probably avoid the space blowup by doing this in
    # Cython.

    # shape: (N, M)
    independent = np.where(
        isnan(dependents),
        nan,
        independent,
    )

    # Calculate beta as Cov(X, Y) / Cov(X, X).
    # https://en.wikipedia.org/wiki/Simple_linear_regression#Fitting_the_regression_line  # noqa
    #
    # NOTE: The usual formula for covariance is::
    #
    #    mean((X - mean(X)) * (Y - mean(Y)))
    #
    # However, we don't actually need to take the mean of both sides of the
    # product, because of the folllowing equivalence::
    #
    # Let X_res = (X - mean(X)).
    # We have:
    #
    #     mean(X_res * (Y - mean(Y))) = mean(X_res * (Y - mean(Y)))
    #                             (1) = mean((X_res * Y) - (X_res * mean(Y)))
    #                             (2) = mean(X_res * Y) - mean(X_res * mean(Y))
    #                             (3) = mean(X_res * Y) - mean(X_res) * mean(Y)
    #                             (4) = mean(X_res * Y) - 0 * mean(Y)
    #                             (5) = mean(X_res * Y)
    #
    #
    # The tricky step in the above derivation is step (4). We know that
    # mean(X_res) is zero because, for any X:
    #
    #     mean(X - mean(X)) = mean(X) - mean(X) = 0.
    #
    # The upshot of this is that we only have to center one of `independent`
    # and `dependent` when calculating covariances. Since we need the centered
    # `independent` to calculate its variance in the next step, we choose to
    # center `independent`.

    # shape: (N, M)
    ind_residual = independent - nanmean(independent, axis=0)

    # shape: (M,)
    covariances = nanmean(ind_residual * dependents, axis=0)

    # We end up with different variances in each column here because each
    # column may have a different subset of the data dropped due to missing
    # data in the corresponding dependent column.
    # shape: (M,)
    independent_variances = nanmean(ind_residual**2, axis=0)

    # shape: (M,)
    np.divide(covariances, independent_variances, out=out)

    # Write nans back to locations where we have more then allowed number of
    # missing entries.
    nanlocs = isnan(independent).sum(axis=0) > allowed_missing
    out[nanlocs] = nan

    return out


def vectorized_pearson_r(dependents, independents, allowed_missing, out=None):
    """Compute Pearson's r between columns of ``dependents`` and ``independents``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be regressed against ``independent``.
    independents : np.array[N, M] or np.array[N, 1]
        Independent variable(s) of the regression. If a single column is
        passed, it is broadcast to the shape of ``dependents``.
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many non-nan observations in either ``dependents`` or
        ``independents`` will output NaN as the correlation coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results.  If None, a new array is
        created and returned.

    Returns
    -------
    correlations : np.array[M]
        Pearson correlation coefficients for each column of ``dependents``.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearson`
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    """
    nan = np.nan
    isnan = np.isnan
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    if allowed_missing > 0:
        # If we're handling nans robustly, we need to mask both arrays to
        # locations where either was nan.
        either_nan = isnan(dependents) | isnan(independents)
        independents = np.where(either_nan, nan, independents)
        dependents = np.where(either_nan, nan, dependents)
        mean = nanmean
    else:
        # Otherwise, we can just use mean, which will give us a nan for any
        # column where there's ever a nan.
        mean = np.mean

    # Pearson R is Cov(X, Y) / StdDev(X) * StdDev(Y)
    # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    ind_residual = independents - mean(independents, axis=0)
    dep_residual = dependents - mean(dependents, axis=0)

    ind_variance = mean(ind_residual**2, axis=0)
    dep_variance = mean(dep_residual**2, axis=0)

    covariances = mean(ind_residual * dep_residual, axis=0)

    evaluate(
        "where(mask, nan, cov / sqrt(ind_variance * dep_variance))",
        local_dict={
            "cov": covariances,
            "mask": isnan(independents).sum(axis=0) > allowed_missing,
            "nan": np.nan,
            "ind_variance": ind_variance,
            "dep_variance": dep_variance,
        },
        global_dict={},
        out=out,
    )
    return out