# -*- coding: utf-8 -*-
import logging
import numpy as np
import utool as ut
(print, rrr, profile) = ut.inject2(__name__)
logger = logging.getLogger('wbia')
def get_toydata(rng):
    if ut.get_argflag('--toy2'):
        X_true, X, y = toydata2(rng)
    else:
        X_true, X, y = toydata1(rng)
    return X_true, X, y
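
# Minimal usage sketch (the seed is illustrative; toydata2 is selected only
# when the --toy2 flag is present on the command line):
#
#     rng = np.random.RandomState(0)
#     X_true, X, y = get_toydata(rng)
#     assert X.shape == X_true.shape and len(X) == len(y)
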
def toydata2(rng):
    # sklearn.datasets.samples_generator was removed from modern scikit-learn;
    # make_classification is importable from sklearn.datasets directly.
    from sklearn.datasets import make_classification

    n_samples = 1000
    n_features = 2
    n_classes = 2
    n_informative = 2
    n_clusters_per_class = int((2 ** n_informative) // n_classes)
    hypercube = False
    samplekw = dict(
        flip_y=0.00,
        class_sep=1.0,
        shift=[-10, 10],
        scale=1.0,
        n_redundant=0,
        n_repeated=0,
        hypercube=hypercube,
        n_samples=n_samples,
        n_informative=n_informative,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        weights=None,
        shuffle=True,
        n_features=n_features,
        random_state=rng,
    )
    X_true, y = make_classification(**samplekw)
    with_extra = ut.get_argflag('--extra')
    # make a very informative nan dimension
    if with_extra:
        n_informative_nan = 100
        # extra_x = (rng.randn(n_informative_nan, 2) / 2 + [[12, -8]])
        extra_x = rng.randn(n_informative_nan, 2) / 2 + [[10, -12]]
        X_true = np.vstack((X_true, extra_x))
        y = np.append(y, [0] * n_informative_nan)
    # Randomly drop datapoints
    X = X_true.copy()
    nanrate = ut.get_argval('--nanrate', default=0.01)
    if nanrate:
        # TODO:
        # * informative nan
        # * random nan
        # * random nan + informative nan
        X.ravel()[rng.rand(X.size) < nanrate] = np.nan
    if with_extra:
        if True:
            X.T[1][-n_informative_nan:] = np.nan
        else:
            X.T[0][-n_informative_nan : -n_informative_nan // 2] = np.nan
            X.T[1][-n_informative_nan // 2 :] = np.nan
    return X_true, X, y
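
# A quick sanity sketch for toydata2 (illustrative seed; --nanrate defaults
# to 0.01, so roughly 1% of entries should be dropped to nan):
#
#     rng = np.random.RandomState(42)
#     X_true, X, y = toydata2(rng)
#     frac_nan = np.isnan(X).mean()   # roughly the requested nanrate
#     print(X.shape, frac_nan)
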
def toydata1(rng):
    """
    **Description of Plot**

    You'll notice that there are 4 plots. This is necessary to visualize a
    grid with nans. Each plot shows points in the 2-dimensional grid with
    corners at (0, 0) and (40, 40). The top left plot has these coordinates
    labeled. The other 3 plots correspond to the top left grid, but in these
    plots at least one of the dimensions has been "nanned". In the top right
    the x-dimension is "nanned". In the bottom left the y-dimension is
    "nanned", and in the bottom right both dimensions are "nanned". Even
    though all plots are drawn as a 2d-surface, only the top left plot is
    truly a surface with 2 degrees of freedom. The top right and bottom left
    plots are really lines with 1 degree of freedom, and the bottom right
    plot is actually just a single point with 0 degrees of freedom.

    In this example I create 10 Gaussian blobs where the first 9 have their
    means laid out in a 3x3 grid and the last one has its mean in the center,
    but I gave it a high standard deviation. I'll refer to the high-std
    cluster as 9, and label the other clusters at the grid means (to agree
    with the demo code) like this:

    ```
    6 7 8
    3 4 5
    0 1 2
    ```

    Looking at the top left plot you can see clusters 0, 1, 2, 4, 6, and 8.
    The reason the other clusters do not appear in this grid is because I've
    set at least one of their dimensions to be nan. Specifically, cluster 3
    had its y dimension set to nan; clusters 5 and 7 had their x dimension
    set to nan; and cluster 9 had both x and y dimensions set to nan.

    For clusters 3, 5, and 7, I plot "nanned" points as lines along the
    nanned dimension to show that only the non-nan dimensions can be used to
    distinguish these points. I also plot the original position before I
    "nanned" it for visualization purposes, but the learning algorithm never
    sees this. For cluster 9, I only plot the original positions because all
    of this data collapses to a single point [nan, nan].

    Red points are of class 0, and blue points are of class 1. Points in
    each plot represent the training data. The colored background of each
    plot represents the classification surface.
    """
    # samples_generator was removed from scikit-learn; import make_blobs
    # directly from sklearn.datasets.
    from sklearn.datasets import make_blobs
    import functools

    step = 20
    n_samples = 100
    blob = functools.partial(make_blobs, n_samples=n_samples, random_state=rng)
    Xy_blobs = [
        (0, blob(centers=[[0 * step, 0 * step]])[0]),
        (1, blob(centers=[[1 * step, 0 * step]])[0]),
        (0, blob(centers=[[2 * step, 0 * step]])[0]),
        (1, blob(centers=[[0 * step, 1 * step]])[0]),
        (0, blob(centers=[[1 * step, 1 * step]])[0]),
        (0, blob(centers=[[2 * step, 1 * step]])[0]),
        (0, blob(centers=[[0 * step, 2 * step]])[0]),
        (1, blob(centers=[[1 * step, 2 * step]])[0]),
        (0, blob(centers=[[2 * step, 2 * step]])[0]),
        (1, blob(centers=[[1 * step, 1 * step]], cluster_std=5)[0]),
    ]
    X_blobs = [Xy[1] for Xy in Xy_blobs]
    X_true = np.vstack(X_blobs)
    # np.int was removed from NumPy; the builtin int is the equivalent dtype
    y_blobs = [np.full(len(X), y_, dtype=int) for y_, X in Xy_blobs]
    # nanify some values
    if True:
        X_blobs[3][:, 1] = np.nan  # cluster 3: y dimension nanned
        X_blobs[7][:, 0] = np.nan  # cluster 7: x dimension nanned
        X_blobs[5][:, 0] = np.nan  # cluster 5: x dimension nanned
        X_blobs[-1][:, :] = np.nan  # cluster 9: both dimensions nanned
    X = np.vstack(X_blobs)
    y = np.hstack(y_blobs)
    return X_true, X, y
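
# A small check of the construction above (assuming the blob sizes stay at
# 100 points each): after the nanning step exactly four blobs contain nans
# (3, 5, 7, and 9), and the last blob collapses to [nan, nan]:
#
#     rng = np.random.RandomState(0)
#     X_true, X, y = toydata1(rng)
#     assert np.isnan(X).any(axis=1).sum() == 400
#     assert np.isnan(X[-1]).all()
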
def show_nan_decision_function_2d(X, y, X_true, clf):
    logger.info('Drawing')

    # Now plot the decision boundary using a fine mesh as input to a
    # filled contour plot
    plot_step = 1.0
    x_min, x_max = X_true[:, 0].min() - 1, X_true[:, 0].max() + 1
    y_min, y_max = X_true[:, 1].min() - 1, X_true[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
    )
    yynan = np.full(yy.shape, fill_value=np.nan)
    xxnan = np.full(yy.shape, fill_value=np.nan)

    # Get prediction surface in the non-nan-zone
    Z_nonnan = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()]).T[1].reshape(xx.shape)
    # Get prediction surface in the xnan-zone
    Z_xnan = clf.predict_proba(np.c_[xxnan.ravel(), yy.ravel()]).T[1].reshape(xx.shape)
    # Get prediction surface in the ynan-zone
    Z_ynan = clf.predict_proba(np.c_[xx.ravel(), yynan.ravel()]).T[1].reshape(xx.shape)
    # Get prediction surface for all-nan-zone
    Z_fullnan = (
        clf.predict_proba(np.c_[xxnan.ravel(), yynan.ravel()]).T[1].reshape(xx.shape)
    )
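    # Note: each surface queries the same mesh, but with one or both feature
    # columns replaced wholesale by nan; a classifier whose splits handle nan
    # can still emit probabilities for those rows. The masks below partition
    # the training points by their own nan pattern so each point is drawn in
    # the matching panel.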
    is_nonnan = np.logical_and(~np.isnan(X.T[0]), ~np.isnan(X.T[1]))
    is_xnan = np.logical_and(np.isnan(X.T[0]), ~np.isnan(X.T[1]))
    is_ynan = np.logical_and(~np.isnan(X.T[0]), np.isnan(X.T[1]))
    is_fullnan = np.logical_and(np.isnan(X.T[0]), np.isnan(X.T[1]))
    # Draw surfaces and support points in different axes
    import matplotlib.gridspec as gridspec
    import matplotlib.pyplot as plt

    gs = gridspec.GridSpec(17, 17)
    pnum1 = (gs[0:8, 0:8],)
    pnum2 = (gs[0:8, 8:16],)
    pnum3 = (gs[9:17, 0:8],)
    pnum4 = (gs[9:17, 8:16],)
    fig = plt.figure()
    cmap = plt.cm.RdYlBu
    norm = plt.Normalize(vmin=0, vmax=1)
    # Use the same norm for the colorbar mappable as for the contour plots
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array(np.linspace(0, 1))
    color0 = cmap(0)
    logger.info('color0 = %r' % (color0,))
    color1 = cmap(1.0)
    logger.info('color1 = %r' % (color1,))
    def draw_line_segments(pts1, pts2, ax=None, **kwargs):
        import matplotlib as mpl

        if ax is None:
            ax = plt.gca()
        assert len(pts1) == len(pts2), 'unaligned'
        segments = [(xy1, xy2) for xy1, xy2 in zip(pts1, pts2)]
        linewidth = kwargs.pop('lw', kwargs.pop('linewidth', 1.0))
        alpha = kwargs.pop('alpha', 1.0)
        # LineCollection takes linewidths as a keyword, not positionally
        line_group = mpl.collections.LineCollection(
            segments, linewidths=linewidth, alpha=alpha, **kwargs
        )
        ax.add_collection(line_group)
    def draw_single_nan_lines(X_true, y, flags, nan_dim):
        if not np.any(flags):
            return
        nandim_min = np.nanmin(X_true.T[nan_dim])
        nandim_max = np.nanmax(X_true.T[nan_dim])
        num_dim = 1 - nan_dim  # 2d only
        numdim_pts = X[flags].T[num_dim]

        pts1 = np.empty((flags.sum(), 2))
        pts2 = np.empty((flags.sum(), 2))
        pts1[:, nan_dim] = nandim_min
        pts2[:, nan_dim] = nandim_max
        pts1[:, num_dim] = numdim_pts
        pts2[:, num_dim] = numdim_pts
        y_ = y[flags]
        draw_line_segments(
            pts1[y_ == 0], pts2[y_ == 0], color=color0, linestyle='-', alpha=1.0
        )
        draw_line_segments(
            pts1[y_ == 1], pts2[y_ == 1], color=color1, linestyle='-', alpha=1.0
        )
    def draw_train_points(X_true, y, flags):
        plt.plot(
            X_true[flags].T[0][y[flags] == 0],
            X_true[flags].T[1][y[flags] == 0],
            'o',
            color=color0,
            markeredgecolor='w',
        )
        plt.plot(
            X_true[flags].T[0][y[flags] == 1],
            X_true[flags].T[1][y[flags] == 1],
            'o',
            color=color1,
            markeredgecolor='w',
        )
    def _contour(Z):
        plt.contourf(xx, yy, Z, cmap=cmap, norm=norm, alpha=1.0)

    # Top left: both features observed
    fig.add_subplot(*pnum1)
    _contour(Z_nonnan)
    flags = is_nonnan
    draw_train_points(X_true, y, flags)
    plt.title('non-nan decision surface')
    plt.gca().set_aspect('equal')

    # Top right: x dimension nanned
    fig.add_subplot(*pnum2)
    _contour(Z_xnan)
    flags = is_xnan
    draw_train_points(X_true, y, flags)
    draw_single_nan_lines(X_true, y, flags, 0)
    plt.gca().set_xticks([])
    plt.gca().set_xlabel('nan')
    plt.title('x-nan decision surface')
    plt.gca().set_aspect('equal')

    # Bottom left: y dimension nanned
    fig.add_subplot(*pnum3)
    _contour(Z_ynan)
    flags = is_ynan
    draw_train_points(X_true, y, flags)
    # make nan-lines
    draw_single_nan_lines(X_true, y, flags, 1)
    plt.title('y-nan decision surface')
    plt.gca().set_aspect('equal')
    plt.gca().set_yticks([])
    plt.gca().set_ylabel('nan')

    # Bottom right: both dimensions nanned
    fig.add_subplot(*pnum4)
    _contour(Z_fullnan)
    flags = is_fullnan
    draw_train_points(X_true, y, flags)
    plt.title('full-nan decision surface')
    plt.gca().set_aspect('equal')
    plt.gca().set_xticks([])
    plt.gca().set_yticks([])
    plt.gca().set_xlabel('nan')
    plt.gca().set_ylabel('nan')

    plt.gcf().suptitle('RandomForestClassifier With NaN decision criteria')
    # Shared colorbar in a dedicated column of a fresh GridSpec
    gs = gridspec.GridSpec(1, 16)
    subspec = gs[:, -1:]
    cax = plt.subplot(subspec)
    plt.colorbar(sm, cax=cax)
    cax.set_ylabel('probability class 1')

    new_subplotpars = fig.subplotpars.__dict__.copy()
    # Older Matplotlib stored a 'validate' attribute on SubplotParams; pop it
    # defensively so this works whether or not it is present.
    new_subplotpars.pop('validate', None)
    new_subplotpars.update(
        left=0.001, right=0.9, top=0.9, bottom=0.05, hspace=1.0, wspace=1.0
    )
    plt.subplots_adjust(**new_subplotpars)
def main():
    r"""
    SeeAlso:
        python -m sklearn.ensemble.tests.test_forest test_multioutput

    CommandLine:
        python -m wbia toy_classify_nans
        python -m wbia toy_classify_nans --toy1 --save "rf_nan_toy1.jpg" --figsize=10,10
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy2.jpg" --figsize=10,10
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy3.jpg" --figsize=10,10 --extra
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy4.jpg" --figsize=10,10 --extra --nanrate=0
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy5.jpg" --figsize=10,10 --nanrate=0

    Example:
        >>> # DISABLE_DOCTEST
        >>> result = main()
    """
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(42)
    logger.info('Creating test data')
    X_true, X, y = get_toydata(rng)
    assert len(X) == len(y)
    logger.info('Fitting RF on %d points' % (len(X),))
    # Train uncalibrated random forest classifier on train data.
    # Note: missing_values is not a parameter of stock scikit-learn's
    # RandomForestClassifier; this assumes a fork/variant whose trees accept
    # nan inputs at fit and predict time.
    clf = RandomForestClassifier(
        n_estimators=64,
        random_state=42,
        criterion='gini',
        missing_values=np.nan,
        bootstrap=False,
    )
    # import pprint
    # pprint.pprint(clf.__dict__)
    clf.fit(X, y)
    # pprint.pprint(clf.__dict__)
    show_nan_decision_function_2d(X, y, X_true, clf)
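
# With stock scikit-learn (no nan-capable forest fork available), a close
# substitute is HistGradientBoostingClassifier, which natively accepts np.nan
# in its inputs. A minimal sketch; the classifier swap is the only change,
# everything else mirrors main():
#
#     from sklearn.ensemble import HistGradientBoostingClassifier
#     rng = np.random.RandomState(42)
#     X_true, X, y = get_toydata(rng)
#     clf = HistGradientBoostingClassifier(random_state=42)
#     clf.fit(X, y)
#     show_nan_decision_function_2d(X, y, X_true, clf)
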
if __name__ == '__main__':
    r"""
    CommandLine:
        python -m wbia.algo.hots.toy_nan_rf --show
    """
    main()
    import matplotlib.pyplot as plt

    plt.show()