# -*- coding: utf-8 -*-
import logging
import numpy as np
import utool as ut
(print, rrr, profile) = ut.inject2(__name__)
logger = logging.getLogger('wbia')
def get_toydata(rng):
    if ut.get_argflag('--toy2'):
        X_true, X, y = toydata2(rng)
    else:
        X_true, X, y = toydata1(rng)
    return X_true, X, y
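
# Minimal usage sketch (the seed is illustrative; toydata2 is selected only
# when the --toy2 flag is present on the command line):
#
#     rng = np.random.RandomState(0)
#     X_true, X, y = get_toydata(rng)
#     assert X.shape == X_true.shape and len(X) == len(y)
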
def toydata2(rng):
    # sklearn.datasets.samples_generator was removed from modern scikit-learn;
    # make_classification is importable from sklearn.datasets directly.
    from sklearn.datasets import make_classification

    n_samples = 1000
    n_features = 2
    n_classes = 2
    n_informative = 2
    n_clusters_per_class = int((2 ** n_informative) // n_classes)
    hypercube = False
    samplekw = dict(
        flip_y=0.00,
        class_sep=1.0,
        shift=[-10, 10],
        scale=1.0,
        n_redundant=0,
        n_repeated=0,
        hypercube=hypercube,
        n_samples=n_samples,
        n_informative=n_informative,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        weights=None,
        shuffle=True,
        n_features=n_features,
        random_state=rng,
    )
    X_true, y = make_classification(**samplekw)
    with_extra = ut.get_argflag('--extra')
    # make a very informative nan dimension
    if with_extra:
        n_informative_nan = 100
        # extra_x = (rng.randn(n_informative_nan, 2) / 2 + [[12, -8]])
        extra_x = rng.randn(n_informative_nan, 2) / 2 + [[10, -12]]
        X_true = np.vstack((X_true, extra_x))
        y = np.append(y, [0] * n_informative_nan)
    # Randomly drop datapoints
    X = X_true.copy()
    nanrate = ut.get_argval('--nanrate', default=0.01)
    if nanrate:
        # TODO:
        # * informative nan
        # * random nan
        # * random nan + informative nan
        X.ravel()[rng.rand(X.size) < nanrate] = np.nan
    if with_extra:
        if True:
            X.T[1][-n_informative_nan:] = np.nan
        else:
            X.T[0][-n_informative_nan : -n_informative_nan // 2] = np.nan
            X.T[1][-n_informative_nan // 2 :] = np.nan
    return X_true, X, y
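
# A quick sanity sketch for toydata2 (illustrative seed; --nanrate defaults
# to 0.01, so roughly 1% of entries should be dropped to nan):
#
#     rng = np.random.RandomState(42)
#     X_true, X, y = toydata2(rng)
#     frac_nan = np.isnan(X).mean()   # roughly the requested nanrate
#     print(X.shape, frac_nan)
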
def toydata1(rng):
    """
    **Description of Plot**

    You'll notice that there are 4 plots. This is necessary to visualize a
    grid with nans. Each plot shows points in the 2-dimensional grid with
    corners at (0, 0) and (40, 40). The top left plot has these coordinates
    labeled. The other 3 plots correspond to the top left grid, but in these
    plots at least one of the dimensions has been "nanned". In the top right
    the x-dimension is "nanned". In the bottom left the y-dimension is
    "nanned", and in the bottom right both dimensions are "nanned". Even
    though all plots are drawn as a 2d-surface, only the top left plot is
    truly a surface with 2 degrees of freedom. The top right and bottom left
    plots are really lines with 1 degree of freedom, and the bottom right
    plot is actually just a single point with 0 degrees of freedom.

    In this example I create 10 Gaussian blobs where the first 9 have their
    means laid out in a 3x3 grid and the last one has its mean in the center,
    but I gave it a high standard deviation. I'll refer to the high-std
    cluster as 9, and label the other clusters at the grid means (to agree
    with the demo code) like this:

    ```
    6 7 8
    3 4 5
    0 1 2
    ```

    Looking at the top left plot you can see clusters 0, 1, 2, 4, 6, and 8.
    The reason the other clusters do not appear in this grid is because I've
    set at least one of their dimensions to be nan. Specifically, cluster 3
    had its y dimension set to nan; clusters 5 and 7 had their x dimension
    set to nan; and cluster 9 had both x and y dimensions set to nan.

    For clusters 3, 5, and 7, I plot "nanned" points as lines along the
    nanned dimension to show that only the non-nan dimensions can be used to
    distinguish these points. I also plot the original position before I
    "nanned" it for visualization purposes, but the learning algorithm never
    sees this. For cluster 9, I only plot the original positions because all
    of this data collapses to a single point [nan, nan].

    Red points are of class 0, and blue points are of class 1. Points in
    each plot represent the training data. The colored background of each
    plot represents the classification surface.
    """
    # samples_generator was removed from scikit-learn; import make_blobs
    # directly from sklearn.datasets.
    from sklearn.datasets import make_blobs
    import functools

    step = 20
    n_samples = 100
    blob = functools.partial(make_blobs, n_samples=n_samples, random_state=rng)
    Xy_blobs = [
        (0, blob(centers=[[0 * step, 0 * step]])[0]),
        (1, blob(centers=[[1 * step, 0 * step]])[0]),
        (0, blob(centers=[[2 * step, 0 * step]])[0]),
        (1, blob(centers=[[0 * step, 1 * step]])[0]),
        (0, blob(centers=[[1 * step, 1 * step]])[0]),
        (0, blob(centers=[[2 * step, 1 * step]])[0]),
        (0, blob(centers=[[0 * step, 2 * step]])[0]),
        (1, blob(centers=[[1 * step, 2 * step]])[0]),
        (0, blob(centers=[[2 * step, 2 * step]])[0]),
        (1, blob(centers=[[1 * step, 1 * step]], cluster_std=5)[0]),
    ]
    X_blobs = [Xy[1] for Xy in Xy_blobs]
    X_true = np.vstack(X_blobs)
    # np.int was removed from NumPy; the builtin int is the equivalent dtype
    y_blobs = [np.full(len(X), y_, dtype=int) for y_, X in Xy_blobs]
    # nanify some values
    if True:
        X_blobs[3][:, 1] = np.nan  # cluster 3: y dimension nanned
        X_blobs[7][:, 0] = np.nan  # cluster 7: x dimension nanned
        X_blobs[5][:, 0] = np.nan  # cluster 5: x dimension nanned
        X_blobs[-1][:, :] = np.nan  # cluster 9: both dimensions nanned
    X = np.vstack(X_blobs)
    y = np.hstack(y_blobs)
    return X_true, X, y
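
# A small check of the construction above (assuming the blob sizes stay at
# 100 points each): after the nanning step exactly four blobs contain nans
# (3, 5, 7, and 9), and the last blob collapses to [nan, nan]:
#
#     rng = np.random.RandomState(0)
#     X_true, X, y = toydata1(rng)
#     assert np.isnan(X).any(axis=1).sum() == 400
#     assert np.isnan(X[-1]).all()
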
def show_nan_decision_function_2d(X, y, X_true, clf):
    logger.info('Drawing')

    # Now plot the decision boundary using a fine mesh as input to a
    # filled contour plot
    plot_step = 1.0
    x_min, x_max = X_true[:, 0].min() - 1, X_true[:, 0].max() + 1
    y_min, y_max = X_true[:, 1].min() - 1, X_true[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
    )
    yynan = np.full(yy.shape, fill_value=np.nan)
    xxnan = np.full(yy.shape, fill_value=np.nan)

    # Get prediction surface in the non-nan-zone
    Z_nonnan = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()]).T[1].reshape(xx.shape)
    # Get prediction surface in the xnan-zone
    Z_xnan = clf.predict_proba(np.c_[xxnan.ravel(), yy.ravel()]).T[1].reshape(xx.shape)
    # Get prediction surface in the ynan-zone
    Z_ynan = clf.predict_proba(np.c_[xx.ravel(), yynan.ravel()]).T[1].reshape(xx.shape)
    # Get prediction surface for all-nan-zone
    Z_fullnan = (
        clf.predict_proba(np.c_[xxnan.ravel(), yynan.ravel()]).T[1].reshape(xx.shape)
    )
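    # Note: each surface queries the same mesh, but with one or both feature
    # columns replaced wholesale by nan; a classifier whose splits handle nan
    # can still emit probabilities for those rows. The masks below partition
    # the training points by their own nan pattern so each point is drawn in
    # the matching panel.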
    is_nonnan = np.logical_and(~np.isnan(X.T[0]), ~np.isnan(X.T[1]))
    is_xnan = np.logical_and(np.isnan(X.T[0]), ~np.isnan(X.T[1]))
    is_ynan = np.logical_and(~np.isnan(X.T[0]), np.isnan(X.T[1]))
    is_fullnan = np.logical_and(np.isnan(X.T[0]), np.isnan(X.T[1]))
    # Draw surfaces and support points in different axes
    import matplotlib.gridspec as gridspec
    import matplotlib.pyplot as plt

    gs = gridspec.GridSpec(17, 17)
    pnum1 = (gs[0:8, 0:8],)
    pnum2 = (gs[0:8, 8:16],)
    pnum3 = (gs[9:17, 0:8],)
    pnum4 = (gs[9:17, 8:16],)
    fig = plt.figure()
    cmap = plt.cm.RdYlBu
    norm = plt.Normalize(vmin=0, vmax=1)
    # Use the same norm for the colorbar mappable as for the contour plots
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array(np.linspace(0, 1))
    color0 = cmap(0)
    logger.info('color0 = %r' % (color0,))
    color1 = cmap(1.0)
    logger.info('color1 = %r' % (color1,))
    def draw_line_segments(pts1, pts2, ax=None, **kwargs):
        import matplotlib as mpl

        if ax is None:
            ax = plt.gca()
        assert len(pts1) == len(pts2), 'unaligned'
        segments = [(xy1, xy2) for xy1, xy2 in zip(pts1, pts2)]
        linewidth = kwargs.pop('lw', kwargs.pop('linewidth', 1.0))
        alpha = kwargs.pop('alpha', 1.0)
        # LineCollection takes linewidths as a keyword, not positionally
        line_group = mpl.collections.LineCollection(
            segments, linewidths=linewidth, alpha=alpha, **kwargs
        )
        ax.add_collection(line_group)
    def draw_single_nan_lines(X_true, y, flags, nan_dim):
        if not np.any(flags):
            return
        nandim_min = np.nanmin(X_true.T[nan_dim])
        nandim_max = np.nanmax(X_true.T[nan_dim])
        num_dim = 1 - nan_dim  # 2d only
        numdim_pts = X[flags].T[num_dim]

        pts1 = np.empty((flags.sum(), 2))
        pts2 = np.empty((flags.sum(), 2))
        pts1[:, nan_dim] = nandim_min
        pts2[:, nan_dim] = nandim_max
        pts1[:, num_dim] = numdim_pts
        pts2[:, num_dim] = numdim_pts
        y_ = y[flags]
        draw_line_segments(
            pts1[y_ == 0], pts2[y_ == 0], color=color0, linestyle='-', alpha=1.0
        )
        draw_line_segments(
            pts1[y_ == 1], pts2[y_ == 1], color=color1, linestyle='-', alpha=1.0
        )
    def draw_train_points(X_true, y, flags):
        plt.plot(
            X_true[flags].T[0][y[flags] == 0],
            X_true[flags].T[1][y[flags] == 0],
            'o',
            color=color0,
            markeredgecolor='w',
        )
        plt.plot(
            X_true[flags].T[0][y[flags] == 1],
            X_true[flags].T[1][y[flags] == 1],
            'o',
            color=color1,
            markeredgecolor='w',
        )
    def _contour(Z):
        plt.contourf(xx, yy, Z, cmap=cmap, norm=norm, alpha=1.0)

    # Top left: both features observed
    fig.add_subplot(*pnum1)
    _contour(Z_nonnan)
    flags = is_nonnan
    draw_train_points(X_true, y, flags)
    plt.title('non-nan decision surface')
    plt.gca().set_aspect('equal')

    # Top right: x dimension nanned
    fig.add_subplot(*pnum2)
    _contour(Z_xnan)
    flags = is_xnan
    draw_train_points(X_true, y, flags)
    draw_single_nan_lines(X_true, y, flags, 0)
    plt.gca().set_xticks([])
    plt.gca().set_xlabel('nan')
    plt.title('x-nan decision surface')
    plt.gca().set_aspect('equal')

    # Bottom left: y dimension nanned
    fig.add_subplot(*pnum3)
    _contour(Z_ynan)
    flags = is_ynan
    draw_train_points(X_true, y, flags)
    # make nan-lines
    draw_single_nan_lines(X_true, y, flags, 1)
    plt.title('y-nan decision surface')
    plt.gca().set_aspect('equal')
    plt.gca().set_yticks([])
    plt.gca().set_ylabel('nan')

    # Bottom right: both dimensions nanned
    fig.add_subplot(*pnum4)
    _contour(Z_fullnan)
    flags = is_fullnan
    draw_train_points(X_true, y, flags)
    plt.title('full-nan decision surface')
    plt.gca().set_aspect('equal')
    plt.gca().set_xticks([])
    plt.gca().set_yticks([])
    plt.gca().set_xlabel('nan')
    plt.gca().set_ylabel('nan')

    plt.gcf().suptitle('RandomForestClassifier With NaN decision criteria')
    # Shared colorbar in a dedicated column of a fresh GridSpec
    gs = gridspec.GridSpec(1, 16)
    subspec = gs[:, -1:]
    cax = plt.subplot(subspec)
    plt.colorbar(sm, cax=cax)
    cax.set_ylabel('probability class 1')

    new_subplotpars = fig.subplotpars.__dict__.copy()
    # Older Matplotlib stored a 'validate' attribute on SubplotParams; pop it
    # defensively so this works whether or not it is present.
    new_subplotpars.pop('validate', None)
    new_subplotpars.update(
        left=0.001, right=0.9, top=0.9, bottom=0.05, hspace=1.0, wspace=1.0
    )
    plt.subplots_adjust(**new_subplotpars)
def main():
    r"""
    SeeAlso:
        python -m sklearn.ensemble.tests.test_forest test_multioutput

    CommandLine:
        python -m wbia toy_classify_nans
        python -m wbia toy_classify_nans --toy1 --save "rf_nan_toy1.jpg" --figsize=10,10
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy2.jpg" --figsize=10,10
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy3.jpg" --figsize=10,10 --extra
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy4.jpg" --figsize=10,10 --extra --nanrate=0
        python -m wbia toy_classify_nans --toy2 --save "rf_nan_toy5.jpg" --figsize=10,10 --nanrate=0

    Example:
        >>> # DISABLE_DOCTEST
        >>> result = main()
    """
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(42)
    logger.info('Creating test data')
    X_true, X, y = get_toydata(rng)
    assert len(X) == len(y)
    logger.info('Fitting RF on %d points' % (len(X),))
    # Train uncalibrated random forest classifier on train data.
    # Note: missing_values is not a parameter of stock scikit-learn's
    # RandomForestClassifier; this assumes a fork/variant whose trees accept
    # nan inputs at fit and predict time.
    clf = RandomForestClassifier(
        n_estimators=64,
        random_state=42,
        criterion='gini',
        missing_values=np.nan,
        bootstrap=False,
    )
    # import pprint
    # pprint.pprint(clf.__dict__)
    clf.fit(X, y)
    # pprint.pprint(clf.__dict__)
    show_nan_decision_function_2d(X, y, X_true, clf)
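
# With stock scikit-learn (no nan-capable forest fork available), a close
# substitute is HistGradientBoostingClassifier, which natively accepts np.nan
# in its inputs. A minimal sketch; the classifier swap is the only change,
# everything else mirrors main():
#
#     from sklearn.ensemble import HistGradientBoostingClassifier
#     rng = np.random.RandomState(42)
#     X_true, X, y = get_toydata(rng)
#     clf = HistGradientBoostingClassifier(random_state=42)
#     clf.fit(X, y)
#     show_nan_decision_function_2d(X, y, X_true, clf)
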
if __name__ == '__main__':
    r"""
    CommandLine:
        python -m wbia.algo.hots.toy_nan_rf --show
    """
    main()
    import matplotlib.pyplot as plt

    plt.show()