# -*- coding: utf-8 -*-
import logging
import utool as ut
import numpy as np
import vtool as vt
from scipy.spatial import distance
import scipy.cluster.hierarchy
import sklearn.cluster
(print, rrr, profile) = ut.inject2(__name__, '[preproc_occurrence]')
logger = logging.getLogger('wbia')


def wbia_compute_occurrences(ibs, gid_list, config=None, verbose=None):
"""
    Clusters images into occurrences (by time, not yet space).
An occurrence is a meeting, localized in time and space between a camera
and a group of animals.
Animals are identified within each occurrence.
Does not modify database state, just returns cluster ids
Args:
ibs (IBEISController): wbia controller object
gid_list (list):
Returns:
        tuple: (flat_imgsetids, flat_gids)
CommandLine:
python -m wbia --tf wbia_compute_occurrences:0 --show
TODO: FIXME: good example of autogen doctest return failure
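
    Example:
        >>> # DISABLE_DOCTEST
        >>> # hedged usage sketch; assumes the bundled testdb1 database
        >>> from wbia.algo.preproc.preproc_occurrence import *  # NOQA
        >>> import wbia
        >>> ibs = wbia.opendb(defaultdb='testdb1')
        >>> gid_list = ibs.images().gids
        >>> flat_imgsetids, flat_gids = wbia_compute_occurrences(ibs, gid_list)
        >>> assert len(flat_imgsetids) == len(flat_gids)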
"""
if config is None:
config = {'use_gps': False, 'seconds_thresh': 600}
# from wbia.algo import Config
# config = Config.OccurrenceConfig().asdict()
occur_labels, occur_gids = compute_occurrence_groups(
ibs, gid_list, config, verbose=verbose
)
    if True:  # always use the per-gid label mapping; else-branch kept for reference
gid2_label = {
gid: label for label, gids in zip(occur_labels, occur_gids) for gid in gids
}
# Assert that each gid only belongs to one occurrence
flat_imgsetids = ut.dict_take(gid2_label, gid_list)
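        # e.g. gid2_label {1: 0, 2: 0, 3: 1} with gid_list [1, 2, 3]
        # yields flat_imgsetids [0, 0, 1]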
flat_gids = gid_list
else:
        # Flatten gids list by encounter
flat_imgsetids, flat_gids = ut.flatten_membership_mapping(
occur_labels, occur_gids
)
return flat_imgsetids, flat_gids


def compute_occurrence_groups(ibs, gid_list, config=None, use_gps=False, verbose=None):
r"""
Args:
ibs (IBEISController): wbia controller object
gid_list (list):
Returns:
        tuple: (occur_labels, occur_gids)
CommandLine:
python -m wbia compute_occurrence_groups
Example:
>>> # DISABLE_DOCTEST
>>> from wbia.algo.preproc.preproc_occurrence import * # NOQA
>>> import wbia
>>> ibs = wbia.opendb(defaultdb='testdb1')
>>> verbose = True
>>> images = ibs.images()
>>> gid_list = images.gids
>>> config = {} # wbia.algo.Config.OccurrenceConfig().asdict()
        >>> tup = wbia_compute_occurrences(ibs, gid_list, config)
        >>> (flat_imgsetids, flat_gids) = tup
        >>> # regroup annotations by the occurrence assigned to their image
        >>> aids_list_ = ibs.get_image_aids(flat_gids)
        >>> imgsetids_ = ut.flatten([[sid] * len(aids) for sid, aids in zip(flat_imgsetids, aids_list_)])
        >>> aid_list_ = ut.flatten(aids_list_)
        >>> aids_list = list(ut.group_items(aid_list_, imgsetids_).values())
>>> metric = list(map(len, aids_list))
>>> sortx = ut.list_argsort(metric)[::-1]
>>> index = sortx[1]
>>> aids = aids_list[index]
>>> gids = list(set(ibs.get_annot_gids(aids)))
"""
    if verbose is None:
        verbose = ut.NOT_QUIET
    if config is None:
        config = {}
# Config info
gid_list = np.unique(gid_list)
if verbose:
logger.info('[occur] Computing occurrences on %r images.' % (len(gid_list)))
logger.info('[occur] config = ' + ut.repr3(config))
    use_gps = config.get('use_gps', use_gps)
datas = prepare_X_data(ibs, gid_list, use_gps=use_gps)
from wbia.algo.preproc import occurrence_blackbox
cluster_algo = config.get('cluster_algo', 'agglomerative')
km_per_sec = config.get('km_per_sec', occurrence_blackbox.KM_PER_SEC)
thresh_sec = config.get('seconds_thresh', 30 * 60.0)
min_imgs_per_occurence = config.get('min_imgs_per_occurence', 1)
    # At the default km_per_sec = 0.002 (a slow walking speed):
    #   30 minutes = 1800 s * 0.002 km/s = 3.6 kilometers
    #    5 minutes =  300 s * 0.002 km/s = 0.6 kilometers
assert cluster_algo == 'agglomerative', 'only agglomerative is supported'
    # Cluster groups with different available data separately
all_gids = []
all_labels = []
    for gids, latlons, posixtimes in datas.values():
labels = occurrence_blackbox.cluster_timespace_sec(
latlons, posixtimes, thresh_sec, km_per_sec=km_per_sec
)
if labels is None:
            labels = np.zeros(len(gids), dtype=int)
all_gids.append(gids)
all_labels.append(labels)
# Combine labels across different groups
pads = [vt.safe_max(ys, fill=0) + 1 for ys in all_labels]
offsets = np.array([0] + pads[:-1]).cumsum()
all_labels_ = [ys + offset for ys, offset in zip(all_labels, offsets)]
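    # e.g. all_labels [[0, 1], [0], [0, 1, 2]] gives pads [2, 1, 3] and
    # offsets [0, 2, 3], so the shifted labels [[0, 1], [2], [3, 4, 5]]
    # never collide across data groups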
label_arr = np.array(ut.flatten(all_labels_))
gid_arr = np.array(ut.flatten(all_gids))
# Group images by unique label
labels, label_gids = group_images_by_label(label_arr, gid_arr)
    # Remove occurrences with fewer images than the threshold
    occur_unixtimes = compute_occurrence_unixtime(ibs, label_gids)
    occur_labels, occur_gids = filter_and_relabel(
        labels, label_gids, min_imgs_per_occurence, occur_unixtimes
    )
if verbose:
logger.info('[occur] Found %d clusters.' % len(occur_labels))
    if verbose and len(occur_gids) > 0:
logger.info('[occur] Cluster image size stats:')
ut.print_dict(
ut.get_stats(list(map(len, occur_gids)), use_median=True, use_sum=True),
'occur image stats',
)
return occur_labels, occur_gids


def compute_occurrence_unixtime(ibs, occur_gids):
    """Computes the mean unixtime of each group of image ids"""
    # assert isinstance(ibs, IBEISController)
    # TODO: account for unknown times (recorded as -1)
    from wbia.other import ibsfuncs
unixtimes = ibsfuncs.unflat_map(ibs.get_image_unixtime, occur_gids)
time_arrs = list(map(np.array, unixtimes))
occur_unixtimes = list(map(np.mean, time_arrs))
return occur_unixtimes


def _compute_occurrence_datetime(ibs, occur_gids):
# assert isinstance(ibs, IBEISController)
# from wbia.other import ibsfuncs
occur_unixtimes = compute_occurrence_unixtime(ibs, occur_gids)
occur_datetimes = list(map(ut.unixtime_to_datetimestr, occur_unixtimes))
return occur_datetimes


def prepare_X_data(ibs, gid_list, use_gps=True):
"""
Splits data into groups with/without gps and time
Example:
>>> # ENABLE_DOCTEST
>>> from wbia.algo.preproc.preproc_occurrence import * # NOQA
>>> import wbia
>>> ibs = wbia.opendb(defaultdb='testdb1')
>>> images = ibs.images()
>>> # wbia.control.accessor_decors.DEBUG_GETTERS = True
>>> use_gps = True
>>> gid_list = images.gids
>>> datas = prepare_X_data(ibs, gid_list, use_gps)
>>> print(ut.repr2(datas, nl=2, precision=2))
>>> assert len(datas['both'][0]) == 12
>>> assert len(datas['neither'][0]) == 0
"""
images = ibs.images(gid_list, caching=True)
gps_list_ = images.gps2
unixtime_list_ = images.unixtime2
gps_list_ = vt.ensure_shape(gps_list_, (None, 2))
has_gps = np.all(np.logical_not(np.isnan(gps_list_)), axis=1)
has_time = np.logical_not(np.isnan(unixtime_list_))
if not use_gps:
has_gps[:] = False
has_both = np.logical_and(has_time, has_gps)
has_either = np.logical_or(has_time, has_gps)
has_gps_only = np.logical_and(has_gps, np.logical_not(has_both))
has_time_only = np.logical_and(has_time, np.logical_not(has_both))
has_neither = np.logical_not(has_either)
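    # e.g. an image with a gps fix but no timestamp satisfies has_gps_only
    # and lands in the 'gps_only' group below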
both = images.compress(has_both)
xgps = images.compress(has_gps_only)
xtime = images.compress(has_time_only)
neither = images.compress(has_neither)
    # Group images with different attributes separately
    # NOTE: each value is a (gids, gps_latlons, posixtimes) tuple, matching
    # the unpacking order used in compute_occurrence_groups
    datas = {
        'both': (both.gids, both.gps2, both.unixtime2),
        'gps_only': (xgps.gids, xgps.gps2, None),
        'time_only': (xtime.gids, None, xtime.unixtime2),
        'neither': (neither.gids, None, None),
    }
return datas


def agglomerative_cluster_occurrences(X_data, thresh_sec):
"""
Agglomerative occurrence clustering algorithm
Args:
X_data (ndarray): Length N array of data to cluster
thresh_sec (float):
Returns:
ndarray: (label_arr) - Length N array of cluster indexes
CommandLine:
python -m wbia.algo.preproc.preproc_occurrence --exec-agglomerative_cluster_occurrences
References:
https://docs.scipy.org/doc/scipy-0.9.0/reference/generated/scipy.cluster.hierarchy.fclusterdata.html#scipy.cluster.hierarchy.fclusterdata
http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html
Example:
>>> # DISABLE_DOCTEST
>>> from wbia.algo.preproc.preproc_occurrence import * # NOQA
        >>> # synthetic 1-D unixtimes (illustration only): the first three
        >>> # images fall within thresh_sec, the last is far away in time
        >>> X_data = np.array([[0.0], [60.0], [120.0], [100000.0]])
        >>> label_arr = agglomerative_cluster_occurrences(X_data, thresh_sec=600.0)
        >>> result = 'label_arr = %s' % (str(label_arr),)
        >>> print(result)
"""
label_arr = scipy.cluster.hierarchy.fclusterdata(
X_data, thresh_sec, criterion='distance'
)
return label_arr


def meanshift_cluster_occurrences(X_data, quantile):
"""Meanshift occurrence clustering algorithm
Args:
X_data (ndarray): Length N array of data to cluster
quantile (float): quantile should be between [0, 1].
eg: quantile=.5 represents the median of all pairwise distances
Returns:
ndarray : Length N array of labels
CommandLine:
python -m wbia.algo.preproc.preproc_occurrence --exec-meanshift_cluster_occurrences
Example:
>>> # DISABLE_DOCTEST
>>> from wbia.algo.preproc.preproc_occurrence import * # NOQA
        >>> # synthetic 1-D data (illustration only): two well-separated blobs
        >>> X_data = np.array([[0.0], [1.0], [2.0], [100.0], [101.0]])
        >>> quantile = 0.5
        >>> result = meanshift_cluster_occurrences(X_data, quantile)
        >>> print(result)
"""
try:
bandwidth = sklearn.cluster.estimate_bandwidth(
X_data, quantile=quantile, n_samples=500
)
assert bandwidth != 0, '[occur] bandwidth is 0. Cannot cluster'
# bandwidth is with respect to the RBF used in clustering
# ms = sklearn.cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=True)
ms = sklearn.cluster.MeanShift(
bandwidth=bandwidth, bin_seeding=True, cluster_all=False
)
ms.fit(X_data)
label_arr = ms.labels_
unique_labels = np.unique(label_arr)
max_label = max(0, unique_labels.max())
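        # Promote each orphan (labeled -1 by cluster_all=False) to its own
        # singleton cluster, e.g. labels [0, 1, -1, -1] become [0, 1, 2, 3]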
num_orphans = (label_arr == -1).sum()
label_arr[label_arr == -1] = np.arange(max_label + 1, max_label + 1 + num_orphans)
except Exception as ex:
ut.printex(
ex,
'error computing meanshift',
key_list=['X_data', 'quantile'],
iswarning=True,
)
        # Fall back to placing everything in a single occurrence
        label_arr = np.zeros(len(X_data))
return label_arr


def group_images_by_label(label_arr, gid_arr):
"""
    Input: Length N list of labels and ids
    Output: Length M list of unique labels, and length M list of lists of ids
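
    Example:
        >>> # DISABLE_DOCTEST
        >>> # hedged sketch with made-up ids: groups come back largest-first
        >>> from wbia.algo.preproc.preproc_occurrence import *  # NOQA
        >>> label_arr = np.array([7, 7, 3, 7, 3])
        >>> gid_arr = np.array([1, 2, 4, 8, 16])
        >>> labels, label_gids = group_images_by_label(label_arr, gid_arr)
        >>> # labels -> [7, 3]; label_gids -> [[1, 2, 8], [4, 16]]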
"""
    # Reverse the image to cluster index mapping
labels_, groupxs_ = vt.group_indices(label_arr)
sortx = np.array(list(map(len, groupxs_))).argsort()[::-1]
labels = labels_.take(sortx, axis=0)
groupxs = ut.take(groupxs_, sortx)
label_gids = vt.apply_grouping(gid_arr, groupxs)
return labels, label_gids


def filter_and_relabel(labels, label_gids, min_imgs_per_occurence, occur_unixtimes=None):
"""
Removes clusters with too few members.
Relabels clusters-labels such that label 0 has the most members
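
    Example:
        >>> # DISABLE_DOCTEST
        >>> # hedged sketch with made-up gids: the singleton cluster is dropped
        >>> labels = [0, 1, 2]
        >>> label_gids = [[1, 2, 3], [4], [5, 6]]
        >>> occur_ids, occur_gids = filter_and_relabel(labels, label_gids, 2)
        >>> # occur_gids -> [[1, 2, 3], [5, 6]]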
"""
    label_nGids = np.array(list(map(len, label_gids)))
    label_isvalid = label_nGids >= min_imgs_per_occurence
    occur_gids = ut.compress(label_gids, label_isvalid)
    if occur_unixtimes is not None:
        occur_unixtimes = ut.compress(occur_unixtimes, label_isvalid)
        # Reorder surviving occurrences so the oldest has the lowest label
        unixtime_arr = np.array(occur_unixtimes)
        occur_gids = ut.take(occur_gids, unixtime_arr.argsort())
    # Rebase ids so they are contiguous starting at 0
    occur_ids = list(range(len(occur_gids)))
    return occur_ids, occur_gids


def timespace_distance(pt1, pt2):
    """Combines time and gps displacement into a single distance in km"""
(sec1, lat1, lon1) = pt1
(sec2, lat2, lon2) = pt2
km_dist = vt.haversine((lat1, lon1), (lat2, lon2))
km_per_sec = 0.002 # conversion ratio for reasonable animal walking speed
# sec_dist = (((sec1 - sec2) * km_per_sec) ** 2)
sec_dist = np.abs(sec1 - sec2) * km_per_sec
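    # e.g. sightings 1800 s and 1 km apart combine to a time-space distance
    # of 1 km + 1800 s * 0.002 km/s = 4.6 km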
timespace_dist = km_dist + sec_dist
return timespace_dist


def timespace_pdist(X_data):
    if X_data.shape[1] == 3:
        return distance.pdist(X_data, timespace_distance)
    if X_data.shape[1] == 1:
        return distance.pdist(X_data, 'euclidean')


def cluster_timespace(X_data, thresh):
"""
References:
http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/
scipy.cluster.hierarchy.linkage.html
CommandLine:
python -m wbia.algo.preproc.preproc_occurrence cluster_timespace --show
    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.preproc.preproc_occurrence import *  # NOQA
        >>> import wbia.plottool as pt
        >>> import scipy.cluster.hierarchy as hier
        >>> X_name, X_data = testdata_gps()
        >>> thresh = 10
        >>> X_labels = cluster_timespace(X_data, thresh)
        >>> # recompute the linkage matrix to draw the dendrogram
        >>> condensed_dist_mat = distance.pdist(X_data, timespace_distance)
        >>> linkage_mat = scipy.cluster.hierarchy.linkage(condensed_dist_mat, 'centroid')
        >>> fnum = pt.ensure_fnum(None)
        >>> fig = pt.figure(fnum=fnum, doclf=True, docla=True)
        >>> hier.dendrogram(linkage_mat, orientation='top')
        >>> ut.show_if_requested()
"""
    condensed_dist_mat = distance.pdist(X_data, timespace_distance)
    # Compute hierarchical linkages
    linkage_mat = scipy.cluster.hierarchy.linkage(condensed_dist_mat, method='centroid')
# Cluster linkages
X_labels = scipy.cluster.hierarchy.fcluster(
linkage_mat, thresh, criterion='inconsistent', depth=2, R=None, monocrit=None
)
return X_labels


def testdata_gps():
r"""
    Simple data to test the GPS clustering algorithm.
    Returns:
        X_name (ndarray): length N array of groundtruth group labels
        X_data (ndarray): Nx3 matrix whose columns are (time, lat, lon)
"""
lon = np.array(
[
4.54,
104.0,
-14.9,
56.26,
103.46,
103.37,
54.22,
23.3,
25.53,
23.31,
118.0,
103.53,
54.40,
103.48,
6.14,
7.25,
2.38,
18.18,
103.54,
103.40,
28.59,
25.21,
29.35,
25.20,
]
)
lat = np.array(
[
52.22,
1.14,
27.34,
25.16,
1.16,
1.11,
24.30,
37.54,
37.26,
38.1,
24.25,
1.13,
24.49,
1.13,
42.33,
43.44,
39.34,
70.30,
1.16,
1.10,
40.58,
37.34,
41.18,
38.35,
]
)
    time = np.zeros(len(lon))
    # NOTE: this first synthetic dataset is unused; it is immediately
    # replaced by the hand-picked points below
    X_data = np.vstack((time, lat, lon)).T  # NOQA
    X_name = np.array([0, 1, 2, 2, 2, 2, 3, 3, 3])
X_data = np.array(
[
(0, 42.727985, -73.683994), # MRC
(0, 42.657872, -73.764148), # Home
(0, 42.657414, -73.774448), # Park1
(0, 42.658333, -73.770993), # Park2
(0, 42.654384, -73.768919), # Park3
(0, 42.655039, -73.769048), # Park4
(0, 42.876974, -73.819311), # CP1
(0, 42.862946, -73.804977), # CP2
(0, 42.849809, -73.758486), # CP3
]
)
return X_name, X_data


def plot_gps_html(gps_list):
    """Plots gps coordinates on an interactive Google Maps HTML page
InstallBasemap:
sudo apt-get install libgeos-dev
pip install git+https://github.com/matplotlib/basemap
http://matplotlib.org/basemap/users/examples.html
pip install gmplot
sudo apt-get install netcdf-bin
sudo apt-get install libnetcdf-dev
pip install netCDF4
Ignore:
pip install git+git://github.com/myuser/foo.git@v123
Example:
>>> # DISABLE_DOCTEST
>>> from wbia.algo.preproc.preproc_occurrence import * # NOQA
>>> import wbia
>>> ibs = wbia.opendb(defaultdb='testdb1')
>>> images = ibs.images()
>>> # Setup GPS points to draw
>>> print('Setup GPS points')
>>> gps_list_ = np.array(images.gps2)
>>> unixtime_list_ = np.array(images.unixtime2)
>>> has_gps = np.all(np.logical_not(np.isnan(gps_list_)), axis=1)
>>> has_unixtime = np.logical_not(np.isnan(unixtime_list_))
>>> isvalid = np.logical_and(has_gps, has_unixtime)
>>> gps_list = gps_list_.compress(isvalid, axis=0)
>>> unixtime_list = unixtime_list_.compress(isvalid) # NOQA
        >>> plot_gps_html(gps_list)
"""
import wbia.plottool as pt
import gmplot
import matplotlib as mpl
import vtool as vt
pt.qt4ensure()
lat = gps_list.T[0]
lon = gps_list.T[1]
    # Get extent of the GPS points
bbox = vt.bbox_from_verts(gps_list)
centerx, centery = vt.bbox_center(bbox)
gmap = gmplot.GoogleMapPlotter(centerx, centery, 13)
color = mpl.colors.rgb2hex(pt.ORANGE)
gmap.scatter(lat, lon, color=color, size=100, marker=False)
gmap.draw('mymap.html')
ut.startfile('mymap.html')
# # Scale
# bbox = vt.scale_bbox(bbox, 10.0)
# extent = vt.extent_from_bbox(bbox)
# basemap_extent = dict(llcrnrlon=extent[2], urcrnrlon=extent[3],
# llcrnrlat=extent[0], urcrnrlat=extent[1])
# # Whole globe
# #basemap_extent = dict(llcrnrlon=0, llcrnrlat=-80,
# # urcrnrlon=360, urcrnrlat=80)
# from mpl_toolkits.basemap import Basemap
# from matplotlib.colors import LightSource # NOQA
# from mpl_toolkits.basemap import shiftgrid, cm # NOQA
# from netCDF4 import Dataset
# # Read information to make background pretty
# logger.info('Grab topo information')
# etopodata = Dataset('http://ferret.pmel.noaa.gov/thredds/dodsC/data/PMEL/etopo5.nc')
# logger.info('Read topo information')
# topoin = etopodata.variables['ROSE'][:]
# lons = etopodata.variables['ETOPO05_X'][:]
# lats = etopodata.variables['ETOPO05_Y'][:]
# # shift data so lons go from -180 to 180 instead of 20 to 380.
# logger.info('Shift data')
# topoin, lons = shiftgrid(180., topoin, lons, start=False)
# logger.info('Make figure')
# fnum = pt.ensure_fnum(None)
# fig = pt.figure(fnum=fnum, doclf=True, docla=True) # NOQA
# logger.info('Draw projection')
# m = Basemap(projection='mill', **basemap_extent)
# # setup Lambert Conformal basemap.
# #m = Basemap(projection='cea',resolution='h', **basemap_extent)
# # transform to nx x ny regularly spaced 5km native projection grid
# logger.info('projection grid')
# nx = int((m.xmax - m.xmin) / 5000.) + 1
# ny = int((m.ymax - m.ymin) / 5000.) + 1
# topodat = m.transform_scalar(topoin, lons, lats, nx, ny)
# # plot image over map with imshow.
# im = m.imshow(topodat, cm.GMT_haxby) # NOQA
# # draw coastlines and political boundaries.
# m.drawcoastlines()
# m.drawcountries()
# m.drawstates()
# transform to nx x ny regularly spaced 5km native projection grid
# ls = LightSource(azdeg=90, altdeg=20)
# rgb = ls.shade(topodat, cm.GMT_haxby)
# im = m.imshow(rgb)
# draw coastlines and political boundaries.
# m.drawcoastlines()
# m.drawcountries()
# m.drawstates()
# draw a boundary around the map, fill the background.
# this background will end up being the ocean color, since
# the continents will be drawn on top.
# m.bluemarble()
# m.drawmapboundary(fill_color='aqua')
# m.fillcontinents(color='coral', lake_color='aqua')
# Convert GPS to projected coordinates
# x1, y1 = m(lon, lat) # convert to meters # lon==X, lat==Y
# m.plot(x1, y1, '*', markersize=10)
# fig.zoom_fac = pt.zoom_factory()
# fig.pan_fac = pt.pan_factory()
# fig.show()