Source code for wbia.algo.preproc.occurrence_blackbox

# -*- coding: utf-8 -*-
"""
animal_walking_speeds

ZEBRA_SPEED_MAX  = 64  # km/h
ZEBRA_SPEED_RUN  = 50  # km/h
ZEBRA_SPEED_SLOW_RUN  = 20  # km/h
ZEBRA_SPEED_FAST_WALK = 10  # km/h
ZEBRA_SPEED_WALK = 7  # km/h


km_per_sec = .02
km_per_sec = .001
mph = km_per_sec / ut.KM_PER_MILE * 60 * 60
print('mph = %r' % (mph,))

1 / km_per_sec

import datetime
thresh_sec = datetime.timedelta(minutes=5).seconds
thresh_km = thresh_sec * km_per_sec
print('thresh_sec = %r' % (thresh_sec,))
print('thresh_km = %r' % (thresh_km,))
thresh_sec = thresh_km / km_per_sec
print('thresh_sec = %r' % (thresh_sec,))
"""
import logging
import functools
import numpy as np
import utool as ut
import scipy.cluster.hierarchy
from scipy.spatial import distance

(print, rrr, profile) = ut.inject2(__name__)
logger = logging.getLogger('wbia')


KM_PER_SEC = 0.002
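# Unit sanity check (illustrative note, not from the original source):
# KM_PER_SEC is a walking speed expressed in km per second;
# 0.002 km/s * 3600 s/h = 7.2 km/h, which is roughly the ZEBRA_SPEED_WALK
# figure listed in the module docstring above.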


def haversine(latlon1, latlon2):
    r"""
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    Args:
        latlon1 (tuple): (lat, lon)
        latlon2 (tuple): (lat, lon)

    Returns:
        float : distance in kilometers

    References:
        en.wikipedia.org/wiki/Haversine_formula
        gis.stackexchange.com/questions/81551/matching-gps-tracks
        stackoverflow.com/questions/4913349/haversine-distance-gps-points

    Doctest:
        >>> from wbia.algo.preproc.occurrence_blackbox import *  # NOQA
        >>> import scipy.spatial.distance as spdist
        >>> import functools
        >>> latlon1 = [-80.21895315, -158.81099213]
        >>> latlon2 = [9.77816711, -17.27471498]
        >>> kilometers = haversine(latlon1, latlon2)
        >>> result = ('kilometers = %s' % (kilometers,))
        >>> print(result)
        kilometers = 11930.909364189827
    """
    # convert decimal degrees to radians
    lat1, lon1 = np.radians(latlon1)
    lat2, lon2 = np.radians(latlon2)
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (np.sin(dlat / 2) ** 2) + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon / 2) ** 2)
    c = 2 * np.arcsin(np.sqrt(a))
    # convert to kilometers
    EARTH_RADIUS_KM = 6367
    kilometers = EARTH_RADIUS_KM * c
    return kilometers


def haversine_rad(lat1, lon1, lat2, lon2):
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (np.sin(dlat / 2) ** 2) + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon / 2) ** 2)
    c = 2 * np.arcsin(np.sqrt(a))
    # convert to kilometers
    EARTH_RADIUS_KM = 6367
    kilometers = EARTH_RADIUS_KM * c
    return kilometers


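# Example (a minimal sketch, not part of the original module): unlike
# `haversine`, `haversine_rad` takes values already converted to radians and
# broadcasts over numpy arrays, e.g. consecutive distances along a track:
#   >>> lats = np.radians([42.657414, 42.658333, 42.654384])
#   >>> lons = np.radians([-73.774448, -73.770993, -73.768919])
#   >>> consec_km = haversine_rad(lats[:-1], lons[:-1], lats[1:], lons[1:])
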
def timespace_distance_km(pt1, pt2, km_per_sec=KM_PER_SEC):
    """
    Computes distance between two points in space and time.
    Time is converted into spatial units using km_per_sec

    Args:
        pt1 (tuple) : (seconds, lat, lon)
        pt2 (tuple) : (seconds, lat, lon)
        km_per_sec (float): reasonable animal walking speed

    Returns:
        float: distance in kilometers

    Doctest:
        >>> from wbia.algo.preproc.occurrence_blackbox import *  # NOQA
        >>> import scipy.spatial.distance as spdist
        >>> import functools
        >>> km_per_sec = .02
        >>> latlon1 = [40.779299, -73.9719498]  # Museum of Natural History
        >>> latlon2 = [37.7336402, -122.5050342]  # San Francisco Zoo
        >>> pt1 = [0.0] + latlon1
        >>> pt2 = [0.0] + latlon2
        >>> # google measures about 4138.88 kilometers
        >>> dist_km1 = timespace_distance_km(pt1, pt2)
        >>> print('dist_km1 = {!r}'.format(dist_km1))
        >>> # Now add a time component
        >>> pt1 = [360.0] + latlon1
        >>> pt2 = [0.0] + latlon2
        >>> dist_km2 = timespace_distance_km(pt1, pt2)
        >>> print('dist_km2 = {!r}'.format(dist_km2))
        >>> assert np.isclose(dist_km1, 4136.4568647922624)
        >>> assert np.isclose(dist_km2, 4137.1768647922627)
    """
    sec1, latlon1 = pt1[0], pt1[1:]
    sec2, latlon2 = pt2[0], pt2[1:]
    # Get pure gps distance
    km_dist = haversine(latlon1, latlon2)
    # Get distance in seconds and convert to km
    sec_dist = np.abs(sec1 - sec2) * km_per_sec
    # Add distances
    # (return nan if points are not comparable, otherwise nansum)
    parts = np.array([km_dist, sec_dist])
    timespace_dist = np.nan if np.all(np.isnan(parts)) else np.nansum(parts)
    return timespace_dist


def timespace_distance_sec(pt1, pt2, km_per_sec=KM_PER_SEC):
    # Return in seconds
    sec1, latlon1 = pt1[0], pt1[1:]
    sec2, latlon2 = pt2[0], pt2[1:]
    # Get pure gps distance and convert to seconds
    km_dist = haversine(latlon1, latlon2)
    km_dist = km_dist / km_per_sec
    # Get distance in seconds
    sec_dist = np.abs(sec1 - sec2)
    # Add distances
    # (return nan if points are not comparable, otherwise nansum)
    parts = np.array([km_dist, sec_dist])
    timespace_dist = np.nan if np.all(np.isnan(parts)) else np.nansum(parts)
    return timespace_dist


def space_distance_sec(pt1, pt2, km_per_sec=KM_PER_SEC):
    # Return in seconds
    latlon1, latlon2 = pt1, pt2
    # Get pure gps distance and convert to seconds
    km_dist = haversine(latlon1, latlon2)
    space_dist = km_dist / km_per_sec
    return space_dist


def space_distance_km(pt1, pt2):
    # Return in kilometers
    latlon1, latlon2 = pt1, pt2
    # Get pure gps distance
    km_dist = haversine(latlon1, latlon2)
    return km_dist


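# Note (illustrative, not from the original source): the *_sec and *_km
# metrics measure the same quantity in different units; at the default
# KM_PER_SEC a GPS gap of 1 km corresponds to 1 / 0.002 = 500 "seconds"
# of travel at walking speed.
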
def time_dist_sec(sec1, sec2):
    sec_dist = np.abs(sec1 - sec2)
    return sec_dist


def time_dist_km(sec1, sec2, km_per_sec=KM_PER_SEC):
    sec_dist = np.abs(sec1 - sec2)
    sec_dist *= km_per_sec
    return sec_dist


def prepare_data(posixtimes, latlons, km_per_sec=KM_PER_SEC, thresh_units='seconds'):
    r"""
    Packages the data and picks a distance function

    Args:
        posixtimes (ndarray):
        latlons (ndarray):
        km_per_sec (float): (default = 0.002)
        thresh_units (str): (default = 'seconds')

    Returns:
        tuple: (X_data, dist_func, columns)

    CommandLine:
        python -m wbia.algo.preproc.occurrence_blackbox prepare_data

    Doctest:
        >>> from wbia.algo.preproc.occurrence_blackbox import *  # NOQA
        >>> posixtimes = np.array([10, 50, np.nan, np.nan, 5, 80, np.nan, np.nan])
        >>> latlons = np.array([
        >>>     (42.727985, -73.683994),
        >>>     (np.nan, np.nan),
        >>>     (np.nan, np.nan),
        >>>     (42.658333, -73.770993),
        >>>     (42.227985, -73.083994),
        >>>     (np.nan, np.nan),
        >>>     (np.nan, np.nan),
        >>>     (42.258333, -73.470993),
        >>> ])
        >>> km_per_sec = 0.002
        >>> thresh_units = 'seconds'
        >>> X_data, dist_func, columns = prepare_data(posixtimes, latlons, km_per_sec, thresh_units)
        >>> result = ('arr_ = %s' % (ut.repr2(X_data),))
        >>> [dist_func(a, b) for a, b in ut.combinations(X_data, 2)]
        >>> print(result)
    """

    def atleast_nd(arr, n, tofront=False):
        r"""ut.static_func_source(vt.atleast_nd)"""
        arr_ = np.asanyarray(arr)
        ndims = len(arr_.shape)
        if n is not None and ndims < n:
            # append the required number of dimensions to the end
            if tofront:
                expander = (None,) * (n - ndims) + (Ellipsis,)
            else:
                expander = (Ellipsis,) + (None,) * (n - ndims)
            arr_ = arr_[expander]
        return arr_

    def ensure_column_shape(arr, num_cols):
        r"""ut.static_func_source(vt.ensure_shape)"""
        arr_ = np.asanyarray(arr)
        if len(arr_.shape) == 0:
            pass
        elif len(arr_.shape) == 1:
            arr_.shape = (arr_.size, num_cols)
        else:
            assert arr_.shape[1] == num_cols, 'bad number of cols'
        return arr_

    have_gps = latlons is not None and not np.all(np.isnan(latlons))
    have_times = posixtimes is not None and not np.all(np.isnan(posixtimes))

    if not have_gps and not have_times:
        # There is no data, so there is nothing to do
        dist_func = None
        X_data = None
        columns = tuple()
    elif not have_gps and have_times:
        # We have timestamps but no gps
        X_data = atleast_nd(posixtimes, 2)
        if thresh_units == 'seconds':
            dist_func = time_dist_sec
        elif thresh_units == 'km':
            dist_func = time_dist_km
        columns = ('time',)
    elif have_gps and not have_times:
        # We have gps but no timestamps
        X_data = np.array(latlons)
        if thresh_units == 'seconds':
            dist_func = functools.partial(space_distance_sec, km_per_sec=km_per_sec)
        elif thresh_units == 'km':
            dist_func = space_distance_km
        columns = ('lat', 'lon')
    elif have_gps and have_times:
        # We have some combination of gps and timestamps
        posixtimes = atleast_nd(posixtimes, 2)
        latlons = ensure_column_shape(latlons, 2)
        # latlons = np.array(latlons, ndmin=2)
        X_data = np.hstack([posixtimes, latlons])
        if thresh_units == 'seconds':
            dist_func = functools.partial(timespace_distance_sec, km_per_sec=km_per_sec)
        elif thresh_units == 'km':
            dist_func = functools.partial(timespace_distance_km, km_per_sec=km_per_sec)
        columns = ('time', 'lat', 'lon')
    else:
        raise AssertionError('impossible state')
    return X_data, dist_func, columns


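# Usage sketch (illustrative, not part of the original module): the returned
# dist_func operates on rows of X_data and drops NaN components via nansum,
# so a row with a missing timestamp is still comparable on GPS alone:
#   >>> posixtimes = np.array([0.0, np.nan])
#   >>> latlons = np.array([(42.727985, -73.683994), (42.657414, -73.774448)])
#   >>> X_data, dist_func, columns = prepare_data(posixtimes, latlons, KM_PER_SEC, 'seconds')
#   >>> dist_func(X_data[0], X_data[1])  # GPS distance in "seconds"; the NaN time term is ignored
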
def cluster_timespace_km(posixtimes, latlons, thresh_km, km_per_sec=KM_PER_SEC):
    """
    Agglomerative clustering of time/space data

    Args:
        posixtimes (ndarray): N vector of POSIX timestamps in seconds
        latlons (ndarray): Nx2 array of (lat, lon) coordinates
        thresh_km (float): threshold in kilometers
        km_per_sec (float): reasonable animal walking speed

    References:
        http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/
            scipy.cluster.hierarchy.linkage.html
            scipy.cluster.hierarchy.fcluster.html

    Notes:
        # Visualize spots
        http://www.darrinward.com/lat-long/?id=2009879

    CommandLine:
        python -m wbia.algo.preproc.occurrence_blackbox cluster_timespace_km

    Doctest:
        >>> from wbia.algo.preproc.occurrence_blackbox import *  # NOQA
        >>> # Nx1 matrix denoting groundtruth locations (for testing)
        >>> X_name = np.array([0, 1, 1, 1, 1, 1, 2, 2, 2])
        >>> # Nx3 matrix where columns are (time, lat, lon)
        >>> X_data = np.array([
        >>>     (0, 42.727985, -73.683994),  # MRC
        >>>     (0, 42.657414, -73.774448),  # Park1
        >>>     (0, 42.658333, -73.770993),  # Park2
        >>>     (0, 42.654384, -73.768919),  # Park3
        >>>     (0, 42.655039, -73.769048),  # Park4
        >>>     (0, 42.657872, -73.764148),  # Park5
        >>>     (0, 42.876974, -73.819311),  # CP1
        >>>     (0, 42.862946, -73.804977),  # CP2
        >>>     (0, 42.849809, -73.758486),  # CP3
        >>> ])
        >>> thresh_km = 5.0  # kilometers
        >>> posixtimes = X_data.T[0]
        >>> latlons = X_data.T[1:3].T
        >>> km_per_sec = KM_PER_SEC
        >>> X_labels = cluster_timespace_km(posixtimes, latlons, thresh_km)
        >>> result = 'X_labels = {}'.format(ut.repr2(X_labels))
        >>> print(result)
        X_labels = np.array([3, 2, 2, 2, 2, 2, 1, 1, 1])
    """
    X_data, dist_func, columns = prepare_data(posixtimes, latlons, km_per_sec, 'km')
    if X_data is None:
        return None

    # Compute pairwise distances between all inputs
    if dist_func is not space_distance_km:
        # space_distance_km does not accept a km_per_sec argument
        dist_func = functools.partial(dist_func, km_per_sec=km_per_sec)
    condensed_dist_mat = distance.pdist(X_data, dist_func)
    # Compute hierarchical linkages
    linkage_mat = scipy.cluster.hierarchy.linkage(condensed_dist_mat, method='single')
    # Cluster linkages
    X_labels = scipy.cluster.hierarchy.fcluster(
        linkage_mat, thresh_km, criterion='distance'
    )
    return X_labels


def cluster_timespace_sec(posixtimes, latlons, thresh_sec=5, km_per_sec=KM_PER_SEC):
    """
    Agglomerative clustering of time/space data

    Args:
        posixtimes (ndarray): N vector of POSIX timestamps in seconds
        latlons (ndarray): Nx2 array of (lat, lon) coordinates
        thresh_sec (float): threshold in seconds
        km_per_sec (float): reasonable animal walking speed

    Doctest:
        >>> from wbia.algo.preproc.occurrence_blackbox import *  # NOQA
        >>> # Nx1 matrix denoting groundtruth locations (for testing)
        >>> X_name = np.array([0, 1, 1, 1, 1, 1, 2, 2, 2])
        >>> # Nx3 matrix where columns are (time, lat, lon)
        >>> X_data = np.array([
        >>>     (0, 42.727985, -73.683994),  # MRC
        >>>     (0, 42.657414, -73.774448),  # Park1
        >>>     (0, 42.658333, -73.770993),  # Park2
        >>>     (0, 42.654384, -73.768919),  # Park3
        >>>     (0, 42.655039, -73.769048),  # Park4
        >>>     (0, 42.657872, -73.764148),  # Park5
        >>>     (0, 42.876974, -73.819311),  # CP1
        >>>     (0, 42.862946, -73.804977),  # CP2
        >>>     (0, 42.849809, -73.758486),  # CP3
        >>> ])
        >>> posixtimes = X_data.T[0]
        >>> latlons = X_data.T[1:3].T
        >>> thresh_sec = 250  # seconds
        >>> X_labels = cluster_timespace_sec(posixtimes, latlons, thresh_sec)
        >>> result = ('X_labels = %r' % (X_labels,))
        >>> print(result)
        X_labels = array([6, 4, 4, 4, 4, 5, 1, 2, 3])

    Doctest:
        >>> from wbia.algo.preproc.occurrence_blackbox import *  # NOQA
        >>> # Nx1 matrix denoting groundtruth locations (for testing)
        >>> X_name = np.array([0, 1, 1, 1, 1, 1, 2, 2, 2])
        >>> # Nx3 matrix where columns are (time, lat, lon)
        >>> X_data = np.array([
        >>>     (np.nan, 42.657414, -73.774448),  # Park1
        >>>     (0, 42.658333, -73.770993),       # Park2
        >>>     (np.nan, np.nan, np.nan),         # Park3
        >>>     (np.nan, np.nan, np.nan),         # Park3.5
        >>>     (0, 42.655039, -73.769048),       # Park4
        >>>     (0, 42.657872, -73.764148),       # Park5
        >>> ])
        >>> posixtimes = X_data.T[0]
        >>> latlons = X_data.T[1:3].T
        >>> thresh_sec = 250  # seconds
        >>> km_per_sec = KM_PER_SEC
        >>> X_labels = cluster_timespace_sec(posixtimes, latlons, thresh_sec)
        >>> result = 'X_labels = {}'.format(ut.repr2(X_labels))
        >>> print(result)
        X_labels = np.array([3, 4, 1, 2, 4, 5])
    """
    X_data, dist_func, columns = prepare_data(posixtimes, latlons, km_per_sec, 'seconds')
    if X_data is None:
        return None

    # Cluster rows with different nan patterns separately
    X_bools = ~np.isnan(X_data)
    group_id = (X_bools * np.power(2, [2, 1, 0])).sum(axis=1)
    import vtool as vt

    unique_ids, groupxs = vt.group_indices(group_id)
    grouped_labels = []
    for xs in groupxs:
        X_part = X_data.take(xs, axis=0)
        labels = _cluster_part(X_part, dist_func, columns, thresh_sec, km_per_sec)
        grouped_labels.append((labels, xs))
    # Undo grouping and rectify overlaps
    X_labels = _recombine_labels(grouped_labels)
    return X_labels


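# Note (illustrative, not from the original source): rows are first grouped by
# their NaN pattern, which is encoded as a 3-bit integer over (time, lat, lon);
# for example a row with a valid timestamp but missing GPS maps to 0b100 == 4:
#   >>> row = np.array([10.0, np.nan, np.nan])
#   >>> int((~np.isnan(row) * np.power(2, [2, 1, 0])).sum())
#   4
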
def _recombine_labels(chunk_labels):
    """
    Ensure each group has different indices

    chunk_labels = grouped_labels
    """
    import utool as ut

    labels = ut.take_column(chunk_labels, 0)
    idxs = ut.take_column(chunk_labels, 1)
    # nunique_list = [len(np.unique(a)) for a in labels]
    chunksizes = ut.lmap(len, idxs)
    cumsum = np.cumsum(chunksizes).tolist()
    combined_idxs = np.hstack(idxs)
    combined_labels = np.hstack(labels)
    offset = 0
    # Ensure each chunk has unique labels
    for start, stop in zip([0] + cumsum, cumsum):
        combined_labels[start:stop] += offset
        offset += len(np.unique(combined_labels[start:stop]))
    # Ungroup
    X_labels = np.empty(combined_idxs.max() + 1, dtype=int)
    # new_labels[:] = -1
    X_labels[combined_idxs] = combined_labels
    return X_labels


def _cluster_part(X_part, dist_func, columns, thresh_sec, km_per_sec):
    if len(X_part) > 500 and 'time' in columns and not np.isnan(X_part[0, 0]):
        # Try and break the problem up into smaller chunks by finding feasible
        # one-dimensional breakpoints (is this a cutting plane?)
        chunk_labels = []
        chunk_idxs = list(_chunk_time(X_part, thresh_sec))
        for idxs in chunk_idxs:
            # logger.info('Doing occurrence chunk {}'.format(len(idxs)))
            X_chunk = X_part.take(idxs, axis=0)
            labels = _cluster_chunk(X_chunk, dist_func, thresh_sec)
            chunk_labels.append((labels, idxs))
        X_labels = _recombine_labels(chunk_labels)
    else:
        # Compute the whole problem
        X_labels = _cluster_chunk(X_part, dist_func, thresh_sec)
    return X_labels


def _cluster_chunk(X_data, dist_func, thresh_sec):
    if len(X_data) == 0:
        X_labels = np.empty(len(X_data), dtype=int)
    elif len(X_data) == 1:
        X_labels = np.ones(len(X_data), dtype=int)
    elif np.all(np.isnan(X_data)):
        X_labels = np.arange(1, len(X_data) + 1, dtype=int)
    else:
        # Compute pairwise distances between all inputs
        condensed_dist_mat = distance.pdist(X_data, dist_func)
        # Compute hierarchical linkages
        linkage_mat = scipy.cluster.hierarchy.linkage(
            condensed_dist_mat, method='single'
        )
        # Cluster linkages
        X_labels = scipy.cluster.hierarchy.fcluster(
            linkage_mat, thresh_sec, criterion='distance'
        )
    return X_labels


def _chunk_time(X_part, thresh_sec):
    X_time = X_part.T[0]
    time_sortx = X_time.argsort()
    timedata = X_time[time_sortx]
    # Look for points that are beyond the thresh in one dimension
    consec_delta = np.diff(timedata)
    breakpoints = (consec_delta > thresh_sec) | np.isnan(consec_delta)
    idxs = np.hstack([[0], np.where(breakpoints)[0] + 1, [len(X_part)]])
    iter_window = list(zip(idxs, idxs[1:]))
    for start, stop in iter_window:
        idxs = time_sortx[start:stop]
        # chunk = X_time[idxs]
        # logger.info((np.diff(chunk[chunk.argsort()]) > thresh_sec).sum())
        yield idxs


# def _chunk_lat(X_chunk, thresh_sec, km_per_sec):
#     # X_time = X_chunk.T[0]
#     X_lats = X_chunk.T[-2]
#     X_lons = X_chunk.T[-1]
#     # approximates 1 dimensional distance
#     ave_lon = np.mean(X_lons)
#     lat_sortx = X_lats.argsort()
#     latdata = X_lats[lat_sortx]
#     latdata_rad = np.radians(latdata)
#     avelon_rad = np.radians(ave_lon)
#     lat1 = latdata_rad[:-1]
#     lon1 = avelon_rad
#     lat2 = latdata_rad[1:]
#     lon2 = avelon_rad
#     consec_kmdelta = haversine_rad(lat1, lon1, lat2, lon2)
#     consec_delta = consec_kmdelta / km_per_sec
#     breakpoints = (consec_delta > thresh_sec) | np.isnan(consec_delta)
#     idxs = np.hstack([[0], np.where(breakpoints)[0] + 1, [len(X_chunk)]])
#     iter_window = list(zip(idxs, idxs[1:]))
#     for start, stop in iter_window:
#         idxs = lat_sortx[start:stop]
#         # chunk = X_time[idxs]
#         # logger.info((np.diff(chunk[chunk.argsort()]) > thresh_sec).sum())
#         yield idxs


# def _chunk_lon(X_chunk, thresh_sec, km_per_sec):
#     # X_time = X_chunk.T[0]
#     X_lats = X_chunk.T[-2]
#     X_lons = X_chunk.T[-1]
#     # approximates 1 dimensional distance (assuming lats are not too different)
#     ave_lat = np.mean(X_lats)
#     lon_sortx = X_lons.argsort()
#     londata = X_lons[lon_sortx]
#     londata_rad = np.radians(londata)
#     avelat_rad = np.radians(ave_lat)
#     lat1 = avelat_rad
#     lon1 = londata_rad[:-1]
#     lat2 = avelat_rad
#     lon2 = londata_rad[1:]
#     consec_kmdelta = haversine_rad(lat1, lon1, lat2, lon2)
#     consec_delta = consec_kmdelta / km_per_sec
#     breakpoints = (consec_delta > thresh_sec) | np.isnan(consec_delta)
#     idxs = np.hstack([[0], np.where(breakpoints)[0] + 1, [len(X_chunk)]])
#     iter_window = list(zip(idxs, idxs[1:]))
#     for start, stop in iter_window:
#         idxs = lon_sortx[start:stop]
#         # chunk = X_time[idxs]
#         # logger.info((np.diff(chunk[chunk.argsort()]) > thresh_sec).sum())
#         yield idxs


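# Example (a minimal sketch, not part of the original module): _chunk_time
# splits indices wherever consecutive sorted timestamps differ by more than
# thresh_sec, so each chunk can be clustered independently:
#   >>> X_part = np.array([[0.0], [10.0], [500.0], [510.0]])
#   >>> [idxs.tolist() for idxs in _chunk_time(X_part, thresh_sec=100)]
#   [[0, 1], [2, 3]]
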
def main():
    """
    CommandLine:
        ib
        cd ~/code/wbia/wbia/algo/preproc
        python occurrence_blackbox.py --lat 42.727985 42.657414 42.658333 42.654384 --lon -73.683994 -73.774448 -73.770993 -73.768919 --sec 0 0 0 0
        # Should return X_labels = [2, 1, 1, 1]
    """
    import argparse

    parser = argparse.ArgumentParser(description='Compute agglomerative cluster')
    parser.add_argument('--lat', type=float, nargs='*', help='list of latitude coords')
    parser.add_argument('--lon', type=float, nargs='*', help='list of longitude coords')
    parser.add_argument(
        '--sec', type=float, nargs='*', help='list of POSIX_TIMEs in seconds'
    )
    parser.add_argument(
        '--thresh', type=float, default=1.0, help='threshold in kilometers'
    )
    parser.add_argument(
        '--km_per_sec',
        type=float,
        default=KM_PER_SEC,
        help='reasonable animal speed in km/s',
    )
    args = parser.parse_args()
    sec = [0] * len(args.lat) if args.sec is None else args.sec
    latlons = np.vstack([args.lat, args.lon]).T
    X_labels = cluster_timespace_km(sec, latlons, args.thresh, km_per_sec=args.km_per_sec)
    logger.info('X_labels = %r' % (X_labels.tolist(),))


if __name__ == '__main__':
    r"""
    CommandLine:
        python -m wbia.algo.preproc.occurrence_blackbox
    """
    main()