# -*- coding: utf-8 -*-
"""
animal_walking_speeds
ZEBRA_SPEED_MAX = 64 # km/h
ZEBRA_SPEED_RUN = 50 # km/h
ZEBRA_SPEED_SLOW_RUN = 20 # km/h
ZEBRA_SPEED_FAST_WALK = 10 # km/h
ZEBRA_SPEED_WALK = 7 # km/h
km_per_sec = .02
km_per_sec = .001
mph = km_per_sec / ut.KM_PER_MILE * 60 * 60
print('mph = %r' % (mph,))
1 / km_per_sec
import datetime
thresh_sec = datetime.timedelta(minutes=5).seconds
thresh_km = thresh_sec * km_per_sec
print('thresh_sec = %r' % (thresh_sec,))
print('thresh_km = %r' % (thresh_km,))
thresh_sec = thresh_km / km_per_sec
print('thresh_sec = %r' % (thresh_sec,))
"""
import logging
import functools
import numpy as np
import utool as ut
import scipy.cluster.hierarchy
from scipy.spatial import distance
(print, rrr, profile) = ut.inject2(__name__)
logger = logging.getLogger('wbia')
KM_PER_SEC = 0.002
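# NOTE: 0.002 km/s is 7.2 km/h, i.e. roughly the ZEBRA_SPEED_WALK figure from
# the speed notes in the module docstring above.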
def haversine(latlon1, latlon2):
r"""
Calculate the great circle distance between two points
on the earth (specified in decimal degrees)
Args:
latlon1 (tuple): (lat, lon)
latlon2 (tuple): (lat, lon)
Returns:
float : distance in kilometers
References:
en.wikipedia.org/wiki/Haversine_formula
gis.stackexchange.com/questions/81551/matching-gps-tracks
stackoverflow.com/questions/4913349/haversine-distance-gps-points
Doctest:
>>> from wbia.algo.preproc.occurrence_blackbox import * # NOQA
>>> import scipy.spatial.distance as spdist
>>> import functools
>>> latlon1 = [-80.21895315, -158.81099213]
>>> latlon2 = [ 9.77816711, -17.27471498]
>>> kilometers = haversine(latlon1, latlon2)
>>> result = ('kilometers = %s' % (kilometers,))
>>> print(result)
kilometers = 11930.909364189827
"""
# convert decimal degrees to radians
lat1, lon1 = np.radians(latlon1)
lat2, lon2 = np.radians(latlon2)
# haversine formula
dlon = lon2 - lon1
dlat = lat2 - lat1
a = (np.sin(dlat / 2) ** 2) + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon / 2) ** 2)
c = 2 * np.arcsin(np.sqrt(a))
# convert to kilometers
EARTH_RADIUS_KM = 6367
kilometers = EARTH_RADIUS_KM * c
return kilometers
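# Illustrative note (an assumption about usage, not from the original code):
# because haversine is built from numpy ufuncs, it also broadcasts over
# stacked points of shape (2, N) and returns N distances at once, e.g.
#   pts1 = np.array([[42.72, 42.65], [-73.68, -73.77]])  # (2, N) lat/lon
#   pts2 = np.array([[42.87, 42.86], [-73.81, -73.80]])
#   dists = haversine(pts1, pts2)  # ndarray of N kilometers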
def haversine_rad(lat1, lon1, lat2, lon2):
    """Same as :func:`haversine`, but takes lat/lon components already in radians."""
# haversine formula
dlon = lon2 - lon1
dlat = lat2 - lat1
a = (np.sin(dlat / 2) ** 2) + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon / 2) ** 2)
c = 2 * np.arcsin(np.sqrt(a))
# convert to kilometers
EARTH_RADIUS_KM = 6367
kilometers = EARTH_RADIUS_KM * c
return kilometers
def timespace_distance_km(pt1, pt2, km_per_sec=KM_PER_SEC):
"""
Computes distance between two points in space and time.
Time is converted into spatial units using km_per_sec
Args:
pt1 (tuple) : (seconds, lat, lon)
pt2 (tuple) : (seconds, lat, lon)
km_per_sec (float): reasonable animal walking speed
Returns:
float: distance in kilometers
Doctest:
>>> from wbia.algo.preproc.occurrence_blackbox import * # NOQA
>>> import scipy.spatial.distance as spdist
>>> import functools
>>> km_per_sec = .02
        >>> latlon1 = [40.779299, -73.9719498]  # Museum of Natural History
        >>> latlon2 = [37.7336402, -122.5050342]  # San Francisco Zoo
>>> pt1 = [0.0] + latlon1
>>> pt2 = [0.0] + latlon2
>>> # google measures about 4138.88 kilometers
>>> dist_km1 = timespace_distance_km(pt1, pt2)
>>> print('dist_km1 = {!r}'.format(dist_km1))
>>> # Now add a time component
>>> pt1 = [360.0] + latlon1
>>> pt2 = [0.0] + latlon2
>>> dist_km2 = timespace_distance_km(pt1, pt2)
>>> print('dist_km2 = {!r}'.format(dist_km2))
>>> assert np.isclose(dist_km1, 4136.4568647922624)
>>> assert np.isclose(dist_km2, 4137.1768647922627)
"""
sec1, latlon1 = pt1[0], pt1[1:]
sec2, latlon2 = pt2[0], pt2[1:]
# Get pure gps distance
km_dist = haversine(latlon1, latlon2)
# Get distance in seconds and convert to km
sec_dist = np.abs(sec1 - sec2) * km_per_sec
# Add distances
# (return nan if points are not comparable, otherwise nansum)
parts = np.array([km_dist, sec_dist])
timespace_dist = np.nan if np.all(np.isnan(parts)) else np.nansum(parts)
return timespace_dist
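# NOTE: because of the nansum above, a point missing gps (or time) still
# compares to others via whatever component is available; the distance is
# nan only when both components are nan.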
def timespace_distance_sec(pt1, pt2, km_per_sec=KM_PER_SEC):
    """Same as timespace_distance_km, but the result is in seconds."""
    # Return in seconds
    sec1, latlon1 = pt1[0], pt1[1:]
    sec2, latlon2 = pt2[0], pt2[1:]
    # Get pure gps distance and convert to seconds
    gps_dist_sec = haversine(latlon1, latlon2) / km_per_sec
    # Get distance in seconds
    sec_dist = np.abs(sec1 - sec2)
    # Add distances
    # (return nan if points are not comparable, otherwise nansum)
    parts = np.array([gps_dist_sec, sec_dist])
    timespace_dist = np.nan if np.all(np.isnan(parts)) else np.nansum(parts)
    return timespace_dist
def space_distance_sec(pt1, pt2, km_per_sec=KM_PER_SEC):
# Return in seconds
latlon1, latlon2 = pt1, pt2
# Get pure gps distance and convert to seconds
km_dist = haversine(latlon1, latlon2)
space_dist = km_dist / km_per_sec
return space_dist
def space_distance_km(pt1, pt2):
    # Return in kilometers
    latlon1, latlon2 = pt1, pt2
    # Get pure gps distance
    km_dist = haversine(latlon1, latlon2)
    return km_dist
def time_dist_sec(sec1, sec2):
sec_dist = np.abs(sec1 - sec2)
return sec_dist
def time_dist_km(sec1, sec2, km_per_sec=KM_PER_SEC):
    # Return the time difference converted to kilometers
    km_dist = np.abs(sec1 - sec2) * km_per_sec
    return km_dist
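# A minimal usage sketch (added illustration; `_demo_pdist_usage` is a
# hypothetical helper, not part of the original API): each metric above is
# designed to be passed as a callable to scipy.spatial.distance.pdist.
def _demo_pdist_usage():
    # rows are (sec, lat, lon); values are made up for illustration
    X = np.array([
        [0.0, 42.72, -73.68],
        [360.0, 42.65, -73.77],
    ])
    metric = functools.partial(timespace_distance_km, km_per_sec=KM_PER_SEC)
    # returns the condensed pairwise distance vector (one entry for two rows)
    return distance.pdist(X, metric)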
def prepare_data(posixtimes, latlons, km_per_sec=KM_PER_SEC, thresh_units='seconds'):
    r"""
    Packages the data and picks a distance function
    Args:
        posixtimes (ndarray): N POSIX timestamps in seconds (may contain nan)
        latlons (ndarray): Nx2 array of (lat, lon) pairs (may contain nan)
        km_per_sec (float): (default = 0.002)
        thresh_units (str): either 'seconds' or 'km' (default = 'seconds')
    Returns:
        tuple: (X_data, dist_func, columns)
CommandLine:
python -m wbia.algo.preproc.occurrence_blackbox prepare_data
Doctest:
>>> from wbia.algo.preproc.occurrence_blackbox import * # NOQA
>>> posixtimes = np.array([10, 50, np.nan, np.nan, 5, 80, np.nan, np.nan])
>>> latlons = np.array([
>>> (42.727985, -73.683994),
>>> (np.nan, np.nan),
>>> (np.nan, np.nan),
>>> (42.658333, -73.770993),
>>> (42.227985, -73.083994),
>>> (np.nan, np.nan),
>>> (np.nan, np.nan),
>>> (42.258333, -73.470993),
>>> ])
>>> km_per_sec = 0.002
>>> thresh_units = 'seconds'
>>> X_data, dist_func, columns = prepare_data(posixtimes, latlons, km_per_sec, thresh_units)
>>> result = ('arr_ = %s' % (ut.repr2(X_data),))
>>> [dist_func(a, b) for a, b in ut.combinations(X_data, 2)]
>>> print(result)
"""
def atleast_nd(arr, n, tofront=False):
r"""ut.static_func_source(vt.atleast_nd)"""
arr_ = np.asanyarray(arr)
ndims = len(arr_.shape)
if n is not None and ndims < n:
# append the required number of dimensions to the end
if tofront:
expander = (None,) * (n - ndims) + (Ellipsis,)
else:
expander = (Ellipsis,) + (None,) * (n - ndims)
arr_ = arr_[expander]
return arr_
    def ensure_column_shape(arr, num_cols):
        r"""ut.static_func_source(vt.ensure_shape)"""
        arr_ = np.asanyarray(arr)
        if len(arr_.shape) == 0:
            pass
        elif len(arr_.shape) == 1:
            # reshape a flat array into rows of num_cols values
            arr_.shape = (arr_.size // num_cols, num_cols)
        else:
            assert arr_.shape[1] == num_cols, 'bad number of cols'
        return arr_
have_gps = latlons is not None and not np.all(np.isnan(latlons))
have_times = posixtimes is not None and not np.all(np.isnan(posixtimes))
if not have_gps and not have_times:
# There is no data, so there is nothing to do
dist_func = None
X_data = None
columns = tuple()
    elif not have_gps and have_times:
        # We have timestamps but no gps
X_data = atleast_nd(posixtimes, 2)
if thresh_units == 'seconds':
dist_func = time_dist_sec
elif thresh_units == 'km':
            dist_func = functools.partial(time_dist_km, km_per_sec=km_per_sec)
columns = ('time',)
    elif have_gps and not have_times:
        # We have gps but no timestamps
X_data = np.array(latlons)
if thresh_units == 'seconds':
dist_func = functools.partial(space_distance_sec, km_per_sec=km_per_sec)
elif thresh_units == 'km':
dist_func = space_distance_km
columns = ('lat', 'lon')
elif have_gps and have_times:
# We have some combination of gps and timestamps
posixtimes = atleast_nd(posixtimes, 2)
latlons = ensure_column_shape(latlons, 2)
# latlons = np.array(latlons, ndmin=2)
X_data = np.hstack([posixtimes, latlons])
if thresh_units == 'seconds':
dist_func = functools.partial(timespace_distance_sec, km_per_sec=km_per_sec)
elif thresh_units == 'km':
dist_func = functools.partial(timespace_distance_km, km_per_sec=km_per_sec)
columns = ('time', 'lat', 'lon')
else:
raise AssertionError('impossible state')
return X_data, dist_func, columns
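# Dispatch summary for prepare_data (derived from the branches above):
#   times only -> time_dist_sec | time_dist_km,           columns = ('time',)
#   gps only   -> space_distance_sec | space_distance_km, columns = ('lat', 'lon')
#   both       -> timespace_distance_sec | timespace_distance_km,
#                 columns = ('time', 'lat', 'lon')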
def cluster_timespace_km(posixtimes, latlons, thresh_km, km_per_sec=KM_PER_SEC):
    """
    Agglomerative clustering of time/space data
    Args:
        posixtimes (ndarray): N POSIX timestamps in seconds
        latlons (ndarray): Nx2 array where columns are (lat, lon)
        thresh_km (float): clustering threshold in kilometers
        km_per_sec (float): time-to-distance conversion rate (default = KM_PER_SEC)
References:
http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/
scipy.cluster.hierarchy.linkage.html
scipy.cluster.hierarchy.fcluster.html
Notes:
# Visualize spots
http://www.darrinward.com/lat-long/?id=2009879
CommandLine:
python -m wbia.algo.preproc.occurrence_blackbox cluster_timespace_km
Doctest:
>>> from wbia.algo.preproc.occurrence_blackbox import * # NOQA
>>> # Nx1 matrix denoting groundtruth locations (for testing)
>>> X_name = np.array([0, 1, 1, 1, 1, 1, 2, 2, 2])
>>> # Nx3 matrix where each columns are (time, lat, lon)
>>> X_data = np.array([
>>> (0, 42.727985, -73.683994), # MRC
>>> (0, 42.657414, -73.774448), # Park1
>>> (0, 42.658333, -73.770993), # Park2
>>> (0, 42.654384, -73.768919), # Park3
>>> (0, 42.655039, -73.769048), # Park4
>>> (0, 42.657872, -73.764148), # Park5
>>> (0, 42.876974, -73.819311), # CP1
>>> (0, 42.862946, -73.804977), # CP2
>>> (0, 42.849809, -73.758486), # CP3
>>> ])
>>> thresh_km = 5.0 # kilometers
>>> posixtimes = X_data.T[0]
>>> latlons = X_data.T[1:3].T
>>> km_per_sec = KM_PER_SEC
>>> X_labels = cluster_timespace_km(posixtimes, latlons, thresh_km)
>>> result = 'X_labels = {}'.format(ut.repr2(X_labels))
>>> print(result)
X_labels = np.array([3, 2, 2, 2, 2, 2, 1, 1, 1])
"""
X_data, dist_func, columns = prepare_data(posixtimes, latlons, km_per_sec, 'km')
if X_data is None:
return None
    # Compute pairwise distances between all inputs
    # (prepare_data already binds km_per_sec into dist_func where needed)
    condensed_dist_mat = distance.pdist(X_data, dist_func)
    # Compute hierarchical linkages
    linkage_mat = scipy.cluster.hierarchy.linkage(condensed_dist_mat, method='single')
# Cluster linkages
X_labels = scipy.cluster.hierarchy.fcluster(
linkage_mat, thresh_km, criterion='distance'
)
return X_labels
def cluster_timespace_sec(posixtimes, latlons, thresh_sec=5, km_per_sec=KM_PER_SEC):
    """
    Agglomerative clustering of time/space data with a threshold in seconds
    Args:
        posixtimes (ndarray): N POSIX timestamps in seconds
        latlons (ndarray): Nx2 array where columns are (lat, lon)
        thresh_sec (float): clustering threshold in seconds
        km_per_sec (float): distance-to-time conversion rate (default = KM_PER_SEC)
Doctest:
>>> from wbia.algo.preproc.occurrence_blackbox import * # NOQA
>>> # Nx1 matrix denoting groundtruth locations (for testing)
>>> X_name = np.array([0, 1, 1, 1, 1, 1, 2, 2, 2])
>>> # Nx3 matrix where each columns are (time, lat, lon)
>>> X_data = np.array([
>>> (0, 42.727985, -73.683994), # MRC
>>> (0, 42.657414, -73.774448), # Park1
>>> (0, 42.658333, -73.770993), # Park2
>>> (0, 42.654384, -73.768919), # Park3
>>> (0, 42.655039, -73.769048), # Park4
>>> (0, 42.657872, -73.764148), # Park5
>>> (0, 42.876974, -73.819311), # CP1
>>> (0, 42.862946, -73.804977), # CP2
>>> (0, 42.849809, -73.758486), # CP3
>>> ])
>>> posixtimes = X_data.T[0]
>>> latlons = X_data.T[1:3].T
>>> thresh_sec = 250 # seconds
>>> X_labels = cluster_timespace_sec(posixtimes, latlons, thresh_sec)
>>> result = ('X_labels = %r' % (X_labels,))
>>> print(result)
X_labels = array([6, 4, 4, 4, 4, 5, 1, 2, 3])
Doctest:
>>> from wbia.algo.preproc.occurrence_blackbox import * # NOQA
>>> # Nx1 matrix denoting groundtruth locations (for testing)
>>> X_name = np.array([0, 1, 1, 1, 1, 1, 2, 2, 2])
>>> # Nx3 matrix where each columns are (time, lat, lon)
>>> X_data = np.array([
>>> (np.nan, 42.657414, -73.774448), # Park1
>>> (0, 42.658333, -73.770993), # Park2
>>> (np.nan, np.nan, np.nan), # Park3
>>> (np.nan, np.nan, np.nan), # Park3.5
>>> (0, 42.655039, -73.769048), # Park4
>>> (0, 42.657872, -73.764148), # Park5
>>> ])
>>> posixtimes = X_data.T[0]
>>> latlons = X_data.T[1:3].T
>>> thresh_sec = 250 # seconds
>>> km_per_sec = KM_PER_SEC
>>> X_labels = cluster_timespace_sec(posixtimes, latlons, thresh_sec)
>>> result = 'X_labels = {}'.format(ut.repr2(X_labels))
>>> print(result)
X_labels = np.array([3, 4, 1, 2, 4, 5])
"""
X_data, dist_func, columns = prepare_data(posixtimes, latlons, km_per_sec, 'seconds')
if X_data is None:
return None
    # Rows with different nan patterns cannot be compared directly, so
    # cluster each nan pattern separately
    X_bools = ~np.isnan(X_data)
    # Encode each row's non-nan pattern as a bit-vector group id
    group_id = (X_bools * np.power(2, [2, 1, 0])).sum(axis=1)
    import vtool as vt
    unique_ids, groupxs = vt.group_indices(group_id)
grouped_labels = []
for xs in groupxs:
X_part = X_data.take(xs, axis=0)
labels = _cluster_part(X_part, dist_func, columns, thresh_sec, km_per_sec)
grouped_labels.append((labels, xs))
    # Undo grouping and rectify overlapping label ids
    X_labels = _recombine_labels(grouped_labels)
    return X_labels
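# Worked example of the nan-pattern grouping above (hypothetical row): for a
# row (nan, 42.6, -73.7) the non-nan mask is [False, True, True], so
# group_id = 0 * 4 + 1 * 2 + 1 * 1 = 3; rows sharing an id (i.e. the same
# nan pattern) are clustered together, separately from other patterns.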
def _recombine_labels(chunk_labels):
    """
    Combines labels from separately clustered chunks, offsetting each chunk's
    labels so that no two chunks share a label id.
    Args:
        chunk_labels (list): list of (labels, idxs) pairs
    """
import utool as ut
labels = ut.take_column(chunk_labels, 0)
idxs = ut.take_column(chunk_labels, 1)
# nunique_list = [len(np.unique(a)) for a in labels]
chunksizes = ut.lmap(len, idxs)
cumsum = np.cumsum(chunksizes).tolist()
combined_idxs = np.hstack(idxs)
combined_labels = np.hstack(labels)
offset = 0
# Ensure each chunk has unique labels
for start, stop in zip([0] + cumsum, cumsum):
combined_labels[start:stop] += offset
offset += len(np.unique(combined_labels[start:stop]))
# Ungroup
    X_labels = np.empty(combined_idxs.max() + 1, dtype=int)
# new_labels[:] = -1
X_labels[combined_idxs] = combined_labels
return X_labels
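# Worked example (hypothetical inputs): given chunk_labels
#   [(np.array([1, 1, 2]), np.array([0, 2, 4])),
#    (np.array([1, 2]), np.array([1, 3]))]
# the first chunk keeps [1, 1, 2] (2 unique labels, so the offset becomes 2),
# the second chunk shifts to [3, 4], and scattering back through the index
# arrays gives X_labels = [1, 3, 1, 4, 2].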
def _cluster_part(X_part, dist_func, columns, thresh_sec, km_per_sec):
    if len(X_part) > 500 and 'time' in columns and not np.isnan(X_part[0, 0]):
# Try and break problem up into smaller chunks by finding feasible
# one-dimensional breakpoints (is this a cutting plane?)
chunk_labels = []
chunk_idxs = list(_chunk_time(X_part, thresh_sec))
for idxs in chunk_idxs:
# logger.info('Doing occurrence chunk {}'.format(len(idxs)))
X_chunk = X_part.take(idxs, axis=0)
labels = _cluster_chunk(X_chunk, dist_func, thresh_sec)
chunk_labels.append((labels, idxs))
X_labels = _recombine_labels(chunk_labels)
else:
# Compute the whole problem
X_labels = _cluster_chunk(X_part, dist_func, thresh_sec)
return X_labels
def _cluster_chunk(X_data, dist_func, thresh_sec):
    if len(X_data) == 0:
        X_labels = np.empty(len(X_data), dtype=int)
    elif len(X_data) == 1:
        X_labels = np.ones(len(X_data), dtype=int)
    elif np.all(np.isnan(X_data)):
        # Give each fully-nan row its own cluster
        X_labels = np.arange(1, len(X_data) + 1, dtype=int)
    else:
        # Compute pairwise distances between all inputs
        condensed_dist_mat = distance.pdist(X_data, dist_func)
        # Compute hierarchical linkages
        linkage_mat = scipy.cluster.hierarchy.linkage(condensed_dist_mat, method='single')
        # Cluster linkages
        X_labels = scipy.cluster.hierarchy.fcluster(
            linkage_mat, thresh_sec, criterion='distance'
        )
    return X_labels
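# NOTE: with single linkage and criterion='distance', two points land in the
# same cluster iff they are connected by a chain of points whose consecutive
# pairwise distances are all <= thresh_sec.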
def _chunk_time(X_part, thresh_sec):
    # Yield groups of indices separated by gaps in time larger than thresh_sec
    X_time = X_part.T[0]
    time_sortx = X_time.argsort()
    timedata = X_time[time_sortx]
    # Look for points that are beyond the thresh in one dimension
    consec_delta = np.diff(timedata)
    breakpoint = (consec_delta > thresh_sec) | np.isnan(consec_delta)
idxs = np.hstack([[0], np.where(breakpoint)[0] + 1, [len(X_part)]])
iter_window = list(zip(idxs, idxs[1:]))
for start, stop in iter_window:
idxs = time_sortx[start:stop]
# chunk = X_time[idxs]
# logger.info((np.diff(chunk[chunk.argsort()]) > thresh_sec).sum())
yield idxs
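# Worked example (hypothetical inputs): with sorted times [0, 10, 400, 405]
# and thresh_sec=250, np.diff gives [10, 390, 5]; the single gap > 250
# splits the sorted indices into two chunks, [0, 10] and [400, 405], which
# are then clustered independently.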
# def _chunk_lat(X_chunk, thresh_sec, km_per_sec):
# # X_time = X_chunk.T[0]
# X_lats = X_chunk.T[-2]
# X_lons = X_chunk.T[-1]
# # approximates 1 dimensional distance
# ave_lon = np.mean(X_lons)
# lat_sortx = X_lats.argsort()
# latdata = X_lats[lat_sortx]
# latdata_rad = np.radians(latdata)
# avelon_rad = np.radians(ave_lon)
# lat1 = latdata_rad[:-1]
# lon1 = avelon_rad
# lat2 = latdata_rad[1:]
# lon2 = avelon_rad
# consec_kmdelta = haversine_rad(lat1, lon1, lat2, lon2)
# consec_delta = consec_kmdelta / km_per_sec
# consec_delta[consec_delta > thresh_sec]
# breakpoint = (consec_delta > thresh_sec) | np.isnan(consec_delta)
# idxs = np.hstack([[0], np.where(breakpoint)[0] + 1, [len(X_chunk)]])
# iter_window = list(zip(idxs, idxs[1:]))
# for start, stop in iter_window:
# idxs = lat_sortx[start:stop]
# # chunk = X_time[idxs]
# # logger.info((np.diff(chunk[chunk.argsort()]) > thresh_sec).sum())
# yield idxs
# def _chunk_lon(X_chunk, thresh_sec, km_per_sec):
# # X_time = X_chunk.T[0]
# X_lats = X_chunk.T[-2]
# X_lons = X_chunk.T[-1]
# # approximates 1 dimensional distance (assuming lons are not too different)
# ave_lat = np.mean(X_lats)
# lon_sortx = X_lons.argsort()
# londata = X_lons[lon_sortx]
# londata_rad = np.radians(londata)
# avelat_rad = np.radians(ave_lat)
# lat1 = avelat_rad
# lon1 = londata_rad[:-1]
# lat2 = avelat_rad
# lon2 = londata_rad[1:]
# consec_kmdelta = haversine_rad(lat1, lon1, lat2, lon2)
# consec_delta = consec_kmdelta / km_per_sec
# consec_delta[consec_delta > thresh_sec]
# breakpoint = (consec_delta > thresh_sec) | np.isnan(consec_delta)
# idxs = np.hstack([[0], np.where(breakpoint)[0] + 1, [len(X_chunk)]])
# iter_window = list(zip(idxs, idxs[1:]))
# for start, stop in iter_window:
# idxs = lon_sortx[start:stop]
# # chunk = X_time[idxs]
# # logger.info((np.diff(chunk[chunk.argsort()]) > thresh_sec).sum())
# yield idxs
def main():
"""
CommandLine:
ib
cd ~/code/wbia/wbia/algo/preproc
python occurrence_blackbox.py --lat 42.727985 42.657414 42.658333 42.654384 --lon -73.683994 -73.774448 -73.770993 -73.768919 --sec 0 0 0 0
# Should return
X_labels = [2, 1, 1, 1]
"""
import argparse
parser = argparse.ArgumentParser(description='Compute agglomerative cluster')
parser.add_argument('--lat', type=float, nargs='*', help='list of latitude coords')
parser.add_argument('--lon', type=float, nargs='*', help='list of longitude coords')
parser.add_argument(
'--sec', type=float, nargs='*', help='list of POSIX_TIMEs in seconds'
)
    parser.add_argument(
        '--thresh', type=float, default=1.0, help='threshold in kilometers'
    )
    parser.add_argument(
        '--km_per_sec',
        type=float,
        default=KM_PER_SEC,
        help='reasonable animal speed in km/s',
    )
args = parser.parse_args()
sec = [0] * len(args.lat) if args.sec is None else args.sec
latlons = np.vstack([args.lat, args.lon]).T
X_labels = cluster_timespace_km(sec, latlons, args.thresh, km_per_sec=args.km_per_sec)
logger.info('X_labels = %r' % (X_labels.tolist(),))
if __name__ == '__main__':
r"""
CommandLine:
python -m wbia.algo.preproc.occurrence_blackbox
"""
main()