Coverage for /usr/lib/python3/dist-packages/scipy/spatial/distance.py: 13%

100

101

102

103

"""

=====================================================

Distance computations (:mod:`scipy.spatial.distance`)

=====================================================

.. sectionauthor:: Damian Eads

Function Reference

------------------

Distance matrix computation from a collection of raw observation vectors

stored in a rectangular array.

.. autosummary::

:toctree: generated/

pdist -- pairwise distances between observation vectors.

cdist -- distances between two collections of observation vectors

squareform -- convert distance matrix to a condensed one and vice versa

directed_hausdorff -- directed Hausdorff distance between arrays

Predicates for checking the validity of distance matrices, both

condensed and redundant. Also contained in this module are functions

for computing the number of observations in a distance matrix.

.. autosummary::

:toctree: generated/

is_valid_dm -- checks for a valid distance matrix

is_valid_y -- checks for a valid condensed distance matrix

num_obs_dm -- # of observations in a distance matrix

num_obs_y -- # of observations in a condensed distance matrix

Distance functions between two numeric vectors ``u`` and ``v``. Computing

distances over a large collection of vectors is inefficient for these

functions. Use ``pdist`` for this purpose.

.. autosummary::

:toctree: generated/

braycurtis -- the Bray-Curtis distance.

canberra -- the Canberra distance.

chebyshev -- the Chebyshev distance.

cityblock -- the Manhattan distance.

correlation -- the Correlation distance.

cosine -- the Cosine distance.

euclidean -- the Euclidean distance.

mahalanobis -- the Mahalanobis distance.

minkowski -- the Minkowski distance.

seuclidean -- the normalized Euclidean distance.

sqeuclidean -- the squared Euclidean distance.

wminkowski -- (deprecated) alias of `minkowski`.

Distance functions between two boolean vectors (representing sets) ``u`` and

``v``. As in the case of numerical vectors, ``pdist`` is more efficient for

computing the distances between all pairs.

.. autosummary::

:toctree: generated/

dice -- the Dice dissimilarity.

hamming -- the Hamming distance.

jaccard -- the Jaccard distance.

kulsinski -- the Kulsinski distance.

rogerstanimoto -- the Rogers-Tanimoto dissimilarity.

russellrao -- the Russell-Rao dissimilarity.

sokalmichener -- the Sokal-Michener dissimilarity.

sokalsneath -- the Sokal-Sneath dissimilarity.

yule -- the Yule dissimilarity.

:func:`hamming` also operates over discrete numerical vectors.

"""

from __future__ import division, print_function, absolute_import

# Names exported by ``from scipy.spatial.distance import *``.
__all__ = [

'braycurtis',

'canberra',

'cdist',

'chebyshev',

'cityblock',

'correlation',

'cosine',

'dice',

'directed_hausdorff',

'euclidean',

'hamming',

'is_valid_dm',

'is_valid_y',

'jaccard',

'kulsinski',

'mahalanobis',

'matching',

'minkowski',

'num_obs_dm',

'num_obs_y',

'pdist',

'rogerstanimoto',

'russellrao',

'seuclidean',

'sokalmichener',

'sokalsneath',

'sqeuclidean',

'squareform',

'wminkowski',

'yule'

]

import warnings

import numpy as np

from functools import partial

from collections import namedtuple

from scipy._lib.six import callable, string_types

from scipy._lib.six import xrange

from scipy._lib._util import _asarray_validated

from . import _distance_wrap

from . import _hausdorff

from ..linalg import norm

def _args_to_kwargs_xdist(args, kwargs, metric, func_name):
    """
    Convert legacy positional arguments to keyword arguments for pdist/cdist.

    Parameters
    ----------
    args : sequence
        Extra positional arguments received by the caller.
    kwargs : dict
        Keyword arguments received by the caller. Converted positional
        arguments are stored into this dictionary in place.
    metric : str or callable
        The metric handed to the caller.
    func_name : str
        Name of the calling function. ``'pdist'`` selects the legacy order
        ``p, w, V, VI``; any other value selects ``p, V, VI, w``.

    Returns
    -------
    kwargs : dict
        The dictionary that was passed in, updated with the converted
        arguments.

    Raises
    ------
    TypeError
        If positional arguments accompany a callable metric that is not one
        of this module's distance functions, or if an argument is supplied
        both positionally and by keyword.
    ValueError
        If more than four positional arguments are given.
    """
    if not args:
        return kwargs

    # Custom callables have no known legacy signature, so positional metric
    # parameters cannot be mapped onto names for them.
    if (callable(metric) and metric not in [
            braycurtis, canberra, chebyshev, cityblock, correlation, cosine,
            dice, euclidean, hamming, jaccard, kulsinski, mahalanobis,
            matching, minkowski, rogerstanimoto, russellrao, seuclidean,
            sokalmichener, sokalsneath, sqeuclidean, yule, wminkowski]):
        raise TypeError('When using a custom metric arguments must be passed '
                        'as keyword (i.e., ARGNAME=ARGVALUE)')

    if func_name == 'pdist':
        old_arg_names = ['p', 'w', 'V', 'VI']
    else:
        old_arg_names = ['p', 'V', 'VI', 'w']

    num_args = len(args)
    # Adjacent literals are concatenated verbatim, so each fragment carries
    # its own trailing space to keep the sentences separated.
    warnings.warn('%d metric parameters have been passed as positional. '
                  'This will raise an error in a future version. '
                  'Please pass arguments as keywords (i.e., ARGNAME=ARGVALUE)'
                  % num_args, DeprecationWarning)

    if num_args > 4:
        raise ValueError('Deprecated %s signature accepts only 4 '
                         'positional arguments (%s), %d given.'
                         % (func_name, ', '.join(old_arg_names), num_args))

    for old_arg, arg in zip(old_arg_names, args):
        if old_arg in kwargs:
            raise TypeError('%s() got multiple values for argument %s'
                            % (func_name, old_arg))
        kwargs[old_arg] = arg
    return kwargs

def _copy_array_if_base_present(a):
    """Return `a` itself when it has no base array, otherwise a fresh copy."""
    # ``a.base`` is ``None`` only when the array does not borrow its data
    # from a parent object.
    return a if a.base is None else a.copy()

def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """
    Subtract each row's mean from `XA` and `XB`, then hand the centered
    arrays, `dm` and any keyword arguments to the cosine ``cdist`` wrapper.
    """
    # NOTE(review): `dm` is presumably the output buffer filled by the
    # compiled routine -- confirm against `_distance_wrap`.
    row_means_a = XA.mean(axis=1, keepdims=True)
    row_means_b = XB.mean(axis=1, keepdims=True)
    centered_a = XA - row_means_a
    centered_b = XB - row_means_b
    _distance_wrap.cdist_cosine_double_wrap(centered_a, centered_b, dm,
                                            **kwargs)

def _correlation_pdist_wrap(X, dm, **kwargs):
    """
    Subtract each row's mean from `X`, then hand the centered array, `dm`
    and any keyword arguments to the cosine ``pdist`` wrapper.
    """
    # NOTE(review): `dm` is presumably the output buffer filled by the
    # compiled routine -- confirm against `_distance_wrap`.
    row_means = X.mean(axis=1, keepdims=True)
    centered = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centered, dm, **kwargs)

def _convert_to_type(X, out_type):
    """Return `X` as a C-contiguous array whose dtype is `out_type`."""
    converted = np.ascontiguousarray(X, dtype=out_type)
    return converted

def _filter_deprecated_kwargs(kwargs, args_blacklist):
    """
    Remove every key listed in `args_blacklist` from `kwargs` in place,
    issuing one ``DeprecationWarning`` per removed key.
    """
    # Filtering out old default keywords
    for name in args_blacklist:
        if name not in kwargs:
            continue
        del kwargs[name]
        warnings.warn('Got unexpected kwarg %s. This will raise an error'
                      ' in a future version.' % name, DeprecationWarning)

def _nbool_correspond_all(u, v, w=None):
    """
    Tally the four False/True correspondences between `u` and `v`.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. When both are boolean and `w` is None the tallies
        are computed with bitwise operations; otherwise both are cast to a
        common numeric dtype and the tallies are sums of products.
    w : ndarray, optional
        Weights multiplied into the `u` side of every product.

    Returns
    -------
    (nff, nft, ntf, ntt) : tuple
        Sums over positions of ``(1-u)*(1-v)``, ``(1-u)*v``, ``u*(1-v)``
        and ``u*v`` respectively (weighted by `w` when given).
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nff = (not_u & not_v).sum()
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
        ntt = (u & v).sum()
    else:
        # ``np.find_common_type`` is deprecated (NumPy 1.25) and removed in
        # NumPy 2.0; ``np.result_type`` gives the same promotion here and
        # still guarantees at least an integer dtype.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weighting only one operand of each product weights the
            # product exactly once.
            not_u = w * not_u
            u = w * u
        nff = (not_u * not_v).sum()
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
        ntt = (u * v).sum()
    return (nff, nft, ntf, ntt)

def _nbool_correspond_ft_tf(u, v, w=None):
    """
    Tally the False/True and True/False correspondences between `u` and `v`.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. When both are boolean and `w` is None the tallies
        are computed with bitwise operations; otherwise both are cast to a
        common numeric dtype and the tallies are sums of products.
    w : ndarray, optional
        Weights multiplied into the `u` side of every product.

    Returns
    -------
    (nft, ntf) : tuple
        Sums over positions of ``(1-u)*v`` and ``u*(1-v)`` respectively
        (weighted by `w` when given).
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
    else:
        # ``np.find_common_type`` is deprecated (NumPy 1.25) and removed in
        # NumPy 2.0; ``np.result_type`` gives the same promotion here and
        # still guarantees at least an integer dtype.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weighting only one operand of each product weights the
            # product exactly once.
            not_u = w * not_u
            u = w * u
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
    return (nft, ntf)

def _validate_cdist_input(XA, XB, mA, mB, n, metric_name, **kwargs):
    """
    Coerce `XA`/`XB` to a dtype supported by the named metric and run the
    metric's keyword validator, if it has one.

    Returns ``(XA, XB, typ, kwargs)``. When `metric_name` is None the
    inputs are returned unchanged and ``typ`` is None.
    """
    if metric_name is None:
        return XA, XB, None, kwargs

    info = _METRICS[metric_name]
    supported = info.types
    # Keep the dtype of XA when the metric supports it; otherwise fall back
    # to the first supported type.
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]

    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    check_kwargs = info.validator
    if check_kwargs:
        # The validator sees both collections stacked as one sample.
        stacked = np.vstack([XA, XB])
        kwargs = check_kwargs(stacked, mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs

def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
    """
    Ensure ``kwargs['VI']`` is populated for the Mahalanobis metric.

    When ``VI`` is absent or None it is computed from `X` as the transposed
    inverse of the covariance matrix; this requires ``m > n`` and raises
    ``ValueError`` otherwise. The value is then passed through
    `_convert_to_double` and the updated `kwargs` is returned.
    """
    inv_cov = kwargs.pop('VI', None)
    if inv_cov is None:
        if m <= n:
            # There are fewer observations than the dimension of
            # the observations.
            raise ValueError("The number of observations (%d) is too "
                             "small; the covariance matrix is "
                             "singular. For observations with %d "
                             "dimensions, at least %d observations "
                             "are required." % (m, n, n + 1))
        sample_cov = np.cov(X.astype(np.double).T)
        inv_cov = np.linalg.inv(np.atleast_2d(sample_cov)).T.copy()
    kwargs["VI"] = _convert_to_double(inv_cov)
    return kwargs

def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """Return `kwargs` with ``p`` defaulting to 2.0 when it was not given."""
    kwargs.setdefault('p', 2.)
    return kwargs

def _validate_pdist_input(X, m, n, metric_name, **kwargs):
    """
    Coerce `X` to a dtype supported by the named metric and run the
    metric's keyword validator, if it has one.

    Returns ``(X, typ, kwargs)``. When `metric_name` is None the input is
    returned unchanged and ``typ`` is None.
    """
    if metric_name is None:
        return X, None, kwargs

    info = _METRICS[metric_name]
    supported = info.types
    # Keep the dtype of X when the metric supports it; otherwise fall back
    # to the first supported type.
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]

    X = _convert_to_type(X, out_type=typ)

    check_kwargs = info.validator
    if check_kwargs:
        kwargs = check_kwargs(X, m, n, **kwargs)
    return X, typ, kwargs

def _validate_seuclidean_kwargs(X, m, n, **kwargs):
    """
    Ensure ``kwargs['V']`` holds a variance vector for the standardized
    Euclidean metric.

    A user-supplied ``V`` must be a one-dimensional array of doubles with
    `n` entries (``TypeError``/``ValueError`` otherwise). When ``V`` is
    absent or None it is computed as the per-column sample variance
    (``ddof=1``) of `X`. The value is then passed through
    `_convert_to_double` and the updated `kwargs` is returned.
    """
    variances = kwargs.pop('V', None)
    if variances is not None:
        variances = np.asarray(variances, order='c')
        if variances.dtype != np.double:
            raise TypeError('Variance vector V must contain doubles.')
        if variances.ndim != 1:
            raise ValueError('Variance vector V must '
                             'be one-dimensional.')
        if variances.shape[0] != n:
            raise ValueError('Variance vector V must be of the same '
                             'dimension as the vectors on which the distances '
                             'are computed.')
    else:
        variances = np.var(X.astype(np.double), axis=0, ddof=1)
    kwargs['V'] = _convert_to_double(variances)
    return kwargs

def _validate_vector(u, dtype=None):
    """
    Convert `u` to an array of the requested `dtype`, drop length-one axes
    and return it as a 1-D array; raise ``ValueError`` if more than one
    dimension remains.
    """
    # XXX Is order='c' really necessary?
    vec = np.asarray(u, dtype=dtype, order='c')
    # ``atleast_1d`` makes scalar-like inputs (u=1, u=[1]) come back 1-D
    # after the squeeze collapsed them to 0-D.
    vec = np.atleast_1d(vec.squeeze())
    if vec.ndim > 1:
        raise ValueError("Input vector should be 1-D.")
    return vec

def _validate_weights(w, dtype=np.double):
    """
    Run `w` through `_validate_vector` with the given `dtype` and reject
    any negative entry with ``ValueError``.
    """
    weights = _validate_vector(w, dtype=dtype)
    has_negative = np.any(weights < 0)
    if has_negative:
        raise ValueError("Input weights should be all non-negative")
    return weights

def _validate_wminkowski_kwargs(X, m, n, **kwargs):
    """
    Require a weight vector ``w`` (``ValueError`` when absent or None),
    pass it through `_convert_to_double`, default ``p`` to 2.0 and return
    the updated `kwargs`.
    """
    weights = kwargs.pop('w', None)
    if weights is None:
        raise ValueError('weighted minkowski requires a weight '
                         'vector `w` to be given.')
    kwargs['w'] = _convert_to_double(weights)
    kwargs.setdefault('p', 2.)
    return kwargs

def directed_hausdorff(u, v, seed=0):

"""

Compute the directed Hausdorff distance between two N-D arrays.

Distances between pairs are calculated using a Euclidean metric.

Parameters

----------

u : (M,N) ndarray

Input array.

v : (O,N) ndarray

Input array.

seed : int or None

Local `np.random.RandomState` seed. Default is 0, a random shuffling of

u and v that guarantees reproducibility.

Returns

-------

d : double

The directed Hausdorff distance between arrays `u` and `v`,

index_1 : int

index of point contributing to Hausdorff pair in `u`

index_2 : int

index of point contributing to Hausdorff pair in `v`

Notes

-----

Uses the early break technique and the random sampling approach

described by [1]_. Although worst-case performance is ``O(m * o)``

(as with the brute force algorithm), this is unlikely in practice

as the input data would have to require the algorithm to explore

every single point interaction, and after the algorithm shuffles

the input points at that. The best case performance is O(m), which

is satisfied by selecting an inner loop distance that is less than

cmax and leads to an early break as often as possible. The authors

have formally shown that the average runtime is closer to O(m).

.. versionadded:: 0.19.0

References

----------

.. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for

calculating the exact Hausdorff distance." IEEE Transactions On

Pattern Analysis And Machine Intelligence, vol. 37 pp. 2153-63,

2015.

Coverage for /usr/lib/python3/dist-packages/scipy/spatial/distance.py : 13%

548 statements 69 run 479 missing 58 excluded