1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

""" Sketching-based Matrix Computations """ 

 

# Author: Jordi Montes <jomsdev@gmail.com> 

# August 28, 2017 

 

from __future__ import division, print_function, absolute_import 

 

import numpy as np 

 

from scipy._lib._util import check_random_state 

 

__all__ = ['clarkson_woodruff_transform'] 

 

 

def cwt_matrix(n_rows, n_columns, seed=None): 

r"""" 

Generate a matrix S for the Clarkson-Woodruff sketch. 

 

Given the desired size of matrix, the method returns a matrix S of size 

(n_rows, n_columns) where each column has all the entries set to 0 less one 

position which has been randomly set to +1 or -1 with equal probability. 

 

Parameters 

---------- 

n_rows: int 

Number of rows of S 

n_columns: int 

Number of columns of S 

seed : None or int or `numpy.random.RandomState` instance, optional 

This parameter defines the ``RandomState`` object to use for drawing 

random variates. 

If None (or ``np.random``), the global ``np.random`` state is used. 

If integer, it is used to seed the local ``RandomState`` instance. 

Default is None. 

 

Returns 

------- 

S : (n_rows, n_columns) array_like 

 

Notes 

----- 

Given a matrix A, with probability at least 9/10, 

.. math:: ||SA|| == (1 \pm \epsilon)||A|| 

Where epsilon is related to the size of S 

""" 

S = np.zeros((n_rows, n_columns)) 

nz_positions = np.random.randint(0, n_rows, n_columns) 

rng = check_random_state(seed) 

values = rng.choice([1, -1], n_columns) 

for i in range(n_columns): 

S[nz_positions[i]][i] = values[i] 

 

return S 

 

 

def clarkson_woodruff_transform(input_matrix, sketch_size, seed=None): 

r"""" 

Find low-rank matrix approximation via the Clarkson-Woodruff Transform. 

 

Given an input_matrix ``A`` of size ``(n, d)``, compute a matrix ``A'`` of 

size (sketch_size, d) which holds: 

 

.. math:: ||Ax|| = (1 \pm \epsilon)||A'x|| 

 

with high probability. 

 

The error is related to the number of rows of the sketch and it is bounded 

 

.. math:: poly(r(\epsilon^{-1})) 

 

Parameters 

---------- 

input_matrix: array_like 

Input matrix, of shape ``(n, d)``. 

sketch_size: int 

Number of rows for the sketch. 

seed : None or int or `numpy.random.RandomState` instance, optional 

This parameter defines the ``RandomState`` object to use for drawing 

random variates. 

If None (or ``np.random``), the global ``np.random`` state is used. 

If integer, it is used to seed the local ``RandomState`` instance. 

Default is None. 

 

Returns 

------- 

A' : array_like 

Sketch of the input matrix ``A``, of size ``(sketch_size, d)``. 

 

Notes 

----- 

This is an implementation of the Clarkson-Woodruff Transform (CountSketch). 

``A'`` can be computed in principle in ``O(nnz(A))`` (with ``nnz`` meaning 

the number of nonzero entries), however we don't take advantage of sparse 

matrices in this implementation. 

 

Examples 

-------- 

Given a big dense matrix ``A``: 

 

>>> from scipy import linalg 

>>> n_rows, n_columns, sketch_n_rows = (2000, 100, 100) 

>>> threshold = 0.1 

>>> tmp = np.random.normal(0, 0.1, n_rows*n_columns) 

>>> A = np.reshape(tmp, (n_rows, n_columns)) 

>>> sketch = linalg.clarkson_woodruff_transform(A, sketch_n_rows) 

>>> sketch.shape 

(100, 100) 

>>> normA = linalg.norm(A) 

>>> norm_sketch = linalg.norm(sketch) 

 

Now with high probability, the condition ``abs(normA-normSketch) < 

threshold`` holds. 

 

References 

---------- 

.. [1] Kenneth L. Clarkson and David P. Woodruff. Low rank approximation and 

regression in input sparsity time. In STOC, 2013. 

 

""" 

S = cwt_matrix(sketch_size, input_matrix.shape[0], seed) 

return np.dot(S, input_matrix)