1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

"""Some functions for working with contingency tables (i.e. cross tabulations). 

""" 

 

 

from __future__ import division, print_function, absolute_import 

 

from functools import reduce 

import numpy as np 

from .stats import power_divergence 

 

 

__all__ = ['margins', 'expected_freq', 'chi2_contingency'] 

 

 

def margins(a): 

"""Return a list of the marginal sums of the array `a`. 

 

Parameters 

---------- 

a : ndarray 

The array for which to compute the marginal sums. 

 

Returns 

------- 

margsums : list of ndarrays 

A list of length `a.ndim`. `margsums[k]` is the result 

of summing `a` over all axes except `k`; it has the same 

number of dimensions as `a`, but the length of each axis 

except axis `k` will be 1. 

 

Examples 

-------- 

>>> a = np.arange(12).reshape(2, 6) 

>>> a 

array([[ 0, 1, 2, 3, 4, 5], 

[ 6, 7, 8, 9, 10, 11]]) 

>>> m0, m1 = margins(a) 

>>> m0 

array([[15], 

[51]]) 

>>> m1 

array([[ 6, 8, 10, 12, 14, 16]]) 

 

>>> b = np.arange(24).reshape(2,3,4) 

>>> m0, m1, m2 = margins(b) 

>>> m0 

array([[[ 66]], 

[[210]]]) 

>>> m1 

array([[[ 60], 

[ 92], 

[124]]]) 

>>> m2 

array([[[60, 66, 72, 78]]]) 

""" 

margsums = [] 

ranged = list(range(a.ndim)) 

for k in ranged: 

marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k]) 

margsums.append(marg) 

return margsums 

 

 

def expected_freq(observed): 

""" 

Compute the expected frequencies from a contingency table. 

 

Given an n-dimensional contingency table of observed frequencies, 

compute the expected frequencies for the table based on the marginal 

sums under the assumption that the groups associated with each 

dimension are independent. 

 

Parameters 

---------- 

observed : array_like 

The table of observed frequencies. (While this function can handle 

a 1-D array, that case is trivial. Generally `observed` is at 

least 2-D.) 

 

Returns 

------- 

expected : ndarray of float64 

The expected frequencies, based on the marginal sums of the table. 

Same shape as `observed`. 

 

Examples 

-------- 

>>> observed = np.array([[10, 10, 20],[20, 20, 20]]) 

>>> from scipy.stats import expected_freq 

>>> expected_freq(observed) 

array([[ 12., 12., 16.], 

[ 18., 18., 24.]]) 

 

""" 

# Typically `observed` is an integer array. If `observed` has a large 

# number of dimensions or holds large values, some of the following 

# computations may overflow, so we first switch to floating point. 

observed = np.asarray(observed, dtype=np.float64) 

 

# Create a list of the marginal sums. 

margsums = margins(observed) 

 

# Create the array of expected frequencies. The shapes of the 

# marginal sums returned by apply_over_axes() are just what we 

# need for broadcasting in the following product. 

d = observed.ndim 

expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1) 

return expected 

 

 

def chi2_contingency(observed, correction=True, lambda_=None): 

"""Chi-square test of independence of variables in a contingency table. 

 

This function computes the chi-square statistic and p-value for the 

hypothesis test of independence of the observed frequencies in the 

contingency table [1]_ `observed`. The expected frequencies are computed 

based on the marginal sums under the assumption of independence; see 

`scipy.stats.contingency.expected_freq`. The number of degrees of 

freedom is (expressed using numpy functions and attributes):: 

 

dof = observed.size - sum(observed.shape) + observed.ndim - 1 

 

 

Parameters 

---------- 

observed : array_like 

The contingency table. The table contains the observed frequencies 

(i.e. number of occurrences) in each category. In the two-dimensional 

case, the table is often described as an "R x C table". 

correction : bool, optional 

If True, *and* the degrees of freedom is 1, apply Yates' correction 

for continuity. The effect of the correction is to adjust each 

observed value by 0.5 towards the corresponding expected value. 

lambda_ : float or str, optional. 

By default, the statistic computed in this test is Pearson's 

chi-squared statistic [2]_. `lambda_` allows a statistic from the 

Cressie-Read power divergence family [3]_ to be used instead. See 

`power_divergence` for details. 

 

Returns 

------- 

chi2 : float 

The test statistic. 

p : float 

The p-value of the test 

dof : int 

Degrees of freedom 

expected : ndarray, same shape as `observed` 

The expected frequencies, based on the marginal sums of the table. 

 

See Also 

-------- 

contingency.expected_freq 

fisher_exact 

chisquare 

power_divergence 

 

Notes 

----- 

An often quoted guideline for the validity of this calculation is that 

the test should be used only if the observed and expected frequencies 

in each cell are at least 5. 

 

This is a test for the independence of different categories of a 

population. The test is only meaningful when the dimension of 

`observed` is two or more. Applying the test to a one-dimensional 

table will always result in `expected` equal to `observed` and a 

chi-square statistic equal to 0. 

 

This function does not handle masked arrays, because the calculation 

does not make sense with missing values. 

 

Like stats.chisquare, this function computes a chi-square statistic; 

the convenience this function provides is to figure out the expected 

frequencies and degrees of freedom from the given contingency table. 

If these were already known, and if the Yates' correction was not 

required, one could use stats.chisquare. That is, if one calls:: 

 

chi2, p, dof, ex = chi2_contingency(obs, correction=False) 

 

then the following is true:: 

 

(chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(), 

ddof=obs.size - 1 - dof) 

 

The `lambda_` argument was added in version 0.13.0 of scipy. 

 

References 

---------- 

.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table 

.. [2] "Pearson's chi-squared test", 

http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test 

.. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit 

Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), 

pp. 440-464. 

 

Examples 

-------- 

A two-way example (2 x 3): 

 

>>> from scipy.stats import chi2_contingency 

>>> obs = np.array([[10, 10, 20], [20, 20, 20]]) 

>>> chi2_contingency(obs) 

(2.7777777777777777, 

0.24935220877729619, 

2, 

array([[ 12., 12., 16.], 

[ 18., 18., 24.]])) 

 

Perform the test using the log-likelihood ratio (i.e. the "G-test") 

instead of Pearson's chi-squared statistic. 

 

>>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood") 

>>> g, p 

(2.7688587616781319, 0.25046668010954165) 

 

A four-way example (2 x 2 x 2 x 2): 

 

>>> obs = np.array( 

... [[[[12, 17], 

... [11, 16]], 

... [[11, 12], 

... [15, 16]]], 

... [[[23, 15], 

... [30, 22]], 

... [[14, 17], 

... [15, 16]]]]) 

>>> chi2_contingency(obs) 

(8.7584514426741897, 

0.64417725029295503, 

11, 

array([[[[ 14.15462386, 14.15462386], 

[ 16.49423111, 16.49423111]], 

[[ 11.2461395 , 11.2461395 ], 

[ 13.10500554, 13.10500554]]], 

[[[ 19.5591166 , 19.5591166 ], 

[ 22.79202844, 22.79202844]], 

[[ 15.54012004, 15.54012004], 

[ 18.10873492, 18.10873492]]]])) 

""" 

observed = np.asarray(observed) 

if np.any(observed < 0): 

raise ValueError("All values in `observed` must be nonnegative.") 

if observed.size == 0: 

raise ValueError("No data; `observed` has size 0.") 

 

expected = expected_freq(observed) 

if np.any(expected == 0): 

# Include one of the positions where expected is zero in 

# the exception message. 

zeropos = list(zip(*np.where(expected == 0)))[0] 

raise ValueError("The internally computed table of expected " 

"frequencies has a zero element at %s." % (zeropos,)) 

 

# The degrees of freedom 

dof = expected.size - sum(expected.shape) + expected.ndim - 1 

 

if dof == 0: 

# Degenerate case; this occurs when `observed` is 1D (or, more 

# generally, when it has only one nontrivial dimension). In this 

# case, we also have observed == expected, so chi2 is 0. 

chi2 = 0.0 

p = 1.0 

else: 

if dof == 1 and correction: 

# Adjust `observed` according to Yates' correction for continuity. 

observed = observed + 0.5 * np.sign(expected - observed) 

 

chi2, p = power_divergence(observed, expected, 

ddof=observed.size - 1 - dof, axis=None, 

lambda_=lambda_) 

 

return chi2, p, dof, expected