1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

""" 

A buffered iterator for big arrays. 

 

This module solves the problem of iterating over a big file-based array 

without having to read it into memory. The `Arrayterator` class wraps 

an array object, and when iterated it will return sub-arrays with at most 

a user-specified number of elements. 

 

""" 

from __future__ import division, absolute_import, print_function 

 

import numbers
from operator import mul
from functools import reduce

from numpy.compat import long

 

__all__ = ['Arrayterator'] 

 

 

class Arrayterator(object): 

""" 

Buffered iterator for big arrays. 

 

`Arrayterator` creates a buffered iterator for reading big arrays in small 

contiguous blocks. The class is useful for objects stored in the 

file system. It allows iteration over the object *without* reading 

everything in memory; instead, small blocks are read and iterated over. 

 

`Arrayterator` can be used with any object that supports multidimensional 

slices. This includes NumPy arrays, but also variables from 

Scientific.IO.NetCDF or pynetcdf for example. 

 

Parameters 

---------- 

var : array_like 

The object to iterate over. 

buf_size : int, optional 

The buffer size. If `buf_size` is supplied, the maximum amount of 

data that will be read into memory is `buf_size` elements. 

Default is None, which will read as many element as possible 

into memory. 

 

Attributes 

---------- 

var 

buf_size 

start 

stop 

step 

shape 

flat 

 

See Also 

-------- 

ndenumerate : Multidimensional array iterator. 

flatiter : Flat array iterator. 

memmap : Create a memory-map to an array stored in a binary file on disk. 

 

Notes 

----- 

The algorithm works by first finding a "running dimension", along which 

the blocks will be extracted. Given an array of dimensions 

``(d1, d2, ..., dn)``, e.g. if `buf_size` is smaller than ``d1``, the 

first dimension will be used. If, on the other hand, 

``d1 < buf_size < d1*d2`` the second dimension will be used, and so on. 

Blocks are extracted along this dimension, and when the last block is 

returned the process continues from the next dimension, until all 

elements have been read. 

 

Examples 

-------- 

>>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6) 

>>> a_itor = np.lib.Arrayterator(a, 2) 

>>> a_itor.shape 

(3, 4, 5, 6) 

 

Now we can iterate over ``a_itor``, and it will return arrays of size 

two. Since `buf_size` was smaller than any dimension, the first 

dimension will be iterated over first: 

 

>>> for subarr in a_itor: 

... if not subarr.all(): 

... print(subarr, subarr.shape) 

... 

[[[[0 1]]]] (1, 1, 1, 2) 

 

""" 

 

def __init__(self, var, buf_size=None): 

self.var = var 

self.buf_size = buf_size 

 

self.start = [0 for dim in var.shape] 

self.stop = [dim for dim in var.shape] 

self.step = [1 for dim in var.shape] 

 

def __getattr__(self, attr): 

return getattr(self.var, attr) 

 

def __getitem__(self, index): 

""" 

Return a new arrayterator. 

 

""" 

# Fix index, handling ellipsis and incomplete slices. 

if not isinstance(index, tuple): 

index = (index,) 

fixed = [] 

length, dims = len(index), self.ndim 

for slice_ in index: 

if slice_ is Ellipsis: 

fixed.extend([slice(None)] * (dims-length+1)) 

length = len(fixed) 

elif isinstance(slice_, (int, long)): 

fixed.append(slice(slice_, slice_+1, 1)) 

else: 

fixed.append(slice_) 

index = tuple(fixed) 

if len(index) < dims: 

index += (slice(None),) * (dims-len(index)) 

 

# Return a new arrayterator object. 

out = self.__class__(self.var, self.buf_size) 

for i, (start, stop, step, slice_) in enumerate( 

zip(self.start, self.stop, self.step, index)): 

out.start[i] = start + (slice_.start or 0) 

out.step[i] = step * (slice_.step or 1) 

out.stop[i] = start + (slice_.stop or stop-start) 

out.stop[i] = min(stop, out.stop[i]) 

return out 

 

def __array__(self): 

""" 

Return corresponding data. 

 

""" 

slice_ = tuple(slice(*t) for t in zip( 

self.start, self.stop, self.step)) 

return self.var[slice_] 

 

@property 

def flat(self): 

""" 

A 1-D flat iterator for Arrayterator objects. 

 

This iterator returns elements of the array to be iterated over in 

`Arrayterator` one by one. It is similar to `flatiter`. 

 

See Also 

-------- 

Arrayterator 

flatiter 

 

Examples 

-------- 

>>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6) 

>>> a_itor = np.lib.Arrayterator(a, 2) 

 

>>> for subarr in a_itor.flat: 

... if not subarr: 

... print(subarr, type(subarr)) 

... 

0 <type 'numpy.int32'> 

 

""" 

for block in self: 

for value in block.flat: 

yield value 

 

@property 

def shape(self): 

""" 

The shape of the array to be iterated over. 

 

For an example, see `Arrayterator`. 

 

""" 

return tuple(((stop-start-1)//step+1) for start, stop, step in 

zip(self.start, self.stop, self.step)) 

 

def __iter__(self): 

# Skip arrays with degenerate dimensions 

if [dim for dim in self.shape if dim <= 0]: 

return 

 

start = self.start[:] 

stop = self.stop[:] 

step = self.step[:] 

ndims = self.var.ndim 

 

while True: 

count = self.buf_size or reduce(mul, self.shape) 

 

# iterate over each dimension, looking for the 

# running dimension (ie, the dimension along which 

# the blocks will be built from) 

rundim = 0 

for i in range(ndims-1, -1, -1): 

# if count is zero we ran out of elements to read 

# along higher dimensions, so we read only a single position 

if count == 0: 

stop[i] = start[i]+1 

elif count <= self.shape[i]: 

# limit along this dimension 

stop[i] = start[i] + count*step[i] 

rundim = i 

else: 

# read everything along this dimension 

stop[i] = self.stop[i] 

stop[i] = min(self.stop[i], stop[i]) 

count = count//self.shape[i] 

 

# yield a block 

slice_ = tuple(slice(*t) for t in zip(start, stop, step)) 

yield self.var[slice_] 

 

# Update start position, taking care of overflow to 

# other dimensions 

start[rundim] = stop[rundim] # start where we stopped 

for i in range(ndims-1, 0, -1): 

if start[i] >= self.stop[i]: 

start[i] = self.start[i] 

start[i-1] += self.step[i-1] 

if start[0] >= self.stop[0]: 

return