1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

# http://pyrocko.org - GPLv3 

# 

# The Pyrocko Developers, 21st Century 

# ---|P------/S----------~Lg---------- 

 

from __future__ import absolute_import, print_function 

 

import time 

import logging 

from builtins import str as newstr 

 

from pyrocko.io_common import FileLoadError 

from pyrocko.progress import progress 

 

from .backends import \ 

mseed, sac, datacube, stationxml, textfiles, virtual, yaml, tdms_idas 

 

from ..model import to_kind_ids 

 

# All io backend modules, in the order in which format auto-detection is
# attempted.
backend_modules = [
    mseed, sac, datacube, stationxml, textfiles, virtual, yaml, tdms_idas]


# Fixed typo in logger name ('sqirrel' -> 'squirrel') so records from this
# module attach to the 'pyrocko.squirrel' logger hierarchy like its siblings.
logger = logging.getLogger('pyrocko.squirrel.io')

 

 

def make_task(*args, **kwargs):
    '''
    Create a progress task which reports through this module's logger.
    '''
    kwargs_with_logger = dict(kwargs, logger=logger)
    return progress.task(*args, **kwargs_with_logger)

 

 

def update_format_providers():
    '''
    Update global mapping from file format to io backend module.

    Rebuilds :py:data:`g_format_providers` from the ``provided_formats()``
    of every module in :py:data:`backend_modules`. A format may be offered
    by multiple backends; the first registered backend wins in
    :py:func:`get_backend`.
    '''

    global g_format_providers
    g_format_providers = {}
    for mod in backend_modules:
        # setdefault replaces the manual "if not in dict" dance; renamed the
        # loop variable to avoid shadowing the builtin `format`.
        for fmt in mod.provided_formats():
            g_format_providers.setdefault(fmt, []).append(mod)

 

 

# Global mapping: format identifier -> list of providing backend modules.
# Populated once at import time; refresh with update_format_providers().
g_format_providers = {}
update_format_providers()

 

 

class FormatDetectionFailed(FileLoadError):
    '''
    Exception raised when file format detection fails.
    '''

    def __init__(self, path):
        message = 'format detection failed for file: %s' % path
        FileLoadError.__init__(self, message)

 

 

class UnknownFormat(Exception):
    '''
    Exception raised when user requests an unknown file format.
    '''

    def __init__(self, format):
        message = 'unknown format: %s' % format
        Exception.__init__(self, message)

 

 

def get_backend(fmt):
    '''
    Get squirrel io backend module for a given file format.

    :param str fmt: format identifier
    :raises: :py:exc:`UnknownFormat` if no backend provides ``fmt``
    '''

    # When several backends provide the same format, the first registered
    # one wins (see update_format_providers).
    try:
        return g_format_providers[fmt][0]
    except KeyError:
        raise UnknownFormat(fmt)

 

 

def detect_format(path):
    '''
    Determine file type from first 512 bytes.

    :param str path: path of file
    :raises: :py:exc:`FormatDetectionFailed` if the file cannot be read or
        no backend recognizes its content
    '''

    # Virtual paths carry their format in the path itself; no file access.
    if path.startswith('virtual:'):
        return 'virtual'

    try:
        with open(path, 'rb') as f:
            data = f.read(512)

    except (OSError, IOError):
        raise FormatDetectionFailed(path)

    # Removed dead pre-loop assignment `fmt = None` (value was never read).
    for mod in backend_modules:
        fmt = mod.detect(data)
        if fmt is not None:
            return fmt

    raise FormatDetectionFailed(path)

 

 

def supported_formats():
    '''
    Get list of file formats supported by Squirrel.
    '''
    format_names = list(g_format_providers)
    format_names.sort()
    return format_names

 

 

# Content kinds which can be held by nuts yielded from squirrel io.
g_content_kinds = ['waveform', 'station', 'channel', 'response', 'event']


def supported_content_kinds():
    '''
    Get list of supported content kinds offered through Squirrel.
    '''
    # Returns the module-level list itself (not a copy).
    return g_content_kinds

 

 

def iload(
        paths,
        segment=None,
        format='detect',
        database=None,
        check=True,
        commit=True,
        skip_unchanged=False,
        content=None):

    '''
    Iteratively load content or index/reindex meta-information from files.

    :param paths: iterator yielding file names to load from or
        :py:class:`pyrocko.squirrel.Selection` object
    :param str segment: file-specific segment identifier (can only be used
        when loading from a single file)
    :param str format: file format identifier or ``'detect'`` for
        autodetection. When loading from a selection, per-file format
        assignment is taken from the hint in the selection and this flag is
        ignored.
    :param database: database to use for meta-information caching
    :type database: :py:class:`pyrocko.squirrel.Database`
    :param bool check: if ``True``, investigate modification time and file
        sizes of known files to debunk modified files (pessimistic mode), or
        ``False`` to deactivate checks (optimistic mode)
    :param bool commit: flag, whether to commit updated information to the
        meta-information database
    :param bool skip_unchanged: if ``True``, only yield index nuts
        for new / modified files
    :param content: list of strings, selection of content types to load;
        by default, all supported content kinds are loaded

    This generator yields :py:class:`pyrocko.squirrel.Nut` objects for
    individual pieces of information found when reading the given files. Such a
    nut may represent a waveform, a station, a channel, an event or other data
    type. The nut itself only contains the meta-information. The actual content
    information is attached to the nut if requested. All nut meta-information
    is stored in the squirrel meta-information database. If possible, this
    function avoids accessing the actual disk files and provides the requested
    information straight from the database. Modified files are recognized and
    reindexed as needed.
    '''

    from ..base import Selection

    # Replaced mutable default argument `content=g_content_kinds` with a
    # None sentinel; default behavior (load all kinds) is unchanged.
    if content is None:
        content = g_content_kinds

    n_db = 0
    n_load = 0
    selection = None
    kind_ids = to_kind_ids(content)

    if isinstance(paths, (str, newstr)):
        paths = [paths]
    else:
        if segment is not None:
            raise TypeError(
                'iload: segment argument can only be used when loading from '
                'a single file')

    if isinstance(paths, Selection):
        selection = paths
        if database is not None:
            raise TypeError(
                'iload: database argument must be None when called with a '
                'selection')

        database = selection.get_database()

    if skip_unchanged and not isinstance(paths, Selection):
        raise TypeError(
            'iload: need selection when called with "skip_unchanged=True"')

    temp_selection = None
    if database:
        if not selection:
            # Wrap plain path lists in a throwaway selection so the cached
            # index in the database can be used.
            temp_selection = database.new_selection(paths)
            selection = temp_selection

        if skip_unchanged:
            selection.flag_modified(check)
            it = selection.undig_grouped(skip_unchanged=True)
        else:
            it = selection.undig_grouped()

    else:
        # No database: nothing cached, every path gets an empty nut list.
        it = (((format, path), []) for path in paths)

    try:
        n_files_total = len(it)
    except TypeError:
        # Generators have no len(); progress runs without a total.
        n_files_total = None

    task = None
    if progress is not None:
        if not kind_ids:
            task = make_task('Indexing files', n_files_total)
        else:
            task = make_task('Loading files', n_files_total)

    n_files = 0
    tcommit = time.time()
    database_modified = False
    for (format, path), old_nuts in it:
        if task is not None:
            condition = '(nuts: %i from file, %i from cache)\n %s' % (
                n_load, n_db, path)
            task.update(n_files, condition)

        n_files += 1
        # Commit periodically so long runs don't accumulate one huge
        # uncommitted transaction.
        if database and commit and database_modified:
            tnow = time.time()
            if tnow - tcommit > 20. or n_files % 1000 == 0:
                database.commit()
                tcommit = tnow

        try:
            if check and old_nuts and old_nuts[0].file_modified():
                # File changed on disk: discard cached index entries.
                old_nuts = []

            if segment is not None:
                old_nuts = [
                    nut for nut in old_nuts if nut.file_segment == segment]

            if old_nuts:
                # If everything requested is already in the database, serve
                # from cache without touching the file.
                db_only_operation = not kind_ids or all(
                    nut.kind_id in kind_ids and nut.content_in_db
                    for nut in old_nuts)

                if db_only_operation:
                    # logger.debug('using cached information for file %s, '
                    #              % path)

                    for nut in old_nuts:
                        if nut.kind_id in kind_ids:
                            database.undig_content(nut)

                        n_db += 1
                        yield nut

                    continue

            if format == 'detect':
                if old_nuts and not old_nuts[0].file_modified():
                    format_this = old_nuts[0].file_format
                else:
                    format_this = detect_format(path)
            else:
                format_this = format

            mod = get_backend(format_this)
            mtime, size = mod.get_stats(path)

            logger.debug('reading file %s' % path)
            nuts = []
            for nut in mod.iload(format_this, path, segment, content):
                nut.file_path = path
                nut.file_format = format_this
                nut.file_mtime = mtime
                nut.file_size = size

                nuts.append(nut)
                n_load += 1
                yield nut

            if database and nuts != old_nuts:
                if segment is not None:
                    # Only one segment was loaded, but the database index
                    # must cover the whole file: re-index without content.
                    nuts = mod.iload(format_this, path, None, [])
                    for nut in nuts:
                        nut.file_path = path
                        nut.file_format = format_this
                        nut.file_mtime = mtime

                database.dig(nuts)
                database_modified = True

        except FileLoadError:
            # Fixed typo in log message: 'occured' -> 'occurred'.
            logger.error('An error occurred while reading file: %s' % path)
            if database:
                database.reset(path)
                database_modified = True

    if task is not None:
        condition = '(nuts: %i from file, %i from cache)' % (n_load, n_db)
        task.update(n_files, condition)
        task.done(condition)

    if database:
        if commit and database_modified:
            database.commit()

        if temp_selection:
            del temp_selection

    logger.debug('iload: from cache: %i, from files: %i, files: %i' % (
        n_db, n_load, n_files))

 

 

# Public API of this module.
__all__ = [
    'iload',
    'detect_format',
    'supported_formats',
    'supported_content_kinds',
    'get_backend',
    'FormatDetectionFailed',
    'UnknownFormat',
]