# http://pyrocko.org - GPLv3
#
# The Pyrocko Developers, 21st Century
# ---|P------/S----------~Lg----------

5 

6from __future__ import absolute_import, print_function 

7 

8import time 

9import logging 

10from builtins import str as newstr 

11 

12from pyrocko.io.io_common import FileLoadError 

13from pyrocko.progress import progress 

14 

15from .backends import \ 

16 mseed, sac, datacube, stationxml, textfiles, virtual, yaml, tdms_idas 

17 

18from ..model import to_kind_ids 

19 

# Io backend modules known to this dispatcher. The order matters: it is the
# order in which the backends are asked during format detection and in which
# they are registered as providers of a format.
backend_modules = [
    mseed,
    sac,
    datacube,
    stationxml,
    textfiles,
    virtual,
    yaml,
    tdms_idas,
]


# Module-level logger, also handed to the progress tasks created below.
logger = logging.getLogger('psq.io')

25 

26 

def make_task(*args):
    '''
    Create a progress task which reports through this module's logger.

    All positional arguments are handed on unchanged to ``progress.task``.
    '''
    task = progress.task(*args, logger=logger)
    return task

29 

30 

def update_format_providers():
    '''
    Rebuild the global mapping from file format to io backend modules.

    Each module in ``backend_modules`` is asked for the formats it provides.
    For every format, the providing modules are collected in a list, in the
    order in which they appear in ``backend_modules``.
    '''

    global g_format_providers

    g_format_providers = {}
    for backend in backend_modules:
        for provided in backend.provided_formats():
            g_format_providers.setdefault(provided, []).append(backend)

42 

43 

# Mapping from format identifier to the list of backend modules providing that
# format. It is filled once here, when the module is imported.
g_format_providers = dict()
update_format_providers()

46 

47 

class FormatDetectionFailed(FileLoadError):
    '''
    Raised when the format of a file could not be determined.
    '''

    def __init__(self, path):
        # The offending path is embedded into the error message.
        message = 'format detection failed for file: %s' % path
        FileLoadError.__init__(self, message)

56 

57 

class UnknownFormat(Exception):
    '''
    Raised when the user asks for a file format which is not known.
    '''

    def __init__(self, format):
        # The requested format identifier is embedded into the error message.
        message = 'unknown format: %s' % format
        Exception.__init__(self, message)

66 

67 

def get_backend(fmt):
    '''
    Look up the squirrel io backend module responsible for a file format.

    :param fmt:
        Format identifier.
    :type fmt:
        str

    :returns:
        The first backend module registered for ``fmt``.

    :raises UnknownFormat:
        If no backend is registered for ``fmt``.
    '''

    try:
        providers = g_format_providers[fmt]
    except KeyError:
        raise UnknownFormat(fmt)

    return providers[0]

82 

83 

def detect_format(path):
    '''
    Determine the type of a file by looking at its first 512 bytes.

    Paths starting with ``'virtual:'`` are reported as ``'virtual'`` without
    touching the file system. Otherwise the backends in ``backend_modules``
    are asked in turn and the first format reported is returned.

    :param path:
        Path to file.
    :type path:
        str

    :returns:
        Format identifier.

    :raises FormatDetectionFailed:
        If the file cannot be opened or read, or if no backend recognizes it.
    '''

    if path.startswith('virtual:'):
        return 'virtual'

    try:
        with open(path, 'rb') as f:
            header = f.read(512)

    except (OSError, IOError):
        raise FormatDetectionFailed(path)

    for backend in backend_modules:
        detected = backend.detect(header)
        if detected is not None:
            return detected

    raise FormatDetectionFailed(path)

111 

112 

def supported_formats():
    '''
    Get the sorted list of file format identifiers supported by Squirrel.
    '''
    # Iterating the mapping yields its keys, i.e. the format identifiers.
    return sorted(g_format_providers)

118 

119 

# Content kinds offered through Squirrel. This list is also the default value
# of the ``content`` argument of ``iload``, so it must not be modified.
g_content_kinds = ['waveform', 'station', 'channel', 'response', 'event']


def supported_content_kinds():
    '''
    Get list of supported content kinds offered through Squirrel.

    :returns:
        A new :py:class:`list` of :py:class:`str` on every call. Handing out
        a copy keeps callers who modify the result from altering the
        module-level ``g_content_kinds``.
    '''
    return list(g_content_kinds)

128 

129 

def iload(
        paths,
        segment=None,
        format='detect',
        database=None,
        check=True,
        skip_unchanged=False,
        content=g_content_kinds,
        show_progress=True):

    '''
    Iteratively load content or index/reindex meta-information from files.

    :param paths:
        Iterator yielding file names to load from or a Squirrel selection
        object providing the file names. A single file name given as a string
        is accepted, too.
    :type paths:
        iterator yielding :py:class:`str` or
        :py:class:`~pyrocko.squirrel.selection.Selection`

    :param segment:
        File-specific segment identifier (can only be used when loading from a
        single file).
    :type segment:
        int

    :param format:
        File format identifier or ``'detect'`` for autodetection. When loading
        from a selection, per-file format assignation is taken from the hint in
        the selection and this flag is ignored.
    :type format:
        str

    :param database:
        Database to use for meta-information caching. When loading from a
        selection, this should be ``None`` and the database from the selection
        is used.
    :type database:
        :py:class:`~pyrocko.squirrel.database.Database`

    :param check:
        If ``True``, investigate modification time and file sizes of known
        files to debunk modified files (pessimistic mode), or ``False`` to
        deactivate checks (optimistic mode).
    :type check:
        bool

    :param skip_unchanged:
        If ``True``, only yield index nuts for new / modified files. Requires
        ``paths`` to be a selection.
    :type skip_unchanged:
        bool

    :param content:
        Selection of content types to load.
    :type content:
        :py:class:`list` of :py:class:`str`

    :param show_progress:
        If ``True``, report progress through a progress task.
    :type show_progress:
        bool

    :raises TypeError:
        If ``segment`` is given together with anything but a single file name,
        if ``database`` is given together with a selection, or if
        ``skip_unchanged`` is set without a selection.

    This generator yields :py:class:`~pyrocko.squirrel.model.Nut` objects for
    individual pieces of information found when reading the given files. Such a
    nut may represent a waveform, a station, a channel, an event or other data
    type. The nut itself only contains the meta-information. The actual content
    information is attached to the nut if requested. All nut meta-information
    is stored in the squirrel meta-information database. If possible, this
    function avoids accessing the actual disk files and provides the requested
    information straight from the database. Modified files are recognized and
    reindexed as needed.

    Files which cannot be read (:py:exc:`FileLoadError`) are logged and
    skipped; if a database is in use, its entries for such a file are reset.
    '''

    from ..selection import Selection

    n_db = 0
    n_load = 0
    selection = None
    kind_ids = to_kind_ids(content)

    if isinstance(paths, (str, newstr)):
        paths = [paths]
    elif segment is not None:
        raise TypeError(
            'iload: segment argument can only be used when loading from '
            'a single file')

    if isinstance(paths, Selection):
        selection = paths
        if database is not None:
            raise TypeError(
                'iload: database argument must be None when called with a '
                'selection')

        database = selection.get_database()

    if skip_unchanged and not isinstance(paths, Selection):
        raise TypeError(
            'iload: need selection when called with "skip_unchanged=True"')

    temp_selection = None
    if database:
        if not selection:
            # No selection given by the caller: create one for the given
            # paths, so that cached information can be looked up.
            temp_selection = database.new_selection(
                paths, show_progress=show_progress)

            selection = temp_selection

        if skip_unchanged:
            selection.flag_modified(check)
            it = selection.undig_grouped(skip_unchanged=True)
        else:
            it = selection.undig_grouped()

    else:
        # Without a database nothing is cached: no old nuts for any file.
        it = (((format, path), []) for path in paths)

    try:
        n_files_total = len(it)
    except TypeError:
        # Iterators without a length: the total number of files is unknown.
        n_files_total = None

    task = None
    if show_progress:
        if not kind_ids:
            task = make_task('Indexing files', n_files_total)
        else:
            task = make_task('Loading files', n_files_total)

    n_files = 0
    tcommit = time.time()
    if database:
        transaction = database.transaction()
        transaction.begin()

    database_modified = False
    clean = False
    try:
        for (format, path), old_nuts in it:
            if task is not None:
                condition = '(nuts: %i from file, %i from cache)\n %s' % (
                    n_load, n_db, path)
                task.update(n_files, condition)

            n_files += 1
            if database and database_modified:
                # Commit intermediate results every 20 seconds or every
                # 1000 files.
                tnow = time.time()
                if tnow - tcommit > 20. or n_files % 1000 == 0:
                    transaction.commit()
                    tcommit = tnow
                    transaction.begin()

            try:
                if check and old_nuts and old_nuts[0].file_modified():
                    # Cached information is outdated, force a reload.
                    old_nuts = []

                if segment is not None:
                    old_nuts = [
                        nut for nut in old_nuts if nut.file_segment == segment]

                if old_nuts:
                    db_only_operation = not kind_ids or all(
                        nut.kind_id in kind_ids and nut.content_in_db
                        for nut in old_nuts)

                    if db_only_operation:
                        # Everything requested is available from the
                        # database, the file itself need not be accessed.
                        for nut in old_nuts:
                            if nut.kind_id in kind_ids:
                                database.undig_content(nut)

                            n_db += 1
                            yield nut

                        continue

                if format == 'detect':
                    if old_nuts and not old_nuts[0].file_modified():
                        format_this = old_nuts[0].file_format
                    else:
                        format_this = detect_format(path)
                else:
                    format_this = format

                mod = get_backend(format_this)
                mtime, size = mod.get_stats(path)

                logger.debug('reading file %s', path)
                nuts = []
                for nut in mod.iload(format_this, path, segment, content):
                    nut.file_path = path
                    nut.file_format = format_this
                    nut.file_mtime = mtime
                    nut.file_size = size
                    if nut.content is not None:
                        nut.content._squirrel_key = nut.key

                    nuts.append(nut)
                    n_load += 1
                    yield nut

                if database and nuts != old_nuts:
                    if segment is not None:
                        # Only one segment has been read above, but the index
                        # must cover the whole file: read it again without
                        # segment restriction and without content. The result
                        # must be materialized as a list, because it is
                        # iterated here and then again by database.dig(). A
                        # one-shot iterator would already be exhausted by
                        # then, so that nothing would get indexed.
                        nuts = list(mod.iload(format_this, path, None, []))
                        for nut in nuts:
                            nut.file_path = path
                            nut.file_format = format_this
                            nut.file_mtime = mtime
                            nut.file_size = size

                    database.dig(nuts, transaction=transaction)
                    database_modified = True

            except FileLoadError:
                logger.error('Cannot read file: %s', path)
                if database:
                    database.reset(path, transaction=transaction)
                    database_modified = True

        # Only reached when the loop has run to completion.
        clean = True

    finally:
        if task is not None:
            condition = '(nuts: %i from file, %i from cache)' % (n_load, n_db)
            task.update(n_files, condition)
            if clean:
                task.done(condition)
            else:
                task.fail(condition + ' terminated')

        if database:
            transaction.commit()
            transaction.close()

        if temp_selection:
            del temp_selection

364 

365 

# Public names of this module.
__all__ = [
    'iload', 'detect_format', 'supported_formats', 'supported_content_kinds',
    'get_backend', 'FormatDetectionFailed', 'UnknownFormat']