Coverage for /usr/local/lib/python3.9/dist-packages/pyrocko/squirrel/io/base.py: 98%

1# http://pyrocko.org - GPLv3

3# The Pyrocko Developers, 21st Century

4# ---|P------/S----------~Lg----------

6import time

7import logging

9from pyrocko import util

10from pyrocko.io.io_common import FileLoadError

11from pyrocko.progress import progress

13from .backends import \

14 mseed, sac, datacube, stationxml, textfiles, virtual, yaml, tdms_idas

16from ..model import to_kind_ids

# Backend modules in priority order: ``detect_format`` probes them in this
# sequence and ``get_backend`` picks the first module registered for a format.
backend_modules = [
    mseed, sac, datacube, stationxml, textfiles, virtual, yaml, tdms_idas]

# Logger shared by all functions in this module.
logger = logging.getLogger('psq.io')

def make_task(*args):
    '''
    Create a progress task which reports through this module's logger.

    All positional arguments are forwarded unchanged to ``progress.task``.
    '''
    return progress.task(*args, logger=logger)

def update_format_providers():
    '''
    Update global mapping from file format to io backend module.

    Rebuilds ``g_format_providers`` from ``backend_modules``: every format
    name reported by a module's ``provided_formats()`` is mapped to the list
    of modules providing it, in ``backend_modules`` order.
    '''

    global g_format_providers

    # Build into a local dict and swap it in at the end, so that an error
    # raised by a backend does not leave a half-filled global mapping behind.
    providers = {}
    for mod in backend_modules:
        for fmt in mod.provided_formats():
            providers.setdefault(fmt, []).append(mod)

    g_format_providers = providers


g_format_providers = {}
update_format_providers()

class FormatDetectionFailed(FileLoadError):
    '''
    Exception raised when file format detection fails.

    :param path:
        Path of the file whose format could not be determined. It is
        embedded in the exception message.
    '''

    def __init__(self, path):
        super().__init__('format detection failed for file: %s' % path)

class UnknownFormat(Exception):
    '''
    Exception raised when user requests an unknown file format.

    :param format:
        The format identifier which was requested. It is embedded in the
        exception message.
    '''

    def __init__(self, format):
        super().__init__('unknown format: %s' % format)

def get_backend(fmt):
    '''
    Get squirrel io backend module for a given file format.

    :param fmt:
        Format identifier.
    :type fmt:
        str

    :returns:
        The first backend module registered for ``fmt`` in
        ``g_format_providers``.

    :raises:
        :py:exc:`UnknownFormat` if no backend is registered for ``fmt``.
    '''

    try:
        return g_format_providers[fmt][0]
    except KeyError:
        # The KeyError is an implementation detail of the lookup; hide it
        # from the traceback shown to the user.
        raise UnknownFormat(fmt) from None

def detect_format(path):
    '''
    Determine file type from first 512 bytes.

    :param path:
        Path to file.
    :type path:
        str

    :returns:
        Format identifier. Paths starting with ``'virtual:'`` are reported
        as ``'virtual'`` without touching the file system. Otherwise the
        first non-``None`` answer of the backends' ``detect()`` functions,
        tried in ``backend_modules`` order, is returned.

    :raises:
        :py:exc:`FormatDetectionFailed` if the file cannot be read or if no
        backend recognizes its content.
    '''

    if path.startswith('virtual:'):
        return 'virtual'

    try:
        with open(path, 'rb') as f:
            data = f.read(512)

    except OSError as e:
        # Keep the underlying OS error attached as the cause, so the reason
        # (missing file, permissions, ...) remains visible in the traceback.
        raise FormatDetectionFailed(path) from e

    for mod in backend_modules:
        fmt = mod.detect(data)
        if fmt is not None:
            return fmt

    raise FormatDetectionFailed(path)

109

110

def supported_formats():
    '''
    Get list of file formats supported by Squirrel.

    :returns:
        Sorted list of the format identifiers currently registered in
        ``g_format_providers``.
    '''
    return sorted(g_format_providers)

116

117

# Content kinds loaded by default (see the ``content`` argument of ``iload``).
g_content_kinds = ['waveform', 'station', 'channel', 'response', 'event']


def supported_content_kinds():
    '''
    Get list of supported content kinds offered through Squirrel.

    :returns:
        New list holding the entries of ``g_content_kinds`` followed by
        ``'waveform_promise'``. The global list itself is not modified.
    '''
    return g_content_kinds + ['waveform_promise']

126

127

def iload(
        paths,
        segment=None,
        format='detect',
        database=None,
        check=True,
        skip_unchanged=False,
        content=g_content_kinds,
        show_progress=True,
        update_selection=None):

    '''
    Iteratively load content or index/reindex meta-information from files.

    :param paths:
        Iterator yielding file names to load from or a Squirrel selection
        object providing the file names.
    :type paths:
        iterator yielding :py:class:`str` or
        :py:class:`~pyrocko.squirrel.selection.Selection`

    :param segment:
        File-specific segment identifier (can only be used when loading from a
        single file).
    :type segment:
        int

    :param format:
        File format identifier or ``'detect'`` for autodetection. When loading
        from a selection, per-file format assignation is taken from the hint in
        the selection and this flag is ignored.
    :type format:
        str

    :param database:
        Database to use for meta-information caching. When loading from a
        selection, this should be ``None`` and the database from the selection
        is used.
    :type database:
        :py:class:`~pyrocko.squirrel.database.Database`

    :param check:
        If ``True``, investigate modification time and file sizes of known
        files to debunk modified files (pessimistic mode), or ``False`` to
        deactivate checks (optimistic mode).
    :type check:
        bool

    :param skip_unchanged:
        If ``True``, only yield index nuts for new / modified files.
    :type skip_unchanged:
        bool

    :param content:
        Selection of content types to load.
    :type content:
        :py:class:`list` of :py:class:`str`

    :param show_progress:
        If ``True``, report progress through a task created with
        :py:func:`make_task`. The flag is also passed on when a temporary
        selection has to be created in the database.
    :type show_progress:
        bool

    :param update_selection:
        If not ``None``, this object's ``_set_file_states_force_check`` and
        ``_update_nuts`` methods are called, within the running transaction,
        each time new index information for a file has been stored in the
        database.

    :raises:
        :py:exc:`TypeError` if ``segment`` is given while ``paths`` is not a
        single path string, if ``database`` is given together with a
        selection, or if ``skip_unchanged`` is set without a selection.

    This generator yields :py:class:`~pyrocko.squirrel.model.Nut` objects for
    individual pieces of information found when reading the given files. Such a
    nut may represent a waveform, a station, a channel, an event or other data
    type. The nut itself only contains the meta-information. The actual content
    information is attached to the nut if requested. All nut meta-information
    is stored in the squirrel meta-information database. If possible, this
    function avoids accessing the actual disk files and provides the requested
    information straight from the database. Modified files are recognized and
    reindexed as needed.
    '''

    from ..selection import Selection

    n_db = 0
    n_load = 0
    selection = None
    kind_ids = to_kind_ids(content)

    if isinstance(paths, str):
        paths = [paths]
    else:
        if segment is not None:
            raise TypeError(
                'iload: segment argument can only be used when loading from '
                'a single file')

        if isinstance(paths, Selection):
            selection = paths
            if database is not None:
                raise TypeError(
                    'iload: database argument must be None when called with a '
                    'selection')

            database = selection.get_database()

    if skip_unchanged and not isinstance(paths, Selection):
        raise TypeError(
            'iload: need selection when called with "skip_unchanged=True"')

    temp_selection = None
    transaction = None
    if database:
        if not selection:
            # Avoid creating temporary selection for small batches.
            # this is helpful because then, we can avoid locking the database,
            # e.g. during loading of content, when the content has not been
            # modified.
            paths = util.short_to_list(100, paths)
            if not (isinstance(paths, list) and len(paths) < 100
                    and not skip_unchanged):

                temp_selection = database.new_selection(
                    paths, show_progress=show_progress, format=format)

                selection = temp_selection

        if skip_unchanged:
            selection.flag_modified(check)

        if selection:
            # undig_grouped starts a long select which causes deadlocks
            # when transaction is started after starting the select, therefore
            # the transaction has to be started before in these cases.
            # The db will be locked for a long time in this case. This could be
            # solved either by breaking the indexing into smaller blocks in
            # the caller or by modifying undig_grouped to allow limit and
            # offset and add an outer loop below.
            transaction = database.transaction(
                'update content index')
            transaction.begin()
            it = selection.undig_grouped(skip_unchanged=skip_unchanged)
        else:
            # The list() causes the query to finish, so we don't have to lock,
            # and can start a transaction only when encountering a modified/new
            # file.
            it = list(database.undig_few(paths, format=format))

    else:
        # No database: nothing is cached, every file is read from disk.
        it = (((format, path), []) for path in paths)

    # The total is only known when ``it`` is a list; generators have no len().
    try:
        n_files_total = len(it)
    except TypeError:
        n_files_total = None

    task = None
    if show_progress:
        if not kind_ids:
            task = make_task('Indexing files', n_files_total)
        else:
            task = make_task('Loading files', n_files_total)

    n_files = 0
    tcommit = time.time()

    # Only set to True when the loop below has run to completion; used in the
    # finally clause to tell normal termination from an abort.
    clean = False
    try:
        # Note: ``format`` is rebound here to the per-file format hint
        # delivered by the iterator.
        for (format, path), old_nuts in it:
            if task is not None:
                condition = '(nuts: %i from file, %i from cache)\n %s' % (
                    n_load, n_db, path)
                task.update(n_files, condition)

            n_files += 1
            if database and transaction:
                # Commit periodically: after 20 s or after every 1000 files.
                tnow = time.time()
                if tnow - tcommit > 20. or n_files % 1000 == 0:
                    transaction.commit()
                    tcommit = tnow
                    transaction.begin()

            try:
                if check and old_nuts and old_nuts[0].file_modified():
                    old_nuts = []
                    modified = True
                else:
                    modified = False

                if segment is not None:
                    old_nuts = [
                        nut for nut in old_nuts if nut.file_segment == segment]

                if old_nuts:
                    db_only_operation = not kind_ids or all(
                        nut.kind_id in kind_ids and nut.content_in_db
                        for nut in old_nuts)

                    if db_only_operation:
                        # Everything requested can be served from the
                        # database, the file itself is not read.
                        for nut in old_nuts:
                            if nut.kind_id in kind_ids:
                                database.undig_content(nut)

                            n_db += 1
                            yield nut

                        continue

                if format == 'detect':
                    if old_nuts and not old_nuts[0].file_modified():
                        format_this = old_nuts[0].file_format
                    else:
                        format_this = detect_format(path)
                else:
                    format_this = format

                mod = get_backend(format_this)
                mtime, size = mod.get_stats(path)

                if segment is not None:
                    logger.debug(
                        'Reading file "%s", segment "%s".', path, segment)
                else:
                    logger.debug(
                        'Reading file "%s".', path)

                nuts = []
                for nut in mod.iload(format_this, path, segment, content):
                    nut.file_path = path
                    nut.file_format = format_this
                    nut.file_mtime = mtime
                    nut.file_size = size
                    if nut.content is not None:
                        nut.content._squirrel_key = nut.key

                    nuts.append(nut)
                    n_load += 1
                    yield nut

                if database and nuts != old_nuts:
                    if old_nuts or modified:
                        logger.debug(
                            'File has been modified since last access: %s',
                            path)

                    if segment is not None:
                        # Only one segment was read above; re-read the index
                        # information of the whole file (no content) before
                        # storing it in the database.
                        nuts = list(mod.iload(format_this, path, None, []))
                        for nut in nuts:
                            nut.file_path = path
                            nut.file_format = format_this
                            nut.file_mtime = mtime
                            nut.file_size = size

                    if not transaction:
                        transaction = database.transaction(
                            'update content index')
                        transaction.begin()

                    database.dig(nuts, transaction=transaction)
                    if update_selection is not None:
                        update_selection._set_file_states_force_check(
                            [path], transaction=transaction)
                        update_selection._update_nuts(transaction=transaction)

            except FileLoadError:
                # Unreadable files are logged and skipped; their entry in the
                # database is reset.
                logger.error('Cannot read file: %s', path)
                if database:
                    if not transaction:
                        transaction = database.transaction(
                            'update content index')
                        transaction.begin()
                    database.reset(path, transaction=transaction)

        clean = True

    finally:
        if task is not None:
            condition = '(nuts: %i from file, %i from cache)' % (n_load, n_db)
            task.update(n_files, condition)
            if clean:
                task.done(condition)
            else:
                task.fail(condition + ' terminated')

        if database and transaction:
            transaction.commit()
            transaction.close()

        if temp_selection:
            del temp_selection

408

409

# Names exported as public API of this module.
__all__ = [
    'iload',
    'detect_format',
    'supported_formats',
    'supported_content_kinds',
    'get_backend',
    'FormatDetectionFailed',
    'UnknownFormat',
]

Coverage for /usr/local/lib/python3.9/dist-packages/pyrocko/squirrel/io/base.py : 98%

174 statements 170 run 4 missing 9 excluded