1# http://pyrocko.org - GPLv3
2#
3# The Pyrocko Developers, 21st Century
4# ---|P------/S----------~Lg----------
6from __future__ import absolute_import, print_function
8import time
9import logging
10from builtins import str as newstr
12from pyrocko.io.io_common import FileLoadError
13from pyrocko.progress import progress
15from .backends import \
16 mseed, sac, datacube, stationxml, textfiles, virtual, yaml, tdms_idas
18from ..model import to_kind_ids
# Backend modules, in the order in which they are consulted when detecting
# a file's format and when picking the provider for a given format.
backend_modules = [
    mseed,
    sac,
    datacube,
    stationxml,
    textfiles,
    virtual,
    yaml,
    tdms_idas,
]

# Logger shared by the functions of this module.
logger = logging.getLogger('psq.io')
def make_task(*args):
    '''
    Create a progress task which reports through this module's logger.

    All positional arguments are handed on unchanged to ``progress.task``.
    '''
    task = progress.task(*args, logger=logger)
    return task
def update_format_providers():
    '''
    Rebuild the global mapping from file format to io backend modules.

    Every module in ``backend_modules`` is asked for its provided formats.
    For each format, the providing modules are collected in a list, keeping
    the order of ``backend_modules``.
    '''

    global g_format_providers

    # Start from scratch so that stale entries do not survive a rebuild.
    g_format_providers = {}
    for backend in backend_modules:
        for fmt in backend.provided_formats():
            g_format_providers.setdefault(fmt, []).append(backend)
# The mapping is filled once at import time. Call update_format_providers()
# again after changing backend_modules.
g_format_providers = dict()
update_format_providers()
class FormatDetectionFailed(FileLoadError):
    '''
    Raised when the format of a file could not be determined.

    :param path:
        Path of the file for which detection failed. It is included in the
        error message.
    '''

    def __init__(self, path):
        message = 'format detection failed for file: %s' % path
        super(FormatDetectionFailed, self).__init__(message)
class UnknownFormat(Exception):
    '''
    Raised when an unknown file format is requested by the user.

    :param format:
        The offending format identifier. It is included in the error
        message.
    '''

    def __init__(self, format):
        message = 'unknown format: %s' % format
        super(UnknownFormat, self).__init__(message)
def get_backend(fmt):
    '''
    Look up the squirrel io backend module handling a given file format.

    :param fmt:
        Format identifier.
    :type fmt:
        str

    :returns:
        The first module registered for ``fmt`` in the global format
        provider mapping.

    :raises UnknownFormat:
        If no backend is registered for ``fmt``.
    '''

    try:
        providers = g_format_providers[fmt]
    except KeyError:
        raise UnknownFormat(fmt)

    return providers[0]
def detect_format(path):
    '''
    Determine the type of a file from its first 512 bytes.

    Paths starting with ``'virtual:'`` are reported as ``'virtual'`` without
    any file access. Otherwise, the backend modules are asked in turn and
    the first format recognized wins.

    :param path:
        Path to file.
    :type path:
        str

    :returns:
        Format identifier.

    :raises FormatDetectionFailed:
        If the file cannot be opened or read, or if no backend recognizes
        its content.
    '''

    if path.startswith('virtual:'):
        return 'virtual'

    try:
        with open(path, 'rb') as f:
            header = f.read(512)

    except (OSError, IOError):
        raise FormatDetectionFailed(path)

    # Lazily evaluated: backends after the first match are not consulted.
    candidates = (backend.detect(header) for backend in backend_modules)
    for detected in candidates:
        if detected is not None:
            return detected

    raise FormatDetectionFailed(path)
def supported_formats():
    '''
    Get a sorted list of the file formats supported by Squirrel.

    The formats are the keys of the global format provider mapping.
    '''
    formats = list(g_format_providers)
    formats.sort()
    return formats
# Content kinds offered through Squirrel. Also used as the default content
# selection when loading.
g_content_kinds = ['waveform', 'station', 'channel', 'response', 'event']


def supported_content_kinds():
    '''
    Get list of supported content kinds offered through Squirrel.

    A new list is returned on every call, so callers may modify the result
    without altering the module-level ``g_content_kinds``.

    :returns:
        Names of the supported content kinds.
    :rtype:
        :py:class:`list` of :py:class:`str`
    '''
    return list(g_content_kinds)
def _set_file_attributes(nut, path, file_format, mtime, size):
    '''
    Stamp a nut with path, format, mtime and size of its source file.
    '''
    nut.file_path = path
    nut.file_format = file_format
    nut.file_mtime = mtime
    nut.file_size = size


def iload(
        paths,
        segment=None,
        format='detect',
        database=None,
        check=True,
        skip_unchanged=False,
        content=g_content_kinds,
        show_progress=True):

    '''
    Iteratively load content or index/reindex meta-information from files.

    :param paths:
        Iterator yielding file names to load from or a Squirrel selection
        object providing the file names. A single file name given as a
        string is also accepted.
    :type paths:
        iterator yielding :py:class:`str` or
        :py:class:`~pyrocko.squirrel.selection.Selection`

    :param segment:
        File-specific segment identifier (can only be used when loading from a
        single file).
    :type segment:
        int

    :param format:
        File format identifier or ``'detect'`` for autodetection. When loading
        from a selection, per-file format assignation is taken from the hint in
        the selection and this flag is ignored.
    :type format:
        str

    :param database:
        Database to use for meta-information caching. When loading from a
        selection, this should be ``None`` and the database from the selection
        is used.
    :type database:
        :py:class:`~pyrocko.squirrel.database.Database`

    :param check:
        If ``True``, investigate modification time and file sizes of known
        files to debunk modified files (pessimistic mode), or ``False`` to
        deactivate checks (optimistic mode).
    :type check:
        bool

    :param skip_unchanged:
        If ``True``, only yield index nuts for new / modified files. Requires
        ``paths`` to be a selection.
    :type skip_unchanged:
        bool

    :param content:
        Selection of content types to load.
    :type content:
        :py:class:`list` of :py:class:`str`

    :param show_progress:
        If ``True``, report progress through a progress task attached to
        this module's logger.
    :type show_progress:
        bool

    :raises TypeError:
        If ``segment`` is given while ``paths`` is not a single file name,
        if ``database`` is given together with a selection, or if
        ``skip_unchanged`` is set while ``paths`` is not a selection.

    This generator yields :py:class:`~pyrocko.squirrel.model.Nut` objects for
    individual pieces of information found when reading the given files. Such a
    nut may represent a waveform, a station, a channel, an event or other data
    type. The nut itself only contains the meta-information. The actual content
    information is attached to the nut if requested. All nut meta-information
    is stored in the squirrel meta-information database. If possible, this
    function avoids accessing the actual disk files and provides the requested
    information straight from the database. Modified files are recognized and
    reindexed as needed.

    Files which cannot be read (``FileLoadError``) are reported through the
    logger and skipped. When a database is in use, ``database.reset`` is
    called for such files.
    '''

    from ..selection import Selection

    n_db = 0
    n_load = 0
    selection = None
    kind_ids = to_kind_ids(content)

    if isinstance(paths, (str, newstr)):
        paths = [paths]
    else:
        if segment is not None:
            raise TypeError(
                'iload: segment argument can only be used when loading from '
                'a single file')

        if isinstance(paths, Selection):
            selection = paths
            if database is not None:
                raise TypeError(
                    'iload: database argument must be None when called with a '
                    'selection')

            database = selection.get_database()

    if skip_unchanged and not isinstance(paths, Selection):
        raise TypeError(
            'iload: need selection when called with "skip_unchanged=True"')

    temp_selection = None
    if database:
        if not selection:
            # Plain paths with a database: go through a temporary selection
            # so that cached nuts can be looked up per file.
            temp_selection = database.new_selection(
                paths, show_progress=show_progress)

            selection = temp_selection

        if skip_unchanged:
            selection.flag_modified(check)
            it = selection.undig_grouped(skip_unchanged=True)
        else:
            it = selection.undig_grouped()

    else:
        # No database: nothing is cached, every file is read from disk.
        it = (((format, path), []) for path in paths)

    try:
        n_files_total = len(it)
    except TypeError:
        # Iterators without len() (e.g. generators): total is unknown.
        n_files_total = None

    task = None
    if show_progress:
        if not kind_ids:
            task = make_task('Indexing files', n_files_total)
        else:
            task = make_task('Loading files', n_files_total)

    n_files = 0
    tcommit = time.time()
    if database:
        transaction = database.transaction()
        transaction.begin()

    database_modified = False
    clean = False
    try:
        for (format, path), old_nuts in it:
            if task is not None:
                condition = '(nuts: %i from file, %i from cache)\n %s' % (
                    n_load, n_db, path)
                task.update(n_files, condition)

            n_files += 1

            # Commit pending changes when more than 20 s have passed since
            # the last commit, or at every 1000th file.
            if database and database_modified:
                tnow = time.time()
                if tnow - tcommit > 20. or n_files % 1000 == 0:
                    transaction.commit()
                    tcommit = tnow
                    transaction.begin()

            try:
                # Cached nuts of a modified file are worthless.
                if check and old_nuts and old_nuts[0].file_modified():
                    old_nuts = []

                if segment is not None:
                    old_nuts = [
                        nut for nut in old_nuts if nut.file_segment == segment]

                if old_nuts:
                    db_only_operation = not kind_ids or all(
                        nut.kind_id in kind_ids and nut.content_in_db
                        for nut in old_nuts)

                    if db_only_operation:
                        # Everything requested can be served from the
                        # database, the file does not have to be read.
                        for nut in old_nuts:
                            if nut.kind_id in kind_ids:
                                database.undig_content(nut)

                            n_db += 1
                            yield nut

                        continue

                if format == 'detect':
                    if old_nuts and not old_nuts[0].file_modified():
                        format_this = old_nuts[0].file_format
                    else:
                        format_this = detect_format(path)
                else:
                    format_this = format

                mod = get_backend(format_this)
                mtime, size = mod.get_stats(path)

                logger.debug('reading file %s' % path)
                nuts = []
                for nut in mod.iload(format_this, path, segment, content):
                    _set_file_attributes(nut, path, format_this, mtime, size)
                    if nut.content is not None:
                        nut.content._squirrel_key = nut.key

                    nuts.append(nut)
                    n_load += 1
                    yield nut

                if database and nuts != old_nuts:
                    if segment is not None:
                        # Only one segment was read above. Read again without
                        # segment restriction and without content, presumably
                        # so that the index covers the complete file.
                        #
                        # The result must be materialized: if the backend's
                        # iload is a generator, stamping the nuts in the loop
                        # below would exhaust it and database.dig() would
                        # receive nothing.
                        nuts = list(mod.iload(format_this, path, None, []))
                        for nut in nuts:
                            _set_file_attributes(
                                nut, path, format_this, mtime, size)

                    database.dig(nuts, transaction=transaction)
                    database_modified = True

            except FileLoadError:
                logger.error('Cannot read file: %s' % path)
                if database:
                    database.reset(path, transaction=transaction)
                    database_modified = True

        # Only reached when all files have been processed.
        clean = True

    finally:
        if task is not None:
            condition = '(nuts: %i from file, %i from cache)' % (n_load, n_db)
            task.update(n_files, condition)
            if clean:
                task.done(condition)
            else:
                task.fail(condition + ' terminated')

        if database:
            transaction.commit()
            transaction.close()

            if temp_selection:
                del temp_selection
# Public names of this module: functions first, then exceptions.
__all__ = [
    'iload', 'detect_format', 'supported_formats', 'supported_content_kinds',
    'get_backend',
    'FormatDetectionFailed', 'UnknownFormat',
]