# https://pyrocko.org - GPLv3
#
# The Pyrocko Developers, 21st Century
# ---|P------/S----------~Lg----------

'''
Waveform archive lookup, data loading and caching infrastructure.

.. note::

    This module has been superseded by :py:mod:`~pyrocko.squirrel` but will
    remain available for backwards compatibility.
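
Quick usage sketch (``'/data/waveforms'`` is a hypothetical path)::

    from pyrocko import pile

    p = pile.make_pile('/data/waveforms')
    for traces in p.chopper(tinc=600.):
        ...  # lists of pyrocko.trace.Trace objects, window by window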
'''

import os
import logging
import time
import copy
import re
import sys
import operator
import math
import hashlib
try:
    import cPickle as pickle
except ImportError:
    import pickle


from . import avl
from . import trace, io, util
from . import config
from .trace import degapper


is_windows = sys.platform.startswith('win')
show_progress_force_off = False
version_salt = 'v1-'


def ehash(s):
    return hashlib.sha1((version_salt + s).encode('utf8')).hexdigest()


def cmp(a, b):
    # replacement for Python 2's built-in cmp(), removed in Python 3
    return int(a > b) - int(a < b)


def sl(s):
    return [str(x) for x in sorted(s)]


class Counter(dict):
    '''
    Multiset-like dict, counting occurrences of keys.

    Unlike :py:class:`collections.Counter`, entries are deleted as soon as
    their count drops to zero.
    '''

    def __missing__(self, k):
        return 0

    def update(self, other):
        for k, v in other.items():
            self[k] += v

    def subtract(self, other):
        for k, v in other.items():
            self[k] -= v
            if self[k] <= 0:
                del self[k]

    def subtract1(self, k):
        self[k] -= 1
        if self[k] <= 0:
            del self[k]
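

# A minimal usage sketch (illustrative only, not used by the module): unlike
# collections.Counter, entries vanish once their count reaches zero.
def _example_counter_usage():
    c = Counter()
    c.update({'BHZ': 2})
    c.subtract({'BHZ': 2})
    return 'BHZ' in c  # -> False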


def fix_unicode_copy(counter, func):
    counter_new = Counter()
    for k in counter:
        counter_new[func(k)] = counter[k]
    return counter_new


pjoin = os.path.join
logger = logging.getLogger('pyrocko.pile')


def avl_remove_exact(avltree, element):
    ilo, ihi = avltree.span(element)
    for i in range(ilo, ihi):
        if avltree[i] is element:
            avltree.remove_at(i)
            return i

    raise ValueError(
        'avl_remove_exact(avltree, element): element not in avltree')


def cmpfunc(key):
    if isinstance(key, str):
        # special cases; these run about 50% faster than the generic one on
        # Python 2.5
        if key == 'tmin':
            return lambda a, b: cmp(a.tmin, b.tmin)
        if key == 'tmax':
            return lambda a, b: cmp(a.tmax, b.tmax)

        key = operator.attrgetter(key)

    return lambda a, b: cmp(key(a), key(b))


g_dummys = {}


def get_dummy(key):
    if key not in g_dummys:
        class Dummy(object):
            def __init__(self, k):
                setattr(self, key, k)

        g_dummys[key] = Dummy

    return g_dummys[key]


class TooMany(Exception):
    def __init__(self, n):
        Exception.__init__(self)
        self.n = n


class Sorted(object):
    '''
    Sorted container backed by an AVL tree, ordered by an attribute name or
    key function, with efficient range lookups.
    '''

    def __init__(self, values=[], key=None):
        self._set_key(key)
        self._avl = avl.new(values, self._cmp)

    def _set_key(self, key):
        self._key = key
        self._cmp = cmpfunc(key)
        if isinstance(key, str):
            self._dummy = get_dummy(key)

    def __getstate__(self):
        state = list(self._avl.iter()), self._key
        return state

    def __setstate__(self, state):
        it, key = state
        self._set_key(key)
        self._avl = avl.from_iter(iter(it), len(it))

    def insert(self, value):
        self._avl.insert(value)

    def remove(self, value):
        return avl_remove_exact(self._avl, value)

    def remove_at(self, i):
        return self._avl.remove_at(i)

    def insert_many(self, values):
        for value in values:
            self._avl.insert(value)

    def remove_many(self, values):
        for value in values:
            avl_remove_exact(self._avl, value)

    def __iter__(self):
        return iter(self._avl)

    def with_key_in(self, kmin, kmax):
        omin, omax = self._dummy(kmin), self._dummy(kmax)
        ilo, ihi = self._avl.span(omin, omax)
        return self._avl[ilo:ihi]

    def with_key_in_limited(self, kmin, kmax, nmax):
        omin, omax = self._dummy(kmin), self._dummy(kmax)
        ilo, ihi = self._avl.span(omin, omax)
        if ihi - ilo > nmax:
            raise TooMany(ihi - ilo)

        return self._avl[ilo:ihi]

    def index(self, value):
        ilo, ihi = self._avl.span(value)
        for i in range(ilo, ihi):
            if self._avl[i] is value:
                return i

        raise ValueError('element is not in avl tree')

    def min(self):
        return self._avl.min()

    def max(self):
        return self._avl.max()

    def __len__(self):
        return len(self._avl)

    def __getitem__(self, i):
        return self._avl[i]
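

# A minimal usage sketch (illustrative only): Sorted keeps objects ordered by
# a key attribute and answers range queries efficiently.
def _example_sorted_usage():
    class Ev(object):
        def __init__(self, tmin):
            self.tmin = tmin

    s = Sorted([Ev(1.0), Ev(5.0), Ev(3.0)], 'tmin')
    return [ev.tmin for ev in s.with_key_in(2.0, 6.0)]  # expected: [3.0, 5.0]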


class TracesFileCache(object):
    '''
    Manages trace metainformation cache.

    For each directory with files containing traces, one cache file is
    maintained to hold the trace metainformation of all files which are
    contained in the directory.
    '''

    caches = {}

    def __init__(self, cachedir):
        '''
        Create new cache.

        :param cachedir: directory to hold the cache files.
        '''

        self.cachedir = cachedir
        self.dircaches = {}
        self.modified = set()
        util.ensuredir(self.cachedir)

    def get(self, abspath):
        '''
        Try to get an item from the cache.

        :param abspath: absolute path of the object to retrieve

        :returns: a stored object is returned or None if nothing could be
            found.
        '''

        dircache = self._get_dircache_for(abspath)
        if abspath in dircache:
            return dircache[abspath]

        return None

    def put(self, abspath, tfile):
        '''
        Put an item into the cache.

        :param abspath: absolute path of the object to be stored
        :param tfile: object to be stored
        '''

        cachepath = self._dircachepath(abspath)

        # get lock on cachepath here
        dircache = self._get_dircache(cachepath)
        dircache[abspath] = tfile
        self.modified.add(cachepath)

    def dump_modified(self):
        '''
        Save any modifications to disk.
        '''

        for cachepath in self.modified:
            self._dump_dircache(self.dircaches[cachepath], cachepath)
            # unlock

        self.modified = set()

    def clean(self):
        '''
        Weed out missing files from the disk caches.
        '''

        self.dump_modified()

        for fn in os.listdir(self.cachedir):
            if len(fn) == 40:
                cache = self._load_dircache(pjoin(self.cachedir, fn))
                self._dump_dircache(cache, pjoin(self.cachedir, fn))

    def _get_dircache_for(self, abspath):
        return self._get_dircache(self._dircachepath(abspath))

    def _get_dircache(self, cachepath):
        if cachepath not in self.dircaches:
            if os.path.isfile(cachepath):
                self.dircaches[cachepath] = self._load_dircache(cachepath)
            else:
                self.dircaches[cachepath] = {}

        return self.dircaches[cachepath]

    def _dircachepath(self, abspath):
        cachefn = ehash(os.path.dirname(abspath))
        return pjoin(self.cachedir, cachefn)

    def _load_dircache(self, cachefilename):

        with open(cachefilename, 'rb') as f:
            cache = pickle.load(f)

        # weed out files which no longer exist
        for fn in list(cache.keys()):
            if not os.path.isfile(fn):
                del cache[fn]

        time_float = util.get_time_float()

        for v in cache.values():
            v.trees_from_content(v.traces)
            for tr in v.traces:
                tr.file = v
                # fix Py2 codes to not include unicode when the cache file
                # was created with Py3
                if not isinstance(tr.station, str):
                    tr.prune_from_reuse_cache()
                    tr.set_codes(
                        str(tr.network),
                        str(tr.station),
                        str(tr.location),
                        str(tr.channel))

                tr.tmin = time_float(tr.tmin)
                tr.tmax = time_float(tr.tmax)

            v.data_use_count = 0
            v.data_loaded = False
            v.fix_unicode_codes()

        return cache

    def _dump_dircache(self, cache, cachefilename):

        if not cache:
            if os.path.exists(cachefilename):
                os.remove(cachefilename)
            return

        # make a copy without the parents and the binsearch trees
        cache_copy = {}
        for fn in cache.keys():

            trf = copy.copy(cache[fn])
            trf.parent = None
            trf.by_tmin = None
            trf.by_tmax = None
            trf.by_tlen = None
            trf.by_mtime = None
            trf.data_use_count = 0
            trf.data_loaded = False
            traces = []
            for tr in trf.traces:
                tr = tr.copy(data=False)
                tr.ydata = None
                tr.meta = None
                tr.file = trf
                traces.append(tr)

            trf.traces = traces

            cache_copy[fn] = trf

        tmpfn = cachefilename + '.%i.tmp' % os.getpid()
        with open(tmpfn, 'wb') as f:
            pickle.dump(cache_copy, f, protocol=2)

        if is_windows and os.path.exists(cachefilename):
            # windows doesn't allow to rename over existing file
            os.unlink(cachefilename)

        os.rename(tmpfn, cachefilename)


def get_cache(cachedir):
    '''
    Get global TracesFileCache object for given directory.
    '''

    if cachedir not in TracesFileCache.caches:
        TracesFileCache.caches[cachedir] = TracesFileCache(cachedir)

    return TracesFileCache.caches[cachedir]
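

# A minimal usage sketch (illustrative only; paths are hypothetical): the
# cache maps absolute file paths to TracesFile header objects, stored in one
# pickle file per source directory.
def _example_cache_usage():
    cache = get_cache('/tmp/pyrocko-cache-example')
    tfile = cache.get('/data/day001/trace.mseed')  # None when not yet cached
    return tfile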


def loader(
        filenames, fileformat, cache, filename_attributes,
        show_progress=True, update_progress=None):

    if show_progress_force_off:
        show_progress = False

    class Progress(object):
        def __init__(self, label, n):
            self._label = label
            self._n = n
            self._bar = None
            if show_progress:
                self._bar = util.progressbar(label, self._n)

            if update_progress:
                update_progress(label, 0, self._n)

        def update(self, i):
            if self._bar:
                if i < self._n-1:
                    self._bar.update(i)
                else:
                    self._bar.finish()
                    self._bar = None

            abort = False
            if update_progress:
                abort = update_progress(self._label, i, self._n)

            return abort

        def finish(self):
            if self._bar:
                self._bar.finish()
                self._bar = None

    if not filenames:
        logger.warning('No files to load from')
        return

    regex = None
    if filename_attributes:
        regex = re.compile(filename_attributes)

    try:
        progress = Progress('Looking at files', len(filenames))

        failures = []
        to_load = []
        for i, filename in enumerate(filenames):
            try:
                abspath = os.path.abspath(filename)

                substitutions = None
                if regex:
                    m = regex.search(filename)
                    if not m:
                        raise FilenameAttributeError(
                            "Cannot get attributes with pattern '%s' "
                            "from path '%s'" % (filename_attributes, filename))

                    substitutions = {}
                    for k in m.groupdict():
                        if k in ('network', 'station', 'location', 'channel'):
                            substitutions[k] = m.groupdict()[k]

                mtime = os.stat(filename)[8]
                tfile = None
                if cache:
                    tfile = cache.get(abspath)

                mustload = (
                    not tfile or
                    (tfile.format != fileformat and fileformat != 'detect') or
                    tfile.mtime != mtime or
                    substitutions is not None)

                to_load.append(
                    (mustload, mtime, abspath, substitutions, tfile))

            except (OSError, FilenameAttributeError) as xerror:
                failures.append(abspath)
                logger.warning(xerror)

            abort = progress.update(i+1)
            if abort:
                progress.update(len(filenames))
                return

        progress.update(len(filenames))

        to_load.sort(key=lambda x: x[2])

        nload = len([1 for x in to_load if x[0]])
        iload = 0

        count_all = False
        if nload < 0.01*len(to_load):
            nload = len(to_load)
            count_all = True

        if to_load:
            progress = Progress('Scanning files', nload)

            for (mustload, mtime, abspath, substitutions, tfile) in to_load:
                try:
                    if mustload:
                        tfile = TracesFile(
                            None, abspath, fileformat,
                            substitutions=substitutions, mtime=mtime)

                        if cache and not substitutions:
                            cache.put(abspath, tfile)

                        if not count_all:
                            iload += 1

                    if count_all:
                        iload += 1

                except (io.FileLoadError, OSError) as xerror:
                    failures.append(abspath)
                    logger.warning(xerror)
                else:
                    yield tfile

                abort = progress.update(iload+1)
                if abort:
                    break

            progress.update(nload)

        if failures:
            logger.warning(
                'The following file%s caused problems and will be ignored:\n'
                % util.plural_s(len(failures)) + '\n'.join(failures))

        if cache:
            cache.dump_modified()
    finally:
        progress.finish()


def tlen(x):
    return x.tmax - x.tmin


class TracesGroup(object):

    '''
    Trace container base class.

    Base class for Pile, SubPile, and TracesFile, i.e. anything containing
    a collection of several traces.

    A TracesGroup object maintains lookup sets of some of the traces'
    meta-information, as well as a combined time-range of its contents.
    '''

    def __init__(self, parent):
        self.parent = parent
        self.empty()
        self.nupdates = 0
        self.abspath = None

    def set_parent(self, parent):
        self.parent = parent

    def get_parent(self):
        return self.parent

    def empty(self):
        self.networks, self.stations, self.locations, self.channels, \
            self.nslc_ids, self.deltats = [Counter() for x in range(6)]
        self.by_tmin = Sorted([], 'tmin')
        self.by_tmax = Sorted([], 'tmax')
        self.by_tlen = Sorted([], tlen)
        self.by_mtime = Sorted([], 'mtime')
        self.tmin, self.tmax = None, None
        self.deltatmin, self.deltatmax = None, None

    def trees_from_content(self, content):
        self.by_tmin = Sorted(content, 'tmin')
        self.by_tmax = Sorted(content, 'tmax')
        self.by_tlen = Sorted(content, tlen)
        self.by_mtime = Sorted(content, 'mtime')
        self.adjust_minmax()

    def fix_unicode_codes(self):
        for net in self.networks:
            if isinstance(net, str):
                return

        self.networks = fix_unicode_copy(self.networks, str)
        self.stations = fix_unicode_copy(self.stations, str)
        self.locations = fix_unicode_copy(self.locations, str)
        self.channels = fix_unicode_copy(self.channels, str)
        self.nslc_ids = fix_unicode_copy(
            self.nslc_ids, lambda k: tuple(str(x) for x in k))

    def add(self, content):
        '''
        Add content to traces group and update indices.

        Accepts :py:class:`pyrocko.trace.Trace` objects and
        :py:class:`pyrocko.pile.TracesGroup` objects.
        '''

        if isinstance(content, (trace.Trace, TracesGroup)):
            content = [content]

        for c in content:

            if isinstance(c, TracesGroup):
                self.networks.update(c.networks)
                self.stations.update(c.stations)
                self.locations.update(c.locations)
                self.channels.update(c.channels)
                self.nslc_ids.update(c.nslc_ids)
                self.deltats.update(c.deltats)

                self.by_tmin.insert_many(c.by_tmin)
                self.by_tmax.insert_many(c.by_tmax)
                self.by_tlen.insert_many(c.by_tlen)
                self.by_mtime.insert_many(c.by_mtime)

            elif isinstance(c, trace.Trace):
                self.networks[c.network] += 1
                self.stations[c.station] += 1
                self.locations[c.location] += 1
                self.channels[c.channel] += 1
                self.nslc_ids[c.nslc_id] += 1
                self.deltats[c.deltat] += 1

                self.by_tmin.insert(c)
                self.by_tmax.insert(c)
                self.by_tlen.insert(c)
                self.by_mtime.insert(c)

        self.adjust_minmax()

        self.nupdates += 1
        self.notify_listeners('add', content)

        if self.parent is not None:
            self.parent.add(content)

    def remove(self, content):
        '''
        Remove content from traces group and update indices.
        '''

        if isinstance(content, (trace.Trace, TracesGroup)):
            content = [content]

        for c in content:

            if isinstance(c, TracesGroup):
                self.networks.subtract(c.networks)
                self.stations.subtract(c.stations)
                self.locations.subtract(c.locations)
                self.channels.subtract(c.channels)
                self.nslc_ids.subtract(c.nslc_ids)
                self.deltats.subtract(c.deltats)

                self.by_tmin.remove_many(c.by_tmin)
                self.by_tmax.remove_many(c.by_tmax)
                self.by_tlen.remove_many(c.by_tlen)
                self.by_mtime.remove_many(c.by_mtime)

            elif isinstance(c, trace.Trace):
                self.networks.subtract1(c.network)
                self.stations.subtract1(c.station)
                self.locations.subtract1(c.location)
                self.channels.subtract1(c.channel)
                self.nslc_ids.subtract1(c.nslc_id)
                self.deltats.subtract1(c.deltat)

                self.by_tmin.remove(c)
                self.by_tmax.remove(c)
                self.by_tlen.remove(c)
                self.by_mtime.remove(c)

        self.adjust_minmax()

        self.nupdates += 1
        self.notify_listeners('remove', content)

        if self.parent is not None:
            self.parent.remove(content)

    def relevant(self, tmin, tmax, group_selector=None, trace_selector=None):
        '''
        Return list of :py:class:`pyrocko.trace.Trace` objects where given
        arguments ``tmin`` and ``tmax`` match.

        :param tmin: start time
        :param tmax: end time
        :param group_selector: filter callback taking a
            :py:class:`TracesGroup` object and returning true or false to
            keep or reject the group (default: ``None``)
        :param trace_selector: filter callback taking a
            :py:class:`pyrocko.trace.Trace` object and returning true or
            false to keep or reject the trace (default: ``None``)
        '''

        if not self.by_tmin or not self.is_relevant(
                tmin, tmax, group_selector):

            return []

        return [tr for tr in self.by_tmin.with_key_in(tmin-self.tlenmax, tmax)
                if tr.is_relevant(tmin, tmax, trace_selector)]

    def adjust_minmax(self):
        if self.by_tmin:
            self.tmin = self.by_tmin.min().tmin
            self.tmax = self.by_tmax.max().tmax
            t = self.by_tlen.max()
            self.tlenmax = t.tmax - t.tmin
            self.mtime = self.by_mtime.max().mtime
            deltats = list(self.deltats.keys())
            self.deltatmin = min(deltats)
            self.deltatmax = max(deltats)
        else:
            self.tmin = None
            self.tmax = None
            self.tlenmax = None
            self.mtime = None
            self.deltatmin = None
            self.deltatmax = None

    def notify_listeners(self, what, content):
        pass

    def get_update_count(self):
        return self.nupdates

    def overlaps(self, tmin, tmax):
        return self.tmin is not None \
            and tmax >= self.tmin and self.tmax >= tmin

    def is_relevant(self, tmin, tmax, group_selector=None):
        if self.tmin is None or self.tmax is None:
            return False

        return tmax >= self.tmin and self.tmax >= tmin and (
            group_selector is None or group_selector(self))


class MemTracesFile(TracesGroup):

    '''
    Wrapper to allow traces without an actual disk file to be inserted into
    a Pile.
    '''

    def __init__(self, parent, traces):
        TracesGroup.__init__(self, parent)
        self.add(traces)
        self.mtime = time.time()

    def add(self, traces):
        if isinstance(traces, trace.Trace):
            traces = [traces]

        for tr in traces:
            tr.file = self

        TracesGroup.add(self, traces)

    def load_headers(self, mtime=None):
        pass

    def load_data(self):
        pass

    def use_data(self):
        pass

    def drop_data(self):
        pass

    def reload_if_modified(self):
        return False

    def iter_traces(self):
        for tr in self.by_tmin:
            yield tr

    def get_traces(self):
        return list(self.by_tmin)

    def gather_keys(self, gather, selector=None):
        keys = set()
        for tr in self.by_tmin:
            if selector is None or selector(tr):
                keys.add(gather(tr))

        return keys

    def __str__(self):
        s = 'MemTracesFile\n'
        s += 'file mtime: %s\n' % util.time_to_str(self.mtime)
        s += 'number of traces: %i\n' % len(self.by_tmin)
        s += 'timerange: %s - %s\n' % (
            util.time_to_str(self.tmin), util.time_to_str(self.tmax))
        s += 'networks: %s\n' % ', '.join(sl(self.networks.keys()))
        s += 'stations: %s\n' % ', '.join(sl(self.stations.keys()))
        s += 'locations: %s\n' % ', '.join(sl(self.locations.keys()))
        s += 'channels: %s\n' % ', '.join(sl(self.channels.keys()))
        s += 'deltats: %s\n' % ', '.join(sl(self.deltats.keys()))
        return s
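

# A minimal usage sketch (illustrative only): wrap synthetic in-memory traces
# so they can be added to a Pile without a backing file on disk.
def _example_memtraces_usage():
    import numpy as num
    tr = trace.Trace(
        station='STA', channel='BHZ', deltat=0.01, tmin=0.0,
        ydata=num.zeros(1000))

    p = Pile()
    p.add_file(MemTracesFile(None, [tr]))
    return p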


class TracesFile(TracesGroup):
    def __init__(
            self, parent, abspath, format,
            substitutions=None, mtime=None):

        TracesGroup.__init__(self, parent)
        self.abspath = abspath
        self.format = format
        self.traces = []
        self.data_loaded = False
        self.data_use_count = 0
        self.substitutions = substitutions
        self.load_headers(mtime=mtime)
        self.mtime = mtime

    def load_headers(self, mtime=None):
        logger.debug('loading headers from file: %s' % self.abspath)
        if mtime is None:
            self.mtime = os.stat(self.abspath)[8]

        def kgen(tr):
            return (tr.mtime, tr.tmin, tr.tmax) + tr.nslc_id

        self.remove(self.traces)
        ks = set()
        for tr in io.load(self.abspath,
                          format=self.format,
                          getdata=False,
                          substitutions=self.substitutions):

            k = kgen(tr)
            if k not in ks:
                ks.add(k)
                self.traces.append(tr)
                tr.file = self

        self.add(self.traces)

        self.data_loaded = False
        self.data_use_count = 0

    def load_data(self, force=False):
        file_changed = False
        if not self.data_loaded or force:
            logger.debug('loading data from file: %s' % self.abspath)

            def kgen(tr):
                return (tr.mtime, tr.tmin, tr.tmax) + tr.nslc_id

            traces_ = io.load(self.abspath, format=self.format, getdata=True,
                              substitutions=self.substitutions)

            # prevent adding duplicate snippets from corrupt mseed files
            k_loaded = set()
            traces = []
            for tr in traces_:
                k = kgen(tr)
                if k not in k_loaded:
                    k_loaded.add(k)
                    traces.append(tr)

            k_current_d = dict((kgen(tr), tr) for tr in self.traces)
            k_current = set(k_current_d)
            k_new = k_loaded - k_current
            k_delete = k_current - k_loaded
            k_unchanged = k_current & k_loaded

            for tr in self.traces[:]:
                if kgen(tr) in k_delete:
                    self.remove(tr)
                    self.traces.remove(tr)
                    tr.file = None
                    file_changed = True

            for tr in traces:
                if kgen(tr) in k_new:
                    tr.file = self
                    self.traces.append(tr)
                    self.add(tr)
                    file_changed = True

            for tr in traces:
                if kgen(tr) in k_unchanged:
                    ctr = k_current_d[kgen(tr)]
                    ctr.ydata = tr.ydata

            self.data_loaded = True

        if file_changed:
            logger.debug(
                'reloaded (file may have changed): %s' % self.abspath)

        return file_changed

    def use_data(self):
        if not self.data_loaded:
            raise Exception('Data not loaded')
        self.data_use_count += 1

    def drop_data(self):
        if self.data_loaded:
            if self.data_use_count == 1:
                logger.debug('forgetting data of file: %s' % self.abspath)
                for tr in self.traces:
                    tr.drop_data()

                self.data_loaded = False

            self.data_use_count -= 1
        else:
            self.data_use_count = 0

    def reload_if_modified(self):
        mtime = os.stat(self.abspath)[8]
        if mtime != self.mtime:
            logger.debug(
                'mtime=%i, reloading file: %s' % (mtime, self.abspath))

            self.mtime = mtime
            if self.data_loaded:
                self.load_data(force=True)
            else:
                self.load_headers()

            return True

        return False

    def iter_traces(self):
        for tr in self.traces:
            yield tr

    def gather_keys(self, gather, selector=None):
        keys = set()
        for tr in self.by_tmin:
            if selector is None or selector(tr):
                keys.add(gather(tr))

        return keys

    def __str__(self):
        s = 'TracesFile\n'
        s += 'abspath: %s\n' % self.abspath
        s += 'file mtime: %s\n' % util.time_to_str(self.mtime)
        s += 'number of traces: %i\n' % len(self.traces)
        s += 'timerange: %s - %s\n' % (
            util.time_to_str(self.tmin), util.time_to_str(self.tmax))
        s += 'networks: %s\n' % ', '.join(sl(self.networks.keys()))
        s += 'stations: %s\n' % ', '.join(sl(self.stations.keys()))
        s += 'locations: %s\n' % ', '.join(sl(self.locations.keys()))
        s += 'channels: %s\n' % ', '.join(sl(self.channels.keys()))
        s += 'deltats: %s\n' % ', '.join(sl(self.deltats.keys()))
        return s


class FilenameAttributeError(Exception):
    pass


class SubPile(TracesGroup):
    def __init__(self, parent):
        TracesGroup.__init__(self, parent)
        self.files = []
        self.empty()

    def add_file(self, file):
        self.files.append(file)
        file.set_parent(self)
        self.add(file)

    def remove_file(self, file):
        self.files.remove(file)
        file.set_parent(None)
        self.remove(file)

    def remove_files(self, files):
        for file in files:
            self.files.remove(file)
            file.set_parent(None)
        self.remove(files)

    def gather_keys(self, gather, selector=None):
        keys = set()
        for file in self.files:
            keys |= file.gather_keys(gather, selector)

        return keys

    def iter_traces(
            self,
            load_data=False,
            return_abspath=False,
            group_selector=None,
            trace_selector=None):

        for file in self.files:

            if group_selector and not group_selector(file):
                continue

            must_drop = False
            if load_data:
                file.load_data()
                file.use_data()
                must_drop = True

            for tr in file.iter_traces():
                if trace_selector and not trace_selector(tr):
                    continue

                if return_abspath:
                    yield file.abspath, tr
                else:
                    yield tr

            if must_drop:
                file.drop_data()

    def iter_files(self):
        for file in self.files:
            yield file

    def reload_modified(self):
        modified = False
        for file in self.files:
            modified |= file.reload_if_modified()

        return modified

    def __str__(self):
        s = 'SubPile\n'
        s += 'number of files: %i\n' % len(self.files)
        s += 'timerange: %s - %s\n' % (
            util.time_to_str(self.tmin), util.time_to_str(self.tmax))
        s += 'networks: %s\n' % ', '.join(sl(self.networks.keys()))
        s += 'stations: %s\n' % ', '.join(sl(self.stations.keys()))
        s += 'locations: %s\n' % ', '.join(sl(self.locations.keys()))
        s += 'channels: %s\n' % ', '.join(sl(self.channels.keys()))
        s += 'deltats: %s\n' % ', '.join(sl(self.deltats.keys()))
        return s


class Batch(object):
    '''
    Batch of waveforms from window-wise data extraction.

    Encapsulates state and results yielded for each window in window-wise
    waveform extraction with the :py:meth:`Pile.chopper` method (when the
    ``style='batch'`` keyword argument is set).

    *Attributes:*

    .. py:attribute:: tmin

        Start of this time window.

    .. py:attribute:: tmax

        End of this time window.

    .. py:attribute:: i

        Index of this time window in sequence.

    .. py:attribute:: n

        Total number of time windows in sequence.

    .. py:attribute:: traces

        Extracted waveforms for this time window.
    '''

    def __init__(self, tmin, tmax, i, n, traces):
        self.tmin = tmin
        self.tmax = tmax
        self.i = i
        self.n = n
        self.traces = traces


class Pile(TracesGroup):
    '''
    Waveform archive lookup, data loading and caching infrastructure.
    '''

    def __init__(self):
        TracesGroup.__init__(self, None)
        self.subpiles = {}
        self.open_files = {}
        self.listeners = []
        self.abspaths = set()

    def add_listener(self, obj):
        self.listeners.append(util.smart_weakref(obj))

    def notify_listeners(self, what, content):
        for ref in self.listeners:
            obj = ref()
            if obj:
                obj(what, content)

    def load_files(
            self, filenames,
            filename_attributes=None,
            fileformat='mseed',
            cache=None,
            show_progress=True,
            update_progress=None):

        load = loader(
            filenames, fileformat, cache, filename_attributes,
            show_progress=show_progress,
            update_progress=update_progress)

        self.add_files(load)

    def add_files(self, files):
        for file in files:
            self.add_file(file)

    def add_file(self, file):
        if file.abspath is not None and file.abspath in self.abspaths:
            logger.warning('File already in pile: %s' % file.abspath)
            return

        if file.deltatmin is None:
            logger.warning(
                'Sampling rates of all traces are zero in file: %s'
                % file.abspath)
            return

        subpile = self.dispatch(file)
        subpile.add_file(file)
        if file.abspath is not None:
            self.abspaths.add(file.abspath)

    def remove_file(self, file):
        subpile = file.get_parent()
        if subpile is not None:
            subpile.remove_file(file)

        if file.abspath is not None:
            self.abspaths.remove(file.abspath)

    def remove_files(self, files):
        subpile_files = {}
        for file in files:
            subpile = file.get_parent()
            if subpile not in subpile_files:
                subpile_files[subpile] = []

            subpile_files[subpile].append(file)

        for subpile, files in subpile_files.items():
            subpile.remove_files(files)
            for file in files:
                if file.abspath is not None:
                    self.abspaths.remove(file.abspath)

    def dispatch_key(self, file):
        dt = int(math.floor(math.log(file.deltatmin)))
        return dt

    def dispatch(self, file):
        k = self.dispatch_key(file)
        if k not in self.subpiles:
            self.subpiles[k] = SubPile(self)

        return self.subpiles[k]

    def get_deltats(self):
        return list(self.deltats.keys())

    def chop(
            self, tmin, tmax,
            group_selector=None,
            trace_selector=None,
            snap=(round, round),
            include_last=False,
            load_data=True):

        chopped = []
        used_files = set()

        traces = self.relevant(tmin, tmax, group_selector, trace_selector)
        if load_data:
            files_changed = False
            for tr in traces:
                if tr.file and tr.file not in used_files:
                    if tr.file.load_data():
                        files_changed = True

                    if tr.file is not None:
                        used_files.add(tr.file)

            if files_changed:
                traces = self.relevant(
                    tmin, tmax, group_selector, trace_selector)

        for tr in traces:
            if not load_data and tr.ydata is not None:
                tr = tr.copy(data=False)
                tr.ydata = None

            try:
                chopped.append(tr.chop(
                    tmin, tmax,
                    inplace=False,
                    snap=snap,
                    include_last=include_last))

            except trace.NoData:
                pass

        return chopped, used_files

    def _process_chopped(
            self, chopped, degap, maxgap, maxlap, want_incomplete, wmax,
            wmin, tpad):

        chopped.sort(key=lambda a: a.full_id)
        if degap:
            chopped = degapper(chopped, maxgap=maxgap, maxlap=maxlap)

        if not want_incomplete:
            chopped_weeded = []
            for tr in chopped:
                emin = tr.tmin - (wmin-tpad)
                emax = tr.tmax + tr.deltat - (wmax+tpad)
                if (abs(emin) <= 0.5*tr.deltat and
                        abs(emax) <= 0.5*tr.deltat):

                    chopped_weeded.append(tr)

                elif degap:
                    if (0. < emin <= 5. * tr.deltat and
                            -5. * tr.deltat <= emax < 0.):

                        tr.extend(
                            wmin-tpad,
                            wmax+tpad-tr.deltat,
                            fillmethod='repeat')

                        chopped_weeded.append(tr)

            chopped = chopped_weeded

        for tr in chopped:
            tr.wmin = wmin
            tr.wmax = wmax

        return chopped

    def chopper(
            self,
            tmin=None, tmax=None, tinc=None, tpad=0.,
            group_selector=None, trace_selector=None,
            want_incomplete=True, degap=True, maxgap=5, maxlap=None,
            keep_current_files_open=False, accessor_id=None,
            snap=(round, round), include_last=False, load_data=True,
            style=None):

        '''
        Get iterator for shifting window-wise data extraction from waveform
        archive.

        :param tmin: start time (default uses start time of available data)
        :param tmax: end time (default uses end time of available data)
        :param tinc: time increment (window shift time) (default uses
            ``tmax-tmin``)
        :param tpad: padding time appended on either side of the data windows
            (window overlap is ``2*tpad``)
        :param group_selector: filter callback taking :py:class:`TracesGroup`
            objects
        :param trace_selector: filter callback taking
            :py:class:`pyrocko.trace.Trace` objects
        :param want_incomplete: if set to ``False``, gappy/incomplete traces
            are discarded from the results
        :param degap: whether to try to connect traces and to remove gaps and
            overlaps
        :param maxgap: maximum gap size in samples which is filled with
            interpolated samples when ``degap`` is ``True``
        :param maxlap: maximum overlap size in samples which is removed when
            ``degap`` is ``True``
        :param keep_current_files_open: whether to keep cached trace data in
            memory after the iterator has ended
        :param accessor_id: if given, used as a key to identify different
            points of extraction for the decision of when to release cached
            trace data (should be used when data is alternately extracted
            from more than one region / selection)
        :param snap: replaces Python's :py:func:`round` function which is
            used to determine indices where to start and end the trace data
            array
        :param include_last: whether to include last sample
        :param load_data: whether to load the waveform data. If set to
            ``False``, traces with no data samples, but with correct
            meta-information are returned
        :param style: set to ``'batch'`` to yield waveforms and information
            about the chopper state as :py:class:`Batch` objects. By default
            lists of :py:class:`pyrocko.trace.Trace` objects are yielded.
        :returns: iterator providing extracted waveforms for each extracted
            window. See ``style`` argument for details.
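
        Example; extract hour-long windows with 10 s padding (``p`` is
        assumed to be a loaded :py:class:`Pile`)::

            for traces in p.chopper(tinc=3600., tpad=10.):
                ...  # one list of pyrocko.trace.Trace objects per window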
        '''

        if tmin is None:
            if self.tmin is None:
                logger.warning("Pile's tmin is not set - pile may be empty.")
                return

            tmin = self.tmin + tpad

        if tmax is None:
            if self.tmax is None:
                logger.warning("Pile's tmax is not set - pile may be empty.")
                return

            tmax = self.tmax - tpad

        if not self.is_relevant(tmin-tpad, tmax+tpad, group_selector):
            return

        if accessor_id not in self.open_files:
            self.open_files[accessor_id] = set()

        open_files = self.open_files[accessor_id]

        if tinc is None:
            tinc = tmax - tmin
            nwin = 1
        else:
            eps = tinc * 1e-6
            if tinc != 0.0:
                nwin = int(((tmax - eps) - tmin) / tinc) + 1
            else:
                nwin = 1

        for iwin in range(nwin):
            wmin, wmax = tmin+iwin*tinc, min(tmin+(iwin+1)*tinc, tmax)

            chopped, used_files = self.chop(
                wmin-tpad, wmax+tpad, group_selector, trace_selector, snap,
                include_last, load_data)

            for file in used_files - open_files:
                # increment datause counter on newly opened files
                file.use_data()

            open_files.update(used_files)

            processed = self._process_chopped(
                chopped, degap, maxgap, maxlap, want_incomplete, wmax, wmin,
                tpad)

            if style == 'batch':
                yield Batch(
                    tmin=wmin,
                    tmax=wmax,
                    i=iwin,
                    n=nwin,
                    traces=processed)

            else:
                yield processed

            unused_files = open_files - used_files

            while unused_files:
                file = unused_files.pop()
                file.drop_data()
                open_files.remove(file)

        if not keep_current_files_open:
            while open_files:
                file = open_files.pop()
                file.drop_data()

    def all(self, *args, **kwargs):
        '''
        Shortcut to aggregate :py:meth:`chopper` output into a single list.
        '''

        alltraces = []
        for traces in self.chopper(*args, **kwargs):
            alltraces.extend(traces)

        return alltraces

    def iter_all(self, *args, **kwargs):
        for traces in self.chopper(*args, **kwargs):
            for tr in traces:
                yield tr

    def chopper_grouped(self, gather, progress=None, *args, **kwargs):
        keys = self.gather_keys(gather)
        if len(keys) == 0:
            return

        outer_group_selector = None
        if 'group_selector' in kwargs:
            outer_group_selector = kwargs['group_selector']

        outer_trace_selector = None
        if 'trace_selector' in kwargs:
            outer_trace_selector = kwargs['trace_selector']

        # the use of this gather-cache makes it impossible to modify the pile
        # during chopping
        gather_cache = {}
        pbar = None
        try:
            if progress is not None:
                pbar = util.progressbar(progress, len(keys))

            for ikey, key in enumerate(keys):
                def tsel(tr):
                    return gather(tr) == key and (
                        outer_trace_selector is None
                        or outer_trace_selector(tr))

                def gsel(gr):
                    if gr not in gather_cache:
                        gather_cache[gr] = gr.gather_keys(gather)

                    return key in gather_cache[gr] and (
                        outer_group_selector is None
                        or outer_group_selector(gr))

                kwargs['trace_selector'] = tsel
                kwargs['group_selector'] = gsel

                for traces in self.chopper(*args, **kwargs):
                    yield traces

                if pbar:
                    pbar.update(ikey+1)

        finally:
            if pbar:
                pbar.finish()

    def gather_keys(self, gather, selector=None):
        keys = set()
        for subpile in self.subpiles.values():
            keys |= subpile.gather_keys(gather, selector)

        return sorted(keys)
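
    # Illustrative sketch for chopper_grouped() (``p`` is a hypothetical
    # Pile); process all windows station by station:
    #
    #     for traces in p.chopper_grouped(
    #             gather=lambda tr: tr.station, tinc=3600.):
    #         ...  # all traces in this list share one station code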

    def iter_traces(
            self,
            load_data=False,
            return_abspath=False,
            group_selector=None,
            trace_selector=None):

        '''
        Iterate over all traces in pile.

        :param load_data: whether to load the waveform data, by default empty
            traces are yielded
        :param return_abspath: if ``True`` yield tuples containing absolute
            file path and :py:class:`pyrocko.trace.Trace` objects
        :param group_selector: filter callback taking :py:class:`TracesGroup`
            objects
        :param trace_selector: filter callback taking
            :py:class:`pyrocko.trace.Trace` objects

        Example; yields only traces where the station code is 'HH1'::

            test_pile = pile.make_pile('/local/test_trace_directory')
            for t in test_pile.iter_traces(
                    trace_selector=lambda tr: tr.station == 'HH1'):

                print(t)
        '''

        for subpile in self.subpiles.values():
            if not group_selector or group_selector(subpile):
                for tr in subpile.iter_traces(load_data, return_abspath,
                                              group_selector, trace_selector):
                    yield tr

    def iter_files(self):
        for subpile in self.subpiles.values():
            for file in subpile.iter_files():
                yield file

    def reload_modified(self):
        modified = False
        for subpile in self.subpiles.values():
            modified |= subpile.reload_modified()

        return modified

    def get_tmin(self):
        return self.tmin

    def get_tmax(self):
        return self.tmax

    def get_deltatmin(self):
        return self.deltatmin

    def get_deltatmax(self):
        return self.deltatmax

    def is_empty(self):
        return self.tmin is None and self.tmax is None

    def __str__(self):
        if self.tmin is not None and self.tmax is not None:
            tmin = util.time_to_str(self.tmin)
            tmax = util.time_to_str(self.tmax)
            s = 'Pile\n'
            s += 'number of subpiles: %i\n' % len(self.subpiles)
            s += 'timerange: %s - %s\n' % (tmin, tmax)
            s += 'networks: %s\n' % ', '.join(sl(self.networks.keys()))
            s += 'stations: %s\n' % ', '.join(sl(self.stations.keys()))
            s += 'locations: %s\n' % ', '.join(sl(self.locations.keys()))
            s += 'channels: %s\n' % ', '.join(sl(self.channels.keys()))
            s += 'deltats: %s\n' % ', '.join(sl(self.deltats.keys()))

        else:
            s = 'empty Pile'

        return s

    def snuffle(self, **kwargs):
        '''
        Visualize pile contents in the Snuffler GUI.

        :param stations: list of :py:class:`pyrocko.model.station.Station`
            objects or ``None``
        :param events: list of :py:class:`pyrocko.model.event.Event` objects
            or ``None``
        :param markers: list of
            :py:class:`pyrocko.gui.snuffler.marker.Marker` objects or
            ``None``
        :param ntracks: float, number of tracks to be shown initially
            (default: 12)
        :param follow: time interval (in seconds) for real time follow mode
            or ``None``
        :param controls: bool, whether to show the main controls (default:
            ``True``)
        :param opengl: bool, whether to use opengl (default: ``False``)
        '''

        from pyrocko.gui.snuffler.snuffler import snuffle
        snuffle(self, **kwargs)


def make_pile(
        paths=None, selector=None, regex=None,
        fileformat='mseed',
        cachedirname=None, show_progress=True):

    '''
    Create pile from given file and directory names.

    :param paths: filenames and/or directories to look for traces. If paths
        is ``None`` ``sys.argv[1:]`` is used.
    :param selector: lambda expression taking group dict of regex match
        object as a single argument and which returns true or false to keep
        or reject a file
    :param regex: regular expression which filenames have to match
    :param fileformat: format of the files ('mseed', 'sac', 'kan',
        'from_extension', 'detect')
    :param cachedirname: loader cache is stored under this directory. It is
        created as necessary.
    :param show_progress: show progress bar and other progress information
    '''

    if show_progress_force_off:
        show_progress = False

    if isinstance(paths, str):
        paths = [paths]

    if paths is None:
        paths = sys.argv[1:]

    if cachedirname is None:
        cachedirname = config.config().cache_dir

    fns = util.select_files(
        paths, include=regex, selector=selector,
        show_progress=show_progress)

    cache = get_cache(cachedirname)
    p = Pile()
    p.load_files(
        sorted(fns),
        cache=cache,
        fileformat=fileformat,
        show_progress=show_progress)

    return p
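

# A minimal usage sketch (illustrative only; '/data/waveforms' is a
# hypothetical path): build a pile from a directory tree, auto-detecting
# file formats.
def _example_make_pile_usage():
    p = make_pile('/data/waveforms', fileformat='detect')
    print(p)  # summary: time range, stations, channels, sampling intervals
    return p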


class Injector(trace.States):

    def __init__(
            self, pile,
            fixation_length=None,
            path=None,
            format='from_extension',
            forget_fixed=False):

        trace.States.__init__(self)
        self._pile = pile
        self._fixation_length = fixation_length
        self._format = format
        self._path = path
        self._forget_fixed = forget_fixed

    def set_fixation_length(self, length):
        '''
        Set length after which the fixation method is called on buffer
        traces.

        The length should be given in seconds. Give None to disable.
        '''
        self.fixate_all()
        self._fixation_length = length   # in seconds

    def set_save_path(
            self,
            path='dump_%(network)s.%(station)s.%(location)s.%(channel)s_'
                 '%(tmin)s_%(tmax)s.mseed'):

        self.fixate_all()
        self._path = path

    def inject(self, trace):
        logger.debug('Received a trace: %s' % trace)

        buf = self.get(trace)
        if buf is None:
            trbuf = trace.copy()
            buf = MemTracesFile(None, [trbuf])
            self._pile.add_file(buf)
            self.set(trace, buf)

        else:
            self._pile.remove_file(buf)
            trbuf = buf.get_traces()[0]
            buf.remove(trbuf)
            trbuf.append(trace.ydata)
            buf.add(trbuf)
            self._pile.add_file(buf)
            self.set(trace, buf)

        trbuf = buf.get_traces()[0]
        if self._fixation_length is not None:
            if trbuf.tmax - trbuf.tmin > self._fixation_length:
                self._fixate(buf, complete=False)

    def fixate_all(self):
        for state in list(self._states.values()):
            self._fixate(state[-1])

        self._states = {}

    def free(self, buf):
        self._fixate(buf)

    def _fixate(self, buf, complete=True):
        trbuf = buf.get_traces()[0]
        del_state = True
        if self._path:
            if self._fixation_length is not None:
                ttmin = trbuf.tmin
                ytmin = util.year_start(ttmin)
                n = int(math.floor((ttmin - ytmin) / self._fixation_length))
                tmin = ytmin + n*self._fixation_length
                traces = []
                t = tmin
                while t <= trbuf.tmax:
                    try:
                        traces.append(
                            trbuf.chop(
                                t,
                                t+self._fixation_length,
                                inplace=False,
                                snap=(math.ceil, math.ceil)))

                    except trace.NoData:
                        pass
                    t += self._fixation_length

                if abs(traces[-1].tmax - (t - trbuf.deltat)) < \
                        trbuf.deltat/100. or complete:

                    self._pile.remove_file(buf)

                else:
                    # reinsert incomplete last part
                    new_trbuf = traces.pop()
                    self._pile.remove_file(buf)
                    buf.remove(trbuf)
                    buf.add(new_trbuf)
                    self._pile.add_file(buf)
                    del_state = False

            else:
                traces = [trbuf]
                self._pile.remove_file(buf)

            fns = io.save(traces, self._path, format=self._format)

            if not self._forget_fixed:
                self._pile.load_files(
                    fns, show_progress=False, fileformat=self._format)

        if del_state:
            del self._states[trbuf.nslc_id]

    def __del__(self):
        self.fixate_all()
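

# A minimal usage sketch (illustrative only): feed incoming traces into a
# live pile; buffered data is saved ('fixated') once an hour of data per
# channel has accumulated.
def _example_injector_usage(incoming_trace):
    p = Pile()
    injector = Injector(p, fixation_length=3600.)
    injector.set_save_path()  # use the default filename template
    injector.inject(incoming_trace)
    injector.fixate_all()  # flush remaining buffers to disk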