Source code for pyrocko.client.fdsn

# http://pyrocko.org - GPLv3
#
# The Pyrocko Developers, 21st Century
# ---|P------/S----------~Lg----------

'''
Low-level FDSN web service client.

This module provides basic functionality to download station metadata, time
series data and event information from FDSN web services. Password and token
authentication are supported. Query responses are returned as open file-like
objects or can be parsed into Pyrocko's native data structures where
appropriate.

.. _registered-site-names:

Registered site names
.....................

A list of known FDSN site names is maintained within the module for quick
selection by the user. This list currently contains the following sites:

%s

Any other site can be specified by providing its full URL.
'''

import re
import logging
import socket


from pyrocko import util
from pyrocko.util import DownloadError
from pyrocko import config

from pyrocko.util import \
    urlencode, Request, build_opener, HTTPDigestAuthHandler, urlopen, HTTPError


logger = logging.getLogger('pyrocko.client.fdsn')

g_url = '%(site)s/fdsnws/%(service)s/%(majorversion)i/%(method)s'

g_site_abbr = {
    'geofon': 'https://geofon.gfz-potsdam.de',
    'iris': 'http://service.iris.edu',
    'orfeus': 'http://www.orfeus-eu.org',
    'bgr': 'http://eida.bgr.de',
    'emsc': 'http://www.seismicportal.eu',
    'geonet': 'http://service.geonet.org.nz',
    'knmi': 'http://rdsa.knmi.nl',
    'ncedc': 'http://service.ncedc.org',
    'scedc': 'http://service.scedc.caltech.edu',
    'usgs': 'http://earthquake.usgs.gov',
    'koeri': 'http://eida-service.koeri.boun.edu.tr',
    'ethz': 'http://eida.ethz.ch',
    'icgc': 'http://ws.icgc.cat',
    'ipgp': 'http://eida.ipgp.fr',
    'ingv': 'http://webservices.ingv.it',
    'isc': 'http://www.isc.ac.uk',
    'lmu': 'http://erde.geophysik.uni-muenchen.de',
    'noa': 'http://eida.gein.noa.gr',
    'resif': 'http://ws.resif.fr',
    'usp': 'http://seisrequest.iag.usp.br',
    'niep': 'http://eida-sc3.infp.ro'
}

g_default_site = 'geofon'


g_default_query_args = {
    'station': {
        'starttime', 'endtime', 'startbefore', 'startafter', 'endbefore',
        'endafter', 'network', 'station', 'location', 'channel', 'minlatitude',
        'maxlatitude', 'minlongitude', 'maxlongitude', 'latitude', 'longitude',
        'minradius', 'maxradius', 'level', 'includerestricted',
        'includeavailability', 'updatedafter', 'matchtimeseries', 'format',
        'nodata'},
    'dataselect': {
        'starttime', 'endtime', 'network', 'station', 'location', 'channel',
        'quality', 'minimumlength', 'longestonly', 'format', 'nodata'},
    'event': {
        'starttime', 'endtime', 'minlatitude', 'maxlatitude', 'minlongitude',
        'maxlongitude', 'latitude', 'longitude', 'minradius', 'maxradius',
        'mindepth', 'maxdepth', 'minmagnitude', 'maxmagnitude', 'eventtype',
        'includeallorigins', 'includeallmagnitudes', 'includearrivals',
        'eventid', 'limit', 'offset', 'orderby', 'catalog', 'contributor',
        'updatedafter', 'format', 'nodata'},
    'availability': {
        'starttime', 'endtime', 'network', 'station', 'location', 'channel',
        'quality', 'merge', 'orderby', 'limit', 'includerestricted', 'format',
        'nodata', 'mergegaps', 'show'}}


def doc_escape_slist(li):
    return ', '.join("``'%s'``" % s for s in li)


def doc_table_dict(d, khead, vhead, indent=''):
    keys, vals = zip(*sorted(d.items()))

    lk = max(max(len(k) for k in keys), len(khead))
    lv = max(max(len(v) for v in vals), len(vhead))

    hr = '=' * lk + ' ' + '=' * lv

    lines = [
        hr,
        '%s %s' % (khead.ljust(lk), vhead.ljust(lv)),
        hr]

    for k, v in zip(keys, vals):
        lines.append('%s %s' % (k.ljust(lk), v.ljust(lv)))

    lines.append(hr)
    return '\n'.join(indent + line for line in lines)


def strip_html(s):
    s = s.decode('utf-8')
    s = re.sub(r'<[^>]+>', '', s)
    s = re.sub(r'\r', '', s)
    s = re.sub(r'\s*\n', '\n', s)
    return s


def indent(s, ind='  '):
    return '\n'.join(ind + line for line in s.splitlines())


[docs]def get_sites(): ''' Get sorted list of registered site names. ''' return sorted(g_site_abbr.keys())
if config.config().fdsn_timeout is None: g_timeout = 20. else: g_timeout = config.config().fdsn_timeout re_realm_from_auth_header = re.compile(r'(realm)\s*[:=]\s*"([^"]*)"?')
[docs]class CannotGetRealmFromAuthHeader(DownloadError): ''' Raised when failing to parse server response during authentication. ''' pass
[docs]class CannotGetCredentialsFromAuthRequest(DownloadError): ''' Raised when failing to parse server response during token authentication. ''' pass
def get_realm_from_auth_header(headers): realm = dict(re_realm_from_auth_header.findall( headers['WWW-Authenticate'])).get('realm', None) if realm is None: raise CannotGetRealmFromAuthHeader('headers=%s' % str(headers)) return realm def sdatetime(t): return util.time_to_str(t, format='%Y-%m-%dT%H:%M:%S')
[docs]class EmptyResult(DownloadError): ''' Raised when an empty server response is retrieved. ''' def __init__(self, url): DownloadError.__init__(self) self._url = url def __str__(self): return 'No results for request %s' % self._url
[docs]class RequestEntityTooLarge(DownloadError): ''' Raised when the server indicates that too much data was requested. ''' def __init__(self, url): DownloadError.__init__(self) self._url = url def __str__(self): return 'Request entity too large: %s' % self._url
[docs]class InvalidRequest(DownloadError): ''' Raised when an invalid request would be sent / has been sent. ''' pass
[docs]class Timeout(DownloadError): ''' Raised when the server does not respond within the allowed timeout period. ''' pass
def _request( url, post=False, user=None, passwd=None, timeout=None, **kwargs): if timeout is None: timeout = g_timeout url_values = urlencode(kwargs) if url_values: url += '?' + url_values logger.debug('Accessing URL %s' % url) url_args = { 'timeout': timeout } if util.g_ssl_context: url_args['context'] = util.g_ssl_context opener = None req = Request(url) if post: if isinstance(post, str): post = post.encode('utf8') logger.debug('POST data: \n%s' % post.decode('utf8')) req.data = post req.add_header('Accept', '*/*') itry = 0 while True: itry += 1 try: urlopen_ = opener.open if opener else urlopen try: resp = urlopen_(req, **url_args) except TypeError: # context and cafile not avail before 3.4.3, 2.7.9 url_args.pop('context', None) url_args.pop('cafile', None) resp = urlopen_(req, **url_args) logger.debug('Response: %s' % resp.getcode()) if resp.getcode() == 204: raise EmptyResult(url) return resp except HTTPError as e: if e.code == 413: raise RequestEntityTooLarge(url) elif e.code == 401: headers = getattr(e, 'headers', e.hdrs) realm = get_realm_from_auth_header(headers) if itry == 1 and user is not None: auth_handler = HTTPDigestAuthHandler() auth_handler.add_password( realm=realm, uri=url, user=user, passwd=passwd or '') opener = build_opener(auth_handler) continue else: raise DownloadError( 'Authentication failed for realm "%s" when accessing ' 'url "%s". Original error was: %s' % ( realm, url, str(e))) else: raise DownloadError( 'Error content returned by server (HTML stripped):\n%s\n' ' Original error was: %s' % ( indent( strip_html(e.read()), ' ! '), str(e))) except socket.timeout: raise Timeout( 'Timeout error. No response received within %i s. You ' 'may want to retry with a longer timeout setting.' % timeout) break def fillurl(service, site, url, majorversion, method): return url % dict( site=g_site_abbr.get(site, site), service=service, majorversion=majorversion, method=method) def fix_params(d): params = dict(d) for k in ['starttime', 'endtime', 'startbefore', 'startafter', 'endbefore', 'endafter', 'updatedafter']: if k in params: params[k] = sdatetime(params[k]) if params.get('location', None) == '': params['location'] = '--' for k in params: if isinstance(params[k], bool): params[k] = ['false', 'true'][bool(params[k])] return params def make_data_selection( stations, tmin, tmax, channel_prio=[['BHZ', 'HHZ'], ['BH1', 'BHN', 'HH1', 'HHN'], ['BH2', 'BHE', 'HH2', 'HHE']]): selection = [] for station in stations: wanted = [] for group in channel_prio: gchannels = [] for channel in station.get_channels(): if channel.name in group: gchannels.append(channel) if gchannels: gchannels.sort(key=lambda a: group.index(a.name)) wanted.append(gchannels[0]) if wanted: for channel in wanted: selection.append((station.network, station.station, station.location, channel.name, tmin, tmax)) return selection
[docs]def station( site=g_default_site, url=g_url, majorversion=1, timeout=None, check=True, selection=None, parsed=True, **kwargs): ''' Query FDSN web service for station metadata. :param site: :ref:`Registered site name <registered-site-names>` or full base URL of the service (e.g. ``'https://geofon.gfz-potsdam.de'``). :type site: str :param url: URL template (default should work in 99% of cases). :type url: str :param majorversion: Major version of the service to query (always ``1`` at the time of writing). :type majorversion: int :param timeout: Network timeout in [s]. Global default timeout can be configured in Pyrocko's configuration file under ``fdsn_timeout``. :type timeout: float :param check: If ``True`` arguments are checked against self-description (WADL) of the queried web service if available or FDSN specification. :type check: bool :param selection: If given, selection to be queried as a list of tuples ``(network, station, location, channel, tmin, tmax)``. Useful for detailed queries. :type selection: :py:class:`list` of :py:class:`tuple` :param parsed: If ``True`` parse received content into :py:class:`~pyrocko.io.stationxml.FDSNStationXML` object, otherwise return open file handle to raw data stream. :type parsed: bool :param \\*\\*kwargs: Parameters passed to the server (see `FDSN web services specification <https://www.fdsn.org/webservices>`_). :returns: See description of ``parsed`` argument above. :raises: On failure, :py:exc:`~pyrocko.util.DownloadError` or one of its sub-types defined in the :py:mod:`~pyrocko.client.fdsn` module is raised. ''' service = 'station' if check: check_params(service, site, url, majorversion, timeout, **kwargs) params = fix_params(kwargs) url = fillurl(service, site, url, majorversion, 'query') if selection: lst = [] for k, v in params.items(): lst.append('%s=%s' % (k, v)) for (network, station, location, channel, tmin, tmax) in selection: if location == '': location = '--' lst.append(' '.join((network, station, location, channel, sdatetime(tmin), sdatetime(tmax)))) post = '\n'.join(lst) params = dict(post=post.encode()) if parsed: from pyrocko.io import stationxml format = kwargs.get('format', 'xml') if format == 'text': if kwargs.get('level', 'station') == 'channel': return stationxml.load_channel_table( stream=_request(url, timeout=timeout, **params)) else: raise InvalidRequest('if format="text" shall be parsed, ' 'level="channel" is required') elif format == 'xml': return stationxml.load_xml( stream=_request(url, timeout=timeout, **params)) else: raise InvalidRequest('format must be "xml" or "text"') else: return _request(url, timeout=timeout, **params)
def get_auth_credentials(service, site, url, majorversion, token, timeout): url = fillurl(service, site, url, majorversion, 'auth') f = _request(url, timeout=timeout, post=token) s = f.read().decode() try: user, passwd = s.strip().split(':') except ValueError: raise CannotGetCredentialsFromAuthRequest('data="%s"' % s) return user, passwd
[docs]def dataselect( site=g_default_site, url=g_url, majorversion=1, timeout=None, check=True, user=None, passwd=None, token=None, selection=None, **kwargs): ''' Query FDSN web service for time series data in miniSEED format. :param site: :ref:`Registered site name <registered-site-names>` or full base URL of the service (e.g. ``'https://geofon.gfz-potsdam.de'``). :type site: str :param url: URL template (default should work in 99% of cases). :type url: str :param majorversion: Major version of the service to query (always ``1`` at the time of writing). :type majorversion: int :param timeout: Network timeout in [s]. Global default timeout can be configured in Pyrocko's configuration file under ``fdsn_timeout``. :type timeout: float :param check: If ``True`` arguments are checked against self-description (WADL) of the queried web service if available or FDSN specification. :type check: bool :param user: User name for user/password authentication. :type user: str :param passwd: Password for user/password authentication. :type passwd: str :param token: Token for `token authentication <https://geofon.gfz-potsdam.de/waveform/archive/auth/auth-overview.php>`_. :type token: str :param selection: If given, selection to be queried as a list of tuples ``(network, station, location, channel, tmin, tmax)``. :type selection: :py:class:`list` of :py:class:`tuple` :param \\*\\*kwargs: Parameters passed to the server (see `FDSN web services specification <https://www.fdsn.org/webservices>`_). :returns: Open file-like object providing raw miniSEED data. ''' service = 'dataselect' if user or token: method = 'queryauth' else: method = 'query' if token is not None: user, passwd = get_auth_credentials( service, site, url, majorversion, token, timeout) if check: check_params(service, site, url, majorversion, timeout, **kwargs) params = fix_params(kwargs) url = fillurl(service, site, url, majorversion, method) if selection: lst = [] for k, v in params.items(): lst.append('%s=%s' % (k, v)) for (network, station, location, channel, tmin, tmax) in selection: if location == '': location = '--' lst.append(' '.join((network, station, location, channel, sdatetime(tmin), sdatetime(tmax)))) post = '\n'.join(lst) return _request( url, user=user, passwd=passwd, post=post.encode(), timeout=timeout) else: return _request( url, user=user, passwd=passwd, timeout=timeout, **params)
[docs]def event( site=g_default_site, url=g_url, majorversion=1, timeout=None, check=True, user=None, passwd=None, token=None, parsed=False, **kwargs): ''' Query FDSN web service for events. :param site: :ref:`Registered site name <registered-site-names>` or full base URL of the service (e.g. ``'https://geofon.gfz-potsdam.de'``). :type site: str :param url: URL template (default should work in 99% of cases). :type url: str :param majorversion: Major version of the service to query (always ``1`` at the time of writing). :type majorversion: int :param timeout: Network timeout in [s]. Global default timeout can be configured in Pyrocko's configuration file under ``fdsn_timeout``. :type timeout: float :param check: If ``True`` arguments are checked against self-description (WADL) of the queried web service if available or FDSN specification. :type check: bool :param user: User name for user/password authentication. :type user: str :param passwd: Password for user/password authentication. :type passwd: str :param token: Token for `token authentication <https://geofon.gfz-potsdam.de/waveform/archive/auth/auth-overview.php>`_. :type token: str :param parsed: If ``True`` parse received content into :py:class:`~pyrocko.io.quakeml.QuakeML` object, otherwise return open file handle to raw data stream. Note: by default unparsed data is retrieved, differently from the default behaviour of :py:func:`station` (for backward compatibility). :type parsed: bool :param \\*\\*kwargs: Parameters passed to the server (see `FDSN web services specification <https://www.fdsn.org/webservices>`_). :returns: See description of ``parsed`` argument above. ''' service = 'event' if user or token: method = 'queryauth' else: method = 'query' if token is not None: user, passwd = get_auth_credentials( service, site, url, majorversion, token, timeout) if check: check_params(service, site, url, majorversion, timeout, **kwargs) params = fix_params(kwargs) url = fillurl(service, site, url, majorversion, method) fh = _request(url, user=user, passwd=passwd, timeout=timeout, **params) if parsed: from pyrocko.io import quakeml format = kwargs.get('format', 'xml') if format != 'xml': raise InvalidRequest( 'If parsed=True is selected, format="xml" must be selected.') return quakeml.QuakeML.load_xml(stream=fh) else: return fh
[docs]def availability( method='query', site=g_default_site, url=g_url, majorversion=1, timeout=None, check=True, user=None, passwd=None, token=None, selection=None, **kwargs): ''' Query FDSN web service for time series data availablity. :param method: Availablility method to call: ``'query'``, or ``'extent'``. :param site: :ref:`Registered site name <registered-site-names>` or full base URL of the service (e.g. ``'https://geofon.gfz-potsdam.de'``). :type site: str :param url: URL template (default should work in 99% of cases). :type url: str :param majorversion: Major version of the service to query (always ``1`` at the time of writing). :type majorversion: int :param timeout: Network timeout in [s]. Global default timeout can be configured in Pyrocko's configuration file under ``fdsn_timeout``. :type timeout: float :param check: If ``True`` arguments are checked against self-description (WADL) of the queried web service if available or FDSN specification. :type check: bool :param user: User name for user/password authentication. :type user: str :param passwd: Password for user/password authentication. :type passwd: str :param token: Token for `token authentication <https://geofon.gfz-potsdam.de/waveform/archive/auth/auth-overview.php>`_. :type token: str :param selection: If given, selection to be queried as a list of tuples ``(network, station, location, channel, tmin, tmax)``. :type selection: :py:class:`list` of :py:class:`tuple` :param \\*\\*kwargs: Parameters passed to the server (see `FDSN web services specification <https://www.fdsn.org/webservices>`_). :returns: Open file-like object providing raw response. ''' service = 'availability' assert method in ('query', 'extent') if user or token: method += 'auth' if token is not None: user, passwd = get_auth_credentials( service, site, url, majorversion, token, timeout) if check: check_params(service, site, url, majorversion, timeout, **kwargs) params = fix_params(kwargs) url = fillurl(service, site, url, majorversion, method) if selection: lst = [] for k, v in params.items(): lst.append('%s=%s' % (k, v)) for (network, station, location, channel, tmin, tmax) in selection: if location == '': location = '--' lst.append(' '.join((network, station, location, channel, sdatetime(tmin), sdatetime(tmax)))) post = '\n'.join(lst) return _request( url, user=user, passwd=passwd, post=post.encode(), timeout=timeout) else: return _request( url, user=user, passwd=passwd, timeout=timeout, **params)
[docs]def check_params( service, site=g_default_site, url=g_url, majorversion=1, timeout=None, method='query', **kwargs): ''' Check query parameters against self-description of web service. Downloads WADL description of the given service and site and checks parameters if they are available. Queried WADLs are cached in memory. :param service: ``'station'``, ``'dataselect'``, ``'event'`` or ``'availability'``. :param site: :ref:`Registered site name <registered-site-names>` or full base URL of the service (e.g. ``'https://geofon.gfz-potsdam.de'``). :type site: str :param url: URL template (default should work in 99% of cases). :type url: str :param majorversion: Major version of the service to query (always ``1`` at the time of writing). :type majorversion: int :param timeout: Network timeout in [s]. Global default timeout can be configured in Pyrocko's configuration file under ``fdsn_timeout``. :type timeout: float :param \\*\\*kwargs: Parameters that would be passed to the server (see `FDSN web services specification <https://www.fdsn.org/webservices>`_). :raises: :py:exc:`ValueError` is raised if unsupported parameters are encountered. ''' avail = supported_params_wadl( service, site, url, majorversion, timeout, method) unavail = sorted(set(kwargs.keys()) - avail) if unavail: raise ValueError( 'Unsupported parameter%s for service "%s" at site "%s": %s' % ( '' if len(unavail) == 1 else 's', service, site, ', '.join(unavail)))
[docs]def supported_params_wadl( service, site=g_default_site, url=g_url, majorversion=1, timeout=None, method='query'): ''' Get query parameter names supported by a given FDSN site and service. If no WADL is provided by the queried service, default parameter sets from the FDSN standard are returned. Queried WADLs are cached in memory. :param service: ``'station'``, ``'dataselect'``, ``'event'`` or ``'availability'``. :param site: :ref:`Registered site name <registered-site-names>` or full base URL of the service (e.g. ``'https://geofon.gfz-potsdam.de'``). :type site: str :param url: URL template (default should work in 99% of cases). :type url: str :param majorversion: Major version of the service to query (always ``1`` at the time of writing). :type majorversion: int :param timeout: Network timeout in [s]. Global default timeout can be configured in Pyrocko's configuration file under ``fdsn_timeout``. :type timeout: float :returns: Supported parameter names. :rtype: :py:class:`set` of :py:class:`str` ''' wadl = cached_wadl(service, site, url, majorversion, timeout) if wadl: url = fillurl(service, site, url, majorversion, method) return set(wadl.supported_param_names(url)) else: return g_default_query_args[service]
def patch_geonet_wadl(wadl): for r in wadl.resources_list: r.base = r.base.replace('1/station', 'station/1') r.base = r.base.replace('1/dataselect', 'dataselect/1') r.base = r.base.replace('1/event', 'event/1')
[docs]def wadl( service, site=g_default_site, url=g_url, majorversion=1, timeout=None): ''' Retrieve self-description of a specific FDSN service. :param service: ``'station'``, ``'dataselect'``, ``'event'`` or ``'availability'``. :param site: :ref:`Registered site name <registered-site-names>` or full base URL of the service (e.g. ``'https://geofon.gfz-potsdam.de'``). :type site: str :param url: URL template (default should work in 99% of cases). :type url: str :param majorversion: Major version of the service to query (always ``1`` at the time of writing). :type majorversion: int :param timeout: Network timeout in [s]. Global default timeout can be configured in Pyrocko's configuration file under ``fdsn_timeout``. :type timeout: float ''' from pyrocko.client.wadl import load_xml url = fillurl(service, site, url, majorversion, 'application.wadl') wadl = load_xml(stream=_request(url, timeout=timeout)) if site == 'geonet' or site.find('geonet.org.nz') != -1: patch_geonet_wadl(wadl) return wadl
g_wadls = {}
[docs]def cached_wadl( service, site=g_default_site, url=g_url, majorversion=1, timeout=None): ''' Get self-description of a specific FDSN service. Same as :py:func:`wadl`, but results are cached in memory. ''' k = (service, site, url, majorversion) if k not in g_wadls: try: g_wadls[k] = wadl(service, site, url, majorversion, timeout) except Timeout: raise except DownloadError: logger.info( 'No service description (WADL) found for "%s" at site "%s".' % (service, site)) g_wadls[k] = None return g_wadls[k]
__doc__ %= doc_table_dict(g_site_abbr, 'Site name', 'URL', ' ') if __name__ == '__main__': import sys util.setup_logging('pyrocko.client.fdsn', 'info') if len(sys.argv) == 1: sites = get_sites() else: sites = sys.argv[1:] for site in sites: print('=== %s (%s) ===' % (site, g_site_abbr[site])) for service in ['station', 'dataselect', 'event']: try: app = wadl(service, site=site, timeout=2.0) print(indent(str(app))) except Timeout as e: logger.error(str(e)) print('%s: timeout' % (service,)) except util.DownloadError as e: logger.error(str(e)) print('%s: no wadl' % (service,))