# Source code for polyfemos.data.outlierremover

# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
# This file is part of Polyfemos.
#
# Polyfemos is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or any later version.
#
# Polyfemos is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License and
# GNU General Public License along with Polyfemos. If not, see
# <https://www.gnu.org/licenses/>.
#
# Author: Henrik Jänkävaara
# -----------------------------------------------------------------------------
"""
A function collection to remove outliers from the data

With the default parameter values and a (242820 x 2) data set, the
execution times of a single function call were

+------------+--------+------+
| STALTA     | 1.43 s | 100% |
+------------+--------+------+
| DTR        | 1.19 s |  84% |
+------------+--------+------+
| Lipschitz  | 0.58 s |  41% |
+------------+--------+------+

:copyright:
    2019, University of Oulu, Sodankyla Geophysical Observatory
:license:
    GNU Lesser General Public License v3.0 or later
    (https://spdx.org/licenses/LGPL-3.0-or-later.html)
"""
# This module should be as generic as possible.
# Do not import anything polyfemos related.
import math

import numpy as np

from sklearn.tree import DecisionTreeRegressor
from obspy.signal.trigger import classic_sta_lta, trigger_onset


def _get_mask(b, N, indices, nanindices=None):
    """
    Helper function to form masks

    The mask is a float array of length ``N``. If ``b`` is truthy, the mask
    is initialised with zeros and the entries at ``indices`` are set to
    ``1.0``. If ``b`` is falsy, the mask is initialised with ones and the
    entries at ``indices`` are set to ``0.0``. Entries at ``nanindices`` are
    set to NaN *before* ``indices`` are applied, so an index listed in both
    ends up with the ``indices`` value.

    :type b: bool
    :param b: selects the value written at ``indices``; the rest of the
        mask is filled with the opposite value.
    :type N: int
    :param N: the length of the mask
    :type indices: :class:`~numpy.ndarray`
    :param indices: positions which are set to ``float(b)``
    :type nanindices: :class:`~numpy.ndarray`
    :param nanindices: positions which are set to NaN,
        defaults to ``None`` (no NaN entries)
    :rtype: :class:`~numpy.ndarray`
    :return: float mask array containing the values 0.0, 1.0 and NaN
    """
    b = bool(b)
    # The builtin ``float`` is used instead of the ``np.float`` alias, which
    # no longer exists in recent NumPy releases.
    mask = (np.zeros if b else np.ones)(N, dtype=float)
    if nanindices is not None:
        mask[nanindices] = float('nan')
    mask[indices] = b
    return mask
def dtr(data, maxdepth=0, scale=24000, medlim=10, **kwargs):
    """
    A function to remove outliers using
    `Decision Tree <https://en.wikipedia.org/wiki/Decision_tree_learning>`_.

    The given ``data`` is approximated using
    :class:`~sklearn.tree.DecisionTreeRegressor` decision tree.
    The median of the absolute error between the data and the approximation
    is calculated. If the error between a datapoint and an approximated
    value is not less than ``medlim`` times the median, the datapoint is
    excluded.

    ``scale`` is used to select ``maxdepth`` according to the datalen N
    (NaN rows included) with
    ``max(1, floor(2 * log10(10 * N / scale)))``, i.e.
    ``maxdepth`` = 2 from N = ``scale``, ``maxdepth`` = 4 from
    N = 10 * ``scale``, and so forth.
    If a positive ``maxdepth`` is given, ``scale`` is ignored.

    Rows with a NaN y-value are left out of the fit and are marked with NaN
    in the returned mask. If there are no non-NaN rows, nothing is fitted
    and a mask without any accepted datapoints is returned.

    .. note::
        The comparison against the limit is strict. If the median error is
        zero (the tree reproduces at least half of the datapoints exactly),
        no datapoint passes and every non-NaN entry of the mask is 0.

    :type data: :class:`~numpy.ndarray`
    :param data: x-y data in Nx2 array, shape (N, 2)
    :type maxdepth: int
    :param maxdepth: The maximum depth of the tree. Values <= 0 select the
        depth automatically using ``scale``.
    :type scale: float
    :param scale: datalen scale used in the automatic ``maxdepth`` selection
    :type medlim: float
    :param medlim: multiplier of the median error, defines the error limit
    :param kwargs: accepted and ignored
    :rtype: :class:`~numpy.ndarray`
    :return: float mask array, 1.0 for accepted datapoints, 0.0 for
        outliers and NaN for datapoints with NaN y-value
    """
    orig_N = data.shape[0]

    nanbools = np.isnan(data[:, 1])
    nanindices = np.where(nanbools)[0]
    # Original row number of every non-NaN row. Positions in the filtered
    # data are translated back to positions in the original data with it.
    validindices = np.where(~nanbools)[0]

    if validindices.size == 0:
        # Nothing to fit, only the NaN markers are set.
        return _get_mask(True, orig_N, validindices, nanindices=nanindices)

    if maxdepth <= 0:
        maxdepth = int(
            max([1., np.floor(2 * np.log10(10 * orig_N / scale))]))

    data = data[validindices]
    x = data[:, 0]
    y = data[:, 1].ravel()
    # The regressor is fitted with the x-values as a single feature column.
    X = x.reshape(data.shape[0], 1)

    regr = DecisionTreeRegressor(max_depth=maxdepth)
    regr.fit(X, y)
    y_pred = regr.predict(X)

    abserr = np.abs(np.subtract(y, y_pred))
    limit = np.abs(medlim * np.median(abserr))

    # Accepted positions, expressed as row numbers of the original data.
    xi = validindices[abserr < limit]

    return _get_mask(True, orig_N, xi, nanindices=nanindices)
def lipschitz(data, itern=1, klim=7e-6, **kwargs):
    """
    A function to remove outliers based on
    `Lipschitz continuity
    <https://en.wikipedia.org/wiki/Lipschitz_continuity>`_.

    Calculates the change (slope, K) in y=f(x) function between
    two datapoints.

    .. code-block:: text

        K = |f(x1) - f(x0)| / |x1 - x0|

    Datapoints which cause a slope too steep, are removed.

    Every datapoint, which is not itself removed or NaN, is used as a
    reference point (x0) and compared with the following datapoints (x1).
    Each consecutive rejected or NaN datapoint extends the comparison
    window by one. The extension is reset by the next accepted datapoint.
    If x0 and x1 are practically equal, the slope is regarded as zero.

    Datapoints with NaN y-value are never used as a reference point and are
    marked with NaN in the returned mask.

    :type data: :class:`~numpy.ndarray`
    :param data: x-y data in Nx2 array, shape (N, 2)
    :type itern: int
    :param itern: The maximum interval between the datapoints x0 and x1,
        values below 1 are treated as 1.
        Complexity = N * ``itern``
    :type klim: float
    :param klim: the maximum slope allowed
    :param kwargs: accepted and ignored
    :rtype: :class:`~numpy.ndarray`
    :return: float mask array, 1.0 for accepted datapoints, 0.0 for
        outliers and NaN for datapoints with NaN y-value
    """
    # Datapoints i+1 ... i+window-1 are compared with the reference point i.
    window = max(itern, 1) + 1
    N = data.shape[0]

    remindices = set()
    nanindices = set()

    for i in range(N):

        if i in remindices or i in nanindices:
            continue

        x0 = data[i, 0]
        y0 = data[i, 1]

        # A NaN reference point, which has not been seen as a neighbour of
        # an earlier reference point (the first row), would make every
        # slope NaN and would pass every comparison. It is marked as NaN
        # and skipped instead.
        if math.isnan(y0):
            nanindices.add(i)
            continue

        # Number of consecutive rejected/NaN datapoints just before j.
        remc = 0
        j = i + 1

        while j < min(i + window + remc, N):

            x1 = data[j, 0]
            y1 = data[j, 1]

            if math.isnan(y1):
                nanindices.add(j)
                invalid_value = True
            else:
                dx = abs(x1 - x0)
                # Division by a (practically) zero interval is avoided.
                K = 0 if dx < 99e-99 else abs(y1 - y0) / dx
                invalid_value = K > klim
                if invalid_value:
                    remindices.add(j)

            remc = remc + 1 if invalid_value else 0
            j += 1

    return _get_mask(
        False, N, list(remindices), nanindices=list(nanindices))
def stalta(data, nsta=3, nlta=10, threson=1.08, thresoff=1.05,
           offset=40, **kwargs):
    """
    Utilises :func:`~obspy.signal.trigger.classic_sta_lta`
    to remove outliers

    The characteristic function is calculated from the non-NaN y-values and
    the on/off triggers are selected with
    :func:`~obspy.signal.trigger.trigger_onset`. Every datapoint between
    an on trigger and the corresponding off trigger, widened with
    ``offset`` samples in both directions, is excluded.

    Rows with a NaN y-value are left out of the calculation and are marked
    with NaN in the returned mask. If there are no non-NaN rows, nothing is
    calculated and a mask without any accepted datapoints is returned.

    :type data: :class:`~numpy.ndarray`
    :param data: x-y data in Nx2 array, shape (N, 2)
    :type nsta: int
    :param nsta: Length of short time average window in samples
    :type nlta: int
    :param nlta: Length of long time average window in samples
    :type threson: float
    :param threson: Value above which trigger (of characteristic function)
        is activated (higher threshold)
    :type thresoff: float
    :param thresoff: Value below which trigger (of characteristic function)
        is deactivated (lower threshold)
    :type offset: int
    :param offset: in samples, how many additional samples are removed
        before on trigger and after off trigger
    :param kwargs: accepted and ignored
    :rtype: :class:`~numpy.ndarray`
    :return: float mask array, 1.0 for accepted datapoints, 0.0 for
        outliers and NaN for datapoints with NaN y-value
    """
    orig_N = data.shape[0]

    nanbools = np.isnan(data[:, 1])
    nanindices = np.where(nanbools)[0]
    # Original row number of every non-NaN row. Positions in the filtered
    # data are translated back to positions in the original data with it.
    validindices = np.where(~nanbools)[0]

    if validindices.size == 0:
        # Nothing to analyse, only the NaN markers are set.
        return _get_mask(True, orig_N, validindices, nanindices=nanindices)

    data = data[validindices]

    cft = classic_sta_lta(data[:, 1], nsta, nlta)
    trigger_onoff = trigger_onset(cft, threson, thresoff)

    # keep[k] stays True, if the sample k is outside of every widened
    # trigger window [on - offset, off + offset].
    keep = np.ones(data.shape[0], dtype=bool)
    for to in trigger_onoff:
        # Both bounds are clamped to zero, so that a negative bound is not
        # interpreted as an index counted from the end of the array.
        first = max(int(math.ceil(to[0] - offset)), 0)
        stop = max(int(math.floor(to[1] + offset)) + 1, 0)
        keep[first:stop] = False

    # Accepted positions, expressed as row numbers of the original data.
    xi = validindices[keep]

    return _get_mask(True, orig_N, xi, nanindices=nanindices)