fosanalysis
A framework to evaluate distributed fiber optic sensor data
Loading...
Searching...
No Matches
masking.py
Go to the documentation of this file.
2r"""
3Contains class implementations to remove implausible values from strain data.
4This can be used to remove strain reading anomalies (SRAs) from the data.
5
6\author Bertram Richter
7\date 2022
8"""
9
10from abc import abstractmethod
11import copy
12
13import numpy as np
14
15from fosanalysis.utils import misc, windows
16from . import base
17from . import filtering
18
20 r"""
21 Abstract class for anomaly identification.
22 Strain reading anomalies (SRAs) are implausible data points.
23 SRAs are replaced by `NaN` values, effectively marking them as dropouts.
24 """
25 def run(self,
26 x: np.array,
27 y: np.array,
28 z: np.array,
29 make_copy: bool = True,
30 timespace: str = None,
31 identify_only: bool = False,
32 *args, **kwargs) -> np.array:
33 r"""
34 Mask strain reading anomalies with `NaN`s.
35 The strain data is replaced by `NaN` for all entries in the returned array being `True`.
36
37 \param identify_only If set to true, the array contains boolean
38 values, indicating a SRA by `True` and a valid entry by `False`.
39
40 \copydetails preprocessing.base.Task.run()
41 """
42 SRA_array = np.logical_not(np.isfinite(z))
43 z = copy.deepcopy(z)
44 x, y, SRA_array = super().run(x, y, z,
45 SRA_array=SRA_array,
46 make_copy=make_copy,
47 timespace=timespace,
48 *args, **kwargs)
49 if identify_only:
50 z = SRA_array
51 else:
52 z[SRA_array] = float("nan")
53 return x, y, z
54 @abstractmethod
55 def _run_1d(self,
56 x: np.array,
57 z: np.array,
58 SRA_array: np.array,
59 *args, **kwargs) -> tuple:
60 r"""
61 Estimate, which entries are strain reading anomalies, in 1D.
62 \copydetails preprocessing.base.Task._run_1d()
63 \param SRA_array Array of boolean values indicating SRAs by `True` and a valid entries by `False`.
64 This function returns the `SRA_array` instead of the `z` array.
65 """
66 return x, SRA_array
67 @abstractmethod
68 def _run_2d(self,
69 x: np.array,
70 y: np.array,
71 z: np.array,
72 SRA_array: np.array,
73 *args, **kwargs) -> tuple:
74 r"""
75 \copydoc preprocessing.base.Task._run_2d()
76 \param SRA_array Array of boolean values indicating SRAs by `True` and a valid entries by `False`.
77 This function returns the `SRA_array` instead of the `z` array.
78 """
79 return x, y, SRA_array
80 def _map_2d(self,
81 x: np.array,
82 y: np.array,
83 z: np.array,
84 SRA_array: np.array,
85 timespace: str = None,
86 *args, **kwargs) -> tuple:
87 r"""
88 Estimate, which entries are strain reading anomalies, in 2D.
89 \copydoc preprocessing.base.Task._map_2d()
90 \param SRA_array Array of boolean values indicating SRAs by `True` and a valid entries by `False`.
91 This function returns the `SRA_array` instead of the `z` array.
92 """
93 timespace = timespace if timespace is not None else self.timespace
94 if self.timespace.lower() == "1d_space":
95 for row_id, (row, SRA_row) in enumerate(zip(z, SRA_array)):
96 x, SRA_array[row_id] = self._run_1d(x, row, SRA_array=SRA_row, *args, **kwargs)
97 elif self.timespace.lower() == "1d_time":
98 for col_id, (column, SRA_column) in enumerate(zip(z.T, SRA_array.T)):
99 y, SRA_array.T[col_id] = self._run_1d(y, column, SRA_array=SRA_column, *args, **kwargs)
100 return x, y, SRA_array
101
103 r"""
104 The geometric threshold method (GTM) identifies SRAs by comparing the strain increments to a threshold.
105 The implementation is improved upon the algorithm presentend in \cite Bado_2021_Postprocessingalgorithmsfor.
106 Each entry is compared to the most recent entry accepted as plausible.
107 If the strain increment \f$\Delta_{\mathrm{max}}\f$ is exceeded,
108 the entry of the current index will be converted to a dropout by setting it to `NaN`.
109 Dropouts creating a geometrical distance greater than \f$\Delta_{\mathrm{max}}\f$,
110 would result in contigous dropout fields of extended length.
111 To avoid those dopout field, the current entry is additionally compared to its next finite neighbors.
112 The neighbor comparison uses a variable for the range,
113 which allows the user to set amount of neighbors to be compared.
114 Additional fine tuning can be performed by setting a percentage tolerance,
115 which allows a fraction of total neighbor comparisons to exceed \f$\Delta_{\mathrm{max}}\f$.
116 """
117 def __init__(self,
118 delta_max: float = 400.0,
119 forward_comparison_range: int = 1,
120 tolerance: float = 0.0,
121 to_left: bool = False,
122 activate_reverse_sweep: bool = True,
123 *args, **kwargs):
124 r"""
125 Construct a GTM object.
126 \param delta_max \copybrief delta_max For more, see \ref delta_max.
127 \param forward_comparison_range \copybrief forward_comparison_range For more, see \ref forward_comparison_range.
128 \param tolerance \copybrief tolerance For more, see \ref tolerance.
129 \param to_left \copybrief to_left For more, see \ref to_left.
130 \param activate_reverse_sweep \copybrief activate_reverse_sweep For more, see \ref activate_reverse_sweep.
131 \param *args Additional positional arguments, will be passed to the superconstructor.
132 \param **kwargs Additional keyword arguments, will be passed to the superconstructor.
133 """
134 super().__init__(*args, **kwargs)
135 assert delta_max > 0, "Acceptable maximum strain increment (delta_max) must be greater than zero!"
136 assert forward_comparison_range >= 0, "Number of neighbor to compare (forward_comparison_range) must not be negative!"
137 assert tolerance >= 0, "Acceptable tolerance ratio of neighbors exceeding delta_max must be greater or equal to zero!"
138
140 self.delta_max = delta_max
141
151 self.forward_comparison_range = forward_comparison_range
152
165 self.tolerance = tolerance
166
168 self.to_left = to_left
169
173 self.activate_reverse_sweep = activate_reverse_sweep
175 z: np.array,
176 index: int,
177 to_left: bool,
178 ) -> bool:
179 r"""
180 Evaluate, if the candidate keeps its SRA flag by comparing it to
181 its succeeding neighbors. The candidate keeps its flag, if
182 \f[
183 r_{\mathrm{tol}} \cdot n_{\mathrm{tot}} < n_{\mathrm{ex}}
184 \f]
185 with the \ref tolerance ratio \f$r_{\mathrm{tol}}\f$,
186 the number of considered succeeding neighbors \f$n_{\mathrm{tot}}\f$,
187 which is the minimum of \ref forward_comparison_range and the
188 remaining elements to the end of the the `z` array, and
189 the number of considered neighbors \f$n_{\mathrm{ex}}\f$,
190 whose absolute difference to the candidate exceeds \ref delta_max.
191 \param z Array of strain data.
192 \param to_left \copybrief to_left For more, see \ref to_left.
193 \param index Current index of the flagged entry.
194 \return Returns, whether the candidate keeps ist SRA flag.
195 """
196 # Counter for the comparison with a percentual acceptance
197 exceeding_amount = 0
198 # Number of finite neighbors actually considered
199 actual_neighbor_number = 0
200 for nth_neighbor in range(self.forward_comparison_range):
201 n_i, neighbor = misc.next_finite_neighbor(array=z,
202 index=index,
203 to_left=to_left,
204 recurse=nth_neighbor)
205 if neighbor is None:
206 # no neighbor in this direction found
207 break
208 actual_neighbor_number = actual_neighbor_number + 1
209 if abs(z[index] - neighbor) > self.delta_max:
210 exceeding_amount = exceeding_amount + 1
211 if actual_neighbor_number == 0:
212 return True
213 return (actual_neighbor_number * self.tolerance < exceeding_amount)
214 def _run_1d(self,
215 x: np.array,
216 z: np.array,
217 SRA_array: np.array,
218 start_index: int = None,
219 end_index: int = None,
220 to_left: bool = None,
221 reverse_state: bool = False,
222 *args, **kwargs) -> np.array:
223 r"""
224 Flag SRAs between the start and the end index (inclusive).
225 \param x Array of measuring point positions in accordance to `z`.
226 \param z Array of strain data in accordance to `x`.
227 \param SRA_array Array of boolean values indicating SRAs by `True` and a valid entries by `False`.
228 \param start_index Starting index of the method to filter the array.
229 The starting index is assumed not to be an anomalous reading.
230 \param end_index Last index to check in this sweep.
231 \param to_left \copybrief to_left For more, see \ref to_left.
232 \param reverse_state Switch indicating if the sweep method is currently reverse sweeping to set direction.
233 Default value is `False`.
234 Setting this switch to `True` supresses recursion.
235 \param *args Additional positional arguments, ignored.
236 \param **kwargs Additional keyword arguments, ignored.
237 """
238 to_left = to_left if to_left is not None else self.to_left
239 # decide the direction of operation
240 if to_left:
241 step = -1
242 start_index = start_index if start_index is not None else len(z)-1
243 end_index = end_index-1 if end_index is not None else -1
244 else:
245 step = 1
246 start_index = start_index if start_index is not None else 0
247 end_index = end_index+1 if end_index is not None else len(z)
248 index_last_trusted = start_index
249 # operation start
250 for index in range(start_index, end_index, step):
251 candidate = z[index]
252 if not np.isnan(candidate):
253 # check, if the candidate needs to be flagged (marked) as SRA
254 flag = (abs(candidate - z[index_last_trusted]) > self.delta_max
255 and
256 self._compare_forward(z=z, index=index, to_left=to_left)
257 )
258 SRA_array[index]= flag
259 if not flag:
260 index_last_trusted = index
261 # commence a reverse sweep if:
262 # if the most recent entry was accepted,
263 # and the entries aren't neighbors,
264 # reverse is activated the most activated
265 # and not already in reverse mode (no double direction change)
266 if (not reverse_state
268 and abs(index - index_last_trusted) > 1):
269 self._run_1d(x=x,
270 SRA_array=SRA_array,
271 z=z,
272 start_index=index,
273 end_index=index_last_trusted,
274 to_left=not to_left,
275 reverse_state=True)
276 return x, SRA_array
277 def _run_2d(self,
278 x: np.array,
279 y: np.array,
280 z: np.array,
281 SRA_array: np.array,
282 *args, **kwargs)->tuple:
283 r"""
284 GTM has no true 2D operation mode.
285 Set \ref timespace to `"1d_space"`!
286 """
287 raise NotImplementedError("GTM does not support true 2D operation. Please use `timepace='1d_space'` instead.")
288
290 r"""
291 Class for outlier detection an cancellation based on the outlier
292 specific correction procedure (OSCP) as originally presented in
293 \cite Ismail_2010_Anoutliercorrection and \cite Ismail_2014_EvaluationOutlierSpecific.
294 The outlier detection is a two stage algorithm.
295 The first stage, the detection of outlier candidates is based on the
296 height difference of a pixel to the median height of its surrounding.
297 If this height difference of a pixel exceeds a \ref threshold it is
298 marked as an outlier candidate.
299 The \ref threshold can be estimated from the data, based on the change
300 rate of the cumulated density function of all differences in the data.
301 In the second stage, groups are formed, limited by large differences
302 in-between two pixels, (like a simple edge detection).
303 The threshold for the difference is estimated like in the first stage.
304 The members of the groups are then assigned outlier or normal status.
305 Groups consisting of outlier candidates only are considered outlier.
306 All other groups are considered normal data.
307 Finally, all outliers are converted to `NaN`.
308 """
309 def __init__(self,
310 max_radius: int,
311 threshold: float = None,
312 delta_s: float = None,
313 n_quantile: int = 50,
314 min_quantile: float = 0.5,
315 timespace: str = "1d_space",
316 *args, **kwargs):
317 r"""
318 Construct an instance of the class.
319 \param max_radius \copybrief max_radius \copydetails max_radius
320 \param delta_s \copybrief delta_s \copydetails delta_s
321 \param n_quantile \copybrief n_quantile \copydetails n_quantile
322 \param threshold \copybrief threshold \copydetails threshold
323 \param min_quantile \copybrief min_quantile \copydetails min_quantile
324 \param timespace \copybrief timespace \copydetails timespace
325 \param *args Additional positional arguments, will be passed to the superconstructor.
326 \param **kwargs Additional keyword arguments, will be passed to the superconstructor.
327 """
328 super().__init__(timespace=timespace, *args, **kwargs)
329 assert delta_s is not None or threshold is not None, "Either delta_s or threshold must be set!"
330
334 self.max_radius = max_radius
335
343 self.delta_s = delta_s
344
351 self.n_quantile = n_quantile
352
354 self.threshold = threshold
355
358 self.min_quantile = min_quantile
359 def _run_1d(self,
360 x: np.array,
361 z: np.array,
362 SRA_array: np.array,
363 *args, **kwargs) -> tuple:
364 r"""
365 Estimate which entries are strain reading anomalies in 1D.
366 \copydetails AnomalyMasker._run_1d()
367 """
368 SRA_array = self._outlier_candidates(z, SRA_array)
369 SRA_array = self._verify_candidates_1d(z, SRA_array)
370 return x, SRA_array
371 def _run_2d(self,
372 x: np.array,
373 y: np.array,
374 z: np.array,
375 SRA_array: np.array,
376 *args, **kwargs) -> tuple:
377 r"""
378 Estimate which entries are strain reading anomalies in 2D.
379 \copydetails AnomalyMasker._run_2d()
380 """
381 SRA_array = self._outlier_candidates(z, SRA_array)
382 SRA_array = self._verify_candidates_2d(z, SRA_array)
383 return x, y, SRA_array
384 def _outlier_candidates(self, z, SRA_array) -> np.array:
385 r"""
386 Detect outlier candidates in the given strain data.
387 This is the first phase according to \cite Ismail_2010_Anoutliercorrection.
388 For each radius \f$r \in [1, r_{\mathrm{max}}]\f$, the relative
389 height of all pixels is compared to the \ref threshold.
390 \param z Array containing strain data.
391 \param SRA_array Array indicating, outlier condidates.
392 \return Returns an updated `SRA_array`, with outlier candidates.
393 """
394 for radius in range(1, self.max_radius + 1):
395 height_array = self._get_median_heights(z, radius)
396 threshold = self._get_threshold(height_array)
397 candidate_array = height_array > threshold
398 SRA_array = np.logical_or(SRA_array, candidate_array)
399 return SRA_array
400 def _verify_candidates_1d(self, z, SRA_array) -> np.array:
401 r"""
402 This is the second phase of the algorithm according to
403 \cite Ismail_2010_Anoutliercorrection, adapted for 1D operation.
404 Outlier candidates are verified as SRAs, by building groups, which
405 are bordered by large enough increments between neighboring entries.
406 The increment threshold is estimated by \ref _get_threshold().
407
408 Three different types of groups are possible:
409 1. normal pixels only,
410 2. mixed normal pixels and outlier candidates,
411 3. outlier candidates only.
412
413 Groups of the third type are considered outliers.
414 Outlier candidates in mixed groups are reaccepted as normal data.
415
416 \param z Array containing strain data.
417 \param SRA_array Array indicating, outlier condidates.
418 \return Returns an updated `SRA_array` with the identified SRAs.
419 """
420 height_array = np.abs(misc.nan_diff(z, axis=0))
421 threshold = self._get_threshold(height_array)
422 group_boundaries = np.argwhere(np.greater(height_array, threshold))
423 i_prev = 0
424 for boundary in group_boundaries:
425 i = boundary[0] + 1
426 group = SRA_array[i_prev:i]
427 group[:] = np.all(group)
428 i_prev = i
429 group = SRA_array[i_prev:None]
430 group[:] = np.all(group)
431 return SRA_array
432 def _verify_candidates_2d(self, z, SRA_array) -> np.array:
433 r"""
434 This is the second phase of the algorithm according to
435 \cite Ismail_2010_Anoutliercorrection, adapted for 2D operation.
436 \copydetails _verify_candidates_1d()
437
438 Adaptation to a 2D takes some more steps, because the building of
439 the groups is not as straight-forward as in 1D.
440 This is not described in \cite Ismail_2010_Anoutliercorrection
441 and \cite Ismail_2014_EvaluationOutlierSpecific, so a detailed
442 description of the taken approach is provided here.
443 The detection of group boundaries is separated for each direction.
444 Once along the space axis and once along the time axis separately,
445 increments are calculated and the increment threshold is estimated
446 by \ref _get_threshold().
447 The next step (still separated for each direction) is generating
448 groups of indices by iterating over the arrays indices.
449 A new group is started if
450 - the current index is contained in the set of group boundaries
451 (indices of the group's start) or
452 - a new row (or column) is started (that is the end of the array
453 in this direction is reached and the iteration resumes with
454 the first entry of the next line.
455
456 After all such groups are stored in a single list, the groups of
457 indices are merged using \ref _merge_groups(), until only pairwise
458 distinct groups are left.
459 If a pixel is contained in two groups, those groups are connected
460 and merged into one.
461 This results in non-rectangular shaped groups being built.
462
463 Finally, only groups containing candidates only are verified as SRA.
464 """
465 group_list = []
466 for fast_axis, length in enumerate(z.shape):
467 slow_axis = fast_axis -1
468 # Get the value increments along the axes
469 height_array = np.abs(misc.nan_diff(z, axis=fast_axis))
470 threshold = self._get_threshold(height_array)
471 # Generate the boundaries
472 group_boundaries = np.argwhere(np.greater(height_array, threshold))
473 # Preparation of groups row/column wise
474 offset = np.zeros(len(z.shape), dtype=int)
475 offset[fast_axis] += 1
476 group_boundaries = group_boundaries + offset
477 group_boundaries = {tuple(boundary) for boundary in group_boundaries}
478 index = [0]*z.ndim
479 for slow in range(z.shape[slow_axis]):
480 group = set()
481 index[slow_axis] = slow
482 for fast in range(z.shape[fast_axis]):
483 index[fast_axis] = fast
484 if tuple(index) in group_boundaries:
485 group_list.append(group)
486 group = set()
487 group.add(tuple(index))
488 group_list.append(group)
489 # merge groups together
490 final_groups = self._merge_groups(group_list)
491 for group in final_groups:
492 # np expects a list for every axis, so transposing
493 group_indices = tuple(np.array(list(group)).T)
494 # last step: only set SRA to candidate only groups
495 SRA_array[group_indices] = np.all(SRA_array[group_indices])
496 return SRA_array
497 def _get_median_heights(self, z, radius) -> np.array:
498 r"""
499 Get the height difference to the local vicinity of all the pixels.
500 The median height is retrieved by \ref filtering.SlidingFilter.
501 The local vicinity is determined by the inradius \f$r\f$ or the
502 quadratic sliding window (see \ref filtering.SlidingFilter.radius).
503 Then, the absolute difference between the array of the median and
504 and the pixels's values is returned.
505 \param z Array containing strain data.
506 \param radius Inradius of the sliding window.
507 """
508 local_height_calc = filtering.SlidingFilter(
509 radius=radius,
510 method="nanmedian",
511 timespace=self.timespace)
512 x_tmp, y_tmp, median_array = local_height_calc.run(
513 x=None,
514 y=None,
515 z=z,
516 )
517 return np.abs(z - median_array)
518 def _get_quantiles(self, values: np.array) -> tuple:
519 r"""
520 Get quantiles of the the given data (including finite values only).
521 Only quantiles above \ref min_quantile are returned.
522 If \ref n_quantile is `None`, the upper part (> \ref min_quantile)
523 of the sorted values is returned.
524 Else, the upper part is resampled into \ref n_quantile + 1 points.
525 \param values Array, for which to calculate the quantiles.
526 """
527 if self.n_quantile is not None:
528 cdf = np.linspace(self.min_quantile, 1.0, self.n_quantile + 1)
529 return cdf, np.nanquantile(values, cdf)
530 else:
531 clean = values[np.isfinite(values)]
532 sorted_heights = np.sort(clean, axis=None)
533 length = sorted_heights.shape[0]
534 first_index = np.floor(length*self.min_quantile).astype(int)
535 cdf = np.linspace(1/length, 1, length)
536 return cdf[first_index:None], sorted_heights[first_index:None]
537 def _get_threshold(self, values: np.array) -> float:
538 r"""
539 Estimate the anomaly threshold from the data.
540 The threshold \f$t\f$ is set to the point, where the cumulated
541 density function is leveled out. That is, whre the required
542 increase in value per increase in quantile exceeds \ref delta_s.
543 If \ref threshold is set to `None` it is determined from the
544 data and \ref delta_s, else it is simply returned.
545 \param values Array, from which to estimate the threshold.
546 """
547 if self.threshold is not None:
548 return self.threshold
549 cdf, quantiles = self._get_quantiles(values)
550 change_rate = np.diff(quantiles)/np.diff(cdf)
551 is_over_threshold = np.greater(change_rate, self.delta_s)
552 threshold_index = np.argmax(is_over_threshold)
553 if not np.any(is_over_threshold):
554 threshold_index = -1
555 threshold = quantiles[threshold_index]
556 return threshold
557 def _merge_groups(self, initial_groups) -> list:
558 r"""
559 Merge all groups in the input that have at least one pairwise common entry.
560 Each group is a `set` of `tuple` standing for the strain array indices.
561 The result is a list of pairwise distinct groups, equivalent to the input.
562 \param initial_groups List of input groups (`set`s of `tuples`s).
563 """
564 result = []
565 initial_groups = copy.deepcopy(initial_groups)
566 while(initial_groups):
567 group_1 = initial_groups.pop()
568 merge_required = len(initial_groups) > 0
569 while merge_required:
570 merge_required = False
571 for group_2 in copy.deepcopy(initial_groups):
572 if group_1.intersection(group_2):
573 merge_required = True
574 initial_groups.remove(group_2)
575 group_1.update(group_2)
576 result.append(group_1)
577 return result
578
580 r"""
581 Abstract base class for outlier detection by different kinds of z-scores.
582 After calculating the z-score by the given \ref _get_z_score(), SRAs
583 are identified by comparing the strain increments to a \ref threshold.
584 The algorithms only make sense for sufficient measuring values (not NaN).
585 An overview of all z-score calculations can be found at
586 https://towardsdatascience.com/removing-spikes-from-raman-spectra-8a9fdda0ac22
587 """
588 def __init__(self,
589 threshold: float = 3.5,
590 timespace: str = "1d_space",
591 *args, **kwargs):
592 r"""
593 Construct an instance of the class.
594 \param threshold \copydoc threshold
595 \param timespace \copybrief timespace \copydetails timespace
596 \param *args Additional positional arguments, will be passed to the superconstructor.
597 \param **kwargs Additional keyword arguments, will be passed to the superconstructor.
598 """
599 super().__init__(timespace=timespace, *args, **kwargs)
600
602 self.threshold = threshold
603 def _run_1d(self,
604 x: np.array,
605 z: np.array,
606 SRA_array: np.array,
607 *args, **kwargs) -> tuple:
608 r"""
609 Estimate which entries are strain reading anomalies in 1D.
610 \copydetails AnomalyMasker._run_1d()
611 """
612 z_score = self._get_z_score(z)
613 SRA_array = self._get_outlier_mask(z_score)
614 return x, SRA_array
615 def _run_2d(self,
616 x: np.array,
617 y: np.array,
618 z: np.array,
619 SRA_array: np.array,
620 *args, **kwargs) -> tuple:
621 r"""
622 Estimate which entries are strain reading anomalies in 2D.
623 \copydetails AnomalyMasker._run_2d()
624 """
625 z_score = self._get_z_score(z)
626 SRA_array = self._get_outlier_mask(z_score)
627 return x, y, SRA_array
628 def _get_outlier_mask(self, z_score: np.array) -> np.array:
629 r"""
630 Mask entries as SRA, whose z-scores exceed \ref threshold.
631 \param z_score Array containing the z-score values.
632 \return Boolean array with values as outlier mask.
633 """
634 mask = np.array(np.abs(z_score) > self.threshold)
635 return mask
636 @abstractmethod
637 def _get_z_score(self, z: np.array) -> np.array:
638 """
639 Calculate the z-score for the given array.
640 Sub-classes need to provide a meaningful implementation.
641 """
642 raise NotImplementedError()
643
645 r"""
646 Class for the standard z-score approach for spike detection.
647 Describing a data point in terms of its relationship to the mean and
648 standard deviation of strain values.
649 The method can be mainly used for constant (noise) signals.
650 See \cite Iglewicz_Hoaglin_1993_How_to_detect_and_handle_outliers.
651 """
652 def _get_z_score(self, z: np.array) -> np.array:
653 r"""
654 Calculates the z-score of the given strain array with mean and standard deviation.
655 \param z Array containing strain data.
656 \return Returns a z-score array.
657 """
658 mean = np.nanmean(z)
659 stdev = np.nanstd(z)
660 z_score = (z - mean) / stdev
661 return z_score
662
664 r"""
665 Class for the modified z-score approach for spike detection.
666 This method uses the median and the median absolute deviation
667 rather than the mean and standard deviation.
668 The multiplier 0.6745 is the 0.75th quantile of the standard normal distribution.
669 Disadvantage: Peaks can also detect as strain reading anomaly.
670 See \cite Iglewicz_Hoaglin_1993_How_to_detect_and_handle_outliers.
671 """
672 def _get_z_score(self, z: np.array) -> np.array:
673 r"""
674 Calculates the modified z-score of the given strain array.
675 \param z Array containing strain data.
676 \return Returns an array modified z-score.
677 """
678 median_value = np.nanmedian(z)
679 mad_array = np.nanmedian(np.abs(z - median_value))
680 z_score = 0.6745 * ((z - median_value) / mad_array)
681 return z_score
682
684 r"""
685 Class that calculates the modified zscore over a moving window.
686 The window is defined by the given radius and has a width of \f$2r+1\f$.
687 The median will be calculated only for the current vicinity.
688 """
689 def __init__(self,
690 radius: int = 0,
691 threshold: float = 3.5,
692 timespace: str = "1d_space",
693 *args, **kwargs):
694 r"""
695 Construct an instance of the class.
696 \param radius \copybrief radius \copydetails radius
697 \param threshold \copydoc threshold
698 \param timespace \copybrief timespace \copydetails timespace
699 \param *args Additional positional arguments, will be passed to the superconstructor.
700 \param **kwargs Additional keyword arguments, will be passed to the superconstructor.
701 """
702 super().__init__(timespace=timespace, threshold=threshold, *args, **kwargs)
703
706 self.radius = radius
707 def _get_z_score(self, z: np.array) -> np.array:
708 r"""
709 Calculates the modified z-score with the absolute deviation of current vicinity.
710 If the MAD is zero, the mean of the absolute deviation will be used.
711 \param z Array containing strain data.
712 \return Returns an array modified z-score.
713 """
714 if self.radius == 0:
715 median_array = np.nanmedian(z)
716 else:
717 median_array = windows.sliding_window_function(z, self.radius, np.nanmedian)
718 values = z - median_array
719 ad_values = np.abs(values)
720 mad = np.nanmedian(ad_values)
721 if mad != 0:
722 factor = mad / 0.6745
723 else:
724 factor = np.nanmean(ad_values) / 0.7979
725 z_score = values / factor
726 return z_score
727
729 r"""
730 The Whitaker & Hayes algorithm uses the high intensity and small width of spikes.
731 Therefore it uses the difference between a strain value and the next value.
732 The algorithm presented in \cite Whitaker_2018_ASimpleAlgorithmDespiking.
733 """
734 def _get_delta_strain(self, z: np.array) -> np.array:
735 r"""
736 Calculates the difference between the current strain
737 and the following strain of the given strain array.
738 \param z Array containing strain data.
739 \return Returns an array delta strain.
740 """
741 delta_s = misc.nan_diff_1d(z)
742 delta_s = np.insert(delta_s, 0, np.nan)
743 return delta_s
744 def _get_z_score(self, z: np.array) -> np.array:
745 r"""
746 Calculates the modified z-score of the given strain array.
747 It uses the median and median absolute deviation.
748 The multiplier 0.6745 is the 0.75th quartile of the standard normal distribution.
749 \param z Array containing strain data.
750 \return Returns an array modified z-score.
751 """
752 delta_s = self._get_delta_strain(z)
753 median_value = np.nanmedian(delta_s)
754 mad_array = np.nanmedian(np.abs(delta_s - median_value))
755 z_score = 0.6745 * ((delta_s - median_value) / mad_array)
756 return z_score
Abstract base class for filter classes, based on sliding windows.
Definition filtering.py:108
Abstract class for anomaly identification.
Definition masking.py:19
np.array run(self, np.array x, np.array y, np.array z, bool make_copy=True, str timespace=None, bool identify_only=False, *args, **kwargs)
Mask strain reading anomalies with NaNs.
Definition masking.py:32
tuple _run_2d(self, np.array x, np.array y, np.array z, np.array SRA_array, *args, **kwargs)
Native two-dimensional operation implementation.
Definition masking.py:73
tuple _run_1d(self, np.array x, np.array z, np.array SRA_array, *args, **kwargs)
Estimate, which entries are strain reading anomalies, in 1D.
Definition masking.py:59
tuple _map_2d(self, np.array x, np.array y, np.array z, np.array SRA_array, str timespace=None, *args, **kwargs)
Estimate, which entries are strain reading anomalies, in 2D.
Definition masking.py:86
The geometric threshold method (GTM) identifies SRAs by comparing the strain increments to a threshol...
Definition masking.py:102
tuple _run_2d(self, np.array x, np.array y, np.array z, np.array SRA_array, *args, **kwargs)
GTM has no true 2D operation mode.
Definition masking.py:282
to_left
Switch for the direction of operation.
Definition masking.py:168
bool _compare_forward(self, np.array z, int index, bool to_left)
Evaluate, if the candidate keeps its SRA flag by comparing it to its succeeding neighbors.
Definition masking.py:178
np.array _run_1d(self, np.array x, np.array z, np.array SRA_array, int start_index=None, int end_index=None, bool to_left=None, bool reverse_state=False, *args, **kwargs)
Flag SRAs between the start and the end index (inclusive).
Definition masking.py:222
delta_max
Maximum plausible absolute strain increment in [µm/m].
Definition masking.py:140
tolerance
Tolerance ratio for the forward comparison to reaccept a candidate.
Definition masking.py:165
activate_reverse_sweep
Switch to activate the reverse sweep.
Definition masking.py:173
forward_comparison_range
Number of neighbors to consider in the forward neighbor comparison.
Definition masking.py:151
__init__(self, float delta_max=400.0, int forward_comparison_range=1, float tolerance=0.0, bool to_left=False, bool activate_reverse_sweep=True, *args, **kwargs)
Construct a GTM object.
Definition masking.py:123
Class for the modified z-score approach for spike detection.
Definition masking.py:663
np.array _get_z_score(self, np.array z)
Calculates the modified z-score of the given strain array.
Definition masking.py:672
Class for outlier detection an cancellation based on the outlier specific correction procedure (OSCP)...
Definition masking.py:289
np.array _outlier_candidates(self, z, SRA_array)
Detect outlier candidates in the given strain data.
Definition masking.py:384
delta_s
Setting for the threshold estimation.
Definition masking.py:343
np.array _verify_candidates_2d(self, z, SRA_array)
This is the second phase of the algorithm according to ismail_2010_anoutliercorrection,...
Definition masking.py:432
threshold
Relative height threshold above which a pixel is flagged as SRA.
Definition masking.py:354
tuple _run_2d(self, np.array x, np.array y, np.array z, np.array SRA_array, *args, **kwargs)
Estimate which entries are strain reading anomalies in 2D.
Definition masking.py:376
n_quantile
Granularity for the threshold estimation resampling.
Definition masking.py:351
float _get_threshold(self, np.array values)
Estimate the anomaly threshold from the data.
Definition masking.py:537
list _merge_groups(self, initial_groups)
Merge all groups in the input that have at least one pairwise common entry.
Definition masking.py:557
max_radius
The radius of the largest sliding window used in the outlier candidate detection stage determines th...
Definition masking.py:334
tuple _run_1d(self, np.array x, np.array z, np.array SRA_array, *args, **kwargs)
Estimate which entries are strain reading anomalies in 1D.
Definition masking.py:363
min_quantile
The quantile, from which the cumulated density function of relative heights is kept for threshold est...
Definition masking.py:358
__init__(self, int max_radius, float threshold=None, float delta_s=None, int n_quantile=50, float min_quantile=0.5, str timespace="1d_space", *args, **kwargs)
Construct an instance of the class.
Definition masking.py:316
np.array _verify_candidates_1d(self, z, SRA_array)
This is the second phase of the algorithm according to ismail_2010_anoutliercorrection,...
Definition masking.py:400
tuple _get_quantiles(self, np.array values)
Get quantiles of the the given data (including finite values only).
Definition masking.py:518
np.array _get_median_heights(self, z, radius)
Get the height difference to the local vicinity of all the pixels.
Definition masking.py:497
Class that calculates the modified zscore over a moving window.
Definition masking.py:683
int radius
Inradius of the sliding window.
Definition masking.py:706
np.array _get_z_score(self, np.array z)
Calculates the modified z-score with the absolute deviation of current vicinity.
Definition masking.py:707
__init__(self, int radius=0, float threshold=3.5, str timespace="1d_space", *args, **kwargs)
Construct an instance of the class.
Definition masking.py:693
The Whitaker & Hayes algorithm uses the high intensity and small width of spikes.
Definition masking.py:728
np.array _get_z_score(self, np.array z)
Calculates the modified z-score of the given strain array.
Definition masking.py:744
np.array _get_delta_strain(self, np.array z)
Calculates the difference between the current strain and the following strain of the given strain arr...
Definition masking.py:734
Abstract base class for outlier detection by different kinds of z-scores.
Definition masking.py:579
np.array _get_z_score(self, np.array z)
Calculate the z-score for the given array.
Definition masking.py:637
tuple _run_2d(self, np.array x, np.array y, np.array z, np.array SRA_array, *args, **kwargs)
Estimate which entries are strain reading anomalies in 2D.
Definition masking.py:620
threshold
Relative height threshold above which a pixel is flagged as SRA.
Definition masking.py:602
tuple _run_1d(self, np.array x, np.array z, np.array SRA_array, *args, **kwargs)
Estimate which entries are strain reading anomalies in 1D.
Definition masking.py:607
np.array _get_outlier_mask(self, np.array z_score)
Mask entries as SRA, whose z-scores exceed threshold.
Definition masking.py:628
__init__(self, float threshold=3.5, str timespace="1d_space", *args, **kwargs)
Construct an instance of the class.
Definition masking.py:591
Class for the standard z-score approach for spike detection.
Definition masking.py:644
np.array _get_z_score(self, np.array z)
Calculates the z-score of the given strain array with mean and standard deviation.
Definition masking.py:652
This intermediate class indicates, that a sub-class is implementing a task.
Definition base.py:26
Contains utility modules with general pupose.
Definition __init__.py:1