Coverage for lasso/dimred/svd/clustering_betas.py: 48%
223 statements
« prev ^ index » next coverage.py v7.2.4, created at 2023-04-28 18:42 +0100
1from typing import Sequence, Tuple, Union
3import numpy as np
4from sklearn.cluster import DBSCAN, OPTICS, KMeans, SpectralClustering
5from sklearn.ensemble import IsolationForest
6from sklearn.neighbors import LocalOutlierFactor
7from sklearn.svm import OneClassSVM
9from .keyword_types import ClusterType, DetectorType
def __apply_spectral_clustering(betas, runids, datasets, idsets, random_state=11, **kwargs):
    """
    Group the input betas into clusters with sklearn's SpectralClustering.
    Default keyword arguments: affinity='nearest_neighbors', random_state=11

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    random_state: int
        Seed forwarded to SpectralClustering for reproducible results
    **kwargs: keyword arguments
        Keyword arguments specific to the SpectralClustering algorithm

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering
    """  # noqa pylint: disable = line-too-long
    labels = SpectralClustering(random_state=random_state, **kwargs).fit(betas).labels_

    # one boolean mask per distinct label; append betas and ids cluster by cluster
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __apply_k_means(betas, runids, datasets, idsets, random_state=11, **kwargs):
    """
    Group the input betas into clusters with sklearn's KMeans.
    Recommended keyword arguments: n_clusters=3, random_state=11

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    random_state: int
        Seed forwarded to KMeans for reproducible results
    **kwargs: keyword arguments
        Keyword arguments specific to the KMeans algorithm

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
    """  # noqa: E501 pylint: disable = line-too-long
    labels = KMeans(random_state=random_state, **kwargs).fit(betas).labels_

    # one boolean mask per distinct label; append betas and ids cluster by cluster
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __apply_dbscan(betas, runids, datasets, idsets, **kwargs):
    """
    Group the input betas into clusters with sklearn's DBSCAN.
    Default keyword arguments: eps=0.08

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    **kwargs: keyword arguments
        Keyword arguments for the DBSCAN algorithm

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN
    """  # noqa: E501 pylint: disable = line-too-long
    scan = DBSCAN(**kwargs).fit(betas)
    labels = scan.labels_

    # one boolean mask per distinct label (label -1 = DBSCAN noise, kept as its own group)
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __apply_optics(betas, runids, datasets, idsets, **kwargs):
    """
    Group the input betas into clusters with sklearn's OPTICS.
    Default keyword parameters: eps=0.05, min_cluster_size=10

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    **kwargs: keyword arguments
        Keyword arguments specific to the OPTICS function.

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS
    """  # noqa: E501 pylint: disable = line-too-long
    fitted = OPTICS(**kwargs).fit(betas)
    labels = fitted.labels_

    # one boolean mask per distinct label (label -1 = OPTICS noise, kept as its own group)
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __detect_outliers_isolation_forest(
    betas, ids, beta_clusters, id_clusters, random_state=11, **kwargs
):
    """
    Detect outliers with sklearn's IsolationForest algorithm.
    Detected outliers are appended into the provided lists.
    Default keyword parameters: random_state=12, behaviour="new", contamination=0.005

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    beta_clusters: list
        List where each cluster of betas will be appended
    id_clusters: list
        List where each cluster of ids will be appended
    random_state: int
        Seed forwarded to IsolationForest for reproducible results
    **kwargs: keyword argument
        Keywords specific to the IsolationForest algorithm

    Returns
    -------
    inlier_betas: np.array
        Numpy array containing the betas that are not outliers
    inlier_ids: np.array
        Numpy array containing the ids of betas that are not outliers

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
    """  # noqa: E501 pylint: disable = line-too-long
    forest = IsolationForest(random_state=random_state, **kwargs)
    verdict = forest.fit(betas).predict(betas)

    # predict() marks outliers with -1 and inliers with +1
    is_outlier = verdict == -1
    is_inlier = verdict == 1
    beta_clusters.append(betas[is_outlier])
    id_clusters.append(ids[is_outlier].tolist())

    return betas[is_inlier], ids[is_inlier]
def __detect_outliers_local_outlier_factor(betas, ids, beta_clusters, id_clusters, **kwargs):
    """
    Detect outliers with sklearn's LocalOutlierFactor algorithm.
    Detected outliers are appended into the provided lists.
    Default keyword parameters: contamination=0.01

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    beta_clusters: list
        List where each cluster of betas will be appended
    id_clusters: list
        List where each cluster of ids will be appended
    **kwargs: keyword argument
        Keywords specific to the LocalOutlierFactor algorithm.

    Returns
    -------
    inlier_betas: np.ndarray
        Numpy array containing the betas that are not outliers
    inlier_ids: np.ndarray
        Numpy array containing the ids of betas that are not outliers

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link:https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
    """  # noqa pylint: disable = line-too-long
    verdict = LocalOutlierFactor(**kwargs).fit_predict(betas)

    # fit_predict() marks outliers with -1 and inliers with +1
    is_outlier = verdict == -1
    is_inlier = verdict == 1
    beta_clusters.append(betas[is_outlier])
    id_clusters.append(ids[is_outlier].tolist())

    return betas[is_inlier], ids[is_inlier]
def __detect_outliers_one_class_svm(betas, ids, beta_clusters, id_clusters, **kwargs):
    """
    Detect outliers with sklearn's OneClassSVM algorithm.
    Detected outliers are appended into the provided lists.
    Default keyword arguments: gamma=0.1, nu=0.01

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    beta_clusters: list
        List where each cluster of betas will be appended
    id_clusters: list
        List where each cluster of ids will be appended
    **kwargs: keyword argument
        Keywords specific to the OneClassSVM algorithm.

    Returns
    -------
    inlier_betas: np.ndarray
        Numpy array containing the betas that are not outliers
    inlier_ids: np.ndarray
        Numpy array containing the ids of betas that are not outliers

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
    """  # noqa: E501 pylint: disable = line-too-long
    verdict = OneClassSVM(**kwargs).fit_predict(betas)

    # fit_predict() marks outliers with -1 and inliers with +1
    is_outlier = verdict == -1
    is_inlier = verdict == 1
    beta_clusters.append(betas[is_outlier])
    id_clusters.append(ids[is_outlier].tolist())

    return betas[is_inlier], ids[is_inlier]
def __experimental_outlier_detector(betas, ids, **kwargs):
    """
    Detects outliers by applying the LocalOutlierFactor algorithm from sklearn over
    multiple overlapping 3-column slices of the betas.
    A sample is reported as an outlier only if it is flagged in more than two slices.
    Default keyword arguments: contamination=0.01

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    **kwargs: keyword argument
        Keywords specific to the LocalOutlierFactor algorithm

    Returns
    -------
    outliers: np.array
        Numpy array containing the sample names identified as outliers
    outlier_index: np.array
        Array containing the indexes of outliers
    inlier_index: np.array
        Array of booleans to get inlier (not outlier) betas and IDs

    NOTE: returns the single value ``False`` (not a tuple) when no outlier is found;
    callers must check for a bool result before unpacking (see group_betas).

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link:https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
    """  # noqa pylint: disable = line-too-long
    # pylint: disable = too-many-locals

    # Slide a window of width 3 over the beta columns; each window position
    # gets its own LocalOutlierFactor pass.
    loops = betas.shape[1] - 2
    alertlist = []
    for dadoop in range(loops):
        slicer = dadoop + 3
        beta_slice = betas[:, dadoop:slicer]

        # fit_predict() marks outliers with -1; collect the row indices flagged
        # in this window
        sanitizer = LocalOutlierFactor(**kwargs).fit_predict(beta_slice)
        outlier_key = np.where(sanitizer == -1)[0]
        alertlist.append(outlier_key)

    # Count how many windows flagged each row index.
    suspects = np.concatenate(alertlist)
    individuals = np.unique(suspects)
    crimecounter = np.array([np.where(suspects == tracked)[0].shape[0] for tracked in individuals])

    # Only rows flagged in more than 2 windows count as outliers.
    the_cases = np.where(crimecounter > 2)[0]
    the_judged = ids[individuals[the_cases]]

    # Boolean inlier mask over all samples; outlier positions are set False below.
    innocents = np.full(ids.shape, True)

    if the_judged.shape != (0,):
        # Map each outlier id back to its first position in ids
        # (assumes ids are unique — TODO confirm; duplicates would use the first match only).
        judged_index = np.array([np.where(ids == convict)[0] for convict in the_judged])[:, 0]
        innocents[judged_index] = False
    else:
        # No outliers detected: signal with a bare False instead of a tuple.
        return False

    return the_judged, judged_index, innocents
def __rescale_betas(betas):
    """
    Utility function to rescale betas into the range of [0, 1].
    Expects only positive betas

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas to be scaled. Expects betas of shape (samples, nr_betas)

    Returns
    -------
    betas_scaled: np.ndarray
        Betas scaled to range [0, 1]
    maxb: np.ndarray
        Array to rescale betas back to original values
    """
    assert len(betas.shape) == 2
    # Per-column maximum of |betas|, vectorized (replaces a per-column Python loop).
    maxb = np.abs(betas).max(axis=0)
    # NOTE(review): an all-zero column triggers a division by zero here — behavior
    # unchanged from the original; callers are expected to pass non-degenerate betas.
    return betas / maxb, maxb
def list_detectors_and_cluster():
    """
    Prints out all keywords for outlier detection and clustering functions

    See Also
    --------
    document_algorithm(keyword)"""
    # Fixed cross-reference: the See Also previously pointed at this function itself
    # instead of document_algorithm(keyword).
    print("Implemented Detectors:")
    for entry in __detector_dict:
        print(" " + entry)
    print("Implemented Clustering Functions")
    for entry in __cluster_dict:
        print(" " + entry)
def document_algorithm(keyword):
    """
    Print the docstring of the detection or clustering function behind a keyword.

    Parameters
    ----------
    keyword: str
        String keyword referencing the outlier detection or clustering function

    See Also
    --------
    list_detectors_and_cluster()
    """
    docstring = __doc_dict[keyword]
    print(docstring)
# Maps each detector/cluster keyword to the docstring of its implementation;
# consumed by document_algorithm() to print help text for a keyword.
__doc_dict = {
    DetectorType.IsolationForest: __detect_outliers_isolation_forest.__doc__,
    DetectorType.OneClassSVM: __detect_outliers_one_class_svm.__doc__,
    DetectorType.LocalOutlierFactor: __detect_outliers_local_outlier_factor.__doc__,
    # DetectorType.Experimental: __experimental_outlier_detector.__doc__,
    ClusterType.OPTICS: __apply_optics.__doc__,
    ClusterType.DBSCAN: __apply_dbscan.__doc__,
    ClusterType.KMeans: __apply_k_means.__doc__,
    ClusterType.SpectralClustering: __apply_spectral_clustering.__doc__,
}

# Dispatch table: detector keyword -> outlier detection function (used by group_betas).
# NOTE(review): the Experimental entry is commented out, so looking it up here raises
# KeyError; group_betas handles "Experimental" as a special case.
__detector_dict = {
    DetectorType.IsolationForest: __detect_outliers_isolation_forest,
    DetectorType.OneClassSVM: __detect_outliers_one_class_svm,
    DetectorType.LocalOutlierFactor: __detect_outliers_local_outlier_factor,
    # DetectorType.Experimental: __experimental_outlier_detector
}

# Dispatch table: cluster keyword -> clustering function (used by group_betas).
__cluster_dict = {
    ClusterType.OPTICS: __apply_optics,
    ClusterType.DBSCAN: __apply_dbscan,
    ClusterType.KMeans: __apply_k_means,
    ClusterType.SpectralClustering: __apply_spectral_clustering,
}
def create_cluster_arg_dict(args: Sequence[str]) -> Union[Tuple[str, dict], str]:
    """Determines which cluster to use and creates a python dictionary to use as cluster_params

    Parameters
    ----------
    args: Sequence[str]
        List of strings: the cluster type followed by (parameter, type, value) triplets

    Returns
    -------
    cluster_type: str
        determines which cluster algorithm to use
    cluster_arg_dict: dict
        dictionary containing arguments and values for specific cluster_type
    err_msg: str
        message containing error, mostly unrecognised keywords"""

    # first argument must be cluster type
    cluster_key = args[0].lower()
    cluster_arg_dict = {}
    cluster_type = None
    param_type = []

    # all following arguments are a parameter, followed by its type and value
    parameters = []
    values = []
    # A valid argument list has length 1 (type only) + 3 per parameter, i.e. 1 mod 3.
    # Fixed: the old check (len % 3 == 0) let len % 3 == 2 slip through and silently
    # dropped the trailing arguments.
    if len(args) % 3 != 1:
        err_msg = (
            "Invalid cluster arguments, first argument must be the chosen clustering algorithm,"
            " and each optional subsequent parameter must be followed by its type and value"
        )
        return err_msg
    if len(args) > 1:
        # check if we even have parameters
        parameters = args[1:-2:3]
        param_type = args[2:-1:3]
        values = args[3::3]

    # case-insensitive match of the requested cluster type against the known ones
    for cluster_option in ClusterType.get_cluster_type_name():
        if cluster_key == cluster_option.lower():
            cluster_type = cluster_option
    if not cluster_type:
        # Fixed: added the missing space before "possible" in the error message.
        err_msg = (
            f"No existing clustering method matching {args[0]} "
            f"possible clustering methods are: {str(ClusterType.get_cluster_type_name())[1:-1]}"
        )
        return err_msg

    # convert each value string to its declared type
    type_lookup = {"str": str, "float": float, "int": int}
    for ind, param in enumerate(parameters):
        p_t = param_type[ind]
        v_type = type_lookup.get(p_t)
        if v_type is None:
            err_msg = f"Clustering: Invalid type identifier {p_t}"
            return err_msg

        try:
            val = v_type(values[ind])
        except ValueError:
            err_msg = (
                f"Clustering: Invalid value {values[ind]} "
                f"for parameter {param} of type {v_type}"
            )
            return err_msg
        cluster_arg_dict[param] = val

    return cluster_type, cluster_arg_dict
def create_detector_arg_dict(args: Sequence[str]) -> Union[Tuple[str, dict], str]:
    """Determines which detector to use and creates a python dictionary to use as detector_params

    Parameters
    ----------
    args: Sequence[str]
        List of strings: the detector type followed by (parameter, type, value) triplets

    Returns
    -------
    detector_type: str
        determines which cluster algorithm to use
    detector_arg_dict: dict
        dictionary containing arguments and values for specific cluster_type
    err_mgs: str
        message containing error, mostly unrecognised keywords"""

    # first argument must be detector type:
    detector_key = args[0].lower()
    detector_arg_dict = {}
    detector_type = None
    param_type = []

    # all following arguments are a parameter, followed by its type and value
    parameters = []
    values = []
    # A valid argument list has length 1 (type only) + 3 per parameter, i.e. 1 mod 3.
    # Fixed: the old check (len % 3 == 0) let len % 3 == 2 slip through and silently
    # dropped the trailing arguments.
    if len(args) % 3 != 1:
        err_msg = (
            "Invalid outlier detector arguments, first argument must be "
            "the chosen detector algorithm, and each optional subsequent "
            "parameter must be followed by its type and value"
        )
        return err_msg
    if len(args) > 1:
        # check if we even have parameters
        parameters = args[1:-2:3]
        param_type = args[2:-1:3]
        values = args[3::3]

    # case-insensitive match of the requested detector type against the known ones
    for detector_option in DetectorType.get_detector_type_name():
        if detector_key == detector_option.lower():
            detector_type = detector_option
    if not detector_type:
        err_msg = (
            f"No existing outlier detection method matching {args[0]} "
            f"possible outlier detection methods are: "
            f"{str(DetectorType.get_detector_type_name())[1:-1]}"
        )
        return err_msg

    # convert each value string to its declared type
    type_lookup = {"str": str, "float": float, "int": int}
    for ind, param in enumerate(parameters):
        p_t = param_type[ind]
        v_type = type_lookup.get(p_t)
        if v_type is None:
            err_msg = f"Outlier Detection: Invalid type identifier {p_t}"
            return err_msg

        try:
            val = v_type(values[ind])
        except ValueError:
            # Fixed: the second fragment was missing its f-prefix, so "{param}" and
            # "{v_type}" were emitted literally instead of being interpolated.
            err_msg = (
                f"Outlier Detection: Invalid value {values[ind]} "
                f"for parameter {param} of type {v_type}"
            )
            return err_msg
        detector_arg_dict[param] = val

    return detector_type, detector_arg_dict
def group_betas(
    beta_index,
    betas,
    scale_betas=False,
    cluster=None,
    detector=None,
    cluster_params=None,
    detector_params=None,
) -> Union[Tuple[list, list], str]:
    """
    Base function to group betas into groups and detect outliers. Provides that all different
    clustering and outlier detection algorithms are implemented in an easy to access environment.
    To select different clustering and outlier detection algorithms, please use appropriate
    KeywordTypes. A description of each function can be accessed with document_algorithm(keyword)
    A list of all functions can be accessed with list_detectors_and_cluster()

    Parameters
    ----------
    beta_index: np.ndarray
        Array containing the file names specific to the betas with the same index in the beta array
    betas: np.ndarray
        Numpy array containing the betas.
        Betas are expected to be of shape (samples, timestep, 3)
        The three entries per beta can either be dimensions (x,y,z) or any three betas/eigenvalues
    scale_betas: bool, optional, default: False.
        If True, betas are rescaled into the range [0, 1] before processing
    cluster: str, optional, default: None.
        String specifying which clustering algorithm shall be applied.
        Use ClusterType for easier access. If None, no clustering is performed
    detector: str, optional, default: None.
        String specifying which outlier detection algorithm shall be applied.
        Use DetectorType for easier access
    cluster_params: dict, optional
        Dictionary containing parameters for the clustering algorithms.
        See the sklearn documentation for the function to learn more.
    detector_params: dict, optional
        Dictionary containing parameters for the outlier detection algorithms.
        See the sklearn documentation for the function to learn more

    Returns
    -------
    beta_clusters: list
        List containing Numpy Arrays of betas in one cluster.
        If a detector was selected, or the clustering algorithm has its
        own outlier detection, the first entry in the list will be outlier betas
    id_clusters: list
        List containing lists of beta ids. Each id corresponds to the beta in
        the same place in the beta_clusters list
    err_msg: str
        Error message if wrong keywords for detector or cluster algorithms were used

    Notes
    --------
    document_algorithm:
        Prints docstring of each function into console
    list_detectors_and_cluster:
        Prints out all detection and clustering algorithms into console
    Sklearn Userguide chapter 2.3 Clustering:
        https://scikit-learn.org/stable/modules/clustering.html
        Detailed overview of different clustering algorithms
    Sklearn Examples outlier detection:
        https://scikit-learn.org/stable/auto_examples/plot_anomaly_comparison.html
        Example of different used outlier detection algorithms
    """
    # pylint: disable = too-many-arguments, too-many-locals, too-many-branches

    if cluster_params is None:
        cluster_params = {}
    if detector_params is None:
        detector_params = {}

    beta_clusters = []
    id_clusters = []

    if scale_betas:
        betas, _ = __rescale_betas(betas)

    if detector == "Experimental":
        # Fixed: the Experimental entry is commented out of __detector_dict, so the
        # old dict lookup raised KeyError — call the experimental detector directly.
        experimental_results = __experimental_outlier_detector(
            betas, beta_index, **detector_params
        )
        if not isinstance(experimental_results, bool):
            outlier_betas, outlier_index, inlier_index = experimental_results
            beta_clusters.append(betas[outlier_index])
            id_clusters.append(outlier_betas.tolist())
            betas = betas[inlier_index]
            beta_index = beta_index[inlier_index]
        else:
            # No outliers found: keep placeholder entries so downstream indexes line up.
            # Fixed: the original appended the SAME list object to both containers,
            # so a later mutation of one would show up in the other.
            beta_clusters.append([])
            id_clusters.append([])

        # the experimental path replaces the regular detector step below
        detector = None

    if detector is not None:
        try:
            # detector functions append the outlier cluster themselves and
            # return the remaining inlier betas/ids
            betas_det, index_det = __detector_dict[detector](
                betas, beta_index, beta_clusters, id_clusters, **detector_params
            )

        except TypeError as key_err:
            err_msg = (
                f"During Outlier Detection, a TypeError came up:\n{str(key_err)}\n"
                "Please check your outlier detection arguments"
            )
            return err_msg

        except ValueError as val_err:
            err_msg = (
                f"During Outlier Detection, a ValueError came up:\n{str(val_err)}\n"
                "Please check your outlier detection arguments"
            )
            return err_msg
    else:
        betas_det, index_det = betas, beta_index

    if cluster is not None:
        try:
            # cluster functions append their groups directly into the result lists
            __cluster_dict[cluster](
                betas_det, index_det, beta_clusters, id_clusters, **cluster_params
            )
        except TypeError as key_err:
            # Fixed copy-paste: these messages previously told the user to check
            # their "outlier detection arguments".
            err_msg = (
                f"During Clustering, a TypeError came up:\n{str(key_err)}\n"
                "Please check your clustering arguments"
            )
            return err_msg

        except ValueError as val_err:
            err_msg = (
                f"During Clustering, a ValueError came up:\n{str(val_err)}\n"
                "Please check your clustering arguments"
            )
            return err_msg
    else:
        # no clustering requested: hand back the (possibly outlier-filtered) data as one group
        beta_clusters, id_clusters = [*beta_clusters, betas_det], [*id_clusters, index_det]

    return beta_clusters, id_clusters