Coverage for lasso/dimred/svd/clustering_betas.py: 48%
223 statements
« prev ^ index » next coverage.py v7.2.4, created at 2023-04-28 18:42 +0100
1from typing import Sequence, Tuple, Union
3import numpy as np
4from sklearn.cluster import DBSCAN, OPTICS, KMeans, SpectralClustering
5from sklearn.ensemble import IsolationForest
6from sklearn.neighbors import LocalOutlierFactor
7from sklearn.svm import OneClassSVM
9from .keyword_types import ClusterType, DetectorType
def __apply_spectral_clustering(betas, runids, datasets, idsets, random_state=11, **kwargs):
    """
    Group the input betas into clusters with sklearn's SpectralClustering.
    Default keyword arguments: affinity='nearest_neighbors', random_state=11

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    random_state: int
        Seed forwarded to SpectralClustering for reproducible results
    **kwargs: keyword arguments
        Keyword arguments specific to the SpectralClustering algorithm

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering
    """  # noqa pylint: disable = line-too-long
    labels = SpectralClustering(random_state=random_state, **kwargs).fit(betas).labels_

    # one boolean mask per distinct label; append betas and ids cluster by cluster
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __apply_k_means(betas, runids, datasets, idsets, random_state=11, **kwargs):
    """
    Group the input betas into clusters with sklearn's KMeans.
    Recommended keyword arguments: n_clusters=3, random_state=11

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    random_state: int
        Seed forwarded to KMeans for reproducible results
    **kwargs: keyword arguments
        Keyword arguments specific to the KMeans algorithm

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
    """  # noqa: E501 pylint: disable = line-too-long
    labels = KMeans(random_state=random_state, **kwargs).fit(betas).labels_

    # one boolean mask per distinct label; append betas and ids cluster by cluster
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __apply_dbscan(betas, runids, datasets, idsets, **kwargs):
    """
    Group the input betas into clusters with sklearn's DBSCAN.
    Default keyword arguments: eps=0.08

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    **kwargs: keyword arguments
        Keyword arguments for the DBSCAN algorithm

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN
    """  # noqa: E501 pylint: disable = line-too-long
    scan = DBSCAN(**kwargs).fit(betas)
    labels = scan.labels_

    # one boolean mask per distinct label (label -1 = DBSCAN noise, kept as its own group)
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __apply_optics(betas, runids, datasets, idsets, **kwargs):
    """
    Group the input betas into clusters with sklearn's OPTICS.
    Default keyword parameters: eps=0.05, min_cluster_size=10

    Parameters
    ----------
    betas: np.ndarray
        Betas that shall be grouped into clusters
    runids: np.ndarray
        Ids matching to each beta
    datasets: list
        List where each group of betas will be added
    idsets: list
        List where the grouped ids corresponding to the grouped betas will be saved
    **kwargs: keyword arguments
        Keyword arguments specific to the OPTICS function.

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS
    """  # noqa: E501 pylint: disable = line-too-long
    fitted = OPTICS(**kwargs).fit(betas)
    labels = fitted.labels_

    # one boolean mask per distinct label (label -1 = OPTICS noise, kept as its own group)
    for label in np.unique(labels):
        members = labels == label
        datasets.append(betas[members])
        idsets.append(runids[members].tolist())
def __detect_outliers_isolation_forest(
    betas, ids, beta_clusters, id_clusters, random_state=11, **kwargs
):
    """
    Detect outliers with sklearn's IsolationForest algorithm.
    Detected outliers are appended into the provided lists.
    Default keyword parameters: random_state=12, behaviour="new", contamination=0.005

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    beta_clusters: list
        List where each cluster of betas will be appended
    id_clusters: list
        List where each cluster of ids will be appended
    random_state: int
        Seed forwarded to IsolationForest for reproducible results
    **kwargs: keyword argument
        Keywords specific to the IsolationForest algorithm

    Returns
    -------
    inlier_betas: np.array
        Numpy array containing the betas that are not outliers
    inlier_ids: np.array
        Numpy array containing the ids of betas that are not outliers

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
    """  # noqa: E501 pylint: disable = line-too-long
    forest = IsolationForest(random_state=random_state, **kwargs)
    verdict = forest.fit(betas).predict(betas)

    # predict() marks outliers with -1 and inliers with +1
    is_outlier = verdict == -1
    is_inlier = verdict == 1
    beta_clusters.append(betas[is_outlier])
    id_clusters.append(ids[is_outlier].tolist())

    return betas[is_inlier], ids[is_inlier]
def __detect_outliers_local_outlier_factor(betas, ids, beta_clusters, id_clusters, **kwargs):
    """
    Detect outliers with sklearn's LocalOutlierFactor algorithm.
    Detected outliers are appended into the provided lists.
    Default keyword parameters: contamination=0.01

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    beta_clusters: list
        List where each cluster of betas will be appended
    id_clusters: list
        List where each cluster of ids will be appended
    **kwargs: keyword argument
        Keywords specific to the LocalOutlierFactor algorithm.

    Returns
    -------
    inlier_betas: np.ndarray
        Numpy array containing the betas that are not outliers
    inlier_ids: np.ndarray
        Numpy array containing the ids of betas that are not outliers

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link:https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
    """  # noqa pylint: disable = line-too-long
    verdict = LocalOutlierFactor(**kwargs).fit_predict(betas)

    # fit_predict() marks outliers with -1 and inliers with +1
    is_outlier = verdict == -1
    is_inlier = verdict == 1
    beta_clusters.append(betas[is_outlier])
    id_clusters.append(ids[is_outlier].tolist())

    return betas[is_inlier], ids[is_inlier]
def __detect_outliers_one_class_svm(betas, ids, beta_clusters, id_clusters, **kwargs):
    """
    Detect outliers with sklearn's OneClassSVM algorithm.
    Detected outliers are appended into the provided lists.
    Default keyword arguments: gamma=0.1, nu=0.01

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    beta_clusters: list
        List where each cluster of betas will be appended
    id_clusters: list
        List where each cluster of ids will be appended
    **kwargs: keyword argument
        Keywords specific to the OneClassSVM algorithm.

    Returns
    -------
    inlier_betas: np.ndarray
        Numpy array containing the betas that are not outliers
    inlier_ids: np.ndarray
        Numpy array containing the ids of betas that are not outliers

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
    """  # noqa: E501 pylint: disable = line-too-long
    verdict = OneClassSVM(**kwargs).fit_predict(betas)

    # fit_predict() marks outliers with -1 and inliers with +1
    is_outlier = verdict == -1
    is_inlier = verdict == 1
    beta_clusters.append(betas[is_outlier])
    id_clusters.append(ids[is_outlier].tolist())

    return betas[is_inlier], ids[is_inlier]
def __experimental_outlier_detector(betas, ids, **kwargs):
    """
    Detects outliers by applying the LocalOutlierFactor algorithm from sklearn over
    multiple overlapping 3-column slices of the betas.
    A sample is reported as an outlier only if it is flagged in more than two slices.
    Default keyword arguments: contamination=0.01

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas
    ids: np.ndarray
        Numpy array containing the ids of each beta
    **kwargs: keyword argument
        Keywords specific to the LocalOutlierFactor algorithm

    Returns
    -------
    outliers: np.array
        Numpy array containing the sample names identified as outliers
    outlier_index: np.array
        Array containing the indexes of outliers
    inlier_index: np.array
        Array of booleans to get inlier (not outlier) betas and IDs

    NOTE: returns the single value ``False`` (not a tuple) when no outlier is found;
    callers must check for a bool result before unpacking (see group_betas).

    See Also
    --------
    Detailed documentation of the function parameters can be found on sklearn.
    Link:https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
    """  # noqa pylint: disable = line-too-long
    # pylint: disable = too-many-locals

    # Slide a window of width 3 over the beta columns; each window position
    # gets its own LocalOutlierFactor pass.
    loops = betas.shape[1] - 2
    alertlist = []
    for dadoop in range(loops):
        slicer = dadoop + 3
        beta_slice = betas[:, dadoop:slicer]

        # fit_predict() marks outliers with -1; collect the row indices flagged
        # in this window
        sanitizer = LocalOutlierFactor(**kwargs).fit_predict(beta_slice)
        outlier_key = np.where(sanitizer == -1)[0]
        alertlist.append(outlier_key)

    # Count how many windows flagged each row index.
    suspects = np.concatenate(alertlist)
    individuals = np.unique(suspects)
    crimecounter = np.array([np.where(suspects == tracked)[0].shape[0] for tracked in individuals])

    # Only rows flagged in more than 2 windows count as outliers.
    the_cases = np.where(crimecounter > 2)[0]
    the_judged = ids[individuals[the_cases]]

    # Boolean inlier mask over all samples; outlier positions are set False below.
    innocents = np.full(ids.shape, True)

    if the_judged.shape != (0,):
        # Map each outlier id back to its first position in ids
        # (assumes ids are unique — TODO confirm; duplicates would use the first match only).
        judged_index = np.array([np.where(ids == convict)[0] for convict in the_judged])[:, 0]
        innocents[judged_index] = False
    else:
        # No outliers detected: signal with a bare False instead of a tuple.
        return False

    return the_judged, judged_index, innocents
def __rescale_betas(betas):
    """
    Utility function to rescale betas into the range of [0, 1].
    Expects only positive betas

    Parameters
    ----------
    betas: np.ndarray
        Numpy array containing the betas to be scaled. Expects betas of shape (samples, nr_betas)

    Returns
    -------
    betas_scaled: np.ndarray
        Betas scaled to range [0, 1]
    maxb: np.ndarray
        Array to rescale betas back to original values
    """
    assert len(betas.shape) == 2
    # Per-column maximum of |betas|, vectorized (replaces a per-column Python loop).
    maxb = np.abs(betas).max(axis=0)
    # NOTE(review): an all-zero column triggers a division by zero here — behavior
    # unchanged from the original; callers are expected to pass non-degenerate betas.
    return betas / maxb, maxb
def list_detectors_and_cluster():
    """
    Prints out all keywords for outlier detection and clustering functions

    See Also
    --------
    document_algorithm(keyword)"""
    # Fixed cross-reference: the See Also previously pointed at this function itself
    # instead of document_algorithm(keyword).
    print("Implemented Detectors:")
    for entry in __detector_dict:
        print(" " + entry)
    print("Implemented Clustering Functions")
    for entry in __cluster_dict:
        print(" " + entry)
def document_algorithm(keyword):
    """
    Print the docstring of the detection or clustering function behind a keyword.

    Parameters
    ----------
    keyword: str
        String keyword referencing the outlier detection or clustering function

    See Also
    --------
    list_detectors_and_cluster()
    """
    docstring = __doc_dict[keyword]
    print(docstring)
# Maps each detector/cluster keyword to the docstring of its implementation;
# consumed by document_algorithm() to print help text for a keyword.
__doc_dict = {
    DetectorType.IsolationForest: __detect_outliers_isolation_forest.__doc__,
    DetectorType.OneClassSVM: __detect_outliers_one_class_svm.__doc__,
    DetectorType.LocalOutlierFactor: __detect_outliers_local_outlier_factor.__doc__,
    # DetectorType.Experimental: __experimental_outlier_detector.__doc__,
    ClusterType.OPTICS: __apply_optics.__doc__,
    ClusterType.DBSCAN: __apply_dbscan.__doc__,
    ClusterType.KMeans: __apply_k_means.__doc__,
    ClusterType.SpectralClustering: __apply_spectral_clustering.__doc__,
}

# Dispatch table: detector keyword -> outlier detection function (used by group_betas).
# NOTE(review): the Experimental entry is commented out, so looking it up here raises
# KeyError; group_betas handles "Experimental" as a special case.
__detector_dict = {
    DetectorType.IsolationForest: __detect_outliers_isolation_forest,
    DetectorType.OneClassSVM: __detect_outliers_one_class_svm,
    DetectorType.LocalOutlierFactor: __detect_outliers_local_outlier_factor,
    # DetectorType.Experimental: __experimental_outlier_detector
}

# Dispatch table: cluster keyword -> clustering function (used by group_betas).
__cluster_dict = {
    ClusterType.OPTICS: __apply_optics,
    ClusterType.DBSCAN: __apply_dbscan,
    ClusterType.KMeans: __apply_k_means,
    ClusterType.SpectralClustering: __apply_spectral_clustering,
}
def create_cluster_arg_dict(args: Sequence[str]) -> Union[Tuple[str, dict], str]:
    """Determines which cluster to use and creates a python dictionary to use as cluster_params

    Parameters
    ----------
    args: Sequence[str]
        List of strings: the cluster type followed by (parameter, type, value) triplets

    Returns
    -------
    cluster_type: str
        determines which cluster algorithm to use
    cluster_arg_dict: dict
        dictionary containing arguments and values for specific cluster_type
    err_msg: str
        message containing error, mostly unrecognised keywords"""

    # first argument must be cluster type
    cluster_key = args[0].lower()
    cluster_arg_dict = {}
    cluster_type = None
    param_type = []

    # all following arguments are a parameter, followed by its type and value
    parameters = []
    values = []
    # A valid argument list has length 1 (type only) + 3 per parameter, i.e. 1 mod 3.
    # Fixed: the old check (len % 3 == 0) let len % 3 == 2 slip through and silently
    # dropped the trailing arguments.
    if len(args) % 3 != 1:
        err_msg = (
            "Invalid cluster arguments, first argument must be the chosen clustering algorithm,"
            " and each optional subsequent parameter must be followed by its type and value"
        )
        return err_msg
    if len(args) > 1:
        # check if we even have parameters
        parameters = args[1:-2:3]
        param_type = args[2:-1:3]
        values = args[3::3]

    # case-insensitive match of the requested cluster type against the known ones
    for cluster_option in ClusterType.get_cluster_type_name():
        if cluster_key == cluster_option.lower():
            cluster_type = cluster_option
    if not cluster_type:
        # Fixed: added the missing space before "possible" in the error message.
        err_msg = (
            f"No existing clustering method matching {args[0]} "
            f"possible clustering methods are: {str(ClusterType.get_cluster_type_name())[1:-1]}"
        )
        return err_msg

    # convert each value string to its declared type
    type_lookup = {"str": str, "float": float, "int": int}
    for ind, param in enumerate(parameters):
        p_t = param_type[ind]
        v_type = type_lookup.get(p_t)
        if v_type is None:
            err_msg = f"Clustering: Invalid type identifier {p_t}"
            return err_msg

        try:
            val = v_type(values[ind])
        except ValueError:
            err_msg = (
                f"Clustering: Invalid value {values[ind]} "
                f"for parameter {param} of type {v_type}"
            )
            return err_msg
        cluster_arg_dict[param] = val

    return cluster_type, cluster_arg_dict
def create_detector_arg_dict(args: Sequence[str]) -> Union[Tuple[str, dict], str]:
    """Determines which detector to use and creates a python dictionary to use as detector_params

    Parameters
    ----------
    args: Sequence[str]
        List of strings: the detector type followed by (parameter, type, value) triplets

    Returns
    -------
    detector_type: str
        determines which cluster algorithm to use
    detector_arg_dict: dict
        dictionary containing arguments and values for specific cluster_type
    err_mgs: str
        message containing error, mostly unrecognised keywords"""

    # first argument must be detector type:
    detector_key = args[0].lower()
    detector_arg_dict = {}
    detector_type = None
    param_type = []

    # all following arguments are a parameter, followed by its type and value
    parameters = []
    values = []
    # A valid argument list has length 1 (type only) + 3 per parameter, i.e. 1 mod 3.
    # Fixed: the old check (len % 3 == 0) let len % 3 == 2 slip through and silently
    # dropped the trailing arguments.
    if len(args) % 3 != 1:
        err_msg = (
            "Invalid outlier detector arguments, first argument must be "
            "the chosen detector algorithm, and each optional subsequent "
            "parameter must be followed by its type and value"
        )
        return err_msg
    if len(args) > 1:
        # check if we even have parameters
        parameters = args[1:-2:3]
        param_type = args[2:-1:3]
        values = args[3::3]

    # case-insensitive match of the requested detector type against the known ones
    for detector_option in DetectorType.get_detector_type_name():
        if detector_key == detector_option.lower():
            detector_type = detector_option
    if not detector_type:
        err_msg = (
            f"No existing outlier detection method matching {args[0]} "
            f"possible outlier detection methods are: "
            f"{str(DetectorType.get_detector_type_name())[1:-1]}"
        )
        return err_msg

    # convert each value string to its declared type
    type_lookup = {"str": str, "float": float, "int": int}
    for ind, param in enumerate(parameters):
        p_t = param_type[ind]
        v_type = type_lookup.get(p_t)
        if v_type is None:
            err_msg = f"Outlier Detection: Invalid type identifier {p_t}"
            return err_msg

        try:
            val = v_type(values[ind])
        except ValueError:
            # Fixed: the second fragment was missing its f-prefix, so "{param}" and
            # "{v_type}" were emitted literally instead of being interpolated.
            err_msg = (
                f"Outlier Detection: Invalid value {values[ind]} "
                f"for parameter {param} of type {v_type}"
            )
            return err_msg
        detector_arg_dict[param] = val

    return detector_type, detector_arg_dict
def group_betas(
    beta_index,
    betas,
    scale_betas=False,
    cluster=None,
    detector=None,
    cluster_params=None,
    detector_params=None,
) -> Union[Tuple[list, list], str]:
    """
    Base function to group betas into groups and detect outliers. Provides that all different
    clustering and outlier detection algorithms are implemented in an easy to access environment.
    To select different clustering and outlier detection algorithms, please use appropriate
    KeywordTypes. A description of each function can be accessed with document_algorithm(keyword)
    A list of all functions can be accessed with list_detectors_and_cluster()

    Parameters
    ----------
    beta_index: np.ndarray
        Array containing the file names specific to the betas with the same index in the beta array
    betas: np.ndarray
        Numpy array containing the betas.
        Betas are expected to be of shape (samples, timestep, 3)
        The three entries per beta can either be dimensions (x,y,z) or any three betas/eigenvalues
    scale_betas: bool, optional, default: False.
        If True, betas are rescaled into the range [0, 1] before processing
    cluster: str, optional, default: None.
        String specifying which clustering algorithm shall be applied.
        Use ClusterType for easier access. If None, no clustering is performed
    detector: str, optional, default: None.
        String specifying which outlier detection algorithm shall be applied.
        Use DetectorType for easier access
    cluster_params: dict, optional
        Dictionary containing parameters for the clustering algorithms.
        See the sklearn documentation for the function to learn more.
    detector_params: dict, optional
        Dictionary containing parameters for the outlier detection algorithms.
        See the sklearn documentation for the function to learn more

    Returns
    -------
    beta_clusters: list
        List containing Numpy Arrays of betas in one cluster.
        If a detector was selected, or the clustering algorithm has its
        own outlier detection, the first entry in the list will be outlier betas
    id_clusters: list
        List containing lists of beta ids. Each id corresponds to the beta in
        the same place in the beta_clusters list
    err_msg: str
        Error message if wrong keywords for detector or cluster algorithms were used

    Notes
    --------
    document_algorithm:
        Prints docstring of each function into console
    list_detectors_and_cluster:
        Prints out all detection and clustering algorithms into console
    Sklearn Userguide chapter 2.3 Clustering:
        https://scikit-learn.org/stable/modules/clustering.html
        Detailed overview of different clustering algorithms
    Sklearn Examples outlier detection:
        https://scikit-learn.org/stable/auto_examples/plot_anomaly_comparison.html
        Example of different used outlier detection algorithms
    """
    # pylint: disable = too-many-arguments, too-many-locals, too-many-branches

    if cluster_params is None:
        cluster_params = {}
    if detector_params is None:
        detector_params = {}

    beta_clusters = []
    id_clusters = []

    if scale_betas:
        betas, _ = __rescale_betas(betas)

    if detector == "Experimental":
        # Fixed: the Experimental entry is commented out of __detector_dict, so the
        # old dict lookup raised KeyError — call the experimental detector directly.
        experimental_results = __experimental_outlier_detector(
            betas, beta_index, **detector_params
        )
        if not isinstance(experimental_results, bool):
            outlier_betas, outlier_index, inlier_index = experimental_results
            beta_clusters.append(betas[outlier_index])
            id_clusters.append(outlier_betas.tolist())
            betas = betas[inlier_index]
            beta_index = beta_index[inlier_index]
        else:
            # No outliers found: keep placeholder entries so downstream indexes line up.
            # Fixed: the original appended the SAME list object to both containers,
            # so a later mutation of one would show up in the other.
            beta_clusters.append([])
            id_clusters.append([])

        # the experimental path replaces the regular detector step below
        detector = None

    if detector is not None:
        try:
            # detector functions append the outlier cluster themselves and
            # return the remaining inlier betas/ids
            betas_det, index_det = __detector_dict[detector](
                betas, beta_index, beta_clusters, id_clusters, **detector_params
            )

        except TypeError as key_err:
            err_msg = (
                f"During Outlier Detection, a TypeError came up:\n{str(key_err)}\n"
                "Please check your outlier detection arguments"
            )
            return err_msg

        except ValueError as val_err:
            err_msg = (
                f"During Outlier Detection, a ValueError came up:\n{str(val_err)}\n"
                "Please check your outlier detection arguments"
            )
            return err_msg
    else:
        betas_det, index_det = betas, beta_index

    if cluster is not None:
        try:
            # cluster functions append their groups directly into the result lists
            __cluster_dict[cluster](
                betas_det, index_det, beta_clusters, id_clusters, **cluster_params
            )
        except TypeError as key_err:
            # Fixed copy-paste: these messages previously told the user to check
            # their "outlier detection arguments".
            err_msg = (
                f"During Clustering, a TypeError came up:\n{str(key_err)}\n"
                "Please check your clustering arguments"
            )
            return err_msg

        except ValueError as val_err:
            err_msg = (
                f"During Clustering, a ValueError came up:\n{str(val_err)}\n"
                "Please check your clustering arguments"
            )
            return err_msg
    else:
        # no clustering requested: hand back the (possibly outlier-filtered) data as one group
        beta_clusters, id_clusters = [*beta_clusters, betas_det], [*id_clusters, index_det]

    return beta_clusters, id_clusters