Coverage for lasso/math/sampling.py: 59%

22 statements  

« prev     ^ index     » next       coverage.py v7.2.4, created at 2023-04-28 18:42 +0100

1import random 

2from typing import Union 

3import numpy as np 

4from sklearn.neighbors import KDTree 

5 

6 

def unique_subsamples(start: int, end: int, n_samples: int, seed=None) -> np.ndarray:
    """Retrieve unique subsample indexes

    Draws ``n_samples`` distinct indexes from the half-open range
    ``[start, end)``. If the range holds fewer than ``n_samples`` entries,
    the request is reduced to the size of the range, so every index of the
    range is returned exactly once.

    Parameters
    ----------
    start: int
        starting index of population (inclusive)
    end: int
        ending index of population (exclusive, requires start <= end)
    n_samples: int
        number of samples to draw
    seed: int
        seed for random number generator, the same seed always yields the
        same indexes

    Returns
    -------
    indexes: np.ndarray
        unique sample indexes as int64 array
    """
    assert start <= end

    # It is impossible to draw more unique indexes than the population holds.
    n_samples = min(n_samples, end - start)

    # A private generator produces the same sequence as seeding the
    # module-level one, but leaves the caller's global random state untouched.
    rng = random.Random(seed)
    return np.array(rng.sample(range(start, end), n_samples), dtype=np.int64)

35 

36 

def homogenize_density(
    points: np.ndarray,
    dim: int = 2,
    target_distance: Union[float, None] = None,
    n_neighbors: int = 18,
    seed=None,
) -> np.ndarray:
    """homogenize a cloud density by probabilities

    For every point the average distance to its nearest neighbors is
    computed. Points whose average distance reaches ``target_distance`` are
    always kept, the remaining ones are kept with probability
    ``(distance / target_distance) ** dim``.

    Parameters
    ----------
    points: np.ndarray
        point cloud
    dim: int
        intrinsic dimension of the data
    target_distance: float
        target distance to aim for, defaults to the median of the average
        neighborhood distances
    n_neighbors: int
        neighbors used for computation of average neighborhood distance,
        reduced automatically if the cloud has fewer other points
    seed: int
        seed for random number generator

    Returns
    -------
    is_selected: np.ndarray
        boolean array indicating which subsamples were selected
    """
    n_points = len(points)

    # With fewer than two points there is no neighborhood to evaluate and
    # nothing to thin out, so everything is kept.
    if n_points < 2:
        return np.ones(n_points, dtype=bool)

    # The query below asks for n_neighbors + 1 hits because the first column
    # is discarded (presumably the query point itself at distance zero), and
    # the tree cannot return more hits than it holds points.
    n_neighbors = min(n_neighbors, n_points - 1)

    d, _ = KDTree(points).query(points, k=n_neighbors + 1)
    d_average = np.average(d[:, 1:], axis=1)
    if target_distance is None:
        target_distance = np.median(d_average)

    # A private generator keeps the result reproducible for a given seed
    # without reseeding the caller's global random state.
    rng = random.Random(seed)

    # The random number is only drawn for points below the target distance
    # (short-circuit), which keeps the sequence for a given seed stable.
    return np.array(
        [
            dist >= target_distance or rng.random() < (dist / target_distance) ** dim
            for dist in d_average
        ],
        dtype=bool,
    )