Coverage for lasso/dimred/hashing.py: 0% of 217 statements (coverage.py v7.2.4, created at 2023-04-28 18:42 +0100)

import multiprocessing
import os
import time
from typing import List, Tuple, Union, Sequence

import h5py
import numpy as np
from scipy import integrate
from sklearn.neighbors import KDTree

from ..math.stochastic import jensen_shannon_entropy


def _match_modes(
    hashes1: np.ndarray,
    hashes2: np.ndarray,
    eigenvectors_sub1: np.ndarray,
    eigenvectors_sub2: np.ndarray,
):
    """Match the eigenvalue modes

    Parameters
    ----------
    hashes1 : np.ndarray
        hashes of first model
    hashes2 : np.ndarray
        hashes of second model
    eigenvectors_sub1 : np.ndarray
        eigenvector field of first model
    eigenvectors_sub2 : np.ndarray
        eigenvector field of second model

    Returns
    -------
    matches : list(tuple(int, int))
        indexes of the matched modes
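
    Examples
    --------
    A minimal doctest-style sketch with synthetic data (assumed shapes,
    not from the original docs: hashes are (n_modes, 2, n_points),
    eigenvector fields are (n_samples, n_modes)):

    >>> import numpy as np
    >>> hashes1 = np.random.rand(3, 2, 100)
    >>> hashes2 = np.random.rand(3, 2, 100)
    >>> eig1 = np.random.rand(500, 3)
    >>> eig2 = np.random.rand(500, 3)
    >>> matches = _match_modes(hashes1, hashes2, eig1, eig2)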
38 """
40 matches = []
41 mode1_hash_indexes = list(range(len(hashes1)))
42 mode2_hash_indexes = list(range(len(hashes2)))
44 for i_hash in mode1_hash_indexes:
46 field1 = eigenvectors_sub1[:, i_hash]
48 found_match = False
49 for j_entry, j_hash in enumerate(mode2_hash_indexes):
51 field2 = eigenvectors_sub2[:, j_hash]
53 if is_mode_match(field1, field2):
54 matches.append((i_hash, j_hash))
55 del mode2_hash_indexes[j_entry]
56 found_match = True
57 break
59 if not found_match:
60 break
62 return matches


def is_orientation_flip_required(eigenvectors1: np.ndarray, eigenvectors2: np.ndarray):
    """Check whether the eigenvector fields need to be flipped

    Parameters
    ----------
    eigenvectors1 : np.ndarray
        eigenvector field of mesh1.
    eigenvectors2 : np.ndarray
        eigenvector field of mesh2.

    Returns
    -------
    flip_required : bool or np.ndarray
        whether a flip of the eigenvector field is required.

    Note
    ----
    If the eigenvector fields have multiple modes (e.g. shape (n_nodes, n_modes)),
    then a flip value is returned for every mode.

    The eigenmodes require flipping if the dot product of the knn-matched
    eigenvector fields yields a negative result.
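
    Examples
    --------
    A minimal doctest-style sketch (synthetic field; a sign-flipped copy
    must be flipped back):

    >>> import numpy as np
    >>> field = np.random.rand(100)
    >>> bool(is_orientation_flip_required(field, -field))
    True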
87 """
88 assert eigenvectors1.shape == eigenvectors2.shape
90 # one eigenmode only
91 if eigenvectors1.ndim == 1:
92 knn_error_basic = np.dot(eigenvectors1, eigenvectors2)
93 return knn_error_basic < 0
95 # multiple eigenmodes
96 n_modes = min(eigenvectors1.shape[1], eigenvectors2.shape[1])
97 errors = [
98 np.dot(eigenvectors1[:, i_mode], eigenvectors2[:, i_mode]) for i_mode in range(n_modes)
99 ]
101 return np.array([err < 0 for err in errors])


def _compute_mode_similarities(
    hashes1: np.ndarray,
    hashes2: np.ndarray,
    eigenvectors_sub1: np.ndarray,
    eigenvectors_sub2: np.ndarray,
    matches: List[Tuple[int, int]],
) -> List[float]:
    """Compute the mode similarity between different meshes

    Parameters
    ----------
    hashes1 : np.ndarray
        hashes of first model
    hashes2 : np.ndarray
        hashes of second model
    eigenvectors_sub1 : np.ndarray
        eigenvector field of first model
    eigenvectors_sub2 : np.ndarray
        eigenvector field of second model
    matches : list(tuple(int, int))
        matches of modes (every match will be computed)

    Returns
    -------
    mode_similarities : list(float)
        similarities of the matched modes

    Notes
    -----
    This function cannot deal with unequal sampling of the input hashes.
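
    Examples
    --------
    A minimal doctest-style sketch (synthetic data; comparing a mode
    with itself yields a similarity of roughly 1):

    >>> import numpy as np
    >>> hashes = np.random.rand(2, 2, 100)
    >>> eig = np.random.rand(500, 2)
    >>> sims = _compute_mode_similarities(hashes, hashes, eig, eig, [(0, 0)])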
134 """
136 mode_similarities = []
137 for i_hash, j_hash in matches:
139 assert hashes1.shape[2] == hashes2.shape[2]
141 field1 = eigenvectors_sub1[:, i_hash]
142 field2 = eigenvectors_sub2[:, j_hash]
144 # flip orientation of eigenvector field and hash if required
145 if is_orientation_flip_required(field1, field2):
146 field2 *= -1
147 # hdf5 can not handle negative slicing
148 mode_ = np.array(hashes2[j_hash, 1], copy=True)
149 mode_ = mode_[::-1]
150 else:
151 mode_ = hashes2[j_hash, 1, :]
153 # Warning: x is usually originally hashes[i_mode, 0]
154 x = np.linspace(0, 1, hashes1.shape[2])
155 norm1 = curve_normalizer(x, hashes1[i_hash, 1])
156 norm2 = curve_normalizer(x, mode_)
157 if norm1 != 0 and norm2 != 0:
158 mode_similarities.append(
159 integrate.simps(hashes1[i_hash, 1] * mode_ / np.sqrt(norm1 * norm2), x=x)
160 )
161 else:
162 mode_similarities.append(0)
164 return mode_similarities


def _join_hash_comparison_thread_files(
    comparison_filepath: str, thread_filepaths: Sequence[str], n_runs: int
):
    """Merge the per-thread comparison files into a single result file"""
    # pylint: disable = too-many-locals

    if os.path.exists(comparison_filepath):
        if os.path.isfile(comparison_filepath):
            os.remove(comparison_filepath)
        else:
            raise OSError("Can not delete directory", comparison_filepath)

    with h5py.File(comparison_filepath, "w") as hdf5_file:
        smatrix = hdf5_file.create_dataset(
            "similarity_matrix",
            shape=(n_runs, n_runs, 25),
            maxshape=(n_runs, n_runs, None),
            dtype="float64",
            compression="gzip",
        )
        ds_matches = hdf5_file.create_dataset(
            "matches",
            shape=(n_runs, n_runs, 25, 2),
            maxshape=(n_runs, n_runs, None, 2),
            dtype="int64",
            compression="gzip",
        )
        ds_weights = hdf5_file.create_dataset(
            "weights",
            shape=(n_runs, n_runs, 25),
            maxshape=(n_runs, n_runs, None),
            dtype="float64",
            compression="gzip",
        )

        for thread_filepath in thread_filepaths:

            # open thread file
            with h5py.File(thread_filepath, "r") as thread_file:

                # insert matrix entries
                matrix_indexes = thread_file["matrix_indexes"]
                matrix_similarities = thread_file["matrix_similarities"]
                matrix_matches = thread_file["matrix_matches"]
                thread_weights = thread_file["weights"]

                for (i_row, i_col), values, matches in zip(
                    matrix_indexes, matrix_similarities, matrix_matches
                ):
                    smatrix[i_row, i_col] = values
                    ds_matches[i_row, i_col] = matches
                    ds_weights[i_row, i_col] = (thread_weights[i_row] + thread_weights[i_col]) / 2

            # delete thread file
            os.remove(thread_filepath)


def run_hash_comparison(
    comparison_filepath: str,
    hashes_filepaths: List[str],
    n_threads: int = 1,
    print_progress: bool = False,
):
    """Compare the hashes of simulation runs pairwise

    Parameters
    ----------
    comparison_filepath : str
        filepath to the hdf5 file in which the result of the comparison
        will be stored
    hashes_filepaths : List[str]
        filepaths to the stored hashes
    n_threads : int
        number of threads used for the comparison
    print_progress : bool
        whether to print the progress
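
    Examples
    --------
    A minimal sketch (hypothetical filepaths; the hash files are assumed
    to have been written beforehand, e.g. from the output of
    compute_hashes):

    >>> run_hash_comparison(
    ...     "comparison.h5",
    ...     ["run1_hashes.h5", "run2_hashes.h5"],
    ...     n_threads=2,
    ... )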
242 """
244 # pylint: disable = too-many-locals, too-many-statements
246 assert n_threads > 0
248 # fixed settings
249 hdf5_dataset_compression = "gzip"
251 # ! this is an inlined function !
252 # the actual function starts way much down
254 def _threading_run_comparison(run_indices, comparison_filepath, comm_q):
255 # pylint: disable = too-many-statements
257 n_comparisons_thread = len(run_indices)
259 # setup storage file
260 if os.path.exists(comparison_filepath):
261 if os.path.isfile(comparison_filepath):
262 os.remove(comparison_filepath)
263 else:
264 raise OSError("Can not delete directory", comparison_filepath)
266 hdf5_file = h5py.File(comparison_filepath, "w")
268 max_len = np.max([len(entry) for entry in hashes_filepaths])
269 hashes_filepaths_ascii = [entry.encode("ascii", "ignore") for entry in hashes_filepaths]
271 hdf5_file.require_dataset(
272 "filepaths",
273 data=hashes_filepaths_ascii,
274 shape=(len(hashes_filepaths_ascii), 1),
275 dtype=f"S{max_len}",
276 )
        n_modes_estimated = 25

        # could be compressed to one per run only!
        ds_weights = hdf5_file.create_dataset(
            "weights",
            (n_runs, n_modes_estimated),
            maxshape=(n_runs, None),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )
        # run indexes are integer pairs; int64 allows using them
        # directly for indexing when joining the thread files
        ds_matrix_indexes = hdf5_file.create_dataset(
            "matrix_indexes",
            (n_comparisons_thread, 2),
            dtype="int64",
            compression=hdf5_dataset_compression,
        )
        ds_matrix_values = hdf5_file.create_dataset(
            "matrix_similarities",
            (n_comparisons_thread, n_modes_estimated),
            maxshape=(n_comparisons_thread, None),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )

        # info only!
        ds_matrix_matches = hdf5_file.create_dataset(
            "matrix_matches",
            (n_comparisons_thread, n_modes_estimated, 2),
            maxshape=(n_comparisons_thread, None, 2),
            dtype="int64",
            compression=hdf5_dataset_compression,
        )
        def _save_data(computed_results, counter):

            start = counter + 1 - len(computed_results)
            for i_result, result in enumerate(computed_results):

                i_run, j_run = result["matrix_index"]
                similarities = result["similarities"]
                matches_tmp = result["matches"]

                ds_matrix_indexes[start + i_result, :] = i_run, j_run
                ds_matrix_values[start + i_result, : len(similarities)] = similarities
                ds_matrix_matches[start + i_result, : len(matches_tmp)] = matches_tmp
                weights1 = result["weights1"]
                n_weights1 = len(weights1)
                ds_weights[i_run, :n_weights1] = weights1
                weights2 = result["weights2"]
                n_weights2 = len(weights2)
                ds_weights[j_run, :n_weights2] = weights2

            computed_results.clear()

        # log
        computation_times = []
        io_times = []

        counter = None  # bugfix
        computed_results = []
        for counter, (i_run, j_run) in enumerate(run_indices):

            start = time.time()

            # get data (io)
            fp1 = h5py.File(hashes_filepaths[i_run], "r")
            fp2 = h5py.File(hashes_filepaths[j_run], "r")
            hashes1 = fp1["hashes"]
            hashes2 = fp2["hashes"]
            xyz1, xyz2 = fp1["subsample_xyz"], fp2["subsample_xyz"]
            eigenvalues1, eigenvalues2 = fp1["eigenvalues"], fp2["eigenvalues"]

            # hdf5 can only handle increasing indexes ... thus we need to copy the field
            eigenvectors_sub1 = np.array(fp1["eigenvectors"], copy=True)
            eigenvectors_sub2 = np.array(fp2["eigenvectors"], copy=True)

            # time
            io_times.append(time.time() - start)
            start = time.time()

            # match points roughly in xyz
            tree = KDTree(xyz1)
            knn_indexes = tree.query(xyz2, return_distance=False, k=1)
            eigenvectors_sub1 = np.squeeze(eigenvectors_sub1[knn_indexes])

            # match modes
            matches = _match_modes(hashes1, hashes2, eigenvectors_sub1, eigenvectors_sub2)

            # mode weights
            weights1 = get_mode_weights_inv(eigenvalues1)
            weights2 = get_mode_weights_inv(eigenvalues2)

            # compute mode similarity
            mode_similarities = _compute_mode_similarities(
                hashes1, hashes2, eigenvectors_sub1, eigenvectors_sub2, matches
            )

            # time
            computation_times.append(time.time() - start)

            # assemble computations
            computation_result = {
                "matrix_index": [i_run, j_run],
                "matches": matches,  # info only
                "similarities": mode_similarities,
                "weights1": weights1.tolist(),
                "weights2": weights2.tolist(),
            }
            computed_results.append(computation_result)

            # save to file occasionally
            if counter % 500 == 0:
                _save_data(computed_results, counter)

            # print status
            if comm_q and not comm_q.full():
                comm_q.put(
                    {
                        "i_entry": counter + 1,
                        "n_entries": len(run_indices),
                        "io_time": np.mean(io_times),
                        "computation_time": np.mean(computation_times),
                    },
                    False,
                )

        # dump at end (if anything was computed)
        if counter:
            _save_data(computed_results, counter)

        # close the storage file (flushes pending writes)
        hdf5_file.close()
    # <-- FUNCTION STARTS HERE

    # helper vars
    n_runs = len(hashes_filepaths)

    # THREADS
    if n_threads == 1:
        matrix_entries = []
        for i_run in range(n_runs):
            for j_run in range(i_run + 1, n_runs):
                matrix_entries.append((i_run, j_run))
        _threading_run_comparison(matrix_entries, comparison_filepath, None)
    else:
        # enlist runs
        thread_matrix_entries = [[] for i_thread in range(n_threads)]
        i_thread = 0
        for i_run in range(n_runs):
            for j_run in range(i_run + 1, n_runs):
                thread_matrix_entries[i_thread % n_threads].append((i_run, j_run))
                i_thread += 1

        # comm queues
        queues = [multiprocessing.Queue(1) for i_thread in range(n_threads)]

        # run threads
        thread_filepaths = [
            comparison_filepath + f"_thread{i_thread}" for i_thread in range(n_threads)
        ]
        threads = [
            multiprocessing.Process(
                target=_threading_run_comparison,
                args=(matrix_indexes, thread_filepaths[i_thread], queues[i_thread]),
            )
            for i_thread, matrix_indexes in enumerate(thread_matrix_entries)
        ]
        for thread in threads:
            thread.start()
        # logging
        if print_progress:
            thread_stats = [
                {
                    "i_entry": 0,
                    "n_entries": len(thread_matrix_entries[i_thread]),
                    "io_time": 0,
                    "computation_time": 0,
                }
                for i_thread in range(n_threads)
            ]

            def _format_progress_msg():
                thread_msg_list = [
                    (
                        f"Thread {i_thread}: "
                        f"{(100 * stats['i_entry'] / stats['n_entries']):.1f}% "
                        f"({stats['i_entry']}/{stats['n_entries']}) "
                        f"{stats['computation_time']:.2f}s | "
                    )
                    for i_thread, stats in enumerate(thread_stats)
                ]
                return "| " + "".join(thread_msg_list) + "\r"

            while any(thread.is_alive() for thread in threads):

                # fetch data from channel
                for i_thread, comm_q in enumerate(queues):
                    if not comm_q.empty():
                        thread_stats[i_thread] = comm_q.get(False)

                # print msg
                print(_format_progress_msg(), end="")
                time.sleep(0.35)

            # print completion message
            print(_format_progress_msg(), end="")

            print("")
            print("done.")

        # join thread worker files
        for thread in threads:
            thread.join()
        _join_hash_comparison_thread_files(comparison_filepath, thread_filepaths, n_runs)


def is_mode_match(
    eigenvectors1: np.ndarray,
    eigenvectors2: np.ndarray,
    knn_indexes: Union[np.ndarray, None] = None,
):
    """Detect a mode match from the eigenvector field

    Parameters
    ----------
    eigenvectors1 : np.ndarray
        subsample eigenvector field from model 1
    eigenvectors2 : np.ndarray
        subsample eigenvector field from model 2
    knn_indexes : np.ndarray, optional
        knn indexes obtained from matching xyz1 and xyz2 of the
        eigenvector fields, so that only the coordinates of near
        points will be compared

    Returns
    -------
    is_matched : bool
        whether the modes match

    Notes
    -----
    A mode match is detected by watching the distribution
    of the eigenvector field errors. In case of a mode switch,
    a correct orientation of the field is (obviously) not possible.
    In such a case the probability distributions of the basic error
    and the inverted error are quite similar, since both are wrong.

    A matching orientation (empirically) seems to have a normal
    distribution like character. A non-matching orientation
    is more like a uniform distribution (constantly wrong across
    the entire model).
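
    Examples
    --------
    A minimal doctest-style sketch (synthetic field; a field is expected
    to match itself):

    >>> import numpy as np
    >>> field = np.random.rand(1000)
    >>> matched = is_mode_match(field, field)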
535 """
537 # pylint: disable = too-many-locals
539 # if the jensen-shannon-divergence is below this value
540 # then a mode switch is assumed
541 distance_limit = 0.1
543 # number of bins for probability distribution
544 n_bins = 25
546 # (1) match sub-samples in xyz
547 # tree = KDTree(xyz1)
548 # indexes = tree.query(xyz2, return_distance=False, k=1)
550 # (2) compute field errors of normal field and inverted field
551 if knn_indexes:
552 tmp1 = eigenvectors1[knn_indexes].flatten() - eigenvectors2
553 tmp2 = eigenvectors1[knn_indexes].flatten() + eigenvectors2
554 else:
555 tmp1 = eigenvectors1 - eigenvectors2
556 tmp2 = eigenvectors1 + eigenvectors2
558 # (3) create a probability distribution for each error vector
560 # bin the values
561 xmin = min(tmp1.min(), tmp2.min())
562 xmax = max(tmp1.max(), tmp2.max())
563 bins = np.linspace(xmin, xmax, n_bins)
564 indexes_p1 = np.digitize(tmp1, bins)
565 indexes_p2 = np.digitize(tmp2, bins)
566 p1 = np.bincount(indexes_p1) / len(tmp1)
567 p2 = np.bincount(indexes_p2) / len(tmp2)
569 # align bin vector size
570 p1_tmp = np.zeros(max(len(p1), len(p2)))
571 p2_tmp = np.zeros(max(len(p1), len(p2)))
572 p1_tmp[: len(p1)] = p1
573 p2_tmp[: len(p2)] = p2
574 p1 = p1_tmp
575 p2 = p2_tmp
577 # compute similarity
578 similarity_js = jensen_shannon_entropy(p1, p2)
580 return similarity_js > distance_limit


def get_mode_weights_inv(vec: np.ndarray):
    """Inverse value weights (higher decay than softmax)
    """
    val = 1.0 / vec[:]
    return val / np.sum(val)


def curve_normalizer(x: np.ndarray, y: np.ndarray):
    """Compute the curve normalizer for a curve dot product

    Parameters
    ----------
    x : np.ndarray
        array of x values
    y : np.ndarray
        array of y values

    Returns
    -------
    norm : float
        normalizing factor
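
    Examples
    --------
    A minimal doctest-style sketch (the squared constant curve y = 1
    integrates to 1 over [0, 1]):

    >>> import numpy as np
    >>> x = np.linspace(0, 1, 101)
    >>> round(float(curve_normalizer(x, np.ones(101))), 6)
    1.0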
603 """
604 return integrate.simps(y**2, x=x)


def compute_hashes(
    eig_vecs: np.ndarray,
    result_field: np.ndarray,
    n_points: int = 100,
    bandwidth: float = 0.05,
) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Compute hashes for a result field

    Parameters
    ----------
    eig_vecs : np.ndarray
        eigenvector field of the component with shape (n_samples, n_modes)
    result_field : np.ndarray
        result field to hash
    n_points : int
        resolution of the hash; number of equidistant points to use
        for smoothing. Should be determined from the mesh size
        (2.5 times average element size).
    bandwidth : float
        bandwidth of the smoothing kernel, as a fraction of the value range.
        Recommended as 5 times the global element size median.

    Returns
    -------
    hash_functions : list(tuple(np.ndarray, np.ndarray))
        list of the computed hash functions. Every item is the hash for
        an eigenmode. The hash consists of a pair of two functions: (x, y).
        For comparison, only y is usually used.
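
    Examples
    --------
    A minimal doctest-style sketch (synthetic eigenvector field and
    result field; one hash per eigenmode is returned):

    >>> import numpy as np
    >>> eig_vecs = np.random.rand(1000, 5)
    >>> result_field = np.random.rand(1000)
    >>> hashes = compute_hashes(eig_vecs, result_field)
    >>> len(hashes)
    5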
634 """
636 assert eig_vecs.shape[0] == len(result_field), f"{eig_vecs.shape[0]} != {len(result_field)}"
638 # Note: needs to be vectorized to speed it up
640 hash_functions = []
641 for i_eigen in range(eig_vecs.shape[1]):
643 xmin = eig_vecs[:, i_eigen].min()
644 xmax = eig_vecs[:, i_eigen].max()
646 x = np.linspace(xmin, xmax, n_points)
647 y = np.zeros(n_points)
649 local_bandwidth = bandwidth * (xmax - xmin)
650 c = -0.5 / local_bandwidth**2
652 for ii, point in enumerate(x):
653 y[ii] = np.dot(result_field, np.exp(c * np.square(point - eig_vecs[:, i_eigen])))
654 y /= np.sqrt(2 * np.pi) * bandwidth
656 hash_functions.append((x, y))
658 return hash_functions