Coverage for lasso/dimred/hashing.py: 0%

217 statements  

coverage.py v7.2.4, created at 2023-04-28 18:42 +0100

import multiprocessing
import os
import time
from typing import List, Tuple, Union, Sequence

import h5py
import numpy as np
from scipy import integrate
from sklearn.neighbors import KDTree


from ..math.stochastic import jensen_shannon_entropy


def _match_modes(
    hashes1: np.ndarray,
    hashes2: np.ndarray,
    eigenvectors_sub1: np.ndarray,
    eigenvectors_sub2: np.ndarray,
):
    """Match the eigenvalue modes

    Parameters
    ----------
    hashes1 : np.ndarray
        hashes of first model
    hashes2 : np.ndarray
        hashes of second model
    eigenvectors_sub1 : np.ndarray
        eigenvector field of first model
    eigenvectors_sub2 : np.ndarray
        eigenvector field of second model

    Returns
    -------
    matches : list(tuple(int, int))
        indexes of the matched modes
    """

    matches = []
    mode1_hash_indexes = list(range(len(hashes1)))
    mode2_hash_indexes = list(range(len(hashes2)))

    for i_hash in mode1_hash_indexes:

        field1 = eigenvectors_sub1[:, i_hash]

        found_match = False
        for j_entry, j_hash in enumerate(mode2_hash_indexes):

            field2 = eigenvectors_sub2[:, j_hash]

            if is_mode_match(field1, field2):
                matches.append((i_hash, j_hash))
                del mode2_hash_indexes[j_entry]
                found_match = True
                break

        if not found_match:
            break

    return matches


def is_orientation_flip_required(eigenvectors1: np.ndarray, eigenvectors2: np.ndarray):
    """Check whether the eigenvector fields need to be flipped

    Parameters
    ----------
    eigenvectors1 : np.ndarray
        eigenvector field of mesh1.
    eigenvectors2 : np.ndarray
        eigenvector field of mesh2.

    Returns
    -------
    flip_required : bool or np.ndarray
        whether a flip of the eigenvector field is required.

    Note
    ----
    If the eigenvector field has multiple modes (e.g. shape n_nodes, n_modes)
    then a flip value is returned for every mode.

    The eigenmodes require switching if the dot product of the knn-eigenfields
    yields a negative result.
    """
    assert eigenvectors1.shape == eigenvectors2.shape

    # one eigenmode only
    if eigenvectors1.ndim == 1:
        knn_error_basic = np.dot(eigenvectors1, eigenvectors2)
        return knn_error_basic < 0

    # multiple eigenmodes
    n_modes = min(eigenvectors1.shape[1], eigenvectors2.shape[1])
    errors = [
        np.dot(eigenvectors1[:, i_mode], eigenvectors2[:, i_mode]) for i_mode in range(n_modes)
    ]

    return np.array([err < 0 for err in errors])
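# Illustrative sketch (not part of the original module): the flip test reduces
# to the sign of the dot product between corresponding eigenvector fields.
#
#   >>> field = np.array([0.3, -0.1, 0.7])
#   >>> bool(is_orientation_flip_required(field, -field))
#   True
#   >>> bool(is_orientation_flip_required(field, field))
#   False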

def _compute_mode_similarities(
    hashes1: np.ndarray,
    hashes2: np.ndarray,
    eigenvectors_sub1: np.ndarray,
    eigenvectors_sub2: np.ndarray,
    matches: List[Tuple[int, int]],
) -> List[float]:
    """Compute the mode similarity between different meshes

    Parameters
    ----------
    hashes1 : np.ndarray
        hashes of first model
    hashes2 : np.ndarray
        hashes of second model
    eigenvectors_sub1 : np.ndarray
        eigenvector field of first model
    eigenvectors_sub2 : np.ndarray
        eigenvector field of second model
    matches : list(tuple(int, int))
        matches of modes (every match will be computed)

    Returns
    -------
    mode_similarities : list(float)
        similarities of the matched modes

    Notes
    -----
    This function cannot deal with unequal sampling of the input hashes.
    """

    mode_similarities = []
    for i_hash, j_hash in matches:

        assert hashes1.shape[2] == hashes2.shape[2]

        field1 = eigenvectors_sub1[:, i_hash]
        field2 = eigenvectors_sub2[:, j_hash]

        # flip orientation of eigenvector field and hash if required
        if is_orientation_flip_required(field1, field2):
            field2 *= -1
            # hdf5 can not handle negative slicing
            mode_ = np.array(hashes2[j_hash, 1], copy=True)
            mode_ = mode_[::-1]
        else:
            mode_ = hashes2[j_hash, 1, :]

        # Warning: x is usually originally hashes[i_mode, 0]
        x = np.linspace(0, 1, hashes1.shape[2])
        norm1 = curve_normalizer(x, hashes1[i_hash, 1])
        norm2 = curve_normalizer(x, mode_)
        if norm1 != 0 and norm2 != 0:
            mode_similarities.append(
                integrate.simps(hashes1[i_hash, 1] * mode_ / np.sqrt(norm1 * norm2), x=x)
            )
        else:
            mode_similarities.append(0)

    return mode_similarities
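# Illustrative sketch (not part of the original module): the similarity of two
# matched hash curves is their Simpson-rule inner product, normalized by the
# curve_normalizer of each curve, which yields 1.0 for identical curves.
#
#   >>> x = np.linspace(0, 1, 101)
#   >>> y1 = np.sin(np.pi * x)
#   >>> y2 = y1.copy()
#   >>> norm = np.sqrt(curve_normalizer(x, y1) * curve_normalizer(x, y2))
#   >>> round(float(integrate.simps(y1 * y2 / norm, x=x)), 6)
#   1.0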

def _join_hash_comparison_thread_files(
    comparison_filepath: str, thread_filepaths: Sequence[str], n_runs: int
):
    # pylint: disable = too-many-locals

    if os.path.exists(comparison_filepath):
        if os.path.isfile(comparison_filepath):
            os.remove(comparison_filepath)
        else:
            raise OSError("Can not delete directory", comparison_filepath)

    with h5py.File(comparison_filepath, "w") as hdf5_file:
        smatrix = hdf5_file.create_dataset(
            "similarity_matrix",
            shape=(n_runs, n_runs, 25),
            maxshape=(n_runs, n_runs, None),
            dtype="float64",
            compression="gzip",
        )
        ds_matches = hdf5_file.create_dataset(
            "matches",
            shape=(n_runs, n_runs, 25, 2),
            maxshape=(n_runs, n_runs, None, 2),
            dtype="int64",
            compression="gzip",
        )
        ds_weights = hdf5_file.create_dataset(
            "weights",
            shape=(n_runs, n_runs, 25),
            maxshape=(n_runs, n_runs, None),
            dtype="float64",
            compression="gzip",
        )

        for thread_filepath in thread_filepaths:

            # open thread file
            with h5py.File(thread_filepath, "r") as thread_file:

                # insert matrix entries
                matrix_indexes = thread_file["matrix_indexes"]
                matrix_similarities = thread_file["matrix_similarities"]
                matrix_matches = thread_file["matrix_matches"]
                thread_weights = thread_file["weights"]

                for (i_row, i_col), values, matches in zip(
                    matrix_indexes, matrix_similarities, matrix_matches
                ):
                    smatrix[i_row, i_col] = values
                    ds_matches[i_row, i_col] = matches
                    ds_weights[i_row, i_col] = (thread_weights[i_row] + thread_weights[i_col]) / 2

            # delete thread file
            os.remove(thread_filepath)

def run_hash_comparison(
    comparison_filepath: str,
    hashes_filepaths: List[str],
    n_threads: int = 1,
    print_progress: bool = False,
):
    """Compare the hashes of multiple simulation runs against each other

    Parameters
    ----------
    comparison_filepath: str
        filepath to the hdf5 in which the result of the comparison will be
        stored
    hashes_filepaths: List[str]
        filepaths to the stored hashes (one file per run)
    n_threads: int
        number of threads used for the comparison
    print_progress: bool
        whether to print the progress
    """

    # pylint: disable = too-many-locals, too-many-statements

    assert n_threads > 0

    # fixed settings
    hdf5_dataset_compression = "gzip"

    # ! this is an inlined function !
    # the actual function body starts much further down

    def _threading_run_comparison(run_indices, comparison_filepath, comm_q):
        # pylint: disable = too-many-statements

        n_comparisons_thread = len(run_indices)

        # setup storage file
        if os.path.exists(comparison_filepath):
            if os.path.isfile(comparison_filepath):
                os.remove(comparison_filepath)
            else:
                raise OSError("Can not delete directory", comparison_filepath)

        hdf5_file = h5py.File(comparison_filepath, "w")

        max_len = np.max([len(entry) for entry in hashes_filepaths])
        hashes_filepaths_ascii = [entry.encode("ascii", "ignore") for entry in hashes_filepaths]

        hdf5_file.require_dataset(
            "filepaths",
            data=hashes_filepaths_ascii,
            shape=(len(hashes_filepaths_ascii), 1),
            dtype=f"S{max_len}",
        )

        n_modes_estimated = 25

        # could be compressed to one per run only!
        ds_weights = hdf5_file.create_dataset(
            "weights",
            (n_runs, n_modes_estimated),
            maxshape=(n_runs, None),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )
        ds_matrix_indexes = hdf5_file.create_dataset(
            "matrix_indexes",
            (n_comparisons_thread, 2),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )
        ds_matrix_values = hdf5_file.create_dataset(
            "matrix_similarities",
            (n_comparisons_thread, n_modes_estimated),
            maxshape=(n_comparisons_thread, None),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )

        # info only!
        ds_matrix_matches = hdf5_file.create_dataset(
            "matrix_matches",
            (n_comparisons_thread, n_modes_estimated, 2),
            maxshape=(n_comparisons_thread, None, 2),
            dtype="int64",
            compression=hdf5_dataset_compression,
        )

        def _save_data(computed_results, counter):

            start = counter + 1 - len(computed_results)
            for i_result, result in enumerate(computed_results):

                i_run, j_run = result["matrix_index"]
                similarities = result["similarities"]
                matches_tmp = result["matches"]

                ds_matrix_indexes[start + i_result, :] = i_run, j_run
                ds_matrix_values[start + i_result, : len(similarities)] = similarities
                ds_matrix_matches[start + i_result, : len(matches_tmp)] = matches_tmp
                weights1 = result["weights1"]
                n_weights1 = len(weights1)
                ds_weights[i_run, :n_weights1] = weights1
                weights2 = result["weights2"]
                n_weights2 = len(weights2)
                ds_weights[j_run, :n_weights2] = weights2

            computed_results.clear()

        # log
        computation_times = []
        io_times = []

        counter = None  # bugfix
        computed_results = []
        for counter, (i_run, j_run) in enumerate(run_indices):

            start = time.time()

            # get data (io)
            fp1 = h5py.File(hashes_filepaths[i_run], "r")
            fp2 = h5py.File(hashes_filepaths[j_run], "r")
            hashes1 = fp1["hashes"]
            hashes2 = fp2["hashes"]
            xyz1, xyz2 = fp1["subsample_xyz"], fp2["subsample_xyz"]
            eigenvalues1, eigenvalues2 = fp1["eigenvalues"], fp2["eigenvalues"]

            # hdf5 can only handle increasing indexes ... thus we need to copy the field
            eigenvectors_sub1 = np.array(fp1["eigenvectors"], copy=True)
            eigenvectors_sub2 = np.array(fp2["eigenvectors"], copy=True)

            # time
            io_times.append(time.time() - start)
            start = time.time()

            # match points roughly in xyz
            tree = KDTree(xyz1)
            knn_indexes = tree.query(xyz2, return_distance=False, k=1)
            eigenvectors_sub1 = np.squeeze(eigenvectors_sub1[knn_indexes])

            # match modes
            matches = _match_modes(hashes1, hashes2, eigenvectors_sub1, eigenvectors_sub2)

            # mode weights
            weights1 = get_mode_weights_inv(eigenvalues1)
            weights2 = get_mode_weights_inv(eigenvalues2)

            # compute mode similarity
            mode_similarities = _compute_mode_similarities(
                hashes1, hashes2, eigenvectors_sub1, eigenvectors_sub2, matches
            )

            # time
            computation_times.append(time.time() - start)

            # assemble computations
            computation_result = {
                "matrix_index": [i_run, j_run],
                "matches": matches,  # info only
                "similarities": mode_similarities,
                "weights1": weights1.tolist(),
                "weights2": weights2.tolist(),
            }
            computed_results.append(computation_result)

            # save to file occasionally
            if counter % 500 == 0:
                _save_data(computed_results, counter)

            # print status
            if comm_q and not comm_q.full():
                comm_q.put(
                    {
                        "i_entry": counter + 1,
                        "n_entries": len(run_indices),
                        "io_time": np.mean(io_times),
                        "computation_time": np.mean(computation_times),
                    },
                    False,
                )

        # dump at end (if anything was computed)
        if counter:
            _save_data(computed_results, counter)

    # <-- FUNCTION STARTS HERE

    # helper vars
    n_runs = len(hashes_filepaths)

    # THREADS
    if n_threads == 1:
        matrix_entries = []
        for i_run in range(n_runs):
            for j_run in range(i_run + 1, n_runs):
                matrix_entries.append((i_run, j_run))
        _threading_run_comparison(matrix_entries, comparison_filepath, None)
    else:
        # enlist runs
        thread_matrix_entries = [[] for i_thread in range(n_threads)]
        i_thread = 0
        for i_run in range(n_runs):
            for j_run in range(i_run + 1, n_runs):
                thread_matrix_entries[i_thread % n_threads].append((i_run, j_run))
                i_thread += 1

        # comm queues
        queues = [multiprocessing.Queue(1) for i_thread in range(n_threads)]

        # run threads
        thread_filepaths = [
            comparison_filepath + f"_thread{i_thread}" for i_thread in range(n_threads)
        ]
        threads = [
            multiprocessing.Process(
                target=_threading_run_comparison,
                args=(matrix_indexes, thread_filepaths[i_thread], queues[i_thread]),
            )
            for i_thread, matrix_indexes in enumerate(thread_matrix_entries)
        ]
        for thread in threads:
            thread.start()

        # logging
        if print_progress:
            thread_stats = [
                {
                    "i_entry": 0,
                    "n_entries": len(thread_matrix_entries[i_thread]),
                    "io_time": 0,
                    "computation_time": 0,
                }
                for i_thread in range(n_threads)
            ]

            while any(thread.is_alive() for thread in threads):

                # fetch data from channel
                for i_thread, comm_q in enumerate(queues):
                    if not comm_q.empty():
                        thread_stats[i_thread] = comm_q.get(False)

                # print msg
                # pylint: disable = consider-using-f-string
                thread_msg_list = [
                    (
                        f"Thread {i_thread}: "
                        f"{(100 * stats['i_entry'] / stats['n_entries']):.1f}% "
                        f"({stats['i_entry']}/{stats['n_entries']}) "
                        f"{stats['computation_time']:.2f}s | "
                    )
                    for i_thread, stats in enumerate(thread_stats)
                ]
                msg = "| " + "".join(thread_msg_list) + "\r"
                print(msg, end="")
                time.sleep(0.35)

            # print completion message
            thread_msg_list = [
                (
                    f"Thread {i_thread}: "
                    f"{(100 * stats['i_entry'] / stats['n_entries']):.1f}% "
                    f"({stats['i_entry']}/{stats['n_entries']}) "
                    f"{stats['computation_time']:.2f}s | "
                )
                for i_thread, stats in enumerate(thread_stats)
            ]
            msg = "| " + "".join(thread_msg_list) + "\r"
            print(msg, end="")

            print("")
            print("done.")

        # join thread worker files
        for thread in threads:
            thread.join()
        _join_hash_comparison_thread_files(comparison_filepath, thread_filepaths, n_runs)
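# Hypothetical usage sketch (file names are placeholders, not from the source):
#
#   run_hash_comparison(
#       comparison_filepath="comparison.h5",
#       hashes_filepaths=["run_0_hashes.h5", "run_1_hashes.h5", "run_2_hashes.h5"],
#       n_threads=2,
#       print_progress=True,
#   )
#
# Each hash file is expected to provide the "hashes", "subsample_xyz",
# "eigenvalues" and "eigenvectors" datasets read above; the pairwise mode
# similarities, matches and weights are written to the comparison file.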

def is_mode_match(
    eigenvectors1: np.ndarray,
    eigenvectors2: np.ndarray,
    knn_indexes: Union[np.ndarray, None] = None,
):
    """Detect a mode match from the eigenvector field

    Parameters
    ----------
    eigenvectors1 : np.ndarray
        subsample eigenvector field from model 1
    eigenvectors2 : np.ndarray
        subsample eigenvector field from model 2
    knn_indexes : np.ndarray
        knn indexes obtained by matching xyz1 and xyz2 of the eigenvector fields
        so that only the coordinates of near points will be compared

    Returns
    -------
    is_matched : bool

    Notes
    -----
    A mode match is detected by watching the distribution
    of the eigenvector field errors. In case of a mode switch
    a correct orientation of the field is (obviously) not possible.
    In such a case the probability distributions of the basic error
    and the inverted error are quite similar, since both are wrong.

    A matching orientation (empirically) seems to have a normal
    distribution like character. A non-matching orientation
    is more like a uniform distribution (constantly wrong across
    the entire model).
    """

    # pylint: disable = too-many-locals

    # if the jensen-shannon divergence is below this value
    # then a mode switch is assumed
    distance_limit = 0.1

    # number of bins for probability distribution
    n_bins = 25

    # (1) match sub-samples in xyz
    # tree = KDTree(xyz1)
    # indexes = tree.query(xyz2, return_distance=False, k=1)

    # (2) compute field errors of normal field and inverted field
    if knn_indexes is not None:
        tmp1 = eigenvectors1[knn_indexes].flatten() - eigenvectors2
        tmp2 = eigenvectors1[knn_indexes].flatten() + eigenvectors2
    else:
        tmp1 = eigenvectors1 - eigenvectors2
        tmp2 = eigenvectors1 + eigenvectors2

    # (3) create a probability distribution for each error vector

    # bin the values
    xmin = min(tmp1.min(), tmp2.min())
    xmax = max(tmp1.max(), tmp2.max())
    bins = np.linspace(xmin, xmax, n_bins)
    indexes_p1 = np.digitize(tmp1, bins)
    indexes_p2 = np.digitize(tmp2, bins)
    p1 = np.bincount(indexes_p1) / len(tmp1)
    p2 = np.bincount(indexes_p2) / len(tmp2)

    # align bin vector size
    p1_tmp = np.zeros(max(len(p1), len(p2)))
    p2_tmp = np.zeros(max(len(p1), len(p2)))
    p1_tmp[: len(p1)] = p1
    p2_tmp[: len(p2)] = p2
    p1 = p1_tmp
    p2 = p2_tmp

    # compute similarity
    similarity_js = jensen_shannon_entropy(p1, p2)

    return similarity_js > distance_limit

def get_mode_weights_inv(vec: np.ndarray):
    """Inverse value weights (higher decay than softmax)"""
    val = 1.0 / (vec[:])
    return val / np.sum(val)
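# Tiny illustrative example (not part of the original module): smaller
# eigenvalues receive larger weights, and the weights sum to one.
#
#   >>> get_mode_weights_inv(np.array([1.0, 2.0, 4.0]))
#   array([0.57142857, 0.28571429, 0.14285714])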

def curve_normalizer(x: np.ndarray, y: np.ndarray):
    """Compute the curve normalizer for a curve dot product

    Parameters
    ----------
    x : np.ndarray
        array of x values
    y : np.ndarray
        array of y values

    Returns
    -------
    norm : float
        normalizing factor
    """
    return integrate.simps(y**2, x=x)
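# Illustrative example (not part of the original module): the normalizer is the
# integral of y**2 over x, so a constant curve of one on [0, 1] yields 1.0.
#
#   >>> x = np.linspace(0, 1, 101)
#   >>> float(curve_normalizer(x, np.ones(101)))
#   1.0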

def compute_hashes(
    eig_vecs: np.ndarray,
    result_field: np.ndarray,
    n_points: int = 100,
    bandwidth: float = 0.05,
) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Compute hashes for a result field

    Parameters
    ----------
    eig_vecs : np.ndarray
        eigenvector field of the component with shape (n_samples, n_modes)
    result_field : np.ndarray
        result field to hash
    n_points : int
        Resolution of the hash: number of equidistant points used for smoothing.
        Should be determined from the mesh size (2.5 times average element size).
    bandwidth : float
        Bandwidth of the smoothing kernel as a fraction of the value range.
        Recommended as 5 times the global element size median.

    Returns
    -------
    hash_functions : list(tuple(np.ndarray, np.ndarray))
        list of the computed hash functions. Every item is the hash for
        an eigenmode. Each hash consists of a pair of arrays (x, y).
        For comparison, usually only y is used.
    """

    assert eig_vecs.shape[0] == len(result_field), f"{eig_vecs.shape[0]} != {len(result_field)}"

    # Note: needs to be vectorized to speed it up

    hash_functions = []
    for i_eigen in range(eig_vecs.shape[1]):

        xmin = eig_vecs[:, i_eigen].min()
        xmax = eig_vecs[:, i_eigen].max()

        x = np.linspace(xmin, xmax, n_points)
        y = np.zeros(n_points)

        local_bandwidth = bandwidth * (xmax - xmin)
        c = -0.5 / local_bandwidth**2

        for ii, point in enumerate(x):
            y[ii] = np.dot(result_field, np.exp(c * np.square(point - eig_vecs[:, i_eigen])))
        y /= np.sqrt(2 * np.pi) * bandwidth

        hash_functions.append((x, y))

    return hash_functions
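# Hypothetical end-to-end sketch (names and sizes are illustrative, not from
# the source): hash a nodal result field with the subsampled eigenvector basis,
# giving one (x, y) hash curve per eigenmode.
#
#   >>> rng = np.random.default_rng(42)
#   >>> eig_vecs = rng.normal(size=(500, 5))
#   >>> plastic_strain = rng.random(500)
#   >>> hashes = compute_hashes(eig_vecs, plastic_strain, n_points=100)
#   >>> len(hashes), hashes[0][0].shape, hashes[0][1].shape
#   (5, (100,), (100,))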