Coverage for lasso/dimred/hashing.py: 0%

217 statements  

coverage.py v7.2.4, created at 2023-04-28 18:42 +0100

import multiprocessing
import os
import time
from typing import List, Tuple, Union, Sequence

import h5py
import numpy as np
from scipy import integrate
from sklearn.neighbors import KDTree


from ..math.stochastic import jensen_shannon_entropy


def _match_modes(
    hashes1: np.ndarray,
    hashes2: np.ndarray,
    eigenvectors_sub1: np.ndarray,
    eigenvectors_sub2: np.ndarray,
):
    """Match the eigenvalue modes

    Parameters
    ----------
    hashes1 : np.ndarray
        hashes of first model
    hashes2 : np.ndarray
        hashes of second model
    eigenvectors_sub1 : np.ndarray
        eigenvector field of first model
    eigenvectors_sub2 : np.ndarray
        eigenvector field of second model

    Returns
    -------
    matches : list(tuple(int, int))
        indexes of the matched modes
    """

    matches = []
    mode1_hash_indexes = list(range(len(hashes1)))
    mode2_hash_indexes = list(range(len(hashes2)))

    for i_hash in mode1_hash_indexes:

        field1 = eigenvectors_sub1[:, i_hash]

        found_match = False
        for j_entry, j_hash in enumerate(mode2_hash_indexes):

            field2 = eigenvectors_sub2[:, j_hash]

            if is_mode_match(field1, field2):
                matches.append((i_hash, j_hash))
                del mode2_hash_indexes[j_entry]
                found_match = True
                break

        if not found_match:
            break

    return matches


def is_orientation_flip_required(eigenvectors1: np.ndarray, eigenvectors2: np.ndarray):
    """Check whether the eigenvector fields need to be flipped

    Parameters
    ----------
    eigenvectors1 : np.ndarray
        eigenvector field of mesh1.
    eigenvectors2 : np.ndarray
        eigenvector field of mesh2.

    Returns
    -------
    flip_required : bool or np.ndarray
        whether a flip of the eigenvector field is required.

    Note
    ----
    If the eigenvector field has multiple modes (e.g. shape n_nodes, n_modes)
    then a flip value is returned for every mode.

    The eigenmodes require switching if the dot product of the knn-eigenfields
    yields a negative result.
    """
    assert eigenvectors1.shape == eigenvectors2.shape

    # one eigenmode only
    if eigenvectors1.ndim == 1:
        knn_error_basic = np.dot(eigenvectors1, eigenvectors2)
        return knn_error_basic < 0

    # multiple eigenmodes
    n_modes = min(eigenvectors1.shape[1], eigenvectors2.shape[1])
    errors = [
        np.dot(eigenvectors1[:, i_mode], eigenvectors2[:, i_mode]) for i_mode in range(n_modes)
    ]

    return np.array([err < 0 for err in errors])
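# Illustrative sketch (not part of the original module): the flip test reduces
# to the sign of the dot product between corresponding eigenvector fields.
#
#   >>> field = np.array([0.3, -0.1, 0.7])
#   >>> bool(is_orientation_flip_required(field, -field))
#   True
#   >>> bool(is_orientation_flip_required(field, field))
#   False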

def _compute_mode_similarities(
    hashes1: np.ndarray,
    hashes2: np.ndarray,
    eigenvectors_sub1: np.ndarray,
    eigenvectors_sub2: np.ndarray,
    matches: List[Tuple[int, int]],
) -> List[float]:
    """Compute the mode similarity between different meshes

    Parameters
    ----------
    hashes1 : np.ndarray
        hashes of first model
    hashes2 : np.ndarray
        hashes of second model
    eigenvectors_sub1 : np.ndarray
        eigenvector field of first model
    eigenvectors_sub2 : np.ndarray
        eigenvector field of second model
    matches : list(tuple(int, int))
        matches of modes (every match will be computed)

    Returns
    -------
    mode_similarities : list(float)
        similarities of the matched modes

    Notes
    -----
    This function cannot deal with unequal sampling of the input hashes.
    """

    mode_similarities = []
    for i_hash, j_hash in matches:

        assert hashes1.shape[2] == hashes2.shape[2]

        field1 = eigenvectors_sub1[:, i_hash]
        field2 = eigenvectors_sub2[:, j_hash]

        # flip orientation of eigenvector field and hash if required
        if is_orientation_flip_required(field1, field2):
            field2 *= -1
            # hdf5 can not handle negative slicing
            mode_ = np.array(hashes2[j_hash, 1], copy=True)
            mode_ = mode_[::-1]
        else:
            mode_ = hashes2[j_hash, 1, :]

        # Warning: x is usually originally hashes[i_mode, 0]
        x = np.linspace(0, 1, hashes1.shape[2])
        norm1 = curve_normalizer(x, hashes1[i_hash, 1])
        norm2 = curve_normalizer(x, mode_)
        if norm1 != 0 and norm2 != 0:
            mode_similarities.append(
                integrate.simps(hashes1[i_hash, 1] * mode_ / np.sqrt(norm1 * norm2), x=x)
            )
        else:
            mode_similarities.append(0)

    return mode_similarities
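# Illustrative sketch (not part of the original module): the similarity of two
# matched hash curves is their Simpson-rule inner product, normalized by the
# curve_normalizer of each curve, which yields 1.0 for identical curves.
#
#   >>> x = np.linspace(0, 1, 101)
#   >>> y1 = np.sin(np.pi * x)
#   >>> y2 = y1.copy()
#   >>> norm = np.sqrt(curve_normalizer(x, y1) * curve_normalizer(x, y2))
#   >>> round(float(integrate.simps(y1 * y2 / norm, x=x)), 6)
#   1.0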

def _join_hash_comparison_thread_files(
    comparison_filepath: str, thread_filepaths: Sequence[str], n_runs: int
):
    # pylint: disable = too-many-locals

    if os.path.exists(comparison_filepath):
        if os.path.isfile(comparison_filepath):
            os.remove(comparison_filepath)
        else:
            raise OSError("Can not delete directory", comparison_filepath)

    with h5py.File(comparison_filepath, "w") as hdf5_file:
        smatrix = hdf5_file.create_dataset(
            "similarity_matrix",
            shape=(n_runs, n_runs, 25),
            maxshape=(n_runs, n_runs, None),
            dtype="float64",
            compression="gzip",
        )
        ds_matches = hdf5_file.create_dataset(
            "matches",
            shape=(n_runs, n_runs, 25, 2),
            maxshape=(n_runs, n_runs, None, 2),
            dtype="int64",
            compression="gzip",
        )
        ds_weights = hdf5_file.create_dataset(
            "weights",
            shape=(n_runs, n_runs, 25),
            maxshape=(n_runs, n_runs, None),
            dtype="float64",
            compression="gzip",
        )

        for thread_filepath in thread_filepaths:

            # open thread file
            with h5py.File(thread_filepath, "r") as thread_file:

                # insert matrix entries
                matrix_indexes = thread_file["matrix_indexes"]
                matrix_similarities = thread_file["matrix_similarities"]
                matrix_matches = thread_file["matrix_matches"]
                thread_weights = thread_file["weights"]

                for (i_row, i_col), values, matches in zip(
                    matrix_indexes, matrix_similarities, matrix_matches
                ):
                    smatrix[i_row, i_col] = values
                    ds_matches[i_row, i_col] = matches
                    ds_weights[i_row, i_col] = (thread_weights[i_row] + thread_weights[i_col]) / 2

            # delete thread file
            os.remove(thread_filepath)

def run_hash_comparison(
    comparison_filepath: str,
    hashes_filepaths: List[str],
    n_threads: int = 1,
    print_progress: bool = False,
):
    """Compare the hashes of multiple simulation runs against each other

    Parameters
    ----------
    comparison_filepath: str
        filepath to the hdf5 in which the result of the comparison will be
        stored
    hashes_filepaths: List[str]
        filepaths to the stored hashes (one file per run)
    n_threads: int
        number of threads used for the comparison
    print_progress: bool
        whether to print the progress
    """

    # pylint: disable = too-many-locals, too-many-statements

    assert n_threads > 0

    # fixed settings
    hdf5_dataset_compression = "gzip"

    # ! this is an inlined function !
    # the actual function body starts much further down

    def _threading_run_comparison(run_indices, comparison_filepath, comm_q):
        # pylint: disable = too-many-statements

        n_comparisons_thread = len(run_indices)

        # setup storage file
        if os.path.exists(comparison_filepath):
            if os.path.isfile(comparison_filepath):
                os.remove(comparison_filepath)
            else:
                raise OSError("Can not delete directory", comparison_filepath)

        hdf5_file = h5py.File(comparison_filepath, "w")

        max_len = np.max([len(entry) for entry in hashes_filepaths])
        hashes_filepaths_ascii = [entry.encode("ascii", "ignore") for entry in hashes_filepaths]

        hdf5_file.require_dataset(
            "filepaths",
            data=hashes_filepaths_ascii,
            shape=(len(hashes_filepaths_ascii), 1),
            dtype=f"S{max_len}",
        )

        n_modes_estimated = 25

        # could be compressed to one per run only!
        ds_weights = hdf5_file.create_dataset(
            "weights",
            (n_runs, n_modes_estimated),
            maxshape=(n_runs, None),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )
        ds_matrix_indexes = hdf5_file.create_dataset(
            "matrix_indexes",
            (n_comparisons_thread, 2),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )
        ds_matrix_values = hdf5_file.create_dataset(
            "matrix_similarities",
            (n_comparisons_thread, n_modes_estimated),
            maxshape=(n_comparisons_thread, None),
            dtype="float64",
            compression=hdf5_dataset_compression,
        )

        # info only!
        ds_matrix_matches = hdf5_file.create_dataset(
            "matrix_matches",
            (n_comparisons_thread, n_modes_estimated, 2),
            maxshape=(n_comparisons_thread, None, 2),
            dtype="int64",
            compression=hdf5_dataset_compression,
        )

        def _save_data(computed_results, counter):

            start = counter + 1 - len(computed_results)
            for i_result, result in enumerate(computed_results):

                i_run, j_run = result["matrix_index"]
                similarities = result["similarities"]
                matches_tmp = result["matches"]

                ds_matrix_indexes[start + i_result, :] = i_run, j_run
                ds_matrix_values[start + i_result, : len(similarities)] = similarities
                ds_matrix_matches[start + i_result, : len(matches_tmp)] = matches_tmp
                weights1 = result["weights1"]
                n_weights1 = len(weights1)
                ds_weights[i_run, :n_weights1] = weights1
                weights2 = result["weights2"]
                n_weights2 = len(weights2)
                ds_weights[j_run, :n_weights2] = weights2

            computed_results.clear()

        # log
        computation_times = []
        io_times = []

        counter = None  # bugfix
        computed_results = []
        for counter, (i_run, j_run) in enumerate(run_indices):

            start = time.time()

            # get data (io)
            fp1 = h5py.File(hashes_filepaths[i_run], "r")
            fp2 = h5py.File(hashes_filepaths[j_run], "r")
            hashes1 = fp1["hashes"]
            hashes2 = fp2["hashes"]
            xyz1, xyz2 = fp1["subsample_xyz"], fp2["subsample_xyz"]
            eigenvalues1, eigenvalues2 = fp1["eigenvalues"], fp2["eigenvalues"]

            # hdf5 can only handle increasing indexes ... thus we need to copy the field
            eigenvectors_sub1 = np.array(fp1["eigenvectors"], copy=True)
            eigenvectors_sub2 = np.array(fp2["eigenvectors"], copy=True)

            # time
            io_times.append(time.time() - start)
            start = time.time()

            # match points roughly in xyz
            tree = KDTree(xyz1)
            knn_indexes = tree.query(xyz2, return_distance=False, k=1)
            eigenvectors_sub1 = np.squeeze(eigenvectors_sub1[knn_indexes])

            # match modes
            matches = _match_modes(hashes1, hashes2, eigenvectors_sub1, eigenvectors_sub2)

            # mode weights
            weights1 = get_mode_weights_inv(eigenvalues1)
            weights2 = get_mode_weights_inv(eigenvalues2)

            # compute mode similarity
            mode_similarities = _compute_mode_similarities(
                hashes1, hashes2, eigenvectors_sub1, eigenvectors_sub2, matches
            )

            # time
            computation_times.append(time.time() - start)

            # assemble computations
            computation_result = {
                "matrix_index": [i_run, j_run],
                "matches": matches,  # info only
                "similarities": mode_similarities,
                "weights1": weights1.tolist(),
                "weights2": weights2.tolist(),
            }
            computed_results.append(computation_result)

            # save to file occasionally
            if counter % 500 == 0:
                _save_data(computed_results, counter)

            # print status
            if comm_q and not comm_q.full():
                comm_q.put(
                    {
                        "i_entry": counter + 1,
                        "n_entries": len(run_indices),
                        "io_time": np.mean(io_times),
                        "computation_time": np.mean(computation_times),
                    },
                    False,
                )

        # dump at end (if anything was computed)
        if counter:
            _save_data(computed_results, counter)

    # <-- FUNCTION STARTS HERE

    # helper vars
    n_runs = len(hashes_filepaths)

    # THREADS
    if n_threads == 1:
        matrix_entries = []
        for i_run in range(n_runs):
            for j_run in range(i_run + 1, n_runs):
                matrix_entries.append((i_run, j_run))
        _threading_run_comparison(matrix_entries, comparison_filepath, None)
    else:
        # enlist runs
        thread_matrix_entries = [[] for i_thread in range(n_threads)]
        i_thread = 0
        for i_run in range(n_runs):
            for j_run in range(i_run + 1, n_runs):
                thread_matrix_entries[i_thread % n_threads].append((i_run, j_run))
                i_thread += 1

        # comm queues
        queues = [multiprocessing.Queue(1) for i_thread in range(n_threads)]

        # run threads
        thread_filepaths = [
            comparison_filepath + f"_thread{i_thread}" for i_thread in range(n_threads)
        ]
        threads = [
            multiprocessing.Process(
                target=_threading_run_comparison,
                args=(matrix_indexes, thread_filepaths[i_thread], queues[i_thread]),
            )
            for i_thread, matrix_indexes in enumerate(thread_matrix_entries)
        ]
        for thread in threads:
            thread.start()

        # logging
        if print_progress:
            thread_stats = [
                {
                    "i_entry": 0,
                    "n_entries": len(thread_matrix_entries[i_thread]),
                    "io_time": 0,
                    "computation_time": 0,
                }
                for i_thread in range(n_threads)
            ]

            while any(thread.is_alive() for thread in threads):

                # fetch data from channel
                for i_thread, comm_q in enumerate(queues):
                    if not comm_q.empty():
                        thread_stats[i_thread] = comm_q.get(False)

                # print msg
                # pylint: disable = consider-using-f-string
                thread_msg_list = [
                    (
                        f"Thread {i_thread}: "
                        f"{(100 * stats['i_entry'] / stats['n_entries']):.1f}% "
                        f"({stats['i_entry']}/{stats['n_entries']}) "
                        f"{stats['computation_time']:.2f}s | "
                    )
                    for i_thread, stats in enumerate(thread_stats)
                ]
                msg = "| " + "".join(thread_msg_list) + "\r"
                print(msg, end="")
                time.sleep(0.35)

            # print completion message
            thread_msg_list = [
                (
                    f"Thread {i_thread}: "
                    f"{(100 * stats['i_entry'] / stats['n_entries']):.1f}% "
                    f"({stats['i_entry']}/{stats['n_entries']}) "
                    f"{stats['computation_time']:.2f}s | "
                )
                for i_thread, stats in enumerate(thread_stats)
            ]
            msg = "| " + "".join(thread_msg_list) + "\r"
            print(msg, end="")

            print("")
            print("done.")

        # join thread worker files
        for thread in threads:
            thread.join()
        _join_hash_comparison_thread_files(comparison_filepath, thread_filepaths, n_runs)
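# Hypothetical usage sketch (file names are placeholders, not from the source):
#
#   run_hash_comparison(
#       comparison_filepath="comparison.h5",
#       hashes_filepaths=["run_0_hashes.h5", "run_1_hashes.h5", "run_2_hashes.h5"],
#       n_threads=2,
#       print_progress=True,
#   )
#
# Each hash file is expected to provide the "hashes", "subsample_xyz",
# "eigenvalues" and "eigenvectors" datasets read above; the pairwise mode
# similarities, matches and weights are written to the comparison file.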

def is_mode_match(
    eigenvectors1: np.ndarray,
    eigenvectors2: np.ndarray,
    knn_indexes: Union[np.ndarray, None] = None,
):
    """Detect a mode match from the eigenvector field

    Parameters
    ----------
    eigenvectors1 : np.ndarray
        subsample eigenvector field from model 1
    eigenvectors2 : np.ndarray
        subsample eigenvector field from model 2
    knn_indexes : np.ndarray
        knn indexes obtained by matching xyz1 and xyz2 of the eigenvector fields
        so that only the coordinates of near points will be compared

    Returns
    -------
    is_matched : bool

    Notes
    -----
    A mode match is detected by watching the distribution
    of the eigenvector field errors. In case of a mode switch
    a correct orientation of the field is (obviously) not possible.
    In such a case the probability distributions of the basic error
    and the inverted error are quite similar, since both are wrong.

    A matching orientation (empirically) seems to have a normal
    distribution like character. A non-matching orientation
    is more like a uniform distribution (constantly wrong across
    the entire model).
    """

    # pylint: disable = too-many-locals

    # if the jensen-shannon divergence is below this value
    # then a mode switch is assumed
    distance_limit = 0.1

    # number of bins for probability distribution
    n_bins = 25

    # (1) match sub-samples in xyz
    # tree = KDTree(xyz1)
    # indexes = tree.query(xyz2, return_distance=False, k=1)

    # (2) compute field errors of normal field and inverted field
    if knn_indexes is not None:
        tmp1 = eigenvectors1[knn_indexes].flatten() - eigenvectors2
        tmp2 = eigenvectors1[knn_indexes].flatten() + eigenvectors2
    else:
        tmp1 = eigenvectors1 - eigenvectors2
        tmp2 = eigenvectors1 + eigenvectors2

    # (3) create a probability distribution for each error vector

    # bin the values
    xmin = min(tmp1.min(), tmp2.min())
    xmax = max(tmp1.max(), tmp2.max())
    bins = np.linspace(xmin, xmax, n_bins)
    indexes_p1 = np.digitize(tmp1, bins)
    indexes_p2 = np.digitize(tmp2, bins)
    p1 = np.bincount(indexes_p1) / len(tmp1)
    p2 = np.bincount(indexes_p2) / len(tmp2)

    # align bin vector size
    p1_tmp = np.zeros(max(len(p1), len(p2)))
    p2_tmp = np.zeros(max(len(p1), len(p2)))
    p1_tmp[: len(p1)] = p1
    p2_tmp[: len(p2)] = p2
    p1 = p1_tmp
    p2 = p2_tmp

    # compute similarity
    similarity_js = jensen_shannon_entropy(p1, p2)

    return similarity_js > distance_limit

def get_mode_weights_inv(vec: np.ndarray):
    """Inverse value weights (higher decay than softmax)"""
    val = 1.0 / (vec[:])
    return val / np.sum(val)
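# Tiny illustrative example (not part of the original module): smaller
# eigenvalues receive larger weights, and the weights sum to one.
#
#   >>> get_mode_weights_inv(np.array([1.0, 2.0, 4.0]))
#   array([0.57142857, 0.28571429, 0.14285714])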

def curve_normalizer(x: np.ndarray, y: np.ndarray):
    """Compute the curve normalizer for a curve dot product

    Parameters
    ----------
    x : np.ndarray
        array of x values
    y : np.ndarray
        array of y values

    Returns
    -------
    norm : float
        normalizing factor
    """
    return integrate.simps(y**2, x=x)
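# Illustrative example (not part of the original module): the normalizer is the
# integral of y**2 over x, so a constant curve of one on [0, 1] yields 1.0.
#
#   >>> x = np.linspace(0, 1, 101)
#   >>> float(curve_normalizer(x, np.ones(101)))
#   1.0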

def compute_hashes(
    eig_vecs: np.ndarray,
    result_field: np.ndarray,
    n_points: int = 100,
    bandwidth: float = 0.05,
) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Compute hashes for a result field

    Parameters
    ----------
    eig_vecs : np.ndarray
        eigenvector field of the component with shape (n_samples, n_modes)
    result_field : np.ndarray
        result field to hash
    n_points : int
        Resolution of the hash: number of equidistant points used for smoothing.
        Should be determined from the mesh size (2.5 times average element size).
    bandwidth : float
        Bandwidth of the smoothing kernel as a fraction of the value range.
        Recommended as 5 times the global element size median.

    Returns
    -------
    hash_functions : list(tuple(np.ndarray, np.ndarray))
        list of the computed hash functions. Every item is the hash for
        an eigenmode. Each hash consists of a pair of arrays (x, y).
        For comparison, usually only y is used.
    """

    assert eig_vecs.shape[0] == len(result_field), f"{eig_vecs.shape[0]} != {len(result_field)}"

    # Note: needs to be vectorized to speed it up

    hash_functions = []
    for i_eigen in range(eig_vecs.shape[1]):

        xmin = eig_vecs[:, i_eigen].min()
        xmax = eig_vecs[:, i_eigen].max()

        x = np.linspace(xmin, xmax, n_points)
        y = np.zeros(n_points)

        local_bandwidth = bandwidth * (xmax - xmin)
        c = -0.5 / local_bandwidth**2

        for ii, point in enumerate(x):
            y[ii] = np.dot(result_field, np.exp(c * np.square(point - eig_vecs[:, i_eigen])))
        y /= np.sqrt(2 * np.pi) * bandwidth

        hash_functions.append((x, y))

    return hash_functions
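# Hypothetical end-to-end sketch (names and sizes are illustrative, not from
# the source): hash a nodal result field with the subsampled eigenvector basis,
# giving one (x, y) hash curve per eigenmode.
#
#   >>> rng = np.random.default_rng(42)
#   >>> eig_vecs = rng.normal(size=(500, 5))
#   >>> plastic_strain = rng.random(500)
#   >>> hashes = compute_hashes(eig_vecs, plastic_strain, n_points=100)
#   >>> len(hashes), hashes[0][0].shape, hashes[0][1].shape
#   (5, (100,), (100,))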