/******************************************************************************
 *
 * libROM v1.0: Data-driven physical simulation library
 *
 *****************************************************************************/

// Description: A simple, parallel Matrix class with the utility needed to
//              support the basis generation methods of this library.  A
//              distributed Matrix has its rows distributed across processors.
15 #include "Matrix.h"
16 #include "utils/HDFDatabase.h"
17 #include "utils/mpi_utils.h"
18 
19 #include "mpi.h"
20 #include <string.h>
21 #include <vector>
22 #include <random>
23 
24 #ifdef CAROM_HAS_ELEMENTAL
25 #include <El.hpp>
26 #endif
27 
28 #include "scalapack_wrapper.h"
29 
30 /* Use automatically detected Fortran name-mangling scheme */
31 #define dsyev CAROM_FC_GLOBAL(dsyev, DSYEV)
32 #define dgeev CAROM_FC_GLOBAL(dgeev, DGEEV)
33 #define dgetrf CAROM_FC_GLOBAL(dgetrf, DGETRF)
34 #define dgetri CAROM_FC_GLOBAL(dgetri, DGETRI)
35 #define dgeqp3 CAROM_FC_GLOBAL(dgeqp3, DGEQP3)
36 #define dgesdd CAROM_FC_GLOBAL(dgesdd, DGESDD)
37 
38 extern "C" {
39 // Compute eigenvalue and eigenvectors of real symmetric matrix.
40  void dsyev(char*, char*, int*, double*, int*, double*, double*, int*, int*);
41 
42 // Compute eigenvalue and eigenvectors of real non-symmetric matrix.
43  void dgeev(char*, char*, int*, double*, int*, double*, double*, double*, int*,
44  double*, int*, double*, int*, int*);
45 
46 // LU decomposition of a general matrix.
47  void dgetrf(int*, int*, double*, int*, int*, int*);
48 
49 // Generate inverse of a matrix given its LU decomposition.
50  void dgetri(int*, double*, int*, int*, double*, int*, int*);
51 
52 // BLAS-3 version of QR decomposition with column pivoting
53  void dgeqp3(int*, int*, double*, int*, int*, double*, double*, int*, int*);
54 
55 // Serial SVD of a matrix.
56  void dgesdd(char*, int*, int*, double*, int*,
57  double*, double*, int*, double*, int*,
58  double*, int*, int*, int*);
59 }
60 
61 namespace CAROM {
62 
64  d_mat(NULL),
65  d_alloc_size(0),
66  d_distributed(false),
67  d_owns_data(true)
68 {}
69 
71  int num_rows,
72  int num_cols,
73  bool distributed,
74  bool randomized) :
75  d_mat(0),
76  d_alloc_size(0),
77  d_distributed(distributed),
78  d_owns_data(true)
79 {
80  CAROM_VERIFY(num_rows > 0);
81  CAROM_VERIFY(num_cols > 0);
82  int mpi_init;
83  MPI_Initialized(&mpi_init);
84  if (mpi_init) {
85  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
86  MPI_Comm_rank(MPI_COMM_WORLD, &d_rank);
87  }
88  else {
89  d_num_procs = 1;
90  d_rank = 0;
91  }
92  setSize(num_rows, num_cols);
93  if (randomized) {
94  std::default_random_engine generator;
95  std::normal_distribution<double> normal_distribution(0.0, 1.0);
96  for (int i = 0; i < num_rows; i++) {
97  for (int j = 0; j < num_cols; j++) {
98  item(i,j) = normal_distribution(generator);
99  }
100  }
101  }
102 }
103 
105  double* mat,
106  int num_rows,
107  int num_cols,
108  bool distributed,
109  bool copy_data) :
110  d_mat(0),
111  d_alloc_size(0),
112  d_distributed(distributed),
113  d_owns_data(copy_data)
114 {
115  CAROM_VERIFY(mat != 0);
116  CAROM_VERIFY(num_rows > 0);
117  CAROM_VERIFY(num_cols > 0);
118  int mpi_init;
119  MPI_Initialized(&mpi_init);
120  if (mpi_init) {
121  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
122  MPI_Comm_rank(MPI_COMM_WORLD, &d_rank);
123  }
124  else {
125  d_num_procs = 1;
126  d_rank = 0;
127  }
128  if (copy_data) {
129  setSize(num_rows, num_cols);
130  memcpy(d_mat, mat, d_alloc_size*sizeof(double));
131  }
132  else {
133  d_mat = mat;
134  d_alloc_size = num_rows*num_cols;
135  d_num_cols = num_cols;
136  d_num_rows = num_rows;
137  if (d_distributed) {
138  calculateNumDistributedRows();
139  }
140  }
141 }
142 
144  const Matrix& other) :
145  d_mat(0),
146  d_alloc_size(0),
147  d_distributed(other.d_distributed),
148  d_owns_data(true)
149 {
150  int mpi_init;
151  MPI_Initialized(&mpi_init);
152  if (mpi_init) {
153  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
154  MPI_Comm_rank(MPI_COMM_WORLD, &d_rank);
155  }
156  else {
157  d_num_procs = 1;
158  d_rank = 0;
159  }
160  setSize(other.d_num_rows, other.d_num_cols);
161  memcpy(d_mat, other.d_mat, d_alloc_size*sizeof(double));
162 }
163 
165 {
166  if (d_owns_data && d_mat) {
167  delete [] d_mat;
168  }
169 }
170 
171 Matrix&
173  const Matrix& rhs)
174 {
175  d_distributed = rhs.d_distributed;
176  d_num_procs = rhs.d_num_procs;
177  setSize(rhs.d_num_rows, rhs.d_num_cols);
178  memcpy(d_mat, rhs.d_mat, d_num_rows*d_num_cols*sizeof(double));
179  return *this;
180 }
181 
182 Matrix&
184  const Matrix& rhs)
185 {
186  CAROM_VERIFY(rhs.d_num_rows == d_num_rows);
187  CAROM_VERIFY(rhs.d_num_cols == d_num_cols);
188  for(int i=0; i<d_num_rows*d_num_cols; ++i) d_mat[i] += rhs.d_mat[i];
189  return *this;
190 }
191 
192 Matrix&
194  const Matrix& rhs)
195 {
196  CAROM_VERIFY(rhs.d_num_rows == d_num_rows);
197  CAROM_VERIFY(rhs.d_num_cols == d_num_cols);
198  for(int i=0; i<d_num_rows*d_num_cols; ++i) d_mat[i] -= rhs.d_mat[i];
199  return *this;
200 }
201 
202 bool
204 {
205  // A Matrix is "balanced" (load-balanced for distributed dense matrix
206  // computations) if:
207  //
208  // (1) the number of rows owned by each process in any pair of
209  // processes on the communicator differs by at most one
210  //
211  // (2) process j has no fewer rows than k if j is less than k (j and
212  // k are both integers corresponding to process ranks)
213 
214  // Serial matrices are balanced by definition; one process owns all
215  // rows
216  if (!distributed()) return true;
217 
218  const MPI_Comm comm = MPI_COMM_WORLD;
219 
220  // Otherwise, get the total number of rows of the matrix.
221  int num_total_rows = numDistributedRows();
222 
223  const int first_rank_with_fewer = num_total_rows % d_num_procs;
224  int my_rank;
225  CAROM_VERIFY(MPI_Comm_rank(comm, &my_rank) == MPI_SUCCESS);
226 
227  const int min_rows_per_rank = num_total_rows / d_num_procs;
228  const bool has_extra_row = my_rank < first_rank_with_fewer;
229  const int max_rows_on_rank = min_rows_per_rank + has_extra_row;
230  const bool has_enough_rows = (d_num_rows >= min_rows_per_rank);
231  const bool has_too_many_rows = (d_num_rows > max_rows_on_rank);
232 
233  int result = (has_enough_rows && !has_too_many_rows);
234  const int reduce_count = 1;
235  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
236  &result,
237  reduce_count,
238  MPI_INT,
239  MPI_LAND,
240  comm) == MPI_SUCCESS);
241 
242  return result;
243 }
244 
245 Matrix&
247  const double a)
248 {
249  for(int i=0; i<d_num_rows*d_num_cols; ++i) {
250  d_mat[i] = a;
251  }
252  return *this;
253 }
254 
255 Matrix*
257 {
258  CAROM_VERIFY(n > 0 && n <= d_num_cols);
259 
260  Matrix* first_n_columns = NULL;
261  getFirstNColumns(n, first_n_columns);
262  return first_n_columns;
263 }
264 
265 void
267  int n,
268  Matrix*& result) const
269 {
270  CAROM_VERIFY(result == 0 || result->distributed() == distributed());
271  CAROM_VERIFY(n > 0 && n <= d_num_cols);
272 
273  // If the result has not been allocated then do so. Otherwise size it
274  // correctly.
275  if (result == 0)
276  {
277  result = new Matrix(d_num_rows, n, d_distributed);
278  }
279  else
280  {
281  result->setSize(d_num_rows, n);
282  }
283 
284  for (int i = 0; i < d_num_rows; i++)
285  {
286  for (int j = 0; j < n; j++)
287  {
288  result->item(i, j) = item(i, j);
289  }
290  }
291 }
292 
293 void
295  int n,
296  Matrix& result) const
297 {
298  CAROM_VERIFY(result.distributed() == distributed());
299  CAROM_VERIFY(n > 0 && n <= d_num_cols);
300 
301  // Size result correctly.
302  result.setSize(d_num_rows, n);
303 
304  for (int i = 0; i < d_num_rows; i++)
305  {
306  for (int j = 0; j < n; j++)
307  {
308  result.item(i, j) = item(i, j);
309  }
310  }
311 }
312 
313 void
315  const Matrix& other,
316  Matrix*& result) const
317 {
318  CAROM_VERIFY(result == 0 || result->distributed() == distributed());
319  CAROM_VERIFY(!other.distributed());
320  CAROM_VERIFY(numColumns() == other.numRows());
321 
322  // If the result has not been allocated then do so. Otherwise size it
323  // correctly.
324  if (result == 0) {
325  result = new Matrix(d_num_rows, other.d_num_cols, d_distributed);
326  }
327  else {
328  result->setSize(d_num_rows, other.d_num_cols);
329  }
330 
331  // Do the multiplication.
332  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
333  for (int other_col = 0; other_col < other.d_num_cols; ++other_col) {
334  double result_val = 0.0;
335  for (int entry = 0; entry < d_num_cols; ++entry) {
336  result_val += item(this_row, entry)*other.item(entry, other_col);
337  }
338  result->item(this_row, other_col) = result_val;
339  }
340  }
341 }
342 
343 void
345  const Matrix& other,
346  Matrix& result) const
347 {
348  CAROM_VERIFY(result.distributed() == distributed());
349  CAROM_VERIFY(!other.distributed());
350  CAROM_VERIFY(numColumns() == other.numRows());
351 
352  // Size result correctly.
353  result.setSize(d_num_rows, other.d_num_cols);
354 
355  // Do the multiplication.
356  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
357  for (int other_col = 0; other_col < other.d_num_cols; ++other_col) {
358  double result_val = 0.0;
359  for (int entry = 0; entry < d_num_cols; ++entry) {
360  result_val += item(this_row, entry)*other.item(entry, other_col);
361  }
362  result.item(this_row, other_col) = result_val;
363  }
364  }
365 }
366 
367 void
369  const Vector& other,
370  Vector*& result) const
371 {
372  CAROM_VERIFY(result == 0 || result->distributed() == distributed());
373  CAROM_VERIFY(!other.distributed());
374  CAROM_VERIFY(numColumns() == other.dim());
375 
376  // If the result has not been allocated then do so. Otherwise size it
377  // correctly.
378  if (result == 0) {
379  result = new Vector(d_num_rows, d_distributed);
380  }
381  else {
382  result->setSize(d_num_rows);
383  }
384 
385  // Do the multiplication.
386  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
387  double result_val = 0.0;
388  for (int entry = 0; entry < d_num_cols; ++entry) {
389  result_val += item(this_row, entry)*other.item(entry);
390  }
391  result->item(this_row) = result_val;
392  }
393 }
394 
395 void
397  const Vector& other,
398  Vector& result) const
399 {
400  CAROM_VERIFY(result.distributed() == distributed());
401  CAROM_VERIFY(!other.distributed());
402  CAROM_VERIFY(numColumns() == other.dim());
403 
404  // Size result correctly.
405  result.setSize(d_num_rows);
406 
407  // Do the multiplication.
408  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
409  double result_val = 0.0;
410  for (int entry = 0; entry < d_num_cols; ++entry) {
411  result_val += item(this_row, entry)*other.item(entry);
412  }
413  result.item(this_row) = result_val;
414  }
415 }
416 
417 void
419  int this_row,
420  const Vector& other,
421  Vector& result) const
422 {
423  // TODO: change the CAROM_ASSERTs to CAROM_VERIFYs or generalize and eliminate the checks.
424  CAROM_ASSERT(!result.distributed());
425  CAROM_ASSERT(!distributed());
426  CAROM_VERIFY(!other.distributed());
427  CAROM_VERIFY(numColumns() == other.dim());
428 
429  // Do the multiplication.
430  for (int entry = 0; entry < d_num_cols; ++entry) {
431  result.item(entry) = item(this_row, entry)*other.item(entry);
432  }
433 }
434 
435 void
437  int this_row,
438  Vector& other) const
439 {
440  CAROM_ASSERT(!distributed());
441  CAROM_VERIFY(!other.distributed());
442  CAROM_VERIFY(numColumns() == other.dim());
443 
444  // Do the multiplication.
445  for (int entry = 0; entry < d_num_cols; ++entry) {
446  other.item(entry) *= item(this_row, entry);
447  }
448 }
449 
450 void
452  const Matrix& other,
453  Matrix*& result) const
454 {
455  CAROM_VERIFY(result == 0 || result->distributed() == distributed());
456  CAROM_VERIFY(distributed() == other.distributed());
457  CAROM_VERIFY(numRows() == other.numRows());
458  CAROM_VERIFY(numColumns() == other.numColumns());
459 
460  // If the result has not been allocated then do so. Otherwise size it
461  // correctly.
462  if (result == 0) {
463  result = new Matrix(d_num_rows, d_num_cols, d_distributed);
464  }
465  else {
466  result->setSize(d_num_rows, d_num_cols);
467  }
468 
469  // Do the element-wise multiplication.
470  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
471  for (int other_col = 0; other_col < d_num_cols; ++other_col) {
472  result->item(this_row, other_col) = item(this_row,
473  other_col) * other.item(this_row, other_col);
474  }
475  }
476 }
477 
478 void
480  const Matrix& other,
481  Matrix& result) const
482 {
483  CAROM_VERIFY(result.distributed() == distributed());
484  CAROM_VERIFY(distributed() == other.distributed());
485  CAROM_VERIFY(numRows() == other.numRows());
486  CAROM_VERIFY(numColumns() == other.numColumns());
487 
488  // Size result correctly.
489  result.setSize(d_num_rows, other.d_num_cols);
490 
491  // Do the multiplication.
492  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
493  for (int other_col = 0; other_col < d_num_cols; ++other_col) {
494  result.item(this_row, other_col) = item(this_row,
495  other_col) * other.item(this_row, other_col);
496  }
497  }
498 }
499 
500 void
502  Matrix*& result) const
503 {
504  // If the result has not been allocated then do so. Otherwise size it
505  // correctly.
506  if (result == 0) {
507  result = new Matrix(d_num_rows, d_num_cols, d_distributed);
508  }
509  else {
510  result->setSize(d_num_rows, d_num_cols);
511  }
512 
513  // Do the pointwise square.
514  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
515  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
516  result->item(this_row, this_col) = item(this_row, this_col) * item(this_row,
517  this_col);
518  }
519  }
520 }
521 
522 void
524  Matrix& result) const
525 {
526  // Size result correctly.
527  result.setSize(d_num_rows, d_num_cols);
528 
529  // Do the pointwise square.
530  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
531  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
532  result.item(this_row, this_col) = item(this_row, this_col) * item(this_row,
533  this_col);
534  }
535  }
536 }
537 
538 void
540  Vector& a,
541  const Vector& b,
542  double c) const
543 {
544  CAROM_VERIFY(a.distributed() == distributed());
545  CAROM_VERIFY(!b.distributed());
546  CAROM_VERIFY(numColumns() == b.dim());
547  CAROM_VERIFY(numRows() == a.dim());
548 
549  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
550  double tmp = 0.0;
551  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
552  tmp += item(this_row, this_col)*b.item(this_col);
553  }
554  a.item(this_row) += tmp*c;
555  }
556 }
557 
558 void
560  const Matrix& other,
561  Matrix*& result) const
562 {
563  CAROM_VERIFY(result == 0 || !result->distributed());
564  CAROM_VERIFY(distributed() == other.distributed());
565  CAROM_VERIFY(numRows() == other.numRows());
566 
567  // If the result has not been allocated then do so. Otherwise size it
568  // correctly.
569  if (result == 0) {
570  result = new Matrix(d_num_cols, other.d_num_cols, false);
571  }
572  else {
573  result->setSize(d_num_cols, other.d_num_cols);
574  }
575 
576  // Do the multiplication.
577  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
578  for (int other_col = 0; other_col < other.d_num_cols; ++other_col) {
579  double result_val = 0.0;
580  for (int entry = 0; entry < d_num_rows; ++entry) {
581  result_val += item(entry, this_col)*other.item(entry, other_col);
582  }
583  result->item(this_col, other_col) = result_val;
584  }
585  }
586  if (d_distributed && d_num_procs > 1) {
587  int new_mat_size = d_num_cols*other.d_num_cols;
588  MPI_Allreduce(MPI_IN_PLACE,
589  &result->item(0, 0),
590  new_mat_size,
591  MPI_DOUBLE,
592  MPI_SUM,
593  MPI_COMM_WORLD);
594  }
595 }
596 
597 void
599  const Matrix& other,
600  Matrix& result) const
601 {
602  CAROM_VERIFY(!result.distributed());
603  CAROM_VERIFY(distributed() == other.distributed());
604  CAROM_VERIFY(numRows() == other.numRows());
605 
606  // Size result correctly.
607  result.setSize(d_num_cols, other.d_num_cols);
608 
609  // Do the multiplication.
610  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
611  for (int other_col = 0; other_col < other.d_num_cols; ++other_col) {
612  double result_val = 0.0;
613  for (int entry = 0; entry < d_num_rows; ++entry) {
614  result_val += item(entry, this_col)*other.item(entry, other_col);
615  }
616  result.item(this_col, other_col) = result_val;
617  }
618  }
619  if (d_distributed && d_num_procs > 1) {
620  int new_mat_size = d_num_cols*other.d_num_cols;
621  MPI_Allreduce(MPI_IN_PLACE,
622  &result.item(0, 0),
623  new_mat_size,
624  MPI_DOUBLE,
625  MPI_SUM,
626  MPI_COMM_WORLD);
627  }
628 }
629 
630 void
632  const Vector& other,
633  Vector*& result) const
634 {
635  CAROM_VERIFY(result == 0 || !result->distributed());
636  CAROM_VERIFY(distributed() == other.distributed());
637  CAROM_VERIFY(numRows() == other.dim());
638 
639  // If the result has not been allocated then do so. Otherwise size it
640  // correctly.
641  if (result == 0) {
642  result = new Vector(d_num_cols, false);
643  }
644  else {
645  result->setSize(d_num_cols);
646  }
647 
648  // Do the multiplication.
649  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
650  double result_val = 0.0;
651  for (int entry = 0; entry < d_num_rows; ++entry) {
652  result_val += item(entry, this_col)*other.item(entry);
653  }
654  result->item(this_col) = result_val;
655  }
656  if (d_distributed && d_num_procs > 1) {
657  MPI_Allreduce(MPI_IN_PLACE,
658  &result->item(0),
659  d_num_cols,
660  MPI_DOUBLE,
661  MPI_SUM,
662  MPI_COMM_WORLD);
663  }
664 }
665 
666 void
668  const Vector& other,
669  Vector& result) const
670 {
671  CAROM_VERIFY(!result.distributed());
672  CAROM_VERIFY(distributed() == other.distributed());
673  CAROM_VERIFY(numRows() == other.dim());
674 
675  // If the result has not been allocated then do so. Otherwise size it
676  // correctly.
677  result.setSize(d_num_cols);
678 
679  // Do the multiplication.
680  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
681  double result_val = 0.0;
682  for (int entry = 0; entry < d_num_rows; ++entry) {
683  result_val += item(entry, this_col)*other.item(entry);
684  }
685  result.item(this_col) = result_val;
686  }
687  if (d_distributed && d_num_procs > 1) {
688  MPI_Allreduce(MPI_IN_PLACE,
689  &result.item(0),
690  d_num_cols,
691  MPI_DOUBLE,
692  MPI_SUM,
693  MPI_COMM_WORLD);
694  }
695 }
696 
697 void
698 Matrix::getColumn(int column,
699  Vector*& result) const
700 {
701  if (result == 0) {
702  if (d_distributed) {
703  result = new Vector(d_num_rows, true);
704  }
705  else {
706  result = new Vector(d_num_rows, false);
707  }
708  }
709  getColumn(column, *result);
710 }
711 
712 void
713 Matrix::getColumn(int column,
714  Vector& result) const
715 {
716  result.setSize(d_num_rows);
717  for (int i = 0; i < d_num_rows; i++) {
718  result.item(i) = item(i, column);
719  }
720 }
721 
722 void
724  Matrix*& result) const
725 {
726  CAROM_VERIFY(result == 0 ||
727  (!result->distributed() &&
728  result->numRows() == numRows() &&
729  result->numColumns() == numColumns()));
730  CAROM_VERIFY(!distributed());
731  CAROM_VERIFY(numRows() == numColumns());
732 
733  // If the result has not been allocated then do so. Otherwise size it
734  // correctly.
735  if (result == 0) {
736  result = new Matrix(d_num_rows, d_num_cols, false);
737  }
738  else {
739  result->setSize(d_num_rows, d_num_cols);
740  }
741 
742  // Call lapack routines to do the inversion.
743  // Set up some stuff the lapack routines need.
744  int info;
745  int mtx_size = d_num_rows;
746  int lwork = mtx_size*mtx_size;
747  int* ipiv = new int [mtx_size];
748  double* work = new double [lwork];
749  // To use lapack we need a column major representation of this which is
750  // essentially the transform of this. Use result for this representation.
751  for (int row = 0; row < mtx_size; ++row) {
752  for (int col = 0; col < mtx_size; ++col) {
753  result->item(col, row) = item(row, col);
754  }
755  }
756  // Now call lapack to do the inversion.
757  dgetrf(&mtx_size, &mtx_size, result->d_mat, &mtx_size, ipiv, &info);
758  CAROM_VERIFY(info == 0);
759 
760  dgetri(&mtx_size, result->d_mat, &mtx_size, ipiv, work, &lwork, &info);
761  CAROM_VERIFY(info == 0);
762 
763  // Result now has the inverse in a column major representation. Put it
764  // into row major order.
765  for (int row = 0; row < mtx_size; ++row) {
766  for (int col = row+1; col < mtx_size; ++col) {
767  double tmp = result->item(row, col);
768  result->item(row, col) = result->item(col, row);
769  result->item(col, row) = tmp;
770  }
771  }
772 
773  delete [] ipiv;
774  delete [] work;
775 }
776 
777 void
779  Matrix& result) const
780 {
781  CAROM_VERIFY(!result.distributed() && result.numRows() == numRows() &&
782  result.numColumns() == numColumns());
783  CAROM_VERIFY(!distributed());
784  CAROM_VERIFY(numRows() == numColumns());
785 
786  // Size result correctly.
787  result.setSize(d_num_rows, d_num_cols);
788 
789  // Call lapack routines to do the inversion.
790  // Set up some stuff the lapack routines need.
791  int info;
792  int mtx_size = d_num_rows;
793  int lwork = mtx_size*mtx_size;
794  int* ipiv = new int [mtx_size];
795  double* work = new double [lwork];
796  // To use lapack we need a column major representation of this which is
797  // essentially the transform of this. Use result for this representation.
798  for (int row = 0; row < mtx_size; ++row) {
799  for (int col = 0; col < mtx_size; ++col) {
800  result.item(col, row) = item(row, col);
801  }
802  }
803  // Now call lapack to do the inversion.
804  dgetrf(&mtx_size, &mtx_size, result.d_mat, &mtx_size, ipiv, &info);
805  CAROM_VERIFY(info == 0);
806 
807  dgetri(&mtx_size, result.d_mat, &mtx_size, ipiv, work, &lwork, &info);
808  CAROM_VERIFY(info == 0);
809 
810  // Result now has the inverse in a column major representation. Put it
811  // into row major order.
812  for (int row = 0; row < mtx_size; ++row) {
813  for (int col = row+1; col < mtx_size; ++col) {
814  double tmp = result.item(row, col);
815  result.item(row, col) = result.item(col, row);
816  result.item(col, row) = tmp;
817  }
818  }
819 
820  delete [] ipiv;
821  delete [] work;
822 }
823 
825 {
826  CAROM_VERIFY(!distributed() && numRows() == numColumns()); // Avoid resizing
827  const int n = numRows();
828  for (int i=0; i<n-1; ++i)
829  {
830  for (int j=i+1; j<n; ++j)
831  {
832  const double t = d_mat[i*n+j];
833  d_mat[i*n+j] = d_mat[j*n+i];
834  d_mat[j*n+i] = t;
835  }
836  }
837 }
838 
839 void
841 {
842  CAROM_VERIFY(!distributed());
843  CAROM_VERIFY(numRows() == numColumns());
844 
845  // Call lapack routines to do the inversion.
846  // Set up some stuff the lapack routines need.
847  int info;
848  int mtx_size = d_num_rows;
849  int lwork = mtx_size*mtx_size;
850  int* ipiv = new int [mtx_size];
851  double* work = new double [lwork];
852  // To use lapack we need a column major representation of this which is
853  // essentially the transform of this.
854  for (int row = 0; row < mtx_size; ++row) {
855  for (int col = row+1; col < mtx_size; ++col) {
856  double tmp = item(row, col);
857  item(row, col) = item(col, row);
858  item(col, row) = tmp;
859  }
860  }
861  // Now call lapack to do the inversion.
862  dgetrf(&mtx_size, &mtx_size, d_mat, &mtx_size, ipiv, &info);
863  CAROM_VERIFY(info == 0);
864 
865  dgetri(&mtx_size, d_mat, &mtx_size, ipiv, work, &lwork, &info);
866  CAROM_VERIFY(info == 0);
867 
868  // This now has its inverse in a column major representation. Put it into
869  // row major representation.
870  for (int row = 0; row < mtx_size; ++row) {
871  for (int col = row+1; col < mtx_size; ++col) {
872  double tmp = item(row, col);
873  item(row, col) = item(col, row);
874  item(col, row) = tmp;
875  }
876  }
877 
878  delete [] ipiv;
879  delete [] work;
880 }
881 
883 {
884  CAROM_VERIFY(!distributed());
885  CAROM_VERIFY(numRows() >= numColumns());
886 
887  if (numRows() == numColumns())
888  {
889  inverse();
890  }
891  else
892  {
893  Matrix *AtA = this->transposeMult(this);
894 
895  // Directly invert AtA, which is a bad idea if AtA is not small.
896  AtA->inverse();
897 
898  // Pseudoinverse is (AtA)^{-1}*this^T, but we store the transpose of the result in this, namely this*(AtA)^{-T}.
899  Vector row(numColumns(), false);
900  Vector res(numColumns(), false);
901  for (int i=0; i<numRows(); ++i)
902  { // Compute i-th row of this multiplied by (AtA)^{-T}, whose transpose is (AtA)^{-1} times i-th row transposed.
903  for (int j=0; j<numColumns(); ++j)
904  row.item(j) = this->item(i,j);
905 
906  AtA->mult(row, res);
907 
908  // Overwrite i-th row with transpose of result.
909  for (int j=0; j<numColumns(); ++j)
910  this->item(i,j) = res.item(j);
911  }
912 
913  delete AtA;
914  }
915 }
916 
917 void
918 Matrix::print(const char * prefix) const
919 {
920  int my_rank;
921  const bool success = MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
922  CAROM_ASSERT(success);
923 
924  std::string filename_str = prefix + std::to_string(my_rank);
925  const char * filename = filename_str.c_str();
926  FILE * pFile = fopen(filename,"w");
927  for (int row = 0; row < d_num_rows; ++row) {
928  for (int col = 0; col < d_num_cols; ++col) {
929  fprintf(pFile, " %25.20e\t", item(row,col));
930  }
931  fprintf(pFile, "\n");
932  }
933  fclose(pFile);
934 }
935 
936 void
937 Matrix::write(const std::string& base_file_name) const
938 {
939  CAROM_VERIFY(!base_file_name.empty());
940 
941  int mpi_init;
942  MPI_Initialized(&mpi_init);
943  int rank;
944  if (mpi_init) {
945  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
946  }
947  else {
948  rank = 0;
949  }
950 
951  char tmp[100];
952  sprintf(tmp, ".%06d", rank);
953  std::string full_file_name = base_file_name + tmp;
954  HDFDatabase database;
955  database.create(full_file_name);
956 
957  sprintf(tmp, "distributed");
958  database.putInteger(tmp, d_distributed);
959  sprintf(tmp, "num_rows");
960  database.putInteger(tmp, d_num_rows);
961  sprintf(tmp, "num_cols");
962  database.putInteger(tmp, d_num_cols);
963  sprintf(tmp, "data");
964  database.putDoubleArray(tmp, d_mat, d_num_rows*d_num_cols);
965  database.close();
966 }
967 
968 void
969 Matrix::read(const std::string& base_file_name)
970 {
971  CAROM_VERIFY(!base_file_name.empty());
972 
973  int mpi_init;
974  MPI_Initialized(&mpi_init);
975  int rank;
976  if (mpi_init) {
977  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
978  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
979  }
980  else {
981  rank = 0;
982  d_num_procs = 1;
983  }
984 
985  char tmp[100];
986  sprintf(tmp, ".%06d", rank);
987  std::string full_file_name = base_file_name + tmp;
988  HDFDatabase database;
989  database.open(full_file_name, "r");
990 
991  sprintf(tmp, "distributed");
992  int distributed;
993  database.getInteger(tmp, distributed);
994  d_distributed = bool(distributed);
995  int num_rows;
996  sprintf(tmp, "num_rows");
997  database.getInteger(tmp, num_rows);
998  int num_cols;
999  sprintf(tmp, "num_cols");
1000  database.getInteger(tmp, num_cols);
1001  setSize(num_rows,num_cols);
1002  sprintf(tmp, "data");
1003  database.getDoubleArray(tmp, d_mat, d_alloc_size);
1004  d_owns_data = true;
1005  database.close();
1006 }
1007 
1008 void
1009 Matrix::local_read(const std::string& base_file_name, int rank)
1010 {
1011  CAROM_VERIFY(!base_file_name.empty());
1012 
1013  int mpi_init;
1014  MPI_Initialized(&mpi_init);
1015  if (mpi_init) {
1016  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
1017  }
1018  else {
1019  d_num_procs = 1;
1020  }
1021 
1022  char tmp[100];
1023  sprintf(tmp, ".%06d", rank);
1024  std::string full_file_name = base_file_name + tmp;
1025  HDFDatabase database;
1026  database.open(full_file_name, "r");
1027 
1028  sprintf(tmp, "distributed");
1029  int distributed;
1030  database.getInteger(tmp, distributed);
1031  d_distributed = bool(distributed);
1032  int num_rows;
1033  sprintf(tmp, "num_rows");
1034  database.getInteger(tmp, num_rows);
1035  int num_cols;
1036  sprintf(tmp, "num_cols");
1037  database.getInteger(tmp, num_cols);
1038  setSize(num_rows,num_cols);
1039  sprintf(tmp, "data");
1040  database.getDoubleArray(tmp, d_mat, d_alloc_size);
1041  d_owns_data = true;
1042  database.close();
1043 }
1044 
1045 void
1046 Matrix::distribute(const int &local_num_rows)
1047 {
1048  CAROM_VERIFY(!distributed());
1049  CAROM_VERIFY(d_owns_data);
1050 
1051  std::vector<int> row_offsets;
1052  int num_total_rows = get_global_offsets(local_num_rows, row_offsets,
1053  MPI_COMM_WORLD);
1054  CAROM_VERIFY(num_total_rows == d_num_rows);
1055  int local_offset = row_offsets[d_rank] * d_num_cols;
1056  const int new_size = local_num_rows * d_num_cols;
1057 
1058  double *d_new_mat = new double [new_size];
1059  if (new_size > 0)
1060  memcpy(d_new_mat, &d_mat[local_offset], 8 * new_size);
1061 
1062  delete [] d_mat;
1063  d_mat = d_new_mat;
1064  d_alloc_size = new_size;
1065 
1066  d_num_distributed_rows = d_num_rows;
1067  d_num_rows = local_num_rows;
1068 
1069  d_distributed = true;
1070 }
1071 
1072 void
1074 {
1075  CAROM_VERIFY(distributed());
1076  CAROM_VERIFY(d_owns_data);
1077 
1078  std::vector<int> row_offsets;
1079  const int num_total_rows = get_global_offsets(d_num_rows, row_offsets,
1080  MPI_COMM_WORLD);
1081  CAROM_VERIFY(num_total_rows == d_num_distributed_rows);
1082  const int new_size = d_num_distributed_rows * d_num_cols;
1083 
1084  int *data_offsets = new int[row_offsets.size() - 1];
1085  int *data_cnts = new int[row_offsets.size() - 1];
1086  for (int k = 0; k < row_offsets.size() - 1; k++)
1087  {
1088  data_offsets[k] = row_offsets[k] * d_num_cols;
1089  data_cnts[k] = (row_offsets[k+1] - row_offsets[k]) * d_num_cols;
1090  }
1091 
1092  double *d_new_mat = new double [new_size] {0.0};
1093  CAROM_VERIFY(MPI_Allgatherv(d_mat, d_alloc_size, MPI_DOUBLE,
1094  d_new_mat, data_cnts, data_offsets, MPI_DOUBLE,
1095  MPI_COMM_WORLD) == MPI_SUCCESS);
1096 
1097  delete [] d_mat;
1098  delete [] data_offsets, data_cnts;
1099  d_mat = d_new_mat;
1100  d_alloc_size = new_size;
1101 
1102  d_num_rows = d_num_distributed_rows;
1103 
1104  d_distributed = false;
1105 }
1106 
1107 void
1108 Matrix::calculateNumDistributedRows() {
1109  if (d_distributed && d_num_procs > 1) {
1110  int num_total_rows = d_num_rows;
1111  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
1112  &num_total_rows,
1113  1,
1114  MPI_INT,
1115  MPI_SUM,
1116  MPI_COMM_WORLD) == MPI_SUCCESS);
1117  d_num_distributed_rows = num_total_rows;
1118  }
1119  else {
1120  d_num_distributed_rows = d_num_rows;
1121  }
1122 }
1123 
Matrix*
{
    // NOTE(review): the declarator line (function name and parameter list)
    // is not visible in this view; presumably Matrix::qr_factorize() const
    // -- confirm against the header.
    //
    // Computes the orthonormal factor Q of a QR factorization of this
    // matrix by performing an LQ factorization of its transpose through
    // ScaLAPACK.  Returns a newly allocated Matrix with the same row
    // distribution as this one; the caller owns the returned pointer.
    int myid;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    // Gather each rank's local row count and convert the counts into
    // exclusive prefix offsets: row_offset[r] is the first global row owned
    // by rank r, and row_offset[d_num_procs] is the global row count.
    std::vector<int> row_offset(d_num_procs + 1);
    row_offset[d_num_procs] = numDistributedRows();
    row_offset[myid] = numRows();

    CAROM_VERIFY(MPI_Allgather(MPI_IN_PLACE,
                               1,
                               MPI_INT,
                               row_offset.data(),
                               1,
                               MPI_INT,
                               MPI_COMM_WORLD) == MPI_SUCCESS);
    for (int i = d_num_procs - 1; i >= 0; i--) {
        row_offset[i] = row_offset[i + 1] - row_offset[i];
    }

    CAROM_VERIFY(row_offset[0] == 0);

    // Lay this matrix out (transposed: columns x distributed rows) as a
    // ScaLAPACK matrix on a 1 x d_num_procs process grid.
    SLPK_Matrix slpk;

    int nrow_blocks = d_num_procs;
    int ncol_blocks = 1;

    // Block size = ceil(global rows / ranks).
    int blocksize = row_offset[d_num_procs] / d_num_procs;
    if (row_offset[d_num_procs] % d_num_procs != 0) blocksize += 1;
    initialize_matrix(&slpk, numColumns(), numDistributedRows(),
                      ncol_blocks, nrow_blocks, numColumns(), blocksize);
    // Each rank contributes its row-major block; scatter_block performs the
    // transposition into ScaLAPACK's column-major storage.
    for (int rank = 0; rank < d_num_procs; ++rank) {
        scatter_block(&slpk, 1, row_offset[rank] + 1,
                      getData(), numColumns(),
                      row_offset[rank + 1] - row_offset[rank], rank);
    }

    QRManager QRmgr;
    qr_init(&QRmgr, &slpk);
    lqfactorize(&QRmgr);

    // Manipulate QRmgr.A to get elementary Householder reflectors: zero the
    // strictly upper part of the local factor and place unit diagonal
    // entries, leaving only the reflector data lqcompute() expects.
    for (int i = row_offset[myid]; i < numColumns(); i++) {
        for (int j = 0; j < i - row_offset[myid] && j < row_offset[myid + 1]; j++) {
            QRmgr.A->mdata[j * QRmgr.A->mm + i] = 0;
        }
        if (i < row_offset[myid + 1]) {
            QRmgr.A->mdata[(i - row_offset[myid]) * QRmgr.A->mm + i] = 1;
        }
    }

    // Obtain Q by applying the accumulated reflectors.
    lqcompute(&QRmgr);
    Matrix* qr_factorized_matrix = new Matrix(row_offset[myid + 1] -
            row_offset[myid],
            numColumns(), distributed());
    // Pull each rank's block of Q back out of the ScaLAPACK layout into the
    // new Matrix's row-major storage.
    for (int rank = 0; rank < d_num_procs; ++rank) {
        gather_block(&qr_factorized_matrix->item(0, 0), QRmgr.A,
                     1, row_offset[rank] + 1,
                     numColumns(), row_offset[rank + 1] - row_offset[rank],
                     rank);
    }

    free_matrix_data(&slpk);
    release_context(&slpk);

    // QRManager buffers are malloc'd by the C wrapper, hence free().
    free(QRmgr.tau);
    free(QRmgr.ipiv);

    return qr_factorized_matrix;
}
1196 
void
// NOTE(review): the beginning of this declarator (function name and first
// parameter) is not visible in this view; presumably
// Matrix::qrcp_pivots_transpose(int* row_pivot, ...) -- confirm.
    int* row_pivot_owner,
    int pivots_requested) const
{
    // Front end for computing the column pivots of a pivoted QR of this
    // matrix's transpose: dispatch to the serial or the distributed
    // implementation depending on how this matrix is stored.
    if(!distributed()) {
        return qrcp_pivots_transpose_serial(row_pivot,
                                            row_pivot_owner,
                                            pivots_requested);
    }
    else {
        return qrcp_pivots_transpose_distributed(row_pivot,
                                                 row_pivot_owner,
                                                 pivots_requested);
    }
}
1213 
void
Matrix::qrcp_pivots_transpose_serial(int* row_pivot,
                                     int* row_pivot_owner,
                                     int pivots_requested) const
{
    // Computes the first pivots_requested column pivots of a column-pivoted
    // QR factorization (LAPACK dgeqp3) of this matrix's transpose.  On
    // return, row_pivot[i] holds the 0-based pivot row index and
    // row_pivot_owner[i] the MPI rank of the calling process (the whole
    // matrix lives on one process in the serial case).

    // This method assumes this matrix is serial
    CAROM_VERIFY(!distributed());

    // Number of pivots requested can't exceed the number of rows of the
    // matrix
    CAROM_VERIFY(pivots_requested <= numRows());
    CAROM_VERIFY(pivots_requested > 0);

    // Make sure arrays are allocated before entry; this method does not
    // own the input pointers
    CAROM_VERIFY(row_pivot != NULL);
    CAROM_VERIFY(row_pivot_owner != NULL);

    // Get dimensions of transpose of matrix
    int num_rows_of_transpose = numColumns();
    int num_cols_of_transpose = numRows();

    // LAPACK routines tend to overwrite their inputs, but we'd like to
    // keep the basis matrix and use it in later computations, so copy
    // the basis matrix here.
    Matrix scratch(*this);

    // Allocate work arrays; work array for QR must be at least 1 plus 3
    // times the number of columns of its matrix input. This algorithm
    // applies QR to transposed basis matrix, so the applicable
    // dimension is the number of rows of the basis matrix. It's
    // possible to get better performance by computing the optimal block
    // size and then using that value to size the work array; see the
    // LAPACK source code and documentation for details.
    int lwork = 20 * num_cols_of_transpose + 1;
    double* work = new double[lwork];
    double* tau = new double[std::min(num_rows_of_transpose,
                                      num_cols_of_transpose)];
    // Value-initialized (zeroed): dgeqp3 treats a zero entry as a "free"
    // column that may be pivoted anywhere.
    int* pivot = new int[num_cols_of_transpose] ();
    int info;

    // Compute the QR decomposition with column pivots of the transpose
    // of this matrix by abusing the fact that the C++ memory model is
    // row-major format, which is the transpose of the Fortran memory
    // model (which is column-major). Passing the row-major data
    // looks like an in-place transposition to Fortran.
    dgeqp3(&num_rows_of_transpose,
           &num_cols_of_transpose,
           scratch.d_mat,
           &num_rows_of_transpose,
           pivot,
           tau,
           work,
           &lwork,
           &info);

    // Fail if error in LAPACK routine.
    CAROM_VERIFY(info == 0);

    // Assume communicator is MPI_COMM_WORLD and get rank of this
    // process
    int is_mpi_initialized, is_mpi_finalized;
    CAROM_VERIFY(MPI_Initialized(&is_mpi_initialized) == MPI_SUCCESS);
    CAROM_VERIFY(MPI_Finalized(&is_mpi_finalized) == MPI_SUCCESS);
    int my_rank = 0;
    if(is_mpi_initialized && !is_mpi_finalized) {
        const MPI_Comm my_comm = MPI_COMM_WORLD;
        CAROM_VERIFY(MPI_Comm_rank(my_comm, &my_rank) == MPI_SUCCESS);
    }

    // Copy over pivots and subtract one to convert them from a
    // Fortran-based indexing convention (first element of 1-D array by
    // default corresponds to index of 1, though this convention can be
    // overridden) to a C-based indexing convention (first element of
    // 1-D array corresponds to index of 0).
    for (int i = 0; i < pivots_requested; i++) {
        row_pivot[i] = pivot[i] - 1;
        row_pivot_owner[i] = my_rank;
    }

    // Free arrays
    delete [] work;
    delete [] tau;
    delete [] pivot;
}
1299 
1300 void
1301 Matrix::qrcp_pivots_transpose_distributed(int* row_pivot,
1302  int* row_pivot_owner,
1303  int pivots_requested)
1304 const
1305 {
1306  // Check if distributed; otherwise, use serial implementation
1307  CAROM_VERIFY(distributed());
1308 
1309 #ifdef CAROM_HAS_ELEMENTAL
1310  // Shim to design interface; not implemented yet
1311 
1312  // Elemental implementation
1313  return qrcp_pivots_transpose_distributed_elemental
1314  (row_pivot, row_pivot_owner, pivots_requested);
1315 #else
1316  qrcp_pivots_transpose_distributed_scalapack
1317  (row_pivot, row_pivot_owner, pivots_requested);
1318 #endif
1319 }
1320 
void
Matrix::qrcp_pivots_transpose_distributed_scalapack
(int* row_pivot, int* row_pivot_owner, int pivots_requested) const
{
    // Computes the column pivots of a column-pivoted QR factorization of
    // this distributed matrix's transpose via ScaLAPACK.  On rank 0,
    // row_pivot[0 .. pivots_requested-1] receives 0-based *global* row
    // indices and row_pivot_owner[i] the rank owning each pivot row; on
    // every other rank, row_pivot receives only that rank's locally owned
    // pivot values and row_pivot_owner is left untouched.

    // Check if distributed; otherwise, use serial implementation
    CAROM_VERIFY(distributed());

    // Global row count = sum of each rank's local row count.
    int num_total_rows = d_num_rows;
    CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
                               &num_total_rows,
                               1,
                               MPI_INT,
                               MPI_SUM,
                               MPI_COMM_WORLD) == MPI_SUCCESS);

    // row_offset[r] will become the first global row owned by rank r;
    // row_offset[d_num_procs] is the global row count (sentinel).
    int *row_offset = new int[d_num_procs + 1];
    row_offset[d_num_procs] = num_total_rows;

    int my_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    row_offset[my_rank] = d_num_rows;

    // Gather every rank's local row count in place ...
    CAROM_VERIFY(MPI_Allgather(MPI_IN_PLACE,
                               1,
                               MPI_INT,
                               row_offset,
                               1,
                               MPI_INT,
                               MPI_COMM_WORLD) == MPI_SUCCESS);

    // ... then convert the counts into exclusive prefix offsets.
    for (int i = d_num_procs - 1; i >= 0; i--) {
        row_offset[i] = row_offset[i + 1] - row_offset[i];
    }
    CAROM_VERIFY(row_offset[0] == 0);

    // Lay the transpose (d_num_cols x global rows) out as a ScaLAPACK
    // matrix on a 1 x d_num_procs grid, blocked column-wise with block
    // size ceil(global rows / ranks).
    SLPK_Matrix slpk;
    int blocksize = row_offset[d_num_procs] / d_num_procs;
    if (row_offset[d_num_procs] % d_num_procs != 0) blocksize += 1;

    initialize_matrix(&slpk, d_num_cols, row_offset[d_num_procs], 1, d_num_procs, 1,
                      blocksize); // transposed

    CAROM_VERIFY(d_num_cols <=
                 pivots_requested); // Otherwise, take a submatrix for the QR (not implemented).

    for (int rank = 0; rank < d_num_procs; ++rank) {
        // Take the row-major data in d_mat and put it in a transposed column-major array in slpk
        scatter_block(&slpk, 1, row_offset[rank]+1,
                      d_mat,
                      d_num_cols, row_offset[rank+1] - row_offset[rank],
                      rank);
    }

    QRManager QRmgr;
    qr_init(&QRmgr, &slpk);
    qrfactorize(&QRmgr);

    // Just gather the pivots to root and discard the factorization
    CAROM_VERIFY(0 < pivots_requested && pivots_requested <= QRmgr.ipivSize);
    CAROM_VERIFY(pivots_requested <= std::max(d_num_rows, d_num_cols));

    // Number of requested pivots that fall in this rank's global row range.
    const int scount = std::max(0, std::min(pivots_requested,
                                            row_offset[my_rank+1]) - row_offset[my_rank]);
    int *mypivots = (scount > 0) ? new int[scount] : NULL;

    for (int i=0; i<scount; ++i)
        mypivots[i] = QRmgr.ipiv[i]-1; // Make it 0-based

    // Receive counts and displacements are only needed on the root.
    int *rcount = (my_rank == 0) ? new int[d_num_procs] : NULL;
    int *rdisp = (my_rank == 0) ? new int[d_num_procs] : NULL;

    MPI_Gather(&scount, 1, MPI_INT, rcount, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (my_rank == 0)
    {
        rdisp[0] = 0;
        for (int i = 1; i<d_num_procs; ++i)
            rdisp[i] = rdisp[i-1] + rcount[i-1];

        // The per-rank contributions must cover exactly the requested pivots.
        CAROM_VERIFY(rdisp[d_num_procs-1] + rcount[d_num_procs-1] == pivots_requested);
    }

    MPI_Gatherv(mypivots, scount, MPI_INT, row_pivot, rcount, rdisp, MPI_INT, 0,
                MPI_COMM_WORLD);

    delete [] mypivots;

    if (my_rank == 0)
    {
        // Map each gathered global pivot index to the rank owning it: the
        // owner is the last rank whose starting offset does not exceed it.
        for (int i=0; i<pivots_requested; ++i)
        {
            row_pivot_owner[i] = -1;
            for (int j=d_num_procs-1; j>=0; --j)
            {
                if (row_offset[j] <= row_pivot[i])
                {
                    row_pivot_owner[i] = j;
                    break;
                }
            }

            // Note that row_pivot[i] is a global index.
            CAROM_VERIFY(row_pivot_owner[i] >= 0);
            CAROM_VERIFY(row_offset[row_pivot_owner[i]] <= row_pivot[i]
                         && row_pivot[i] < row_offset[row_pivot_owner[i]+1]);
        }
    }
    else
    {
        // Non-root ranks keep only their own local pivots.
        for (int i=0; i<scount; ++i)
            row_pivot[i] = QRmgr.ipiv[i]-1; // Make it 0-based
    }

    free_matrix_data(&slpk);
    release_context(&slpk);

    // QRManager buffers are malloc'd by the C wrapper, hence free().
    free(QRmgr.tau);
    free(QRmgr.ipiv);

    delete [] rcount;
    delete [] rdisp;
    delete [] row_offset;
}
1445 
1446 void
1447 Matrix::qrcp_pivots_transpose_distributed_elemental
1448 (int* row_pivot, int* row_pivot_owner, int pivots_requested)
1449 const
1450 {
1451 #ifdef CAROM_HAS_ELEMENTAL
1452  // Check if distributed; otherwise, use serial implementation
1453  CAROM_VERIFY(distributed());
1454 
1455  // Check if balanced
1456  if (balanced()) {
1457  qrcp_pivots_transpose_distributed_elemental_balanced
1458  (row_pivot, row_pivot_owner, pivots_requested);
1459  }
1460  else {
1461  qrcp_pivots_transpose_distributed_elemental_unbalanced
1462  (row_pivot, row_pivot_owner, pivots_requested);
1463  }
1464 #else
1465  CAROM_VERIFY(false);
1466 #endif
1467 }
1468 
void
Matrix::qrcp_pivots_transpose_distributed_elemental_balanced
(int* row_pivot, int* row_pivot_owner, int pivots_requested)
const
{
#ifdef CAROM_HAS_ELEMENTAL
    // Compute pivots redundantly across all processes using the QRCP
    // from the distributed dense linear algebra library Elemental.
    // On return, row_pivot[i] holds the *local* row index of pivot i and
    // row_pivot_owner[i] the rank owning it; both arrays are filled
    // identically on every process.

    // The following assumptions are made in this implementation just to
    // get a version up and running:
    //
    // (1) this->balanced() == true; // The matrix is balanced
    //
    // (2) Process 0 is the master rank of the object
    //
    // (3) Matrix rows are distributed block-cyclically over all processes
    // starting on process zero with row 0, in increasing order of row
    // index (all elements of a given row are still stored on the same
    // process)
    //
    // (4) This Matrix is distributed over the MPI_COMM_WORLD
    // communicator
    //
    // Some of these assumptions can be relaxed if the Matrix object
    // stores more information.

    // Check if distributed and balanced
    CAROM_VERIFY(distributed() && balanced());

    // Make sure arrays are allocated before entry; this method does not
    // own the input pointers
    CAROM_VERIFY(row_pivot != NULL);
    CAROM_VERIFY(row_pivot_owner != NULL);

    // Compute total number of rows to set global sizes of matrix
    const MPI_Comm comm = MPI_COMM_WORLD;
    const int master_rank = 0;

    int num_total_rows = d_num_rows;
    const int reduce_count = 1;
    CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
                               &num_total_rows,
                               reduce_count,
                               MPI_INT,
                               MPI_SUM,
                               comm) == MPI_SUCCESS);

    // Number of pivots requested can't exceed the total number of rows
    // of the matrix
    CAROM_VERIFY(pivots_requested <= num_total_rows);
    CAROM_VERIFY(pivots_requested > 0);

    // To compute a column-pivoted QRCP using Elemental, we need to
    // first get the data into datatypes Elemental can operate on.

    // Construct a process grid on the communicator that is 1
    // by (# processes), in column-major order; each process in the grid
    // will own its own rows of the matrix
    const El::Grid grid(comm, 1);

    // The row communicator in the grid should have the same number
    // of processes as the comm owned by the Matrix
    CAROM_VERIFY(El::mpi::Size(grid.RowComm()) == d_num_procs);

    // Instantiate the transposed matrix; Elemental calls the number of
    // rows the "height" of the matrix, and the number of columns the
    // "width" of the matrix
    El::Int height = static_cast<El::Int>(numColumns());
    El::Int width = static_cast<El::Int>(numRows());
    El::Int root = static_cast<El::Int>(master_rank);
    El::DistMatrix<double> scratch(height, width, grid, root);

    // Columns of the matrix should be distributed round-robin; each
    // element of a column should be on the same process. The
    // distribution should satisfy two invariants:
    //
    // (1) Each "global row" should have the same rank in its column
    // communicator
    //
    // (2) For the scratch matrix global column index j, the process (on
    // the row communicator) that owns j should be process rank (j %
    // d_num_procs).
    //
    CAROM_VERIFY(scratch.RowOwner(0) == scratch.RowOwner(1));
    int my_rank;
    CAROM_VERIFY(MPI_Comm_rank(comm, &my_rank) == MPI_SUCCESS);
    El::Int rank_as_row = static_cast<El::Int>(my_rank);
    CAROM_VERIFY(scratch.ColOwner(rank_as_row) == rank_as_row);

    // Set up work matrices
    El::DistMatrix<double> householder_scalars(grid);
    El::DistMatrix<double> diagonal(grid);
    El::DistPermutation perm(grid);
    El::QRCtrl<double> ctrl;

    // Build the copy of the matrix element by element, mapping
    // local indices to global indices. The Elemental::DistMatrix indices
    // are the transpose of the Matrix indices

    // Elemental assigns elements of a matrix in a block-cyclic fashion
    // based on the size (specifically, the height and width) of the
    // El::Grid object; see
    // http://libelemental.org/documentation/dev/core/dist_matrix/Element/MC_MR.html
    // for details on the default element distribution, which is what
    // this code uses.
    //
    // If the matrix is balanced, then the matrix rows (i.e., columns of
    // the scratch, transpose matrix) should conform to the row
    // distribution of the Elemental process grid, and additional
    // communication (beyond possibly in local to global index mapping
    // queries) is not needed to assign elements in the proper places
    // because the matrix is already load-balanced. Furthermore, all of
    // the elements of a row of this matrix (column of scratch,
    // transpose matrix) are on a single process. Since the matrix rows
    // (columns of scratch) don't have to be redistributed, we can use
    // Elemental's native (global <--> local) indexing to construct a
    // map between local indices on a given process and global indices
    // of the distributed matrix, and Elemental will also give us the
    // correct process rank owning any given column of the scratch
    // matrix.
    for (int row = 0; row < d_num_rows; row++) {
        El::Int el_loc_col = static_cast<El::Int>(row);
        El::Int el_global_col = scratch.GlobalCol(el_loc_col);
        for (int col = 0; col < d_num_cols; col++) {
            El::Int el_loc_row = static_cast<El::Int>(col);
            El::Int el_global_row = scratch.GlobalRow(el_loc_row);
            scratch.Set(el_global_row, el_global_col, this->item(row, col));
        }
    }

    // After transferring the data over to Elemental's native data
    // types, we compute the QRCP.
    El::QR(scratch, householder_scalars, diagonal, perm); // add ctrl if needed

    // Then, we transfer the pivots into the pivot array
    // stored redundantly on each process.
    for (size_t i = 0; i < pivots_requested; i++) {
        El::Int el_i = static_cast<El::Int>(i);
        El::Int el_perm_i = perm.Image(el_i);

        // The permutation is computed in terms of global indices, so
        // we need to compute the local column pivot of the transpose;
        // this will be a row pivot
        El::Int el_loc_i = scratch.LocalCol(el_perm_i);
        int loc_i = static_cast<int>(el_loc_i);
        row_pivot[i] = loc_i;

        // The global index of the permutation can also be used to figure
        // out which process owns that pivot row, because this process is
        // also the process that owns that global column of the scratch
        // matrix
        El::Int el_owner = scratch.ColOwner(el_perm_i);
        int owner = static_cast<int>(el_owner);
        row_pivot_owner[i] = owner;
    }
#else
    CAROM_VERIFY(false);
#endif
}
1629 
1630 void
1631 Matrix::qrcp_pivots_transpose_distributed_elemental_unbalanced
1632 (int* row_pivot, int* row_pivot_owner, int pivots_requested)
1633 const
1634 {
1635 #ifdef CAROM_HAS_ELEMENTAL
1636  // Compute pivots redundantly across all processes using the QRCP
1637  // from the distributed dense linear algebra library Elemental.
1638 
1639  // The following assumptions are made in this implementation just to
1640  // get a version up and running:
1641  //
1642  // (1) Process 0 is the master rank of the object
1643  //
1644  // (2) This Matrix is distributed over the MPI_COMM_WORLD
1645  // communicator
1646  //
1647  // Some of these assumptions can be relaxed if the Matrix object
1648  // stores more information.
1649 
1650  // Check if distributed and unbalanced
1651  CAROM_VERIFY(distributed() && !balanced());
1652 
1653  // Make sure arrays are allocated before entry; this method does not
1654  // own the input pointers
1655  CAROM_VERIFY(row_pivot != NULL);
1656  CAROM_VERIFY(row_pivot_owner != NULL);
1657 
1658  // Compute total number of rows to set global sizes of matrix
1659  const MPI_Comm comm = MPI_COMM_WORLD;
1660  const int master_rank = 0;
1661 
1662  int num_total_rows = d_num_rows;
1663  const int reduce_count = 1;
1664  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
1665  &num_total_rows,
1666  reduce_count,
1667  MPI_INT,
1668  MPI_SUM,
1669  comm) == MPI_SUCCESS);
1670 
1671  // Number of pivots requested can't exceed the total number of rows
1672  // of the matrix
1673  CAROM_VERIFY(pivots_requested <= num_total_rows);
1674  CAROM_VERIFY(pivots_requested > 0);
1675 
1676  // To compute a column-pivoted QRCP using Elemental, we need to
1677  // first get the data into datatypes Elemental can operate on.
1678 
1679  // Construct a process grid on the communicator that is 1
1680  // by (# processes), in column-major order; each process in the grid
1681  // will own its own rows of the matrix
1682  const El::Grid grid(comm, 1);
1683 
1684  // The row communicator in the grid should have the same number
1685  // of processes as the comm owned by the Matrix
1686  CAROM_VERIFY(El::mpi::Size(grid.RowComm()) == d_num_procs);
1687 
1688  // Instantiate the transposed matrix; Elemental calls the number of
1689  // rows the "height" of the matrix, and the number of columns the
1690  // "width" of the matrix
1691  El::Int height = static_cast<El::Int>(numColumns());
1692  El::Int width = static_cast<El::Int>(numRows());
1693  El::Int root = static_cast<El::Int>(master_rank);
1694  El::DistMatrix<double> scratch(height, width, grid, root);
1695 
1696  // Columns of the matrix should be distributed round-robin; each
1697  // element of a column should be on the same process. The
1698  // distribution should satisfy two invariants:
1699  //
1700  // (1) Each "global row" should have the same rank in its column
1701  // communicator
1702  //
1703  // (2) For the scratch matrix global column index j, the process (on
1704  // the row communicator) that owns j should be process rank (j %
1705  // d_num_procs).
1706  //
1707  CAROM_VERIFY(scratch.RowOwner(0) == scratch.RowOwner(1));
1708  int my_rank;
1709  CAROM_VERIFY(MPI_Comm_rank(comm, &my_rank) == MPI_SUCCESS);
1710  El::Int rank_as_row = static_cast<El::Int>(my_rank);
1711  CAROM_VERIFY(scratch.ColOwner(rank_as_row) == rank_as_row);
1712 
1713  // Set up work matrices
1714  El::DistMatrix<double> householder_scalars(grid);
1715  El::DistMatrix<double> diagonal(grid);
1716  El::DistPermutation perm(grid);
1717  El::QRCtrl<double> ctrl;
1718 
1719  // Build the copy of the matrix element by element, mapping local
1720  // indices to global indices. The Elemental::DistMatrix indices are
1721  // the transpose of the Matrix indices. The El::Grid object should
1722  // be constructed so that each column of the scratch matrix is owned
1723  // by a single process.
1724  //
1725  // If the matrix is unbalanced, matrix elements need to be
1726  // redistributed among processes; then, for the purposes of
1727  // computing the QR decomposition only, we redistribute matrix
1728  // elements in the scratch matrix. First, we compute global to
1729  // (process rank, local) index map. The mapping chosen is not
1730  // terribly performant -- it may do more data movement than
1731  // necessary when assigning matrix elements -- but is easy to
1732  // implement.
1733 
1734  int *row_offset = new int[d_num_procs + 1];
1735  row_offset[d_num_procs + 1] = num_total_rows;
1736 
1737  row_offset[my_rank] = d_num_rows;
1738  const int send_recv_count = 1;
1739  CAROM_VERIFY(MPI_Allgather(MPI_IN_PLACE,
1740  send_recv_count,
1741  MPI_INT,
1742  row_offset,
1743  send_recv_count,
1744  MPI_INT,
1745  comm) == MPI_SUCCESS);
1746 
1747  for (int i = d_num_procs - 1; i >= 0; i--) {
1748  row_offset[i] = row_offset[i + 1] - row_offset[i];
1749  }
1750  CAROM_VERIFY(row_offset[0] == 0);
1751 
1752  // Use the computed (local <--> global) index map to assign elements
1753  // to the scratch matrix
1754  for (int row = row_offset[my_rank]; row < row_offset[my_rank + 1]; row++) {
1755  El::Int el_loc_col = static_cast<El::Int>(row - row_offset[my_rank]);
1756  El::Int el_global_col = scratch.GlobalCol(el_loc_col);
1757  for (int col = 0; col < d_num_cols; col++) {
1758  El::Int el_loc_row = static_cast<El::Int>(col);
1759  El::Int el_global_row = scratch.GlobalRow(el_loc_row);
1760  scratch.Set(el_global_row, el_global_col, this->item(row, col));
1761  }
1762  }
1763 
1764  // After transferring the data over to Elemental's native data
1765  // types, we compute the QRCP.
1766  El::QR(scratch, householder_scalars, diagonal, perm); // add ctrl if needed
1767 
1768  // Then, we transfer the pivots into the pivot array
1769  // stored redundantly on each process.
1770  for (size_t i = 0; i < pivots_requested; i++) {
1771  El::Int el_i = static_cast<El::Int>(i);
1772  El::Int el_perm_i = perm.Image(el_i);
1773 
1774  // The permutation is computed in terms of global row indices of
1775  // the Matrix, so we need to compute the rank that owns the column
1776  // of scratch (remember: scratch is the transpose of this Matrix)
1777  // corresponding to this global index
1778  int global_row_index = static_cast<int>(el_perm_i);
1779  int rank;
1780  for (rank = 0; rank < d_num_procs; rank++) {
1781  bool is_at_or_above_lower_bound = (global_row_index >= row_offset[rank]);
1782  bool is_below_upper_bound = (global_row_index < row_offset[rank + 1]);
1783  if (is_at_or_above_lower_bound && is_below_upper_bound) {
1784  row_pivot_owner[i] = rank;
1785  break;
1786  }
1787  }
1788 
1789  // The local row index is then the global row index minus the
1790  // row_offset computed above in the simple (local <--> global) index
1791  // map computed above
1792  row_pivot[i] = global_row_index - row_offset[rank];
1793  }
1794 
1795  // Free arrays
1796  delete [] row_offset;
1797 #else
1798  CAROM_VERIFY(false);
1799 #endif
1800 }
1801 
1802 void
1803 Matrix::orthogonalize(bool double_pass, double zero_tol)
1804 {
1805  int const num_passes = double_pass ? 2 : 1;
1806 
1807  for (int work = 0; work < d_num_cols; ++work)
1808  {
1809  // Orthogonalize the column (twice if double_pass == true).
1810  for (int k = 0; k < num_passes; k++)
1811  {
1812  for (int col = 0; col < work; ++col)
1813  {
1814  double factor = 0.0;
1815 
1816  for (int i = 0; i < d_num_rows; ++i)
1817  factor += item(i, col) * item(i, work);
1818 
1819  if (d_distributed && d_num_procs > 1)
1820  {
1821  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &factor, 1,
1822  MPI_DOUBLE, MPI_SUM,
1823  MPI_COMM_WORLD)
1824  == MPI_SUCCESS );
1825  }
1826  for (int i = 0; i < d_num_rows; ++i)
1827  item(i, work) -= factor * item(i, col);
1828  }
1829  }
1830 
1831  // Normalize the column.
1832  double norm = 0.0;
1833 
1834  for (int i = 0; i < d_num_rows; ++i)
1835  norm += item(i, work) * item(i, work);
1836 
1837  if (d_distributed && d_num_procs > 1)
1838  {
1839  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE,
1840  MPI_SUM, MPI_COMM_WORLD)
1841  == MPI_SUCCESS );
1842  }
1843  if (norm > zero_tol)
1844  {
1845  norm = 1.0 / sqrt(norm);
1846  for (int i = 0; i < d_num_rows; ++i)
1847  item(i, work) *= norm;
1848  }
1849  }
1850 }
1851 
1852 void
1853 Matrix::orthogonalize_last(int ncols, bool double_pass, double zero_tol)
1854 {
1855  if (ncols == -1) ncols = d_num_cols;
1856  CAROM_VERIFY((ncols > 0) && (ncols <= d_num_cols));
1857 
1858  const int last_col = ncols - 1; // index of column to be orthonormalized
1859 
1860  int const num_passes = double_pass ? 2 : 1;
1861 
1862  // Orthogonalize the column (twice if double_pass == true).
1863  for (int k = 0; k < num_passes; k++)
1864  {
1865  for (int col = 0; col < last_col; ++col)
1866  {
1867  double factor = 0.0;
1868 
1869  for (int i = 0; i < d_num_rows; ++i)
1870  factor += item(i, col) * item(i, last_col);
1871 
1872  if (d_distributed && d_num_procs > 1)
1873  {
1874  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &factor, 1, MPI_DOUBLE,
1875  MPI_SUM, MPI_COMM_WORLD)
1876  == MPI_SUCCESS );
1877  }
1878  for (int i = 0; i < d_num_rows; ++i)
1879  item(i, last_col) -= factor * item(i, col);
1880  }
1881  }
1882 
1883  // Normalize the column.
1884  double norm = 0.0;
1885 
1886  for (int i = 0; i < d_num_rows; ++i)
1887  norm += item(i, last_col) * item(i, last_col);
1888 
1889  if (d_distributed && d_num_procs > 1)
1890  {
1891  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE,
1892  MPI_SUM, MPI_COMM_WORLD) == MPI_SUCCESS );
1893  }
1894  if (norm > zero_tol)
1895  {
1896  norm = 1.0 / sqrt(norm);
1897  for (int i = 0; i < d_num_rows; ++i)
1898  item(i, last_col) *= norm;
1899  }
1900 }
1901 
void
{
    // NOTE(review): the declarator line is not visible in this view;
    // presumably Matrix::rescale_rows_max() -- confirm against the header.

    // Rescale every matrix row by its maximum absolute value.
    // In the Matrix class, columns are distributed row wise, but rows are
    // not distributed; namely, each process acts on a number of full rows.
    // Therefore, no MPI communication is needed.

    for (int i = 0; i < d_num_rows; i++)
    {
        // Find the row's max absolute value.
        double row_max = fabs(item(i, 0));
        for (int j = 1; j < d_num_cols; j++)
        {
            if (fabs(item(i, j)) > row_max)
                row_max = fabs(item(i, j));
        }

        // Rescale every row entry, if max nonzero (tolerance guards
        // against division by a numerically zero maximum).
        if (row_max > 1.0e-14)
        {
            for (int j = 0; j < d_num_cols; j++)
                item(i, j) /= row_max;
        }
    }
}
1928 
void
{
    // NOTE(review): the declarator line is not visible in this view;
    // presumably Matrix::rescale_cols_max() -- confirm against the header.

    // Rescale every matrix column by its maximum absolute value.
    // Matrix columns are distributed row wise, so MPI communication is needed
    // to get the maximum of each column across all processes.

    // Find each column's max absolute value in the current process.
    // NOTE(review): variable-length arrays are a compiler extension, not
    // standard C++; consider std::vector<double> if portability matters.
    double local_max[d_num_cols];
    for (int j = 0; j < d_num_cols; j++)
    {
        local_max[j] = fabs(item(0, j));
        for (int i = 1; i < d_num_rows; i++)
        {
            if (fabs(item(i, j)) > local_max[j])
                local_max[j] = fabs(item(i, j));
        }
    }

    // Get the max across all processes, if applicable.
    double global_max[d_num_cols];
    if (d_distributed && d_num_procs > 1)
    {
        MPI_Allreduce(&local_max, &global_max, d_num_cols, MPI_DOUBLE, MPI_MAX,
                      MPI_COMM_WORLD);
    }
    else
    {
        for (int i = 0; i < d_num_cols; i++)
            global_max[i] = local_max[i];
    }

    // Rescale each column's entries, if max nonzero (tolerance guards
    // against division by a numerically zero maximum).
    for (int j = 0; j < d_num_cols; j++)
    {
        if (global_max[j] > 1.0e-14)
        {
            double tmp = 1.0 / global_max[j];
            for (int i = 0; i < d_num_rows; i++)
                item(i, j) *= tmp;
        }
    }
}
1972 
// Computes the outer product v * w^T.  The result has v's distribution:
// its rows are distributed exactly as v's entries are, while every process
// holds all columns (w is gathered redundantly onto each process first).
Matrix outerProduct(const Vector &v, const Vector &w)
{
    /*
     * There are two cases of concern:
     *
     * 1) w is not distributed
     *
     * 2) w is distributed
     *
     */
    int result_num_rows = v.dim();
    int result_num_cols;
    bool is_distributed = v.distributed();
    Vector gathered_w;

    /*
     * Gather all of the entries in w on each process into a Vector stored
     * redundantly (i.e., not distributed) on each process. If the Vector w
     * is not distributed, this step is trivial.
     */
    if (!w.distributed())
    {
        result_num_cols = w.dim();
        gathered_w = w;
    }
    else // w.distributed() is true
    {
        // Get the number of columns on each processor and gather these
        // counts into an array. Use std::vector as an array substitute
        // because variable-length arrays aren't in the C++ standard,
        // but the storage for std::vector containers must be contiguous
        // as defined by the C++ standard.
        int process_local_num_cols = w.dim();
        MPI_Comm comm = MPI_COMM_WORLD;
        MPI_Datatype num_cols_datatype = MPI_INT;
        int num_procs;
        MPI_Comm_size(comm, &num_procs);
        std::vector<int> num_cols_on_proc(num_procs, 0);
        int num_procs_send_count = 1;
        int num_procs_recv_count = 1;
        MPI_Allgather(&process_local_num_cols,
                      num_procs_send_count,
                      num_cols_datatype,
                      num_cols_on_proc.data(),
                      num_procs_recv_count,
                      num_cols_datatype,
                      comm);

        // Compute the displacements for the entries in the gathered
        // Vector, along with the total number of columns in the result.
        std::vector<int> gathered_w_displacements(num_procs, 0);
        result_num_cols = 0;
        for (int i = 0; i < num_cols_on_proc.size(); i++)
        {
            gathered_w_displacements.at(i) = result_num_cols;
            result_num_cols += num_cols_on_proc.at(i);
        }

        // Gather the data from each process onto each process -- this
        // step is an Allgatherv (because different processes may
        // have different numbers of entries).
        std::vector<double> w_data(process_local_num_cols);
        for (int i = 0; i < w_data.size(); i++)
        {
            w_data.at(i) = w(i);
        }

        std::vector<double> gathered_w_data(result_num_cols);
        MPI_Datatype entry_datatype = MPI_DOUBLE;
        MPI_Allgatherv(w_data.data(),
                       process_local_num_cols,
                       entry_datatype,
                       gathered_w_data.data(),
                       num_cols_on_proc.data(),
                       gathered_w_displacements.data(),
                       entry_datatype,
                       comm);

        // Copy the gathered entries into a redundant (serial) Vector.
        gathered_w.setSize(result_num_cols);
        for (int i = 0; i < gathered_w_data.size(); i++)
        {
            gathered_w(i) = gathered_w_data.at(i);
        }
    }

    /* Create the matrix */
    Matrix result(result_num_rows, result_num_cols, is_distributed);

    /* Compute the outer product using the gathered copy of w. */
    for (int i = 0; i < result_num_rows; i++)
    {
        for (int j = 0; j < result_num_cols; j++)
        {
            result(i, j) = v(i) * gathered_w(j);
        }
    }

    return result;
}
2072 
2074 {
2075  const int resultNumRows = v.dim();
2076  int resultNumColumns, processRowStartIndex, processNumColumns;
2077  const bool isDistributed = v.distributed();
2078 
2079  /* If v isn't distributed, sizing the output matrix is trivial. */
2080  if (false == isDistributed)
2081  {
2082  resultNumColumns = processNumColumns = resultNumRows;
2083  processRowStartIndex = 0;
2084  }
2085  else /* true == isDistributed */
2086  {
2087  using sizeType = std::vector<int>::size_type;
2088 
2094  const MPI_Comm comm = MPI_COMM_WORLD;
2095  int numProcesses;
2096  MPI_Comm_size(comm, &numProcesses);
2097  std::vector<int>
2098  numRowsOnProcess(static_cast<sizeType>(numProcesses));
2099  const int one = 1;
2100  const MPI_Datatype indexType = MPI_INT;
2101  MPI_Allgather(&resultNumRows, one, indexType,
2102  numRowsOnProcess.data(), one, indexType, comm);
2103 
2115  std::vector<int> rowStartIndexOnProcess(numProcesses);
2116  resultNumColumns = 0;
2117  for (sizeType i = 0; i < rowStartIndexOnProcess.size(); i++)
2118  {
2119  rowStartIndexOnProcess.at(i) = resultNumColumns;
2120  resultNumColumns += numRowsOnProcess.at(i);
2121  }
2122  int processNumber;
2123  MPI_Comm_rank(comm, &processNumber);
2124  processRowStartIndex =
2125  rowStartIndexOnProcess.at(static_cast<sizeType>(processNumber));
2126  } /* end (true == isDistributed) */
2127 
2132  Matrix diagonalMatrix(resultNumRows, resultNumColumns, isDistributed);
2133  for (int i = 0; i < resultNumRows; i++)
2134  {
2135  for (int j = 0; j < resultNumColumns; j++)
2136  {
2141  const double entry = (j == (i + processRowStartIndex)) ? v(i) : 0.0;
2142  diagonalMatrix(i, j) = entry;
2143  }
2144  }
2145 
2146  return diagonalMatrix;
2147 }
2148 
2150 {
2151  Vector temporary(v);
2152  temporary = 1.0;
2153  return DiagonalMatrixFactory(temporary);
2154 }
2155 
2156 struct EigenPair SymmetricRightEigenSolve(Matrix* A)
2157 {
2158  char jobz = 'V', uplo = 'U';
2159 
2160  int info;
2161  int k = A->numColumns();
2162  int lwork = std::max(1, 10*k-1);
2163  double* work = new double [lwork];
2164  double* eigs = new double [k];
2165  Matrix* ev = new Matrix(*A);
2166 
2167  // ev now in a row major representation. Put it
2168  // into column major order.
2169  for (int row = 0; row < k; ++row) {
2170  for (int col = row+1; col < k; ++col) {
2171  double tmp = ev->item(row, col);
2172  ev->item(row, col) = ev->item(col, row);
2173  ev->item(col, row) = tmp;
2174  }
2175  }
2176 
2177  // Now call lapack to do the eigensolve.
2178  dsyev(&jobz, &uplo, &k, ev->getData(), &k, eigs, work, &lwork, &info);
2179 
2180  // Eigenvectors now in a column major representation. Put it
2181  // into row major order.
2182  for (int row = 0; row < k; ++row) {
2183  for (int col = row+1; col < k; ++col) {
2184  double tmp = ev->item(row, col);
2185  ev->item(row, col) = ev->item(col, row);
2186  ev->item(col, row) = tmp;
2187  }
2188  }
2189 
2190  EigenPair eigenpair;
2191  eigenpair.ev = ev;
2192  for (int i = 0; i < k; i++)
2193  {
2194  eigenpair.eigs.push_back(eigs[i]);
2195  }
2196 
2197  delete [] work;
2198  delete [] eigs;
2199 
2200  return eigenpair;
2201 }
2202 
2203 struct ComplexEigenPair NonSymmetricRightEigenSolve(Matrix* A)
2204 {
2205  char jobvl = 'N', jobrl = 'V';
2206 
2207  int info;
2208  int k = A->numColumns();
2209  int lwork = std::max(k*k, 10*k);
2210  double* work = new double [lwork];
2211  double* e_real = new double [k];
2212  double* e_imaginary = new double [k];
2213  double* ev_l = NULL;
2214  Matrix* ev_r = new Matrix(k, k, false);
2215  Matrix* A_copy = new Matrix(*A);
2216 
2217  // A now in a row major representation. Put it
2218  // into column major order.
2219  for (int row = 0; row < k; ++row) {
2220  for (int col = row+1; col < k; ++col) {
2221  double tmp = A_copy->item(row, col);
2222  A_copy->item(row, col) = A_copy->item(col, row);
2223  A_copy->item(col, row) = tmp;
2224  }
2225  }
2226 
2227  // Now call lapack to do the eigensolve.
2228  dgeev(&jobvl, &jobrl, &k, A_copy->getData(), &k, e_real, e_imaginary, ev_l,
2229  &k, ev_r->getData(), &k, work, &lwork, &info);
2230 
2231  // Eigenvectors now in a column major representation. Put it
2232  // into row major order.
2233  for (int row = 0; row < k; ++row) {
2234  for (int col = row+1; col < k; ++col) {
2235  double tmp = ev_r->item(row, col);
2236  ev_r->item(row, col) = ev_r->item(col, row);
2237  ev_r->item(col, row) = tmp;
2238  }
2239  }
2240 
2241  ComplexEigenPair eigenpair;
2242  eigenpair.ev_real = new Matrix(k, k, false);
2243  eigenpair.ev_imaginary = new Matrix(k, k, false);
2244 
2245  // Separate lapack eigenvector into real and imaginary parts
2246  for (int i = 0; i < k; ++i)
2247  {
2248  for (int row = 0; row < k; ++row) {
2249  eigenpair.ev_real->item(row, i) = ev_r->item(row, i);
2250  }
2251  if (e_imaginary[i] != 0)
2252  {
2253  for (int row = 0; row < k; ++row) {
2254  eigenpair.ev_real->item(row, i + 1) = ev_r->item(row, i);
2255  eigenpair.ev_imaginary->item(row, i) = ev_r->item(row, i + 1);
2256  eigenpair.ev_imaginary->item(row, i + 1) = -ev_r->item(row, i + 1);
2257  }
2258 
2259  // Skip the next eigenvalue since it'll be part of the complex
2260  // conjugate pair.
2261  ++i;
2262  }
2263  }
2264 
2265  for (int i = 0; i < k; i++)
2266  {
2267  eigenpair.eigs.push_back(std::complex<double>(e_real[i], e_imaginary[i]));
2268  }
2269 
2270  delete [] work;
2271  delete [] e_real;
2272  delete [] e_imaginary;
2273  delete ev_r;
2274  delete A_copy;
2275 
2276  return eigenpair;
2277 }
2278 
2280  Matrix* U,
2281  Vector* S,
2282  Matrix* V)
2283 {
2284  CAROM_VERIFY(!A->distributed());
2285  int m = A->numRows();
2286  int n = A->numColumns();
2287 
2288  Matrix* A_copy = new Matrix(*A);
2289  if (U == NULL)
2290  {
2291  U = new Matrix(m, std::min(m, n), false);
2292  }
2293  else
2294  {
2295  CAROM_VERIFY(!U->distributed());
2296  U->setSize(m, std::min(m, n));
2297  }
2298  if (V == NULL)
2299  {
2300  CAROM_VERIFY(!V->distributed());
2301  V = new Matrix(std::min(m, n), n, false);
2302  }
2303  else
2304  {
2305  V->setSize(std::min(m, n), n);
2306  }
2307  if (S == NULL)
2308  {
2309  CAROM_VERIFY(!S->distributed());
2310  S = new Vector(n, false);
2311  }
2312  else
2313  {
2314  S->setSize(n);
2315  }
2316 
2317  char jobz = 'S';
2318  int lda = m;
2319  int ldu = m;
2320  int ldv = n;
2321  int mn = std::min(m, n);
2322  int lwork = 4 * mn * mn + 7 * mn;
2323  double* work = new double [lwork];
2324  int iwork[8*std::min(m, n)];
2325  int info;
2326 
2327  dgesdd(&jobz, &m, &n, A_copy->getData(), &lda, S->getData(), U->getData(), &ldu,
2328  V->getData(),
2329  &ldv, work, &lwork, iwork, &info);
2330 
2331  CAROM_VERIFY(info == 0);
2332 
2333  delete [] work;
2334  delete A_copy;
2335 }
2336 
2337 struct SerialSVDDecomposition SerialSVD(Matrix* A)
2338 {
2339  CAROM_VERIFY(!A->distributed());
2340  Matrix* U = NULL;
2341  Vector* S = NULL;
2342  Matrix* V = NULL;
2343 
2344  SerialSVD(A, U, S, V);
2345 
2346  struct SerialSVDDecomposition decomp;
2347  decomp.U = U;
2348  decomp.S = S;
2349  decomp.V = V;
2350 
2351  return decomp;
2352 }
2353 
2354 // Compute the product A^T * B, where A is represented by the space-time
2355 // product of As and At, and likewise for B.
2356 Matrix* SpaceTimeProduct(const CAROM::Matrix* As, const CAROM::Matrix* At,
2357  const CAROM::Matrix* Bs, const CAROM::Matrix* Bt,
2358  const std::vector<double> *tscale,
2359  const bool At0at0, const bool Bt0at0, const bool lagB,
2360  const bool skip0)
2361 {
2362  // TODO: implement reduction in parallel for the spatial matrices
2363  CAROM_VERIFY(As->distributed() && Bs->distributed());
2364 
2365  const int AtOS = At0at0 ? 1 : 0;
2366  const int BtOS0 = Bt0at0 ? 1 : 0;
2367  const int BtOS = BtOS0 + (lagB ? 1 : 0);
2368 
2369  const int nrows = As->numColumns();
2370  const int ncols = Bs->numColumns();
2371 
2372  const int nspace = As->numRows();
2373  const int ntime = At->numRows() + AtOS;
2374 
2375  CAROM_VERIFY(nspace == Bs->numRows() && ntime == Bt->numRows() + BtOS0);
2376 
2377  // For now, we assume one time vector for each space vector
2378  CAROM_VERIFY(nrows == At->numColumns() && ncols == Bt->numColumns());
2379 
2380  //const int k0 = (At0at0 || Bt0at0 || lagB) std::max(AtOS, BtOS) : 0;
2381  const int k0AB = std::max(AtOS, BtOS);
2382  const int k00 = skip0 ? 1 : 0;
2383  const int k0 = std::max(k0AB, k00);
2384 
2385  CAROM_VERIFY(tscale == NULL || (tscale->size() == ntime-1 && k0 > 0));
2386 
2387  Matrix* p = new CAROM::Matrix(nrows, ncols, false);
2388 
2389  for (int i=0; i<nrows; ++i)
2390  {
2391  for (int j=0; j<ncols; ++j)
2392  {
2393  double pij = 0.0;
2394 
2395  for (int k=k0; k<ntime; ++k)
2396  {
2397  //const double At_k = (At0at0 && k == 0) ? 0.0 : At->item(k - AtOS,i);
2398  //const double Bt_k = (Bt0at0 && k == 0) ? 0.0 : Bt->item(k - BtOS,j);
2399  const double At_k = At->item(k - AtOS,i);
2400  const double Bt_k = Bt->item(k - BtOS,j);
2401  const double ts = (tscale == NULL) ? 1.0 : (*tscale)[k - 1];
2402 
2403  double spij = 0.0; // sum over spatial entries
2404  for (int m=0; m<nspace; ++m)
2405  spij += As->item(m,i) * Bs->item(m,j);
2406 
2407  pij += spij * ts * At_k * Bt_k;
2408  }
2409 
2410  p->item(i, j) = pij;
2411  }
2412  }
2413 
2414  return p;
2415 }
2416 
2417 } // end namespace CAROM
void getInteger(const std::string &key, int &data)
Reads an integer associated with the supplied key from the currently open database file.
Definition: Database.h:182
void putInteger(const std::string &key, int data)
Writes an integer associated with the supplied key to currently open database file.
Definition: Database.h:95
virtual bool create(const std::string &file_name, const MPI_Comm comm=MPI_COMM_NULL) override
Creates a new HDF5 database file with the supplied name.
Definition: HDFDatabase.cpp:45
virtual bool open(const std::string &file_name, const std::string &type, const MPI_Comm comm=MPI_COMM_NULL) override
Opens an existing HDF5 database file with the supplied name.
Definition: HDFDatabase.cpp:76
virtual void getDoubleArray(const std::string &key, double *data, int nelements, const bool distributed=false)
Reads an array of doubles associated with the supplied key from the currently open HDF5 database file...
virtual void putDoubleArray(const std::string &key, const double *const data, int nelements, const bool distributed=false)
Writes an array of doubles associated with the supplied key to the currently open HDF5 database file.
virtual bool close()
Closes the currently open HDF5 database file.
void local_read(const std::string &base_file_name, int rank)
read a single rank of a distributed Matrix from (a) HDF file(s).
Definition: Matrix.cpp:1009
void setSize(int num_rows, int num_cols)
Sets the number of rows and columns of the matrix and reallocates storage if needed....
Definition: Matrix.h:147
Matrix * getFirstNColumns(int n) const
Get the first N columns of a matrix.
Definition: Matrix.cpp:256
double * getData() const
Get the matrix data as a pointer.
Definition: Matrix.h:1107
bool balanced() const
Returns true if rows of matrix are load-balanced.
Definition: Matrix.cpp:203
Matrix * inverse() const
Computes and returns the inverse of this.
Definition: Matrix.h:814
void transposePseudoinverse()
Computes the transposePseudoinverse of this.
Definition: Matrix.cpp:882
bool distributed() const
Returns true if the Matrix is distributed.
Definition: Matrix.h:177
void rescale_rows_max()
Rescale every matrix row by its maximum absolute value.
Definition: Matrix.cpp:1903
Matrix & operator=(const Matrix &rhs)
Assignment operator.
Definition: Matrix.cpp:172
void write(const std::string &base_file_name) const
write Matrix into (a) HDF file(s).
Definition: Matrix.cpp:937
int numColumns() const
Returns the number of columns in the Matrix. This method will return the same value from each process...
Definition: Matrix.h:220
void distribute(const int &local_num_rows)
Distribute this matrix rows among MPI processes, based on the specified local number of rows....
Definition: Matrix.cpp:1046
void gather()
Gather all the distributed rows among MPI processes. This becomes not distributed after this function...
Definition: Matrix.cpp:1073
Matrix * elementwise_mult(const Matrix &other) const
Multiplies two matrices element-wise.
Definition: Matrix.h:507
void orthogonalize_last(int ncols=-1, bool double_pass=false, double zero_tol=1.0e-15)
Orthonormalizes the matrix's last column, assuming the previous columns are already orthonormal.
Definition: Matrix.cpp:1853
Matrix & operator+=(const Matrix &rhs)
Addition operator.
Definition: Matrix.cpp:183
int numDistributedRows() const
Returns the number of rows of the Matrix across all processors.
Definition: Matrix.h:205
Vector * getColumn(int column) const
Returns a column of the matrix (not owned by Matrix).
Definition: Matrix.h:870
Matrix & operator-=(const Matrix &rhs)
Subtraction operator.
Definition: Matrix.cpp:193
void orthogonalize(bool double_pass=false, double zero_tol=1.0e-15)
Orthonormalizes the matrix.
Definition: Matrix.cpp:1803
void multPlus(Vector &a, const Vector &b, double c) const
Computes a += this*b*c.
Definition: Matrix.cpp:539
void print(const char *prefix) const
print Matrix into (a) ascii file(s).
Definition: Matrix.cpp:918
Matrix * transposeMult(const Matrix &other) const
Multiplies the transpose of this Matrix with other and returns the product, reference version.
Definition: Matrix.h:642
void pointwise_mult(int this_row, const Vector &other, Vector &result) const
Multiplies a specified row of this Matrix with other pointwise.
Definition: Matrix.cpp:418
Matrix * mult(const Matrix &other) const
Multiplies this Matrix with other and returns the product, reference version.
Definition: Matrix.h:283
~Matrix()
Destructor.
Definition: Matrix.cpp:164
const double & item(int row, int col) const
Const Matrix member access. Matrix data is stored in row-major format.
Definition: Matrix.h:1007
int numRows() const
Returns the number of rows of the Matrix on this processor.
Definition: Matrix.h:194
void read(const std::string &base_file_name)
read Matrix from (a) HDF file(s).
Definition: Matrix.cpp:969
Matrix * elementwise_square() const
Square every element in the matrix.
Definition: Matrix.h:576
void transpose()
Replaces this Matrix with its transpose (in place), in the serial square case only.
Definition: Matrix.cpp:824
Matrix * qr_factorize() const
Computes and returns the Q of the QR factorization of this.
Definition: Matrix.cpp:1125
void rescale_cols_max()
Rescale every matrix column by its maximum absolute value.
Definition: Matrix.cpp:1930
void qrcp_pivots_transpose(int *row_pivot, int *row_pivot_owner, int pivots_requested) const
Compute the leading numColumns() column pivots from a QR decomposition with column pivots (QRCP) of t...
Definition: Matrix.cpp:1198
double * getData() const
Get the vector data as a pointer.
Definition: Vector.h:748
void setSize(int dim)
Sets the length of the vector and reallocates storage if needed. All values are initialized to zero.
Definition: Vector.h:231
bool distributed() const
Returns true if the Vector is distributed.
Definition: Vector.h:255
const double & item(int i) const
Const Vector member access.
Definition: Vector.h:656
int dim() const
Returns the dimension of the Vector on this processor.
Definition: Vector.h:266
Matrix IdentityMatrixFactory(const Vector &v)
Factory function to make an identity matrix with rows distributed in the same fashion as its Vector a...
Definition: Matrix.cpp:2149
struct ComplexEigenPair NonSymmetricRightEigenSolve(Matrix *A)
Computes the eigenvectors/eigenvalues of an NxN real nonsymmetric matrix.
Definition: Matrix.cpp:2203
struct EigenPair SymmetricRightEigenSolve(Matrix *A)
Computes the eigenvectors/eigenvalues of an NxN real symmetric matrix.
Definition: Matrix.cpp:2156
Matrix outerProduct(const Vector &v, const Vector &w)
Computes the outer product of two Vectors, v and w.
Definition: Matrix.cpp:1973
void SerialSVD(Matrix *A, Matrix *U, Vector *S, Matrix *V)
Computes the SVD of a undistributed matrix in serial.
Definition: Matrix.cpp:2279
Matrix DiagonalMatrixFactory(const Vector &v)
Factory function to make a diagonal matrix with nonzero entries as in its Vector argument....
Definition: Matrix.cpp:2073
int get_global_offsets(const int local_dim, std::vector< int > &offsets, const MPI_Comm &comm)
Save integer offsets for each MPI rank under MPI communicator comm, where each rank has the size of lo
Definition: mpi_utils.cpp:41