libROM  v1.0
Data-driven physical simulation library
Matrix.cpp
1 
11 // Description: A simple, parallel Matrix class with the utility needed to
12 // support the basis generation methods of this library. A
13 // distributed Matrix has its rows distributed across processors.
14 
15 #include "Matrix.h"
16 #include "utils/HDFDatabase.h"
17 #include "utils/mpi_utils.h"
18 
19 #include "mpi.h"
20 #include <string.h>
21 #include <vector>
22 #include <random>
23 
24 #ifdef CAROM_HAS_ELEMENTAL
25 #include <El.hpp>
26 #endif
27 
28 #include "scalapack_wrapper.h"
29 
30 /* Use automatically detected Fortran name-mangling scheme */
31 #define dsyev CAROM_FC_GLOBAL(dsyev, DSYEV)
32 #define dgeev CAROM_FC_GLOBAL(dgeev, DGEEV)
33 #define dgetrf CAROM_FC_GLOBAL(dgetrf, DGETRF)
34 #define dgetri CAROM_FC_GLOBAL(dgetri, DGETRI)
35 #define dgeqp3 CAROM_FC_GLOBAL(dgeqp3, DGEQP3)
36 #define dgesdd CAROM_FC_GLOBAL(dgesdd, DGESDD)
37 
38 extern "C" {
39 // Compute eigenvalue and eigenvectors of real symmetric matrix.
40  void dsyev(char*, char*, int*, double*, int*, double*, double*, int*, int*);
41 
42 // Compute eigenvalue and eigenvectors of real non-symmetric matrix.
43  void dgeev(char*, char*, int*, double*, int*, double*, double*, double*, int*,
44  double*, int*, double*, int*, int*);
45 
46 // LU decomposition of a general matrix.
47  void dgetrf(int*, int*, double*, int*, int*, int*);
48 
49 // Generate inverse of a matrix given its LU decomposition.
50  void dgetri(int*, double*, int*, int*, double*, int*, int*);
51 
52 // BLAS-3 version of QR decomposition with column pivoting
53  void dgeqp3(int*, int*, double*, int*, int*, double*, double*, int*, int*);
54 
55 // Serial SVD of a matrix.
56  void dgesdd(char*, int*, int*, double*, int*,
57  double*, double*, int*, double*, int*,
58  double*, int*, int*, int*);
59 }
60 
61 namespace CAROM {
62 
64  d_mat(NULL),
65  d_alloc_size(0),
66  d_distributed(false),
67  d_owns_data(true)
68 {}
69 
71  int num_rows,
72  int num_cols,
73  bool distributed,
74  bool randomized) :
75  d_mat(0),
76  d_alloc_size(0),
77  d_distributed(distributed),
78  d_owns_data(true)
79 {
80  CAROM_VERIFY(num_rows > 0);
81  CAROM_VERIFY(num_cols > 0);
82  int mpi_init;
83  MPI_Initialized(&mpi_init);
84  if (mpi_init) {
85  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
86  MPI_Comm_rank(MPI_COMM_WORLD, &d_rank);
87  }
88  else {
89  d_num_procs = 1;
90  d_rank = 0;
91  }
92  setSize(num_rows, num_cols);
93  if (randomized) {
94  std::default_random_engine generator;
95  std::normal_distribution<double> normal_distribution(0.0, 1.0);
96  for (int i = 0; i < num_rows; i++) {
97  for (int j = 0; j < num_cols; j++) {
98  item(i,j) = normal_distribution(generator);
99  }
100  }
101  }
102 }
103 
105  double* mat,
106  int num_rows,
107  int num_cols,
108  bool distributed,
109  bool copy_data) :
110  d_mat(0),
111  d_alloc_size(0),
112  d_distributed(distributed),
113  d_owns_data(copy_data)
114 {
115  CAROM_VERIFY(mat != 0);
116  CAROM_VERIFY(num_rows > 0);
117  CAROM_VERIFY(num_cols > 0);
118  int mpi_init;
119  MPI_Initialized(&mpi_init);
120  if (mpi_init) {
121  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
122  MPI_Comm_rank(MPI_COMM_WORLD, &d_rank);
123  }
124  else {
125  d_num_procs = 1;
126  d_rank = 0;
127  }
128  if (copy_data) {
129  setSize(num_rows, num_cols);
130  memcpy(d_mat, mat, d_alloc_size*sizeof(double));
131  }
132  else {
133  // Check integer multiplication overflow
134  if (num_rows > INT_MAX / num_cols)
135  CAROM_ERROR("Matrix::Matrix- new size exceeds maximum integer value!\n");
136 
137  d_mat = mat;
138  d_alloc_size = num_rows*num_cols;
139  d_num_cols = num_cols;
140  d_num_rows = num_rows;
141  if (d_distributed) {
142  calculateNumDistributedRows();
143  }
144  }
145 }
146 
148  const Matrix& other) :
149  d_mat(0),
150  d_alloc_size(0),
151  d_distributed(other.d_distributed),
152  d_owns_data(true)
153 {
154  int mpi_init;
155  MPI_Initialized(&mpi_init);
156  if (mpi_init) {
157  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
158  MPI_Comm_rank(MPI_COMM_WORLD, &d_rank);
159  }
160  else {
161  d_num_procs = 1;
162  d_rank = 0;
163  }
164  setSize(other.d_num_rows, other.d_num_cols);
165  memcpy(d_mat, other.d_mat, d_alloc_size*sizeof(double));
166 }
167 
169 {
170  if (d_owns_data && d_mat) {
171  delete [] d_mat;
172  }
173 }
174 
175 Matrix&
177  const Matrix& rhs)
178 {
179  d_distributed = rhs.d_distributed;
180  d_num_procs = rhs.d_num_procs;
181  setSize(rhs.d_num_rows, rhs.d_num_cols);
182  memcpy(d_mat, rhs.d_mat, d_num_rows*d_num_cols*sizeof(double));
183  return *this;
184 }
185 
186 Matrix&
188  const Matrix& rhs)
189 {
190  CAROM_VERIFY(rhs.d_num_rows == d_num_rows);
191  CAROM_VERIFY(rhs.d_num_cols == d_num_cols);
192  for(int i=0; i<d_num_rows*d_num_cols; ++i) d_mat[i] += rhs.d_mat[i];
193  return *this;
194 }
195 
196 Matrix&
198  const Matrix& rhs)
199 {
200  CAROM_VERIFY(rhs.d_num_rows == d_num_rows);
201  CAROM_VERIFY(rhs.d_num_cols == d_num_cols);
202  for(int i=0; i<d_num_rows*d_num_cols; ++i) d_mat[i] -= rhs.d_mat[i];
203  return *this;
204 }
205 
206 bool
208 {
209  // A Matrix is "balanced" (load-balanced for distributed dense matrix
210  // computations) if:
211  //
212  // (1) the number of rows owned by each process in any pair of
213  // processes on the communicator differs by at most one
214  //
215  // (2) process j has no fewer rows than k if j is less than k (j and
216  // k are both integers corresponding to process ranks)
217 
218  // Serial matrices are balanced by definition; one process owns all
219  // rows
220  if (!distributed()) return true;
221 
222  const MPI_Comm comm = MPI_COMM_WORLD;
223 
224  // Otherwise, get the total number of rows of the matrix.
225  int num_total_rows = numDistributedRows();
226 
227  const int first_rank_with_fewer = num_total_rows % d_num_procs;
228  int my_rank;
229  CAROM_VERIFY(MPI_Comm_rank(comm, &my_rank) == MPI_SUCCESS);
230 
231  const int min_rows_per_rank = num_total_rows / d_num_procs;
232  const bool has_extra_row = my_rank < first_rank_with_fewer;
233  const int max_rows_on_rank = min_rows_per_rank + has_extra_row;
234  const bool has_enough_rows = (d_num_rows >= min_rows_per_rank);
235  const bool has_too_many_rows = (d_num_rows > max_rows_on_rank);
236 
237  int result = (has_enough_rows && !has_too_many_rows);
238  const int reduce_count = 1;
239  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
240  &result,
241  reduce_count,
242  MPI_INT,
243  MPI_LAND,
244  comm) == MPI_SUCCESS);
245 
246  return result;
247 }
248 
249 Matrix&
251  const double a)
252 {
253  for(int i=0; i<d_num_rows*d_num_cols; ++i) {
254  d_mat[i] = a;
255  }
256  return *this;
257 }
258 
259 std::unique_ptr<Matrix>
261 {
262  CAROM_VERIFY(n > 0 && n <= d_num_cols);
263 
264  Matrix* first_n_columns = new Matrix(d_num_rows, n, d_distributed);
265  getFirstNColumns(n, *first_n_columns);
266  return std::unique_ptr<Matrix>(first_n_columns);
267 }
268 
269 void
271  int n,
272  Matrix& result) const
273 {
274  CAROM_VERIFY(result.distributed() == distributed());
275  CAROM_VERIFY(n > 0 && n <= d_num_cols);
276 
277  // Size result correctly.
278  result.setSize(d_num_rows, n);
279 
280  for (int i = 0; i < d_num_rows; i++)
281  {
282  for (int j = 0; j < n; j++)
283  {
284  result.item(i, j) = item(i, j);
285  }
286  }
287 }
288 
289 void
291  const Matrix& other,
292  Matrix& result) const
293 {
294  CAROM_VERIFY(result.distributed() == distributed());
295  CAROM_VERIFY(!other.distributed());
296  CAROM_VERIFY(numColumns() == other.numRows());
297 
298  // Size result correctly.
299  result.setSize(d_num_rows, other.d_num_cols);
300 
301  // Do the multiplication.
302  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
303  for (int other_col = 0; other_col < other.d_num_cols; ++other_col) {
304  double result_val = 0.0;
305  for (int entry = 0; entry < d_num_cols; ++entry) {
306  result_val += item(this_row, entry)*other.item(entry, other_col);
307  }
308  result.item(this_row, other_col) = result_val;
309  }
310  }
311 }
312 
313 void
315  const Vector& other,
316  Vector& result) const
317 {
318  CAROM_VERIFY(result.distributed() == distributed());
319  CAROM_VERIFY(!other.distributed());
320  CAROM_VERIFY(numColumns() == other.dim());
321 
322  // Size result correctly.
323  result.setSize(d_num_rows);
324 
325  // Do the multiplication.
326  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
327  double result_val = 0.0;
328  for (int entry = 0; entry < d_num_cols; ++entry) {
329  result_val += item(this_row, entry)*other.item(entry);
330  }
331  result.item(this_row) = result_val;
332  }
333 }
334 
335 void
337  int this_row,
338  const Vector& other,
339  Vector& result) const
340 {
341  // TODO: change the CAROM_ASSERTs to CAROM_VERIFYs or generalize and eliminate the checks.
342  CAROM_ASSERT(!result.distributed());
343  CAROM_ASSERT(!distributed());
344  CAROM_VERIFY(!other.distributed());
345  CAROM_VERIFY(numColumns() == other.dim());
346 
347  // Do the multiplication.
348  for (int entry = 0; entry < d_num_cols; ++entry) {
349  result.item(entry) = item(this_row, entry)*other.item(entry);
350  }
351 }
352 
353 void
355  int this_row,
356  Vector& other) const
357 {
358  CAROM_ASSERT(!distributed());
359  CAROM_VERIFY(!other.distributed());
360  CAROM_VERIFY(numColumns() == other.dim());
361 
362  // Do the multiplication.
363  for (int entry = 0; entry < d_num_cols; ++entry) {
364  other.item(entry) *= item(this_row, entry);
365  }
366 }
367 
368 void
370  const Matrix& other,
371  Matrix& result) const
372 {
373  CAROM_VERIFY(result.distributed() == distributed());
374  CAROM_VERIFY(distributed() == other.distributed());
375  CAROM_VERIFY(numRows() == other.numRows());
376  CAROM_VERIFY(numColumns() == other.numColumns());
377 
378  // Size result correctly.
379  result.setSize(d_num_rows, other.d_num_cols);
380 
381  // Do the multiplication.
382  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
383  for (int other_col = 0; other_col < d_num_cols; ++other_col) {
384  result.item(this_row, other_col) = item(this_row,
385  other_col) * other.item(this_row, other_col);
386  }
387  }
388 }
389 
390 void
392  Matrix& result) const
393 {
394  // Size result correctly.
395  result.setSize(d_num_rows, d_num_cols);
396 
397  // Do the pointwise square.
398  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
399  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
400  const double a = item(this_row, this_col);
401  result.item(this_row, this_col) = a * a;
402  }
403  }
404 }
405 
406 void
408  Vector& a,
409  const Vector& b,
410  double c) const
411 {
412  CAROM_VERIFY(a.distributed() == distributed());
413  CAROM_VERIFY(!b.distributed());
414  CAROM_VERIFY(numColumns() == b.dim());
415  CAROM_VERIFY(numRows() == a.dim());
416 
417  for (int this_row = 0; this_row < d_num_rows; ++this_row) {
418  double tmp = 0.0;
419  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
420  tmp += item(this_row, this_col)*b.item(this_col);
421  }
422  a.item(this_row) += tmp*c;
423  }
424 }
425 
426 void
428  const Matrix& other,
429  Matrix& result) const
430 {
431  CAROM_VERIFY(!result.distributed());
432  CAROM_VERIFY(distributed() == other.distributed());
433  CAROM_VERIFY(numRows() == other.numRows());
434 
435  // Size result correctly.
436  result.setSize(d_num_cols, other.d_num_cols);
437 
438  // Do the multiplication.
439  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
440  for (int other_col = 0; other_col < other.d_num_cols; ++other_col) {
441  double result_val = 0.0;
442  for (int entry = 0; entry < d_num_rows; ++entry) {
443  result_val += item(entry, this_col)*other.item(entry, other_col);
444  }
445  result.item(this_col, other_col) = result_val;
446  }
447  }
448  if (d_distributed && d_num_procs > 1) {
449  int new_mat_size = d_num_cols*other.d_num_cols;
450  MPI_Allreduce(MPI_IN_PLACE,
451  &result.item(0, 0),
452  new_mat_size,
453  MPI_DOUBLE,
454  MPI_SUM,
455  MPI_COMM_WORLD);
456  }
457 }
458 
459 void
461  const Vector& other,
462  Vector& result) const
463 {
464  CAROM_VERIFY(!result.distributed());
465  CAROM_VERIFY(distributed() == other.distributed());
466  CAROM_VERIFY(numRows() == other.dim());
467 
468  // If the result has not been allocated then do so. Otherwise size it
469  // correctly.
470  result.setSize(d_num_cols);
471 
472  // Do the multiplication.
473  for (int this_col = 0; this_col < d_num_cols; ++this_col) {
474  double result_val = 0.0;
475  for (int entry = 0; entry < d_num_rows; ++entry) {
476  result_val += item(entry, this_col)*other.item(entry);
477  }
478  result.item(this_col) = result_val;
479  }
480  if (d_distributed && d_num_procs > 1) {
481  MPI_Allreduce(MPI_IN_PLACE,
482  &result.item(0),
483  d_num_cols,
484  MPI_DOUBLE,
485  MPI_SUM,
486  MPI_COMM_WORLD);
487  }
488 }
489 
490 void
491 Matrix::getColumn(int column,
492  Vector& result) const
493 {
494  result.setSize(d_num_rows);
495  for (int i = 0; i < d_num_rows; i++) {
496  result.item(i) = item(i, column);
497  }
498 }
499 
500 void
502  Matrix& result) const
503 {
504  CAROM_VERIFY(!result.distributed() && result.numRows() == numRows() &&
505  result.numColumns() == numColumns());
506  CAROM_VERIFY(!distributed());
507  CAROM_VERIFY(numRows() == numColumns());
508 
509  // Size result correctly.
510  result.setSize(d_num_rows, d_num_cols);
511 
512  // Call lapack routines to do the inversion.
513  // Set up some stuff the lapack routines need.
514  int info;
515  int mtx_size = d_num_rows;
516  int lwork = mtx_size*mtx_size;
517  int* ipiv = new int [mtx_size];
518  double* work = new double [lwork];
519  // To use lapack we need a column major representation of this which is
520  // essentially the transform of this. Use result for this representation.
521  for (int row = 0; row < mtx_size; ++row) {
522  for (int col = 0; col < mtx_size; ++col) {
523  result.item(col, row) = item(row, col);
524  }
525  }
526  // Now call lapack to do the inversion.
527  dgetrf(&mtx_size, &mtx_size, result.d_mat, &mtx_size, ipiv, &info);
528  CAROM_VERIFY(info == 0);
529 
530  dgetri(&mtx_size, result.d_mat, &mtx_size, ipiv, work, &lwork, &info);
531  CAROM_VERIFY(info == 0);
532 
533  // Result now has the inverse in a column major representation. Put it
534  // into row major order.
535  for (int row = 0; row < mtx_size; ++row) {
536  for (int col = row+1; col < mtx_size; ++col) {
537  double tmp = result.item(row, col);
538  result.item(row, col) = result.item(col, row);
539  result.item(col, row) = tmp;
540  }
541  }
542 
543  delete [] ipiv;
544  delete [] work;
545 }
546 
548 {
549  CAROM_VERIFY(!distributed() && numRows() == numColumns()); // Avoid resizing
550  const int n = numRows();
551  for (int i=0; i<n-1; ++i)
552  {
553  for (int j=i+1; j<n; ++j)
554  {
555  const double t = d_mat[i*n+j];
556  d_mat[i*n+j] = d_mat[j*n+i];
557  d_mat[j*n+i] = t;
558  }
559  }
560 }
561 
562 void
564 {
565  CAROM_VERIFY(!distributed());
566  CAROM_VERIFY(numRows() == numColumns());
567 
568  // Call lapack routines to do the inversion.
569  // Set up some stuff the lapack routines need.
570  int info;
571  int mtx_size = d_num_rows;
572  int lwork = mtx_size*mtx_size;
573  int* ipiv = new int [mtx_size];
574  double* work = new double [lwork];
575  // To use lapack we need a column major representation of this which is
576  // essentially the transform of this.
577  for (int row = 0; row < mtx_size; ++row) {
578  for (int col = row+1; col < mtx_size; ++col) {
579  double tmp = item(row, col);
580  item(row, col) = item(col, row);
581  item(col, row) = tmp;
582  }
583  }
584  // Now call lapack to do the inversion.
585  dgetrf(&mtx_size, &mtx_size, d_mat, &mtx_size, ipiv, &info);
586  CAROM_VERIFY(info == 0);
587 
588  dgetri(&mtx_size, d_mat, &mtx_size, ipiv, work, &lwork, &info);
589  CAROM_VERIFY(info == 0);
590 
591  // This now has its inverse in a column major representation. Put it into
592  // row major representation.
593  for (int row = 0; row < mtx_size; ++row) {
594  for (int col = row+1; col < mtx_size; ++col) {
595  double tmp = item(row, col);
596  item(row, col) = item(col, row);
597  item(col, row) = tmp;
598  }
599  }
600 
601  delete [] ipiv;
602  delete [] work;
603 }
604 
606 {
607  CAROM_VERIFY(!distributed());
608  CAROM_VERIFY(numRows() >= numColumns());
609 
610  if (numRows() == numColumns())
611  {
612  inverse();
613  }
614  else
615  {
616  std::unique_ptr<Matrix> AtA = this->transposeMult(*this);
617 
618  // Directly invert AtA, which is a bad idea if AtA is not small.
619  AtA->inverse();
620 
621  // Pseudoinverse is (AtA)^{-1}*this^T, but we store the transpose of the result in this, namely this*(AtA)^{-T}.
622  Vector row(numColumns(), false);
623  Vector res(numColumns(), false);
624  for (int i=0; i<numRows(); ++i)
625  { // Compute i-th row of this multiplied by (AtA)^{-T}, whose transpose is (AtA)^{-1} times i-th row transposed.
626  for (int j=0; j<numColumns(); ++j)
627  row.item(j) = this->item(i,j);
628 
629  AtA->mult(row, res);
630 
631  // Overwrite i-th row with transpose of result.
632  for (int j=0; j<numColumns(); ++j)
633  this->item(i,j) = res.item(j);
634  }
635  }
636 }
637 
638 void
639 Matrix::print(const char * prefix) const
640 {
641  int my_rank;
642  const bool success = MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
643  CAROM_ASSERT(success);
644 
645  std::string filename_str = prefix + std::to_string(my_rank);
646  const char * filename = filename_str.c_str();
647  FILE * pFile = fopen(filename,"w");
648  for (int row = 0; row < d_num_rows; ++row) {
649  for (int col = 0; col < d_num_cols; ++col) {
650  fprintf(pFile, " %25.20e\t", item(row,col));
651  }
652  fprintf(pFile, "\n");
653  }
654  fclose(pFile);
655 }
656 
657 void
658 Matrix::write(const std::string& base_file_name) const
659 {
660  CAROM_VERIFY(!base_file_name.empty());
661 
662  int mpi_init;
663  MPI_Initialized(&mpi_init);
664  int rank;
665  if (mpi_init) {
666  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
667  }
668  else {
669  rank = 0;
670  }
671 
672  char tmp[100];
673  sprintf(tmp, ".%06d", rank);
674  std::string full_file_name = base_file_name + tmp;
675  HDFDatabase database;
676  database.create(full_file_name);
677 
678  sprintf(tmp, "distributed");
679  database.putInteger(tmp, d_distributed);
680  sprintf(tmp, "num_rows");
681  database.putInteger(tmp, d_num_rows);
682  sprintf(tmp, "num_cols");
683  database.putInteger(tmp, d_num_cols);
684  sprintf(tmp, "data");
685  database.putDoubleArray(tmp, d_mat, d_num_rows*d_num_cols);
686  database.close();
687 }
688 
689 void
690 Matrix::read(const std::string& base_file_name)
691 {
692  CAROM_VERIFY(!base_file_name.empty());
693 
694  int mpi_init;
695  MPI_Initialized(&mpi_init);
696  int rank;
697  if (mpi_init) {
698  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
699  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
700  }
701  else {
702  rank = 0;
703  d_num_procs = 1;
704  }
705 
706  char tmp[100];
707  sprintf(tmp, ".%06d", rank);
708  std::string full_file_name = base_file_name + tmp;
709  HDFDatabase database;
710  database.open(full_file_name, "r");
711 
712  sprintf(tmp, "distributed");
713  int distributed;
714  database.getInteger(tmp, distributed);
715  d_distributed = bool(distributed);
716  int num_rows;
717  sprintf(tmp, "num_rows");
718  database.getInteger(tmp, num_rows);
719  int num_cols;
720  sprintf(tmp, "num_cols");
721  database.getInteger(tmp, num_cols);
722  setSize(num_rows,num_cols);
723  sprintf(tmp, "data");
724  database.getDoubleArray(tmp, d_mat, d_alloc_size);
725  d_owns_data = true;
726  database.close();
727 }
728 
729 void
730 Matrix::local_read(const std::string& base_file_name, int rank)
731 {
732  CAROM_VERIFY(!base_file_name.empty());
733 
734  int mpi_init;
735  MPI_Initialized(&mpi_init);
736  if (mpi_init) {
737  MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
738  }
739  else {
740  d_num_procs = 1;
741  }
742 
743  char tmp[100];
744  sprintf(tmp, ".%06d", rank);
745  std::string full_file_name = base_file_name + tmp;
746  HDFDatabase database;
747  database.open(full_file_name, "r");
748 
749  sprintf(tmp, "distributed");
750  int distributed;
751  database.getInteger(tmp, distributed);
752  d_distributed = bool(distributed);
753  int num_rows;
754  sprintf(tmp, "num_rows");
755  database.getInteger(tmp, num_rows);
756  int num_cols;
757  sprintf(tmp, "num_cols");
758  database.getInteger(tmp, num_cols);
759  setSize(num_rows,num_cols);
760  sprintf(tmp, "data");
761  database.getDoubleArray(tmp, d_mat, d_alloc_size);
762  d_owns_data = true;
763  database.close();
764 }
765 
766 void
767 Matrix::distribute(const int &local_num_rows)
768 {
769  CAROM_VERIFY(!distributed());
770  CAROM_VERIFY(d_owns_data);
771 
772  std::vector<int> row_offsets;
773  int num_total_rows = get_global_offsets(local_num_rows, row_offsets,
774  MPI_COMM_WORLD);
775  CAROM_VERIFY(num_total_rows == d_num_rows);
776  int local_offset = row_offsets[d_rank] * d_num_cols;
777  const int new_size = local_num_rows * d_num_cols;
778 
779  double *d_new_mat = new double [new_size];
780  if (new_size > 0)
781  memcpy(d_new_mat, &d_mat[local_offset], 8 * new_size);
782 
783  delete [] d_mat;
784  d_mat = d_new_mat;
785  d_alloc_size = new_size;
786 
787  d_num_distributed_rows = d_num_rows;
788  d_num_rows = local_num_rows;
789 
790  d_distributed = true;
791 }
792 
793 void
795 {
796  CAROM_VERIFY(distributed());
797  CAROM_VERIFY(d_owns_data);
798 
799  std::vector<int> row_offsets;
800  const int num_total_rows = get_global_offsets(d_num_rows, row_offsets,
801  MPI_COMM_WORLD);
802  CAROM_VERIFY(num_total_rows == d_num_distributed_rows);
803  const int new_size = d_num_distributed_rows * d_num_cols;
804 
805  int *data_offsets = new int[row_offsets.size() - 1];
806  int *data_cnts = new int[row_offsets.size() - 1];
807  for (int k = 0; k < row_offsets.size() - 1; k++)
808  {
809  data_offsets[k] = row_offsets[k] * d_num_cols;
810  data_cnts[k] = (row_offsets[k+1] - row_offsets[k]) * d_num_cols;
811  }
812 
813  double *d_new_mat = new double [new_size] {0.0};
814  CAROM_VERIFY(MPI_Allgatherv(d_mat, d_num_rows * d_num_cols, MPI_DOUBLE,
815  d_new_mat, data_cnts, data_offsets, MPI_DOUBLE,
816  MPI_COMM_WORLD) == MPI_SUCCESS);
817 
818  delete [] d_mat;
819  delete [] data_offsets;
820  delete [] data_cnts;
821  d_mat = d_new_mat;
822  d_alloc_size = new_size;
823 
824  d_num_rows = d_num_distributed_rows;
825 
826  d_distributed = false;
827 }
828 
829 void
830 Matrix::calculateNumDistributedRows() {
831  if (d_distributed && d_num_procs > 1) {
832  int num_total_rows = d_num_rows;
833  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
834  &num_total_rows,
835  1,
836  MPI_INT,
837  MPI_SUM,
838  MPI_COMM_WORLD) == MPI_SUCCESS);
839  d_num_distributed_rows = num_total_rows;
840  }
841  else {
842  d_num_distributed_rows = d_num_rows;
843  }
844 }
845 
846 std::unique_ptr<Matrix> Matrix::qr_factorize() const
847 {
848  std::vector<Matrix*> QR;
849  qr_factorize(false, QR);
850  return std::unique_ptr<Matrix>(QR[0]);
851 }
852 
853 void Matrix::qr_factorize(std::vector<std::unique_ptr<Matrix>> & QR) const
854 {
855  std::vector<Matrix*> QRraw; // Raw pointers
856  qr_factorize(true, QRraw);
857  QR.clear();
858  for (int i=0; i<2; ++i)
859  QR.push_back(std::unique_ptr<Matrix>(QRraw[i]));
860 }
861 
862 void Matrix::qr_factorize(bool computeR,
863  std::vector<Matrix*> & QR) const
864 {
865  const int myid = d_rank;
866  const int ncols = numColumns();
867 
868  std::vector<int> row_offset(d_num_procs + 1);
869  row_offset[d_num_procs] = numDistributedRows();
870  row_offset[myid] = numRows();
871 
872  CAROM_VERIFY(MPI_Allgather(MPI_IN_PLACE,
873  1,
874  MPI_INT,
875  row_offset.data(),
876  1,
877  MPI_INT,
878  MPI_COMM_WORLD) == MPI_SUCCESS);
879  for (int i = d_num_procs - 1; i >= 0; i--) {
880  row_offset[i] = row_offset[i + 1] - row_offset[i];
881  }
882 
883  CAROM_VERIFY(row_offset[0] == 0);
884 
885  SLPK_Matrix slpk;
886 
887  int nrow_blocks = d_num_procs;
888  int ncol_blocks = 1;
889 
890  int blocksize = row_offset[d_num_procs] / d_num_procs;
891  if (row_offset[d_num_procs] % d_num_procs != 0) blocksize += 1;
892  initialize_matrix(&slpk, ncols, numDistributedRows(),
893  ncol_blocks, nrow_blocks, ncols, blocksize);
894  for (int rank = 0; rank < d_num_procs; ++rank) {
895  scatter_block(&slpk, 1, row_offset[rank] + 1,
896  getData(), ncols,
897  row_offset[rank + 1] - row_offset[rank], rank);
898  }
899 
900  QRManager QRmgr;
901  qr_init(&QRmgr, &slpk);
902  lqfactorize(&QRmgr);
903 
904  Matrix *R_matrix = nullptr;
905  if (computeR)
906  {
907  // Obtain L from the distributed overwritten matrix QRmgr.A
908  Matrix* L_dist_matrix = new Matrix(row_offset[myid + 1] - row_offset[myid],
909  ncols, distributed());
910 
911  for (int rank = 0; rank < d_num_procs; ++rank) {
912  gather_block(&L_dist_matrix->item(0, 0), QRmgr.A, 1,
913  row_offset[rank] + 1, ncols,
914  row_offset[rank + 1] - row_offset[rank], rank);
915  }
916 
917  R_matrix = new Matrix(ncols, ncols, false);
918  if (myid == 0)
919  {
920  const int numLocalRowsR = std::min(ncols, row_offset[myid + 1]);
921  for (int i=0; i<numLocalRowsR; ++i)
922  {
923  for (int j=i; j<ncols; ++j)
924  (*R_matrix)(i,j) = (*L_dist_matrix)(i,j);
925 
926  for (int j=0; j<i; ++j)
927  (*R_matrix)(i,j) = 0.0;
928  }
929  }
930 
931  // Gather any rows of R to root from other ranks.
932  const int maxLocalRowsR = std::min(ncols, row_offset[myid + 1]);
933  const int numLocalRowsR = std::max(0, maxLocalRowsR - row_offset[myid]);
934 
935  std::vector<int> allNumRowsR(d_num_procs);
936  MPI_Gather(&numLocalRowsR, 1, MPI_INT, allNumRowsR.data(), 1, MPI_INT,
937  0, MPI_COMM_WORLD);
938 
939  std::vector<double> localRowsR;
940 
941  if (myid > 0)
942  {
943  localRowsR.resize(numLocalRowsR * ncols);
944  for (int i=0; i<numLocalRowsR; ++i)
945  {
946  const int row = row_offset[myid] + i;
947  for (int j=row; j<ncols; ++j)
948  localRowsR[(i * ncols) + j] = (*L_dist_matrix)(i,j);
949 
950  for (int j=0; j<row; ++j)
951  localRowsR[(i * ncols) + j] = 0.0;
952  }
953  }
954  else
955  localRowsR.resize(1); // Just to have a valid pointer from data() call.
956 
957  delete L_dist_matrix;
958 
959  std::vector<double> recvRowsR;
960  if (myid == 0)
961  recvRowsR.resize((ncols - numLocalRowsR) * ncols);
962 
963  if (recvRowsR.size() == 0)
964  recvRowsR.resize(1); // Just to have a valid pointer from data() call.
965 
966  std::vector<int> disp(d_num_procs);
967  disp[0] = 0;
968  allNumRowsR[0] = 0;
969  for (int i=1; i<d_num_procs; ++i)
970  {
971  allNumRowsR[i] *= ncols;
972  disp[i] = disp[i - 1] + allNumRowsR[i-1];
973  }
974 
975  MPI_Gatherv(localRowsR.data(), myid == 0 ? 0 : numLocalRowsR * ncols,
976  MPI_DOUBLE, recvRowsR.data(),
977  allNumRowsR.data(), disp.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
978 
979  if (myid == 0)
980  {
981  for (int i=numLocalRowsR; i<ncols; ++i)
982  {
983  for (int j=0; j<ncols; ++j)
984  (*R_matrix)(i,j) = recvRowsR[(i - numLocalRowsR) * ncols + j];
985  }
986  }
987 
988  MPI_Bcast(R_matrix->getData(), ncols * ncols, MPI_DOUBLE, 0,
989  MPI_COMM_WORLD);
990  }
991 
992  // Manipulate QRmgr.A to get elementary household reflectors.
993  for (int i = row_offset[myid]; i < ncols; i++) {
994  for (int j = 0; j < i - row_offset[myid] && j < row_offset[myid + 1]; j++) {
995  QRmgr.A->mdata[j * QRmgr.A->mm + i] = 0;
996  }
997  if (i < row_offset[myid + 1]) {
998  QRmgr.A->mdata[(i - row_offset[myid]) * QRmgr.A->mm + i] = 1;
999  }
1000  }
1001 
1002  // Obtain Q
1003  lqcompute(&QRmgr);
1004  Matrix *qr_factorized_matrix = new Matrix(row_offset[myid + 1] -
1005  row_offset[myid],
1006  ncols, distributed());
1007 
1008  for (int rank = 0; rank < d_num_procs; ++rank) {
1009  gather_block(&qr_factorized_matrix->item(0, 0), QRmgr.A, 1,
1010  row_offset[rank] + 1, ncols,
1011  row_offset[rank + 1] - row_offset[rank], rank);
1012  }
1013 
1014  free_matrix_data(&slpk);
1015  release_context(&slpk);
1016 
1017  free(QRmgr.tau);
1018  free(QRmgr.ipiv);
1019 
1020  QR.resize(2);
1021  QR[0] = qr_factorized_matrix;
1022  QR[1] = R_matrix;
1023 }
1024 
1025 void
1027  int* row_pivot_owner,
1028  int pivots_requested) const
1029 {
1030  if(!distributed()) {
1031  return qrcp_pivots_transpose_serial(row_pivot,
1032  row_pivot_owner,
1033  pivots_requested);
1034  }
1035  else {
1036  return qrcp_pivots_transpose_distributed(row_pivot,
1037  row_pivot_owner,
1038  pivots_requested);
1039  }
1040 }
1041 
1042 void
1043 Matrix::qrcp_pivots_transpose_serial(int* row_pivot,
1044  int* row_pivot_owner,
1045  int pivots_requested) const
1046 {
1047  // This method assumes this matrix is serial
1048  CAROM_VERIFY(!distributed());
1049 
1050  // Number of pivots requested can't exceed the number of rows of the
1051  // matrix
1052  CAROM_VERIFY(pivots_requested <= numRows());
1053  CAROM_VERIFY(pivots_requested > 0);
1054 
1055  // Make sure arrays are allocated before entry; this method does not
1056  // own the input pointers
1057  CAROM_VERIFY(row_pivot != NULL);
1058  CAROM_VERIFY(row_pivot_owner != NULL);
1059 
1060  // Get dimensions of transpose of matrix
1061  int num_rows_of_transpose = numColumns();
1062  int num_cols_of_transpose = numRows();
1063 
1064  // LAPACK routines tend to overwrite their inputs, but we'd like to
1065  // keep the basis matrix and use it in later computations, so copy
1066  // the basis matrix here.
1067  Matrix scratch(*this);
1068 
1069  // Allocate work arrays; work array for QR must be at least 1 plus 3
1070  // times the number of columns of its matrix input. This algorithm
1071  // applies QR to transposed basis matrix, so the applicable
1072  // dimension is the number of rows of the basis matrix. It's
1073  // possible to get better performance by computing the optimal block
1074  // size and then using that value to size the work array; see the
1075  // LAPACK source code and documentation for details.
1076  int lwork = 20 * num_cols_of_transpose + 1;
1077  double* work = new double[lwork];
1078  double* tau = new double[std::min(num_rows_of_transpose,
1079  num_cols_of_transpose)];
1080  int* pivot = new int[num_cols_of_transpose] ();
1081  int info;
1082 
1083  // Compute the QR decomposition with column pivots of the transpose
1084  // of this matrix by abusing the fact that the C++ memory model is
1085  // row-major format, which is the transpose of the Fortran memory
1086  // model (which is column-major). Passing the row-major data
1087  // looks like an in-place transposition to Fortran.
1088  dgeqp3(&num_rows_of_transpose,
1089  &num_cols_of_transpose,
1090  scratch.d_mat,
1091  &num_rows_of_transpose,
1092  pivot,
1093  tau,
1094  work,
1095  &lwork,
1096  &info);
1097 
1098  // Fail if error in LAPACK routine.
1099  CAROM_VERIFY(info == 0);
1100 
1101  // Assume communicator is MPI_COMM_WORLD and get rank of this
1102  // process
1103  int is_mpi_initialized, is_mpi_finalized;
1104  CAROM_VERIFY(MPI_Initialized(&is_mpi_initialized) == MPI_SUCCESS);
1105  CAROM_VERIFY(MPI_Finalized(&is_mpi_finalized) == MPI_SUCCESS);
1106  int my_rank = 0;
1107  if(is_mpi_initialized && !is_mpi_finalized) {
1108  const MPI_Comm my_comm = MPI_COMM_WORLD;
1109  CAROM_VERIFY(MPI_Comm_rank(my_comm, &my_rank) == MPI_SUCCESS);
1110  }
1111 
1112  // Copy over pivots and subtract one to convert them from a
1113  // Fortran-based indexing convention (first element of 1-D array by
1114  // default corresponds to index of 1, though this convention can be
1115  // overridden) to a C-based indexing convention (first element of
1116  // 1-D array corresponds to index of 0).
1117  for (int i = 0; i < pivots_requested; i++) {
1118  row_pivot[i] = pivot[i] - 1;
1119  row_pivot_owner[i] = my_rank;
1120  }
1121 
1122  // Free arrays
1123  delete [] work;
1124  delete [] tau;
1125  delete [] pivot;
1126 }
1127 
1128 void
1129 Matrix::qrcp_pivots_transpose_distributed(int* row_pivot,
1130  int* row_pivot_owner,
1131  int pivots_requested)
1132 const
1133 {
1134  // Check if distributed; otherwise, use serial implementation
1135  CAROM_VERIFY(distributed());
1136 
1137 #ifdef CAROM_HAS_ELEMENTAL
1138  // Shim to design interface; not implemented yet
1139 
1140  // Elemental implementation
1141  return qrcp_pivots_transpose_distributed_elemental
1142  (row_pivot, row_pivot_owner, pivots_requested);
1143 #else
1144  qrcp_pivots_transpose_distributed_scalapack
1145  (row_pivot, row_pivot_owner, pivots_requested);
1146 #endif
1147 }
1148 
1149 void
1150 Matrix::qrcp_pivots_transpose_distributed_scalapack
1151 (int* row_pivot, int* row_pivot_owner, int pivots_requested) const
1152 {
1153  // Check if distributed; otherwise, use serial implementation
1154  CAROM_VERIFY(distributed());
1155 
1156  int num_total_rows = d_num_rows;
1157  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
1158  &num_total_rows,
1159  1,
1160  MPI_INT,
1161  MPI_SUM,
1162  MPI_COMM_WORLD) == MPI_SUCCESS);
1163 
1164  int *row_offset = new int[d_num_procs + 1];
1165  row_offset[d_num_procs] = num_total_rows;
1166 
1167  int my_rank;
1168  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
1169 
1170  row_offset[my_rank] = d_num_rows;
1171 
1172  CAROM_VERIFY(MPI_Allgather(MPI_IN_PLACE,
1173  1,
1174  MPI_INT,
1175  row_offset,
1176  1,
1177  MPI_INT,
1178  MPI_COMM_WORLD) == MPI_SUCCESS);
1179 
1180  for (int i = d_num_procs - 1; i >= 0; i--) {
1181  row_offset[i] = row_offset[i + 1] - row_offset[i];
1182  }
1183  CAROM_VERIFY(row_offset[0] == 0);
1184 
1185  SLPK_Matrix slpk;
1186  int blocksize = row_offset[d_num_procs] / d_num_procs;
1187  if (row_offset[d_num_procs] % d_num_procs != 0) blocksize += 1;
1188 
1189  initialize_matrix(&slpk, d_num_cols, row_offset[d_num_procs], 1, d_num_procs, 1,
1190  blocksize); // transposed
1191 
1192  CAROM_VERIFY(d_num_cols <=
1193  pivots_requested); // Otherwise, take a submatrix for the QR (not implemented).
1194 
1195  for (int rank = 0; rank < d_num_procs; ++rank) {
1196  // Take the row-major data in d_mat and put it in a transposed column-major array in slpk
1197  scatter_block(&slpk, 1, row_offset[rank]+1,
1198  d_mat,
1199  d_num_cols, row_offset[rank+1] - row_offset[rank],
1200  rank);
1201  }
1202 
1203  QRManager QRmgr;
1204  qr_init(&QRmgr, &slpk);
1205  qrfactorize(&QRmgr);
1206 
1207  // Just gather the pivots to root and discard the factorization
1208  CAROM_VERIFY(0 < pivots_requested && pivots_requested <= QRmgr.ipivSize);
1209  CAROM_VERIFY(pivots_requested <= std::max(d_num_rows, d_num_cols));
1210 
1211  const int scount = std::max(0, std::min(pivots_requested,
1212  row_offset[my_rank+1]) - row_offset[my_rank]);
1213  int *mypivots = (scount > 0) ? new int[scount] : NULL;
1214 
1215  for (int i=0; i<scount; ++i)
1216  mypivots[i] = QRmgr.ipiv[i]-1; // Make it 0-based
1217 
1218  int *rcount = (my_rank == 0) ? new int[d_num_procs] : NULL;
1219  int *rdisp = (my_rank == 0) ? new int[d_num_procs] : NULL;
1220 
1221  MPI_Gather(&scount, 1, MPI_INT, rcount, 1, MPI_INT, 0, MPI_COMM_WORLD);
1222 
1223  if (my_rank == 0)
1224  {
1225  rdisp[0] = 0;
1226  for (int i = 1; i<d_num_procs; ++i)
1227  rdisp[i] = rdisp[i-1] + rcount[i-1];
1228 
1229  CAROM_VERIFY(rdisp[d_num_procs-1] + rcount[d_num_procs-1] == pivots_requested);
1230  }
1231 
1232  MPI_Gatherv(mypivots, scount, MPI_INT, row_pivot, rcount, rdisp, MPI_INT, 0,
1233  MPI_COMM_WORLD);
1234 
1235  delete [] mypivots;
1236 
1237  if (my_rank == 0)
1238  {
1239  for (int i=0; i<pivots_requested; ++i)
1240  {
1241  row_pivot_owner[i] = -1;
1242  for (int j=d_num_procs-1; j>=0; --j)
1243  {
1244  if (row_offset[j] <= row_pivot[i])
1245  {
1246  row_pivot_owner[i] = j;
1247  break;
1248  }
1249  }
1250 
1251  // Note that row_pivot[i] is a global index.
1252  CAROM_VERIFY(row_pivot_owner[i] >= 0);
1253  CAROM_VERIFY(row_offset[row_pivot_owner[i]] <= row_pivot[i]
1254  && row_pivot[i] < row_offset[row_pivot_owner[i]+1]);
1255  }
1256  }
1257  else
1258  {
1259  for (int i=0; i<scount; ++i)
1260  row_pivot[i] = QRmgr.ipiv[i]-1; // Make it 0-based
1261  }
1262 
1263  free_matrix_data(&slpk);
1264  release_context(&slpk);
1265 
1266  free(QRmgr.tau);
1267  free(QRmgr.ipiv);
1268 
1269  delete [] rcount;
1270  delete [] rdisp;
1271  delete [] row_offset;
1272 }
1273 
1274 void
1275 Matrix::qrcp_pivots_transpose_distributed_elemental
1276 (int* row_pivot, int* row_pivot_owner, int pivots_requested)
1277 const
1278 {
1279 #ifdef CAROM_HAS_ELEMENTAL
1280  // Check if distributed; otherwise, use serial implementation
1281  CAROM_VERIFY(distributed());
1282 
1283  // Check if balanced
1284  if (balanced()) {
1285  qrcp_pivots_transpose_distributed_elemental_balanced
1286  (row_pivot, row_pivot_owner, pivots_requested);
1287  }
1288  else {
1289  qrcp_pivots_transpose_distributed_elemental_unbalanced
1290  (row_pivot, row_pivot_owner, pivots_requested);
1291  }
1292 #else
1293  CAROM_VERIFY(false);
1294 #endif
1295 }
1296 
1297 void
1298 Matrix::qrcp_pivots_transpose_distributed_elemental_balanced
1299 (int* row_pivot, int* row_pivot_owner, int pivots_requested)
1300 const
1301 {
1302 #ifdef CAROM_HAS_ELEMENTAL
1303  // Compute pivots redundantly across all processes using the QRCP
1304  // from the distributed dense linear algebra library Elemental.
1305 
1306  // The following assumptions are made in this implementation just to
1307  // get a version up and running:
1308  //
1309  // (1) this->balanced() == true; // The matrix is balanced
1310  //
1311  // (2) Process 0 is the master rank of the object
1312  //
1313  // (3) Matrix rows are distributed block-cyclically over all processes
1314  // starting on process zero with row 0, in increasing order of row
1315  // index (all elements of a given row are still stored on the same
1316  // process)
1317  //
1318  // (4) This Matrix is distributed over the MPI_COMM_WORLD
1319  // communicator
1320  //
1321  // Some of these assumptions can be relaxed if the Matrix object
1322  // stores more information.
1323 
1324  // Check if distributed and balanced
1325  CAROM_VERIFY(distributed() && balanced());
1326 
1327  // Make sure arrays are allocated before entry; this method does not
1328  // own the input pointers
1329  CAROM_VERIFY(row_pivot != NULL);
1330  CAROM_VERIFY(row_pivot_owner != NULL);
1331 
1332  // Compute total number of rows to set global sizes of matrix
1333  const MPI_Comm comm = MPI_COMM_WORLD;
1334  const int master_rank = 0;
1335 
1336  int num_total_rows = d_num_rows;
1337  const int reduce_count = 1;
1338  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
1339  &num_total_rows,
1340  reduce_count,
1341  MPI_INT,
1342  MPI_SUM,
1343  comm) == MPI_SUCCESS);
1344 
1345  // Number of pivots requested can't exceed the total number of rows
1346  // of the matrix
1347  CAROM_VERIFY(pivots_requested <= num_total_rows);
1348  CAROM_VERIFY(pivots_requested > 0);
1349 
1350  // To compute a column-pivoted QRCP using Elemental, we need to
1351  // first get the data into datatypes Elemental can operate on.
1352 
1353  // Construct a process grid on the communicator that is 1
1354  // by (# processes), in column-major order; each process in the grid
1355  // will own its own rows of the matrix
1356  const El::Grid grid(comm, 1);
1357 
1358  // The row communicator in the grid should have the same number
1359  // of processes as the comm owned by the Matrix
1360  CAROM_VERIFY(El::mpi::Size(grid.RowComm()) == d_num_procs);
1361 
1362  // Instantiate the transposed matrix; Elemental calls the number of
1363  // rows the "height" of the matrix, and the number of columns the
1364  // "width" of the matrix
1365  El::Int height = static_cast<El::Int>(numColumns());
1366  El::Int width = static_cast<El::Int>(numRows());
1367  El::Int root = static_cast<El::Int>(master_rank);
1368  El::DistMatrix<double> scratch(height, width, grid, root);
1369 
1370  // Columns of the matrix should be distributed round-robin; each
1371  // element of a column should be on the same process. The
1372  // distribution should satisfy two invariants:
1373  //
1374  // (1) Each "global row" should have the same rank in its column
1375  // communicator
1376  //
1377  // (2) For the scratch matrix global column index j, the process (on
1378  // the row communicator) that owns j should be process rank (j %
1379  // d_num_procs).
1380  //
1381  CAROM_VERIFY(scratch.RowOwner(0) == scratch.RowOwner(1));
1382  int my_rank;
1383  CAROM_VERIFY(MPI_Comm_rank(comm, &my_rank) == MPI_SUCCESS);
1384  El::Int rank_as_row = static_cast<El::Int>(my_rank);
1385  CAROM_VERIFY(scratch.ColOwner(rank_as_row) == rank_as_row);
1386 
1387  // Set up work matrices
1388  El::DistMatrix<double> householder_scalars(grid);
1389  El::DistMatrix<double> diagonal(grid);
1390  El::DistPermutation perm(grid);
1391  El::QRCtrl<double> ctrl;
1392 
1393  // Build the copy of the matrix element by element, mapping
1394  // local indices to global indices. The Elemental::DistMatrix indices
1395  // are the transpose of the Matrix indices
1396 
1397  // Elemental assigns elements of a matrix in a block-cyclic fashion
1398  // based on the size (specifically, the height and width) of the
1399  // El::Grid object; see
1400  // http://libelemental.org/documentation/dev/core/dist_matrix/Element/MC_MR.html
1401  // for details on the default element distribution, which is what
1402  // this code uses.
1403  //
1404  // If the matrix is balanced, then the matrix rows (i.e., columns of
1405  // the scratch, transpose matrix) should conform to the row
1406  // distribution of the Elemental process grid, and additional
1407  // communication (beyond possibly in local to global index mapping
1408  // queries) is not needed to assign elements in the proper places
1409  // because the matrix is already load-balanced. Furthermore, all of
1410  // the elements of a row of this matrix (column of scratch,
1411  // transpose matrix) are on a single process. Since the matrix rows
1412  // (columns of scratch) don't have to be redistributed, we can use
1413  // Elemental's native (global <--> local) indexing to construct a
1414  // map between local indices on a given process and global indices
1415  // of the distributed matrix, and Elemental will also give us the
1416  // correct process rank owning any given column of the scratch
1417  // matrix.
1418  for (int row = 0; row < d_num_rows; row++) {
1419  El::Int el_loc_col = static_cast<El::Int>(row);
1420  El::Int el_global_col = scratch.GlobalCol(el_loc_col);
1421  for (int col = 0; col < d_num_cols; col++) {
1422  El::Int el_loc_row = static_cast<El::Int>(col);
1423  El::Int el_global_row = scratch.GlobalRow(el_loc_row);
1424  scratch.Set(el_global_row, el_global_col, this->item(row, col));
1425  }
1426  }
1427 
1428  // After transferring the data over to Elemental's native data
1429  // types, we compute the QRCP.
1430  El::QR(scratch, householder_scalars, diagonal, perm); // add ctrl if needed
1431 
1432  // Then, we transfer the pivots into the pivot array
1433  // stored redundantly on each process.
1434  for (size_t i = 0; i < pivots_requested; i++) {
1435  El::Int el_i = static_cast<El::Int>(i);
1436  El::Int el_perm_i = perm.Image(el_i);
1437 
1438  // The permutation is computed in terms of global indices, so
1439  // we need to compute the local column pivot of the transpose;
1440  // this will be a row pivot
1441  El::Int el_loc_i = scratch.LocalCol(el_perm_i);
1442  int loc_i = static_cast<int>(el_loc_i);
1443  row_pivot[i] = loc_i;
1444 
1445  // The global index of the permutation can also be used to figure
1446  // out which process owns that pivot row, because this process is
1447  // also the process that owns that global column of the scratch
1448  // matrix
1449  El::Int el_owner = scratch.ColOwner(el_perm_i);
1450  int owner = static_cast<int>(el_owner);
1451  row_pivot_owner[i] = owner;
1452  }
1453 #else
1454  CAROM_VERIFY(false);
1455 #endif
1456 }
1457 
1458 void
1459 Matrix::qrcp_pivots_transpose_distributed_elemental_unbalanced
1460 (int* row_pivot, int* row_pivot_owner, int pivots_requested)
1461 const
1462 {
1463 #ifdef CAROM_HAS_ELEMENTAL
1464  // Compute pivots redundantly across all processes using the QRCP
1465  // from the distributed dense linear algebra library Elemental.
1466 
1467  // The following assumptions are made in this implementation just to
1468  // get a version up and running:
1469  //
1470  // (1) Process 0 is the master rank of the object
1471  //
1472  // (2) This Matrix is distributed over the MPI_COMM_WORLD
1473  // communicator
1474  //
1475  // Some of these assumptions can be relaxed if the Matrix object
1476  // stores more information.
1477 
1478  // Check if distributed and unbalanced
1479  CAROM_VERIFY(distributed() && !balanced());
1480 
1481  // Make sure arrays are allocated before entry; this method does not
1482  // own the input pointers
1483  CAROM_VERIFY(row_pivot != NULL);
1484  CAROM_VERIFY(row_pivot_owner != NULL);
1485 
1486  // Compute total number of rows to set global sizes of matrix
1487  const MPI_Comm comm = MPI_COMM_WORLD;
1488  const int master_rank = 0;
1489 
1490  int num_total_rows = d_num_rows;
1491  const int reduce_count = 1;
1492  CAROM_VERIFY(MPI_Allreduce(MPI_IN_PLACE,
1493  &num_total_rows,
1494  reduce_count,
1495  MPI_INT,
1496  MPI_SUM,
1497  comm) == MPI_SUCCESS);
1498 
1499  // Number of pivots requested can't exceed the total number of rows
1500  // of the matrix
1501  CAROM_VERIFY(pivots_requested <= num_total_rows);
1502  CAROM_VERIFY(pivots_requested > 0);
1503 
1504  // To compute a column-pivoted QRCP using Elemental, we need to
1505  // first get the data into datatypes Elemental can operate on.
1506 
1507  // Construct a process grid on the communicator that is 1
1508  // by (# processes), in column-major order; each process in the grid
1509  // will own its own rows of the matrix
1510  const El::Grid grid(comm, 1);
1511 
1512  // The row communicator in the grid should have the same number
1513  // of processes as the comm owned by the Matrix
1514  CAROM_VERIFY(El::mpi::Size(grid.RowComm()) == d_num_procs);
1515 
1516  // Instantiate the transposed matrix; Elemental calls the number of
1517  // rows the "height" of the matrix, and the number of columns the
1518  // "width" of the matrix
1519  El::Int height = static_cast<El::Int>(numColumns());
1520  El::Int width = static_cast<El::Int>(numRows());
1521  El::Int root = static_cast<El::Int>(master_rank);
1522  El::DistMatrix<double> scratch(height, width, grid, root);
1523 
1524  // Columns of the matrix should be distributed round-robin; each
1525  // element of a column should be on the same process. The
1526  // distribution should satisfy two invariants:
1527  //
1528  // (1) Each "global row" should have the same rank in its column
1529  // communicator
1530  //
1531  // (2) For the scratch matrix global column index j, the process (on
1532  // the row communicator) that owns j should be process rank (j %
1533  // d_num_procs).
1534  //
1535  CAROM_VERIFY(scratch.RowOwner(0) == scratch.RowOwner(1));
1536  int my_rank;
1537  CAROM_VERIFY(MPI_Comm_rank(comm, &my_rank) == MPI_SUCCESS);
1538  El::Int rank_as_row = static_cast<El::Int>(my_rank);
1539  CAROM_VERIFY(scratch.ColOwner(rank_as_row) == rank_as_row);
1540 
1541  // Set up work matrices
1542  El::DistMatrix<double> householder_scalars(grid);
1543  El::DistMatrix<double> diagonal(grid);
1544  El::DistPermutation perm(grid);
1545  El::QRCtrl<double> ctrl;
1546 
1547  // Build the copy of the matrix element by element, mapping local
1548  // indices to global indices. The Elemental::DistMatrix indices are
1549  // the transpose of the Matrix indices. The El::Grid object should
1550  // be constructed so that each column of the scratch matrix is owned
1551  // by a single process.
1552  //
1553  // If the matrix is unbalanced, matrix elements need to be
1554  // redistributed among processes; then, for the purposes of
1555  // computing the QR decomposition only, we redistribute matrix
1556  // elements in the scratch matrix. First, we compute global to
1557  // (process rank, local) index map. The mapping chosen is not
1558  // terribly performant -- it may do more data movement than
1559  // necessary when assigning matrix elements -- but is easy to
1560  // implement.
1561 
1562  int *row_offset = new int[d_num_procs + 1];
1563  row_offset[d_num_procs + 1] = num_total_rows;
1564 
1565  row_offset[my_rank] = d_num_rows;
1566  const int send_recv_count = 1;
1567  CAROM_VERIFY(MPI_Allgather(MPI_IN_PLACE,
1568  send_recv_count,
1569  MPI_INT,
1570  row_offset,
1571  send_recv_count,
1572  MPI_INT,
1573  comm) == MPI_SUCCESS);
1574 
1575  for (int i = d_num_procs - 1; i >= 0; i--) {
1576  row_offset[i] = row_offset[i + 1] - row_offset[i];
1577  }
1578  CAROM_VERIFY(row_offset[0] == 0);
1579 
1580  // Use the computed (local <--> global) index map to assign elements
1581  // to the scratch matrix
1582  for (int row = row_offset[my_rank]; row < row_offset[my_rank + 1]; row++) {
1583  El::Int el_loc_col = static_cast<El::Int>(row - row_offset[my_rank]);
1584  El::Int el_global_col = scratch.GlobalCol(el_loc_col);
1585  for (int col = 0; col < d_num_cols; col++) {
1586  El::Int el_loc_row = static_cast<El::Int>(col);
1587  El::Int el_global_row = scratch.GlobalRow(el_loc_row);
1588  scratch.Set(el_global_row, el_global_col, this->item(row, col));
1589  }
1590  }
1591 
1592  // After transferring the data over to Elemental's native data
1593  // types, we compute the QRCP.
1594  El::QR(scratch, householder_scalars, diagonal, perm); // add ctrl if needed
1595 
1596  // Then, we transfer the pivots into the pivot array
1597  // stored redundantly on each process.
1598  for (size_t i = 0; i < pivots_requested; i++) {
1599  El::Int el_i = static_cast<El::Int>(i);
1600  El::Int el_perm_i = perm.Image(el_i);
1601 
1602  // The permutation is computed in terms of global row indices of
1603  // the Matrix, so we need to compute the rank that owns the column
1604  // of scratch (remember: scratch is the transpose of this Matrix)
1605  // corresponding to this global index
1606  int global_row_index = static_cast<int>(el_perm_i);
1607  int rank;
1608  for (rank = 0; rank < d_num_procs; rank++) {
1609  bool is_at_or_above_lower_bound = (global_row_index >= row_offset[rank]);
1610  bool is_below_upper_bound = (global_row_index < row_offset[rank + 1]);
1611  if (is_at_or_above_lower_bound && is_below_upper_bound) {
1612  row_pivot_owner[i] = rank;
1613  break;
1614  }
1615  }
1616 
1617  // The local row index is then the global row index minus the
1618  // row_offset computed above in the simple (local <--> global) index
1619  // map computed above
1620  row_pivot[i] = global_row_index - row_offset[rank];
1621  }
1622 
1623  // Free arrays
1624  delete [] row_offset;
1625 #else
1626  CAROM_VERIFY(false);
1627 #endif
1628 }
1629 
1630 void
1631 Matrix::orthogonalize(bool double_pass, double zero_tol)
1632 {
1633  int const num_passes = double_pass ? 2 : 1;
1634 
1635  for (int work = 0; work < d_num_cols; ++work)
1636  {
1637  // Orthogonalize the column (twice if double_pass == true).
1638  for (int k = 0; k < num_passes; k++)
1639  {
1640  for (int col = 0; col < work; ++col)
1641  {
1642  double factor = 0.0;
1643 
1644  for (int i = 0; i < d_num_rows; ++i)
1645  factor += item(i, col) * item(i, work);
1646 
1647  if (d_distributed && d_num_procs > 1)
1648  {
1649  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &factor, 1,
1650  MPI_DOUBLE, MPI_SUM,
1651  MPI_COMM_WORLD)
1652  == MPI_SUCCESS );
1653  }
1654  for (int i = 0; i < d_num_rows; ++i)
1655  item(i, work) -= factor * item(i, col);
1656  }
1657  }
1658 
1659  // Normalize the column.
1660  double norm = 0.0;
1661 
1662  for (int i = 0; i < d_num_rows; ++i)
1663  norm += item(i, work) * item(i, work);
1664 
1665  if (d_distributed && d_num_procs > 1)
1666  {
1667  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE,
1668  MPI_SUM, MPI_COMM_WORLD)
1669  == MPI_SUCCESS );
1670  }
1671  if (norm > zero_tol)
1672  {
1673  norm = 1.0 / sqrt(norm);
1674  for (int i = 0; i < d_num_rows; ++i)
1675  item(i, work) *= norm;
1676  }
1677  }
1678 }
1679 
1680 void
1681 Matrix::orthogonalize_last(int ncols, bool double_pass, double zero_tol)
1682 {
1683  if (ncols == -1) ncols = d_num_cols;
1684  CAROM_VERIFY((ncols > 0) && (ncols <= d_num_cols));
1685 
1686  const int last_col = ncols - 1; // index of column to be orthonormalized
1687 
1688  int const num_passes = double_pass ? 2 : 1;
1689 
1690  // Orthogonalize the column (twice if double_pass == true).
1691  for (int k = 0; k < num_passes; k++)
1692  {
1693  for (int col = 0; col < last_col; ++col)
1694  {
1695  double factor = 0.0;
1696 
1697  for (int i = 0; i < d_num_rows; ++i)
1698  factor += item(i, col) * item(i, last_col);
1699 
1700  if (d_distributed && d_num_procs > 1)
1701  {
1702  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &factor, 1, MPI_DOUBLE,
1703  MPI_SUM, MPI_COMM_WORLD)
1704  == MPI_SUCCESS );
1705  }
1706  for (int i = 0; i < d_num_rows; ++i)
1707  item(i, last_col) -= factor * item(i, col);
1708  }
1709  }
1710 
1711  // Normalize the column.
1712  double norm = 0.0;
1713 
1714  for (int i = 0; i < d_num_rows; ++i)
1715  norm += item(i, last_col) * item(i, last_col);
1716 
1717  if (d_distributed && d_num_procs > 1)
1718  {
1719  CAROM_VERIFY( MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE,
1720  MPI_SUM, MPI_COMM_WORLD) == MPI_SUCCESS );
1721  }
1722  if (norm > zero_tol)
1723  {
1724  norm = 1.0 / sqrt(norm);
1725  for (int i = 0; i < d_num_rows; ++i)
1726  item(i, last_col) *= norm;
1727  }
1728 }
1729 
1730 void
1732 {
1733  // Rescale every matrix row by its maximum absolute value.
1734  // In the Matrix class, columns are distributed row wise, but rows are
1735  // not distributed; namely, each process acts on a number of full rows.
1736  // Therefore, no MPI communication is needed.
1737 
1738  for (int i = 0; i < d_num_rows; i++)
1739  {
1740  // Find the row's max absolute value.
1741  double row_max = fabs(item(i, 0));
1742  for (int j = 1; j < d_num_cols; j++)
1743  {
1744  if (fabs(item(i, j)) > row_max)
1745  row_max = fabs(item(i, j));
1746  }
1747 
1748  // Rescale every row entry, if max nonzero.
1749  if (row_max > 1.0e-14)
1750  {
1751  for (int j = 0; j < d_num_cols; j++)
1752  item(i, j) /= row_max;
1753  }
1754  }
1755 }
1756 
1757 void
1759 {
1760  // Rescale every matrix column by its maximum absolute value.
1761  // Matrix columns are distributed row wise, so MPI communication is needed
1762  // to get the maximum of each column across all processes.
1763 
1764  // Find each column's max absolute value in the current process.
1765  double local_max[d_num_cols];
1766  for (int j = 0; j < d_num_cols; j++)
1767  {
1768  local_max[j] = fabs(item(0, j));
1769  for (int i = 1; i < d_num_rows; i++)
1770  {
1771  if (fabs(item(i, j)) > local_max[j])
1772  local_max[j] = fabs(item(i, j));
1773  }
1774  }
1775 
1776  // Get the max across all processes, if applicable.
1777  double global_max[d_num_cols];
1778  if (d_distributed && d_num_procs > 1)
1779  {
1780  MPI_Allreduce(&local_max, &global_max, d_num_cols, MPI_DOUBLE, MPI_MAX,
1781  MPI_COMM_WORLD);
1782  }
1783  else
1784  {
1785  for (int i = 0; i < d_num_cols; i++)
1786  global_max[i] = local_max[i];
1787  }
1788 
1789  // Rescale each column's entries, if max nonzero.
1790  for (int j = 0; j < d_num_cols; j++)
1791  {
1792  if (global_max[j] > 1.0e-14)
1793  {
1794  double tmp = 1.0 / global_max[j];
1795  for (int i = 0; i < d_num_rows; i++)
1796  item(i, j) *= tmp;
1797  }
1798  }
1799 }
1800 
1801 Matrix outerProduct(const Vector &v, const Vector &w)
1802 {
1803  /*
1804  * There are two cases of concern:
1805  *
1806  * 1) w is not distributed
1807  *
1808  * 2) w is distributed
1809  *
1810  */
1811  int result_num_rows = v.dim();
1812  int result_num_cols;
1813  bool is_distributed = v.distributed();
1814  Vector gathered_w;
1815 
1816  /*
1817  * Gather all of the entries in w on each process into a Vector stored
1818  * redundantly (i.e., not distributed) on each process. If the Vector w
1819  * is not distributed, this step is trivial.
1820  */
1821  if (!w.distributed())
1822  {
1823  result_num_cols = w.dim();
1824  gathered_w = w;
1825  }
1826  else // w.distributed() is true
1827  {
1828  // Get the number of columns on each processor and gather these
1829  // counts into an array. Use std::vector as an array substitute
1830  // because variable-length arrays aren't in the C++ standard,
1831  // but the storage for std::vector containers must be contiguous
1832  // as defined by the C++ standard.
1833  int process_local_num_cols = w.dim();
1834  MPI_Comm comm = MPI_COMM_WORLD;
1835  MPI_Datatype num_cols_datatype = MPI_INT;
1836  int num_procs;
1837  MPI_Comm_size(comm, &num_procs);
1838  std::vector<int> num_cols_on_proc(num_procs, 0);
1839  int num_procs_send_count = 1;
1840  int num_procs_recv_count = 1;
1841  MPI_Allgather(&process_local_num_cols,
1842  num_procs_send_count,
1843  num_cols_datatype,
1844  num_cols_on_proc.data(),
1845  num_procs_recv_count,
1846  num_cols_datatype,
1847  comm);
1848 
1849  // Compute the displacements for the entries in the gathered
1850  // Vector, along with the total number of columns in the result.
1851  std::vector<int> gathered_w_displacements(num_procs, 0);
1852  result_num_cols = 0;
1853  for (int i = 0; i < num_cols_on_proc.size(); i++)
1854  {
1855  gathered_w_displacements.at(i) = result_num_cols;
1856  result_num_cols += num_cols_on_proc.at(i);
1857  }
1858 
1859  // Gather the data from each process onto each process -- this
1860  // step is an Allgatherv (because difference processes may
1861  // have different numbers of entries.
1862  std::vector<double> w_data(process_local_num_cols);
1863  for (int i = 0; i < w_data.size(); i++)
1864  {
1865  w_data.at(i) = w(i);
1866  }
1867 
1868  std::vector<double> gathered_w_data(result_num_cols);
1869  MPI_Datatype entry_datatype = MPI_DOUBLE;
1870  MPI_Allgatherv(w_data.data(),
1871  process_local_num_cols,
1872  entry_datatype,
1873  gathered_w_data.data(),
1874  num_cols_on_proc.data(),
1875  gathered_w_displacements.data(),
1876  entry_datatype,
1877  comm);
1878 
1879  gathered_w.setSize(result_num_cols);
1880  for (int i = 0; i < gathered_w_data.size(); i++)
1881  {
1882  gathered_w(i) = gathered_w_data.at(i);
1883  }
1884  }
1885 
1886  /* Create the matrix */
1887  Matrix result(result_num_rows, result_num_cols, is_distributed);
1888 
1889  /* Compute the outer product using the gathered copy of w. */
1890  for (int i = 0; i < result_num_rows; i++)
1891  {
1892  for (int j = 0; j < result_num_cols; j++)
1893  {
1894  result(i, j) = v(i) * gathered_w(j);
1895  }
1896  }
1897 
1898  return result;
1899 }
1900 
1902 {
1903  const int resultNumRows = v.dim();
1904  int resultNumColumns, processRowStartIndex, processNumColumns;
1905  const bool isDistributed = v.distributed();
1906 
1907  /* If v isn't distributed, sizing the output matrix is trivial. */
1908  if (false == isDistributed)
1909  {
1910  resultNumColumns = processNumColumns = resultNumRows;
1911  processRowStartIndex = 0;
1912  }
1913  else /* true == isDistributed */
1914  {
1915  using sizeType = std::vector<int>::size_type;
1916 
1922  const MPI_Comm comm = MPI_COMM_WORLD;
1923  int numProcesses;
1924  MPI_Comm_size(comm, &numProcesses);
1925  std::vector<int>
1926  numRowsOnProcess(static_cast<sizeType>(numProcesses));
1927  const int one = 1;
1928  const MPI_Datatype indexType = MPI_INT;
1929  MPI_Allgather(&resultNumRows, one, indexType,
1930  numRowsOnProcess.data(), one, indexType, comm);
1931 
1943  std::vector<int> rowStartIndexOnProcess(numProcesses);
1944  resultNumColumns = 0;
1945  for (sizeType i = 0; i < rowStartIndexOnProcess.size(); i++)
1946  {
1947  rowStartIndexOnProcess.at(i) = resultNumColumns;
1948  resultNumColumns += numRowsOnProcess.at(i);
1949  }
1950  int processNumber;
1951  MPI_Comm_rank(comm, &processNumber);
1952  processRowStartIndex =
1953  rowStartIndexOnProcess.at(static_cast<sizeType>(processNumber));
1954  } /* end (true == isDistributed) */
1955 
1960  Matrix diagonalMatrix(resultNumRows, resultNumColumns, isDistributed);
1961  for (int i = 0; i < resultNumRows; i++)
1962  {
1963  for (int j = 0; j < resultNumColumns; j++)
1964  {
1969  const double entry = (j == (i + processRowStartIndex)) ? v(i) : 0.0;
1970  diagonalMatrix(i, j) = entry;
1971  }
1972  }
1973 
1974  return diagonalMatrix;
1975 }
1976 
1978 {
1979  Vector temporary(v);
1980  temporary = 1.0;
1981  return DiagonalMatrixFactory(temporary);
1982 }
1983 
1985 {
1986  char jobz = 'V', uplo = 'U';
1987 
1988  int info;
1989  int k = A.numColumns();
1990  int lwork = std::max(1, 10*k-1);
1991  double* work = new double [lwork];
1992  double* eigs = new double [k];
1993  Matrix* ev = new Matrix(A);
1994 
1995  // ev now in a row major representation. Put it
1996  // into column major order.
1997  for (int row = 0; row < k; ++row) {
1998  for (int col = row+1; col < k; ++col) {
1999  double tmp = ev->item(row, col);
2000  ev->item(row, col) = ev->item(col, row);
2001  ev->item(col, row) = tmp;
2002  }
2003  }
2004 
2005  // Now call lapack to do the eigensolve.
2006  dsyev(&jobz, &uplo, &k, ev->getData(), &k, eigs, work, &lwork, &info);
2007 
2008  // Eigenvectors now in a column major representation. Put it
2009  // into row major order.
2010  for (int row = 0; row < k; ++row) {
2011  for (int col = row+1; col < k; ++col) {
2012  double tmp = ev->item(row, col);
2013  ev->item(row, col) = ev->item(col, row);
2014  ev->item(col, row) = tmp;
2015  }
2016  }
2017 
2018  EigenPair eigenpair;
2019  eigenpair.ev = ev;
2020  for (int i = 0; i < k; i++)
2021  {
2022  eigenpair.eigs.push_back(eigs[i]);
2023  }
2024 
2025  delete [] work;
2026  delete [] eigs;
2027 
2028  return eigenpair;
2029 }
2030 
2032 {
2033  char jobvl = 'N', jobrl = 'V';
2034 
2035  int info;
2036  int k = A.numColumns();
2037  int lwork = std::max(k*k, 10*k);
2038  double* work = new double [lwork];
2039  double* e_real = new double [k];
2040  double* e_imaginary = new double [k];
2041  double* ev_l = NULL;
2042  Matrix* ev_r = new Matrix(k, k, false);
2043  Matrix* A_copy = new Matrix(A);
2044 
2045  // A now in a row major representation. Put it
2046  // into column major order.
2047  for (int row = 0; row < k; ++row) {
2048  for (int col = row+1; col < k; ++col) {
2049  double tmp = A_copy->item(row, col);
2050  A_copy->item(row, col) = A_copy->item(col, row);
2051  A_copy->item(col, row) = tmp;
2052  }
2053  }
2054 
2055  // Now call lapack to do the eigensolve.
2056  dgeev(&jobvl, &jobrl, &k, A_copy->getData(), &k, e_real, e_imaginary, ev_l,
2057  &k, ev_r->getData(), &k, work, &lwork, &info);
2058 
2059  // Eigenvectors now in a column major representation. Put it
2060  // into row major order.
2061  for (int row = 0; row < k; ++row) {
2062  for (int col = row+1; col < k; ++col) {
2063  double tmp = ev_r->item(row, col);
2064  ev_r->item(row, col) = ev_r->item(col, row);
2065  ev_r->item(col, row) = tmp;
2066  }
2067  }
2068 
2069  ComplexEigenPair eigenpair;
2070  eigenpair.ev_real = new Matrix(k, k, false);
2071  eigenpair.ev_imaginary = new Matrix(k, k, false);
2072 
2073  // Separate lapack eigenvector into real and imaginary parts
2074  for (int i = 0; i < k; ++i)
2075  {
2076  for (int row = 0; row < k; ++row) {
2077  eigenpair.ev_real->item(row, i) = ev_r->item(row, i);
2078  }
2079  if (e_imaginary[i] != 0)
2080  {
2081  for (int row = 0; row < k; ++row) {
2082  eigenpair.ev_real->item(row, i + 1) = ev_r->item(row, i);
2083  eigenpair.ev_imaginary->item(row, i) = ev_r->item(row, i + 1);
2084  eigenpair.ev_imaginary->item(row, i + 1) = -ev_r->item(row, i + 1);
2085  }
2086 
2087  // Skip the next eigenvalue since it'll be part of the complex
2088  // conjugate pair.
2089  ++i;
2090  }
2091  }
2092 
2093  for (int i = 0; i < k; i++)
2094  {
2095  eigenpair.eigs.push_back(std::complex<double>(e_real[i], e_imaginary[i]));
2096  }
2097 
2098  delete [] work;
2099  delete [] e_real;
2100  delete [] e_imaginary;
2101  delete ev_r;
2102  delete A_copy;
2103 
2104  return eigenpair;
2105 }
2106 
2108  Matrix* U,
2109  Vector* S,
2110  Matrix* V)
2111 {
2112  CAROM_VERIFY(!A->distributed());
2113  int m = A->numRows();
2114  int n = A->numColumns();
2115 
2116  Matrix* A_copy = new Matrix(*A);
2117  if (U == NULL)
2118  {
2119  U = new Matrix(m, std::min(m, n), false);
2120  }
2121  else
2122  {
2123  CAROM_VERIFY(!U->distributed());
2124  U->setSize(m, std::min(m, n));
2125  }
2126  if (V == NULL)
2127  {
2128  CAROM_VERIFY(!V->distributed());
2129  V = new Matrix(std::min(m, n), n, false);
2130  }
2131  else
2132  {
2133  V->setSize(std::min(m, n), n);
2134  }
2135  if (S == NULL)
2136  {
2137  CAROM_VERIFY(!S->distributed());
2138  S = new Vector(n, false);
2139  }
2140  else
2141  {
2142  S->setSize(n);
2143  }
2144 
2145  char jobz = 'S';
2146  int lda = m;
2147  int ldu = m;
2148  int ldv = n;
2149  int mn = std::min(m, n);
2150  int lwork = 4 * mn * mn + 7 * mn;
2151  double* work = new double [lwork];
2152  int iwork[8*std::min(m, n)];
2153  int info;
2154 
2155  dgesdd(&jobz, &m, &n, A_copy->getData(), &lda, S->getData(), U->getData(), &ldu,
2156  V->getData(),
2157  &ldv, work, &lwork, iwork, &info);
2158 
2159  CAROM_VERIFY(info == 0);
2160 
2161  delete [] work;
2162  delete A_copy;
2163 }
2164 
2165 SerialSVDDecomposition SerialSVD(Matrix* A)
2166 {
2167  CAROM_VERIFY(!A->distributed());
2168  Matrix* U = NULL;
2169  Vector* S = NULL;
2170  Matrix* V = NULL;
2171 
2172  SerialSVD(A, U, S, V);
2173 
2174  SerialSVDDecomposition decomp;
2175  decomp.U = U;
2176  decomp.S = S;
2177  decomp.V = V;
2178 
2179  return decomp;
2180 }
2181 
2182 // Compute the product A^T * B, where A is represented by the space-time
2183 // product of As and At, and likewise for B.
2184 std::unique_ptr<Matrix> SpaceTimeProduct(const CAROM::Matrix & As,
2185  const CAROM::Matrix & At,
2186  const CAROM::Matrix & Bs, const CAROM::Matrix & Bt,
2187  const std::vector<double> *tscale,
2188  const bool At0at0, const bool Bt0at0, const bool lagB,
2189  const bool skip0)
2190 {
2191  // TODO: implement reduction in parallel for the spatial matrices
2192  CAROM_VERIFY(As.distributed() && Bs.distributed());
2193 
2194  const int AtOS = At0at0 ? 1 : 0;
2195  const int BtOS0 = Bt0at0 ? 1 : 0;
2196  const int BtOS = BtOS0 + (lagB ? 1 : 0);
2197 
2198  const int nrows = As.numColumns();
2199  const int ncols = Bs.numColumns();
2200 
2201  const int nspace = As.numRows();
2202  const int ntime = At.numRows() + AtOS;
2203 
2204  CAROM_VERIFY(nspace == Bs.numRows() && ntime == Bt.numRows() + BtOS0);
2205 
2206  // For now, we assume one time vector for each space vector
2207  CAROM_VERIFY(nrows == At.numColumns() && ncols == Bt.numColumns());
2208 
2209  //const int k0 = (At0at0 || Bt0at0 || lagB) std::max(AtOS, BtOS) : 0;
2210  const int k0AB = std::max(AtOS, BtOS);
2211  const int k00 = skip0 ? 1 : 0;
2212  const int k0 = std::max(k0AB, k00);
2213 
2214  CAROM_VERIFY(tscale == NULL || (tscale->size() == ntime-1 && k0 > 0));
2215 
2216  Matrix* p = new CAROM::Matrix(nrows, ncols, false);
2217 
2218  for (int i=0; i<nrows; ++i)
2219  {
2220  for (int j=0; j<ncols; ++j)
2221  {
2222  double pij = 0.0;
2223 
2224  for (int k=k0; k<ntime; ++k)
2225  {
2226  //const double At_k = (At0at0 && k == 0) ? 0.0 : At->item(k - AtOS,i);
2227  //const double Bt_k = (Bt0at0 && k == 0) ? 0.0 : Bt->item(k - BtOS,j);
2228  const double At_k = At.item(k - AtOS,i);
2229  const double Bt_k = Bt.item(k - BtOS,j);
2230  const double ts = (tscale == NULL) ? 1.0 : (*tscale)[k - 1];
2231 
2232  double spij = 0.0; // sum over spatial entries
2233  for (int m=0; m<nspace; ++m)
2234  spij += As.item(m,i) * Bs.item(m,j);
2235 
2236  pij += spij * ts * At_k * Bt_k;
2237  }
2238 
2239  p->item(i, j) = pij;
2240  }
2241  }
2242 
2243  return std::unique_ptr<Matrix>(p);
2244 }
2245 
2246 } // end namespace CAROM
void getInteger(const std::string &key, int &data)
Reads an integer associated with the supplied key from the currently open database file.
Definition: Database.h:181
void putInteger(const std::string &key, int data)
Writes an integer associated with the supplied key to currently open database file.
Definition: Database.h:94
virtual void getDoubleArray(const std::string &key, double *data, int nelements, const bool distributed=false) override
Reads an array of doubles associated with the supplied key from the currently open HDF5 database file...
virtual bool create(const std::string &file_name, const MPI_Comm comm=MPI_COMM_NULL) override
Creates a new HDF5 database file with the supplied name.
Definition: HDFDatabase.cpp:45
virtual bool open(const std::string &file_name, const std::string &type, const MPI_Comm comm=MPI_COMM_NULL) override
Opens an existing HDF5 database file with the supplied name.
Definition: HDFDatabase.cpp:76
virtual void putDoubleArray(const std::string &key, const double *const data, int nelements, const bool distributed=false) override
Writes an array of doubles associated with the supplied key to the currently open HDF5 database file.
virtual bool close() override
Closes the currently open HDF5 database file.
void local_read(const std::string &base_file_name, int rank)
read a single rank of a distributed Matrix into (a) HDF file(s).
Definition: Matrix.cpp:730
void setSize(int num_rows, int num_cols)
Sets the number of rows and columns of the matrix and reallocates storage if needed....
Definition: Matrix.h:149
double * getData() const
Get the matrix data as a pointer.
Definition: Matrix.h:844
bool balanced() const
Returns true if rows of matrix are load-balanced.
Definition: Matrix.cpp:207
void transposePseudoinverse()
Computes the transposePseudoinverse of this.
Definition: Matrix.cpp:605
bool distributed() const
Returns true if the Matrix is distributed.
Definition: Matrix.h:183
void rescale_rows_max()
Rescale every matrix row by its maximum absolute value.
Definition: Matrix.cpp:1731
Matrix & operator=(const Matrix &rhs)
Assignment operator.
Definition: Matrix.cpp:176
void write(const std::string &base_file_name) const
write Matrix into (a) HDF file(s).
Definition: Matrix.cpp:658
int numColumns() const
Returns the number of columns in the Matrix. This method will return the same value from each process...
Definition: Matrix.h:226
void distribute(const int &local_num_rows)
Distribute this matrix rows among MPI processes, based on the specified local number of rows....
Definition: Matrix.cpp:767
void gather()
Gather all the distributed rows among MPI processes. This becomes not distributed after this function...
Definition: Matrix.cpp:794
void orthogonalize_last(int ncols=-1, bool double_pass=false, double zero_tol=1.0e-15)
Orthonormalizes the matrix's last column, assuming the previous columns are already orthonormal.
Definition: Matrix.cpp:1681
std::unique_ptr< Matrix > qr_factorize() const
Computes and returns the Q of the QR factorization of this.
Definition: Matrix.cpp:846
Matrix & operator+=(const Matrix &rhs)
Addition operator.
Definition: Matrix.cpp:187
int numDistributedRows() const
Returns the number of rows of the Matrix across all processors.
Definition: Matrix.h:211
Matrix & operator-=(const Matrix &rhs)
Subtraction operator.
Definition: Matrix.cpp:197
std::unique_ptr< Vector > getColumn(int column) const
Returns a deep copy of a column of the matrix.
Definition: Matrix.h:611
void orthogonalize(bool double_pass=false, double zero_tol=1.0e-15)
Orthonormalizes the matrix.
Definition: Matrix.cpp:1631
std::unique_ptr< Matrix > inverse() const
Computes and returns the inverse of this.
Definition: Matrix.h:573
void multPlus(Vector &a, const Vector &b, double c) const
Computes a += this*b*c.
Definition: Matrix.cpp:407
void print(const char *prefix) const
print Matrix into (a) ascii file(s).
Definition: Matrix.cpp:639
std::unique_ptr< Matrix > getFirstNColumns(int n) const
Get the first N columns of a matrix.
Definition: Matrix.cpp:260
std::unique_ptr< Matrix > elementwise_mult(const Matrix &other) const
Multiplies two matrices element-wise.
Definition: Matrix.h:404
void pointwise_mult(int this_row, const Vector &other, Vector &result) const
Multiplies a specified row of this Matrix with other pointwise.
Definition: Matrix.cpp:336
~Matrix()
Destructor.
Definition: Matrix.cpp:168
const double & item(int row, int col) const
Const Matrix member access. Matrix data is stored in row-major format.
Definition: Matrix.h:746
int numRows() const
Returns the number of rows of the Matrix on this processor.
Definition: Matrix.h:200
void read(const std::string &base_file_name)
read Matrix into (a) HDF file(s).
Definition: Matrix.cpp:690
std::unique_ptr< Matrix > mult(const Matrix &other) const
Multiplies this Matrix with other and returns the product.
Definition: Matrix.h:273
void transpose()
Replaces this Matrix with its transpose (in place), in the serial square case only.
Definition: Matrix.cpp:547
std::unique_ptr< Matrix > elementwise_square() const
Square every element in the matrix.
Definition: Matrix.h:435
std::unique_ptr< Matrix > transposeMult(const Matrix &other) const
Multiplies the transpose of this Matrix with other and returns the product.
Definition: Matrix.h:491
void rescale_cols_max()
Rescale every matrix column by its maximum absolute value.
Definition: Matrix.cpp:1758
void qrcp_pivots_transpose(int *row_pivot, int *row_pivot_owner, int pivots_requested) const
Compute the leading numColumns() column pivots from a QR decomposition with column pivots (QRCP) of t...
Definition: Matrix.cpp:1026
double * getData() const
Get the vector data as a pointer.
Definition: Vector.h:558
void setSize(int dim)
Sets the length of the vector and reallocates storage if needed. All values are initialized to zero.
Definition: Vector.h:217
bool distributed() const
Returns true if the Vector is distributed.
Definition: Vector.h:240
const double & item(int i) const
Const Vector member access.
Definition: Vector.h:468
int dim() const
Returns the dimension of the Vector on this processor.
Definition: Vector.h:251
ComplexEigenPair NonSymmetricRightEigenSolve(const Matrix &A)
Computes the eigenvectors/eigenvalues of an NxN real nonsymmetric matrix.
Definition: Matrix.cpp:2031
Matrix IdentityMatrixFactory(const Vector &v)
Factory function to make an identity matrix with rows distributed in the same fashion as its Vector a...
Definition: Matrix.cpp:1977
Matrix outerProduct(const Vector &v, const Vector &w)
Computes the outer product of two Vectors, v and w.
Definition: Matrix.cpp:1801
void SerialSVD(Matrix *A, Matrix *U, Vector *S, Matrix *V)
Computes the SVD of a undistributed matrix in serial.
Definition: Matrix.cpp:2107
Matrix DiagonalMatrixFactory(const Vector &v)
Factory function to make a diagonal matrix with nonzero entries as in its Vector argument....
Definition: Matrix.cpp:1901
int get_global_offsets(const int local_dim, std::vector< int > &offsets, const MPI_Comm &comm)
Save integer offsets for each MPI rank under MPI communicator comm, where each rank as the size of lo...
Definition: mpi_utils.cpp:41
EigenPair SymmetricRightEigenSolve(const Matrix &A)
Computes the eigenvectors/eigenvalues of an NxN real symmetric matrix.
Definition: Matrix.cpp:1984
Matrix * ev_real
The real component of the eigenvectors in matrix form.
Definition: Matrix.h:1126
std::vector< std::complex< double > > eigs
The corresponding complex eigenvalues.
Definition: Matrix.h:1136
Matrix * ev_imaginary
The imaginary component of the eigenvectors in matrix form.
Definition: Matrix.h:1131
std::vector< double > eigs
The corresponding eigenvalues.
Definition: Matrix.h:1114
Matrix * ev
The eigenvectors in matrix form.
Definition: Matrix.h:1109