-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate-task.c
262 lines (216 loc) · 8.64 KB
/
update-task.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#include "update-task.h"
#include "utils.h"
#include "robust.h"
#include <float.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <mm_malloc.h>
#include <assert.h>
#include <time.h>
#include <omp.h>
void tile_update(
int m, int n, int k,
double *restrict Y, int ldY,
double *restrict Ynorms, scaling_t *restrict Yscales,
const double *restrict h,
const double *restrict H, int ldH, double *restrict Hnorm,
const double *restrict wr,
omp_lock_t *lock,
const double *restrict X, int ldX,
const scaling_t *restrict Xscales,
const double *restrict s, int lds, const double *restrict c, int ldc,
const double *restrict Rtildes, const int ldRtildes,
const int applyShift)
{
// Y := Y - H * X
// (m x n) (m x k) (k x n)
while (!omp_test_lock(lock)) {
#pragma omp taskyield
;
}
// Aliases.
int num_rhs = n;
// Status flag if X has to be rescaled.
int rescale_X = 0;
// Workspace to store locally computed scaling factors.
scaling_t tmp_scales[n];
#define H(i,j) H[(i) + (j) * (size_t)ldH]
#define X(i,k) X[(i) + (k) * (size_t)ldX]
#define Y(i,k) Y[(i) + (k) * (size_t)ldY]
#define s(k,rhs) s[(k) + (rhs) * (size_t)lds]
#define c(k,rhs) c[(k) + (rhs) * (size_t) ldc]
#define Rtildes(i,rhs) Rtildes[(i) + (size_t)ldRtildes * (rhs)]
////////////////////////////////////////////////////////////////////////////
// Compute the upper bound || H ||_oo, if required.
////////////////////////////////////////////////////////////////////////////
if (*Hnorm < 0.0) {
double Hnrm = dlange('I', m, k, H, ldH);
#pragma omp atomic write
*Hnorm = Hnrm;
}
////////////////////////////////////////////////////////////////////////////
// Compute a consistent scaling.
////////////////////////////////////////////////////////////////////////////
for (int rhs = 0; rhs < num_rhs; rhs++) {
if (Yscales[rhs] < Xscales[rhs]) {
// Mark X for scaling. Physical rescaling is deferred.
rescale_X = 1;
}
else if (Xscales[rhs] < Yscales[rhs]) {
// The common scaling factor is Xscales[rhs].
//const double a = Xscales[rhs] / Yscales[rhs];
const double a = compute_upscaling(Xscales[rhs], Yscales[rhs]);
// Rescale Y.
for (int i = 0; i < m; i++)
Y(i,rhs) = a * Y(i,rhs);
// Update norm using norm(a * Y) = a * norm(Y).
Ynorms[rhs] = a * Ynorms[rhs];
// Update global scaling of Y.
Yscales[rhs] = Xscales[rhs];
}
}
////////////////////////////////////////////////////////////////////////////
// Part 1.
// y(1:l-1) = y(1:l-1) - (H11-mu*I) * (Q21^T*y(l:r))
// = y(1:l-1) - (H11-mu*I) * ytilde
// where ytilde is given as the top entry of
// G_l^T * [0; = [c(l) -s(l); * [0 ; = [ ytilde;
// y(l)] s(l) c(l) ] y(l) ] * ]
////////////////////////////////////////////////////////////////////////////
// Norm of column h.
const double hnrm = vector_infnorm(m, h);
for (int rhs = 0; rhs < num_rhs; rhs++) {
// Extract the shift.
const double mu = wr[rhs];
double ytilde;
// Apply Givens rotations to potentially downscaled rhs.
if (Yscales[rhs] < Xscales[rhs]) {
// The common scaling factor is Yscales[rhs].
const double a = Yscales[rhs] / Xscales[rhs];
ytilde = -s(0,rhs) * (a * X(0,rhs));
rescale_X = 1;
}
else {
ytilde = -s(0,rhs) * X(0,rhs);
}
// Compute a scaling factor so that the linear updates
// cannot overflow.
const scaling_t alpha = protect_update(
fmax(hnrm, fabs(mu)), fabs(ytilde), Ynorms[rhs]);
const double a = convert_scaling(alpha);
if (a != 1.0) {
// Execute the linear update safely.
for (int i = 0; i < m; i++)
Y(i,rhs) = (a * Y(i,rhs)) - h[i] * (a * ytilde);
if (applyShift) {
Y(m-1,rhs) = (a * Y(m-1,rhs)) + mu * (a * ytilde);
}
}
else {
// Execute the linear update safely.
for (int i = 0; i < m; i++)
Y(i,rhs) = Y(i,rhs) - h[i] * ytilde;
if (applyShift) {
Y(m-1,rhs) = Y(m-1,rhs) + mu * ytilde;
}
}
// Update the global scaling of Y.
#ifdef INTSCALING
Yscales[rhs] = Yscales[rhs] + alpha;
#else
Yscales[rhs] = Yscales[rhs] * alpha;
#endif
// Recompute Ynorms.
Ynorms[rhs] = vector_infnorm(m, Y + ldY * rhs);
}
////////////////////////////////////////////////////////////////////////////
// Part 2.
// y(1:l-1) = y(1:l-1) - H12 * Q22^T * y(l:r)
// = y(1:l-1) - H12 * G_n^T*...*G_{r+1}^T*G_{r}^T*...*G_{l}^T * y(l:r)
// = y(1:l-1) - (H12*G_n^T*...*G_{r+1}^T) * (G_{r}^T*...*G_{l}^T*y(l:r))
// = y(1:l-1) - H12tilde * ytilde2
////////////////////////////////////////////////////////////////////////////
double *Ytilde2 =
(double *) _mm_malloc((size_t)k * num_rhs * sizeof(double), ALIGNMENT);
#define Ytilde2(j,rhs) Ytilde2[(j) + (rhs) * (size_t)k]
////////////////////////////////////////////////////////////////////////////
// Apply a subset of Givens rotations to the right-hand side.
////////////////////////////////////////////////////////////////////////////
// Note that this computation is redundantly executed across update tasks.
for (int rhs = 0; rhs < num_rhs; rhs++) {
// Copy rhs to scratch buffer.
if (rescale_X) {
if (Yscales[rhs] < Xscales[rhs]) {
// The common scaling factor is Yscales[k]. Note that Yscales[k]
// already includes potential scalings from Part 1.
const double a = compute_upscaling(Yscales[rhs], Xscales[rhs]);
for (int i = 0; i < k; i++)
Ytilde2(i,rhs) = a * X(i,rhs);
}
}
else {
// No scaling is required.
memcpy(&Ytilde2(0,rhs), &X(0,rhs), k * sizeof(double));
}
// Apply G_l.
Ytilde2(0,rhs) = c(0,rhs) * Ytilde2(0,rhs);
// Apply Givens rotations G_{l+1}, ..., G_r.
for (int j = 1; j < k; j++) {
const double tau1 = Ytilde2(j-1,rhs);
const double tau2 = Ytilde2(j ,rhs);
Ytilde2(j-1,rhs) = c(j,rhs) * tau1 - s(j,rhs) * tau2;
Ytilde2(j ,rhs) = s(j,rhs) * tau1 + c(j,rhs) * tau2;
}
} // end for
////////////////////////////////////////////////////////////////////////////
// Compute scalings.
////////////////////////////////////////////////////////////////////////////
// Compute the norm of Ytilde2 (or: a * Xnorms * 4 -- a growth by a factor of 4 is the maximum possible growth)
double ytilde2nrm[num_rhs];
for (int rhs = 0; rhs < num_rhs; rhs++) {
ytilde2nrm[rhs] = vector_infnorm(k, &Ytilde2(0,rhs));
}
// Compute a scaling factor such that the linear update is safe.
int status = protect_multi_rhs_update_real(
ytilde2nrm, num_rhs, *Hnorm, Ynorms, tmp_scales);
// Apply the scaling factor, if necessary.
if (status == RESCALE) {
for (int rhs = 0; rhs < num_rhs; rhs++) {
scale(k, &Ytilde2(0,rhs), tmp_scales + k);
scale(m, Y + ldY * rhs, tmp_scales + rhs);
#ifdef INTSCALING
Yscales[rhs] = Yscales[rhs] + tmp_scales[rhs];
#else
Yscales[rhs] = Yscales[rhs] * tmp_scales[k];
#endif
}
}
////////////////////////////////////////////////////////////////////////////
// Compute robustly
// Y(1:l-1,:) = Y(1:l-1,:) - H(1:l-1,l:r-1) * Ytilde2(l:r-1,:)
////////////////////////////////////////////////////////////////////////////
dgemm('N', 'N', m, n, k-1, -1.0, H, ldH, Ytilde2, k, 1.0, Y, ldY);
////////////////////////////////////////////////////////////////////////////
// Rank-1 perturbation from shift.
////////////////////////////////////////////////////////////////////////////
for (int rhs = 0; rhs < num_rhs; rhs++) {
for (int i = 0; i < m; i++) {
Y(i,rhs) = Y(i,rhs) - Rtildes(i,rhs) * Ytilde2(k-1,rhs);
}
}
// Recompute and record Ynorms.
for (int rhs = 0; rhs < num_rhs; rhs++) {
Ynorms[rhs] = vector_infnorm(m, Y + ldY * rhs);
}
omp_unset_lock(lock);
#undef Ytilde2
_mm_free(Ytilde2);
#undef H
#undef X
#undef Y
#undef s
#undef c
#undef Rtildes
}