Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3: #include <petscconf.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7: #include <thrust/advance.h>
8: #include <thrust/partition.h>
9: #include <thrust/sort.h>
10: #include <thrust/tuple.h>
11: #include <thrust/unique.h>
12: #include <petscsf.h>
14: struct VecCUDAEquals {
15: template <typename Tuple>
16: __host__ __device__ void operator()(Tuple t)
17: {
18: thrust::get<1>(t) = thrust::get<0>(t);
19: }
20: };
22: static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(PetscCtxRt data)
23: {
24: MatCOOStruct_MPIAIJ *coo = *(MatCOOStruct_MPIAIJ **)data;
26: PetscFunctionBegin;
27: PetscCall(PetscSFDestroy(&coo->sf));
28: PetscCallCUDA(cudaFree(coo->Ajmap1));
29: PetscCallCUDA(cudaFree(coo->Aperm1));
30: PetscCallCUDA(cudaFree(coo->Bjmap1));
31: PetscCallCUDA(cudaFree(coo->Bperm1));
32: PetscCallCUDA(cudaFree(coo->Aimap2));
33: PetscCallCUDA(cudaFree(coo->Ajmap2));
34: PetscCallCUDA(cudaFree(coo->Aperm2));
35: PetscCallCUDA(cudaFree(coo->Bimap2));
36: PetscCallCUDA(cudaFree(coo->Bjmap2));
37: PetscCallCUDA(cudaFree(coo->Bperm2));
38: PetscCallCUDA(cudaFree(coo->Cperm1));
39: PetscCallCUDA(cudaFree(coo->sendbuf));
40: PetscCallCUDA(cudaFree(coo->recvbuf));
41: PetscCall(PetscFree(coo));
42: PetscFunctionReturn(PETSC_SUCCESS);
43: }
45: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
46: {
47: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
48: PetscBool dev_ij = PETSC_FALSE;
49: PetscMemType mtype = PETSC_MEMTYPE_HOST;
50: PetscInt *i, *j;
51: PetscContainer container_h;
52: MatCOOStruct_MPIAIJ *coo_h, *coo_d;
54: PetscFunctionBegin;
55: PetscCall(PetscFree(mpiaij->garray));
56: PetscCall(VecDestroy(&mpiaij->lvec));
57: #if defined(PETSC_USE_CTABLE)
58: PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
59: #else
60: PetscCall(PetscFree(mpiaij->colmap));
61: #endif
62: PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
63: mat->assembled = PETSC_FALSE;
64: mat->was_assembled = PETSC_FALSE;
65: PetscCall(PetscGetMemType(coo_i, &mtype));
66: if (PetscMemTypeDevice(mtype)) {
67: dev_ij = PETSC_TRUE;
68: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
69: PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
70: PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
71: } else {
72: i = coo_i;
73: j = coo_j;
74: }
76: PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
77: if (dev_ij) PetscCall(PetscFree2(i, j));
78: mat->offloadmask = PETSC_OFFLOAD_CPU;
79: // Create the GPU memory
80: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
81: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
83: // Copy the COO struct to device
84: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
85: PetscCall(PetscContainerGetPointer(container_h, &coo_h));
86: PetscCall(PetscMalloc1(1, &coo_d));
87: *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d
89: PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
90: PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
91: PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
92: PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
93: PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
94: PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
95: PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
96: PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
97: PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
98: PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
99: PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
100: PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
101: PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
102: PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));
104: PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
105: PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
106: PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
107: PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
108: PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
109: PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
110: PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
111: PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
112: PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
113: PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
114: PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
116: // Put the COO struct in a container and then attach that to the matrix
117: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
118: PetscFunctionReturn(PETSC_SUCCESS);
119: }
121: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
122: {
123: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
124: const PetscCount grid_size = gridDim.x * blockDim.x;
125: for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
126: }
128: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
129: {
130: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
131: const PetscCount grid_size = gridDim.x * blockDim.x;
132: for (; i < Annz + Bnnz; i += grid_size) {
133: PetscScalar sum = 0.0;
134: if (i < Annz) {
135: for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
136: Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
137: } else {
138: i -= Annz;
139: for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
140: Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
141: }
142: }
143: }
145: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
146: {
147: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
148: const PetscCount grid_size = gridDim.x * blockDim.x;
149: for (; i < Annz2 + Bnnz2; i += grid_size) {
150: if (i < Annz2) {
151: for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
152: } else {
153: i -= Annz2;
154: for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
155: }
156: }
157: }
159: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
160: {
161: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
162: Mat A = mpiaij->A, B = mpiaij->B;
163: PetscScalar *Aa, *Ba;
164: const PetscScalar *v1 = v;
165: PetscMemType memtype;
166: PetscContainer container;
167: MatCOOStruct_MPIAIJ *coo;
169: PetscFunctionBegin;
170: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
171: PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
172: PetscCall(PetscContainerGetPointer(container, &coo));
174: const auto &Annz = coo->Annz;
175: const auto &Annz2 = coo->Annz2;
176: const auto &Bnnz = coo->Bnnz;
177: const auto &Bnnz2 = coo->Bnnz2;
178: const auto &vsend = coo->sendbuf;
179: const auto &v2 = coo->recvbuf;
180: const auto &Ajmap1 = coo->Ajmap1;
181: const auto &Ajmap2 = coo->Ajmap2;
182: const auto &Aimap2 = coo->Aimap2;
183: const auto &Bjmap1 = coo->Bjmap1;
184: const auto &Bjmap2 = coo->Bjmap2;
185: const auto &Bimap2 = coo->Bimap2;
186: const auto &Aperm1 = coo->Aperm1;
187: const auto &Aperm2 = coo->Aperm2;
188: const auto &Bperm1 = coo->Bperm1;
189: const auto &Bperm2 = coo->Bperm2;
190: const auto &Cperm1 = coo->Cperm1;
192: PetscCall(PetscGetMemType(v, &memtype));
193: if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
194: PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
195: PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
196: }
198: if (imode == INSERT_VALUES) {
199: PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
200: PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
201: } else {
202: PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
203: PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
204: }
206: PetscCall(PetscLogGpuTimeBegin());
207: /* Pack entries to be sent to remote */
208: if (coo->sendlen) {
209: MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
210: PetscCallCUDA(cudaPeekAtLastError());
211: }
213: /* Send remote entries to their owner and overlap the communication with local computation */
214: PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
215: /* Add local entries to A and B */
216: if (Annz + Bnnz > 0) {
217: MatAddLocalCOOValues<<<(int)((Annz + Bnnz + 255) / 256), 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
218: PetscCallCUDA(cudaPeekAtLastError());
219: }
220: PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));
222: /* Add received remote entries to A and B */
223: if (Annz2 + Bnnz2 > 0) {
224: MatAddRemoteCOOValues<<<(int)((Annz2 + Bnnz2 + 255) / 256), 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
225: PetscCallCUDA(cudaPeekAtLastError());
226: }
227: PetscCall(PetscLogGpuTimeEnd());
229: if (imode == INSERT_VALUES) {
230: PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
231: PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
232: } else {
233: PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
234: PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
235: }
236: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
237: mat->offloadmask = PETSC_OFFLOAD_GPU;
238: PetscFunctionReturn(PETSC_SUCCESS);
239: }
241: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
242: {
243: Mat Ad, Ao;
244: const PetscInt *cmap;
246: PetscFunctionBegin;
247: PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
248: PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
249: if (glob) {
250: PetscInt cst, i, dn, on, *gidx;
252: PetscCall(MatGetLocalSize(Ad, NULL, &dn));
253: PetscCall(MatGetLocalSize(Ao, NULL, &on));
254: PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
255: PetscCall(PetscMalloc1(dn + on, &gidx));
256: for (i = 0; i < dn; i++) gidx[i] = cst + i;
257: for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
258: PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
259: }
260: PetscFunctionReturn(PETSC_SUCCESS);
261: }
263: static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
264: {
265: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
266: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
267: PetscInt i;
269: PetscFunctionBegin;
270: if (B->hash_active) {
271: B->ops[0] = b->cops;
272: B->hash_active = PETSC_FALSE;
273: }
274: PetscCall(PetscLayoutSetUp(B->rmap));
275: PetscCall(PetscLayoutSetUp(B->cmap));
276: if (PetscDefined(USE_DEBUG) && d_nnz) {
277: for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
278: }
279: if (PetscDefined(USE_DEBUG) && o_nnz) {
280: for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
281: }
282: #if defined(PETSC_USE_CTABLE)
283: PetscCall(PetscHMapIDestroy(&b->colmap));
284: #else
285: PetscCall(PetscFree(b->colmap));
286: #endif
287: PetscCall(PetscFree(b->garray));
288: PetscCall(VecDestroy(&b->lvec));
289: PetscCall(VecScatterDestroy(&b->Mvctx));
290: /* Because the B will have been resized we simply destroy it and create a new one each time */
291: PetscCall(MatDestroy(&b->B));
292: if (!b->A) {
293: PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
294: PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
295: }
296: if (!b->B) {
297: PetscMPIInt size;
298: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
299: PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
300: PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
301: }
302: PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
303: PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
304: PetscCall(MatBindToCPU(b->A, B->boundtocpu));
305: PetscCall(MatBindToCPU(b->B, B->boundtocpu));
306: PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
307: PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
308: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
309: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
310: B->preallocated = PETSC_TRUE;
311: B->was_assembled = PETSC_FALSE;
312: B->assembled = PETSC_FALSE;
313: PetscFunctionReturn(PETSC_SUCCESS);
314: }
316: static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
317: {
318: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
320: PetscFunctionBegin;
321: PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
322: PetscCall((*a->A->ops->mult)(a->A, xx, yy));
323: PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
324: PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
325: PetscFunctionReturn(PETSC_SUCCESS);
326: }
328: static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
329: {
330: Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
332: PetscFunctionBegin;
333: PetscCall(MatZeroEntries(l->A));
334: PetscCall(MatZeroEntries(l->B));
335: PetscFunctionReturn(PETSC_SUCCESS);
336: }
338: static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
339: {
340: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
342: PetscFunctionBegin;
343: PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
344: PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
345: PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
346: PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
347: PetscFunctionReturn(PETSC_SUCCESS);
348: }
350: static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
351: {
352: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
354: PetscFunctionBegin;
355: PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
356: PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
357: PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
358: PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
359: PetscFunctionReturn(PETSC_SUCCESS);
360: }
362: static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
363: {
364: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
365: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
367: PetscFunctionBegin;
368: switch (op) {
369: case MAT_CUSPARSE_MULT_DIAG:
370: cusparseStruct->diagGPUMatFormat = format;
371: break;
372: case MAT_CUSPARSE_MULT_OFFDIAG:
373: cusparseStruct->offdiagGPUMatFormat = format;
374: break;
375: case MAT_CUSPARSE_ALL:
376: cusparseStruct->diagGPUMatFormat = format;
377: cusparseStruct->offdiagGPUMatFormat = format;
378: break;
379: default:
380: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op);
381: }
382: PetscFunctionReturn(PETSC_SUCCESS);
383: }
385: static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
386: {
387: MatCUSPARSEStorageFormat format;
388: PetscBool flg;
389: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
390: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
392: PetscFunctionBegin;
393: PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
394: if (A->factortype == MAT_FACTOR_NONE) {
395: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
396: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
397: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
398: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
399: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
400: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
401: }
402: PetscOptionsHeadEnd();
403: PetscFunctionReturn(PETSC_SUCCESS);
404: }
406: static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
407: {
408: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
410: PetscFunctionBegin;
411: PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
412: if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
413: PetscFunctionReturn(PETSC_SUCCESS);
414: }
416: static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
417: {
418: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
419: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
421: PetscFunctionBegin;
422: PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
423: PetscCallCXX(delete cusparseStruct);
424: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
425: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
426: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
427: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
428: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
429: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
430: PetscCall(MatDestroy_MPIAIJ(A));
431: PetscFunctionReturn(PETSC_SUCCESS);
432: }
434: /* defines MatSetValues_MPICUSPARSE_Hash() */
435: #define TYPE AIJ
436: #define TYPE_AIJ
437: #define SUB_TYPE_CUSPARSE
438: #include "../src/mat/impls/aij/mpi/mpihashmat.h"
439: #undef TYPE
440: #undef TYPE_AIJ
441: #undef SUB_TYPE_CUSPARSE
443: static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
444: {
445: Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data;
446: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
448: PetscFunctionBegin;
449: PetscCall(MatSetUp_MPI_Hash(A));
450: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
451: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
452: A->preallocated = PETSC_TRUE;
453: PetscFunctionReturn(PETSC_SUCCESS);
454: }
456: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
457: {
458: Mat_MPIAIJ *a;
459: Mat A;
461: PetscFunctionBegin;
462: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
463: if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
464: else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
465: A = *newmat;
466: A->boundtocpu = PETSC_FALSE;
467: PetscCall(PetscFree(A->defaultvectype));
468: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
470: a = (Mat_MPIAIJ *)A->data;
471: if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
472: if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
473: if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
475: if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
477: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
478: A->ops->mult = MatMult_MPIAIJCUSPARSE;
479: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
480: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
481: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
482: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
483: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
484: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
485: A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE;
486: A->ops->getcurrentmemtype = MatGetCurrentMemType_MPIAIJ;
488: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
489: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
490: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
491: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
492: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
493: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
494: #if defined(PETSC_HAVE_HYPRE)
495: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
496: #endif
497: PetscFunctionReturn(PETSC_SUCCESS);
498: }
500: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
501: {
502: PetscFunctionBegin;
503: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
504: PetscCall(MatCreate_MPIAIJ(A));
505: PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
506: PetscFunctionReturn(PETSC_SUCCESS);
507: }
509: /*@
510: MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format. This matrix will ultimately be pushed down
511: to NVIDIA GPUs and use the CuSPARSE library for calculations.
513: Collective
515: Input Parameters:
516: + comm - MPI communicator, set to `PETSC_COMM_SELF`
517: . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
518: This value should be the same as the local size used in creating the
519: $y$ vector for the matrix-vector product $y = Ax$.
520: . n - This value should be the same as the local size used in creating the
521: $x$ vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
522: calculated if `N` is given) For square matrices `n` is almost always `m`.
523: . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
524: . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
525: . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix
526: (same value is used for all local rows)
527: . d_nnz - array containing the number of nonzeros in the various rows of the
528: DIAGONAL portion of the local submatrix (possibly different for each row)
529: or `NULL`, if `d_nz` is used to specify the nonzero structure.
530: The size of this array is equal to the number of local rows, i.e `m`.
531: For matrices you plan to factor you must leave room for the diagonal entry and
532: put in the entry even if it is zero.
533: . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local
534: submatrix (same value is used for all local rows).
535: - o_nnz - array containing the number of nonzeros in the various rows of the
536: OFF-DIAGONAL portion of the local submatrix (possibly different for
537: each row) or `NULL`, if `o_nz` is used to specify the nonzero
538: structure. The size of this array is equal to the number
539: of local rows, i.e `m`.
541: Output Parameter:
542: . A - the matrix
544: Level: intermediate
546: Notes:
547: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
548: MatXXXXSetPreallocation() paradigm instead of this routine directly.
549: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
551: The AIJ format, also called the
552: compressed row storage), is fully compatible with standard Fortran
553: storage. That is, the stored row and column indices can begin at
554: either one (as in Fortran) or zero.
556: When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
558: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
559: @*/
560: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
561: {
562: PetscMPIInt size;
564: PetscFunctionBegin;
565: PetscCall(MatCreate(comm, A));
566: PetscCall(MatSetSizes(*A, m, n, M, N));
567: PetscCallMPI(MPI_Comm_size(comm, &size));
568: if (size > 1) {
569: PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
570: PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
571: } else {
572: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
573: PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
574: }
575: PetscFunctionReturn(PETSC_SUCCESS);
576: }
578: /*MC
579: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs.
581: Options Database Keys:
582: + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
583: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
584: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
585: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
587: Level: beginner
589: Notes:
590: These matrices can be in either CSR, ELL, or HYB format. The ELL and HYB formats require CUDA 4.2 or later.
592: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
594: Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
595: if some integer values passed in do not fit in `int`.
597: .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
598: M*/
600: /*MC
601: MATAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs; it is as same as `MATSEQAIJCUSPARSE` on one MPI process and `MATMPIAIJCUSPARSE` on multiple processes.
603: Level: beginner
605: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
606: M*/