# # Threaded Assembly
#
#-
#md # !!! tip
#md # This example is also available as a Jupyter notebook:
#md # [`threaded_assembly.ipynb`](@__NBVIEWER_ROOT_URL__/examples/threaded_assembly.ipynb).
#-
#
# ## Example of a colored grid
#
# Create a simple 2D grid and color it, then save the grid to a VTK file to show the
# coloring. No cells with the same color share any nodes (dofs). This means that it is
# safe to assemble in parallel as long as we only assemble one color at a time.
#
# For this structured grid the greedy algorithm uses fewer colors, but both algorithms
# result in colors that contain roughly the same number of elements. For unstructured
# grids the greedy algorithm can result in colors with very few elements. For those
# cases the workstream algorithm is better since it tries to balance the colors evenly.
using Ferrite, SparseArrays
function create_example_2d_grid()
    grid = generate_grid(Quadrilateral, (10, 10), Vec{2}((0.0, 0.0)), Vec{2}((10.0, 10.0)))
    colors_workstream = create_coloring(grid; alg=ColoringAlgorithm.WorkStream)
    colors_greedy = create_coloring(grid; alg=ColoringAlgorithm.Greedy)
    vtk_grid("colored", grid) do vtk
        vtk_cell_data_colors(vtk, colors_workstream, "workstream-coloring")
        vtk_cell_data_colors(vtk, colors_greedy, "greedy-coloring")
    end
end
create_example_2d_grid();
# ![](coloring.png)
#
# *Figure 1*: Element coloring using the "workstream" algorithm (left) and the "greedy"
# algorithm (right).
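#
# To convince ourselves that the coloring is indeed safe, we can check that no two cells
# within the same color share a node. The sketch below does this by walking the
# `grid.cells` field directly (`verify_coloring` is a helper written for this check, not
# part of Ferrite):
function verify_coloring(grid, colors)
    for color in colors
        seen_nodes = Set{Int}() # nodes touched so far by cells of this color
        for cellid in color
            for nodeid in grid.cells[cellid].nodes
                @assert nodeid ∉ seen_nodes "cells of one color share node $nodeid"
                push!(seen_nodes, nodeid)
            end
        end
    end
    return true
end;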
# ## Cantilever beam in 3D with threaded assembly
# We will now look at an example where we assemble the stiffness matrix using multiple
# threads. We set up a simple grid and create a coloring, then create a `DofHandler`,
# and define the material stiffness.
# #### Grid for the beam
function create_colored_cantilever_grid(celltype, n)
    grid = generate_grid(celltype, (10*n, n, n), Vec{3}((0.0, 0.0, 0.0)), Vec{3}((10.0, 1.0, 1.0)))
    colors = create_coloring(grid)
    return grid, colors
end;
# #### DofHandler
function create_dofhandler(grid::Grid{dim}) where {dim}
    dh = DofHandler(grid)
    push!(dh, :u, dim) # Add a displacement field
    close!(dh)
end;
# ### Stiffness tensor for linear elasticity
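# The fourth-order elasticity tensor of an isotropic material can be written in index
# notation as
# ```math
# C_{ijkl} = \lambda\, \delta_{ij}\delta_{kl} + \mu \left( \delta_{ik}\delta_{jl} + \delta_{il}\delta_{jk} \right),
# ```
# which is exactly what the helper function `g` below implements.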
function create_stiffness()
    E = 200e9
    ν = 0.3
    λ = E*ν / ((1+ν) * (1 - 2ν))
    μ = E / (2(1+ν))
    δ(i,j) = i == j ? 1.0 : 0.0
    g(i,j,k,l) = λ*δ(i,j)*δ(k,l) + μ*(δ(i,k)*δ(j,l) + δ(i,l)*δ(j,k))
    C = SymmetricTensor{4, 3}(g)
    return C
end;
# ## Threaded data structures
#
# `ScratchBuffer` is a thread-local collection of data that each thread needs to own,
# since we need to be able to mutate the data in the threads independently.
struct ScratchBuffer{T, CV <: CellValues, FV <: FaceValues, TT <: AbstractTensor, dim, Ti}
    Ke::Matrix{T}
    fe::Vector{T}
    cellvalues::CV
    facevalues::FV
    global_dofs::Vector{Int}
    ε::Vector{TT}
    coordinates::Vector{Vec{dim, T}}
    assembler::Ferrite.AssemblerSparsityPattern{T, Ti}
end;
# Each thread needs its own `CellValues` and `FaceValues` (although for this example we
# don't use the `FaceValues`).
function create_values()
    ## Interpolations and values
    interpolation_space = Lagrange{3, RefCube, 1}()
    quadrature_rule = QuadratureRule{3, RefCube}(2)
    face_quadrature_rule = QuadratureRule{2, RefCube}(2)
    cellvalues = CellVectorValues(quadrature_rule, interpolation_space)
    facevalues = FaceVectorValues(face_quadrature_rule, interpolation_space)
    return cellvalues, facevalues
end;
# Create a `ScratchBuffer` for each thread with the thread-local data. Note that we pass
# `fillzero=false` to `start_assemble` since every buffer wraps the same global `K` and
# `f`; they are zeroed once in `doassemble` instead.
function create_scratchbuffer(K, f, dh::DofHandler)
    assembler = start_assemble(K, f; fillzero=false)
    cellvalues, facevalues = create_values()
    n = getnbasefunctions(cellvalues)
    global_dofs = zeros(Int, n) # Global element dofs
    fe = zeros(n)               # Element force vector
    Ke = zeros(n, n)            # Element stiffness matrix
    ε = [zero(SymmetricTensor{2, 3}) for _ in 1:n]
    coordinates = getcoordinates(dh.grid, 1)
    return ScratchBuffer(Ke, fe, cellvalues, facevalues, global_dofs, ε, coordinates, assembler)
end;
# ## Threaded assembly
# The assembly function loops over the colors and does a threaded assembly for each color:
# the cells of one color are put in a work queue (a `Channel`) from which the spawned
# tasks pull cell indices until the queue is drained.
function doassemble(K::SparseMatrixCSC, colors, grid::Grid, dh::DofHandler, C::SymmetricTensor{4},
                    n_threads)
    f = zeros(ndofs(dh))
    fill!(K.nzval, 0)
    b = Vec{3}((0.0, 0.0, 0.0)) # Body force
    for color in colors
        @sync begin
            ## Fill the work queue with all cells of this color
            workqueue = Channel{Int}(Inf)
            foreach(x -> put!(workqueue, x), color)
            close(workqueue)
            ## Each color is safe to assemble threaded
            for _ in 1:n_threads
                Threads.@spawn begin
                    local scratch = create_scratchbuffer(K, f, dh)
                    for i in workqueue
                        assemble_cell!(scratch, i, grid, dh, C, b)
                    end
                end
            end
        end
    end
    return K, f
end
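# The channel-based work distribution above is a general pattern. Isolated from the
# assembly specifics it looks like the following sketch (`parallel_foreach` is a
# hypothetical helper written for illustration, not part of Ferrite):
function parallel_foreach(f, items; ntasks = Threads.nthreads())
    ch = Channel{eltype(items)}(Inf) # unbounded buffer: can hold all items
    foreach(x -> put!(ch, x), items)
    close(ch) # no more work will be added
    @sync for _ in 1:ntasks
        Threads.@spawn for x in ch # each task pulls items until the channel is drained
            f(x)
        end
    end
end;
## e.g. parallel_foreach(i -> println("item $i"), 1:10)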
# The cell assembly function is written the same way as it would be in a single-threaded
# example. The only difference is that we unpack the variables from our `scratch`.
function assemble_cell!(scratch::ScratchBuffer, cell::Int, grid::Grid, dh::DofHandler,
                        C::SymmetricTensor{4, dim}, b::Vec{dim}) where {dim}
    ## Unpack our stuff from the scratch
    (; Ke, fe, cellvalues, global_dofs, ε, coordinates, assembler) = scratch
    fill!(Ke, 0)
    fill!(fe, 0)
    n_basefuncs = getnbasefunctions(cellvalues)
    ## Fill up the coordinates
    Ferrite.cellcoords!(coordinates, grid, cell)
    reinit!(cellvalues, coordinates)
    for q_point in 1:getnquadpoints(cellvalues)
        for i in 1:n_basefuncs
            ε[i] = symmetric(shape_gradient(cellvalues, q_point, i))
        end
        dΩ = getdetJdV(cellvalues, q_point)
        for i in 1:n_basefuncs
            δu = shape_value(cellvalues, q_point, i)
            fe[i] += (δu ⋅ b * exp(rand())) * dΩ
            εC = ε[i] ⊡ C
            for j in 1:n_basefuncs
                Ke[i, j] += (εC ⊡ ε[j] * exp(rand())) * dΩ
            end
        end
    end
    celldofs!(global_dofs, dh, cell)
    assemble!(assembler, global_dofs, fe, Ke)
end;
function run_assemble(n_threads=Threads.nthreads())
    n = 20
    grid, colors = create_colored_cantilever_grid(Hexahedron, n)
    dh = create_dofhandler(grid)
    K = create_sparsity_pattern(dh)
    C = create_stiffness()
    ## Run once to trigger compilation before timing
    doassemble(K, colors, grid, dh, C, n_threads)
    b = @elapsed @time K, f = doassemble(K, colors, grid, dh, C, n_threads)
    return b
end
run_assemble()
# Running the code with different numbers of threads gives the following runtimes:
# * 1 thread:  2.46 seconds
# * 2 threads: 1.19 seconds
# * 3 threads: 0.83 seconds
# * 4 threads: 0.75 seconds
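#
# Note that Julia itself must be started with multiple threads for any speedup to appear,
# e.g. with `julia --threads 4` or by setting the `JULIA_NUM_THREADS` environment variable.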
#
#md # ## [Plain program](@id threaded_assembly-plain-program)
#md #
#md # Here follows a version of the program without any comments.
#md # The file is also available here: [`threaded_assembly.jl`](threaded_assembly.jl).
#md #
#md # ```julia
#md # @__CODE__
#md # ```