forked from Xilinx/finn-hlslib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vvau.hpp
157 lines (146 loc) · 6.42 KB
/
vvau.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/******************************************************************************
* Copyright (c) 2019, Xilinx, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*******************************************************************************/
/*******************************************************************************
*
* Authors: Giulio Gambardella <giuliog@xilinx.com>
*
* \file vvau.hpp
*
* This file lists a templated funtion used to implement
* Vector-Vector-Activation Unit (used for depthwise separable convolutions)
*
*******************************************************************************/
#ifndef VVAU_HPP
#define VVAU_HPP
#include "hls_stream.h"
#include "mac.hpp"
#include "interpret.hpp"
/**
* \brief Vector vector activate function
*
* The function performs the multiplication between a weigth vector and the input activation vector,
* accumulating the results and then applying an activation function on the accumulated result.
* It is used to implement depth-wise separable convolution
*
* \tparam Channels Number of channels
* \tparam Kernel_2 Kernel * Kernel dimension (Kernel ^ 2 if square)
* \tparam SIMD Number of input columns computed in parallel
* \tparam PE Number of output rows computed in parallel
* \tparam MMV Number of output pixels computed in parallel
* \tparam TSrcI DataType of the input activation (as used in the MAC)
* \tparam TDstI DataType of the output activation (as generated by the activation)
* \tparam TWeightI DataType of the weights (as used in the MAC)
* \tparam TI DataType of the input stream - safely deducible from the paramaters
* \tparam TO DataType of the output stream - safely deducible from the paramaters
* \tparam TW DataType of the weights matrix - safely deducible from the paramaters
* \tparam TA DataType of the activation class (e.g. thresholds) - safely deducible from the paramaters
* \tparam R Datatype for the resource used for FPGA implementation of the MAC - safely deducible from the paramaters
*
* \param in Input stream
* \param out Output stream
* \param weights Weights matrix (currently supports BinaryWeights or FixedPointWeights)
* \param activation Activation class
* \param reps Number of time the function has to be repeatedly executed (e.g. number of images)
* \param r Resource type for the hardware implementation of the MAC block
*/
template<
unsigned Channels, unsigned Kernel_2, unsigned SIMD, unsigned PE, unsigned MMV,
typename TSrcI = Identity, typename TDstI = Identity, typename TWeightI = Identity,
typename TI, typename TO, typename TW, typename TA, typename R
>
void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
hls::stream<TO> &out,
TW const &weights,
TA const &activation,
int const reps,
R const &r) {
static_assert(SIMD == 1, "SIMD parallelism not yet supported.");
// how many different rows each neuron will compute
// alternatively: number of vertical matrix chunks
unsigned const NF = Channels / PE;
// how many synapse groups each row is split into
// alternatively: number of horizontal matrix chunks
unsigned const SF = (Channels*Kernel_2) / Channels;
decltype(activation.init(0,0)) accu[MMV][PE];
#pragma HLS ARRAY_PARTITION variable=accu complete dim=0
unsigned nf = 0;
unsigned sf = 0;
unsigned tile = 0; // invariant: tile = nf*SF + sf
// everything merged into a common iteration space (one "big" loop instead
// of smaller nested loops) to get the pipelinening the way we want
unsigned const TOTAL_FOLD = NF * SF ;//* Channels/SIMD;
for(unsigned i = 0; i < reps * TOTAL_FOLD; i++) {
#pragma HLS pipeline style=flp II=1
TI inElem;
inElem = in.read();
// Threshold Initialisation
if(sf == 0) {
for(unsigned pe = 0; pe < PE; pe++) {
for(unsigned mmv = 0; mmv < MMV; mmv++) {
#pragma HLS UNROLL
accu[mmv][pe] = activation.init(nf, pe);
}
}
}
// compute matrix-vector product for each processing element
auto const &w = weights.weights(tile);
for(unsigned pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
auto const wgt = TWeightI()(w[pe]);
for (unsigned mmv = 0; mmv < MMV; mmv++){
auto const act = TSrcI()(inElem, mmv);
accu[mmv][pe] += mul(wgt[0], act(pe,mmv), r);
}
}
// keep track of which folded synapse/neuron we are processing
++tile;
if(++sf == SF) {
// produce output and clear accumulators
auto outElem = TDstI().template operator()<TO>();
for (unsigned pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
for (unsigned mmv = 0; mmv < MMV; mmv++){
#pragma HLS UNROLL
outElem(pe,mmv,1) = activation.activate(nf, pe, accu[mmv][pe]);
}
}
out.write(outElem);
// next folded neuron or image
sf = 0;
if(++nf == NF) {
nf = 0;
tile = 0;
}
}
}
}
#endif