Hyperparameters for All Methods
Synthetic Data Feature Extractor: Both convolutional layers have a kernel size of 5. The first convolutional layer has 40 output channels and the second has 100 output channels. The final linear layer has 500 output features.
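A minimal PyTorch sketch of this feature extractor follows. The kernel sizes, channel counts, and the 500-dimensional output are taken from the description above; the single-channel 28x28 input, ReLU activations, and 2x2 max-pooling after each convolution are assumptions, since the text does not specify them.
\begin{verbatim}
import torch
import torch.nn as nn

class SyntheticFeatureExtractor(nn.Module):
    # Stated above: two conv layers with kernel size 5 (40 and 100 output
    # channels) and a final linear layer with 500 output features.
    # Assumed (not stated): 1-channel 28x28 inputs, ReLU activations, and
    # 2x2 max-pooling after each convolution.
    def __init__(self, in_channels=1):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 40, kernel_size=5),  # stated: 40 channels
            nn.ReLU(),
            nn.MaxPool2d(2),                            # assumed
            nn.Conv2d(40, 100, kernel_size=5),          # stated: 100 channels
            nn.ReLU(),
            nn.MaxPool2d(2),                            # assumed
        )
        # For 28x28 inputs the conv stack yields 100 x 4 x 4 activations.
        self.fc = nn.Linear(100 * 4 * 4, 500)           # stated: 500 features

    def forward(self, x):
        x = self.conv(x)
        x = torch.flatten(x, start_dim=1)
        return self.fc(x)
\end{verbatim}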
\begin{itemize}
\item ABDMIL
\begin{itemize}
\item Synthetic Data
\begin{itemize}
\item fixed hyperparameters: hidden layer size of 128
\item hyperparameters searched over: learning rates of [$1e-4$, $1e-5$] and weight decays of [$1e-6$, $1e-7$]
\item without positional encoding, final hyperparameters: hidden layer size of 128, a learning rate of $1e-4$, and a weight decay of $1e-7$
\item with positional encoding, final hyperparameters: hidden layer size of 128, a learning rate of $1e-4$, and a weight decay of $1e-7$.
\end{itemize}
\item Real Data
\begin{itemize}
\item hyperparameters searched over: hidden layer sizes of [128, 512, 1024, 2048, 4096]; learning rates of [$1e-4$, $1e-5$], and weight decays of [$1e-6$, $1e-7$] (this grid is enumerated in the sketch after the list).
\item without positional encoding, final hyperparameters: a hidden layer size of 2048, a learning rate of $1e-4$, and a weight decay of $1e-6$
\item with positional encoding, final hyperparameters: a hidden layer size of 4096, a learning rate of $1e-4$, and a weight decay of $1e-7$.
\end{itemize}
\end{itemize}
\item CLAM-SB
\begin{itemize}
\item Synthetic Data
\begin{itemize}
\item hyperparameters searched over: learning rates of [$1e-4$, $1e-5$], weight decays of [$1e-6$, $1e-7$], and the CLAM-specific parameters: B of [1, 2, 5], dropout of [True, False], and $c_1$ of [0.3, 0.5, 0.7]
\item without positional encoding, final hyperparameters: a learning rate of $1e-4$, a weight decay of $1e-7$, and CLAM-specific parameters B of 5, no dropout, and $c_1$ of 0.7
\item with positional encoding, final hyperparameters: a learning rate of $1e-4$, a weight decay of $1e-7$, and CLAM-specific parameters B of 5, dropout, and $c_1$ of 0.5
\end{itemize}
\item Real Data
\begin{itemize}
\item hyperparameters searched over: learning rates of [$1e-4$, $1e-5$], weight decays of [$1e-6$, $1e-7$], and the CLAM-specific parameters: B of [2, 4, 6, 8, 12], dropout of [True, False], and $c_1$ of [0.3, 0.5, 0.7]
\item without positional encoding, final hyperparameters: a learning rate of $1e-5$, a weight decay of $1e-7$, and CLAM-specific parameters B of 6, no dropout, and $c_1$ of 0.5
\item with positional encoding, final hyperparameters: a learning rate of $1e-5$, a weight decay of $1e-7$, and CLAM-specific parameters B of 6, no dropout, and $c_1$ of 0.7
\end{itemize}
\end{itemize}
\item CLAM-MB
\begin{itemize}
\item Synthetic Data
\begin{itemize}
\item hyperparameters searched over: learning rates of [$1e-4$, $1e-5$], weight decays of [$1e-6$, $1e-7$], and the CLAM-specific parameters: B of [1, 2, 5], dropout of [True, False], and $c_1$ of [0.3, 0.5, 0.7]
\item without positional encoding, final hyperparameters: a learning rate of $1e-4$, a weight decay of $1e-7$, and CLAM-specific parameters B of 1, dropout, and $c_1$ of 0.5
\item with positional encoding, final hyperparameters: a learning rate of $1e-4$, a weight decay of $1e-6$, and CLAM-specific parameters B of 5, dropout, and $c_1$ of 0.3
\end{itemize}
\item Real Data
\begin{itemize}
\item hyperparameters searched over: learning rates of [$1e-4$, $1e-5$], weight decays of [$1e-6$, $1e-7$], and the CLAM-specific parameters: B of [2, 4, 6, 8, 12], dropout of [True, False], and $c_1$ of [0.3, 0.5, 0.7]
\item without positional encoding, final hyperparameters: a learning rate of $1e-5$, a weight decay of $1e-7$, and CLAM-specific parameters B of 8, no dropout, and $c_1$ of 0.7
\item with positional encoding, final hyperparameters: a learning rate of $1e-5$, a weight decay of $1e-6$, and CLAM-specific parameters B of 4, no dropout, and $c_1$ of 0.7
\end{itemize}
\end{itemize}
\item DTFD
\begin{itemize}
\item Synthetic Data
\begin{itemize}
\item fixed hyperparameters: hidden layer size of 128
\item hyperparameters searched over: learning rates of [$1e-4$, $1e-5$], weight decays of [$1e-6$, $1e-7$], and the number of pseudo bags of [2, 5]
\item without positional encoding, final hyperparameters: a hidden layer size of 128, a learning rate of $1e-4$, a weight decay of $1e-6$, and 5 pseudo bags
\item with positional encoding, final hyperparameters: a hidden layer size of 128, a learning rate of $1e-4$, a weight decay of $1e-6$, and 5 pseudo bags
\end{itemize}
\item Real Data
\begin{itemize}
\item hyperparameters searched over: hidden layer sizes of [128, 512, 1024, 2048, 4096]; learning rates of [$1e-4$, $1e-5$], weight decays of [$1e-6$, $1e-7$], and the number of pseudo bags of [2, 4, 6, 12, 24]
\item without positional encoding, final hyperparameters: a hidden layer size of 128, a learning rate of $1e-4$, a weight decay of $1e-6$, and 4 pseudo bags
\item with positional encoding, final hyperparameters: a hidden layer size of 1024, a learning rate of $1e-5$, a weight decay of $1e-6$, and 24 pseudo bags
\end{itemize}
\end{itemize}
\item TransMIL + Other Transformers
\begin{itemize}
\item Synthetic Data
\begin{itemize}
\item fixed hyperparameter: the non-TransMIL transformer used multihead attention with 8 attention heads
\item hyperparameters searched over: hidden layer sizes of [32, 64, 128]; learning rates of [$1e-4$, $1e-5$] and weight decays of [$1e-6$, $1e-7$]
\item without positional encoding, final hyperparameters: a hidden layer size of 64, a learning rate of $1e-4$, and a weight decay of $1e-6$
\item with positional encoding, final hyperparameters: a hidden layer size of 32, a learning rate of $1e-4$, and a weight decay of $1e-6$
\end{itemize}
\item Real Data
\begin{itemize}
\item fixed hyperparameter: the non-TransMIL transformer used multihead attention with 8 attention heads
\item hyperparameters searched over: hidden layer sizes of [128, 512, 1024]; learning rates of [$1e-4$, $1e-5$], and weight decays of [$1e-6$, $1e-7$].
\item without positional encoding, final hyperparameters: a hidden layer size of 128, a learning rate of $1e-5$, and a weight decay of $1e-7$
\item with positional encoding, final hyperparameters: a hidden layer size of 128, a learning rate of $1e-5$, and a weight decay of $1e-7$
\end{itemize}
\end{itemize}
\end{itemize}
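The hyperparameter searches above are plain grid searches over the listed values. The sketch below enumerates the ABDMIL real-data grid as an example; the variable names and the dictionary layout are illustrative assumptions, not code from any of the methods' repositories. Each configuration corresponds to one training run, and the "final hyperparameters" reported above are the configurations that performed best on validation.
\begin{verbatim}
from itertools import product

# Illustrative enumeration of one of the grids listed above (ABDMIL, real
# data). Names here are assumptions for the sketch only.
hidden_sizes   = [128, 512, 1024, 2048, 4096]
learning_rates = [1e-4, 1e-5]
weight_decays  = [1e-6, 1e-7]

grid = [
    {"hidden_size": h, "learning_rate": lr, "weight_decay": wd}
    for h, lr, wd in product(hidden_sizes, learning_rates, weight_decays)
]
print(len(grid), "configurations to train")  # 5 * 2 * 2 = 20 runs
\end{verbatim}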