-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfaultTolerance.tex
76 lines (58 loc) · 2.06 KB
/
faultTolerance.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
\begin{frame}[fragile]
\frametitle{Fault Tolerance in Charm++/AMPI}
\begin{itemize}
\item Four Approaches:
\begin{itemize}
\item Disk-based checkpoint/restart
\item In-memory double checkpoint/restart
\item Experimental: Proactive object migration
\item Experimental: Message-logging for scalable fault tolerance
\end{itemize}
\item Common Features:
\begin{itemize}
\item Easy checkpoint
\item Migrate-to-disk leverages object-migration capabilities
\item Based on dynamic runtime capabilities
\item Can be used in concert with load-balancing schemes
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Checkpointing to the file system: Split Execution}
\begin{itemize}
\item The common form of checkpointing
\begin{itemize}
\item The job runs for 5 hours, then continues in the next
allocation on another day!
\end{itemize}
\item The existing Charm++ infrastructure for chare migration helps
\item Just ``migrate'' chares to disk
\item The call to checkpoint the application is made in the main chare at a synchronization point
\end{itemize}
\begin{lstlisting}[basicstyle=\footnotesize]
CkCallback cb(CkIndex_Hello::SayHi(),helloProxy);
CkStartCheckpoint("log",cb);
> ./charmrun hello +p4 +restart log
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{In-memory checkpointing with auto restart}
\begin{itemize}
\item Idea: checkpoint data in a buddy processor's memory, in addition
to a local checkpoint
\item System automatically detects when a node crashes
\item Failed process is restarted on a spare, and retrieves its
checkpoint from the buddy
\item (you can also do without the spare)
\item Every other processor retrieves its local checkpoint
\end{itemize}
\begin{lstlisting}[basicstyle=\footnotesize]
void CkStartMemCheckpoint(CkCallback &cb)
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\includegraphics[width=\textwidth]{figures/checkpointTimeIntrepid.png}
\end{frame}
\begin{frame}[fragile]
\includegraphics[width=\textwidth]{figures/restartTimeIntrepid.png}
\end{frame}