forked from projectcalico/felix
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfelix.go
928 lines (852 loc) · 31.5 KB
/
felix.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
// Copyright (c) 2016-2017 Tigera, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"context"
"errors"
"fmt"
"math/rand"
"net/http"
"os"
"os/exec"
"os/signal"
"runtime"
"runtime/debug"
"syscall"
"time"
docopt "github.com/docopt/docopt-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
log "github.com/sirupsen/logrus"
"k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"github.com/projectcalico/felix/buildinfo"
"github.com/projectcalico/felix/calc"
"github.com/projectcalico/felix/config"
_ "github.com/projectcalico/felix/config"
dp "github.com/projectcalico/felix/dataplane"
"github.com/projectcalico/felix/logutils"
"github.com/projectcalico/felix/proto"
"github.com/projectcalico/felix/statusrep"
"github.com/projectcalico/felix/usagerep"
apiv3 "github.com/projectcalico/libcalico-go/lib/apis/v3"
"github.com/projectcalico/libcalico-go/lib/backend"
bapi "github.com/projectcalico/libcalico-go/lib/backend/api"
"github.com/projectcalico/libcalico-go/lib/backend/model"
"github.com/projectcalico/libcalico-go/lib/backend/syncersv1/updateprocessors"
"github.com/projectcalico/libcalico-go/lib/backend/watchersyncer"
errors2 "github.com/projectcalico/libcalico-go/lib/errors"
"github.com/projectcalico/libcalico-go/lib/health"
"github.com/projectcalico/libcalico-go/lib/set"
"github.com/projectcalico/typha/pkg/syncclient"
)
const usage = `Felix, the Calico per-host daemon.
Usage:
calico-felix [options]
Options:
-c --config-file=<filename> Config file to load [default: /etc/calico/felix.cfg].
--version Print the version and exit.
`
const (
// Our default value for GOGC if it is not set. This is the percentage that heap usage must
// grow by to trigger a garbage collection. Go's default is 100, meaning that 50% of the
// heap can be lost to garbage. We reduce it to this value to trade increased CPU usage for
// lower occupancy.
defaultGCPercent = 20
// String sent on the failure report channel to indicate we're shutting down for config
// change.
reasonConfigChanged = "config changed"
// Process return code used to report a config change. This is the same as the code used
// by SIGHUP, which means that the wrapper script also restarts Felix on a SIGHUP.
configChangedRC = 129
)
// main is the entry point to the calico-felix binary.
//
// Its main role is to sequence Felix's startup by:
//
// Initialising early logging config (log format and early debug settings).
//
// Parsing command line parameters.
//
// Loading datastore configuration from the environment or config file.
//
// Loading more configuration from the datastore (this is retried until success).
//
// Starting the configured internal (golang) or external dataplane driver.
//
// Starting the background processing goroutines, which load and keep in sync with the
// state from the datastore, the "calculation graph".
//
// Starting the usage reporting and prometheus metrics endpoint threads (if configured).
//
// Then, it defers to monitorAndManageShutdown(), which blocks until one of the components
// fails, then attempts a graceful shutdown. At that point, all the processing is in
// background goroutines.
//
// To avoid having to maintain rarely-used code paths, Felix handles updates to its
// main config parameters by exiting and allowing itself to be restarted by the init
// daemon.
func main() {
// Go's RNG is not seeded by default. Do that now.
rand.Seed(time.Now().UTC().UnixNano())
// Special-case handling for environment variable-configured logging:
// Initialise early so we can trace out config parsing.
logutils.ConfigureEarlyLogging()
ctx := context.Background()
if os.Getenv("GOGC") == "" {
// Tune the GC to trade off a little extra CPU usage for significantly lower
// occupancy at high scale. This is worthwhile because Felix runs per-host so
// any occupancy improvement is multiplied by the number of hosts.
log.Debugf("No GOGC value set, defaulting to %d%%.", defaultGCPercent)
debug.SetGCPercent(defaultGCPercent)
}
// Parse command-line args.
version := "Version: " + buildinfo.GitVersion + "\n" +
"Full git commit ID: " + buildinfo.GitRevision + "\n" +
"Build date: " + buildinfo.BuildDate + "\n"
arguments, err := docopt.Parse(usage, nil, true, version, false)
if err != nil {
println(usage)
log.Fatalf("Failed to parse usage, exiting: %v", err)
}
buildInfoLogCxt := log.WithFields(log.Fields{
"version": buildinfo.GitVersion,
"buildDate": buildinfo.BuildDate,
"gitCommit": buildinfo.GitRevision,
"GOMAXPROCS": runtime.GOMAXPROCS(0),
})
buildInfoLogCxt.Info("Felix starting up")
log.Infof("Command line arguments: %v", arguments)
// Load the configuration from all the different sources including the
// datastore and merge. Keep retrying on failure. We'll sit in this
// loop until the datastore is ready.
log.Infof("Loading configuration...")
var backendClient bapi.Client
var configParams *config.Config
var typhaAddr string
configRetry:
for {
// Load locally-defined config, including the datastore connection
// parameters. First the environment variables.
configParams = config.New()
envConfig := config.LoadConfigFromEnvironment(os.Environ())
// Then, the config file.
configFile := arguments["--config-file"].(string)
fileConfig, err := config.LoadConfigFile(configFile)
if err != nil {
log.WithError(err).WithField("configFile", configFile).Error(
"Failed to load configuration file")
time.Sleep(1 * time.Second)
continue configRetry
}
// Parse and merge the local config.
configParams.UpdateFrom(envConfig, config.EnvironmentVariable)
if configParams.Err != nil {
log.WithError(configParams.Err).WithField("configFile", configFile).Error(
"Failed to parse configuration environment variable")
time.Sleep(1 * time.Second)
continue configRetry
}
configParams.UpdateFrom(fileConfig, config.ConfigFile)
if configParams.Err != nil {
log.WithError(configParams.Err).WithField("configFile", configFile).Error(
"Failed to parse configuration file")
time.Sleep(1 * time.Second)
continue configRetry
}
// We should now have enough config to connect to the datastore
// so we can load the remainder of the config.
datastoreConfig := configParams.DatastoreConfig()
backendClient, err = backend.NewClient(datastoreConfig)
if err != nil {
log.WithError(err).Error("Failed to create datastore client")
time.Sleep(1 * time.Second)
continue configRetry
}
globalConfig, hostConfig, err := loadConfigFromDatastore(
ctx, backendClient, configParams.FelixHostname)
if err != nil {
log.WithError(err).Error("Failed to get config from datastore")
time.Sleep(1 * time.Second)
continue configRetry
}
configParams.UpdateFrom(globalConfig, config.DatastoreGlobal)
configParams.UpdateFrom(hostConfig, config.DatastorePerHost)
configParams.Validate()
if configParams.Err != nil {
log.WithError(configParams.Err).Error(
"Failed to parse/validate configuration from datastore.")
time.Sleep(1 * time.Second)
continue configRetry
}
// We now have some config flags that affect how we configure the syncer.
// After loading the config from the datastore, reconnect, possibly with new
// config. We don't need to re-load the configuration _again_ because the
// calculation graph will spot if the config has changed since we were initialised.
datastoreConfig = configParams.DatastoreConfig()
backendClient, err = backend.NewClient(datastoreConfig)
if err != nil {
log.WithError(err).Error("Failed to (re)connect to datastore")
time.Sleep(1 * time.Second)
continue configRetry
}
// If we're configured to discover Typha, do that now so we can retry if we fail.
typhaAddr, err = discoverTyphaAddr(configParams)
if err != nil {
log.WithError(err).Error("Typha discovery enabled but discovery failed.")
time.Sleep(1 * time.Second)
continue configRetry
}
break configRetry
}
// If we get here, we've loaded the configuration successfully.
// Update log levels before we do anything else.
logutils.ConfigureLogging(configParams)
// Since we may have enabled more logging, log with the build context
// again.
buildInfoLogCxt.WithField("config", configParams).Info(
"Successfully loaded configuration.")
// Health monitoring, for liveness and readiness endpoints.
healthAggregator := health.NewHealthAggregator()
// Start up the dataplane driver. This may be the internal go-based driver or an external
// one.
var dpDriver dp.DataplaneDriver
var dpDriverCmd *exec.Cmd
dpDriver, dpDriverCmd = dp.StartDataplaneDriver(configParams, healthAggregator)
// Initialise the glue logic that connects the calculation graph to/from the dataplane driver.
log.Info("Connect to the dataplane driver.")
failureReportChan := make(chan string)
var connToUsageRepUpdChan chan map[string]string
if configParams.UsageReportingEnabled {
// Make a channel for the connector to use to send updates to the usage reporter.
// (Otherwise, we pass in a nil channel, which disables such updates.)
connToUsageRepUpdChan = make(chan map[string]string, 1)
}
dpConnector := newConnector(configParams, connToUsageRepUpdChan, backendClient, dpDriver, failureReportChan)
// Now create the calculation graph, which receives updates from the
// datastore and outputs dataplane updates for the dataplane driver.
//
// The Syncer has its own thread and we use an extra thread for the
// Validator, just to pipeline that part of the calculation then the
// main calculation graph runs in a single thread for simplicity.
// The output of the calculation graph arrives at the dataplane
// connection via channel.
//
// Syncer -chan-> Validator -chan-> Calc graph -chan-> dataplane
// KVPair KVPair protobufs
// Get a Syncer from the datastore, or a connection to our remote sync daemon, Typha,
// which will feed the calculation graph with updates, bringing Felix into sync.
var syncer Startable
var typhaConnection *syncclient.SyncerClient
syncerToValidator := calc.NewSyncerCallbacksDecoupler()
if typhaAddr != "" {
// Use a remote Syncer, via the Typha server.
log.WithField("addr", typhaAddr).Info("Connecting to Typha.")
typhaConnection = syncclient.New(
typhaAddr,
buildinfo.GitVersion,
configParams.FelixHostname,
fmt.Sprintf("Revision: %s; Build date: %s",
buildinfo.GitRevision, buildinfo.BuildDate),
syncerToValidator,
&syncclient.Options{
ReadTimeout: configParams.TyphaReadTimeout,
WriteTimeout: configParams.TyphaWriteTimeout,
},
)
} else {
// Use the syncer locally.
syncer = backendClient.Syncer(syncerToValidator)
}
log.WithField("syncer", syncer).Info("Created Syncer")
// Create the ipsets/active policy calculation graph, which will
// do the dynamic calculation of ipset memberships and active policies
// etc.
asyncCalcGraph := calc.NewAsyncCalcGraph(configParams, dpConnector.ToDataplane, healthAggregator)
if configParams.UsageReportingEnabled {
// Usage reporting enabled, add stats collector to graph. When it detects an update
// to the stats, it makes a callback, which we use to send an update on a channel.
// We use a buffered channel here to avoid blocking the calculation graph.
statsChanIn := make(chan calc.StatsUpdate, 1)
statsCollector := calc.NewStatsCollector(func(stats calc.StatsUpdate) error {
statsChanIn <- stats
return nil
})
statsCollector.RegisterWith(asyncCalcGraph.Dispatcher)
// Rather than sending the updates directly to the usage reporting thread, we
// decouple with an extra goroutine. This prevents blocking the calculation graph
// goroutine if the usage reporting goroutine is blocked on IO, for example.
// Using a buffered channel wouldn't work here because the usage reporting
// goroutine can block for a long time on IO so we could build up a long queue.
statsChanOut := make(chan calc.StatsUpdate)
go func() {
var statsChanOutOrNil chan calc.StatsUpdate
var stats calc.StatsUpdate
for {
select {
case stats = <-statsChanIn:
// Got a stats update, activate the output channel.
log.WithField("stats", stats).Debug("Buffer: stats update received")
statsChanOutOrNil = statsChanOut
case statsChanOutOrNil <- stats:
// Passed on the update, deactivate the output channel until
// the next update.
log.WithField("stats", stats).Debug("Buffer: stats update sent")
statsChanOutOrNil = nil
}
}
}()
usageRep := usagerep.New(
configParams.UsageReportingInitialDelaySecs,
configParams.UsageReportingIntervalSecs,
statsChanOut,
connToUsageRepUpdChan,
)
go usageRep.PeriodicallyReportUsage(context.Background())
} else {
// Usage reporting disabled, but we still want a stats collector for the
// felix_cluster_* metrics. Register a no-op function as the callback.
statsCollector := calc.NewStatsCollector(func(stats calc.StatsUpdate) error {
return nil
})
statsCollector.RegisterWith(asyncCalcGraph.Dispatcher)
}
// Create the validator, which sits between the syncer and the
// calculation graph.
validator := calc.NewValidationFilter(asyncCalcGraph)
// Start the background processing threads.
if syncer != nil {
log.Infof("Starting the datastore Syncer")
syncer.Start()
} else {
log.Infof("Starting the Typha connection")
err := typhaConnection.Start(context.Background())
if err != nil {
log.WithError(err).Fatal("Failed to connect to Typha")
}
go func() {
typhaConnection.Finished.Wait()
failureReportChan <- "Connection to Typha failed"
}()
}
go syncerToValidator.SendTo(validator)
asyncCalcGraph.Start()
log.Infof("Started the processing graph")
var stopSignalChans []chan<- bool
if configParams.EndpointReportingEnabled {
delay := configParams.EndpointReportingDelaySecs
log.WithField("delay", delay).Info(
"Endpoint status reporting enabled, starting status reporter")
dpConnector.statusReporter = statusrep.NewEndpointStatusReporter(
configParams.FelixHostname,
dpConnector.StatusUpdatesFromDataplane,
dpConnector.InSync,
dpConnector.datastore,
delay,
delay*180,
)
dpConnector.statusReporter.Start()
}
// Start communicating with the dataplane driver.
dpConnector.Start()
// Send the opening message to the dataplane driver, giving it its
// config.
dpConnector.ToDataplane <- &proto.ConfigUpdate{
Config: configParams.RawValues(),
}
if configParams.PrometheusMetricsEnabled {
log.Info("Prometheus metrics enabled. Starting server.")
gaugeHost := prometheus.NewGauge(prometheus.GaugeOpts{
Name: "felix_host",
Help: "Configured Felix hostname (as a label), typically used in grouping/aggregating stats; the label defaults to the hostname of the host but can be overridden by configuration. The value of the gauge is always set to 1.",
ConstLabels: prometheus.Labels{"host": configParams.FelixHostname},
})
gaugeHost.Set(1)
prometheus.MustRegister(gaugeHost)
go servePrometheusMetrics(configParams)
}
if configParams.HealthEnabled {
log.WithField("port", configParams.HealthPort).Info("Health enabled. Starting server.")
go healthAggregator.ServeHTTP(configParams.HealthPort)
}
// On receipt of SIGUSR1, write out heap profile.
logutils.DumpHeapMemoryOnSignal(configParams)
// Now monitor the worker process and our worker threads and shut
// down the process gracefully if they fail.
monitorAndManageShutdown(failureReportChan, dpDriverCmd, stopSignalChans)
}
func servePrometheusMetrics(configParams *config.Config) {
for {
log.WithField("port", configParams.PrometheusMetricsPort).Info("Starting prometheus metrics endpoint")
if configParams.PrometheusGoMetricsEnabled && configParams.PrometheusProcessMetricsEnabled {
log.Info("Including Golang & Process metrics")
} else {
if !configParams.PrometheusGoMetricsEnabled {
log.Info("Discarding Golang metrics")
prometheus.Unregister(prometheus.NewGoCollector())
}
if !configParams.PrometheusProcessMetricsEnabled {
log.Info("Discarding process metrics")
prometheus.Unregister(prometheus.NewProcessCollector(os.Getpid(), ""))
}
}
http.Handle("/metrics", promhttp.Handler())
err := http.ListenAndServe(fmt.Sprintf(":%v", configParams.PrometheusMetricsPort), nil)
log.WithError(err).Error(
"Prometheus metrics endpoint failed, trying to restart it...")
time.Sleep(1 * time.Second)
}
}
func monitorAndManageShutdown(failureReportChan <-chan string, driverCmd *exec.Cmd, stopSignalChans []chan<- bool) {
// Ask the runtime to tell us if we get a term/int signal.
signalChan := make(chan os.Signal, 1)
signal.Notify(signalChan, syscall.SIGTERM)
signal.Notify(signalChan, syscall.SIGINT)
signal.Notify(signalChan, syscall.SIGHUP)
// Start a background thread to tell us when the dataplane driver stops.
// If the driver stops unexpectedly, we'll terminate this process.
// If this process needs to stop, we'll kill the driver and then wait
// for the message from the background thread.
driverStoppedC := make(chan bool)
go func() {
if driverCmd == nil {
log.Info("No driver process to monitor")
return
}
err := driverCmd.Wait()
log.WithError(err).Warn("Driver process stopped")
driverStoppedC <- true
}()
// Wait for one of the channels to give us a reason to shut down.
driverAlreadyStopped := driverCmd == nil
receivedFatalSignal := false
var reason string
select {
case <-driverStoppedC:
reason = "Driver stopped"
driverAlreadyStopped = true
case sig := <-signalChan:
if sig == syscall.SIGHUP {
log.Warning("Received a SIGHUP, treating as a request to reload config")
reason = reasonConfigChanged
} else {
reason = fmt.Sprintf("Received OS signal %v", sig)
receivedFatalSignal = true
}
case reason = <-failureReportChan:
}
logCxt := log.WithField("reason", reason)
logCxt.Warn("Felix is shutting down")
// Notify other components to stop.
for _, c := range stopSignalChans {
select {
case c <- true:
default:
}
}
if !driverAlreadyStopped {
// Driver may still be running, just in case the driver is
// unresponsive, start a thread to kill this process if we
// don't manage to kill the driver.
logCxt.Info("Driver still running, trying to shut it down...")
giveUpOnSigTerm := make(chan bool)
go func() {
time.Sleep(4 * time.Second)
giveUpOnSigTerm <- true
time.Sleep(1 * time.Second)
log.Fatal("Failed to wait for driver to exit, giving up.")
}()
// Signal to the driver to exit.
driverCmd.Process.Signal(syscall.SIGTERM)
select {
case <-driverStoppedC:
logCxt.Info("Driver shut down after SIGTERM")
case <-giveUpOnSigTerm:
logCxt.Error("Driver did not respond to SIGTERM, sending SIGKILL")
driverCmd.Process.Kill()
<-driverStoppedC
logCxt.Info("Driver shut down after SIGKILL")
}
}
if !receivedFatalSignal {
// We're exiting due to a failure or a config change, wait
// a couple of seconds to ensure that we don't go into a tight
// restart loop (which would make the init daemon in calico/node give
// up trying to restart us).
logCxt.Info("Sleeping to avoid tight restart loop.")
go func() {
time.Sleep(2 * time.Second)
if reason == reasonConfigChanged {
// We want to exit with a specific RC, but if we call Fatal() it will exit for us
// with the wrong RC. We need to call Fatal or Panic to force the log to be flushed
// so call Panic() but use defer to force an exit before the stack trace is printed.
defer os.Exit(configChangedRC)
logCxt.Panic("Exiting for config change")
return
}
logCxt.Fatal("Exiting.")
}()
for {
sig := <-signalChan
if sig == syscall.SIGHUP {
logCxt.Warning("Ignoring SIGHUP because we're already shutting down")
continue
}
logCxt.WithField("signal", sig).Fatal(
"Signal received while shutting down, exiting immediately")
}
}
logCxt.Fatal("Exiting immediately")
}
func loadConfigFromDatastore(
ctx context.Context, client bapi.Client, hostname string,
) (globalConfig, hostConfig map[string]string, err error) {
// The configuration is split over 3 different resource types and 4 different resource
// instances in the v3 data model:
// - ClusterInformation (global): name "default"
// - FelixConfiguration (global): name "default"
// - FelixConfiguration (per-host): name "node.<hostname>"
// - Node (per-host): name: <hostname>
// Get the global values and host specific values separately. We re-use the updateprocessor
// logic to convert the single v3 resource to a set of v1 key/values.
hostConfig = make(map[string]string)
globalConfig = make(map[string]string)
var ready bool
err = getAndMergeConfig(
ctx, client, globalConfig,
apiv3.KindClusterInformation, "default",
updateprocessors.NewClusterInfoUpdateProcessor(),
&ready,
)
if err != nil {
return
}
if !ready {
// The ClusterInformation struct should contain the ready flag, if it is not set, abort.
err = errors.New("datastore is not ready or has not been initialised")
return
}
err = getAndMergeConfig(
ctx, client, globalConfig,
apiv3.KindFelixConfiguration, "default",
updateprocessors.NewFelixConfigUpdateProcessor(),
&ready,
)
if err != nil {
return
}
err = getAndMergeConfig(
ctx, client, hostConfig,
apiv3.KindFelixConfiguration, "node."+hostname,
updateprocessors.NewFelixConfigUpdateProcessor(),
&ready,
)
if err != nil {
return
}
err = getAndMergeConfig(
ctx, client, hostConfig,
apiv3.KindNode, hostname,
updateprocessors.NewFelixNodeUpdateProcessor(),
&ready,
)
if err != nil {
return
}
return
}
// getAndMergeConfig gets the v3 resource configuration extracts the separate config values
// (where each configuration value is stored in a field of the v3 resource Spec) and merges into
// the supplied map, as required by our v1-style configuration loader.
func getAndMergeConfig(
ctx context.Context, client bapi.Client, config map[string]string,
kind string, name string,
configConverter watchersyncer.SyncerUpdateProcessor,
ready *bool,
) error {
logCxt := log.WithFields(log.Fields{"kind": kind, "name": name})
cfg, err := client.Get(ctx, model.ResourceKey{
Kind: kind,
Name: name,
Namespace: "",
}, "")
if err != nil {
switch err.(type) {
case errors2.ErrorResourceDoesNotExist:
logCxt.Info("No config of this type")
return nil
default:
logCxt.WithError(err).Info("Failed to load config from datastore")
return err
}
}
// Re-use the update processor logic implemented for the Syncer. We give it a v3 config
// object in a KVPair and it uses the annotations defined on it to split it into v1-style
// KV pairs. Log any errors - but don't fail completely to avoid cyclic restarts.
v1kvs, err := configConverter.Process(cfg)
if err != nil {
logCxt.WithError(err).Error("Failed to convert configuration")
}
// Loop through the converted values and update our config map with values from either the
// Global or Host configs.
for _, v1KV := range v1kvs {
if _, ok := v1KV.Key.(model.ReadyFlagKey); ok {
logCxt.WithField("ready", v1KV.Value).Info("Loaded ready flag")
if v1KV.Value == true {
*ready = true
}
} else if v1KV.Value != nil {
switch k := v1KV.Key.(type) {
case model.GlobalConfigKey:
config[k.Name] = v1KV.Value.(string)
case model.HostConfigKey:
config[k.Name] = v1KV.Value.(string)
default:
logCxt.WithField("KV", v1KV).Debug("Skipping config - not required for initial loading")
}
}
}
return nil
}
type DataplaneConnector struct {
config *config.Config
configUpdChan chan<- map[string]string
ToDataplane chan interface{}
StatusUpdatesFromDataplane chan interface{}
InSync chan bool
failureReportChan chan<- string
dataplane dp.DataplaneDriver
datastore bapi.Client
statusReporter *statusrep.EndpointStatusReporter
datastoreInSync bool
firstStatusReportSent bool
}
type Startable interface {
Start()
}
func newConnector(configParams *config.Config,
configUpdChan chan<- map[string]string,
datastore bapi.Client,
dataplane dp.DataplaneDriver,
failureReportChan chan<- string,
) *DataplaneConnector {
felixConn := &DataplaneConnector{
config: configParams,
configUpdChan: configUpdChan,
datastore: datastore,
ToDataplane: make(chan interface{}),
StatusUpdatesFromDataplane: make(chan interface{}),
InSync: make(chan bool, 1),
failureReportChan: failureReportChan,
dataplane: dataplane,
}
return felixConn
}
func (fc *DataplaneConnector) readMessagesFromDataplane() {
defer func() {
fc.shutDownProcess("Failed to read messages from dataplane")
}()
log.Info("Reading from dataplane driver pipe...")
for {
payload, err := fc.dataplane.RecvMessage()
if err != nil {
log.WithError(err).Error("Failed to read from front-end socket")
fc.shutDownProcess("Failed to read from front-end socket")
}
log.WithField("payload", payload).Debug("New message from dataplane")
switch msg := payload.(type) {
case *proto.ProcessStatusUpdate:
fc.handleProcessStatusUpdate(msg)
case *proto.WorkloadEndpointStatusUpdate:
if fc.statusReporter != nil {
fc.StatusUpdatesFromDataplane <- msg
}
case *proto.WorkloadEndpointStatusRemove:
if fc.statusReporter != nil {
fc.StatusUpdatesFromDataplane <- msg
}
case *proto.HostEndpointStatusUpdate:
if fc.statusReporter != nil {
fc.StatusUpdatesFromDataplane <- msg
}
case *proto.HostEndpointStatusRemove:
if fc.statusReporter != nil {
fc.StatusUpdatesFromDataplane <- msg
}
default:
log.WithField("msg", msg).Warning("Unknown message from dataplane")
}
log.Debug("Finished handling message from front-end")
}
}
func (fc *DataplaneConnector) handleProcessStatusUpdate(msg *proto.ProcessStatusUpdate) {
log.Debugf("Status update from dataplane driver: %v", *msg)
statusReport := model.StatusReport{
Timestamp: msg.IsoTimestamp,
UptimeSeconds: msg.Uptime,
FirstUpdate: !fc.firstStatusReportSent,
}
kv := model.KVPair{
Key: model.ActiveStatusReportKey{Hostname: fc.config.FelixHostname},
Value: &statusReport,
TTL: fc.config.ReportingTTLSecs,
}
_, err := fc.datastore.Apply(&kv)
if err != nil {
log.Warningf("Failed to write status to datastore: %v", err)
} else {
fc.firstStatusReportSent = true
}
kv = model.KVPair{
Key: model.LastStatusReportKey{Hostname: fc.config.FelixHostname},
Value: &statusReport,
}
_, err = fc.datastore.Apply(&kv)
if err != nil {
log.Warningf("Failed to write status to datastore: %v", err)
}
}
var handledConfigChanges = set.From("CalicoVersion", "ClusterGUID", "ClusterType")
func (fc *DataplaneConnector) sendMessagesToDataplaneDriver() {
defer func() {
fc.shutDownProcess("Failed to send messages to dataplane")
}()
var config map[string]string
for {
msg := <-fc.ToDataplane
switch msg := msg.(type) {
case *proto.InSync:
log.Info("Datastore now in sync.")
if !fc.datastoreInSync {
log.Info("Datastore in sync for first time, sending message to status reporter.")
fc.datastoreInSync = true
fc.InSync <- true
}
case *proto.ConfigUpdate:
if config != nil {
log.WithFields(log.Fields{
"old": config,
"new": msg.Config,
}).Info("Config updated, checking whether we need to restart")
restartNeeded := false
for kNew, vNew := range msg.Config {
logCxt := log.WithFields(log.Fields{"key": kNew, "new": vNew})
if vOld, prs := config[kNew]; !prs {
logCxt = logCxt.WithField("updateType", "add")
} else if vNew != vOld {
logCxt = logCxt.WithFields(log.Fields{"old": vOld, "updateType": "update"})
} else {
continue
}
if handledConfigChanges.Contains(kNew) {
logCxt.Info("Config change can be handled without restart")
continue
}
logCxt.Warning("Config change requires restart")
restartNeeded = true
}
for kOld, vOld := range config {
logCxt := log.WithFields(log.Fields{"key": kOld, "old": vOld, "updateType": "delete"})
if _, prs := msg.Config[kOld]; prs {
// Key was present in the message so we've handled above.
continue
}
if handledConfigChanges.Contains(kOld) {
logCxt.Info("Config change can be handled without restart")
continue
}
logCxt.Warning("Config change requires restart")
restartNeeded = true
}
if restartNeeded {
fc.shutDownProcess("config changed")
}
}
// Take a copy of the config to compare against next time.
config = make(map[string]string)
for k, v := range msg.Config {
config[k] = v
}
if fc.configUpdChan != nil {
// Send the config over to the usage reporter.
fc.configUpdChan <- config
}
case *calc.DatastoreNotReady:
log.Warn("Datastore became unready, need to restart.")
fc.shutDownProcess("datastore became unready")
}
if err := fc.dataplane.SendMessage(msg); err != nil {
fc.shutDownProcess("Failed to write to dataplane driver")
}
}
}
func (fc *DataplaneConnector) shutDownProcess(reason string) {
// Send a failure report to the managed shutdown thread then give it
// a few seconds to do the shutdown.
fc.failureReportChan <- reason
time.Sleep(5 * time.Second)
// The graceful shutdown failed, terminate the process.
log.Panic("Managed shutdown failed. Panicking.")
}
func (fc *DataplaneConnector) Start() {
// Start a background thread to write to the dataplane driver.
go fc.sendMessagesToDataplaneDriver()
// Start background thread to read messages from dataplane driver.
go fc.readMessagesFromDataplane()
}
var ErrServiceNotReady = errors.New("Kubernetes service missing IP or port.")
func discoverTyphaAddr(configParams *config.Config) (string, error) {
if configParams.TyphaAddr != "" {
// Explicit address; trumps other sources of config.
return configParams.TyphaAddr, nil
}
if configParams.TyphaK8sServiceName == "" {
// No explicit address, and no service name, not using Typha.
return "", nil
}
// If we get here, we need to look up the Typha service using the k8s API.
// TODO Typha: support Typha lookup without using rest.InClusterConfig().
k8sconf, err := rest.InClusterConfig()
if err != nil {
log.WithError(err).Error("Unable to create Kubernetes config.")
return "", err
}
clientset, err := kubernetes.NewForConfig(k8sconf)
if err != nil {
log.WithError(err).Error("Unable to create Kubernetes client set.")
return "", err
}
svcClient := clientset.CoreV1().Services(configParams.TyphaK8sNamespace)
svc, err := svcClient.Get(configParams.TyphaK8sServiceName, v1.GetOptions{})
if err != nil {
log.WithError(err).Error("Unable to get Typha service from Kubernetes.")
return "", err
}
host := svc.Spec.ClusterIP
log.WithField("clusterIP", host).Info("Found Typha ClusterIP.")
if host == "" {
log.WithError(err).Error("Typha service had no ClusterIP.")
return "", ErrServiceNotReady
}
for _, p := range svc.Spec.Ports {
if p.Name == "calico-typha" {
log.WithField("port", p).Info("Found Typha service port.")
typhaAddr := fmt.Sprintf("%s:%v", host, p.Port)
return typhaAddr, nil
}
}
log.Error("Didn't find Typha service port.")
return "", ErrServiceNotReady
}