-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
gofer.go
842 lines (761 loc) · 28 KB
/
gofer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cmd
import (
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"regexp"
"runtime"
"runtime/debug"
"strings"
"github.com/google/subcommands"
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy/vfio"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cmd/util"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/fsgofer"
"gvisor.dev/gvisor/runsc/fsgofer/filter"
"gvisor.dev/gvisor/runsc/profile"
"gvisor.dev/gvisor/runsc/specutils"
)
// caps is the list of Linux capabilities the Gofer process keeps when it
// re-executes itself with reduced privileges (see goferCaps and the
// apply-caps path in Execute). It is limited to what is needed to operate
// on container files and to chroot.
var caps = []string{
	"CAP_CHOWN",
	"CAP_DAC_OVERRIDE",
	"CAP_DAC_READ_SEARCH",
	"CAP_FOWNER",
	"CAP_FSETID",
	"CAP_SYS_CHROOT",
}
// goferCaps is the minimal set of capabilities needed by the Gofer to operate
// on files. The same list (caps) is used for the bounding, effective, and
// permitted sets; no inheritable or ambient capabilities are granted.
var goferCaps = &specs.LinuxCapabilities{
	Bounding:  caps,
	Effective: caps,
	Permitted: caps,
}
// goferSyncFDs contains file descriptors that are used for synchronization
// of the Gofer startup process against other processes.
//
// A value of -1 means the corresponding FD is not set (see setFlags, which
// defaults each flag to -1).
type goferSyncFDs struct {
	// nvproxyFD is a file descriptor that is used to wait until
	// nvproxy-related setup is done. This setup involves creating mounts in the
	// Gofer process's mount namespace.
	// If this is set, this FD is the first that the Gofer waits for.
	nvproxyFD int

	// usernsFD is a file descriptor that is used to wait until
	// user namespace ID mappings are established in the Gofer's userns.
	// If this is set, this FD is the second that the Gofer waits for.
	usernsFD int

	// procMountFD is a file descriptor that has to be closed when the
	// procfs mount isn't needed anymore. It is read by the procfs unmounter
	// process.
	// If this is set, this FD is the last that the Gofer interacts with and
	// closes.
	procMountFD int
}
// Gofer implements subcommands.Command for the "gofer" command, which starts a
// filesystem gofer. This command should not be called directly.
type Gofer struct {
	// bundleDir is the path to the root of the bundle directory (--bundle).
	bundleDir string
	// ioFDs lists FDs used to connect gofer servers; ordered to match
	// --gofer-mount-confs (--io-fds).
	ioFDs intFlags
	// devIoFD is an optional FD to connect the /dev gofer server; -1 when
	// unused (--dev-io-fd).
	devIoFD int
	// applyCaps indicates whether capabilities should be applied to restrict
	// what the Gofer process can do (--apply-caps).
	applyCaps bool
	// setUpRoot indicates whether an empty root should be set up for the
	// process (--setup-root).
	setUpRoot bool
	// mountConfs describes how the gofer mounts are configured: root first,
	// then mounts in spec order (--gofer-mount-confs).
	mountConfs boot.GoferMountConfFlags
	// specFD is the FD carrying the container spec (--spec-fd).
	specFD int
	// mountsFD is the FD to which the resolved mount list is written
	// (--mounts-fd).
	mountsFD int
	// goferToHostRPCFD is the gofer-to-host RPC file descriptor (--rpc-fd).
	goferToHostRPCFD int
	// profileFDs holds FDs used for profiling output.
	profileFDs profile.FDArgs
	// syncFDs holds startup-synchronization FDs; see goferSyncFDs.
	syncFDs goferSyncFDs
	// stopProfiling stops profiling when called; set in Execute, nil before.
	stopProfiling func()
}
// Name implements subcommands.Command. It returns the subcommand name used
// on the runsc command line.
func (*Gofer) Name() string {
	return "gofer"
}
// Synopsis implements subcommands.Command. It returns a one-line description
// of the gofer subcommand.
func (*Gofer) Synopsis() string {
	// The string contains no formatting verbs, so fmt.Sprintf was
	// unnecessary (staticcheck S1039); return the constant directly.
	return "launch a gofer process that proxies access to container files"
}
// Usage implements subcommands.Command. It returns the usage string printed
// with flag help.
func (*Gofer) Usage() string {
	return `gofer [flags]`
}
// SetFlags implements subcommands.Command. It registers all gofer flags on
// the given FlagSet. Registration order is kept as-is since it determines
// the order flags appear in usage output.
func (g *Gofer) SetFlags(f *flag.FlagSet) {
	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")

	// Open FDs that are donated to the gofer.
	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. Follows the same order as --gofer-mount-confs. FDs are only donated if the mount is backed by lisafs.")
	f.Var(&g.mountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured. They must follow this order: root first, then mounts as defined in the spec.")
	f.IntVar(&g.devIoFD, "dev-io-fd", -1, "optional FD to connect /dev gofer server")
	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
	f.IntVar(&g.goferToHostRPCFD, "rpc-fd", -1, "gofer-to-host RPC file descriptor.")

	// Add synchronization FD flags.
	g.syncFDs.setFlags(f)

	// Profiling flags.
	g.profileFDs.SetFromFlags(f)
}
// Execute implements subcommands.Command. It prepares the Gofer's execution
// environment (spec loading, startup synchronization, root FS setup,
// capability reduction via re-exec, rlimits, chroot, seccomp filters) and
// then serves container files over the donated FDs via serve().
func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	// Set traceback level
	debug.SetTraceback(conf.Traceback)

	specFile := os.NewFile(uintptr(g.specFD), "spec file")
	defer specFile.Close()
	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	// Wait for nvproxy and userns setup done by other processes before
	// touching the filesystem (order matters: nvproxy first, then userns).
	g.syncFDs.syncNVProxy()
	g.syncFDs.syncUsernsForRootless()

	goferToHostRPCSock, err := unet.NewSocket(g.goferToHostRPCFD)
	if err != nil {
		util.Fatalf("creating rpc socket: %v", err)
	}
	goferToHostRPC := urpc.NewClient(goferToHostRPCSock)
	defer goferToHostRPC.Close()
	if g.setUpRoot {
		if err := g.setupRootFS(spec, conf, goferToHostRPC); err != nil {
			util.Fatalf("Error setting up root FS: %v", err)
		}
		if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			cleanupUnmounter := g.syncFDs.spawnProcUnmounter()
			defer cleanupUnmounter()
		}
	}
	// The RPC client is only needed during root FS setup; close it eagerly.
	// NOTE(review): Close is also deferred above, so it runs twice on this
	// path — presumably urpc.Client.Close tolerates that; confirm.
	goferToHostRPC.Close()
	if g.applyCaps {
		// Re-execute this binary with reduced capabilities. The re-executed
		// child skips apply-caps and setup-root, as that work is already done.
		overrides := g.syncFDs.flags()
		overrides["apply-caps"] = "false"
		overrides["setup-root"] = "false"
		args := prepareArgs(g.Name(), f, overrides)
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps))
		panic("unreachable")
	}

	// Start profiling. This will be a noop if no profiling arguments were passed.
	profileOpts := g.profileFDs.ToOpts()
	g.stopProfiling = profile.Start(profileOpts)

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// Find what path is going to be served by this gofer.
	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		root = "/root"
	}

	// Resolve mount points paths, then replace mounts from our spec and send the
	// mount list over to the sandbox, so they are both in sync.
	//
	// Note that all mount points have been mounted in the proper location in
	// setupRootFS().
	cleanMounts, err := g.resolveMounts(conf, spec.Mounts, root)
	if err != nil {
		util.Fatalf("Failure to resolve mounts: %v", err)
	}
	spec.Mounts = cleanMounts
	// Done from a goroutine: writing to mountsFD may block until the peer
	// reads — NOTE(review): presumably the sandbox reads this concurrently;
	// confirm against the caller.
	go func() {
		if err := g.writeMounts(cleanMounts); err != nil {
			panic(fmt.Sprintf("Failed to write mounts: %v", err))
		}
	}()

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	// fsgofer should run with a umask of 0, because we want to preserve file
	// modes exactly as sent by the sandbox, which will have applied its own umask.
	unix.Umask(0)

	procFDPath := procFDBindMount
	if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		procFDPath = "/proc/self/fd"
	}
	if err := fsgofer.OpenProcSelfFD(procFDPath); err != nil {
		util.Fatalf("failed to open /proc/self/fd: %v", err)
	}
	// procfs isn't needed anymore.
	g.syncFDs.unmountProcfs()

	if err := unix.Chroot(root); err != nil {
		util.Fatalf("failed to chroot to %q: %v", root, err)
	}
	if err := unix.Chdir("/"); err != nil {
		util.Fatalf("changing working dir: %v", err)
	}
	log.Infof("Process chroot'd to %q", root)

	// Initialize filters.
	opts := filter.Options{
		UDSOpenEnabled:   conf.GetHostUDS().AllowOpen(),
		UDSCreateEnabled: conf.GetHostUDS().AllowCreate(),
		ProfileEnabled:   len(profileOpts) > 0,
		DirectFS:         conf.DirectFS,
		CgoEnabled:       config.CgoEnabled,
	}
	if err := filter.Install(opts); err != nil {
		util.Fatalf("installing seccomp filters: %v", err)
	}

	return g.serve(spec, conf, root)
}
// newSocket wraps the donated FD in a unet.Socket, terminating the process
// if the FD cannot be used.
func newSocket(ioFD int) *unet.Socket {
	s, err := unet.NewSocket(ioFD)
	if err != nil {
		util.Fatalf("creating server on FD %d: %v", ioFD, err)
	}
	return s
}
// serve builds one lisafs connection per gofer-backed mount — root first
// (when lisafs-backed), then spec mounts in order, then optionally /dev —
// and blocks until all connections have been closed by the peer. Donated
// ioFDs are consumed in the same order as the lisafs-backed mounts appear.
func (g *Gofer) serve(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
	type connectionConfig struct {
		sock      *unet.Socket
		mountPath string
		readonly  bool
	}
	cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1)
	server := fsgofer.NewLisafsServer(fsgofer.Config{
		// These are global options. Ignore readonly configuration, that is set on
		// a per connection basis.
		HostUDS:            conf.GetHostUDS(),
		HostFifo:           conf.HostFifo,
		DonateMountPointFD: conf.DirectFS,
	})

	// ioFDs is a cursor over the donated FDs; the head is popped for each
	// lisafs-backed mount below.
	ioFDs := g.ioFDs
	rootfsConf := g.mountConfs[0]
	if rootfsConf.ShouldUseLisafs() {
		// Start with root mount, then add any other additional mount as needed.
		cfgs = append(cfgs, connectionConfig{
			sock:      newSocket(ioFDs[0]),
			mountPath: "/", // fsgofer process is always chroot()ed. So serve root.
			readonly:  spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs(),
		})
		log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, ioFDs[0], cfgs[0].readonly)
		ioFDs = ioFDs[1:]
	}

	mountIdx := 1 // first one is the root
	for _, m := range spec.Mounts {
		if !specutils.IsGoferMount(m) {
			continue
		}
		// mountConfs is indexed by gofer-mount position, including
		// non-lisafs mounts, so the index advances even when we skip below.
		mountConf := g.mountConfs[mountIdx]
		mountIdx++
		if !mountConf.ShouldUseLisafs() {
			continue
		}

		if !filepath.IsAbs(m.Destination) {
			util.Fatalf("mount destination must be absolute: %q", m.Destination)
		}
		if len(ioFDs) == 0 {
			util.Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m)
		}
		ioFD := ioFDs[0]
		ioFDs = ioFDs[1:]
		readonly := specutils.IsReadonlyMount(m.Options) || mountConf.ShouldUseOverlayfs()
		cfgs = append(cfgs, connectionConfig{
			sock:      newSocket(ioFD),
			mountPath: m.Destination,
			readonly:  readonly,
		})

		log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, ioFD, readonly)
	}
	// Every donated FD must have been matched to a mount.
	if len(ioFDs) > 0 {
		util.Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", len(cfgs), len(g.ioFDs))
	}

	if g.devIoFD >= 0 {
		// /dev is served read-write (readonly is left false).
		cfgs = append(cfgs, connectionConfig{
			sock:      newSocket(g.devIoFD),
			mountPath: "/dev",
		})
		log.Infof("Serving /dev mapped on FD %d (ro: false)", g.devIoFD)
	}

	for _, cfg := range cfgs {
		conn, err := server.CreateConnection(cfg.sock, cfg.mountPath, cfg.readonly)
		if err != nil {
			util.Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err)
		}
		server.StartConnection(conn)
	}
	server.Wait()
	server.Destroy()
	log.Infof("All lisafs servers exited.")
	if g.stopProfiling != nil {
		g.stopProfiling()
	}
	return subcommands.ExitSuccess
}
// writeMounts marshals the given mount list to JSON and writes it to the
// FD donated via --mounts-fd, closing that FD when done.
func (g *Gofer) writeMounts(mounts []specs.Mount) error {
	// Renamed from "bytes" to avoid shadowing the standard library package.
	data, err := json.Marshal(mounts)
	if err != nil {
		return err
	}
	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
	defer f.Close()

	// os.File.Write returns a non-nil error whenever fewer than len(data)
	// bytes are written, so the previous manual short-write loop was
	// redundant.
	_, err = f.Write(data)
	return err
}
// procFDBindMount is where /proc/self/fd gets bind-mounted for the Gofer.
// Red Hat distros don't allow creating bind-mounts in /proc/self directories
// (protected by SELinux rules), so a path outside /proc/self is used.
const procFDBindMount = "/proc/fs"
// setupRootFS prepares the filesystem tree the Gofer will later chroot into:
// it privatizes mount propagation, builds a tmpfs pivot tree under /proc/fs
// (when chroot is allowed), bind-mounts the container rootfs and its
// submounts, optionally remounts root read-only, and finally pivot_root()s
// into the new tree.
func (g *Gofer) setupRootFS(spec *specs.Spec, conf *config.Config, goferToHostRPC *urpc.Client) error {
	// Convert all shared mounts into slaves to be sure that nothing will be
	// propagated outside of our namespace.
	procPath := "/proc"
	if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil {
		util.Fatalf("error converting mounts: %v", err)
	}

	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
		// mount ./proc and ./root there, then move this mount to the root and after
		// setCapsAndCallSelf, runsc will chroot into /root.
		//
		// We need a directory to construct a new root and we know that
		// runsc can't start without /proc, so we can use it for this.
		flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
		if err := specutils.SafeMount("runsc-root", "/proc/fs", "tmpfs", flags, "", procPath); err != nil {
			util.Fatalf("error mounting tmpfs: %v", err)
		}
		if err := unix.Mount("", "/proc/fs", "", unix.MS_UNBINDABLE, ""); err != nil {
			util.Fatalf("error setting MS_UNBINDABLE")
		}
		// Prepare tree structure for pivot_root(2).
		if err := os.Mkdir("/proc/fs/proc", 0755); err != nil {
			util.Fatalf("error creating /proc/fs/proc: %v", err)
		}
		if err := os.Mkdir("/proc/fs/root", 0755); err != nil {
			util.Fatalf("error creating /proc/fs/root: %v", err)
		}
		if err := os.Mkdir("/proc/fs/etc", 0755); err != nil {
			util.Fatalf("error creating /proc/fs/etc: %v", err)
		}
		// This cannot use SafeMount because there's no available procfs. But we
		// know that /proc/fs is an empty tmpfs mount, so this is safe.
		if err := unix.Mount("/proc", "/proc/fs/proc", "", flags|unix.MS_RDONLY|unix.MS_BIND|unix.MS_REC, ""); err != nil {
			util.Fatalf("error mounting /proc/fs/proc: %v", err)
		}
		// self/fd is bind-mounted, so that the FD return by
		// OpenProcSelfFD() does not allow escapes with walking ".." .
		// NOTE(review): the target expands to "/proc/fs"+procFDBindMount,
		// i.e. procFDBindMount relative to the new root once pivot_root runs.
		if err := unix.Mount("/proc/fs/proc/self/fd", "/proc/fs/"+procFDBindMount,
			"", unix.MS_RDONLY|unix.MS_BIND|flags, ""); err != nil {
			util.Fatalf("error mounting proc/self/fd: %v", err)
		}
		// Copy the host timezone into the new root; failure falls back to UTC.
		if err := copyFile("/proc/fs/etc/localtime", "/etc/localtime"); err != nil {
			log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
		}
		root = "/proc/fs/root"
		procPath = "/proc/fs/proc"
	}

	rootfsConf := g.mountConfs[0]
	if rootfsConf.ShouldUseLisafs() {
		// Mount root path followed by submounts.
		if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil {
			return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
		}

		flags := uint32(unix.MS_SLAVE | unix.MS_REC)
		if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
			flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
		}
		if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil {
			return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
		}
	}

	// Replace the current spec, with the clean spec with symlinks resolved.
	if err := g.setupMounts(conf, spec.Mounts, root, procPath, goferToHostRPC); err != nil {
		util.Fatalf("error setting up FS: %v", err)
	}

	// Set up /dev directory is needed.
	// NOTE(review): setupDev's error return is ignored here — confirm whether
	// that is intentional best-effort behavior.
	if g.devIoFD >= 0 {
		g.setupDev(spec, conf, root, procPath)
	}

	// Create working directory if needed.
	if spec.Process.Cwd != "" {
		dst, err := resolveSymlinks(root, spec.Process.Cwd)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
		}
		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
		if err := os.MkdirAll(dst, 0755); err != nil {
			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
		}
	}

	// Check if root needs to be remounted as readonly.
	if rootfsConf.ShouldUseLisafs() && (spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs()) {
		// If root is a mount point but not read-only, we can change mount options
		// to make it read-only for extra safety.
		// unix.MS_NOSUID and unix.MS_NODEV are included here not only
		// for safety reasons but also because they can be locked and
		// any attempts to unset them will fail. See
		// mount_namespaces(7) for more details.
		log.Infof("Remounting root as readonly: %q", root)
		flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_NOSUID | unix.MS_NODEV)
		if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil {
			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
		}
	}

	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		if err := pivotRoot("/proc/fs"); err != nil {
			util.Fatalf("failed to change the root file system: %v", err)
		}
		if err := os.Chdir("/"); err != nil {
			util.Fatalf("failed to change working directory")
		}
	}
	return nil
}
// setupMounts bind mounts all mounts specified in the spec in their correct
// location inside root. It will resolve relative paths and symlinks. It also
// creates directories as needed.
//
// When the Gofer lacks permission to read a mount source, it asks the host
// (via goferToHostRPC) to open it and mounts through the received FD.
func (g *Gofer) setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string, goferToHostRPC *urpc.Client) error {
	mountIdx := 1 // First index is for rootfs.
	for _, m := range mounts {
		if !specutils.IsGoferMount(m) {
			continue
		}
		// The index advances for every gofer mount, even non-lisafs ones,
		// to stay aligned with g.mountConfs.
		mountConf := g.mountConfs[mountIdx]
		mountIdx++
		if !mountConf.ShouldUseLisafs() {
			continue
		}

		// Resolve the destination inside root so symlinks can't escape it.
		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}

		flags := specutils.OptionsToFlags(m.Options) | unix.MS_BIND
		if mountConf.ShouldUseOverlayfs() {
			// Force mount read-only if writes are not going to be sent to it.
			flags |= unix.MS_RDONLY
		}

		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
		src := m.Source
		var srcFile *os.File
		if err := unix.Access(src, unix.R_OK); err != nil {
			// The current process doesn't have enough permissions
			// to open the mount, so let's try to open it in the
			// parent user namespace.
			var res container.OpenMountResult
			if err := goferToHostRPC.Call("goferRPC.OpenMount", &m, &res); err != nil {
				return fmt.Errorf("opening %s: %w", m.Source, err)
			}
			srcFile = res.Files[0]
			// Mount through the received FD's /proc path.
			src = fmt.Sprintf("%s/self/fd/%d", procPath, srcFile.Fd())
		}
		err = specutils.SafeSetupAndMount(src, dst, m.Type, flags, procPath)
		// srcFile may be nil here; (*os.File).Close on a nil receiver
		// returns os.ErrInvalid rather than panicking, and the error is
		// deliberately ignored.
		srcFile.Close()
		if err != nil {
			return fmt.Errorf("mounting %+v: %v", m, err)
		}

		// Set propagation options that cannot be set together with other options.
		flags = specutils.PropOptionsToFlags(m.Options)
		if flags != 0 {
			if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil {
				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err)
			}
		}
	}
	return nil
}
// nvidiaDevPathReg matches numbered Nvidia device nodes such as
// /dev/nvidia0. Compiled once at package scope instead of on every call.
var nvidiaDevPathReg = regexp.MustCompile(`^/dev/nvidia(\d+)$`)

// shouldExposeNvidiaDevice returns true if path refers to an Nvidia device
// which should be exposed to the container.
//
// Precondition: nvproxy is enabled.
func shouldExposeNvidiaDevice(path string) bool {
	if !strings.HasPrefix(path, "/dev/nvidia") {
		return false
	}
	// Control and unified-memory devices are always exposed.
	if path == "/dev/nvidiactl" || path == "/dev/nvidia-uvm" {
		return true
	}
	// Otherwise only numbered per-GPU device nodes (/dev/nvidiaN) qualify.
	return nvidiaDevPathReg.MatchString(path)
}
// shouldExposeVFIODevice returns true if path refers to a VFIO device
// which should be exposed to the container (i.e. it lives under the
// directory containing vfio.VFIOPath).
func shouldExposeVFIODevice(path string) bool {
	return strings.HasPrefix(path, filepath.Dir(vfio.VFIOPath))
}
// shouldExposeTpuDevice returns true if path refers to a TPU device which
// should be exposed to the container.
//
// Precondition: tpuproxy is enabled.
func shouldExposeTpuDevice(path string) bool {
	// A valid TPU device path qualifies directly; otherwise fall back to
	// the VFIO check.
	if valid, _ := util.IsTPUDeviceValid(path); valid {
		return true
	}
	return shouldExposeVFIODevice(path)
}
// setupDev creates root/dev and bind-mounts into it any spec devices that
// should be exposed: Nvidia devices when nvproxy is enabled, TPU devices
// when tpuproxy is enabled.
func (g *Gofer) setupDev(spec *specs.Spec, conf *config.Config, root, procPath string) error {
	if err := os.MkdirAll(filepath.Join(root, "dev"), 0777); err != nil {
		return fmt.Errorf("creating dev directory: %v", err)
	}
	// Mount any devices specified in the spec.
	if spec.Linux == nil {
		return nil
	}
	nvEnabled := specutils.NVProxyEnabled(spec, conf)
	tpuEnabled := specutils.TPUProxyIsEnabled(spec, conf)
	for _, dev := range spec.Linux.Devices {
		expose := (nvEnabled && shouldExposeNvidiaDevice(dev.Path)) ||
			(tpuEnabled && shouldExposeTpuDevice(dev.Path))
		if !expose {
			continue
		}
		target := filepath.Join(root, dev.Path)
		log.Infof("Mounting device %q as bind mount at %q", dev.Path, target)
		if err := specutils.SafeSetupAndMount(dev.Path, target, "bind", unix.MS_BIND, procPath); err != nil {
			return fmt.Errorf("mounting %q: %v", dev.Path, err)
		}
	}
	return nil
}
// resolveMounts resolves relative paths and symlinks in mount destinations,
// returning a new mount list whose destinations are clean absolute paths
// relative to root. Non-gofer and non-lisafs mounts are passed through
// unchanged. Filesystem-specific gofer options are added via
// adjustMountOptions.
//
// Note: mount points must already be in place for resolution to work.
// Otherwise, it may follow symlinks to locations that would be overwritten
// with another mount point and return the wrong location. In short, make sure
// setupMounts() has been called before.
func (g *Gofer) resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
	mountIdx := 1 // First index is for rootfs.
	cleanMounts := make([]specs.Mount, 0, len(mounts))
	for _, m := range mounts {
		if !specutils.IsGoferMount(m) {
			cleanMounts = append(cleanMounts, m)
			continue
		}
		mountConf := g.mountConfs[mountIdx]
		mountIdx++
		if !mountConf.ShouldUseLisafs() {
			cleanMounts = append(cleanMounts, m)
			continue
		}
		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}
		relDst, err := filepath.Rel(root, dst)
		if err != nil {
			// dst was produced by resolveSymlinks under root, so Rel failing
			// indicates a bug rather than bad input.
			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
		}

		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
		if err != nil {
			return nil, err
		}

		// Copy so the caller's spec mounts are not mutated.
		cpy := m
		cpy.Destination = filepath.Join("/", relDst)
		cpy.Options = opts
		cleanMounts = append(cleanMounts, cpy)
	}
	return cleanMounts, nil
}
// resolveSymlinks walks 'rel' having 'root' as the root directory. If there are
// symlinks, they are evaluated relative to 'root' to ensure the end result is
// the same as if the process was running inside the container.
func resolveSymlinks(root, rel string) (string, error) {
	// 255 bounds the number of symlink hops to guard against symlink loops.
	return resolveSymlinksImpl(root, root, rel, 255)
}
// resolveSymlinksImpl walks 'rel' component by component starting from
// 'base', never allowing the walk to escape 'root'. Symlinks encountered
// along the way are followed (absolute targets restart from root), with
// followCount bounding total symlink hops. Missing path components are
// accepted and simply appended, so the result may name a path that does
// not exist yet. Returns the resolved absolute path.
func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
	if followCount == 0 {
		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
	}

	rel = filepath.Clean(rel)
	for _, name := range strings.Split(rel, string(filepath.Separator)) {
		if name == "" {
			continue
		}
		// Note that Join() resolves things like ".." and returns a clean path.
		path := filepath.Join(base, name)
		if !strings.HasPrefix(path, root) {
			// One cannot '..' their way out of root.
			base = root
			continue
		}
		fi, err := os.Lstat(path)
		if err != nil {
			if !os.IsNotExist(err) {
				return "", err
			}
			// Not found means there is no symlink to check. Just keep walking dirs.
			base = path
			continue
		}
		if fi.Mode()&os.ModeSymlink != 0 {
			link, err := os.Readlink(path)
			if err != nil {
				return "", err
			}
			if filepath.IsAbs(link) {
				// Absolute targets are interpreted relative to root, not host /.
				base = root
			}
			// Recurse to resolve the link target (which may itself contain
			// symlinks), charging one hop off followCount.
			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
			if err != nil {
				return "", err
			}
			continue
		}
		base = path
	}
	return base, nil
}
// adjustMountOptions adds filesystem-specific gofer mount options based on
// the filesystem type reported by statfs(2) for path. The input option
// slice is not modified; a copy is returned.
func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
	adjusted := append([]string(nil), opts...)

	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}
	switch statfs.Type {
	case unix.OVERLAYFS_SUPER_MAGIC:
		adjusted = append(adjusted, "overlayfs_stale_read")
	case unix.NFS_SUPER_MAGIC, unix.FUSE_SUPER_MAGIC:
		// The gofer client implements remote file handle sharing for performance.
		// However, remote filesystems like NFS and FUSE rely on close(2) syscall
		// for flushing file data to the server. Such handle sharing prevents the
		// application's close(2) syscall from being propagated to the host. Hence
		// disable file handle sharing, so remote files are flushed correctly.
		adjusted = append(adjusted, "disable_file_handle_sharing")
	}
	return adjusted, nil
}
// setFlags sets sync FD flags on the given FlagSet. Each flag defaults to
// -1, meaning "not set"; see goferSyncFDs for the wait/close protocol.
func (g *goferSyncFDs) setFlags(f *flag.FlagSet) {
	f.IntVar(&g.nvproxyFD, "sync-nvproxy-fd", -1, "file descriptor that the gofer waits on until nvproxy setup is done")
	f.IntVar(&g.usernsFD, "sync-userns-fd", -1, "file descriptor the gofer waits on until userns mappings are set up")
	f.IntVar(&g.procMountFD, "proc-mount-sync-fd", -1, "file descriptor that the gofer writes to when /proc isn't needed anymore and can be unmounted")
}
// flags returns the flags necessary to pass along the current sync FD values
// to a re-executed version of this process. Keys match the flag names
// registered in setFlags.
func (g *goferSyncFDs) flags() map[string]string {
	m := make(map[string]string, 3)
	m["sync-nvproxy-fd"] = fmt.Sprintf("%d", g.nvproxyFD)
	m["sync-userns-fd"] = fmt.Sprintf("%d", g.usernsFD)
	m["proc-mount-sync-fd"] = fmt.Sprintf("%d", g.procMountFD)
	return m
}
// waitForFD waits for the other end of a given FD to be closed.
// `fd` is closed unconditionally after that.
// This should only be called for actual FDs (i.e. `fd` >= 0).
func waitForFD(fd int, fdName string) error {
	log.Debugf("Waiting on %s %d...", fdName, fd)
	f := os.NewFile(uintptr(fd), fdName)
	defer f.Close()

	// A clean close of the peer yields a zero-byte read ending in io.EOF;
	// anything else (data or a different error) is a protocol violation.
	buf := make([]byte, 1)
	n, err := f.Read(buf)
	if n != 0 || err != io.EOF {
		return fmt.Errorf("failed to sync on %s: %v: %v", fdName, n, err)
	}
	log.Debugf("Synced on %s %d.", fdName, fd)
	return nil
}
// spawnProcUnmounter executes the /proc unmounter process.
// It returns a function to wait on the proc unmounter process, which
// should be called (via defer) in case of errors in order to clean up the
// unmounter process properly.
// When procfs is no longer needed, `unmountProcfs` should be called.
func (g *goferSyncFDs) spawnProcUnmounter() func() {
	if g.procMountFD != -1 {
		util.Fatalf("procMountFD is set")
	}
	// /proc is umounted from a forked process, because the
	// current one may re-execute itself without capabilities.
	cmd, w := execProcUmounter()
	// Clear FD_CLOEXEC. This process may be re-executed. procMountFD
	// should remain open.
	if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
		util.Fatalf("error clearing CLOEXEC: %v", errno)
	}
	g.procMountFD = int(w.Fd())
	return func() {
		// Reset the FD first so unmountProcfs becomes a no-op afterwards.
		g.procMountFD = -1
		w.Close()
		cmd.Wait()
	}
}
// unmountProcfs signals the proc unmounter process that procfs is no longer
// needed. It is a no-op when no proc-mount sync FD is set.
func (g *goferSyncFDs) unmountProcfs() {
	if fd := g.procMountFD; fd >= 0 {
		umountProc(fd)
		g.procMountFD = -1
	}
}
// syncUsernsForRootless waits on usernsFD to be closed and then sets
// UID/GID to 0. Note that this function calls runtime.LockOSThread().
// This function is a no-op if usernsFD is -1.
//
// Postcondition: All callers must re-exec themselves after this returns,
// unless usernsFD was -1.
func (g *goferSyncFDs) syncUsernsForRootless() {
	if g.usernsFD >= 0 {
		syncUsernsForRootless(g.usernsFD)
		g.usernsFD = -1
	}
}
// syncUsernsForRootless waits on fd to be closed and then sets
// UID/GID to 0 via raw setuid/setgid syscalls. Note that this function calls
// runtime.LockOSThread() and never unlocks it.
//
// Postcondition: All callers must re-exec themselves after this returns.
func syncUsernsForRootless(fd int) {
	if err := waitForFD(fd, "userns sync FD"); err != nil {
		util.Fatalf("failed to sync on userns FD: %v", err)
	}

	// SETUID changes UID on the current system thread, so we have
	// to re-execute current binary.
	runtime.LockOSThread()
	if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set UID: %v", errno)
	}
	if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set GID: %v", errno)
	}
}
// syncNVProxy waits on nvproxyFD to be closed.
// Used for synchronization during nvproxy setup which is done from the
// non-gofer process.
// This function is a no-op if nvProxySyncFD is -1.
func (g *goferSyncFDs) syncNVProxy() {
	if fd := g.nvproxyFD; fd >= 0 {
		if err := waitForFD(fd, "nvproxy sync FD"); err != nil {
			util.Fatalf("failed to sync on NVProxy FD: %v", err)
		}
		g.nvproxyFD = -1
	}
}