| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 // Copyright 2011 The Go Authors. All rights reserved. | 1 // Copyright 2011 The Go Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 // +build linux | 5 // +build linux |
| 6 | 6 |
| 7 package syscall | 7 package syscall |
| 8 | 8 |
| 9 import ( | 9 import ( |
| 10 "unsafe" | 10 "unsafe" |
| 11 ) | 11 ) |
| 12 | 12 |
| 13 // SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux. | |
| 14 // See user_namespaces(7). | |
| 15 type SysProcIDMap struct { | |
| 16 ContainerID int // Container ID. | |
| 17 HostID int // Host ID. | |
| 18 Size int // Size. | |
| 19 } | |
| 20 | |
| 13 type SysProcAttr struct { | 21 type SysProcAttr struct { |
| 14 » Chroot string // Chroot. | 22 » Chroot string // Chroot. |
| 15 » Credential *Credential // Credential. | 23 » Credential *Credential // Credential. |
| 16 » Ptrace bool // Enable tracing. | 24 » Ptrace bool // Enable tracing. |
| 17 » Setsid bool // Create session. | 25 » Setsid bool // Create session. |
| 18 » Setpgid bool // Set process group ID to new pid (SYSV setpgrp) | 26 » Setpgid bool // Set process group ID to new pid (SYSV setpgrp) |
| 19 » Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) | 27 » Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) |
| 20 » Noctty bool // Detach fd 0 from controlling terminal | 28 » Noctty bool // Detach fd 0 from controlling terminal |
| 21 » Ctty int // Controlling TTY fd (Linux only) | 29 » Ctty int // Controlling TTY fd (Linux only) |
| 22 » Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) | 30 » Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) |
| 23 » Cloneflags uintptr // Flags for clone calls (Linux only) | 31 » Cloneflags uintptr // Flags for clone calls (Linux only) |
| 24 » Foreground bool // Set foreground process group to child's pid. (Implies Setpgid. Stdin should be a TTY) | 32 » Foreground bool // Set foreground process group to child's pid. (Implies Setpgid. Stdin should be a TTY) |
| 25 » Joinpgrp int // If != 0, child's process group ID. (Setpgid must not be set) | 33 » Joinpgrp int // If != 0, child's process group ID. (Setpgid must not be set) |
| 34 » UidMappings []SysProcIDMap // User ID mappings for user namespaces. | |
| 35 » GidMappings []SysProcIDMap // Group ID mappings for user namespaces. | |
| 26 } | 36 } |
| 27 | 37 |
| 28 // Implemented in runtime package. | 38 // Implemented in runtime package. |
| 29 func runtime_BeforeFork() | 39 func runtime_BeforeFork() |
| 30 func runtime_AfterFork() | 40 func runtime_AfterFork() |
| 31 | 41 |
| 32 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. | 42 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. |
| 33 // If a dup or exec fails, write the errno error to pipe. | 43 // If a dup or exec fails, write the errno error to pipe. |
| 34 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) | 44 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) |
| 35 // In the child, this function must not acquire any locks, because | 45 // In the child, this function must not acquire any locks, because |
| 36 // they might have been locked at the time of the fork. This means | 46 // they might have been locked at the time of the fork. This means |
| 37 // no rescheduling, no malloc calls, and no new stack segments. | 47 // no rescheduling, no malloc calls, and no new stack segments. |
| 38 // For the same reason compiler does not race instrument it. | 48 // For the same reason compiler does not race instrument it. |
| 39 // The calls to RawSyscall are okay because they are assembly | 49 // The calls to RawSyscall are okay because they are assembly |
| 40 // functions that do not grow the stack. | 50 // functions that do not grow the stack. |
| 41 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { | 51 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { |
| 42 // Declare all variables at top in case any | 52 // Declare all variables at top in case any |
| 43 // declarations require heap allocation (e.g., err1). | 53 // declarations require heap allocation (e.g., err1). |
| 44 var ( | 54 var ( |
| 45 r1 uintptr | 55 r1 uintptr |
| 46 err1 Errno | 56 err1 Errno |
| 57 err2 Errno | |
| 47 nextfd int | 58 nextfd int |
| 48 i int | 59 i int |
| 60 p [2]int | |
| 49 ) | 61 ) |
| 50 | 62 |
| 51 // Guard against side effects of shuffling fds below. | 63 // Guard against side effects of shuffling fds below. |
| 52 // Make sure that nextfd is beyond any currently open files so | 64 // Make sure that nextfd is beyond any currently open files so |
| 53 // that we can't run the risk of overwriting any of them. | 65 // that we can't run the risk of overwriting any of them. |
| 54 fd := make([]int, len(attr.Files)) | 66 fd := make([]int, len(attr.Files)) |
| 55 nextfd = len(attr.Files) | 67 nextfd = len(attr.Files) |
| 56 for i, ufd := range attr.Files { | 68 for i, ufd := range attr.Files { |
| 57 if nextfd < int(ufd) { | 69 if nextfd < int(ufd) { |
| 58 nextfd = int(ufd) | 70 nextfd = int(ufd) |
| 59 } | 71 } |
| 60 fd[i] = int(ufd) | 72 fd[i] = int(ufd) |
| 61 } | 73 } |
| 62 nextfd++ | 74 nextfd++ |
| 63 | 75 |
| 76 // Allocate another pipe for parent to child communication for | |
| 77 // synchronizing writing of User ID/Group ID mappings. | |
| 78 if sys.UidMappings != nil || sys.GidMappings != nil { | |
| 79 if err := forkExecPipe(p[:]); err != nil { | |
| 80 return 0, err.(Errno) | |
| 81 } | |
| 82 } | |
| 83 | |
| 64 // About to call fork. | 84 // About to call fork. |
| 65 // No more allocation or calls of non-assembly functions. | 85 // No more allocation or calls of non-assembly functions. |
| 66 runtime_BeforeFork() | 86 runtime_BeforeFork() |
| 67 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) | 87 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) |
| 68 if err1 != 0 { | 88 if err1 != 0 { |
| 69 runtime_AfterFork() | 89 runtime_AfterFork() |
| 70 return 0, err1 | 90 return 0, err1 |
| 71 } | 91 } |
| 72 | 92 |
| 73 if r1 != 0 { | 93 if r1 != 0 { |
| 74 // parent; return PID | 94 // parent; return PID |
| 75 runtime_AfterFork() | 95 runtime_AfterFork() |
| 76 pid = int(r1) | 96 pid = int(r1) |
| 77 | 97 |
| 98 if sys.UidMappings != nil || sys.GidMappings != nil { | |
| 99 Close(p[0]) | |
| 100 err := writeUidGidMappings(pid, sys) | |
| 101 if err != nil { | |
| 102 err2 = err.(Errno) | |
| 103 } | |
| 104 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) | |
| 105 Close(p[1]) | |
| 106 } | |
| 107 | |
| 78 if sys.Joinpgrp != 0 { | 108 if sys.Joinpgrp != 0 { |
| 79 // Place the child in the specified process group. | 109 // Place the child in the specified process group. |
| 80 RawSyscall(SYS_SETPGID, r1, uintptr(sys.Joinpgrp), 0) | 110 RawSyscall(SYS_SETPGID, r1, uintptr(sys.Joinpgrp), 0) |
| 81 } else if sys.Foreground || sys.Setpgid { | 111 } else if sys.Foreground || sys.Setpgid { |
| 82 // Place the child in a new process group. | 112 // Place the child in a new process group. |
| 83 RawSyscall(SYS_SETPGID, 0, 0, 0) | 113 RawSyscall(SYS_SETPGID, 0, 0, 0) |
| 84 | 114 |
| 85 if sys.Foreground { | 115 if sys.Foreground { |
| 86 // Set new foreground process group. | 116 // Set new foreground process group. |
| 87 RawSyscall(SYS_IOCTL, uintptr(Stdin), TIOCSPGRP, uintptr(unsafe.Pointer(&pid))) | 117 RawSyscall(SYS_IOCTL, uintptr(Stdin), TIOCSPGRP, uintptr(unsafe.Pointer(&pid))) |
| 88 } | 118 } |
| 89 } | 119 } |
| 90 | 120 |
| 91 return pid, 0 | 121 return pid, 0 |
| 92 } | 122 } |
| 93 | 123 |
| 94 // Fork succeeded, now in child. | 124 // Fork succeeded, now in child. |
| 95 | 125 |
| 126 // Wait for User ID/Group ID mappings to be written. | |
| 127 if sys.UidMappings != nil || sys.GidMappings != nil { | |
| 128 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 { | |
| 129 goto childerror | |
| 130 } | |
| 131 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) | |
| 132 if err1 != 0 || r1 != unsafe.Sizeof(uintptr(0)) { | |
|
mrunalp
2014/10/02 00:07:04
Should we set err1 to something specific when r1 d
iant
2014/10/02 00:33:34
Yes. I guess EINVAL unless you can think of somet
mrunalp
2014/10/02 05:14:25
Sounds good to me.
mrunalp
2014/10/02 05:14:25
Done.
| |
| 133 goto childerror | |
| 134 } | |
| 135 if err2 != 0 { | |
| 136 err1 = err2 | |
| 137 goto childerror | |
| 138 } | |
| 139 } | |
| 140 | |
| 96 // Parent death signal | 141 // Parent death signal |
| 97 if sys.Pdeathsig != 0 { | 142 if sys.Pdeathsig != 0 { |
| 98 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) | 143 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) |
| 99 if err1 != 0 { | 144 if err1 != 0 { |
| 100 goto childerror | 145 goto childerror |
| 101 } | 146 } |
| 102 | 147 |
| 103 // Signal self if parent is already dead. This might cause a | 148 // Signal self if parent is already dead. This might cause a |
| 104 // duplicate signal in rare cases, but it won't matter when | 149 // duplicate signal in rare cases, but it won't matter when |
| 105 // using SIGKILL. | 150 // using SIGKILL. |
| (...skipping 183 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 289 if err = Pipe(p); err != nil { | 334 if err = Pipe(p); err != nil { |
| 290 return | 335 return |
| 291 } | 336 } |
| 292 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { | 337 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { |
| 293 return | 338 return |
| 294 } | 339 } |
| 295 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) | 340 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) |
| 296 } | 341 } |
| 297 return | 342 return |
| 298 } | 343 } |
| 344 | |
| 345 // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path. | |
| 346 func writeIDMappings(path string, idMap []SysProcIDMap) error { | |
| 347 fd, err := Open(path, O_RDWR, 0) | |
| 348 if err != nil { | |
| 349 return err | |
| 350 } | |
| 351 | |
| 352 data := "" | |
| 353 for _, im := range idMap { | |
| 354 data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n" | |
| 355 } | |
| 356 | |
| 357 bytes, err := ByteSliceFromString(data) | |
| 358 if err != nil { | |
| 359 Close(fd) | |
| 360 return err | |
| 361 } | |
| 362 | |
| 363 if _, err := Write(fd, bytes); err != nil { | |
| 364 Close(fd) | |
| 365 return err | |
| 366 } | |
| 367 | |
| 368 if err := Close(fd); err != nil { | |
| 369 return err | |
| 370 } | |
| 371 | |
| 372 return nil | |
| 373 } | |
| 374 | |
| 375 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces | |
| 376 // for a process and it is called from the parent process. | |
| 377 func writeUidGidMappings(pid int, sys *SysProcAttr) error { | |
| 378 if sys.UidMappings != nil { | |
| 379 uidf := "/proc/" + itoa(pid) + "/uid_map" | |
| 380 if err := writeIDMappings(uidf, sys.UidMappings); err != nil { | |
| 381 return err | |
| 382 } | |
| 383 } | |
| 384 | |
| 385 if sys.GidMappings != nil { | |
| 386 gidf := "/proc/" + itoa(pid) + "/gid_map" | |
| 387 if err := writeIDMappings(gidf, sys.GidMappings); err != nil { | |
| 388 return err | |
| 389 } | |
| 390 } | |
| 391 | |
| 392 return nil | |
| 393 } | |
| OLD | NEW |