| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 // Copyright 2011 The Go Authors. All rights reserved. | 1 // Copyright 2011 The Go Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 // +build linux | 5 // +build linux |
| 6 | 6 |
| 7 package syscall | 7 package syscall |
| 8 | 8 |
| 9 import ( | 9 import ( |
| 10 "unsafe" | 10 "unsafe" |
| 11 ) | 11 ) |
| 12 | 12 |
| 13 // Holds Container to Host ID mappings used for User Namespaces in Linux. | |
|
bradfitz
2014/10/01 03:40:07
Style issue. See https://code.google.com/p/go-wiki
mrunalp
2014/10/01 21:48:42
Done.
| |
| 14 type IdMap struct { | |
| 15 ContainerId int // Container Id. | |
| 16 HostId int // Host Id. | |
| 17 Size int // Size. | |
| 18 } | |
| 19 | |
| 13 type SysProcAttr struct { | 20 type SysProcAttr struct { |
| 14 » Chroot string // Chroot. | 21 » Chroot string // Chroot. |
| 15 » Credential *Credential // Credential. | 22 » Credential *Credential // Credential. |
| 16 » Ptrace bool // Enable tracing. | 23 » Ptrace bool // Enable tracing. |
| 17 » Setsid bool // Create session. | 24 » Setsid bool // Create session. |
| 18 » Setpgid bool // Set process group ID to new pid (SYSV setpgrp) | 25 » Setpgid bool // Set process group ID to new pid (SYSV setpgrp) |
| 19 » Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) | 26 » Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) |
| 20 » Noctty bool // Detach fd 0 from controlling terminal | 27 » Noctty bool // Detach fd 0 from controlling terminal |
| 21 » Ctty int // Controlling TTY fd (Linux only) | 28 » Ctty int // Controlling TTY fd (Linux only) |
| 22 » Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) | 29 » Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) |
| 23 » Cloneflags uintptr // Flags for clone calls (Linux only) | 30 » Cloneflags uintptr // Flags for clone calls (Linux only) |
| 24 » Foreground bool // Set foreground process group to child's pid. (Implies Setpgid. Stdin should be a TTY) | 31 » Foreground bool // Set foreground process group to child's pid. (Implies Setpgid. Stdin should be a TTY) |
| 25 » Joinpgrp int // If != 0, child's process group ID. (Setpgid must not be set) | 32 » Joinpgrp int // If != 0, child's process group ID. (Setpgid must not be set) |
| 33 » UidMappings []IdMap // User ID mappings for user namespaces. | |
| 34 » GidMappings []IdMap // Group ID mappings for user namespaces. | |
| 26 } | 35 } |
| 27 | 36 |
| 28 // Implemented in runtime package. | 37 // Implemented in runtime package. |
| 29 func runtime_BeforeFork() | 38 func runtime_BeforeFork() |
| 30 func runtime_AfterFork() | 39 func runtime_AfterFork() |
| 31 | 40 |
| 32 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. | 41 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. |
| 33 // If a dup or exec fails, write the errno error to pipe. | 42 // If a dup or exec fails, write the errno error to pipe. |
| 34 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) | 43 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) |
| 35 // In the child, this function must not acquire any locks, because | 44 // In the child, this function must not acquire any locks, because |
| 36 // they might have been locked at the time of the fork. This means | 45 // they might have been locked at the time of the fork. This means |
| 37 // no rescheduling, no malloc calls, and no new stack segments. | 46 // no rescheduling, no malloc calls, and no new stack segments. |
| 38 // For the same reason compiler does not race instrument it. | 47 // For the same reason compiler does not race instrument it. |
| 39 // The calls to RawSyscall are okay because they are assembly | 48 // The calls to RawSyscall are okay because they are assembly |
| 40 // functions that do not grow the stack. | 49 // functions that do not grow the stack. |
| 41 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { | 50 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, child, parent int) (pid int, err Errno) { |
| 42 // Declare all variables at top in case any | 51 // Declare all variables at top in case any |
| 43 // declarations require heap allocation (e.g., err1). | 52 // declarations require heap allocation (e.g., err1). |
| 44 var ( | 53 var ( |
| 45 r1 uintptr | 54 r1 uintptr |
| 46 err1 Errno | 55 err1 Errno |
| 47 nextfd int | 56 nextfd int |
| 48 i int | 57 i int |
| 58 lzero uintptr | |
|
iant
2014/10/01 14:55:08
This can just be byte. Or you can just use err1.
mrunalp
2014/10/01 21:48:42
Done.
| |
| 49 ) | 59 ) |
| 50 | 60 |
| 51 // Guard against side effects of shuffling fds below. | 61 // Guard against side effects of shuffling fds below. |
| 52 // Make sure that nextfd is beyond any currently open files so | 62 // Make sure that nextfd is beyond any currently open files so |
| 53 // that we can't run the risk of overwriting any of them. | 63 // that we can't run the risk of overwriting any of them. |
| 54 fd := make([]int, len(attr.Files)) | 64 fd := make([]int, len(attr.Files)) |
| 55 nextfd = len(attr.Files) | 65 nextfd = len(attr.Files) |
| 56 for i, ufd := range attr.Files { | 66 for i, ufd := range attr.Files { |
| 57 if nextfd < int(ufd) { | 67 if nextfd < int(ufd) { |
| 58 nextfd = int(ufd) | 68 nextfd = int(ufd) |
| 59 } | 69 } |
| 60 fd[i] = int(ufd) | 70 fd[i] = int(ufd) |
| 61 } | 71 } |
| 62 nextfd++ | 72 nextfd++ |
| 63 | 73 |
| 64 // About to call fork. | 74 // About to call fork. |
| 65 // No more allocation or calls of non-assembly functions. | 75 // No more allocation or calls of non-assembly functions. |
| 66 runtime_BeforeFork() | 76 runtime_BeforeFork() |
| 67 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) | 77 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) |
| 68 if err1 != 0 { | 78 if err1 != 0 { |
| 69 runtime_AfterFork() | 79 runtime_AfterFork() |
| 70 return 0, err1 | 80 return 0, err1 |
| 71 } | 81 } |
| 72 | 82 |
| 73 if r1 != 0 { | 83 if r1 != 0 { |
| 74 // parent; return PID | 84 // parent; return PID |
| 75 runtime_AfterFork() | 85 runtime_AfterFork() |
| 76 pid = int(r1) | 86 pid = int(r1) |
| 77 | 87 |
| 88 if sys.UidMappings != nil || sys.GidMappings != nil { | |
| 89 if err := writeUidGidMappings(pid, sys); err != nil { | |
| 90 return 0, err.(Errno) | |
|
mrunalp
2014/10/01 01:37:12
I am not sure if this is the right way to handle t
iant
2014/10/01 14:55:08
Good point. If there are mappings, the child proc
| |
| 91 } | |
| 92 } | |
| 93 | |
| 78 if sys.Joinpgrp != 0 { | 94 if sys.Joinpgrp != 0 { |
| 79 // Place the child in the specified process group. | 95 // Place the child in the specified process group. |
| 80 RawSyscall(SYS_SETPGID, r1, uintptr(sys.Joinpgrp), 0) | 96 RawSyscall(SYS_SETPGID, r1, uintptr(sys.Joinpgrp), 0) |
| 81 } else if sys.Foreground || sys.Setpgid { | 97 } else if sys.Foreground || sys.Setpgid { |
| 82 // Place the child in a new process group. | 98 // Place the child in a new process group. |
| 83 RawSyscall(SYS_SETPGID, 0, 0, 0) | 99 RawSyscall(SYS_SETPGID, 0, 0, 0) |
| 84 | 100 |
| 85 if sys.Foreground { | 101 if sys.Foreground { |
| 86 // Set new foreground process group. | 102 // Set new foreground process group. |
| 87 RawSyscall(SYS_IOCTL, uintptr(Stdin), TIOCSPGRP, uintptr(unsafe.Pointer(&pid))) | 103 RawSyscall(SYS_IOCTL, uintptr(Stdin), TIOCSPGRP, uintptr(unsafe.Pointer(&pid))) |
| 88 } | 104 } |
| 89 } | 105 } |
| 90 | 106 |
| 91 return pid, 0 | 107 return pid, 0 |
| 92 } | 108 } |
| 93 | 109 |
| 94 // Fork succeeded, now in child. | 110 // Fork succeeded, now in child. |
| 111 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(child), 0, 0); err != 0 { | |
| 112 goto childerror | |
| 113 } | |
| 114 | |
| 115 if sys.UidMappings != nil || sys.GidMappings != nil { | |
| 116 _, _, err1 = RawSyscall(SYS_READ, uintptr(parent), uintptr(unsaf e.Pointer(&lzero)), uintptr(1)) | |
| 117 if err1 != 0 { | |
| 118 goto childerror | |
| 119 } | |
| 120 } | |
| 95 | 121 |
| 96 // Parent death signal | 122 // Parent death signal |
| 97 if sys.Pdeathsig != 0 { | 123 if sys.Pdeathsig != 0 { |
| 98 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sy s.Pdeathsig), 0, 0, 0, 0) | 124 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sy s.Pdeathsig), 0, 0, 0, 0) |
| 99 if err1 != 0 { | 125 if err1 != 0 { |
| 100 goto childerror | 126 goto childerror |
| 101 } | 127 } |
| 102 | 128 |
| 103 // Signal self if parent is already dead. This might cause a | 129 // Signal self if parent is already dead. This might cause a |
| 104 // duplicate signal in rare cases, but it won't matter when | 130 // duplicate signal in rare cases, but it won't matter when |
| (...skipping 83 matching lines...) | |
| 188 // Chdir | 214 // Chdir |
| 189 if dir != nil { | 215 if dir != nil { |
| 190 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) | 216 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) |
| 191 if err1 != 0 { | 217 if err1 != 0 { |
| 192 goto childerror | 218 goto childerror |
| 193 } | 219 } |
| 194 } | 220 } |
| 195 | 221 |
| 196 // Pass 1: look for fd[i] < i and move those up above len(fd) | 222 // Pass 1: look for fd[i] < i and move those up above len(fd) |
| 197 // so that pass 2 won't stomp on an fd it needs later. | 223 // so that pass 2 won't stomp on an fd it needs later. |
| 198 » if pipe < nextfd { | 224 » if child < nextfd { |
| 199 » » _, _, err1 = RawSyscall(SYS_DUP2, uintptr(pipe), uintptr(nextfd) , 0) | 225 » » _, _, err1 = RawSyscall(SYS_DUP2, uintptr(child), uintptr(nextfd ), 0) |
| 200 if err1 != 0 { | 226 if err1 != 0 { |
| 201 goto childerror | 227 goto childerror |
| 202 } | 228 } |
| 203 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) | 229 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) |
| 204 » » pipe = nextfd | 230 » » child = nextfd |
| 205 nextfd++ | 231 nextfd++ |
| 206 } | 232 } |
| 207 for i = 0; i < len(fd); i++ { | 233 for i = 0; i < len(fd); i++ { |
| 208 if fd[i] >= 0 && fd[i] < int(i) { | 234 if fd[i] >= 0 && fd[i] < int(i) { |
| 209 _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintpt r(nextfd), 0) | 235 _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintpt r(nextfd), 0) |
| 210 if err1 != 0 { | 236 if err1 != 0 { |
| 211 goto childerror | 237 goto childerror |
| 212 } | 238 } |
| 213 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEX EC) | 239 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEX EC) |
| 214 fd[i] = nextfd | 240 fd[i] = nextfd |
| 215 nextfd++ | 241 nextfd++ |
| 216 » » » if nextfd == pipe { // don't stomp on pipe | 242 » » » if nextfd == child { // don't stomp on pipe |
| 217 nextfd++ | 243 nextfd++ |
| 218 } | 244 } |
| 219 } | 245 } |
| 220 } | 246 } |
| 221 | 247 |
| 222 // Pass 2: dup fd[i] down onto i. | 248 // Pass 2: dup fd[i] down onto i. |
| 223 for i = 0; i < len(fd); i++ { | 249 for i = 0; i < len(fd); i++ { |
| 224 if fd[i] == -1 { | 250 if fd[i] == -1 { |
| 225 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) | 251 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) |
| 226 continue | 252 continue |
| (...skipping 40 matching lines...) | |
| 267 } | 293 } |
| 268 | 294 |
| 269 // Time to exec. | 295 // Time to exec. |
| 270 _, _, err1 = RawSyscall(SYS_EXECVE, | 296 _, _, err1 = RawSyscall(SYS_EXECVE, |
| 271 uintptr(unsafe.Pointer(argv0)), | 297 uintptr(unsafe.Pointer(argv0)), |
| 272 uintptr(unsafe.Pointer(&argv[0])), | 298 uintptr(unsafe.Pointer(&argv[0])), |
| 273 uintptr(unsafe.Pointer(&envv[0]))) | 299 uintptr(unsafe.Pointer(&envv[0]))) |
| 274 | 300 |
| 275 childerror: | 301 childerror: |
| 276 // send error code on pipe | 302 // send error code on pipe |
| 277 » RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), uns afe.Sizeof(err1)) | 303 » RawSyscall(SYS_WRITE, uintptr(child), uintptr(unsafe.Pointer(&err1)), un safe.Sizeof(err1)) |
| 278 for { | 304 for { |
| 279 RawSyscall(SYS_EXIT, 253, 0, 0) | 305 RawSyscall(SYS_EXIT, 253, 0, 0) |
| 280 } | 306 } |
| 281 } | 307 } |
| 282 | 308 |
| 283 // Try to open a pipe with O_CLOEXEC set on both file descriptors. | 309 // Try to open a pipe with O_CLOEXEC set on both file descriptors. |
| 284 func forkExecPipe(p []int) (err error) { | 310 func forkExecPipe(p []int) (err error) { |
| 285 err = Pipe2(p, O_CLOEXEC) | 311 err = Pipe2(p, O_CLOEXEC) |
| 286 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it | 312 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it |
| 287 // might not be implemented. | 313 // might not be implemented. |
| 288 if err == ENOSYS { | 314 if err == ENOSYS { |
| 289 if err = Pipe(p); err != nil { | 315 if err = Pipe(p); err != nil { |
| 290 return | 316 return |
| 291 } | 317 } |
| 292 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { | 318 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { |
| 293 return | 319 return |
| 294 } | 320 } |
| 295 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) | 321 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) |
| 296 } | 322 } |
| 297 return | 323 return |
| 298 } | 324 } |
| 325 | |
| 326 func writeUidGidMappings(pid int, sys *SysProcAttr) error { | |
|
iant
2014/10/01 14:55:08
Add a comment saying what this function does and n
mrunalp
2014/10/01 21:48:42
Done.
| |
| 327 if sys.UidMappings != nil { | |
| 328 uidf := "/proc/" + itoa(pid) + "/uid_map" | |
| 329 fd, err := Open(uidf, O_RDWR, 0) | |
| 330 if err != nil { | |
| 331 return err | |
| 332 } | |
| 333 | |
| 334 data := "" | |
| 335 for _, um := range sys.UidMappings { | |
|
iant
2014/10/01 14:55:08
Might as well use a function to write out the mapp
mrunalp
2014/10/01 21:48:42
Done.
| |
| 336 data = data + itoa(um.ContainerId) + " " + itoa(um.HostI d) + " " + itoa(um.Size) + "\n" | |
| 337 } | |
| 338 | |
| 339 bytes, err := ByteSliceFromString(data) | |
| 340 if err != nil { | |
| 341 return err | |
|
iant
2014/10/01 14:55:07
Close(fd).
mrunalp
2014/10/01 21:48:42
Done.
| |
| 342 } | |
| 343 | |
| 344 if _, err := Write(fd, bytes); err != nil { | |
| 345 Close(fd) | |
|
mrunalp
2014/10/01 01:37:12
Not yet handles the errors from Close. I see simil
iant
2014/10/01 14:55:08
The Close calls in exec_unix.go are for descriptor
mrunalp
2014/10/01 21:48:42
Acknowledged.
mrunalp
2014/10/01 21:48:42
Done.
| |
| 346 return err | |
| 347 } | |
| 348 | |
| 349 Close(fd) | |
| 350 } | |
| 351 | |
| 352 if sys.GidMappings != nil { | |
| 353 gidf := "/proc/" + itoa(pid) + "/gid_map" | |
| 354 fd, err := Open(gidf, O_RDWR, 0) | |
| 355 if err != nil { | |
| 356 return err | |
| 357 } | |
| 358 | |
| 359 data := "" | |
| 360 for _, gm := range sys.GidMappings { | |
| 361 data = data + itoa(gm.ContainerId) + " " + itoa(gm.HostI d) + " " + itoa(gm.Size) + "\n" | |
| 362 } | |
| 363 | |
| 364 bytes, err := ByteSliceFromString(data) | |
| 365 if err != nil { | |
| 366 return err | |
| 367 } | |
| 368 | |
| 369 if _, err := Write(fd, bytes); err != nil { | |
| 370 Close(fd) | |
| 371 return err | |
| 372 } | |
| 373 | |
| 374 Close(fd) | |
| 375 } | |
| 376 | |
| 377 return nil | |
| 378 } | |
| OLD | NEW |