Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(256)

Side by Side Diff: src/syscall/exec_linux.go

Issue 126190043: syscall: Adds support for User Namespaces in Linux by a...
Patch Set: diff -r 1c674c3eefc7c3c2c73378bd2482a08b933839ee https://code.google.com/p/go Created 11 years, 7 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/syscall/exec_unix.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 The Go Authors. All rights reserved. 1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // +build linux 5 // +build linux
6 6
7 package syscall 7 package syscall
8 8
9 import ( 9 import (
10 "unsafe" 10 "unsafe"
11 ) 11 )
12 12
13 // Holds Container to Host ID mappings used for User Namespaces in Linux.
bradfitz 2014/10/01 03:40:07 Style issue. See https://code.google.com/p/go-wiki
mrunalp 2014/10/01 21:48:42 Done.
14 type IdMap struct {
15 ContainerId int // Container Id.
16 HostId int // Host Id.
17 Size int // Size.
18 }
19
13 type SysProcAttr struct { 20 type SysProcAttr struct {
14 » Chroot string // Chroot. 21 » Chroot string // Chroot.
15 » Credential *Credential // Credential. 22 » Credential *Credential // Credential.
16 » Ptrace bool // Enable tracing. 23 » Ptrace bool // Enable tracing.
17 » Setsid bool // Create session. 24 » Setsid bool // Create session.
18 » Setpgid bool // Set process group ID to new pid (SYSV setpgrp) 25 » Setpgid bool // Set process group ID to new pid (SYSV setpgrp)
19 » Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) 26 » Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set)
20 » Noctty bool // Detach fd 0 from controlling terminal 27 » Noctty bool // Detach fd 0 from controlling terminal
21 » Ctty int // Controlling TTY fd (Linux only) 28 » Ctty int // Controlling TTY fd (Linux only)
22 » Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) 29 » Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only)
23 » Cloneflags uintptr // Flags for clone calls (Linux only) 30 » Cloneflags uintptr // Flags for clone calls (Linux only)
24 » Foreground bool // Set foreground process group to child's pid. (Implies Setpgid. Stdin should be a TTY) 31 » Foreground bool // Set foreground process group to child's pid. (Implies Setpgid. Stdin should be a TTY)
25 » Joinpgrp int // If != 0, child's process group ID. (Setpgid must not be set) 32 » Joinpgrp int // If != 0, child's process group ID. (Setpgid must not be set)
33 » UidMappings []IdMap // User ID mappings for user namespaces.
34 » GidMappings []IdMap // Group ID mappings for user namespaces.
26 } 35 }
27 36
28 // Implemented in runtime package. 37 // Implemented in runtime package.
29 func runtime_BeforeFork() 38 func runtime_BeforeFork()
30 func runtime_AfterFork() 39 func runtime_AfterFork()
31 40
32 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. 41 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
33 // If a dup or exec fails, write the errno error to pipe. 42 // If a dup or exec fails, write the errno error to pipe.
34 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) 43 // (Pipe is close-on-exec so if exec succeeds, it will be closed.)
35 // In the child, this function must not acquire any locks, because 44 // In the child, this function must not acquire any locks, because
36 // they might have been locked at the time of the fork. This means 45 // they might have been locked at the time of the fork. This means
37 // no rescheduling, no malloc calls, and no new stack segments. 46 // no rescheduling, no malloc calls, and no new stack segments.
38 // For the same reason compiler does not race instrument it. 47 // For the same reason compiler does not race instrument it.
39 // The calls to RawSyscall are okay because they are assembly 48 // The calls to RawSyscall are okay because they are assembly
40 // functions that do not grow the stack. 49 // functions that do not grow the stack.
41 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { 50 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, child, parent int) (pid int, err Errno) {
42 // Declare all variables at top in case any 51 // Declare all variables at top in case any
43 // declarations require heap allocation (e.g., err1). 52 // declarations require heap allocation (e.g., err1).
44 var ( 53 var (
45 r1 uintptr 54 r1 uintptr
46 err1 Errno 55 err1 Errno
47 nextfd int 56 nextfd int
48 i int 57 i int
58 lzero uintptr
iant 2014/10/01 14:55:08 This can just be byte. Or you can just use err1.
mrunalp 2014/10/01 21:48:42 Done.
49 ) 59 )
50 60
51 // Guard against side effects of shuffling fds below. 61 // Guard against side effects of shuffling fds below.
52 // Make sure that nextfd is beyond any currently open files so 62 // Make sure that nextfd is beyond any currently open files so
53 // that we can't run the risk of overwriting any of them. 63 // that we can't run the risk of overwriting any of them.
54 fd := make([]int, len(attr.Files)) 64 fd := make([]int, len(attr.Files))
55 nextfd = len(attr.Files) 65 nextfd = len(attr.Files)
56 for i, ufd := range attr.Files { 66 for i, ufd := range attr.Files {
57 if nextfd < int(ufd) { 67 if nextfd < int(ufd) {
58 nextfd = int(ufd) 68 nextfd = int(ufd)
59 } 69 }
60 fd[i] = int(ufd) 70 fd[i] = int(ufd)
61 } 71 }
62 nextfd++ 72 nextfd++
63 73
64 // About to call fork. 74 // About to call fork.
65 // No more allocation or calls of non-assembly functions. 75 // No more allocation or calls of non-assembly functions.
66 runtime_BeforeFork() 76 runtime_BeforeFork()
67 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) 77 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
68 if err1 != 0 { 78 if err1 != 0 {
69 runtime_AfterFork() 79 runtime_AfterFork()
70 return 0, err1 80 return 0, err1
71 } 81 }
72 82
73 if r1 != 0 { 83 if r1 != 0 {
74 // parent; return PID 84 // parent; return PID
75 runtime_AfterFork() 85 runtime_AfterFork()
76 pid = int(r1) 86 pid = int(r1)
77 87
88 if sys.UidMappings != nil || sys.GidMappings != nil {
89 if err := writeUidGidMappings(pid, sys); err != nil {
90 return 0, err.(Errno)
mrunalp 2014/10/01 01:37:12 I am not sure if this is the right way to handle t
iant 2014/10/01 14:55:08 Good point. If there are mappings, the child proc
91 }
92 }
93
78 if sys.Joinpgrp != 0 { 94 if sys.Joinpgrp != 0 {
79 // Place the child in the specified process group. 95 // Place the child in the specified process group.
80 RawSyscall(SYS_SETPGID, r1, uintptr(sys.Joinpgrp), 0) 96 RawSyscall(SYS_SETPGID, r1, uintptr(sys.Joinpgrp), 0)
81 } else if sys.Foreground || sys.Setpgid { 97 } else if sys.Foreground || sys.Setpgid {
82 // Place the child in a new process group. 98 // Place the child in a new process group.
83 RawSyscall(SYS_SETPGID, 0, 0, 0) 99 RawSyscall(SYS_SETPGID, 0, 0, 0)
84 100
85 if sys.Foreground { 101 if sys.Foreground {
86 // Set new foreground process group. 102 // Set new foreground process group.
87 RawSyscall(SYS_IOCTL, uintptr(Stdin), TIOCSPGRP, uintptr(unsafe.Pointer(&pid))) 103 RawSyscall(SYS_IOCTL, uintptr(Stdin), TIOCSPGRP, uintptr(unsafe.Pointer(&pid)))
88 } 104 }
89 } 105 }
90 106
91 return pid, 0 107 return pid, 0
92 } 108 }
93 109
94 // Fork succeeded, now in child. 110 // Fork succeeded, now in child.
111 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(child), 0, 0); err != 0 {
112 goto childerror
113 }
114
115 if sys.UidMappings != nil || sys.GidMappings != nil {
116 _, _, err1 = RawSyscall(SYS_READ, uintptr(parent), uintptr(unsafe.Pointer(&lzero)), uintptr(1))
117 if err1 != 0 {
118 goto childerror
119 }
120 }
95 121
96 // Parent death signal 122 // Parent death signal
97 if sys.Pdeathsig != 0 { 123 if sys.Pdeathsig != 0 {
98 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) 124 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
99 if err1 != 0 { 125 if err1 != 0 {
100 goto childerror 126 goto childerror
101 } 127 }
102 128
103 // Signal self if parent is already dead. This might cause a 129 // Signal self if parent is already dead. This might cause a
104 // duplicate signal in rare cases, but it won't matter when 130 // duplicate signal in rare cases, but it won't matter when
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after
188 // Chdir 214 // Chdir
189 if dir != nil { 215 if dir != nil {
190 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) 216 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
191 if err1 != 0 { 217 if err1 != 0 {
192 goto childerror 218 goto childerror
193 } 219 }
194 } 220 }
195 221
196 // Pass 1: look for fd[i] < i and move those up above len(fd) 222 // Pass 1: look for fd[i] < i and move those up above len(fd)
197 // so that pass 2 won't stomp on an fd it needs later. 223 // so that pass 2 won't stomp on an fd it needs later.
198 » if pipe < nextfd { 224 » if child < nextfd {
199 » » _, _, err1 = RawSyscall(SYS_DUP2, uintptr(pipe), uintptr(nextfd), 0) 225 » » _, _, err1 = RawSyscall(SYS_DUP2, uintptr(child), uintptr(nextfd), 0)
200 if err1 != 0 { 226 if err1 != 0 {
201 goto childerror 227 goto childerror
202 } 228 }
203 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 229 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
204 » » pipe = nextfd 230 » » child = nextfd
205 nextfd++ 231 nextfd++
206 } 232 }
207 for i = 0; i < len(fd); i++ { 233 for i = 0; i < len(fd); i++ {
208 if fd[i] >= 0 && fd[i] < int(i) { 234 if fd[i] >= 0 && fd[i] < int(i) {
209 _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintptr(nextfd), 0) 235 _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintptr(nextfd), 0)
210 if err1 != 0 { 236 if err1 != 0 {
211 goto childerror 237 goto childerror
212 } 238 }
213 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 239 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
214 fd[i] = nextfd 240 fd[i] = nextfd
215 nextfd++ 241 nextfd++
216 » » » if nextfd == pipe { // don't stomp on pipe 242 » » » if nextfd == child { // don't stomp on pipe
217 nextfd++ 243 nextfd++
218 } 244 }
219 } 245 }
220 } 246 }
221 247
222 // Pass 2: dup fd[i] down onto i. 248 // Pass 2: dup fd[i] down onto i.
223 for i = 0; i < len(fd); i++ { 249 for i = 0; i < len(fd); i++ {
224 if fd[i] == -1 { 250 if fd[i] == -1 {
225 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) 251 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
226 continue 252 continue
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
267 } 293 }
268 294
269 // Time to exec. 295 // Time to exec.
270 _, _, err1 = RawSyscall(SYS_EXECVE, 296 _, _, err1 = RawSyscall(SYS_EXECVE,
271 uintptr(unsafe.Pointer(argv0)), 297 uintptr(unsafe.Pointer(argv0)),
272 uintptr(unsafe.Pointer(&argv[0])), 298 uintptr(unsafe.Pointer(&argv[0])),
273 uintptr(unsafe.Pointer(&envv[0]))) 299 uintptr(unsafe.Pointer(&envv[0])))
274 300
275 childerror: 301 childerror:
276 // send error code on pipe 302 // send error code on pipe
277 » RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1)) 303 » RawSyscall(SYS_WRITE, uintptr(child), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
278 for { 304 for {
279 RawSyscall(SYS_EXIT, 253, 0, 0) 305 RawSyscall(SYS_EXIT, 253, 0, 0)
280 } 306 }
281 } 307 }
282 308
283 // Try to open a pipe with O_CLOEXEC set on both file descriptors. 309 // Try to open a pipe with O_CLOEXEC set on both file descriptors.
284 func forkExecPipe(p []int) (err error) { 310 func forkExecPipe(p []int) (err error) {
285 err = Pipe2(p, O_CLOEXEC) 311 err = Pipe2(p, O_CLOEXEC)
286 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it 312 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it
287 // might not be implemented. 313 // might not be implemented.
288 if err == ENOSYS { 314 if err == ENOSYS {
289 if err = Pipe(p); err != nil { 315 if err = Pipe(p); err != nil {
290 return 316 return
291 } 317 }
292 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { 318 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
293 return 319 return
294 } 320 }
295 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) 321 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
296 } 322 }
297 return 323 return
298 } 324 }
325
326 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
iant 2014/10/01 14:55:08 Add a comment saying what this function does and n
mrunalp 2014/10/01 21:48:42 Done.
327 if sys.UidMappings != nil {
328 uidf := "/proc/" + itoa(pid) + "/uid_map"
329 fd, err := Open(uidf, O_RDWR, 0)
330 if err != nil {
331 return err
332 }
333
334 data := ""
335 for _, um := range sys.UidMappings {
iant 2014/10/01 14:55:08 Might as well use a function to write out the mapp
mrunalp 2014/10/01 21:48:42 Done.
336 data = data + itoa(um.ContainerId) + " " + itoa(um.HostId) + " " + itoa(um.Size) + "\n"
337 }
338
339 bytes, err := ByteSliceFromString(data)
340 if err != nil {
341 return err
iant 2014/10/01 14:55:07 Close(fd).
mrunalp 2014/10/01 21:48:42 Done.
342 }
343
344 if _, err := Write(fd, bytes); err != nil {
345 Close(fd)
mrunalp 2014/10/01 01:37:12 Not yet handles the errors from Close. I see simil
iant 2014/10/01 14:55:08 The Close calls in exec_unix.go are for descriptor
mrunalp 2014/10/01 21:48:42 Acknowledged.
mrunalp 2014/10/01 21:48:42 Done.
346 return err
347 }
348
349 Close(fd)
350 }
351
352 if sys.GidMappings != nil {
353 gidf := "/proc/" + itoa(pid) + "/gid_map"
354 fd, err := Open(gidf, O_RDWR, 0)
355 if err != nil {
356 return err
357 }
358
359 data := ""
360 for _, gm := range sys.GidMappings {
361 data = data + itoa(gm.ContainerId) + " " + itoa(gm.HostId) + " " + itoa(gm.Size) + "\n"
362 }
363
364 bytes, err := ByteSliceFromString(data)
365 if err != nil {
366 return err
367 }
368
369 if _, err := Write(fd, bytes); err != nil {
370 Close(fd)
371 return err
372 }
373
374 Close(fd)
375 }
376
377 return nil
378 }
OLDNEW
« no previous file with comments | « no previous file | src/syscall/exec_unix.go » ('j') | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b