1 /* Guts of both `select' and `poll' for Hurd.
2    Copyright (C) 1991-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <sys/time.h>
20 #include <sys/types.h>
21 #include <sys/poll.h>
22 #include <hurd.h>
23 #include <hurd/fd.h>
24 #include <hurd/io_request.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <assert.h>
28 #include <stdint.h>
29 #include <limits.h>
30 #include <time.h>
31 #include <sysdep-cancel.h>
32 
/* All user select types.  */
#define SELECT_ALL (SELECT_READ | SELECT_WRITE | SELECT_URG)

/* Used to record that a particular select rpc returned.  Must be distinct
   from SELECT_ALL (which better not have the high bit set).  The value is
   SELECT_ALL shifted up by one bit with every SELECT_ALL bit masked out,
   so it never overlaps a user select type.  */
#define SELECT_RETURNED ((SELECT_ALL << 1) & ~SELECT_ALL)
/* Marks a descriptor entry as failed; defined as SELECT_RETURNED shifted
   up by one more bit.  */
#define SELECT_ERROR (SELECT_RETURNED << 1)
40 
41 /* Check the first NFDS descriptors either in POLLFDS (if nonnnull) or in
42    each of READFDS, WRITEFDS, EXCEPTFDS that is nonnull.  If TIMEOUT is not
43    NULL, time out after waiting the interval specified therein.  Returns
44    the number of ready descriptors, or -1 for errors.  */
45 int
_hurd_select(int nfds,struct pollfd * pollfds,fd_set * readfds,fd_set * writefds,fd_set * exceptfds,const struct timespec * timeout,const sigset_t * sigmask)46 _hurd_select (int nfds,
47 	      struct pollfd *pollfds,
48 	      fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
49 	      const struct timespec *timeout, const sigset_t *sigmask)
50 {
51   int i;
52   mach_port_t portset, sigport;
53   int got, ready;
54   error_t err;
55   fd_set rfds, wfds, xfds;
56   int firstfd, lastfd;
57   mach_msg_id_t reply_msgid;
58   mach_msg_timeout_t to;
59   struct timespec ts;
60   struct
61     {
62       struct hurd_userlink ulink;
63       struct hurd_fd *cell;
64       mach_port_t io_port;
65       int type;
66       mach_port_t reply_port;
67       int error;
68     } d[nfds];
69   sigset_t oset;
70   struct hurd_sigstate *ss = NULL;
71 
72   union typeword		/* Use this to avoid unkosher casts.  */
73     {
74       mach_msg_type_t type;
75       uint32_t word;
76     };
77   assert (sizeof (union typeword) == sizeof (mach_msg_type_t));
78   assert (sizeof (uint32_t) == sizeof (mach_msg_type_t));
79 
80   if (nfds < 0 || (pollfds == NULL && nfds > FD_SETSIZE))
81     {
82       errno = EINVAL;
83       return -1;
84     }
85 
86 #define IO_SELECT_REPLY_MSGID (21012 + 100) /* XXX */
87 #define IO_SELECT_TIMEOUT_REPLY_MSGID (21031 + 100) /* XXX */
88 
89   if (timeout == NULL)
90     reply_msgid = IO_SELECT_REPLY_MSGID;
91   else
92     {
93       struct timespec now;
94 
95       if (timeout->tv_sec < 0 || ! valid_nanoseconds (timeout->tv_nsec))
96 	{
97 	  errno = EINVAL;
98 	  return -1;
99 	}
100 
101       err = __clock_gettime (CLOCK_REALTIME, &now);
102       if (err)
103 	return -1;
104 
105       ts.tv_sec = now.tv_sec + timeout->tv_sec;
106       ts.tv_nsec = now.tv_nsec + timeout->tv_nsec;
107 
108       if (ts.tv_nsec >= 1000000000)
109 	{
110 	  ts.tv_sec++;
111 	  ts.tv_nsec -= 1000000000;
112 	}
113 
114       if (ts.tv_sec < 0)
115 	ts.tv_sec = LONG_MAX; /* XXX */
116 
117       reply_msgid = IO_SELECT_TIMEOUT_REPLY_MSGID;
118     }
119 
120   if (sigmask)
121     {
122       /* Add a port to the portset for the case when we get the signal even
123          before calling __mach_msg.  */
124 
125       sigport = __mach_reply_port ();
126 
127       ss = _hurd_self_sigstate ();
128       _hurd_sigstate_lock (ss);
129       /* And tell the signal thread to message us when a signal arrives.  */
130       ss->suspended = sigport;
131       _hurd_sigstate_unlock (ss);
132 
133       if (__sigprocmask (SIG_SETMASK, sigmask, &oset))
134 	{
135 	  _hurd_sigstate_lock (ss);
136 	  ss->suspended = MACH_PORT_NULL;
137 	  _hurd_sigstate_unlock (ss);
138 	  __mach_port_destroy (__mach_task_self (), sigport);
139 	  return -1;
140 	}
141     }
142   else
143     sigport = MACH_PORT_NULL;
144 
145   if (pollfds)
146     {
147       int error = 0;
148       /* Collect interesting descriptors from the user's `pollfd' array.
149 	 We do a first pass that reads the user's array before taking
150 	 any locks.  The second pass then only touches our own stack,
151 	 and gets the port references.  */
152 
153       for (i = 0; i < nfds; ++i)
154 	if (pollfds[i].fd >= 0)
155 	  {
156 	    int type = 0;
157 	    if (pollfds[i].events & POLLIN)
158 	      type |= SELECT_READ;
159 	    if (pollfds[i].events & POLLOUT)
160 	      type |= SELECT_WRITE;
161 	    if (pollfds[i].events & POLLPRI)
162 	      type |= SELECT_URG;
163 
164 	    d[i].io_port = pollfds[i].fd;
165 	    d[i].type = type;
166 	  }
167 	else
168 	  d[i].type = 0;
169 
170       HURD_CRITICAL_BEGIN;
171       __mutex_lock (&_hurd_dtable_lock);
172 
173       for (i = 0; i < nfds; ++i)
174 	if (d[i].type != 0)
175 	  {
176 	    const int fd = (int) d[i].io_port;
177 
178 	    if (fd < _hurd_dtablesize)
179 	      {
180 		d[i].cell = _hurd_dtable[fd];
181 		if (d[i].cell != NULL)
182 		  {
183 		    d[i].io_port = _hurd_port_get (&d[i].cell->port,
184 						   &d[i].ulink);
185 		    if (d[i].io_port != MACH_PORT_NULL)
186 		      continue;
187 		  }
188 	      }
189 
190 	    /* Bogus descriptor, make it EBADF already.  */
191 	    d[i].error = EBADF;
192 	    d[i].type = SELECT_ERROR;
193 	    error = 1;
194 	  }
195 
196       __mutex_unlock (&_hurd_dtable_lock);
197       HURD_CRITICAL_END;
198 
199       if (error)
200 	{
201 	  /* Set timeout to 0.  */
202 	  err = __clock_gettime (CLOCK_REALTIME, &ts);
203 	  if (err)
204 	    {
205 	      /* Really bad luck.  */
206 	      err = errno;
207 	      HURD_CRITICAL_BEGIN;
208 	      __mutex_lock (&_hurd_dtable_lock);
209 	      while (i-- > 0)
210 		if (d[i].type & ~SELECT_ERROR != 0)
211 		  _hurd_port_free (&d[i].cell->port, &d[i].ulink,
212 				   d[i].io_port);
213 	      __mutex_unlock (&_hurd_dtable_lock);
214 	      HURD_CRITICAL_END;
215 	      if (sigmask)
216 		__sigprocmask (SIG_SETMASK, &oset, NULL);
217 	      errno = err;
218 	      return -1;
219 	    }
220 	  reply_msgid = IO_SELECT_TIMEOUT_REPLY_MSGID;
221 	}
222 
223       lastfd = i - 1;
224       firstfd = i == 0 ? lastfd : 0;
225     }
226   else
227     {
228       /* Collect interested descriptors from the user's fd_set arguments.
229 	 Use local copies so we can't crash from user bogosity.  */
230 
231       if (readfds == NULL)
232 	FD_ZERO (&rfds);
233       else
234 	rfds = *readfds;
235       if (writefds == NULL)
236 	FD_ZERO (&wfds);
237       else
238 	wfds = *writefds;
239       if (exceptfds == NULL)
240 	FD_ZERO (&xfds);
241       else
242 	xfds = *exceptfds;
243 
244       HURD_CRITICAL_BEGIN;
245       __mutex_lock (&_hurd_dtable_lock);
246 
247       /* Collect the ports for interesting FDs.  */
248       firstfd = lastfd = -1;
249       for (i = 0; i < nfds; ++i)
250 	{
251 	  int type = 0;
252 	  if (readfds != NULL && FD_ISSET (i, &rfds))
253 	    type |= SELECT_READ;
254 	  if (writefds != NULL && FD_ISSET (i, &wfds))
255 	    type |= SELECT_WRITE;
256 	  if (exceptfds != NULL && FD_ISSET (i, &xfds))
257 	    type |= SELECT_URG;
258 	  d[i].type = type;
259 	  if (type)
260 	    {
261 	      if (i < _hurd_dtablesize)
262 		{
263 		  d[i].cell = _hurd_dtable[i];
264 		  if (d[i].cell != NULL)
265 		    d[i].io_port = _hurd_port_get (&d[i].cell->port,
266 						   &d[i].ulink);
267 		}
268 	      if (i >= _hurd_dtablesize || d[i].cell == NULL ||
269 		  d[i].io_port == MACH_PORT_NULL)
270 		{
271 		  /* If one descriptor is bogus, we fail completely.  */
272 		  while (i-- > 0)
273 		    if (d[i].type != 0)
274 		      _hurd_port_free (&d[i].cell->port, &d[i].ulink,
275 				       d[i].io_port);
276 		  break;
277 		}
278 	      lastfd = i;
279 	      if (firstfd == -1)
280 		firstfd = i;
281 	    }
282 	}
283 
284       __mutex_unlock (&_hurd_dtable_lock);
285       HURD_CRITICAL_END;
286 
287       if (i < nfds)
288 	{
289 	  if (sigmask)
290 	    __sigprocmask (SIG_SETMASK, &oset, NULL);
291 	  errno = EBADF;
292 	  return -1;
293 	}
294 
295       if (nfds > _hurd_dtablesize)
296 	nfds = _hurd_dtablesize;
297     }
298 
299 
300   err = 0;
301   got = 0;
302 
303   /* Send them all io_select request messages.  */
304 
305   if (firstfd == -1)
306     {
307       if (sigport == MACH_PORT_NULL)
308 	/* But not if there were no ports to deal with at all.
309 	   We are just a pure timeout.  */
310 	portset = __mach_reply_port ();
311       else
312 	portset = sigport;
313     }
314   else
315     {
316       portset = MACH_PORT_NULL;
317 
318       for (i = firstfd; i <= lastfd; ++i)
319 	if (!(d[i].type & ~SELECT_ERROR))
320 	  d[i].reply_port = MACH_PORT_NULL;
321 	else
322 	  {
323 	    int type = d[i].type;
324 	    d[i].reply_port = __mach_reply_port ();
325 	    if (timeout == NULL)
326 	      err = __io_select_request (d[i].io_port, d[i].reply_port, type);
327 	    else
328 	      err = __io_select_timeout_request (d[i].io_port, d[i].reply_port,
329 						 ts, type);
330 	    if (!err)
331 	      {
332 		if (firstfd == lastfd && sigport == MACH_PORT_NULL)
333 		  /* When there's a single descriptor, we don't need a
334 		     portset, so just pretend we have one, but really
335 		     use the single reply port.  */
336 		  portset = d[i].reply_port;
337 		else if (got == 0)
338 		  /* We've got multiple reply ports, so we need a port set to
339 		     multiplex them.  */
340 		  {
341 		    /* We will wait again for a reply later.  */
342 		    if (portset == MACH_PORT_NULL)
343 		      /* Create the portset to receive all the replies on.  */
344 		      err = __mach_port_allocate (__mach_task_self (),
345 						  MACH_PORT_RIGHT_PORT_SET,
346 						  &portset);
347 		    if (! err)
348 		      /* Put this reply port in the port set.  */
349 		      __mach_port_move_member (__mach_task_self (),
350 					       d[i].reply_port, portset);
351 		  }
352 	      }
353 	    else
354 	      {
355 		/* No error should happen, but record it for later
356 		   processing.  */
357 		d[i].error = err;
358 		d[i].type |= SELECT_ERROR;
359 		++got;
360 	      }
361 	    _hurd_port_free (&d[i].cell->port, &d[i].ulink, d[i].io_port);
362 	  }
363 
364       if (got == 0 && sigport != MACH_PORT_NULL)
365 	{
366 	  if (portset == MACH_PORT_NULL)
367 	    /* Create the portset to receive the signal message on.  */
368 	    __mach_port_allocate (__mach_task_self (), MACH_PORT_RIGHT_PORT_SET,
369 				  &portset);
370 	  /* Put the signal reply port in the port set.  */
371 	  __mach_port_move_member (__mach_task_self (), sigport, portset);
372 	}
373     }
374 
375   /* GOT is the number of replies (or errors), while READY is the number of
376      replies with at least one type bit set.  */
377   ready = 0;
378 
379   /* Now wait for reply messages.  */
380   if (!err && got == 0)
381     {
382       /* Now wait for io_select_reply messages on PORT,
383 	 timing out as appropriate.  */
384 
385       union
386 	{
387 	  mach_msg_header_t head;
388 #ifdef MACH_MSG_TRAILER_MINIMUM_SIZE
389 	  struct
390 	    {
391 	      mach_msg_header_t head;
392 	      NDR_record_t ndr;
393 	      error_t err;
394 	    } error;
395 	  struct
396 	    {
397 	      mach_msg_header_t head;
398 	      NDR_record_t ndr;
399 	      error_t err;
400 	      int result;
401 	      mach_msg_trailer_t trailer;
402 	    } success;
403 #else
404 	  struct
405 	    {
406 	      mach_msg_header_t head;
407 	      union typeword err_type;
408 	      error_t err;
409 	    } error;
410 	  struct
411 	    {
412 	      mach_msg_header_t head;
413 	      union typeword err_type;
414 	      error_t err;
415 	      union typeword result_type;
416 	      int result;
417 	    } success;
418 #endif
419 	} msg;
420       mach_msg_option_t options;
421       error_t msgerr;
422 
423       /* We rely on servers to implement the timeout, but when there are none,
424 	 do it on the client side.  */
425       if (timeout != NULL && firstfd == -1)
426 	{
427 	  options = MACH_RCV_TIMEOUT;
428 	  to = timeout->tv_sec * 1000 + (timeout->tv_nsec + 999999) / 1000000;
429 	}
430       else
431 	{
432 	  options = 0;
433 	  to = MACH_MSG_TIMEOUT_NONE;
434 	}
435 
436       int cancel_oldtype = LIBC_CANCEL_ASYNC();
437       while ((msgerr = __mach_msg (&msg.head,
438 				   MACH_RCV_MSG | MACH_RCV_INTERRUPT | options,
439 				   0, sizeof msg, portset, to,
440 				   MACH_PORT_NULL)) == MACH_MSG_SUCCESS)
441 	{
442 	  LIBC_CANCEL_RESET (cancel_oldtype);
443 
444 	  /* We got a message.  Decode it.  */
445 #ifdef MACH_MSG_TYPE_BIT
446 	  const union typeword inttype =
447 	  { type:
448 	    { MACH_MSG_TYPE_INTEGER_T, sizeof (integer_t) * 8, 1, 1, 0, 0 }
449 	  };
450 #endif
451 
452 	  if (sigport != MACH_PORT_NULL && sigport == msg.head.msgh_local_port)
453 	    {
454 	      /* We actually got interrupted by a signal before
455 		 __mach_msg; poll for further responses and then
456 		 return quickly. */
457 	      err = EINTR;
458 	      goto poll;
459 	    }
460 
461 	  if (msg.head.msgh_id == reply_msgid
462 	      && msg.head.msgh_size >= sizeof msg.error
463 	      && !(msg.head.msgh_bits & MACH_MSGH_BITS_COMPLEX)
464 #ifdef MACH_MSG_TYPE_BIT
465 	      && msg.error.err_type.word == inttype.word
466 #endif
467 	      )
468 	    {
469 	      /* This is a properly formatted message so far.
470 		 See if it is a success or a failure.  */
471 	      if (msg.error.err == EINTR
472 		  && msg.head.msgh_size == sizeof msg.error)
473 		{
474 		  /* EINTR response; poll for further responses
475 		     and then return quickly.  */
476 		  err = EINTR;
477 		  goto poll;
478 		}
479 	      /* Keep in mind msg.success.result can be 0 if a timeout
480 		 occurred.  */
481 	      if (msg.error.err
482 #ifdef MACH_MSG_TYPE_BIT
483 		  || msg.success.result_type.word != inttype.word
484 #endif
485 		  || msg.head.msgh_size != sizeof msg.success)
486 		{
487 		  /* Error or bogus reply.  */
488 		  if (!msg.error.err)
489 		    msg.error.err = EIO;
490 		  __mach_msg_destroy (&msg.head);
491 		}
492 
493 	      /* Look up the respondent's reply port and record its
494 		 readiness.  */
495 	      {
496 		int had = got;
497 		if (firstfd != -1)
498 		  for (i = firstfd; i <= lastfd; ++i)
499 		    if (d[i].type
500 			&& d[i].reply_port == msg.head.msgh_local_port)
501 		      {
502 			if (msg.error.err)
503 			  {
504 			    d[i].error = msg.error.err;
505 			    d[i].type = SELECT_ERROR;
506 			    ++ready;
507 			  }
508 			else
509 			  {
510 			    d[i].type &= msg.success.result;
511 			    if (d[i].type)
512 			      ++ready;
513 			  }
514 
515 			d[i].type |= SELECT_RETURNED;
516 			++got;
517 		      }
518 		assert (got > had);
519 	      }
520 	    }
521 
522 	  if (msg.head.msgh_remote_port != MACH_PORT_NULL)
523 	    __mach_port_deallocate (__mach_task_self (),
524 				    msg.head.msgh_remote_port);
525 
526 	  if (got)
527 	  poll:
528 	    {
529 	      /* Poll for another message.  */
530 	      to = 0;
531 	      options |= MACH_RCV_TIMEOUT;
532 	    }
533 	}
534       LIBC_CANCEL_RESET (cancel_oldtype);
535 
536       if (msgerr == MACH_RCV_INTERRUPTED)
537 	/* Interruption on our side (e.g. signal reception).  */
538 	err = EINTR;
539 
540       if (ready)
541 	/* At least one descriptor is known to be ready now, so we will
542 	   return success.  */
543 	err = 0;
544     }
545 
546   if (firstfd != -1)
547     for (i = firstfd; i <= lastfd; ++i)
548       if (d[i].reply_port != MACH_PORT_NULL)
549 	__mach_port_destroy (__mach_task_self (), d[i].reply_port);
550 
551   if (sigport != MACH_PORT_NULL)
552     {
553       _hurd_sigstate_lock (ss);
554       ss->suspended = MACH_PORT_NULL;
555       _hurd_sigstate_unlock (ss);
556       __mach_port_destroy (__mach_task_self (), sigport);
557     }
558 
559   if ((firstfd == -1 && sigport == MACH_PORT_NULL)
560       || ((firstfd != lastfd || sigport != MACH_PORT_NULL) && portset != MACH_PORT_NULL))
561     /* Destroy PORTSET, but only if it's not actually the reply port for a
562        single descriptor (in which case it's destroyed in the previous loop;
563        not doing it here is just a bit more efficient).  */
564     __mach_port_destroy (__mach_task_self (), portset);
565 
566   if (err)
567     {
568       if (sigmask)
569 	__sigprocmask (SIG_SETMASK, &oset, NULL);
570       return __hurd_fail (err);
571     }
572 
573   if (pollfds)
574     /* Fill in the `revents' members of the user's array.  */
575     for (i = 0; i < nfds; ++i)
576       {
577 	int type = d[i].type;
578 	int revents = 0;
579 
580 	if (type & SELECT_ERROR)
581 	  switch (d[i].error)
582 	    {
583 	      case EPIPE:
584 		revents = POLLHUP;
585 		break;
586 	      case EBADF:
587 		revents = POLLNVAL;
588 		break;
589 	      default:
590 		revents = POLLERR;
591 		break;
592 	    }
593 	else
594 	  if (type & SELECT_RETURNED)
595 	    {
596 	      if (type & SELECT_READ)
597 		revents |= POLLIN;
598 	      if (type & SELECT_WRITE)
599 		revents |= POLLOUT;
600 	      if (type & SELECT_URG)
601 		revents |= POLLPRI;
602 	    }
603 
604 	pollfds[i].revents = revents;
605       }
606   else
607     {
608       /* Below we recalculate READY to include an increment for each operation
609 	 allowed on each fd.  */
610       ready = 0;
611 
612       /* Set the user bitarrays.  We only ever have to clear bits, as all
613 	 desired ones are initially set.  */
614       if (firstfd != -1)
615 	for (i = firstfd; i <= lastfd; ++i)
616 	  {
617 	    int type = d[i].type;
618 
619 	    if ((type & SELECT_RETURNED) == 0)
620 	      type = 0;
621 
622 	    /* Callers of select don't expect to see errors, so we simulate
623 	       readiness of the erring object and the next call hopefully
624 	       will get the error again.  */
625 	    if (type & SELECT_ERROR)
626 	      {
627 		type = 0;
628 		if (readfds != NULL && FD_ISSET (i, readfds))
629 		  type |= SELECT_READ;
630 		if (writefds != NULL && FD_ISSET (i, writefds))
631 		  type |= SELECT_WRITE;
632 		if (exceptfds != NULL && FD_ISSET (i, exceptfds))
633 		  type |= SELECT_URG;
634 	      }
635 
636 	    if (type & SELECT_READ)
637 	      ready++;
638 	    else if (readfds)
639 	      FD_CLR (i, readfds);
640 	    if (type & SELECT_WRITE)
641 	      ready++;
642 	    else if (writefds)
643 	      FD_CLR (i, writefds);
644 	    if (type & SELECT_URG)
645 	      ready++;
646 	    else if (exceptfds)
647 	      FD_CLR (i, exceptfds);
648 	  }
649     }
650 
651   if (sigmask && __sigprocmask (SIG_SETMASK, &oset, NULL))
652     return -1;
653 
654   return ready;
655 }
656