《TCPIP详解卷2》笔记： read和write系统调用

xiaoxiao2021-02-28 43

read系统调用我们再熟悉不过了，下面列出与read系统调用相关的读系统调用：

#include <unistd.h> ssize_t read(int fd, void *buf, size_t count); #include <sys/uio.h> ssize_t readv(int fd, const struct iovec *iov, int iovcnt); #include <sys/types.h> #include <sys/socket.h> ssize_t recv(int sockfd, void *buf, size_t len, int flags); ssize_t recvfrom(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen); ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags); 在BSD内核中，对于socket类型的描述符，这些系统调用最终都会调用到soreceive函数，如下图所示：

在这里，我只讲述read系统调用。read系统调用的代码如下：

/*
 * Read system call.
 *
 * Checks that the descriptor is in range, open, and opened for reading,
 * wraps the caller's buffer in a one-element iovec/uio pair, and passes
 * the transfer to the file's fo_read operation.  *retval receives the
 * requested byte count minus whatever residual is left in the uio.
 * Returns EBADF for an unusable descriptor, otherwise the (possibly
 * cleared) error code from fo_read.
 */
int
read(p, uap, retval)
	struct proc *p;
	register struct read_args /* {
		syscallarg(int) fd;
		syscallarg(char *) buf;
		syscallarg(u_int) nbyte;
	} */ *uap;
	register_t *retval;
{
	register struct file *fp;
	register struct filedesc *fdp = p->p_fd;
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;

	/* Out-of-range, unopened, or not readable: bad descriptor. */
	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
	    (fp->f_flag & FREAD) == 0)
		return (EBADF);
	aiov.iov_base = (caddr_t)SCARG(uap, buf);	/* start address of the buffer */
	aiov.iov_len = SCARG(uap, nbyte);		/* number of bytes to read */
	auio.uio_iov = &aiov;				/* buffer vector */
	auio.uio_iovcnt = 1;				/* number of buffers */
	auio.uio_resid = SCARG(uap, nbyte);		/* number of bytes to read */
	auio.uio_rw = UIO_READ;				/* read operation */
	auio.uio_segflg = UIO_USERSPACE;		/* buffer lives in user space */
	auio.uio_procp = p;
	cnt = SCARG(uap, nbyte);
	/* Dispatch through the file ops; for sockets the article says this is soo_read. */
	if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))
		/*
		 * If some data was already transferred, a restart/interrupt/
		 * would-block error is dropped and the short count is returned.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;			/* number of bytes actually read */
	return (error);
}

内核用iovec结构描述缓冲区，其中包含缓冲区的起始地址和大小。用uio结构描述缓冲区数据的传输操作。它们的定义如下：

/*
 * XXX
 * iov_base should be a void *.
 */
struct iovec {
	char	*iov_base;	/* Base address. */
	size_t	 iov_len;	/* Length. */
};

enum	uio_rw { UIO_READ, UIO_WRITE };

/* Segment flag values. */
enum uio_seg {
	UIO_USERSPACE,		/* from user data space */
	UIO_SYSSPACE,		/* from system space */
	UIO_USERISPACE		/* from user I space */
};

struct uio {
	struct	iovec *uio_iov;		/* buffer vector */
	int	uio_iovcnt;		/* number of buffers */
	off_t	uio_offset;		/* transfer offset; NOTE(review): the article describes this as
					   the offset within the current buffer - confirm */
	int	uio_resid;		/* bytes remaining to transfer */
	enum	uio_seg uio_segflg;	/* whether the data is in user or kernel space */
	enum	uio_rw uio_rw;		/* read or write */
	struct	proc *uio_procp;
};

soo_read函数调用soreceive函数完成读取接收缓冲区的数据到用户空间的缓冲区中。它的代码很简单：

/* ARGSUSED */ int soo_read(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { return (soreceive((struct socket *)fp->f_data, (struct mbuf **)0, uio, (struct mbuf **)0, (struct mbuf **)0, (int *)0)); }

我已经介绍过，soreceive函数会被多个读系统调用函数调用。它的复杂性主要体现在： 1. 处理多种协议的数据，包括控制数据和TCP的带外数据。 2. 处理recv、recvfrom和recvmsg系统调用传递的标志，这些标志会影响读取数据的行为。

在这里，我只关心TCP协议。TCP是流式协议，它的接收缓冲区mbuf的组织如下所示：

soo_read函数调用soreceive函数时，只传递so和uio参数，其他的参数都为空。对于soreceive函数，我已经删去了与TCP协议无关的流程，还删去了读取TCP带外数据的流程。删去这些并不影响我们了解读取正常TCP数据的流程，简化后的代码如下：

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * NOTE: this is the article's reduced version.  Only the path that copies
 * ordinary stream data into the uio is kept; paddr, mp0, controlp and
 * flagsp are accepted but never referenced in this body.
 * Returns 0 or an errno value.
 */
int
soreceive(so, paddr, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct mbuf **paddr;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m;
	register int len, error, s;
	struct protosw *pr = so->so_proto;
	int moff;
	int orig_resid = uio->uio_resid;

restart:
	/* Lock the receive buffer. */
	if (error = sblock(&so->so_rcv, SBLOCKWAIT(0)))
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;		/* head of the receive buffer's mbuf list */
	/*
	 * Take the "may have to wait" path when the buffer is empty, or when
	 * it holds less than the caller asked for and is also below the
	 * low-water mark.
	 */
	if (m == 0 || (so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {		/* pending socket error? */
			if (m)
				goto dontblock;	/* data is queued: deliver it first */
			error = so->so_error;
			so->so_error = 0;
			goto release;		/* otherwise report the error */
		}
		if (so->so_state & SS_CANTRCVMORE) {	/* no more data can arrive? */
			if (m)
				goto dontblock;
			else
				goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
			/* connection not established */
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)	/* nothing was requested */
			goto release;
		if (so->so_state & SS_NBIO) {	/* non-blocking socket */
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);	/* sleep until more data arrives */
		splx(s);
		if (error)
			return (error);
		goto restart;			/* woken up: run the checks again */
	}
dontblock:
	/* Reaching here means queued data can be copied out. */
	moff = 0;
	/* Loop while mbufs remain, the caller still wants bytes, and no error occurred. */
	while (m && uio->uio_resid > 0 && error == 0) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;		/* bytes still wanted */
		if (len > m->m_len - moff)
			len = m->m_len - moff;	/* cap at what this mbuf holds */
		/* Drop the interrupt level around the copy, as the header comment explains. */
		splx(s);
		error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
						/* copy out; uio->uio_resid is updated */
		s = splnet();
		if (len == m->m_len - moff) {	/* this mbuf was fully consumed */
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);	/* release the mbuf */
			m = so->so_rcv.sb_mb;		/* continue with the next mbuf */
		} else {
			/* Partial read: adjust the mbuf and the buffer byte count. */
			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
		}
	}
	/*
	 * After reading, issue PRU_RCVD to the protocol's user-request
	 * routine (tcp_usrreq for TCP, per the article).
	 */
	(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
	    (struct mbuf *)(long)0, (struct mbuf *)0, (struct mbuf *)0);
	/* Nothing was transferred although bytes were requested: try again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

下面是与write系统调用相关的写系统调用：

#include <unistd.h> ssize_t write(int fd, const void *buf, size_t count); #include <sys/uio.h> ssize_t writev(int fd, const struct iovec *iov, int iovcnt); #include <sys/types.h> #include <sys/socket.h> ssize_t send(int sockfd, const void *buf, size_t len, int flags); ssize_t sendto(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen); ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags); 对于socket类型的描述符，这些系统调用最终都会调用到sosend函数，如下图所示：

同样，我只关心write系统调用。它的代码如下：

/* * Write system call */ int write(p, uap, retval) struct proc *p; register struct write_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) nbyte; } */ *uap; register_t *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; struct uio auio; struct iovec aiov; long cnt, error = 0; if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || (fp->f_flag & FWRITE) == 0) return (EBADF); aiov.iov_base = (caddr_t)SCARG(uap, buf); /*缓冲区起始位置*/ aiov.iov_len = SCARG(uap, nbyte); /*要读取的字节数*/ auio.uio_iov = &aiov; /*缓冲区向量*/ auio.uio_iovcnt = 1; /*缓冲区个数*/ auio.uio_resid = SCARG(uap, nbyte); /*要读取的字节数*/ auio.uio_rw = UIO_WRITE; /*写操作*/ auio.uio_segflg = UIO_USERSPACE; /*缓冲区在用户空间*/ auio.uio_procp = p; cnt = SCARG(uap, nbyte); if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) { /*调用soo_write函数写数据到内核*/ if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) psignal(p, SIGPIPE); } cnt -= auio.uio_resid; *retval = cnt; /*返回成功写入的字节数*/ return (error); }

soo_write函数的代码也很简单：

/* ARGSUSED */ int soo_write(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { return (sosend((struct socket *)fp->f_data, (struct mbuf *)0, uio, (struct mbuf *)0, (struct mbuf *)0, 0)); } soo_write函数调用sosend函数时，只传递了so和uio参数。同样我也简化了sosend函数的流程，使得它的流程只包含发送正常的TCP数据。简化后的代码如下： /* * Send on a socket. * If send must go all at once and message is larger than * send buffering, then hard error. * Lock against other senders. * If must go all at once and not enough room now, then * inform user that this would block and do nothing. * Otherwise, if nonblocking, send as much as possible. * The data to be sent is described by "uio" if nonzero, * otherwise by the mbuf chain "top" (which must be null * if uio is not). Data provided in mbuf chain must be small * enough to send all at once. * * Returns nonzero on error, timeout or signal; callers * must check for short counts if EINTR/ERESTART are returned. * Data and control buffers are freed on return. */ int sosend(so, addr, uio, top, control, flags) register struct socket *so; struct mbuf *addr; struct uio *uio; struct mbuf *top; struct mbuf *control; int flags; { struct proc *p = curproc; /* XXX */ struct mbuf **mp; register struct mbuf *m; register long space, len, resid; int clen = 0, error, s, mlen; resid = uio->uio_resid; /* * In theory resid should be unsigned. * However, space must be signed, as it might be less than 0 * if we over-committed, and we must use a signed comparison * of space and resid. On the other hand, a negative resid * causes us to loop sending 0-length segments to the protocol. 
*/ if (resid < 0) return (EINVAL); p->p_stats->p_ru.ru_msgsnd++; #define snderr(errno) { error = errno; splx(s); goto release; } restart: if (error = sblock(&so->so_snd, SBLOCKWAIT(flags))) /*给发送缓冲区加锁*/ goto out; do { /*外循环*/ s = splnet(); if (so->so_state & SS_CANTSENDMORE) /*不能再发送数据了？*/ snderr(EPIPE); if (so->so_error) /*socket出现错误？*/ snderr(so->so_error); if ((so->so_state & SS_ISCONNECTED) == 0) { /*连接未建立？*/ if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) snderr(ENOTCONN); } else if (addr == 0) snderr(EDESTADDRREQ); } space = sbspace(&so->so_snd); /*发送缓冲区可用空间大小*/ if (space < resid + clen && uio && (space < so->so_snd.sb_lowat || space < clen)) { /*发送缓冲区可用空间少于要发送的数据并且少于低水位标记*/ if (so->so_state & SS_NBIO) /*非阻塞》*/ snderr(EWOULDBLOCK); sbunlock(&so->so_snd); error = sbwait(&so->so_snd); /*在发送缓冲区等待可用空间变大*/ splx(s); if (error) goto out; goto restart; } splx(s); mp = ⊤ /*mp保存top指针的地址*/ space -= clen; do { /*内循环*/ if (top == 0) { /*top都为0*/ MGETHDR(m, M_WAIT, MT_DATA); /*获取存储分组首部的mbuf*/ mlen = MHLEN; m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = (struct ifnet *)0; } else { MGET(m, M_WAIT, MT_DATA); mlen = MLEN; } if (resid >= MINCLSIZE && space >= MCLBYTES) { /*要写入的数据量值得分配一个簇*/ MCLGET(m, M_WAIT); /* 获取一个簇*/ if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; len = min(MCLBYTES, resid); space -= MCLBYTES; /*更新发送缓冲区可用空间大小*/ } else { nopages: len = min(min(mlen, resid), space); /*一次uiomove操作可写入的数据量*/ space -= len; /*更新发送缓冲区可用空间大小*/ } error = uiomove(mtod(m, caddr_t), (int)len, uio); /*写入数据到mbuf*/ resid = uio->uio_resid; /*剩余要写入的字节数*/ m->m_len = len; /*更新mbuf中的数据量*/ *mp = m; /*top指针指向m*/ top->m_pkthdr.len += len; /*更新mbuf中的分组长度*/ if (error) goto release; mp = &m->m_next; /*指向下一个mbuf*/ if (resid <= 0) break; s = splnet(); /* XXX */ error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top, addr, control); /*以PRU_SEND调用tcp_usrreq函数*/ splx(s); clen = 0; control = 0; top = 0; mp = ⊤ /*再次指向top*/ if 
(error) goto release; } while (resid && space > 0); /*当还有要写入的数据并且发送缓冲区有可用空间时，循环*/ } while (resid); /*当还有数据要写入，但是发送缓冲区无可用空间时*/ release: sbunlock(&so->so_snd); out: if (top) m_freem(top); return (error); }

tcp_usrreq函数对PRU_RCVD和PRU_SEND请求命令的处理代码片段如下：

/*
 * Excerpt: the PRU_RCVD and PRU_SEND cases from tcp_usrreq's request
 * handling.  The enclosing switch and the declarations of tp, so, m and
 * error are outside this fragment.
 */

/*
 * After a receive, possibly send window update to peer.
 */
case PRU_RCVD:
	(void) tcp_output(tp);
	break;

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.
 */
case PRU_SEND:
	sbappend(&so->so_snd, m);
	error = tcp_output(tp);
	break;

对于PRU_RCVD命令，tcp_usrreq函数调用tcp_output函数。因为用户已经从接收缓冲区中读取数据了，接收缓冲区空间变大，可能需要发送窗口更新报文。

对于PRU_SEND命令，tcp_usrreq函数将数据（已放在由sosend函数构造的mbuf中）添加到发送缓冲区，然后调用tcp_output函数发送。

转载请注明原文地址: https://www.6miu.com/read-2621791.html

技术

最新回复(0)

《TCPIP详解 卷2》 笔记： read和write系统调用

技术

《TCPIP详解卷2》笔记： read和write系统调用