【Linux4.1.12源码分析】邻居子系统实现分析 - neigh

xiaoxiao2021-02-28  153

http://blog.csdn.net/one_clouder/article/details/52889921

邻居子系统使IP层发包时无需感知MAC地址,即MAC头的封装由邻居子系统完成。MAC头信息包括:源MAC、目的MAC、协议类型。其中协议类型由上层指定(例如IPv4);源MAC地址是出口设备的MAC地址(出口设备在路由查找时确定);目的MAC地址由邻居子系统提供。大致可以猜到,邻居子系统会主动发起ARP请求获取目的MAC地址,从而完成MAC头封装。IP层发包最后会调用ip_finish_output2函数,我们从该函数入手分析邻居子系统。

ip_finish_output2函数

[cpp] view plain copy static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)  {      struct dst_entry *dst = skb_dst(skb);      struct rtable *rt = (struct rtable *)dst;      struct net_device *dev = dst->dev;       //出口设备      unsigned int hh_len = LL_RESERVED_SPACE(dev);      struct neighbour *neigh;      u32 nexthop;        if (rt->rt_type == RTN_MULTICAST) {          IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);      } else if (rt->rt_type == RTN_BROADCAST)          IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);        /* Be paranoid, rather than too clever. */      if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {          struct sk_buff *skb2;            skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));          if (!skb2) {              kfree_skb(skb);              return -ENOMEM;          }          if (skb->sk)              skb_set_owner_w(skb2, skb->sk);          consume_skb(skb);          skb = skb2;      }        rcu_read_lock_bh();      nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); //目的IP地址      neigh = __ipv4_neigh_lookup_noref(dev, nexthop);    //根据目的IP查找邻居项是否存在      if (unlikely(!neigh))          neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); //如果不存在,则创建neigh项      if (!IS_ERR(neigh)) {          int res = dst_neigh_output(dst, neigh, skb);    //调用邻居子系统封装MAC头,并且调用二层发包函数完成报文发送            rcu_read_unlock_bh();          return res;      }      rcu_read_unlock_bh();        net_dbg_ratelimited("%s: No header cache and no neighbour!\n",                  __func__);      kfree_skb(skb);      return -EINVAL;  }   首先会根据出口设备和目的IP地址,查找是否已经存在邻居项,如果没有则创建邻居项,然后通过dst_neigh_output发包,本文分析假设没有邻居项。 先邻居项的查找函数:

__ipv4_neigh_lookup_noref函数

[cpp] view plain copy static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)  {      return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);  //ipv4从arp_tbl中查找  }   ___neigh_lookup_noref函数 [cpp] view plain copy static inline struct neighbour *___neigh_lookup_noref(      struct neigh_table *tbl,      bool (*key_eq)(const struct neighbour *n, const void *pkey),      __u32 (*hash)(const void *pkey,                const struct net_device *dev,                __u32 *hash_rnd),      const void *pkey,      struct net_device *dev)  {      struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); //hash表,邻居数量大时加速      struct neighbour *n;      u32 hash_val;        hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);    //计算hash值      for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);           n != NULL;           n = rcu_dereference_bh(n->next)) {          if (n->dev == dev && key_eq(n, pkey))    //dev相同并且pkey相同,这里pkey是IPV4地址              return n;      }        return NULL;  }   邻居表项查找比较简单,就是在hash表中查找匹配设备和目的IP地址的邻居表项,该函数支持IPV6, 可扩展性通过参数实现,接下来看下创建邻居表项的实现:

__neigh_create函数

[cpp] view plain copy struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,                   struct net_device *dev, bool want_ref)  {      u32 hash_val;      int key_len = tbl->key_len;      int error;      struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); //创建邻居表项对象      struct neigh_hash_table *nht;        if (!n) {          rc = ERR_PTR(-ENOBUFS);          goto out;      }        memcpy(n->primary_key, pkey, key_len);      n->dev = dev;      dev_hold(dev);        /* Protocol specific setup. */      if (tbl->constructor &&  (error = tbl->constructor(n)) < 0) {  //IPV4实际调用arp_constructor函数,设置output函数          rc = ERR_PTR(error);          goto out_neigh_release;      }        if (dev->netdev_ops->ndo_neigh_construct) {   //一般设备不设置该变量          error = dev->netdev_ops->ndo_neigh_construct(n);          if (error < 0) {              rc = ERR_PTR(error);              goto out_neigh_release;          }      }        /* Device specific setup. */      if (n->parms->neigh_setup &&          (error = n->parms->neigh_setup(n)) < 0) {  //IPV4未定义该函数          rc = ERR_PTR(error);          goto out_neigh_release;      }        n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);        write_lock_bh(&tbl->lock);      nht = rcu_dereference_protected(tbl->nht,                      lockdep_is_held(&tbl->lock));        if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))          nht = neigh_hash_grow(tbl, nht->hash_shift + 1);        hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);    //计算hash值,计算方式由邻居表定义        if (n->parms->dead) {          rc = ERR_PTR(-EINVAL);          goto out_tbl_unlock;      }        for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], //找到有相同hash值得neighbour链表                          lockdep_is_held(&tbl->lock));           n1 != NULL;           n1 = rcu_dereference_protected(n1->next,              lockdep_is_held(&tbl->lock))) {          if (dev == n1->dev && 
!memcmp(n1->primary_key, pkey, key_len)) {              if (want_ref)                  neigh_hold(n1);              rc = n1;              goto out_tbl_unlock;          }      }        n->dead = 0;      if (want_ref)          neigh_hold(n);      rcu_assign_pointer(n->next,                 rcu_dereference_protected(nht->hash_buckets[hash_val],                               lockdep_is_held(&tbl->lock)));  //插入到链表中      rcu_assign_pointer(nht->hash_buckets[hash_val], n);      write_unlock_bh(&tbl->lock);      neigh_dbg(2, "neigh %p is created\n", n);      rc = n;  out:      return rc;  out_tbl_unlock:      write_unlock_bh(&tbl->lock);  out_neigh_release:      neigh_release(n);      goto out;  }   neigh_alloc函数 [cpp] view plain copy static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)  {      struct neighbour *n = NULL;      unsigned long now = jiffies;      int entries;        entries = atomic_inc_return(&tbl->entries) - 1;      if (entries >= tbl->gc_thresh3 ||          (entries >= tbl->gc_thresh2 &&           time_after(now, tbl->last_flush + 5 * HZ))) {          if (!neigh_forced_gc(tbl) &&              entries >= tbl->gc_thresh3)              goto out_entries;      }        n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);      if (!n)          goto out_entries;        __skb_queue_head_init(&n->arp_queue);    //初始化arp_queue队列      rwlock_init(&n->lock);      seqlock_init(&n->ha_lock);      n->updated     = n->used = now;      n->nud_state   = NUD_NONE;       //状态为不可用      n->output      = neigh_blackhole;    //直接丢弃报文      seqlock_init(&n->hh.hh_lock);      n->parms   = neigh_parms_clone(&tbl->parms);  //拷贝neigh_table中的parms      setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);   //注册定时器        NEIGH_CACHE_STAT_INC(tbl, allocs);      n->tbl         = tbl;      atomic_set(&n->refcnt, 1);      n->dead        = 1;  out:      return n;    out_entries:      atomic_dec(&tbl->entries);      goto out;  }   
arp_constructor函数 [cpp] view plain copy static int arp_constructor(struct neighbour *neigh)  {      __be32 addr = *(__be32 *)neigh->primary_key;      struct net_device *dev = neigh->dev;      struct in_device *in_dev;      struct neigh_parms *parms;        rcu_read_lock();      in_dev = __in_dev_get_rcu(dev);     //通过net_device得到in_device      if (!in_dev) {          rcu_read_unlock();          return -EINVAL;      }        neigh->type = inet_addr_type(dev_net(dev), addr);    //设置地址类型        parms = in_dev->arp_parms;      __neigh_parms_put(neigh->parms);      neigh->parms = neigh_parms_clone(parms);      rcu_read_unlock();        if (!dev->header_ops) {      //基本上的网卡都会设置该值          neigh->nud_state = NUD_NOARP;          neigh->ops = &arp_direct_ops;          neigh->output = neigh_direct_output;      } else {          /* Good devices (checked by reading texts, but only Ethernet is            tested)             ARPHRD_ETHER: (ethernet, apfddi)            ARPHRD_FDDI: (fddi)            ARPHRD_IEEE802: (tr)            ARPHRD_METRICOM: (strip)            ARPHRD_ARCNET:            etc. etc. etc.             ARPHRD_IPDDP will also work, if author repairs it.            I did not it, because this driver does not work even            in old paradigm.          
*/            if (neigh->type == RTN_MULTICAST) {      //组播地址不需要arp              neigh->nud_state = NUD_NOARP;              arp_mc_map(addr, neigh->ha, dev, 1);          } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {    //设备明确不需要arp或本地回环设备,不需要arp              neigh->nud_state = NUD_NOARP;              memcpy(neigh->ha, dev->dev_addr, dev->addr_len);          } else if (neigh->type == RTN_BROADCAST ||                 (dev->flags & IFF_POINTOPOINT)) { //广播或点对点,也不需要arp              neigh->nud_state = NUD_NOARP;              memcpy(neigh->ha, dev->broadcast, dev->addr_len);          }            if (dev->header_ops->cache)       //eth_header_ops包含cache              neigh->ops = &arp_hh_ops;          else              neigh->ops = &arp_generic_ops;            if (neigh->nud_state & NUD_VALID)              neigh->output = neigh->ops->connected_output;          else              neigh->output = neigh->ops->output;    //初始阶段为该值,即arp_hh_ops的neigh_resolve_output函数      }      return 0;  }   邻居表项创建后,output函数为neigh_resolve_output,此时邻居子系统还不具备发送IP报文的能力,因为目的MAC地址还未获取,我们来看下dst_neigh_output函数实现:

dst_neigh_output函数

[cpp] view plain copy static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,                     struct sk_buff *skb)  {      const struct hh_cache *hh;        if (dst->pending_confirm) {          unsigned long now = jiffies;            dst->pending_confirm = 0;          /* avoid dirtying neighbour */          if (n->confirmed != now)              n->confirmed = now;      }        hh = &n->hh;      if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) //如果neighbour已连接且hh已设置          return neigh_hh_output(hh, skb);      else          return n->output(n, skb);    //初始阶段调用此函数,此时为neigh_resolve_output函数  }   neigh_resolve_output函数 [cpp] view plain copy int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)  {      int rc = 0;        if (!neigh_event_send(neigh, skb)) {        //发送arp请求,第一次返回true          int err;          struct net_device *dev = neigh->dev;          unsigned int seq;            if (dev->header_ops->cache && !neigh->hh.hh_len)              neigh_hh_init(neigh);       //初始化MAC缓存值,目的是加速            do {              __skb_pull(skb, skb_network_offset(skb));   //常见情况,skb指向network header              seq = read_seqbegin(&neigh->ha_lock);              err = dev_hard_header(skb, dev, ntohs(skb->protocol),  //封装MAC头                            neigh->ha, NULL, skb->len);          } while (read_seqretry(&neigh->ha_lock, seq));            if (err >= 0)              rc = dev_queue_xmit(skb);   //二层发送报文          else              goto out_kfree_skb;      }  out:      return rc;  out_kfree_skb:      rc = -EINVAL;      kfree_skb(skb);      goto out;  }   neigh_event_send函数 [cpp] view plain copy static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  {      unsigned long now = jiffies;            if (neigh->used != now)          neigh->used = now;      if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))          return __neigh_event_send(neigh, skb);  //发送arp请求      return 0;  }   
__neigh_event_send函数 [cpp] view plain copy int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  {      int rc;      bool immediate_probe = false;        write_lock_bh(&neigh->lock);        rc = 0;      if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))          goto out_unlock_bh;      if (neigh->dead)          goto out_dead;        if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {    //初始阶段进入此分支          if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +              NEIGH_VAR(neigh->parms, APP_PROBES)) {              unsigned long next, now = jiffies;                atomic_set(&neigh->probes,                     NEIGH_VAR(neigh->parms, UCAST_PROBES));              neigh->nud_state     = NUD_INCOMPLETE;       //设置表项状态为incomplete              neigh->updated = now;              next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),                       HZ/2);              neigh_add_timer(neigh, next);   //触发定时器,期望刷新表项状态和output函数,500毫秒后执行              immediate_probe = true;          } else {              neigh->nud_state = NUD_FAILED;              neigh->updated = jiffies;              write_unlock_bh(&neigh->lock);                kfree_skb(skb);              return 1;          }      } else if (neigh->nud_state & NUD_STALE) {          neigh_dbg(2, "neigh %p is delayed\n", neigh);          neigh->nud_state = NUD_DELAY;          neigh->updated = jiffies;          neigh_add_timer(neigh, jiffies +                  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));      }        if (neigh->nud_state == NUD_INCOMPLETE) {          if (skb) {              while (neigh->arp_queue_len_bytes + skb->truesize >                     NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {   //如果等待发送的报文数量超过设定值,丢弃报文                  struct sk_buff *buff;                    buff = __skb_dequeue(&neigh->arp_queue);                  if (!buff)                      break;                  neigh->arp_queue_len_bytes -= buff->truesize;                  kfree_skb(buff);       
           NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);              }              skb_dst_force(skb);              __skb_queue_tail(&neigh->arp_queue, skb);    //报文放入arp_queue队列中              neigh->arp_queue_len_bytes += skb->truesize;          }          rc = 1;      }  out_unlock_bh:      if (immediate_probe)        //初始阶段,邻居项设置状态设置为incomplete,同时设置该变量为true          neigh_probe(neigh); //探测邻居表项      else          write_unlock(&neigh->lock);      local_bh_enable();      return rc;    out_dead:      if (neigh->nud_state & NUD_STALE)          goto out_unlock_bh;      write_unlock_bh(&neigh->lock);      kfree_skb(skb);      return 1;  }   neigh_probe函数 [cpp] view plain copy static void neigh_probe(struct neighbour *neigh)      __releases(neigh->lock)  {      struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);  //取出报文      /* keep skb alive even if arp_queue overflows */      if (skb)          skb = skb_copy(skb, GFP_ATOMIC);    //拷贝skb      write_unlock(&neigh->lock);      neigh->ops->solicit(neigh, skb);  //实际调用arp_solicit函数,该函数会发送arp请求      atomic_inc(&neigh->probes);      kfree_skb(skb);  }   从上述函数可以看到,报文并没有被发送出去,做了3个事情:1)发送了arp请求, 2)缓存了报文,3)启动定时器500毫秒后执行。 报文被丢弃了? 没有,其实报文是在neigh_update函数中被发送的,该函数的一个调用者是arp处理函数。 调用neigh_update函数后,neigh的output函数被改变,在这个之前,ouput函数仍然是neigh_resolve_output,如果是同一个目的IP,不会再次发送arp请求,仅仅把报文缓存起来,下面我们来看下neigh_update函数:

neigh_update函数

[cpp] view plain copy /* Generic update routine.    -- lladdr is new lladdr or NULL, if it is not supplied.    -- new    is new state.    -- flags     NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,                 if it is different.     NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"                 lladdr instead of overriding it                 if it is different.                 It also allows to retain current state                 if lladdr is unchanged.     NEIGH_UPDATE_F_ADMIN    means that the change is administrative.      NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing                 NTF_ROUTER flag.     NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as                 a router.     Caller MUST hold reference count on the entry.  */    int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,           u32 flags)  {      u8 old;      int err;      int notify = 0;      struct net_device *dev;      int update_isrouter = 0;        write_lock_bh(&neigh->lock);        dev    = neigh->dev;      old    = neigh->nud_state;      err    = -EPERM;        if (!(flags & NEIGH_UPDATE_F_ADMIN) &&          (old & (NUD_NOARP | NUD_PERMANENT)))          goto out;      if (neigh->dead)          goto out;        if (!(new & NUD_VALID)) {          neigh_del_timer(neigh);          if (old & NUD_CONNECTED)              neigh_suspect(neigh);          neigh->nud_state = new;          err = 0;          notify = old & NUD_VALID;          if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&              (new & NUD_FAILED)) {              neigh_invalidate(neigh);              notify = 1;          }          goto out;      }        /* Compare new lladdr with cached one */      if (!dev->addr_len) {          /* First case: device needs no address. 
*/          lladdr = neigh->ha;      } else if (lladdr) {          /* The second case: if something is already cached            and a new address is proposed:            - compare new & old            - if they are different, check override flag          */          if ((old & NUD_VALID) &&              !memcmp(lladdr, neigh->ha, dev->addr_len))              lladdr = neigh->ha;      } else {          /* No address is supplied; if we know something,            use it, otherwise discard the request.          */          err = -EINVAL;          if (!(old & NUD_VALID))              goto out;          lladdr = neigh->ha;      }        if (new & NUD_CONNECTED)          neigh->confirmed = jiffies;      neigh->updated = jiffies;        /* If entry was valid and address is not changed,        do not change entry state, if new one is STALE.      */      err = 0;      update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;      if (old & NUD_VALID) {          if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {              update_isrouter = 0;              if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&                  (old & NUD_CONNECTED)) {                  lladdr = neigh->ha;                  new = NUD_STALE;              } else                  goto out;          } else {              if (lladdr == neigh->ha && new == NUD_STALE &&                  ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||                   (old & NUD_CONNECTED))                  )                  new = old;          }      }        if (new != old) {          neigh_del_timer(neigh);          if (new & NUD_IN_TIMER)              neigh_add_timer(neigh, (jiffies +                          ((new & NUD_REACHABLE) ?                           
neigh->parms->reachable_time :                           0)));          neigh->nud_state = new;          notify = 1;      }        if (lladdr != neigh->ha) {          write_seqlock(&neigh->ha_lock);          memcpy(&neigh->ha, lladdr, dev->addr_len);          write_sequnlock(&neigh->ha_lock);          neigh_update_hhs(neigh);          if (!(new & NUD_CONNECTED))              neigh->confirmed = jiffies -                        (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);          notify = 1;      }      if (new == old)          goto out;      if (new & NUD_CONNECTED)          neigh_connect(neigh);   //修改output函数为neigh_connected_output      else          neigh_suspect(neigh);      if (!(old & NUD_VALID)) {   //如果源状态不为valid,则发送缓存的skb          struct sk_buff *skb;            /* Again: avoid dead loop if something went wrong */            while (neigh->nud_state & NUD_VALID &&                 (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {   //取出缓冲报文              struct dst_entry *dst = skb_dst(skb);              struct neighbour *n2, *n1 = neigh;              write_unlock_bh(&neigh->lock);                rcu_read_lock();                /* Why not just use 'neigh' as-is?  The problem is that              * things such as shaper, eql, and sch_teql can end up              * using alternative, different, neigh objects to output              * the packet in the output path.  So what we need to do              * here is re-lookup the top-level neigh in the path so              * we can reinject the packet there.              
*/              n2 = NULL;              if (dst) {                  n2 = dst_neigh_lookup_skb(dst, skb);                  if (n2)                      n1 = n2;              }              n1->output(n1, skb);     //调用neigh的output函数,此时已经改成connect函数              if (n2)                  neigh_release(n2);              rcu_read_unlock();                write_lock_bh(&neigh->lock);          }          __skb_queue_purge(&neigh->arp_queue);    //清空缓存          neigh->arp_queue_len_bytes = 0;      }  out:      if (update_isrouter) {          neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?              (neigh->flags | NTF_ROUTER) :              (neigh->flags & ~NTF_ROUTER);      }      write_unlock_bh(&neigh->lock);        if (notify)          neigh_update_notify(neigh);        return err;  }   至此,arp的整个大流程基本清晰了,有些细节还有待梳理,例如neigh_update中发包时,为什么需要重新查找neigh表项而不用当前的neigh等。
转载请注明原文地址: https://www.6miu.com/read-31370.html

最新回复(0)