<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
    which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
    There has to be one entity for each item to be referenced. 
    An alternate method (rfc include) is described in the references. -->
<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2629.xml">
<!ENTITY RFC3552 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3552.xml">
<!ENTITY I-D.narten-iana-considerations-rfc2434bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.narten-iana-considerations-rfc2434bis.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs), 
    please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
    (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space 
    (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="std" docName="draft-xu-rtgwg-fare-in-sun-02" ipr="trust200902">
  <front>
    <title abbrev="FARE in SUN">Fully Adaptive Routing Ethernet in Scale-Up
    Networks</title>

    <author fullname="Xiaohu Xu" initials="X." surname="Xu">
      <organization>China Mobile</organization>

      <address>
        <email>xuxiaohu_ietf@hotmail.com</email>
      </address>
    </author>

    <author fullname="Zongying He" initials="Z." surname="He">
      <organization>Broadcom</organization>

      <address>
        <email>zongying.he@broadcom.com</email>
      </address>
    </author>

    <author fullname="Nan Wang " initials="N." surname="Wang">
      <organization>Intel</organization>

      <address>
        <email>nan.wang@intel.com</email>
      </address>
    </author>

    <author fullname="Nan Wang " initials="N." surname="Wang">
      <organization>Hygon</organization>

      <address>
        <email>wangn@hygon.cn</email>
      </address>
    </author>

    <author fullname="Hua Wang" initials="H." surname="Wang">
      <organization>Moore Threads</organization>

      <address>
        <email>wh@mthreads.com</email>
      </address>
    </author>

    <author fullname="Jian Guo" initials="J." surname="Guo">
      <organization>Biren Technology</organization>

      <address>
        <email>jguo@birentech.com</email>
      </address>
    </author>

    <author fullname="Xiang Li" initials="X." surname="Li">
      <organization>Enflame Technology</organization>

      <address>
        <email>xiang.li@enflame-tech.com</email>
      </address>
    </author>

    <author fullname="Tianyou Zhou" initials="T." surname="Zhou">
      <organization>Resnics Technology</organization>

      <address>
        <email>tzhou@resnics.com</email>
      </address>
    </author>

    <author fullname="Yongtao Yang" initials="Y." surname="Yang">
      <organization>Centec</organization>

      <address>
        <email>yangyt@centec.com</email>
      </address>
    </author>

    <author fullname="Yinben Xia" initials="Y." surname="Xia">
      <organization>Tencent</organization>

      <address>
        <email>forestxia@tencent.com</email>
      </address>
    </author>

    <author fullname="Weifeng Zhang" initials="W." surname="Zhang">
      <organization>Tencent</organization>

      <address>
        <email>wikkizhang@tencent.com</email>
      </address>
    </author>

    <author fullname="Peilong Wang" initials="P." surname="Wang">
      <organization>Baidu</organization>

      <address>
        <email>wangpeilong01@baidu.com</email>
      </address>
    </author>

    <author fullname="Yan Zhuang" initials="Y." surname="Zhuang">
      <organization>Huawei Technologies</organization>

      <address>
        <email>zhuangyan.zhuang@huawei.com</email>
      </address>
    </author>

    <author fullname="Fajie Yang " initials="F." surname="Yang">
      <organization>Cloudnine Information Technologies</organization>

      <address>
        <email>yangfajie@cloudnineinfo.com</email>
      </address>
    </author>

    <author fullname="Chao Li" initials="C." surname="Li">
      <organization>Metanet Networking Technology</organization>

      <address>
        <email>lichao22@ieisystem.com</email>
      </address>
    </author>

    <author fullname="Wang Xiaojun" initials="X." surname="Wang">
      <organization>Ruijie Networks</organization>

      <address>
        <email>wxj@ruijie.com.cn</email>
      </address>
    </author>


    <date day="26" month="February" year="2026"/>

    <abstract>
      <t>The Mixture of Experts (MoE) has become a dominant paradigm in
      transformer-based artificial intelligence (AI) large language models
      (LLMs). It is widely adopted in both distributed training and
      distributed inference. To enable efficient expert parallelization and
      even tensor parallelization across dozens or even hundreds of Graphics
      Processing Units (GPUs) in MoE architectures, an ultra-high-throughput,
      ultra-low-latency AI scale-up network (SUN) is critical. This document
      describes how to extend the Weighted Equal-Cost Multi-Path (WECMP)
      load-balancing mechanism, referred to as Fully Adaptive Routing Ethernet
      (FARE), which was originally designed for scale-out networks, to
      scale-up networks.</t>
    </abstract>

    <note title="Requirements Language">
      <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
      "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
      document are to be interpreted as described in <xref
      target="RFC2119">RFC 2119</xref>.</t>
    </note>
  </front>

  <middle>
    <section title="Introduction">
      <t>The Mixture of Experts (MoE) has become a dominant paradigm in
      transformer-based artificial intelligence (AI) large language models
      (LLMs). It is widely adopted in both distributed training and
      distributed inference. To enable efficient expert parallelization and
      even tensor parallelization across dozens or even hundreds of Graphics
      Processing Units (GPUs) in MoE architectures, an ultra-high-throughput,
      ultra-low-latency AI scale-up network (SUN) is indispensable. This
      network serves as the interconnection fabric, allowing GPUs to function
      as a unified super GPU, referred to as a&nbsp;SuperPoD. The scale-up
      network is fundamental for efficiently transporting substantial volumes
      of communication traffic within the SuperPoD. This traffic includes,
      but is not limited to: 1) all-to-all traffic for Expert Parallelism
      (EP) communication, and 2) all-reduce traffic for Tensor Parallelism
      (TP) communication, which ensures consistent tensor values across GPUs
      during training and inference.</t>

      <figure>
        <artwork align="center"><![CDATA[      
   +----+ +----+ +----+ +----+  
   | L1 | | L2 | | L3 | | L4 |  (Leaf)
   +----+ +----+ +----+ +----+             
             
   +----+ +----+ +----+ +----+ +----+ +----+      +----+
   | G1 | | G2 | | G3 | | G4 | | G5 | | G6 | ...  |G64 |  (GPU)
   +----+ +----+ +----+ +----+ +----+ +----+      +----+ 


                              Figure 1]]></artwork>
      </figure>

      <t>(Note that the diagram above does not show the connections between
      GPUs and leaf switches. In this scale-up network topology, each GPU is
      assumed to be connected to every leaf switch.)</t>

      <t>Figure 1 shows a 64-GPU SuperPoD consisting of 64 GPUs and four
      high-radix leaf switches (e.g., 128 400G ports each). To achieve
      inter-GPU bandwidths of several terabits per second (Tbps) or
      higher, each GPU is typically equipped with multiple scale-up network
      ports (e.g., four 800 Gbps ports). Each port connects to a separate
      scale-up leaf switch via a Y-cable, forming four distinct network
      planes.</t>
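
      <t>For illustration only, the following Python sketch computes the
      aggregate scale-up bandwidth available to each GPU in the example
      topology above (the port counts and speeds are example values taken
      from the description, not requirements):</t>

      <figure>
        <artwork><![CDATA[
# Illustrative arithmetic for the example topology (non-normative).
PLANES = 4              # one leaf switch per network plane
PORT_SPEED_GBPS = 800   # each GPU scale-up port (one per plane)

# Each GPU attaches one port to each plane via a Y-cable.
per_gpu_bw_gbps = PLANES * PORT_SPEED_GBPS
print(per_gpu_bw_gbps)  # 3200 Gbps, i.e., 3.2 Tbps per GPU
]]></artwork>
      </figure>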

      <t>In such multi-plane scale-up networks, achieving ultra-high bandwidth
      and ultra-low latency requires two key strategies. First, efficiently
      distributing data across all network planes is critical. For instance,
      if an 800G port on a GPU fails, traffic destined for that GPU over the
      faulty plane must immediately cease. If only one 400G sub-cable of a
      given 800G Y-cable malfunctions, halving the bandwidth of the affected
      network plane, the traffic between the relevant GPU pair on that plane
      should be reduced proportionally. Second, incast traffic patterns
      inherent to all-to-all communication may cause congestion on the egress
      ports of a last-hop switch; therefore, a more efficient congestion
      management mechanism is required.</t>
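
      <t>The proportional-reduction behavior described above can be
      illustrated with a simple bandwidth-proportional weight computation.
      The Python sketch below is a non-normative example (the per-plane
      bandwidth values are hypothetical): it derives the share of traffic a
      source GPU would send toward a destination GPU over each plane when
      one 400G sub-cable has failed on one plane and another plane is down
      entirely.</t>

      <figure>
        <artwork><![CDATA[
# Non-normative sketch: derive per-plane traffic shares from the
# available path bandwidth (Gbps) toward one destination GPU.
# Example: plane 2 lost one 400G sub-cable of its 800G Y-cable,
# and plane 4 has failed completely.
plane_bw_gbps = {1: 800, 2: 400, 3: 800, 4: 0}

def plane_weights(bw):
    """Give each plane a traffic share proportional to its
    currently available bandwidth (0 for failed planes)."""
    total = sum(bw.values())
    if total == 0:
        return {p: 0.0 for p in bw}
    return {p: b / total for p, b in bw.items()}

print(plane_weights(plane_bw_gbps))
# {1: 0.4, 2: 0.2, 3: 0.4, 4: 0.0}
]]></artwork>
      </figure>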

      <t>This document describes how to extend Fully Adaptive Routing
      Ethernet (FARE) using BGP (FARE-BGP for short), as described in <xref
      target="I-D.xu-idr-fare"/>, which was originally designed for scale-out
      networks, to scale-up networks.</t>
    </section>

    <section anchor="Abbreviations_Terminology" title="Terminology">
      <t>This memo makes use of the terms defined in <xref
      target="RFC2119"/>.</t>
    </section>

    <section title="Solution Description">
      <t>Each pair of GPUs establishes multiple Remote Direct Memory Access
      (RDMA) Queue Pairs (QPs) for data transmission using the loopback
      addresses of the GPU servers. It is recommended that each loopback
      address be bound to a single GPU. While the use of port-level or
      sub-port-level physical addresses for QP establishment is technically
      supported, this approach is not recommended.</t>

      <t>Additionally, upper-layer adaptations (e.g., transaction layer) can
      facilitate memory semantic operations (load/store/atomic) based on RDMA
      message semantics. However, implementation details are beyond the scope
      of this document.</t>

      <t>Acting as stub BGP speakers, servers exchange BGP routes with
      connected switches across different planes, advertising the reachability
      of their loopback addresses and learning the reachability of remote
      GPUs. Additionally, by extending FARE-BGP from switches to servers, they
      can obtain path bandwidth information related to ECMP routes for other
      GPUs. This capability enables GPUs to perform WECMP load balancing
      across all available network planes of a scale-up network.</t>

      <t>When the path bandwidth of a route through a specific network plane
      to a destination GPU degrades due to events such as network plane
      failures or partial link outages, existing Queue Pairs (QPs) traversing
      unaffected planes maintain their established forwarding paths.
      Meanwhile, the source GPU must adjust the traffic load allocated to the
      affected network plane based on updated weight values. Conversely, when
      the path bandwidth through a previously degraded network plane
      recovers&mdash;such as after failed links or planes are
      restored&mdash;the source GPU should increase the traffic load allocated
      to that plane. This approach ensures optimal traffic distribution across
      all operational network planes.</t>
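
      <t>As a non-normative illustration of this behavior (function and
      variable names are hypothetical), the sketch below recomputes a source
      GPU's per-plane traffic weights whenever a FARE-BGP update changes the
      path bandwidth advertised for a destination; established QPs keep
      their forwarding paths, and only the source-side traffic split
      changes.</t>

      <figure>
        <artwork><![CDATA[
# Non-normative sketch: react to a FARE-BGP path-bandwidth update
# for one destination GPU without re-establishing any QPs.
PLANES = (1, 2, 3, 4)   # example: four network planes

def on_bandwidth_update(current_bw, weight_table, dest_gpu,
                        plane, new_bw_gbps):
    """current_bw maps (dest_gpu, plane) -> available Gbps;
    weight_table maps dest_gpu -> {plane: traffic share}."""
    current_bw[(dest_gpu, plane)] = new_bw_gbps
    bw = {p: current_bw.get((dest_gpu, p), 0) for p in PLANES}
    total = sum(bw.values())
    weight_table[dest_gpu] = {
        p: (b / total if total else 0.0) for p, b in bw.items()}
]]></artwork>
      </figure>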

      <section title="Per-Flow Weighted Load Balancing">
        <t>Per-flow weighted load balancing is recommended when ordered packet
        delivery is essential.</t>

        <t>For per-flow weighted load balancing, at least one Queue Pair (QP)
        per sub-port must be established between a pair of GPUs. When QPs are
        configured using the loopback address assigned to each GPU, each QP
        should be assigned a unique UDP source port to differentiate traffic
        flows across all network planes between the GPU pair. If QPs are
        configured using the physical addresses assigned to ports, each QP
        should be assigned a unique UDP source port to differentiate traffic
        flows within the same network plane. If QPs are configured using the
        physical addresses assigned to sub-ports, there is no need to assign
        a unique UDP source port to each QP.</t>

        <t>The traffic allocated to a given network plane is evenly
        distributed among all available QPs traversing that plane.</t>
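
        <t>The following Python sketch is purely illustrative of the
        per-flow mode described above (the numbers of planes and sub-ports
        and the UDP port range are example assumptions): one QP is created
        per sub-port between a GPU pair, each QP is given a distinct UDP
        source port so that switches can keep it on a single path, and
        messages assigned to a plane are spread evenly over that plane's
        QPs.</t>

        <figure>
          <artwork><![CDATA[
# Non-normative sketch: per-flow mode with loopback-addressed QPs.
import itertools

PLANES = (1, 2, 3, 4)     # example: four network planes
SUBPORTS_PER_PLANE = 2    # example: two 400G sub-ports per port
BASE_UDP_SPORT = 49152    # example base for unique source ports

def build_qps():
    """One QP per sub-port, each with a unique UDP source port."""
    sport = itertools.count(BASE_UDP_SPORT)
    return [(plane, sub, next(sport))
            for plane in PLANES
            for sub in range(SUBPORTS_PER_PLANE)]

def pick_qp(qps, plane, msg_index):
    """Spread messages assigned to a plane evenly over its QPs."""
    plane_qps = [q for q in qps if q[0] == plane]
    return plane_qps[msg_index % len(plane_qps)]

qps = build_qps()
print(pick_qp(qps, plane=2, msg_index=5))  # (2, 1, 49155)
]]></artwork>
        </figure>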

        <t>The switch within each network plane SHOULD perform per-flow load
        balancing as well to ensure ordered packet delivery for all QPs.</t>
      </section>

      <section title="Per-Packet Weighted Load Balancing&#8232;">
        <t>Per-packet weighted load balancing is recommended when disordered
        packet delivery is acceptable.</t>

        <t>For per-packet weighted load balancing, all QPs established between
        a pair of GPUs must support disordered packet delivery (e.g., through
        the Direct Data Placement mechanism <xref target="RFC7306"/>). In this
        mode, a single QP per network plane between a given GPU pair is
        sufficient, with the traffic of that QP evenly distributed across all
        available routes within that network plane.</t>
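
        <t>As a non-normative illustration of the per-packet mode (the route
        identifiers are hypothetical), the sketch below sprays the packets
        of a plane's single QP round-robin over all available routes within
        that plane; the resulting reordering is tolerated by the receiver,
        for example via Direct Data Placement.</t>

        <figure>
          <artwork><![CDATA[
# Non-normative sketch: per-packet spraying of one QP's packets
# over all available routes within a network plane.
import itertools

class PlaneSprayer:
    def __init__(self, route_ids):
        # route_ids: usable routes within this plane, e.g.,
        # distinct entropy values or egress sub-ports.
        self._routes = itertools.cycle(route_ids)

    def next_route(self):
        """Pick the next route, one packet at a time."""
        return next(self._routes)

sprayer = PlaneSprayer(["route-a", "route-b"])   # example routes
print([sprayer.next_route() for _ in range(4)])
# ['route-a', 'route-b', 'route-a', 'route-b']
]]></artwork>
        </figure>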

        <t>The switch within each network plane SHOULD perform per-packet
        weighted load balancing since disordered packet delivery is acceptable
        for all QPs.</t>
      </section>
    </section>

    <section title="Considerations on Memory Semantic Operations">
      <t>When implementing memory semantics, the ordering guarantees for
      network transmission can be categorized as follows: </t>

      <t>a. Weak Ordering Guarantee for Network Transmission: The network
      adopts full packet spraying, and the GPUs rely entirely on the Reorder
      Buffer (ROB) to maintain ordering. This results in a significant
      increase in implementation complexity on the GPU side. </t>

      <t>b. Partial Ordering Constraint for Network Transmission: For
      transactions with strict ordering requirements (e.g., fence and barrier
      operations), sequential execution is mandatory. These transactions are
      marked with a "strong ordering" flag, and the endpoint side uses a
      blocking mechanism to wait and satisfy the ordering requirement. For
      transactions that allow out-of-order transmission, the network provides
      a baseline hash-based ordering guarantee mechanism. When the GPU
      generates transactions with the same hash key, in-order delivery is
      enforced between these transactions. This approach grants the GPU ample
      flexibility while enabling fine-grained local control over ordering.
      </t>

      <t>c. Strong Ordering Guarantee for Network Transmission: To simplify
      the implementation of memory semantic transactions, some GPUs require
      that the same transaction stream be transmitted strictly in order along
      the entire network path, with out-of-order transmission completely
      prohibited. This achieves a highly simplified implementation on the GPU
      side.</t>

      <t>When implementing native Load/Store memory semantics directly on top
      of RDMA QPs, additional purpose-built mechanisms are required to
      guarantee the sequential consistency of memory
      transactions&mdash;particularly for GPUs built on weak-order memory
      models. Specifically, for weak-order memory models, transactions of the
      same type targeting the same memory address must maintain consistent
      ordering throughout their entire network transmission and transaction
      processing pipeline. To achieve this, transactions should be routed to
      the same QP via a hash-based strategy: all transactions targeting the
      same memory address are hashed to the same QP. Furthermore, each QP
      enforces strict in-order transmission and completion along its dedicated
      network path when operating in per-flow weighted load-balancing mode.
      </t>
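
      <t>For illustration only (the hash function and QP count are arbitrary
      example choices), the Python sketch below maps every transaction
      targeting a given memory address to the same QP, so that per-address
      ordering is preserved by that QP's in-order network path.</t>

      <figure>
        <artwork><![CDATA[
# Non-normative sketch: route memory-semantic transactions that
# target the same address to the same QP, so per-address ordering
# is preserved by that QP's in-order path.
NUM_QPS = 8  # example: QPs established toward one destination GPU

def qp_for_address(mem_addr, num_qps=NUM_QPS):
    """Map a target memory address to a QP index; identical
    addresses always select the same QP (multiplicative hash)."""
    return ((mem_addr * 2654435761) & 0xFFFFFFFF) % num_qps

assert qp_for_address(0x1000) == qp_for_address(0x1000)
print(qp_for_address(0x1000), qp_for_address(0x2000))
]]></artwork>
      </figure>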
    </section>

    <section anchor="Acknowledgements" title="Acknowledgements">
      <t>TBD.</t>

    </section>

    <section anchor="IANA" title="IANA Considerations">
      <t>TBD.</t>
    </section>

    <section anchor="Security" title="Security Considerations">
      <t>TBD.</t>

    </section>
  </middle>

  <back>
    <references title="Normative References">
      <?rfc include='reference.RFC.2119'?>

    </references>

    <references title="Informative References">
      <?rfc include='reference.RFC.7306'?>

      <?rfc include="reference.I-D.xu-idr-fare"?>

    </references>
  </back>
</rfc>
