<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
    which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
    There has to be one entity for each item to be referenced. 
    An alternate method (rfc include) is described in the references. -->

<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2629.xml">
<!ENTITY RFC3552 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3552.xml">
<!ENTITY RFC5226 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5226.xml">
<!ENTITY RFC7209 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7209.xml">
<!ENTITY RFC7432 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7432.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs), 
    please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
    (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space 
    (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="std" docName="draft-sharma-multi-site-evpn-01" ipr="trust200902">
 <!-- category values: std, bcp, info, exp, and historic
    ipr values: trust200902, noModificationTrust200902, noDerivativesTrust200902,
       or pre5378Trust200902
    you can add the attributes updates="NNNN" and obsoletes="NNNN" 
    they will automatically be output with "(if approved)" -->

 <!-- ***** FRONT MATTER ***** -->

 <front>
   <!-- The abbreviated title is used in the page header - it is only necessary if the 
        full title is longer than 39 characters -->

   <title abbrev="Multi-site EVPN" > 
                  Multi-site EVPN based VXLAN using Border Gateways
   </title>

   <!-- add 'role="editor"' below for the editors if appropriate -->

   <!-- Another author who claims to be an editor -->

   <author fullname="Rajesh Sharma" initials="R.S." role="editor"
           surname="Sharma">
     <organization>Cisco Systems</organization>
     <address>
       <postal>
         <street> 170 W Tasman Drive</street>
         <city>San Jose</city>
         <region>CA</region>
         <code></code>
         <country>USA</country>
       </postal>
       <phone></phone>
       <email>rajshr@cisco.com</email>

       <!-- uri and facsimile elements may also be added -->
     </address>
   </author>
   <author fullname="Ayan Banerjee" initials="A.B."
           surname="Banerjee">
     <organization>Cisco Systems</organization>
     <address>
       <postal>
         <street> 170 W Tasman Drive</street>
         <city>San Jose</city>
         <region>CA</region>
         <code></code>
         <country>USA</country>
       </postal>
       <phone></phone>
       <email>ayabaner@cisco.com</email>

       <!-- uri and facsimile elements may also be added -->
     </address>
   </author>
   <author fullname="Raghava Sivaramu" initials="R.S."
           surname="Sivaramu">
     <organization>Cisco Systems</organization>
     <address>
       <postal>
         <street> 170 W Tasman Drive</street>
         <city>San Jose</city>
         <region>CA</region>
         <code></code>
         <country>USA</country>
       </postal>
       <phone></phone>
       <email>raghavas@cisco.com</email>

       <!-- uri and facsimile elements may also be added -->
     </address>
   </author>
   <author fullname="Ali Sajassi" initials="A.S."
           surname="Sajassi">
     <organization>Cisco Systems</organization>
     <address>
       <postal>
         <street> 170 W Tasman Drive</street>
         <city>San Jose</city>
         <region>CA</region>
         <code></code>
         <country>USA</country>
       </postal>
       <phone></phone>
       <email>sajassi@cisco.com</email>

       <!-- uri and facsimile elements may also be added -->
     </address>
   </author>


   <date year="2016" />

   <!-- If the month and year are both specified and are the current ones, xml2rfc will fill 
        in the current day for you. If only the current year is specified, xml2rfc will fill 
	 in the current day and month for you. If the year is not the current one, it is 
	 necessary to specify at least a month (xml2rfc assumes day="1" if not specified for the 
	 purpose of calculating the expiry date).  With drafts it is normally sufficient to 
	 specify just the year. -->

   <!-- Meta-data Declarations -->

   <area>General</area>

   <workgroup>Internet Engineering Task Force</workgroup>

   <!-- WG name at the upperleft corner of the doc,
        IETF is fine for individual submissions.  
	 If this element is not present, the default is "Network Working Group",
        which is used by the RFC Editor as a nod to the history of the IETF. -->

   <keyword>evpn</keyword>

   <!-- Keywords will be incorporated into HTML output
        files in a meta tag but they have no effect on text or nroff
        output. If you submit your draft to the RFC Editor, the
        keywords will be used for the search engine. -->

   <abstract>
     <t> This document describes the procedures for interconnecting two or more 
         BGP based Ethernet VPN (EVPN) sites in a scalable fashion over an IP-only network.
         The motivation is 
         to support extension of EVPN sites without having to rely on typical Data
         Center Interconnect (DCI) technologies like MPLS/VPLS for the interconnection.
         The requirements for such a deployment are very similar to the ones specified 
         in RFC 7209 -- "Requirements for Ethernet VPN (EVPN)".
     </t>
   </abstract>
 </front>

 <middle>


   <section anchor="sec_intro" title="Introduction">
     <t> 
         BGP based Ethernet VPNs (EVPNs) are being used to support various 
         VPN topologies with the motivation and requirements being discussed in detail in
         <xref target="RFC7209">RFC7209</xref>. EVPN has been used to provide
          a Network Virtualization Overlay (NVO) solution with a variety of  
         tunnel encapsulation options over IP as described in 
         <xref target="DCI-EVPN-OVERLAY"></xref>. 
         EVPN used for the Data center interconnect (DCI) at the 
         WAN Edge is discussed in <xref target="DCI-EVPN-OVERLAY"></xref>. The EVPN
         DCI procedures are defined for IP and MPLS hand-off at the site boundaries. 
     </t>
  
     <t>
         In the current EVPN deployments, there is a need to segment the 
         EVPN domains within a Data Center (DC) primarily due to the service architecture
         and the scaling requirements around it. The number of routes, tunnel end-points,
         and next-hops needed in the DC are larger than some of the hardware elements that
         are being deployed. Network operators would like to ensure that they have means to 
         have smaller sites within the data center, if they so desire, without having to
         have traditional DCI technologies to inter-connect them. In essence, they want 
         smaller multi-site EVPN domains with an IP backbone. 
     </t>

     <t> 
         Network operators today are using the Virtual Network Identifier (VNI) to designate
         a service. However, they would like to have this service available to a smaller set of
         nodes within the DC for administrative reasons; in essence they want to break up the 
         EVPN domain to multiple smaller sites. An advantage of having a smaller 
          footprint for these EVPN sites is that the various fault isolation domains are 
         now more constrained. It is also feasible to have features that can re-use the VNI space
         across these sites if desired. The above mentioned motivations for having smaller multi-site
         EVPN domains are over and above the ones that are already detailed in
         <xref target="RFC7209">RFC7209</xref>.
     </t>

     <t> In this document we focus primarily on the VXLAN encapsulation for EVPN 
         deployments. We assume that the underlay provides simple IP connectivity. We 
         go into the details of the IP/VXLAN hand-off mechanisms, to interconnect these 
         smaller sites, within the data center itself. 
         We describe this deployment model as a scalable multi-site EVPN (MS-EVPN) deployment. 
         The procedures described here go into substantial detail regarding
         interconnecting L2 and L3, unicast and multicast domains across multiple
         EVPN sites. 
     </t>

     <section title="Requirements Language">
       <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
       "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
       document are to be interpreted as described in <xref
       target="RFC2119">RFC 2119</xref>.</t>
     </section>
   </section>

   <section anchor="terminology" title="Terminology">
   
      <!--   <t>List styles: 'empty', 'symbols', 'letters', 'numbers', 'hanging',
             'format'.</t> >

       -->

     <t><list style="symbols">
         <t>
           Border Gateway (BG): This is the node that interacts with nodes within a site
           and with nodes that are external to the site. For example, in 
           a leaf-spine data center fabric, it can be a leaf, a spine, or a separate device
           acting as gateway to interconnect the sites.
         </t>

         <t>
          Anycast Border Gateway: A Virtual set of shared Border Gateways (or Next-hops) 
          acting as Multiple entry-exit points for a site. </t>

          <t>
          Multipath Border Gateway: A Virtual set of unique border Gateways (or Next-hops)
           acting as multiple entry-exit points for a site.
          </t>

          <t> A-D: Auto-discovery.</t>
      </list>
      </t>

      <!-- 
         <t>Second bullet</t>
       </list> You can write text here as well.</t>
       -->

   </section>



   <section anchor="sec_ms_evpn_overview" title="Multi-Site EVPN Overview">
     <t> 
        In this section we describe the motivation, requirements, and framework of the multi-site
        EVPN enhancements. 
     </t>
           

        <section anchor="sec_ms_evpn_req" title="MS-EVPN Interconnect Requirements">

        <t> In this section we discuss the requirements and motivation for interconnecting 
            different EVPN sites within a data center. In general any interconnect technology
            has the following requirements:
        </t>

        <t><list style="letters">
            <t> Scalability: 
	
	     Multi-Site EVPN (MS-EVPN) should be able to interconnect multiple sites in a scalable 
            fashion.
	     In other words, interconnecting such sites should not lead to one giant fabric
            with full mesh of end-to-end VXLAN tunnels across leafs in different sites. This leads
            to scale issues with respect to managing large number of tunnel end-points and a large 
            number of tunnel next-hops. Also a huge flat fabric rules out the option of ingress 
            replication (IR) trees as the number of replications becomes practically unachievable due to
            the internal bandwidth needed in hardware. 
            </t>

            <t>  Multi-Destination traffic over unicast-only cloud:

              MS-EVPN mechanisms should be able to provide an efficient forwarding mechanism for
              multi-destination frames even if the underlay inter-site network is not capable of 
              forwarding multicast frames. This requirement is meant to ensure that for the 
              solution to work there are no additional constraints being requested of the IP network.
              This allows for use of existing network elements as-is.
            </t>

            <t>
             Maintain Site-specific Administrative control: The MS-EVPN technology should be able to 
             interconnect fabrics from different Administrative domains. It is possible that different sites 
             have different VLAN-VNI mappings, use different underlay routing protocols, and/or have different 
             PIM-SM group ranges etc. It is expected that the technology should not impose any additional 
             constraints on the various administrative domains.
             </t>

             <t> Isolate fault domains: MS-EVPN technology hand-off should have the capability to isolate traffic 
             across site boundaries and prevent defects from percolating from one site to another. As an example, 
             a broadcast storm in a site should not lead to meltdown of all other sites.
             </t>

             <t> Loop detection and prevention:
                In the scenarios where flood domains are stretched across fabrics, interconnecting sites are very
                vulnerable to loops and flood storms. There is a need to provide comprehensive loop detection and
                prevention capabilities.
             </t>

             <t> Plug-and-play and extensibility: 
               Addition of new sites or increasing capacity of existing sites should be achievable in a completely
               plug-and-play fashion. This essentially means that all control plane and forwarding states 
               (L2 or L3 interconnect) should be built in downstream allocation mode. MS-EVPN should not pose any
               maximum requirements on the scale and capacity, it should be easily extendable on those metrics.
             </t>

         </list>
        </t>

        </section>

        <section anchor="sec_ms_evpn_concept" title="MS-EVPN Interconnect concept and framework">

           <t>
               EVPN with an IP-only interconnect is conceptualized as multiple site-local EVPN 
               control planes and IP forwarding domains interconnected via a single common EVPN 
               control and IP forwarding domain. 
               Every EVPN node is identified with a unique site-scope identifier. 
               A site-local EVPN domain consists of EVPN nodes with the same site identifier. 
               Border gateways on one hand are also part of site-specific EVPN domain and on other hand 
               part of a common EVPN domain to interconnect with Border Gateways from other sites. Although
               a border gateway has only a single explicit site-id (that of the site it is a member of), it
               can be considered to also have a second implicit site-id, that of the interconnect-domain
               which has membership of all the BG's from all sites that are being interconnected. 
               This implicit site-id membership is derived by the presence of the Border A-D route
               announced by that border gateway node (please refer to <xref target="sec_ms_evpn_spec_ad" />
               for details of the route format). 
           </t>

          <t>
               These border gateways discover each other through EVPN Border A-D routes and act as both
               control and forwarding plane gateway across sites. This will facilitate site-specific nodes 
               to visualize all other sites to be reachable only via its Border Gateways.       
          </t>

          <t> We describe the MS-EVPN deployment model using the topology below. In the 
              topology there are 3 sites, Site A, Site B, and Site C that are inter-connected using IP. 
	       This entire topology is deemed to be part of the same Data Center. In most deployments these
              sites can be thought of as pods, which may span a rack, a row, or multiple rows in 
              the data center, depending on the size of domain desired for 
              scale and fault and/or administrative isolation domains. 
          </t>

     <t>

     <figure align="center" anchor="dci_topology">
        <!--
         <preamble>Preamble text - can be omitted or empty.</preamble>
          -->
       <artwork align="left"><![CDATA[
____________________________
| ooo Encapsulation tunnel |
| X X X  Leaf-spine fabric |
|__________________________|


  Site A (EVPN site A)               Site B (EVPN site B)
 ___________________________      ____________________________
|      X X X X X X X X     |      |      X X X X X X X X     | 
|         X X X X          |      |         X X X X          | 
|        o       o         |      |        o       o         |
|BG-1 Site A    BG-2 Site A|      |BG-1 Site B    BG-2 Site B|
 ___________________________      ____________________________
        o           o                o               o
         o           o              o               o
          o           o            o               o
           o           o          o               o
       _______________________________________________
       |                                             |
       |                                             |
       |        Inter-site common EVPN site          |
       |                                             |
       |                                             |
       _______________________________________________
                     o                   o
                      o                 o
                       o               o
                        o             o
                   ___________________________
 	           | BG-1 Site C    BG-2 Site C|
                   |         X X X X           | 
                   |      X X X X X X X X      |
                   _____________________________
                    Site C (EVPN site C)
                          ]]></artwork>

     </figure>
     </t>


         <t>  In this topology, site-local nodes are connected to each other by iBGP EVPN peering 
              and Border Gateways are connected by eBGP Multi-hop EVPN peering via inter-site cloud. We 
              explicitly spell this out to ensure that we can re-use BGP semantics of route announcement
              between and across the sites. There are other BGP mechanisms to instantiate this and they
              are not discussed in this document.
              This implies that each domain has its own AS number associated with it. In 
              the topology, only 2 border gateways per site are shown; this is more for ease of illustration and
              explanation. The technology poses no such limitation. As mentioned earlier, site-specific EVPN 
              domain will consist of only site-local nodes in the sites. A Border Gateway is logically 
              partitioned into site specific EVPN domain towards the site and into common EVPN domain 
              towards other sites. This facilitates them to act as control and forwarding plane gateway for 
              forwarding traffic across sites. 
          </t>


         <t>
                 EVPN nodes within a site will discover each other via regular EVPN procedures and 
                 build site-local bidirectional VXLAN tunnels and multi-destination trees from leaves to
                 Border Gateways.
                 Border Gateways will discover each other by A-D routes with unique site-identifiers
                 (as described in <xref target="sec_ms_evpn_spec_ad" />) and build inter-site bi-directional VXLAN tunnels and 
                 Multi-destination trees between them. We thus build an end-to-end 
                 bidirectional forwarding path across all sites by stitching (and not 
                 by stretching end-to-end) site-local VXLAN tunnels with inter-site VXLAN tunnels. 
         </t>

       <t>

	In essence, a MS-EVPN fabric is proposed to be built in complete downstream and modular fashion. </t>


  <t><list style="symbols">

	<t> Site-local Bridging domains are interconnected ONLY via Border Gateways with Bridging domains 
            from other sites. Such an interconnect does not
            assume uniform mappings of mac-vrf VNI-VLAN across sites and stitches such bridging domains 
            in complete downstream fashion using EVPN route advertisements. </t>

        <t> Site-local Routing domains are interconnected ONLY via Border Gateways with Routing domains 
            from other sites. Such an interconnect does not
            assume uniform mappings of IP VRF-VNI across sites and stitches such routing domains in complete
            downstream fashion using EVPN route advertisements. </t>

        <t> Site-local Flood domains are interconnected ONLY via Border Gateways with flood domains from 
            other sites. Such an interconnect does not
            assume uniform mappings of mac-vrf VNI across sites (or mechanisms to build flood domains 
            within a site) and stitches such flood domains in 
            complete downstream fashion using EVPN route advertisements. It however does
            not exclude the possibility of building an end-to-end flood domain, if desired for other reasons. </t>

	<t> There could be potential use cases where border gateways should behave as gateway for a subset of 
            VXLAN tunnels and an underlay pass through for the rest.  In other words, MS-EVPN fabric can 
            be built by stitching VXLAN tunnels at border gateways while providing flexibility for 
            other VXLAN (or VNI) tunnels to pass through border gateways as native L3 underlay. The procedure
            defined here provides flexibility to accommodate such use cases. </t>

      </list>
     </t>
    

      <t> 
          The above architecture satisfies the constraints laid out in <xref target="sec_ms_evpn_req" />. 
          For example, the size
          of a domain may be made dependent on the route and next-hop scale that can be supported by the
          deployment of the network nodes. There are no constraints on the network that connects the 
          nodes within the domain or across the domains. In the event multicast capability is available
          and enabled, the nodes can use those resources. In the event the underlay is connected 
          using unicast semantics, creation of ingress replication lists ensure that multi-destination
          frames reach their destinations. The domains may have their own deployment constraints, and
          the overlay does not need any form of stretching. It is within the control of the administrator
          with respect to containing fault isolation domains. The automated discovery of the border nodes
          needs no further configurations for existing deployed domains.  
      </t>

      <!-- 
         <t>Second bullet</t>
       </list> You can write text here as well.</t>
       -->

   </section>
</section>

<section anchor="sec_ms_evpn_procedures" title="Multi-site EVPN Interconnect Procedures">
 
     <t> 
        In this section we describe the new functionalities in the Border Gateway nodes
	for interconnecting EVPN sites within the DC. 
     </t>

<section anchor="sec_ms_evpn_spec_ad" title="Border Gateway Discovery">

     <t>
      Border Gateway discovery will facilitate termination and re-origination of inter-site VXLAN tunnels. 
      Such discovery provides flexibility for intra-site leaf-to-leaf VXLAN tunnels to co-exist with 
      inter-site tunnels terminating on Border Gateways. In other words, border gateways discovery
      will facilitate learning of VXLAN tunnel termination points while providing flexibility for such border gateways 
      to behave as native L3 transit for other VXLAN tunnels. 
     </t>

     <t>
     Border Gateways leverage the Type-1 A-D route type defined in <xref target="RFC7432">RFC7432</xref>. 
     Border Gateways in different sites will use Type-1 A-D routes with unique site-identifiers to 
     announce themselves as "Borders" to other border gateways. Nodes within the same site MUST be 
     configured or auto-generated to announce the same site-identifier. Nodes that are not configured to 
     be a border node will build VXLAN tunnels only between each member of the site (of which they are aware 
     due to the site-identifier that is additionally announced by them). Border nodes will additionally
     build VXLAN tunnels between itself and other border nodes that are announced with a different site 
     identifier. Note that the site-identifier is encoded within the ESI label itself as described below.
     </t> 


      <!-- 
      Discover Border Gateways Multi-destination tunnel end points. In case of multicast underlay build by non-EVPN procedures,
      these A-D routes will be used to build Multi-destination IP tunnels rooted at Border Gateways. 
       -->

      <t>
          In this specification, we define a new Ethernet Segment Type (as described in Section 5 of 
          <xref target="RFC7432">RFC7432</xref>) that can be auto-generated or
          configured by the operator. 
      </t>

       <t><list style="symbols">
	<t>
         Type 6 (T=0x06) - This type indicates a multi-site router-ID ESI Value that
         can be auto-generated or configured by the operator.  The ESI Value
         is constructed as follows:
         <list>
         <t> 
             Router ID (4 octets):  The system router ID MUST be encoded in
             the high-order 4 octets of the ESI Value field. In case of both Anycast Border Gateway 
             and Multipath Border Gateway, this field carries unique router ID of Border gateways.
         </t>
         <t>
             Site Identifier (4 octets): The Site Identifier and its value MUST be encoded in 4 octets
             next to the Router ID.
	  </t>
          <t>
              Reserved (1 octet): The low-order octet of the ESI Value will be set to 0 and will 
              be ignored on receipt. 
 	  </t>

        </list>
        </t>

        </list>
       </t>

      
      <t>  Along with the Type-1 A-D routes, border nodes MUST announce an ESI label extended community
           with such A-D routes. They will also announce the Type-4 Ethernet Segment routes
           with the ESI Label extended community (defined in Section 7.5 of <xref target="RFC7432">RFC7432</xref>
           and shown below in Figure 2) in order to perform the Designated Forwarder election
           among the Border gateways of the same site. These Type-4 routes and ESI Label extended community
           will carry a new bit in the Flags field to indicate that the DF election is for Border gateways as 
           against the
           traditional Ethernet segment DF election. Routes with such bits set are generated only by Border
           Gateways and imported by all site-local leafs, site-local Border Gateways, and inter-site
           Border gateways.
      </t>

      <t>  

     <figure align="center" anchor="esi_label_ext_comm">
        <!--
         <preamble>Preamble text - can be omitted or empty.</preamble>
          -->
       <artwork align="left"><![CDATA[
 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type=0x06     | Sub-Type=0x01 | Flags(1 octet)|  Reserved=0   |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Reserved=0   |          ESI Label                            |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>

     </figure>

     </t>

 	<t> The lowest order bit of Flags Octet in ESI Label extended community has been defined to address
            multihoming with the Single-Active or All-Active redundancy mode. In this specification, we
            define the second low-order bit of the Flags octet in the ESI Label extended community. It MUST be set
            to 1 by border gateway nodes if it is willing to take part in the DF election for the VNI carried
            in the associated ESI label.
 	</t>


     <t> Type-4 Ethernet Segment routes with the ESI Label extended community will be leveraged to perform 
          Designated Forwarder election among the Border gateways of the same site. ESI label extended community 
          encoding will be same as described above for Type-1 A-D routes. Site Identifier encoding in ESI 
          label extended community will help border gateways to negotiate the DF winner within a site and ignore 
          Type-4 routes from other sites. 
     </t>


       <t>
       These A-D routes are advertised with mac-VRF and IP-VRF RTs depending on whether the VNI carried 
       is a mac-VRF VNI or an IP VRF VNI. 
       </t>
	
        <t> After a Border Gateway is provisioned, Border A-D routes will be announced after some delay 
            interval from all border gateways. This will provide sufficient time to learn Border A-D routes 
            from Border Gateways of different sites. Border gateways will not be used to build VXLAN
             tunnels from same-site Border Gateways.
        </t>

	<t> Once Border Gateways are discovered, any Type-2/Type-5 routes will be terminated and 
           re-originated on such Border Gateways. 
           Similarly Type-1, Type-3, Type-4 from other sites will be terminated at the Border Gateways.  
           (Also see section 8 for Type-1 handling for loop detection and prevention across sites)
	</t>


       <t>
       As has been defined in the specifications, 
       Type 2, Type 3, and Type 5 routes carry downstream VNI labels. 
       These A-D routes will help to pre-build VXLAN tunnels in the common EVPN domain for L2, L3, 
       and Multi-Destination traffic. Also these A-D routes will help in correlating next-hop of EVPN routes and will 
       facilitate in rewriting next-hop attributes before re-advertising these routes from other sites to a 
       given site. This provides flexibility to keep different VNI-VLAN mapping in different sites and still 
       able to interconnect L3 and L2 domains. 
      </t>


         <t> 
                All control plane and data plane states are interconnected in a complete downstream
                fashion. For example, BGP import rules for a Type 3 route should be able to extend a 
                flood domain for a VNI and flood traffic destined to advertised EVPN node should 
                carry the VNI which is announced in Type 3 route. Similarly Type 2, Type 5 control 
                and forwarding states should be interconnected in a complete downstream fashion.

         </t>
    </section>

    <section anchor="sec_ms_evpn_spec_bg_types"  title="Border Gateway Provisioning">

     <t> 
        Border Gateway nodes manage both the control-plane communications and the data forwarding plane for any inter-site
        traffic. Border Gateway functionality in an EVPN site SHOULD be enabled on more than one node in the network
        for redundancy and high-availability purposes. Any external Type-2/Type-5 routes that are received by the BGs of a
        site are advertised to all the intra-site nodes by all the BGs. For internal Type-2/Type-5 routes received by 
        the BG's from the intra-site nodes,
        all the BGs of a site would advertise them to the remote BG's, so any L2/L3 known unicast traffic to internal destinations 
        could be sent to any one of the local BG's by remote sources. For known L2 and L3 unicast traffic, all of the individual border gateway
        nodes will behave either as single logical forwarding node or a set of active forwarding nodes. This 
        can be perceived by intra-site nodes as multiple
        entry/exit points for inter-site traffic. For unknown unicast/multi-destination traffic, there must be a
        designated forwarder election mechanism to determine which node would perform the primary forwarding role at any given
        point in time, to ensure there is no duplication of traffic for any given flow (See <xref target="sec_ms_evpn_spec_df_elect"/>).
     </t>

    <section anchor="sec_ms_evpn_spec_df_elect"  title="Border Gateway Designated Forwarder Election">
    <t>
        In the presence of more than one Border Gateway node in a site, forwarding of multi-destination L2 or 
        L3 traffic both into the site and out of the site needs to be carried out by a single node. 
	 Border Gateways within the same site will run a Designated Forwarder election per MAC-VRF VNI 
        for multi-destination traffic across the site. 
        Border A-D routes coming from a different site will not trigger DF election and will only be 
        cached to terminate VXLAN tunnels from such border gateways. </t> 

    <t>
	 Border Gateway DF election will leverage Type-4 EVPN route and Ethernet segment DF election defined 
        in <xref target="RFC7432">RFC7432</xref>. Ethernet segment and ESI label extended community will be 
        encoded as explained in the Border Gateway discovery procedures. The ESI label extended community MUST be 
        announced with such routes. DF election will ignore such routes that are announced by border 
        gateways which have a different site identifier value in them.
    </t>

    <t> This DF election could be done independently by each
        candidate border gateway, by subjecting an ordered "candidate list" of all the BG's present in the same site (identified by
        reception of the Border A-D routes per-VNI with the same site-id as itself) to a hash-function on a per-VNI basis. All the
        candidate border gateways of the same site are required to use a uniform hash-function to yield the same result. Failure
        events which lead to a BG losing all of its connectivity to the IP interconnect backbone should trigger the BG to withdraw
        its Border A-D route(s), to indicate to other BG's of the site that it is no longer a candidate BG. </t>


    <t> There are two modes proposed for Border gateway provisioning. </t>

 </section>

    <section anchor="sec_ms_evpn_spec_aabg" title ="Anycast Border Gateway">
    <t>

	 In this mode all border gateways share same gateway IP and rewrite EVPN next-hop attributes with a shared 
        logical next-hop entity. However, these Gateways will maintain unique gateway IP to facilitate building IR 
        trees from site-local nodes to forward Multi-Destination traffic.  
        EVPN Type 2, Type 5 routes will be advertised to the nodes in the site from all border gateways and 
        Border gateway will run DF election per VNI for Multi destination traffic.
        Type 3 routes may be advertised by the DF winner Border gateway for a given VNI so that only DF will 
        receive and forward inter-site traffic. It is also possible to advertise and draw traffic by all  Border
        Gateways at a site to improve convergence properties of the network. In case of multi-destination trees 
        built by non-EVPN procedures (say PIM), all border gateways will receive but only DF winner will 
        forward traffic. 
    </t>

	  <t> This mode is useful when there is no preference between different border-gateways to 
      forward traffic from different VNIs. Standard data plane hashing of VXLAN header will load balance traffic
      among Border Gateways. </t>

        <t>Additionally, it is recommended that border gateway be enabled in the Anycast mode wherein the BG
        functionality is available to the rest of the network as a single logical entity (as in Anycast) for inter-site communication.
        In the absence of capability for Anycast, the BG could be enabled as individual gateways (Single-Active BG) wherein a
        single node will perform the active BG role for a given flow at a given time. </t>

    </section>


    <section anchor="sec_ms_evpn_spec_aabg1" title ="Multi-path Border Gateway">

	<t>
	In this mode, Border gateways will rewrite EVPN Next-hop attributes with unique next-hop entities. This provides
        flexibility to apply usual policies and 
        pick per-VRF, per-VNI or per-flow primary/backup border Gateways. Hence, an intra-site node will see each BG as a next-hop
        for any external L2 or L3 unicast destination, and would perform an ECMP path selection to load-balance traffic sent to
        external destinations. In case an intra-site node is not capable of performing ECMP hash based path-selection (possibly
        some L2 forwarding implementations), the node is expected to choose one of the BG's as its designated forwarder.
        EVPN Type 2, Type 5 routes will be advertised to the nodes in the site from all border gateways and Border gateway 
        will run DF election per VNI for 
        Multi destination traffic. 
        Type 3 routes will be advertised by DF winner Border gateway for a given VNI so that only DF will receive 
        and forward inter-site traffic. It is also possible to advertise and draw traffic by all  Border
        Gateways at a site to improve convergence properties of the network. In case of multi-destination trees 
        built by non-EVPN procedures (say PIM), all border gateways will receive but only DF winner will 
        forward traffic. 

 </t>

    </section>


    </section>


    <section anchor="sec_ms_evpn_spec_route_proc" title="EVPN route processing at Border Gateway">
    <t>

	Border gateways will build EVPN peering on processing A-D routes from other Border
        gateways. Route targets MAY be auto-generated based on some site-specific identifier. If BGP
	AS number is used as site-specific identifier, import and export route targets can be auto-generated as explained in
       <xref target="RFC7432">RFC7432</xref>. 
       This will facilitate site-local nodes importing routes from other nodes in the same site and from their Border Gateways. Also 
       this will prevent route exchange between nodes from different sites. However, in this auto-generated scheme, 
       import mechanism on Border Gateway should be relaxed to allow unconditional import of Border A-D routes 
       from other border gateways. Also the routes which are imported 
       at Border Gateway and re-advertised should implement a mechanism to avoid looping of updates should they come
       back at Border Gateways. </t>


       <t> Type 2/Type 5 EVPN routes will be rewritten with Border Gateway IP, Border Gateway system mac as next-hop and re-advertised. 
       Only EVPN routes received from discovered Border gateways with different site identifiers will be rewritten and 
       re-advertised. 
       This will avoid rewriting every EVPN update if border gateways are also acting as Route reflector (RR) for 
       site-local EVPN peering. Also this will help in interoperating MS-EVPN fabric with sites which do not have Border Gateway functionality. </t>


	<t> There are a few mechanisms suggested below for re-advertising these inter-site routes to a site and providing connectivity for inter-site hosts and subnets. </t>


<t><list style="symbols">


    <t> All routes everywhere : In this mode all inter-site EVPN Type2/Type5 routes are downloaded on 
         site-local leafs from Border Gateways. In other words, every leaf in the MS-EVPN fabric will have routes
         from every intra-site and inter-site leaf. This mechanism is best-fit for the scenarios where
         inter-site traffic is as voluminous as intra-site flow traffic. Also this mechanism preserves usual
         glean processing, silent host discovery and unknown traffic handling at the leafs.   
    </t>

    <t> Default bridging and routing to Border Gateways : In this mode, all received inter-site EVPN
        Type 2/Type 5 routes will be installed only at Border Gateways and will not be advertised in the site.
        Border Gateways will inject Type 5 default routes to site-local nodes and avoid re-advertising 
        Type 2 from other sites. This mode provides scaling advantage by not downloading all inter-site 
        routes to every leaf in MS-EVPN fabric. This mechanism MAY require glean processing and unknown traffic
        handling to be tailored to provide efficient traffic forwarding.
    </t>


      <t> Site-scope flow registry and discovery : This mechanism provides scaling advantage by downloading
          inter-site routes on-demand. It provides the scaling advantages of default routing without the need to 
          tailor glean processing and unknown traffic handling at the leafs. Leafs will create on-demand flow
          registry on their border Gateways and based on this flow registry border gateways will advertise 
          Type 2 routes in a site. In other words, assuming that we have a trigger to send the EVPN routes that 
          are needed by the site for conversational learning from the Border Gateways, we can optimize on the 
          control plane state that is needed at the various leaf nodes. Hardware programming can be further 
          optimized based on actual conversations needed by the leaf, as opposed to the ones needed by the
          site. We will describe a mechanism in the appendix with respect to ARP processing at the Border Gateway.
       </t>

     </list>
       </t>


	<t> Type 3 routes will be imported and processed on border gateways from other border gateways but
            MUST NOT be advertised again. In both modes (Anycast and Multipath), Type 3 routes will be generated
            locally and advertised by DF winner Border Gateway with unique gateway IP. This will facilitate
            building fast converging flood domain connectivity inter-site and intra-site while at the same time avoiding
            duplicate traffic by electing DF winner to forward multi-destination inter-site traffic.
        </t>


       </section>

 

	<section anchor="sec_ms_evpn_spec_md_tree" title="Multi-Destination tree between Border Gateways">

	 <t> The procedures described here recommends building an Ingress Replication (IR) tree between Border Gateways. 
        This will facilitate every site to independently build site-specific Multi destination trees. 
        Multi-destination end-to-end trees between leafs could be PIM (site 1) + IR (between border Gateways) +
        PIM(site 2) or IR-IR-IR or PIM-IR-IR. However this does not rule out using IR-PIM-IR or end-to-end PIM 
        to build multi-destination trees end-to-end. </t>

	 <t> Border Gateways will generate Type 3 routes with unique gateway IP and advertise to Border Gateways of other sites.
         These Type 3 routes will help in building IR trees between border gateways. However only DF winner per VNI will forward multi-destination traffic across sites. </t>

	 <t> As Border Gateways are part of both site-specific and inter-site Multi-destination IR trees, 
        split-horizon mechanism will be used to avoid loops. Multi-destination tree with Border gateway 
        as root to other sites (or Border-Gateways) will be in a separate horizon group. 
        Similarly, the Multi-destination IR tree with Border Gateway as root to site-local nodes 
        will be in another split horizon group. </t>

	 <t> If PIM is used to build Multi-Destination trees in the site-specific domain, all Border gateways 
        will join such PIM trees and draw multi-destination traffic. However only the DF Border Gateway will 
        forward traffic towards other sites. </t>
      </section>


  <section anchor="sec_ms_evpn_spec_ucast_traffic" title="Inter-site Unicast traffic">

	<t>As site-local nodes will see all inter-site EVPN routes via Border Gateways, VXLAN tunnels 
      will be built between
      leafs and site-local Border Gateways and Inter-site VXLAN tunnels will be built between Border gateways in 
      different sites. An end-to-end VXLAN bidirectional forwarding path between inter-site leafs will 
      consist of VXLAN tunnel from leaf (say Site A) to its Border Gateway, another VXLAN tunnel from 
      Border Gateway to Border Gateway in another site (say site B) and Border gateway to leaf (in site B). Such an 
      arrangement of tunnels is very scalable as a full mesh of VXLAN tunnels across inter-site leafs is substituted
      by a combination of intra-site and inter-site tunnels. </t>

	<t> L2 and L3 unicast frames from site-local leafs will reach the border gateway using
       VXLAN encapsulation. At the Border gateway, the VXLAN header is stripped out and another VXLAN header is pushed to send
       frames to the destination site Border Gateway. The destination site Border gateway will strip off the VXLAN header and push
       another VXLAN header to send the frame to the destination site leaf. </t>


  </section>
       
  <section anchor="sec_ms_evpn_spec_mbcast_traffic" title ="Inter-site Multi-destination traffic"> 

	<t> Multi-destination traffic will be forwarded from one site to other site only by DF
      for that VNI. As frames reach Border Gateway from site-local nodes, VXLAN header will be popped and another 
      VXLAN header (derived from downstream Type3 EVPN routes) will be pushed to forward frame to destination site border gateway. 
      Similarly destination site Border 
      Gateway will strip off VXLAN header and forward frame after pushing another VXLAN header towards the destination 
      leaf. </t>

	<t> As explained in <xref target="sec_ms_evpn_spec_md_tree" />, split horizon mechanism will be used to avoid looping of inter-site 
       multi-destination frames. </t>
   </section>


   <section anchor="sec_ms_evpn_spec_host_mob" title="Host Mobility">
    <t>

	Host movement handling will be the same as defined in <xref target="RFC7432">RFC7432</xref>. When a host moves, 
        EVPN Type 2 routes with 
     updated sequence numbers will be propagated to every EVPN node. When a host moves inter-site, 
     only Border gateways may see EVPN updates with both next-hop attributes and sequence number changes 
     and leafs may see updates only with updated sequence numbers. However in other cases both Border gateway and leafs
     may see next-hop and sequence number changes.  </t>
   </section>

   </section>

   <section anchor="sec_ms_evpn_spec_converg" title ="Convergence">

     <section title="Fabric to Border Gateway Failure">

	 <t>If a Border Gateway is lost, the Border gateway next-hop will be withdrawn for Type 2 routes. 
        Also per-VNI DF election will be triggered to choose a new DF. The new DF winner will become the forwarder of Multi-destination inter-site traffic. </t>
      </section>
		
 
      <section title="Border Gateway to Border Gateway Failures">
       <t>
        In case where inter-site cloud has link failures, direct forwarding path between border gateways can be lost. 
        In this case, traffic from one site can reach other site via border gateway of an intermediate site. However
        this will be addressed like a regular underlay failure and traffic termination end-points will still stay the same for inter-site traffic flows.
       </t>
       </section>
   </section>
       

   <section anchor="sec_ms_evpn_spec_interop" title="Interoperability">

	<t> The procedures defined here are only for Border Gateways. Therefore other EVPN nodes in the network should 
     be <xref target="RFC7432">RFC7432</xref> compliant to operate in such topologies.</t>


	<t> As the procedures described here are applicable only after receiving Border A-D route, if other domains
        are connected which are not capable of such multi-site gateway model, they can work in regular EVPN mode. The 
        exact procedures will be detailed in a future version of the draft. </t> 

	<t> The procedures here provide flexibility to connect non-EVPN VXLAN sites by provisioning Border Gateways
	    on such sites and inter-connecting such Border Gateways by Border Gateways of other sites. Such Border Gateways
	    in non-EVPN VXLAN sites will play dual role of EVPN gateway towards common EVPN domain and non-EVPN gateway towards
	    non-EVPN VXLAN site.
	</t>

   </section>


   <section anchor="sec_ms_evpn_spec_fault_iso" title="Isolation of Fault Domains">
    <t>

 	Isolation of network defects requires policies like storm control, security ACLs, etc. to be implemented at
     site boundaries. Border gateways should be capable of inspecting the inner payload of packets received 
     from VXLAN tunnels and enforcing configured policies to prevent defects percolating from one part to the rest
	of the network. 

    </t>
    </section>


    <section anchor="sec_ms_evpn_spec_loop_detect" title="Loop detection and Prevention">

     <t>
      Customer L2 networks deploy some flavor of the Spanning Tree Protocol (STP) to detect and prevent loops. Also customer L2 segments deploy some form of multihoming to connect L2 segments to EVPN nodes or VTEPs. Such multihoming connectivity takes care of preventing L2 loops by multihoming mechanisms at the VTEPs. However misconfiguration or other unexpected events in the customer L2 segments can lead to inconsistent
connectivity to VTEPs leading to L2 loops. </t>

<t>
This specification leverages Type-2 encoding of the ESI label extended community in the Type-1 A-D route type as defined in <xref target="RFC7432">RFC7432</xref> to exchange STP root bridge information among VTEPs. When a VTEP discovers the same STP root bridge from VTEPs which are not multihoming VTEP peers for a given L2 segment, it signals the possibility of a loop and the forwarding engine prunes the VNI from the server facing ports to cut down the loop. As the root bridge conflict across VTEPs is resolved, the forwarding engine will reestablish the VNI on the server facing ports.

This mechanism can coexist with other mechanisms like fast MAC move detection and is recommended as additional protection to prevent L2 loops posed by inconsistent connectivity of customer L2 segments to the L3 MS-EVPN fabric. </t>

<t>
Such route advertisement should be originated by every EVPN node and terminated at the border gateways. However if there is a possibility of server facing L2 segments being stretched across sites, such routes can be terminated and re-originated without modifications to be received by every other EVPN node. This behavior is an exception to the usual guideline of terminating (and re-originating if required) all route types at the border gateway. However such an exception will help in detecting loops if a customer L2 segment is inconsistently connected to VTEPs in different sites.

     </t>

    <t> 
       Also as defined in <xref target="sec_ms_evpn_spec_df_elect" /> border gateways
       use mechanisms like Designated Forwarder and Split Horizon forwarding to prevent inter-site loops in this network. 
     </t>

     </section>
         


   <section anchor="Acknowledgements" title="Acknowledgements">
     <t>The authors would like to thank Max Ardica, Lukas Krattiger, Anuj Mittal, Lilian Quan, 
        and Veera Ravinutala for 
        their review and comments.</t>
   </section>

   <!-- Possibly a 'Contributors' section ... -->

   <section anchor="IANA" title="IANA Considerations">
     <t>TBD.</t>

   </section>

   <section anchor="Security" title="Security Considerations">
          <t>TBD.</t>
   </section>
 </middle>

 <!--  *****BACK MATTER ***** -->

 <back>
   <!-- References split into informative and normative -->

   <!-- There are 2 ways to insert reference entries from the citation libraries:
    1. define an ENTITY at the top, and use "ampersand character"RFC2629; here (as shown)
    2. simply use a PI "less than character"?rfc include="reference.RFC.2119.xml"?> here
       (for I-Ds: include="reference.I-D.narten-iana-considerations-rfc2434bis.xml")

    Both are cited textually in the same manner: by using xref elements.
    If you use the PI option, xml2rfc will, by default, try to find included files in the same
    directory as the including file. You can also define the XML_LIBRARY environment variable
    with a value containing a set of directories to search.  These can be either in the local
    filing system or remote ones accessed by http (http://domain/dir/... ).-->

   <references title="Normative References">
     <!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml"?-->
     &RFC2119;
     &RFC7432;

       <reference anchor="DCI-EVPN-OVERLAY"
                target="https://tools.ietf.org/html/draft-ietf-bess-evpn-overlay-02">
       <front>
         <title>A Network Virtualization Overlay Solution using EVPN</title>

         <author>
           <organization>A. Sajassi et al.</organization>
         </author>

         <date year="2016" />
       </front>
     </reference>

   </references>

   <references title="Informative References">
     <!-- Here we use entities that we defined at the beginning. -->

     &RFC7209;

     <!-- A reference written by by an organization not a person. -->

     </references>

   <section anchor="app-additional" title="Additional Stuff">
     <t>TBD.</t>
   </section>

   <!-- Change Log

v00 2006-03-15  EBD   Initial version

v01 2006-04-03  EBD   Moved PI location back to position 1 -
                     v3.1 of XMLmind is better with them at this location.
v02 2007-03-07  AH    removed extraneous nested_list attribute,
                     other minor corrections
v03 2007-03-09  EBD   Added comments on null IANA sections and fixed heading capitalization.
                     Modified comments around figure to reflect non-implementation of
                     figure indent control.  Put in reference using anchor="DOMINATION".
                     Fixed up the date specification comments to reflect current truth.
v04 2007-03-09 AH     Major changes: shortened discussion of PIs,
                     added discussion of rfc include.
v05 2007-03-10 EBD    Added preamble to C program example to tell about ABNF and alternative 
                     images. Removed meta-characters from comments (causes problems).

v06 2010-04-01 TT     Changed ipr attribute values to latest ones. Changed date to
                     year only, to be consistent with the comments. Updated the 
                     IANA guidelines reference from the I-D to the finished RFC.  -->
 </back>
</rfc>
