Garden | NCCL 算法解析

NCCL 算法选择过程

算法与拓扑

nccl 目前有六种算法

1
2
3
4
5
6


// NCCL collective algorithm identifiers, numbered contiguously from 0.
// The selection loop iterates over them as plain integers
// (a = 0 .. NCCL_NUM_ALGORITHMS-1), and the graphs[] array built in
// initTransportsRank lists one topology graph per identifier in this order.
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5

目前共有 5 种拓扑模式（pattern）

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


#define NCCL_TOPO_PATTERN_BALANCED_TREE 1   // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3            // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4            // Ring
#define NCCL_TOPO_PATTERN_NVLS 5            // NVLS+SHARP and NVLS+Tree
// One topology search: the constraints passed to ncclTopoCompute (id, pattern,
// channel limits, ...) and the channels the search produced.
struct ncclTopoGraph {
  // Input / output
  int id; // ring : 0, tree : 1, collnet : 2, nvls : 3
  int pattern;      // one of the NCCL_TOPO_PATTERN_* values
  int crossNic;
  int collNet;      // set to 1 for the collnet graph, 0 for the others
  int minChannels;
  int maxChannels;
  // Output
  int nChannels;    // number of channels found by the search
  float bwIntra;    // bandwidth of a single channel inside a node
  float bwInter;    // bandwidth of a single channel between nodes
  float latencyInter;
  int typeIntra;    // path type of the channels inside a node (presumably a PATH_* value)
  int typeInter;    // path type of the channels between nodes (presumably a PATH_* value)
  int sameChannels; // whether the channels are all the same
  int nHops;
  int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];  // path of each channel inside the node
  int inter[MAXCHANNELS*2];                    // path of each channel between nodes
};

算法与拓扑的映射关系如下：

Algo	Topo	Topo Pattern
NCCL_ALGO_TREE	treeGraph	NCCL_TOPO_PATTERN_BALANCED_TREE
NCCL_ALGO_RING	ringGraph	NCCL_TOPO_PATTERN_RING
NCCL_ALGO_COLLNET_DIRECT	collNetGraph	NCCL_TOPO_PATTERN_TREE
NCCL_ALGO_COLLNET_CHAIN	collNetGraph	NCCL_TOPO_PATTERN_TREE
NCCL_ALGO_NVLS	nvlsGraph	NCCL_TOPO_PATTERN_NVLS
NCCL_ALGO_NVLS_TREE	nvlsGraph	NCCL_TOPO_PATTERN_NVLS

1
2
3
4
5
6
7


// Excerpt of initTransportsRank: the four topology graphs it declares and the
// per-algorithm lookup table built from them.
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {
  struct ncclTopoGraph ringGraph;
  struct ncclTopoGraph treeGraph;
  struct ncclTopoGraph collNetGraph;
  struct ncclTopoGraph nvlsGraph;
  // One entry per NCCL_ALGO_* value in numeric order: TREE, RING,
  // COLLNET_DIRECT, COLLNET_CHAIN, NVLS, NVLS_TREE. The two collnet algorithms
  // share collNetGraph and the two NVLS algorithms share nvlsGraph.
  struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph };
}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


// Local (myself)
#define PATH_LOC 0

// Connection traversing NVLink
#define PATH_NVL 1

// Connection through NVLink using an intermediate GPU
#define PATH_NVB 2

// Connection traversing at most a single PCIe bridge
#define PATH_PIX 3

// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
#define PATH_PXB 4

// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
#define PATH_PXN 5

// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
#define PATH_PHB 6

// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
#define PATH_SYS 7

// Connection through the network
#define PATH_NET 8

// Disconnected
#define PATH_DIS 9

拓扑建立

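在建立拓扑时，会建立 ring、tree、collnet、nvls 4 种拓扑结构，4 种拓扑均会建联（从下面的代码可以看到，collnet 和 nvls 只有在 comm->collNetSupport / comm->nvlsSupport 为真时才会执行 ncclTopoCompute，否则对应的 nChannels 被置为 0）。此处注意 nvls 拓扑会占用大量的 GPU 内存（参见 ncclNvlsInit 的内存申请）。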

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43



  // Get rings and trees
  // Ring search: allowed to use between 1 and MAXCHANNELS/2 channels.
  ringGraph.id = 0;
  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
  ringGraph.collNet = 0;
  ringGraph.minChannels = 1;
  ringGraph.maxChannels = MAXCHANNELS/2;
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail);

  // Tree search: min and max are both pinned to the channel count found for the ring.
  treeGraph.id = 1;
  treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
  treeGraph.collNet = 0;
  treeGraph.minChannels = ringGraph.nChannels;
  treeGraph.maxChannels = ringGraph.nChannels;
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail);

  // Build the collnet topology
  // Channel count is pinned to the ring's; the search only runs when collnet is
  // supported, otherwise the graph is left with zero channels.
  collNetGraph.id = 2;
  collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
  collNetGraph.collNet = 1;
  collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
  if (comm->collNetSupport) {
    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
  } else {
    collNetGraph.nChannels = 0;
  }

  // Build the NVLS topology
  // Allowed to use between 1 and MAXCHANNELS channels; the search only runs
  // when NVLS is supported, otherwise the graph is left with zero channels.
  nvlsGraph.id = 3;
  nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
  nvlsGraph.collNet = 0;
  nvlsGraph.minChannels = 1;
  nvlsGraph.maxChannels = MAXCHANNELS;
  if (comm->nvlsSupport) {
    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
  } else {
    nvlsGraph.nChannels = 0;
  }
  // The communicator's channel count is the smaller of the tree and ring results.
  comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);

这里会通过 ncclTopoCompute 计算每种拓扑结构的带宽，并记录在 ncclTopoGraph 结构中，包括了各种机内与机间的通讯带宽和路径。

最终算法选择

nccl 会对各个算法(NCCL_ALGO)与协议(protocol)组合的耗时做一个估计，并根据数据量和集合通讯类型(allreduce, allgather) 选择耗时最小的算法和协议。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


// Pick the (algorithm, protocol) pair with the smallest estimated time.
float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
// Find algorithm / protocol.
info->algorithm = -1;
info->protocol = -1;
int nAlgos = NCCL_NUM_ALGORITHMS;
for (int a=0; a<nAlgos; a++) {
  // Skip both collnet algorithms unless collNetTypeSupport is exactly 1.
  if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
  // Skip NVLS when the datatype/op pair is rejected by NCCL_NVLS_SUPPORTS.
  if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
  // With more than one node, NVLS is also skipped unless collNetTypeSupport is 1.
  if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue;
  // NVLS_TREE has the same datatype/op requirement as NVLS.
  if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;

  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    float time;
    NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
    // Negative times are never selected (presumably they mark an unusable
    // combination -- confirm in ncclTopoGetAlgoTime). Strict '<' keeps the
    // first candidate on ties.
    if (time >= 0 && time < minTime) {
      info->algorithm = a;
      info->protocol = p;
      minTime = time;
    }
  }
}
// No candidate produced a usable time below the initial minTime.
if (info->algorithm == -1 || info->protocol == -1) {
  WARN("Error : no algorithm/protocol available");
  return ncclInternalError;
}
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);

拓扑带宽获取过程

ncclTopoCompute 的时候，会记录每种拓扑结构的带宽，并存储在 ncclTopoGraph 结构中

这里就是实际搜索 channel 的过程，目标是搜索出来尽可能多，带宽尽可能大的一系列 channel，本质就是暴力搜索，先设置一系列的条件搜答案，如果搜不出来则降低条件继续搜。

由于此时没有 NET 节点，所以 crossNic 为 0，然后初始化 graph，首先设置最高的条件，限制节点内部只能使用不超过 PATH_NVL 的路径，节点间只能使用不超过 PATH_PIX 的路径，然后通过 system->maxBw 设置 bwIntra 和 bwInter（即下面代码中的 tmpGraph.bwIntra / tmpGraph.bwInter），接着执行 ncclTopoSearchRec 搜索出一个答案存储到 tmpGraph 中。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16


// Candidate per-channel bandwidths, listed from highest to lowest. There is
// one table for the intra-node-only case and one for the networked case, each
// with a separate sm90 variant.
float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };

// No NET nodes in the system -> use the intra table, otherwise the inter table;
// ccMin >= 90 selects the sm90 variants. nspeeds is presumably the length of
// the chosen table (the NSPEEDS* constants are defined outside this excerpt).
float* speedArray = NULL;
if (system->nodes[NET].count == 0) {
  nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA;
  speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra;
} else {
  nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER;
  speedArray = ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter;
}
float maxBw = system->maxBw;
// Skip candidates above maxBw; the index never advances past nspeeds-1, so the
// last entry is used when every candidate exceeds maxBw.
while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++;
// Intra- and inter-node bandwidth both start from the same candidate.
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];

NVLink

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.0
#define SM86_NVLINK_BW 12.0

// Maps a CUDA compute capability (e.g. 80, 86, 90) to the matching
// SMxx_NVLINK_BW constant. Capability 86 is matched exactly and takes
// precedence over the ">= 80" range; anything below 60 falls back to the
// SM80 value.
static float ncclTopoNVLinkBw(int cudaCompCap) {
  if (cudaCompCap >= 90) return SM90_NVLINK_BW;
  if (cudaCompCap == 86) return SM86_NVLINK_BW;
  if (cudaCompCap >= 80) return SM80_NVLINK_BW;
  if (cudaCompCap >= 70) return SM70_NVLINK_BW;
  if (cudaCompCap >= 60) return SM60_NVLINK_BW;
  return SM80_NVLINK_BW;
}

NET

1
2
3
4


  // Read the NIC "speed" attribute into mbps (0 is passed as the default value).
  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
  if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
  // Mbps -> GB/s: divide by 8 (bits per byte) and by 1000.
  net->net.bw = mbps / 8000.0;
  // If the "latency" attribute cannot be read, use 0.
  if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0;

P2P

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


 // Maps the "link_speed" string of a PCI device to a per-lane speed expressed
 // in units of 100 Mbps. Both spellings of the sysfs string are listed; the
 // terminating NULL entry carries the fallback value (60, same as 8 GT/s).
 struct kvDict kvDictPciGen[] = {
  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
  { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
  { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane

    int width, speed;
    NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
    NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));

    // Manage cases where speed was not indicated in /sys
    if (width == 0) width = 16;
    NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)

    // width lanes * speed (x100 Mbps per lane) / 80 gives GB/s:
    // 100 Mbps = 12.5 MB/s = 1/80 GB/s. The link is registered in both
    // directions (node -> parent and parent -> node) with the same bandwidth.
    NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
    NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));

SYS TOPO

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54


$ lspci -t -mm -vvv
-+-[0000:74]---00.0-[75]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 +-[0000:72]---00.0-[73]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 +-[0000:70]---00.0-[71]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 +-[0000:6e]---00.0-[6f]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 +-[0000:6c]---00.0-[6d]--+-01.0  NVIDIA Corporation Device 22a3
 |                        +-02.0  NVIDIA Corporation Device 22a3
 |                        +-03.0  NVIDIA Corporation Device 22a3
 |                        \-04.0  NVIDIA Corporation Device 22a3
 +-[0000:6a]---00.0-[6b]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 +-[0000:68]---00.0-[69]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 +-[0000:66]---00.0-[67]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 +-[0000:64]---00.0-[65]--+-01.0  Mellanox Technologies MT2910 Family [ConnectX-7]
 |                        +-02.0  NVIDIA Corporation Device 2324
 |                        \-03.0  Intel Corporation NVMe DC SSD [3DNAND, Sentinel Rock Controller]
 \-[0000:00]-+-00.0  Intel Corporation 440FX - 82441FX PMC [Natoma]
             +-01.0  Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
             +-01.1  Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton II]
             +-01.3  Intel Corporation 82371AB/EB/MB PIIX4 ACPI
             +-02.0  Mellanox Technologies ConnectX Family mlx5Gen Virtual Function
             +-0b.0-[01]--+-01.0  Red Hat, Inc. Virtio SCSI
             |            +-02.0  Red Hat, Inc. Virtio SCSI
             |            +-03.0  Red Hat, Inc. Virtio SCSI
             |            \-04.0  Red Hat, Inc. Virtio SCSI
             +-0c.0-[02]----01.0  Red Hat, Inc. Virtio block device
             +-0d.0-[03]--
             +-0e.0-[04]--
             +-0f.0-[05]--
             +-11.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-12.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-13.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-14.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-15.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-16.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-17.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-18.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-19.0  Red Hat, Inc. QEMU PCI Expander bridge
             +-1a.0  Red Hat, Inc. Virtio console
             +-1d.0  Intel Corporation 82801FB/FBM/FR/FW/FRW (ICH6 Family) High Definition Audio Controller
             \-1e.0  Cirrus Logic GD 5446

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58


# cat /var/run/nvidia-topologyd/virtualTopology.xml 
<system version="1">
<cpu numaid="0" affinity="00000,00000000,00000000,03ffffff,ffffffff,ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
<pci busid="0000:00:02.0" class="0x020000" vendor="0x15b3" device="0x101e" subsystem_vendor="0x15b3" subsystem_device="0x0021" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_0"/> </nic>
</pci>
<pci busid="0000:64:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:65:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_1"/> </nic>
</pci>
<pci busid="0000:65:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
<pci busid="0000:66:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:67:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_2"/> </nic>
</pci>
<pci busid="0000:67:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
<pci busid="0000:68:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:69:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_3"/> </nic>
</pci>
<pci busid="0000:69:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
<pci busid="0000:6a:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:6b:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_4"/> </nic>
</pci>
<pci busid="0000:6b:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
</cpu>
<cpu numaid="1" affinity="fffff,ffffffff,ffffffff,fc000000,00000000,00000000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
<pci busid="0000:6e:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:6f:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_5"/> </nic>
</pci>
<pci busid="0000:6f:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
<pci busid="0000:70:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:71:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_6"/> </nic>
</pci>
<pci busid="0000:71:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
<pci busid="0000:72:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:73:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_7"/> </nic>
</pci>
<pci busid="0000:73:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
<pci busid="0000:74:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:75:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
<nic> <net name="mlx5_8"/> </nic>
</pci>
<pci busid="0000:75:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16"/>
</pci>
</cpu>
</system>

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126


<system version="1">
  <cpu numaid="0" affinity="00000,00000000,00000000,03ffffff,ffffffff,ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
    <pci busid="0000:64:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:65:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_1" dev="0" speed="400000" port="1" guid="0xf250cd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:65:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="0" sm="90" rank="0" gdr="1">
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:66:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:67:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_2" dev="1" speed="400000" port="1" guid="0xfa4fcd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:67:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="1" sm="90" rank="1" gdr="1">
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:68:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:69:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_3" dev="2" speed="400000" port="1" guid="0xa45cd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:69:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="2" sm="90" rank="2" gdr="1">
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:6a:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:6b:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_4" dev="3" speed="400000" port="1" guid="0xb244cd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:6b:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="3" sm="90" rank="3" gdr="1">
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="1" affinity="fffff,ffffffff,ffffffff,fc000000,00000000,00000000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
    <pci busid="0000:6e:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:6f:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_5" dev="4" speed="400000" port="1" guid="0x8a46cd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:6f:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="4" sm="90" rank="4" gdr="1">
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:70:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:71:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_6" dev="5" speed="400000" port="1" guid="0xa51cd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:71:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="5" sm="90" rank="5" gdr="1">
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:72:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:73:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_7" dev="6" speed="400000" port="1" guid="0x245cd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:73:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="6" sm="90" rank="6" gdr="1">
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:74:00.0" class="0x060400" vendor="0x1b36" device="0x0001" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:75:01.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0123" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_8" dev="7" speed="400000" port="1" guid="0x7a46cd0003ae6d94" maxconn="1" gdr="1" coll="1" latency="0.000000"/>
        </nic>
      </pci>
      <pci busid="0000:75:02.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="7" sm="90" rank="7" gdr="1">
          <nvlink target="0000:6d:03.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:02.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:01.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:6d:04.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
  </cpu>
</system>

Fill

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16


// Excerpt of initTransportsRank: the topology detection and path computation
// steps. Every call goes through NCCLCHECKGOTO with the `fail` label, which
// lies outside this excerpt.
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {

  // Topo detection / System graph creation
  NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
  // Compute paths between GPUs and NICs
  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
  // Remove inaccessible GPUs and unused NICs
  NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
  // Recompute paths after trimming
  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
  // Init search
  NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
  // Print final topology
  NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);

}

首先需要构建一个 Topo 图：

ncclXmlNode 表示一个节点，记录了父节点和所有子节点，节点有 name 和 attr，通过 xmlSetAttr 设置属性
ncclXml 预先分配了所有的 node，maxIndex 表示分配到了哪里

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


// One node of the XML topology tree: a tag name, a list of key/value
// attributes, and links to its parent and children.
struct ncclXmlNode {
  char name[MAX_STR_LEN+1];            // tag name, e.g. "system"
  struct {
    char key[MAX_STR_LEN+1];
    char value[MAX_STR_LEN+1];
  } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
  int nAttrs;                          // presumably the number of attrs[] entries in use
  int type;
  struct ncclXmlNode* parent;          // parent node
  struct ncclXmlNode* subs[MAX_SUBS];  // child nodes
  int nSubs;                           // presumably the number of subs[] entries in use
};

// Pool holding every XML node: all nodes are preallocated in nodes[] and
// maxIndex records how far allocation has progressed. ncclTopoGetSystem treats
// maxIndex == 0 as "nothing loaded yet" and creates the top-level tag itself.
struct ncclXml {
  struct ncclXmlNode nodes[MAX_NODES];
  int maxIndex;
};

关于 ncclXml 的几个 API 如下：

1
2
3
4
5
6
7
8


// Allocate a node `sub` inside `xml`, set its name to `subName` and its
// parent to `parent`.
static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub);

// Walk the nodes already allocated in `xml` looking for a node n named
// `tagName`; if n["attrName"] equals `attrValue`, store n in `*node`.
static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue);

// Report which attribute slot of `node` holds `attrName` (its position in the
// node's attribute list) through `*index`.
static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index);

首先通过 xmlAddNode 创建根节点 "system"（后续使用双引号表示 xml 树节点），并设置根节点属性 system["version"] = NCCL_TOPO_XML_VERSION；然后遍历每个 rank 的 hostHash，如果与当前 rank 的 hostHash 相等，说明两者在同一台机器上，此时执行 ncclTopoFillGpu，将该 rank 的 gpu 加入到 xml 树

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86


// Populates `xml` with the topology visible to this communicator and converts
// it into `*system`. The caller owns `xml`; this helper never frees it, so it
// can return early on any failing step without leaking.
//
// Steps, in order:
//   1. Load an XML topology from the file named by NCCL_TOPO_FILE, or else
//      from the default virtualTopology.xml path. The last argument differs
//      between the two calls (1 vs 0); presumably it controls whether a
//      missing file is reported - confirm against ncclTopoGetXmlFromFile.
//   2. If nothing was loaded (maxIndex == 0), create the "system" root tag.
//   3. Add a GPU entry for every rank sharing this rank's hostHash.
//   4. Add NIC entries, collnet devices first so they take precedence.
//   5. Trim branches without keep="1", optionally dump the XML, then build
//      the topology system from it.
static ncclResult_t topoGetSystemUsingXml(struct ncclComm* comm, struct ncclXml* xml, struct ncclTopoSystem** system) {
  char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
    INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
  } else {
    // Try default XML topology location
    NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
  }
  if (xml->maxIndex == 0) {
    // Create top tag
    struct ncclXmlNode* top;
    NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
  }

  // Auto-detect GPUs if needed
  for (int r=0; r<comm->nRanks; r++) {
    // Only ranks with the same hostHash as this rank are added.
    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
      struct ncclXmlNode* node;
      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
      if (node == NULL) continue;
      NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
    }
  }
  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
  int netDevCount = 0;
  if (collNetSupport(comm)) {
    NCCLCHECK(collNetDevices(comm, &netDevCount));
    for (int n=0; n<netDevCount; n++) {
      ncclNetProperties_t props;
      NCCLCHECK(collNetGetProperties(comm, n, &props));
      struct ncclXmlNode* netNode;
      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
      NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
      NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
      NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
      NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
      bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
      INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
      NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
    }
  }
  // The plain net device count is only queried when collnet reported none.
  if (netDevCount == 0) {
    NCCLCHECK(comm->ncclNet->devices(&netDevCount));
  }
  for (int n=0; n<netDevCount; n++) {
    ncclNetProperties_t props;
    NCCLCHECK(comm->ncclNet->getProperties(n, &props));
    struct ncclXmlNode* netNode;
    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
    NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
  }

  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
  NCCLCHECK(ncclTopoTrimXml(xml));

  // Optionally dump the XML; only the rank selected by
  // ncclParamTopoDumpFileRank() writes the file.
  xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
  }

  NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
  return ncclSuccess;
}

// Builds the topology system for `comm` and stores it in `*system`.
//
// A temporary ncclXml is allocated, filled and converted by
// topoGetSystemUsingXml, and then released here. Doing the release in this
// wrapper means the buffer is freed on both success and failure, whereas
// early returns in a single function would skip the free.
//
// Returns ncclSuccess on success, otherwise the result of the failing step.
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclXml* xml;
  NCCLCHECK(ncclCalloc(&xml, 1));
  ncclResult_t ret = topoGetSystemUsingXml(comm, xml, system);
  free(xml);
  return ret;
}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


// Adds the GPU identified by PCI bus id `busId` to `xml` and hands back its
// node through `gpuNode`.
//
// The sequence is: find or create the "pci" node for busId, give it class
// "0x03" if no class is set yet, fill in data through ncclTopoGetXmlFromSys,
// then resolve the NVML handle for the same bus id and let
// ncclTopoGetXmlFromGpu produce `*gpuNode`.
ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {
  nvmlDevice_t gpuHandle;
  struct ncclXmlNode* pciEntry;

  // PCI side: locate/create the node, default its class, fill from sysfs.
  NCCLCHECK(ncclTopoGetPciNode(xml, busId, &pciEntry));
  NCCLCHECK(xmlSetAttrIfUnset(pciEntry, "class", "0x03"));
  NCCLCHECK(ncclTopoGetXmlFromSys(pciEntry, xml));

  // GPU side: query NVML for the device, then fill the GPU node.
  NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &gpuHandle));
  NCCLCHECK(ncclTopoGetXmlFromGpu(pciEntry, gpuHandle, xml, gpuNode));
  return ncclSuccess;
}

通过 ncclTopoGetPciNode 查询 xml 中是否已经创建了当前卡对应的 xml node；此时还没有，所以新建一个名为 "pci" 的 xml node 表示当前 gpu 卡，并设置 pci["busid"]=busId

1
2
3
4
5
6
7
8


// Looks up the "pci" node whose "busid" attribute equals `busId`; when none
// exists, creates one (with a NULL parent) and sets its "busid" attribute.
// The resulting node is returned through `pciNode`.
ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) {
  NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));

  // Already present: nothing to create.
  if (*pciNode != NULL) return ncclSuccess;

  NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
  NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
  return ncclSuccess;
}

然后执行 ncclTopoGetXmlFromSys，这个函数主要逻辑就是在 sysfs 中获取 gpu 节点到 cpu 的路径，通过这个路径转成 xml 树，并读取该路径下相关属性设置到 xml 里

然后从 pciNode 开始往上跳，因为一个 switch 的上游端口和下游端口分别对应了一个 bridge，NCCL 使用上游端口 bridge 的 busid 表示这个 switch，因此这里要向上跳两次再建立一个 xml node 表示这个 switch。往上每找到一个 PCI 设备就将 slashCount 加一，当 slashCount == 2 时就找到了一个 switch 上游端口，这个时候创建一个新的 xml pci 节点 parent 表示当前 switch，然后将当前节点 pciNode 链接到 parent，此时 parent 仍然是 xml pci 节点，因此继续递归执行 ncclTopoGetXmlFromSys，直到遇到 RC，此时给"system"创建一个子节点"cpu"，停止递归，然后执行 ncclTopoGetXmlFromCpu，设置"cpu"的各种属性，比如 arch（比如 x86 还是 arm），affinity（该 cpu 的 numa 都有哪些 cpu core），numaid 等。

然后通过 wrapNvmlSymbols 加载动态库 libnvidia-ml.so.1，用来获取 gpu 的相关信息

首先在 xml gpu 节点"pci"下创建节点"gpu"，然后设置"gpu"节点的属性，比如 dev，计算能力 sm，然后开始查询 nvlink 相关信息，遍历所有可能的 nvlink，通过 nvmlDeviceGetNvLinkCapability 查询 nvlink 信息，如果这个 nvlink 被启用，那么在"gpu"节点下新建一个"nvlink"节点，设置"target"属性表示 nvlink 对端的 PCIe busId，将"target"相同的"nvlink"节点表示为一个，用"count"表示起止点之间有多少条 nvlink，然后设置属性"tclass"表示"target"是什么类型的 PCI 设备

到这里 ncclTopoFillGpu 就执行结束了，此时 xml 如下所示，图里只展示了一张网卡的情况，其中"gpu"和他的父节点其实都是指的同一个 gpu

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64


// A node of the topology graph. It carries a type and an id, one of several
// type-specific payloads (gpu / net / cpu / pci) stored in an anonymous
// union, its outgoing links, and per-node-type path tables.
// NOTE(review): presumably `type` selects which union member is valid - confirm.
struct ncclTopoNode {
  int type;
  int64_t id;
  // Type specific data
  union {
    // Payload for GPU nodes
    struct {
      int dev; // NVML dev number
      int rank;
      int cudaCompCap;
      int gdrSupport;
    }gpu;
    // Payload for network device nodes
    struct {
      uint64_t asic;
      int port;
      float bw;
      float latency;
      int gdrSupport;
      int collSupport;
      int maxChannels;
    }net;
    // Payload for CPU nodes
    struct {
      int arch;
      int vendor;
      int model;
      cpu_set_t affinity;
    }cpu;
    // Payload for PCI nodes
    struct {
      uint64_t device;
    }pci;
  };
  // Links leaving this node; presumably nlinks is the number in use - confirm
  int nlinks;
  struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
  // Pre-computed paths to GPUs and NICs
  struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
  // Used during search
  uint64_t used;
};

// Fixed-capacity collection of topology nodes (at most NCCL_TOPO_MAX_NODES).
// NOTE(review): presumably `count` is the number of valid entries - confirm.
struct ncclTopoNodeSet {
  int count;
  struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
};

// The whole topology: one node set per node type (indexed up to
// NCCL_TOPO_NODE_TYPES) plus two bandwidth figures.
// NOTE(review): exact meaning/units of maxBw and totalBw are not shown here.
struct ncclTopoSystem {
  struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
  float maxBw;
  float totalBw;
};

// Forward declaration so ncclTopoLink can point at a node.
struct ncclTopoNode;
// One link stored in a node's `links` array: a link type, a bandwidth, and a
// pointer to the node at the other end (remNode).
struct ncclTopoLink {
  int type;
  float bw;
  struct ncclTopoNode* remNode;
};
// Capacity of the per-node `links` array.
#define NCCL_TOPO_MAX_LINKS 32
// Capacity of a link list: max nodes times the number of node types.
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)

// A sequence of link pointers (up to NCCL_TOPO_MAX_HOPS) together with a
// count, a bandwidth and a type for the sequence as a whole. Nodes hold
// pointers to these in their `paths` tables.
// NOTE(review): presumably `count` is the number of valid `list` entries - confirm.
struct ncclTopoLinkList {
  struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
  int count;
  float bw;
  int type;
};

Graph

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394


<graphs version="1">
  <graph id="0" pattern="4" crossnic="0" nchannels="8" speedintra="20" speedinter="20" latencyinter="0" typeintra="NVL" typeinter="PXN" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="2"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <net dev="2"/>
    </channel>
    <channel>
      <net dev="3"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <net dev="3"/>
    </channel>
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="2"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <net dev="2"/>
    </channel>
    <channel>
      <net dev="3"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <net dev="3"/>
    </channel>
  </graph>
  <graph id="1" pattern="3" crossnic="0" nchannels="8" speedintra="22" speedinter="22" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="2"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <net dev="2"/>
    </channel>
    <channel>
      <net dev="3"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <net dev="3"/>
    </channel>
    <channel>
      <net dev="4"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <net dev="4"/>
    </channel>
    <channel>
      <net dev="5"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <net dev="5"/>
    </channel>
    <channel>
      <net dev="6"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <net dev="6"/>
    </channel>
    <channel>
      <net dev="7"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="7"/>
    </channel>
  </graph>
  <graph id="2" pattern="3" crossnic="0" nchannels="8" speedintra="22" speedinter="22" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="2"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <net dev="2"/>
    </channel>
    <channel>
      <net dev="3"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <net dev="3"/>
    </channel>
    <channel>
      <net dev="4"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <net dev="4"/>
    </channel>
    <channel>
      <net dev="5"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <net dev="5"/>
    </channel>
    <channel>
      <net dev="6"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <net dev="6"/>
    </channel>
    <channel>
      <net dev="7"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="7"/>
    </channel>
  </graph>
  <graph id="3" pattern="5" crossnic="0" nchannels="8" speedintra="17.5" speedinter="17.5" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="2"/>
      <gpu dev="2"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="2"/>
    </channel>
    <channel>
      <net dev="3"/>
      <gpu dev="3"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="3"/>
    </channel>
    <channel>
      <net dev="4"/>
      <gpu dev="4"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="4"/>
    </channel>
    <channel>
      <net dev="5"/>
      <gpu dev="5"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="5"/>
    </channel>
    <channel>
      <net dev="6"/>
      <gpu dev="6"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="6"/>
    </channel>
    <channel>
      <net dev="7"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="7"/>
    </channel>
  </graph>
</graphs>