llmq/src/utilities/gpu_info_nvml.cpp at dev · IST-DASLab/llmq · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
// Copyright (c) 2025, IST Austria, developed by Erik Schultheis
// SPDX-License-Identifier: Apache-2.0
//

#include "gpu_info.h"

#include <type_traits>

#include <fmt/core.h>
#include <nvml.h>

#include "utils.h"


inline nvmlReturn_t nvml_check(nvmlReturn_t status, const char *file, int line, bool allow_unsupported) {
    if(status == NVML_ERROR_NOT_SUPPORTED && allow_unsupported) {
        static bool warned = false;
        if(!warned) {
            warned = true;
            fprintf(stderr, "[NVML WARNING] NVML_ERROR_NOT_SUPPORTED\n");
        }
    } else if(status == NVML_ERROR_NO_PERMISSION && allow_unsupported) {
        static bool warned = false;
        if(!warned) {
            warned = true;
            fprintf(stderr, "[NVML WARNING] NVML_ERROR_NO_PERMISSION\n");
        }
    } else if (status != NVML_SUCCESS) {
        throw std::runtime_error(fmt::format("[NVML ERROR] at file {}:{}:\n{}\n", file, line, nvmlErrorString(status)));
    }
    return status;
};
// the _U version of the macro does not throw an exception on unsupported features
#define NVML_CHECK(err) (nvml_check(err, __FILE__, __LINE__, false))
#define NVML_CHECK_U(err) (nvml_check(err, __FILE__, __LINE__, true))

inline nvmlDevice_t nvml_get_device() {
    thread_local bool needs_init = true;
    thread_local nvmlDevice_t device;
    if (needs_init) {
        needs_init = false;
        NVML_CHECK(nvmlInit());
        char bus_id[256];
        int did;
        CUDA_CHECK(cudaGetDevice(&did));
        CUDA_CHECK(cudaDeviceGetPCIBusId (bus_id, sizeof(bus_id), did));
        NVML_CHECK(nvmlDeviceGetHandleByPciBusId(bus_id, &device));
    }
    return device;
}

inline const char* get_throttle_reason(unsigned long long bits) {
    if (bits & (nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown)) {
        return "power cap";
    } else if (bits & (nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown)) {
        return "thermal cap";
    } else if (bits & (nvmlClocksThrottleReasonGpuIdle)) {
        return "idle";
    } else if (bits & (nvmlClocksThrottleReasonAll)) {
        return "other cap";
    } else {
        return "no cap";
    }
}

class GPUUtilTrackerNVML : public IGPUUtilTracker {
public:
    GPUUtilTrackerNVML();
    ~GPUUtilTrackerNVML();
    const GPUUtilInfo& update() override;
private:
    GPUUtilInfo mInfo;
    nvmlDevice_t mDevice;

    void setup_tracking_thread();

    long long mLastTimestamp;       // µs
    std::size_t mLastPCIeRX;
    std::size_t mLastPCIeTX;
    unsigned long long mLastEnergy;

    std::atomic<std::size_t> mIntervalPCIeRX{0};
    std::atomic<std::size_t> mIntervalPCIeTX{0};
    std::atomic<std::size_t> mIntervalEnergy{0};

    std::size_t mTotalPCIeRX;
    std::size_t mTotalPCIeTX;
    std::size_t mTotalEnergy;

    std::jthread mThread;
};

std::unique_ptr<IGPUUtilTracker> IGPUUtilTracker::create() {
    return std::make_unique<GPUUtilTrackerNVML>();
}

GPUUtilTrackerNVML::GPUUtilTrackerNVML() : mDevice(nvml_get_device()) {
    nvmlFieldValue_t fields[] = {{NVML_FI_DEV_PCIE_COUNT_RX_BYTES}, {NVML_FI_DEV_PCIE_COUNT_TX_BYTES}, {NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION, 0}};
    NVML_CHECK(nvmlDeviceGetFieldValues(mDevice, 3, fields));
    if(fields[0].nvmlReturn == NVML_ERROR_NOT_SUPPORTED || fields[1].nvmlReturn == NVML_ERROR_NOT_SUPPORTED || fields[2].nvmlReturn == NVML_ERROR_NOT_SUPPORTED) {
        fprintf(stderr, "[NVML WARNING] PCIe counters not supported\n");
    } else {
        NVML_CHECK(fields[0].nvmlReturn);
        NVML_CHECK(fields[1].nvmlReturn);
        NVML_CHECK(fields[2].nvmlReturn);mLastPCIeRX = fields[0].value.uiVal;
        mLastPCIeTX = fields[1].value.uiVal;
        mLastEnergy = fields[2].value.ullVal;

        mLastTimestamp = std::chrono::steady_clock::now().time_since_epoch().count();

        setup_tracking_thread();
    }


}

GPUUtilTrackerNVML::~GPUUtilTrackerNVML() {
    if(mThread.joinable()) {
        mThread.request_stop();
        mThread.join();
    }
    NVML_CHECK(nvmlShutdown());
}

void GPUUtilTrackerNVML::setup_tracking_thread() {
    // TODO should this be one thread for all devices?
    mThread = std::jthread([this](std::stop_token stop_token)
    {
        NVML_CHECK(nvmlDeviceSetCpuAffinity(mDevice));

        nvmlFieldValue_t fields[] = {{NVML_FI_DEV_PCIE_COUNT_RX_BYTES}, {NVML_FI_DEV_PCIE_COUNT_TX_BYTES}, {NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION, 0}};
        while (true) {
            NVML_CHECK(nvmlDeviceGetFieldValues(mDevice, 3, fields));
            NVML_CHECK(fields[0].nvmlReturn);
            NVML_CHECK(fields[1].nvmlReturn);
            NVML_CHECK(fields[2].nvmlReturn);
            unsigned pcie_rx, pcie_tx;
            unsigned long long energy;
            if (mLastPCIeRX <= fields[0].value.uiVal) {
               pcie_rx = fields[0].value.uiVal - mLastPCIeRX;
            } else {
               pcie_rx = std::numeric_limits<unsigned>::max() - mLastPCIeRX + fields[0].value.uiVal;
            }

            if (mLastPCIeTX <= fields[1].value.uiVal) {
               pcie_tx = fields[1].value.uiVal - mLastPCIeTX;
            } else {
               pcie_tx = std::numeric_limits<unsigned>::max() - mLastPCIeTX + fields[1].value.uiVal;
            }

            if (mLastEnergy <= fields[2].value.ullVal) {
               energy = fields[2].value.ullVal - mLastEnergy;
            } else {
               energy = (std::numeric_limits<unsigned long long>::max() - mLastEnergy) + fields[2].value.ullVal;
            }

            mIntervalPCIeRX += pcie_rx;
            mIntervalPCIeTX += pcie_tx;
            mIntervalEnergy += energy;
            mLastPCIeRX = fields[0].value.uiVal;
            mLastPCIeTX = fields[1].value.uiVal;
            mLastEnergy = fields[2].value.ullVal;

            std::this_thread::sleep_for(std::chrono::milliseconds(50));
            if (stop_token.stop_requested()) {
                break;
            }
        }
    });
}

nvmlMemory_v2_t get_mem_info(nvmlDevice_t device);

const GPUUtilInfo& GPUUtilTrackerNVML::update() {
    // set default values to indicate not-supported operations
    mInfo.clock = 0;
    mInfo.max_clock = 0;
    mInfo.power_limit = 0;
    mInfo.temperature = 0;
    mInfo.temp_slowdown = 0;
    mInfo.fan = 0;
    mInfo.gpu_utilization = -1.f;
    mInfo.mem_utilization = -1.f;
    mInfo.throttle_reason = "not supported";

    // query different infos directly
    NVML_CHECK_U(nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_SM, &mInfo.clock));
    NVML_CHECK_U(nvmlDeviceGetMaxClockInfo(mDevice, NVML_CLOCK_SM, &mInfo.max_clock));
    NVML_CHECK_U(nvmlDeviceGetPowerManagementLimit(mDevice, &mInfo.power_limit));
    NVML_CHECK_U(nvmlDeviceGetTemperature(mDevice, NVML_TEMPERATURE_GPU, &mInfo.temperature));
    NVML_CHECK_U(nvmlDeviceGetTemperatureThreshold(mDevice, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &mInfo.temp_slowdown));
    unsigned long long throttle;
    if(NVML_CHECK_U(nvmlDeviceGetCurrentClocksThrottleReasons(mDevice, &throttle)) == NVML_SUCCESS) {
        mInfo.throttle_reason = get_throttle_reason(throttle);
    }
    NVML_CHECK_U(nvmlDeviceGetFanSpeed(mDevice, &mInfo.fan));

    // For "utilization", we look at recorded samples. In principle, we could query the driver for how many samples
    // to request, but then we'd need to dynamically allocate sufficient space. Let's just hard-code a limit of 128,
    // and have no memory management required
    constexpr const int BUFFER_LIMIT = 128;
    nvmlSample_t buffer[BUFFER_LIMIT];
    nvmlValueType_t v_type;
    unsigned int sample_count = BUFFER_LIMIT;
    if(NVML_CHECK_U(nvmlDeviceGetSamples(mDevice, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer)) == NVML_SUCCESS) {
        float gpu_utilization = 0.f;
        for(unsigned i = 0; i < sample_count; ++i) {
            gpu_utilization += (float)buffer[i].sampleValue.uiVal;
        }
        mInfo.gpu_utilization = gpu_utilization / (float)sample_count;
    }

    // sample count may have been modified by the query above; reset back to buffer size
    sample_count = BUFFER_LIMIT;
    if(NVML_CHECK_U(nvmlDeviceGetSamples(mDevice, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer)) == NVML_SUCCESS) {
        float mem_utilization = 0.f;
        for(unsigned i = 0; i < sample_count; ++i) {
            mem_utilization += (float)buffer[i].sampleValue.uiVal;
        }
        mInfo.mem_utilization = mem_utilization / (float)sample_count;
    }

    nvmlMemory_v2_t mem_info = get_mem_info(mDevice);
    mInfo.mem_free = mem_info.free;
    mInfo.mem_total = mem_info.total;
    mInfo.mem_reserved = mem_info.reserved;

    // query PCIe info, if available
    if (mThread.joinable()) {
        auto now = std::chrono::steady_clock::now().time_since_epoch();
        auto interval = std::chrono::duration_cast<std::chrono::microseconds>(
            now - std::chrono::steady_clock::duration{mLastTimestamp}).count();

        std::size_t int_rx = mIntervalPCIeRX.exchange(0);
        std::size_t int_tx = mIntervalPCIeTX.exchange(0);
        std::size_t int_eg = mIntervalEnergy.exchange(0);

        mInfo.pcie_rx = (1'000'000ull * int_rx) / interval;
        mInfo.pcie_tx = (1'000'000ull * int_tx) / interval;
        // not using nvmlDeviceGetPowerUsage, because that is the past 1sec average, so it might not be representative
        mInfo.power = (1'000'000ull * int_eg) / interval;

        mTotalPCIeRX += int_rx;
        mTotalPCIeTX += int_tx;
        mTotalEnergy += int_eg;
        mLastTimestamp = now.count();
    }

    return mInfo;
}

nvmlMemory_v2_t get_mem_info(nvmlDevice_t device) {
    static bool has_printed_warning = false;
    nvmlMemory_v2_t mem_info;
    mem_info.version = nvmlMemory_v2;
    auto status = nvmlDeviceGetMemoryInfo_v2(device, &mem_info);
    if (status == NVML_ERROR_NOT_SUPPORTED) {
        fprintf(stderr, "[NVML WARNING] nvmlDeviceGetMemoryInfo not supported.");
        has_printed_warning = true;
        std::size_t free, total;
        // hail mary -- use cuda's basic interface instead
        CUDA_CHECK(cudaMemGetInfo(&free, &total));
        mem_info.reserved = 0;
        mem_info.free = free;
        mem_info.total = total;
        mem_info.used = 0;
    } else {
        NVML_CHECK(status);
    }
    return mem_info;
}

std::size_t get_mem_reserved() {
    return get_mem_info(nvml_get_device()).reserved;
}

std::string get_gpu_name() {
    char name[256];
    NVML_CHECK(nvmlDeviceGetName(nvml_get_device(), name, 256));
    return name;
}

bool set_cpu_affinity() {
    auto status = nvmlDeviceSetCpuAffinity(nvml_get_device());
    return status == NVML_SUCCESS;
}