Line data Source code
1 : #include "energyMonitor.hpp"
2 :
3 : #include <filesystem>
4 :
5 : namespace {
6 : std::unique_ptr<EnergyMonitor> energyMonitor;
7 : }
8 :
9 208 : EnergyMonitor* getEnergyMonitor() {
10 208 : return energyMonitor.get();
11 : }
12 :
13 22 : EnergyMonitor::EnergyMonitor(int period_ms):
14 22 : _period(period_ms),
15 44 : _ncores_phys(std::max(1, countPhysicalCores())) {}
16 :
17 22 : EnergyMonitor::~EnergyMonitor() {
18 22 : stop();
19 22 : }
20 :
21 22 : void EnergyMonitor::start() {
22 22 : _startTime = Clock::now();
23 22 : _domains = findEnergyDomains();
24 22 : if (_domains.empty()) {
25 22 : Logging::LogIt(Logging::logWarn) << "No RAPL energy domains found";
26 22 : return;
27 : }
28 :
29 0 : for (const auto& d : _domains) {
30 0 : const long long v = readUj(d.energy_path.c_str());
31 0 : if (v < 0) warnPermission(d.energy_path);
32 0 : _lastEnergy[d.energy_path] = v;
33 : }
34 :
35 : _running = true;
36 0 : _worker = std::thread(&EnergyMonitor::loop, this);
37 : }
38 :
39 43 : void EnergyMonitor::stop() {
40 : _running = false;
41 43 : if (_worker.joinable()) _worker.join();
42 43 : }
43 :
44 0 : double EnergyMonitor::cpuW() const {
45 0 : return _cpuWLocal.load(std::memory_order_relaxed);
46 : }
47 :
48 0 : double EnergyMonitor::ramW() const {
49 0 : return _ramWLocal.load(std::memory_order_relaxed);
50 : }
51 :
#ifdef WITH_MPI
/// Sums the local CPU and RAM power values over all ranks of `comm`.
/// Both reductions are collective calls; the sums are delivered to `root`,
/// other ranks get the zero-initialised result back.
PowerResult EnergyMonitor::reducePower(int root, MPI_Comm comm) const {
    PowerResult total{0.0, 0.0};

    const double localCpu = cpuW();
    const double localRam = ramW();

    MPI_Reduce(&localCpu, &total.cpu_sum, 1, MPI_DOUBLE, MPI_SUM, root, comm);
    MPI_Reduce(&localRam, &total.ram_sum, 1, MPI_DOUBLE, MPI_SUM, root, comm);

    return total;
}
#endif
64 :
65 229 : void EnergyMonitor::reportValues(double cpu, double ram, const std::string& unit, Logging::LogLevel level) const {
66 : if (Distributed::isMainProcess()) {
67 229 : Logging::LogIt(level) << "CPU " << cpu << " " << unit
68 : << ", RAM " << ram << " " << unit;
69 : }
70 229 : }
71 :
72 0 : void EnergyMonitor::reportPower(Logging::LogLevel level) const {
73 : #ifdef WITH_MPI
74 : const PowerResult result = reducePower();
75 : if (Distributed::isMainProcess()) {
76 : reportValues(result.cpu_sum, result.ram_sum, "W", level);
77 : }
78 : #else
79 0 : reportValues(cpuW(), ramW(), "W", level);
80 : #endif
81 0 : }
82 :
83 229 : void EnergyMonitor::reportEnergy(TimeType durationMs, Logging::LogLevel level) const {
84 229 : const double hours = durationMs / 1000.0 / 3600.0;
85 : #ifdef WITH_MPI
86 : const PowerResult result = reducePower();
87 : if (Distributed::isMainProcess()) {
88 : reportValues(result.cpu_sum * hours, result.ram_sum * hours, "Wh", level);
89 : }
90 : #else
91 229 : reportValues(cpuW() * hours, ramW() * hours, "Wh", level);
92 : #endif
93 229 : }
94 :
95 229 : void EnergyMonitor::reportCost(TimeType durationMs, Logging::LogLevel level) const {
96 229 : const double hours = durationMs / 1000.0 / 3600.0;
97 : #ifdef WITH_MPI
98 : const PowerResult result = reducePower();
99 : if (Distributed::isMainProcess()) {
100 : const double cpuCost = (result.cpu_sum * hours) / 1000.0 * kWhPriceEuro;
101 : const double ramCost = (result.ram_sum * hours) / 1000.0 * kWhPriceEuro;
102 : Logging::LogIt(level) << "CPU " << cpuCost << " €"
103 : << ", RAM " << ramCost << " €"
104 : << " (" << kWhPriceEuro << " €/kWh)";
105 : }
106 : #else
107 229 : const double cpuCost = (cpuW() * hours) / 1000.0 * kWhPriceEuro;
108 229 : const double ramCost = (ramW() * hours) / 1000.0 * kWhPriceEuro;
109 : if (Distributed::isMainProcess()) {
110 229 : Logging::LogIt(level) << "CPU " << cpuCost << " €"
111 : << ", RAM " << ramCost << " €"
112 : << " (" << kWhPriceEuro << " €/kWh)";
113 : }
114 : #endif
115 229 : }
116 :
117 0 : CpuTimes EnergyMonitor::readCpuTimes() {
118 0 : std::ifstream f("/proc/stat");
119 : std::string line;
120 0 : CpuTimes t{};
121 0 : if (std::getline(f, line)) {
122 0 : std::istringstream ss(line);
123 : std::string cpu;
124 0 : ss >> cpu >> t.user >> t.nice >> t.system >> t.idle >> t.iowait >> t.irq >> t.softirq >> t.steal;
125 0 : }
126 0 : return t;
127 0 : }
128 :
129 0 : double EnergyMonitor::computeLoadFraction(const CpuTimes& prev, const CpuTimes& cur) {
130 0 : const long long prevIdle = prev.idle + prev.iowait;
131 0 : const long long curIdle = cur.idle + cur.iowait;
132 0 : const long long prevTotal = prev.user + prev.nice + prev.system + prevIdle + prev.irq + prev.softirq + prev.steal;
133 0 : const long long curTotal = cur.user + cur.nice + cur.system + curIdle + cur.irq + cur.softirq + cur.steal;
134 0 : const long long deltaTotal = curTotal - prevTotal;
135 0 : const long long deltaIdle = curIdle - prevIdle;
136 :
137 0 : if (deltaTotal <= 0) return 0.0;
138 0 : return double(deltaTotal - deltaIdle) / double(deltaTotal);
139 : }
140 :
141 22 : int EnergyMonitor::countPhysicalCores() {
142 22 : std::ifstream f("/proc/cpuinfo");
143 : std::string line;
144 : std::set<std::pair<int, int>> cores;
145 : int phys = -1, core = -1;
146 :
147 2486 : while (std::getline(f, line)) {
148 2464 : if (line.find("physical id") != std::string::npos)
149 176 : phys = std::stoi(line.substr(line.find(":") + 1));
150 2376 : else if (line.find("core id") != std::string::npos) {
151 88 : core = std::stoi(line.substr(line.find(":") + 1));
152 176 : if (phys >= 0 && core >= 0) cores.insert({phys, core});
153 : }
154 : }
155 22 : if (!cores.empty()) return static_cast<int>(cores.size());
156 : #if defined(_SC_NPROCESSORS_ONLN)
157 0 : return static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
158 : #else
159 : const unsigned int n = std::thread::hardware_concurrency();
160 : return static_cast<int>(n > 0 ? n : 1u);
161 : #endif
162 22 : }
163 :
164 0 : long long EnergyMonitor::readUj(const char* path) {
165 0 : std::ifstream f(path);
166 0 : if (!f) return -1;
167 0 : long long v = 0;
168 : f >> v;
169 0 : return v;
170 0 : }
171 :
172 0 : std::string EnergyMonitor::readFile(const std::string& path) {
173 0 : std::ifstream f(path);
174 : std::string s;
175 0 : std::getline(f, s);
176 0 : return s;
177 0 : }
178 :
179 0 : void EnergyMonitor::warnPermission(const std::string& p) {
180 : static std::atomic<bool> warned{false};
181 0 : if (!warned.exchange(true)) {
182 0 : Logging::LogIt(Logging::logWarn) << "\n[WARNING] Cannot read RAPL energy counters.\n"
183 : << "Check permissions on:\n " << p << "\n"
184 : << "Try: sudo chmod a+r /sys/class/powercap/*/energy_uj\n";
185 : }
186 0 : }
187 :
188 22 : std::vector<EnergyDomain> EnergyMonitor::findEnergyDomains() const {
189 22 : std::vector<EnergyDomain> out;
190 66 : for (const auto& p : std::filesystem::directory_iterator("/sys/class/powercap")) {
191 0 : const auto energy = p.path() / "energy_uj";
192 0 : const auto name = p.path() / "name";
193 :
194 : if (std::filesystem::exists(energy) && std::filesystem::exists(name)) {
195 : EnergyDomain d;
196 0 : d.energy_path = energy.string();
197 0 : d.name = readFile(name.string());
198 0 : out.push_back(d);
199 : }
200 0 : }
201 22 : return out;
202 0 : }
203 :
204 0 : void EnergyMonitor::loop() {
205 0 : CpuTimes prevCpu = readCpuTimes();
206 :
207 0 : while (_running) {
208 0 : const auto t0 = std::chrono::steady_clock::now();
209 : const auto eStart = _lastEnergy;
210 :
211 0 : std::this_thread::sleep_for(std::chrono::milliseconds(_period));
212 :
213 : double cpuJ = 0.0;
214 : double ramJ = 0.0;
215 :
216 0 : for (const auto& d : _domains) {
217 0 : const long long e = readUj(d.energy_path.c_str());
218 0 : if (e < 0) continue;
219 :
220 0 : const double deltaJ = (e - eStart.at(d.energy_path)) * 1e-6;
221 0 : _lastEnergy[d.energy_path] = e;
222 :
223 : std::string lname = d.name;
224 0 : for (auto& c : lname) c = std::tolower(c);
225 :
226 0 : if (lname.find("package") != std::string::npos || lname.find("cpu") != std::string::npos)
227 0 : cpuJ += deltaJ;
228 0 : else if (lname.find("dram") != std::string::npos || lname.find("mem") != std::string::npos)
229 0 : ramJ += deltaJ;
230 : }
231 :
232 0 : const auto t1 = std::chrono::steady_clock::now();
233 : const double dt = std::chrono::duration<double>(t1 - t0).count();
234 0 : const CpuTimes curCpu = readCpuTimes();
235 0 : const double loadFrac = computeLoadFraction(prevCpu, curCpu);
236 0 : prevCpu = curCpu;
237 :
238 0 : double activePhys = loadFrac * _ncores_phys;
239 0 : if (activePhys < 0.1) activePhys = 1.0;
240 :
241 0 : _cpuWLocal.store(cpuJ / dt / activePhys, std::memory_order_relaxed);
242 0 : _ramWLocal.store(ramJ / dt / activePhys, std::memory_order_relaxed);
243 : }
244 0 : }
245 :
246 : namespace {
247 : struct EnergyMonitorInitializer {
248 22 : static void create(int period_ms = 500) {
249 44 : energyMonitor = std::make_unique<EnergyMonitor>(period_ms);
250 22 : }
251 : static void destroy() {
252 : energyMonitor.reset();
253 : }
254 : };
255 : }
256 :
257 22 : void initEnergyMonitor(int period_ms) {
258 22 : EnergyMonitorInitializer::create(period_ms);
259 22 : if (energyMonitor) energyMonitor->start();
260 22 : }
261 :
262 21 : void finalizeEnergyMonitor() {
263 21 : if (energyMonitor) {
264 21 : energyMonitor->stop();
265 : const TimeType totalDuration = getTimeDiff(energyMonitor->getStartTime());
266 : if (Distributed::isMainProcess()) {
267 21 : Logging::LogIt(Logging::logInfo) << "Total execution time: " << totalDuration << " ms";
268 21 : energyMonitor->reportEnergy(totalDuration, Logging::logInfo);
269 21 : energyMonitor->reportCost(totalDuration, Logging::logInfo);
270 : }
271 : EnergyMonitorInitializer::destroy();
272 : }
273 21 : }
|