// Copyright 2024 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "hwy/contrib/thread_pool/topology.h" #include // isspace #include #include #include #include // strchr #include #include #include #include #include "hwy/base.h" // HWY_OS_WIN, HWY_WARN #if HWY_OS_APPLE #include #include "hwy/aligned_allocator.h" // HWY_ALIGNMENT #endif #if HWY_OS_WIN #ifndef NOMINMAX #define NOMINMAX #endif #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #ifndef _WIN32_WINNT #define _WIN32_WINNT 0x0601 // Windows 7 / Server 2008 #endif #include #endif // HWY_OS_WIN #if HWY_OS_LINUX || HWY_OS_FREEBSD #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include // sysconf #endif // HWY_OS_LINUX || HWY_OS_FREEBSD #if HWY_OS_FREEBSD #include // After param.h / types.h. #include #endif // HWY_OS_FREEBSD #if HWY_ARCH_WASM #include #endif namespace hwy { HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() { #if HWY_ARCH_WASM return emscripten_has_threading_support() != 0; #else return true; #endif } namespace { // Returns `whole / part`, with a check that `part` evenly divides `whole`, // which implies the result is exact. HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) { HWY_ASSERT(part != 0); const size_t div = whole / part; const size_t mul = div * part; if (mul != whole) { HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul); } return div; } #if HWY_OS_WIN using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; template bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) { // Get required buffer size. DWORD buf_bytes = 0; HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes)); // Observed when `rel` is not supported: if (HWY_UNLIKELY(buf_bytes == 0 && GetLastError() == ERROR_GEN_FAILURE)) { if (rel != RelationNumaNodeEx && rel != RelationProcessorDie) { HWY_WARN("Unexpected err %lx for GLPI relationship %d\n", GetLastError(), static_cast(rel)); } return false; } HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER); // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding. // `calloc` zero-initializes the `Reserved` field, part of which has been // repurposed into `GroupCount` in SDKs, 10.0.22000.0 or possibly earlier. uint8_t* buf = static_cast(calloc(1, buf_bytes)); HWY_ASSERT(buf); // Fill the buffer. SLPI* info = reinterpret_cast(buf); if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) { free(buf); return false; } // Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above. uint8_t* pos = buf; while (pos < buf + buf_bytes) { info = reinterpret_cast(pos); HWY_ASSERT(rel == RelationAll || info->Relationship == rel); func(*info); pos += info->Size; } if (pos != buf + buf_bytes) { HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n", pos, buf + buf_bytes, buf_bytes, sizeof(SLPI)); } free(buf); return true; } size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) { size_t total_bits = 0; for (size_t i = 0; i < num_groups; ++i) { size_t bits = 0; hwy::CopyBytes(&affinity[i].Mask, &bits); total_bits += hwy::PopCount(bits); } return total_bits; } // Calls `func(lp, lps)` for each index `lp` in the set, after ensuring that // `lp < lps.size()`. `line` is for debugging via Warn(). template void ForeachBit(size_t num_groups, const GROUP_AFFINITY* affinity, std::vector& lps, int line, const Func& func) { for (size_t group = 0; group < num_groups; ++group) { size_t bits = 0; hwy::CopyBytes(&affinity[group].Mask, &bits); while (bits != 0) { size_t lp = group * 64 + Num0BitsBelowLS1Bit_Nonzero64(bits); bits &= bits - 1; // clear LSB if (HWY_UNLIKELY(lp >= lps.size())) { Warn(__FILE__, line, "Clamping lp %zu to lps.size() %zu, groups %zu\n", lp, lps.size(), num_groups); lp = lps.size() - 1; } func(lp, lps); } } } #elif HWY_OS_APPLE // Returns whether sysctlbyname() succeeded; if so, writes `val / div` to // `out`, otherwise sets `err`. template bool Sysctl(const char* name, size_t div, int& err, T* out) { size_t val = 0; size_t size = sizeof(val); // Last two arguments are for updating the value, which we do not want. const int ret = sysctlbyname(name, &val, &size, nullptr, 0); if (HWY_UNLIKELY(ret != 0)) { // Do not print warnings because some `name` are expected to fail. err = ret; return false; } *out = static_cast(DivByFactor(val, div)); return true; } #endif // HWY_OS_* } // namespace HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { size_t total_lps = 0; #if HWY_ARCH_WASM const int num_cores = emscripten_num_logical_cores(); if (num_cores > 0) total_lps = static_cast(num_cores); #elif HWY_OS_WIN // If there are multiple groups, this should return them all, rather than // just the first 64, but VMs report less. (void)ForEachSLPI(RelationProcessorCore, [&total_lps](const SLPI& info) { const PROCESSOR_RELATIONSHIP& p = info.Processor; total_lps += NumBits(p.GroupCount, p.GroupMask); }); #elif HWY_OS_LINUX // Only check "online" because sysfs entries such as topology are missing for // offline CPUs, which will cause `DetectPackages` to fail. const long ret = sysconf(_SC_NPROCESSORS_ONLN); // NOLINT(runtime/int) if (ret < 0) { HWY_WARN("Unexpected _SC_NPROCESSORS_CONF = %d\n", static_cast(ret)); } else { total_lps = static_cast(ret); } #elif HWY_OS_APPLE int err; // Only report P processors. if (!Sysctl("hw.perflevel0.logicalcpu", 1, err, &total_lps)) { total_lps = 0; } #endif if (HWY_UNLIKELY(total_lps == 0)) { // Failed to detect. HWY_WARN( "Unknown TotalLogicalProcessors, assuming 1. " "HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n" "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n", HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86, HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X); return 1; } // Warn that we are clamping. if (HWY_UNLIKELY(total_lps > kMaxLogicalProcessors)) { HWY_WARN("OS reports %zu processors but clamping to %zu\n", total_lps, kMaxLogicalProcessors); total_lps = kMaxLogicalProcessors; } return total_lps; } // ------------------------------ Affinity #if HWY_OS_LINUX || HWY_OS_FREEBSD namespace { #if HWY_OS_LINUX using CpuSet = cpu_set_t; #else using CpuSet = cpuset_t; #endif // Helper functions reduce the number of #if in GetThreadAffinity. int GetAffinity(CpuSet* set) { // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD. #if defined(__ANDROID__) && __ANDROID_API__ < 12 return syscall(__NR_sched_getaffinity, 0, sizeof(CpuSet), set); #elif HWY_OS_FREEBSD return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet), set); #else // normal Linux return sched_getaffinity(0, sizeof(CpuSet), set); #endif } int SetAffinity(CpuSet* set) { // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD. #if defined(__ANDROID__) && __ANDROID_API__ < 12 return syscall(__NR_sched_setaffinity, 0, sizeof(CpuSet), set); #elif HWY_OS_FREEBSD return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet), set); #else // normal Linux return sched_setaffinity(0, sizeof(CpuSet), set); #endif } bool IsSet(size_t lp, const CpuSet* set) { #if HWY_COMPILER_GCC_ACTUAL // Workaround for GCC compiler warning with CPU_ISSET macro HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") #endif const int is_set = CPU_ISSET(static_cast(lp), set); #if HWY_COMPILER_GCC_ACTUAL HWY_DIAGNOSTICS(pop) #endif return is_set != 0; } void Set(size_t lp, CpuSet* set) { #if HWY_COMPILER_GCC_ACTUAL // Workaround for GCC compiler warning with CPU_SET macro HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") #endif CPU_SET(static_cast(lp), set); #if HWY_COMPILER_GCC_ACTUAL HWY_DIAGNOSTICS(pop) #endif } } // namespace #endif // HWY_OS_LINUX || HWY_OS_FREEBSD HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) { #if HWY_OS_WIN // Only support the first 64 because WINE does not support processor groups. const HANDLE hThread = GetCurrentThread(); const DWORD_PTR prev = SetThreadAffinityMask(hThread, ~DWORD_PTR(0)); if (!prev) return false; (void)SetThreadAffinityMask(hThread, prev); lps = LogicalProcessorSet(); // clear all lps.SetNonzeroBitsFrom64(prev); return true; #elif HWY_OS_LINUX || HWY_OS_FREEBSD CpuSet set; CPU_ZERO(&set); const int err = GetAffinity(&set); if (err != 0) return false; for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) { if (IsSet(lp, &set)) lps.Set(lp); } return true; #else // For HWY_OS_APPLE, affinity is not supported. Do not even set lp=0 to force // callers to handle this case. (void)lps; return false; #endif } HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) { #if HWY_OS_WIN const HANDLE hThread = GetCurrentThread(); const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64()); return prev != 0; #elif HWY_OS_LINUX || HWY_OS_FREEBSD CpuSet set; CPU_ZERO(&set); lps.Foreach([&set](size_t lp) { Set(lp, &set); }); const int err = SetAffinity(&set); if (err != 0) return false; return true; #else // Apple THREAD_AFFINITY_POLICY is only an (often ignored) hint. (void)lps; return false; #endif } namespace { struct PackageSizes { size_t num_clusters; size_t num_cores; }; #if HWY_OS_LINUX class File { public: explicit File(const char* path) { for (;;) { fd_ = open(path, O_RDONLY); if (fd_ > 0) return; // success if (errno == EINTR) continue; // signal: retry if (errno == ENOENT) return; // not found, give up HWY_WARN("Unexpected error opening %s: %d\n", path, errno); return; // unknown error, give up } } ~File() { if (fd_ > 0) { for (;;) { const int ret = close(fd_); if (ret == 0) break; // success if (errno == EINTR) continue; // signal: retry HWY_WARN("Unexpected error closing file: %d\n", errno); return; // unknown error, ignore } } } // Returns number of bytes read or 0 on failure. size_t Read(char* buf200) const { if (fd_ < 0) return 0; size_t pos = 0; for (;;) { // read instead of `pread`, which might not work for sysfs. const auto bytes_read = read(fd_, buf200 + pos, 200 - pos); if (bytes_read == 0) { // EOF: done buf200[pos++] = '\0'; return pos; } if (bytes_read == -1) { if (errno == EINTR) continue; // signal: retry HWY_WARN("Unexpected error reading file: %d\n", errno); return 0; } pos += static_cast(bytes_read); HWY_ASSERT(pos <= 200); } } private: int fd_; }; // Returns bytes read, or 0 on failure. size_t ReadSysfs(const char* format, size_t lp, char* buf200) { char path[200]; const int bytes_written = snprintf(path, sizeof(path), format, lp); HWY_ASSERT(0 < bytes_written && bytes_written < static_cast(sizeof(path) - 1)); const File file(path); return file.Read(buf200); } // Interprets [str + pos, str + end) as base-10 ASCII. Stops when any non-digit // is found, or at end. Returns false if no digits found. bool ParseDigits(const char* str, const size_t end, size_t& pos, size_t* out) { HWY_ASSERT(pos <= end); // 9 digits cannot overflow even 32-bit size_t. const size_t stop = pos + 9; *out = 0; for (; pos < HWY_MIN(end, stop); ++pos) { const int c = str[pos]; if (c < '0' || c > '9') break; *out *= 10; *out += static_cast(c - '0'); } if (pos == 0) { // No digits found *out = 0; return false; } return true; } // Number, plus optional K or M suffix, plus terminator. bool ParseNumberWithOptionalSuffix(const char* str, size_t len, size_t* out) { size_t pos = 0; if (!ParseDigits(str, len, pos, out)) return false; if (str[pos] == 'K') { *out <<= 10; ++pos; } if (str[pos] == 'M') { *out <<= 20; ++pos; } if (str[pos] != '\0' && str[pos] != '\n') { HWY_ABORT("Expected [suffix] terminator at %zu %s\n", pos, str); } return true; } bool ReadNumberWithOptionalSuffix(const char* format, size_t lp, size_t* out) { char buf200[200]; const size_t pos = ReadSysfs(format, lp, buf200); if (pos == 0) return false; return ParseNumberWithOptionalSuffix(buf200, pos, out); } const char* kPackage = "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id"; const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id"; const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id"; const char* kL2Size = "/sys/devices/system/cpu/cpu%zu/cache/index2/size"; const char* kL3Size = "/sys/devices/system/cpu/cpu%zu/cache/index3/size"; const char* kNode = "/sys/devices/system/node/node%zu/cpulist"; // sysfs values can be arbitrarily large, so store in a map and replace with // indices in order of appearance. class Remapper { public: // Returns false on error, or sets `out_index` to the index of the sysfs // value selected by `format` and `lp`. template bool operator()(const char* format, size_t lp, T* HWY_RESTRICT out_index) { size_t opaque; if (!ReadNumberWithOptionalSuffix(format, lp, &opaque)) return false; const auto ib = indices_.insert({opaque, num_}); num_ += ib.second; // increment if inserted const size_t index = ib.first->second; // new or existing HWY_ASSERT(index < num_); HWY_ASSERT(index < hwy::LimitsMax()); *out_index = static_cast(index); return true; } size_t Num() const { return num_; } private: std::map indices_; size_t num_ = 0; }; // For internal use by `DetectPackages`. struct PerPackage { Remapper clusters; Remapper cores; // We rely on this zero-init and increment it below. uint8_t smt_per_core[kMaxLogicalProcessors] = {0}; }; // Initializes `lps` and returns a PackageSizes vector (empty on failure) // indicating the number of clusters and cores per package. std::vector DetectPackages(std::vector& lps) { std::vector empty; Remapper packages; for (size_t lp = 0; lp < lps.size(); ++lp) { if (!packages(kPackage, lp, &lps[lp].package)) { HWY_WARN("Failed to read sysfs package for LP %zu\n", lp); return empty; } } std::vector per_package(packages.Num()); HWY_ASSERT(!per_package.empty()); for (size_t lp = 0; lp < lps.size(); ++lp) { PerPackage& pp = per_package[lps[lp].package]; // Not a failure: some CPUs lack a (shared) L3 cache. if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) { lps[lp].cluster = 0; } if (!pp.cores(kCore, lp, &lps[lp].core)) { HWY_WARN("Failed to read sysfs core for LP %zu\n", lp); return empty; } // SMT ID is how many LP we have already seen assigned to the same core. HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors); lps[lp].smt = pp.smt_per_core[lps[lp].core]++; HWY_ASSERT(lps[lp].smt < 16); } std::vector package_sizes(per_package.size()); for (size_t p = 0; p < package_sizes.size(); ++p) { // Was zero if the package has no shared L3, see above. package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num()); package_sizes[p].num_cores = per_package[p].cores.Num(); HWY_ASSERT(package_sizes[p].num_cores != 0); } return package_sizes; } std::vector ExpandList(const char* list, size_t list_end, size_t max_lp) { std::vector expanded; constexpr size_t kNotFound = ~size_t{0}; size_t pos = 0; // Gracefully handle empty lists, happens on GH200 systems (#2668). if (isspace(list[0]) && list_end <= 2) return expanded; // Returns first `found_pos >= pos` where `list[found_pos] == c`, or // `kNotFound`. const auto find = [list, list_end, &pos](char c) -> size_t { const char* found_ptr = strchr(list + pos, c); if (found_ptr == nullptr) return kNotFound; const size_t found_pos = static_cast(found_ptr - list); HWY_ASSERT(found_pos < list_end && list[found_pos] == c); return found_pos; }; // Reads LP number and advances `pos`. `end` is for verifying we did not // read past a known terminator, or the end of string. const auto parse_lp = [list, list_end, &pos, max_lp](size_t end) -> size_t { end = HWY_MIN(end, list_end); size_t lp; HWY_ASSERT(ParseDigits(list, end, pos, &lp)); HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) { // On RISC-V, both TotalLogicalProcessors and GetThreadAffinity may // under-report the count, hence clamp. lp = HWY_MIN(lp, max_lp); } HWY_ASSERT(lp <= max_lp); HWY_ASSERT(pos <= end); return lp; }; // Parse all [first-]last separated by commas. for (;;) { // Single number or first of range: ends with dash, comma, or end. const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(','))); if (list[pos] == '-') { // range ++pos; // skip dash // Last of range ends with comma or end. const size_t lp_range_last = parse_lp(find(',')); expanded.reserve(expanded.size() + lp_range_last - lp_range_first + 1); for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) { expanded.push_back(lp); } } else { // single number expanded.push_back(lp_range_first); } // Done if reached end of string. if (pos == list_end || list[pos] == '\0' || list[pos] == '\n') { break; } // Comma means at least one more term is coming. if (list[pos] == ',') { ++pos; continue; } HWY_ABORT("Unexpected character at %zu in %s\n", pos, list); } // for pos return expanded; } // Sets LP.node for all `lps`. void SetNodes(std::vector& lps) { // For each NUMA node found via sysfs: for (size_t node = 0;; node++) { // Read its cpulist so we can scatter `node` to all its `lps`. char buf200[200]; const size_t bytes_read = ReadSysfs(kNode, node, buf200); if (bytes_read == 0) break; const std::vector list = ExpandList(buf200, bytes_read, lps.size() - 1); for (size_t lp : list) { lps[lp].node = static_cast(node); } } } void SetClusterCacheSizes(std::vector& packages) { for (size_t ip = 0; ip < packages.size(); ++ip) { Topology::Package& p = packages[ip]; for (size_t ic = 0; ic < p.clusters.size(); ++ic) { Topology::Cluster& c = p.clusters[ic]; const size_t lp = c.lps.First(); size_t bytes; if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) { c.private_kib = bytes >> 10; } if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) { c.shared_kib = bytes >> 10; } } } } #elif HWY_OS_WIN // See #2734. GroupCount was added around Windows 10, but SDK docs do not // mention the actual version required. It is known to be absent in 8.1 and // MinGW 5.0.1, and present in the 10.0.22000.0 SDK. However, the OS must also // know about the field. Thus we zero-initialize the reserved field, assume it // remains zero, and return 1 if zero (old style single GroupMask), otherwise // the number of groups. There are two such structures, but note that // `PROCESSOR_RELATIONSHIP` already had this field. static size_t GroupCount(const CACHE_RELATIONSHIP& cr) { // Added as the last u16 in the reserved area before GroupMask. We only read // one byte because 256*64 processor bits are plenty. const uint8_t* pcount = reinterpret_cast(&cr.GroupMask) - sizeof(uint16_t); return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1); } static size_t GroupCount(const NUMA_NODE_RELATIONSHIP& nn) { const uint8_t* pcount = reinterpret_cast(&nn.GroupMask) - sizeof(uint16_t); return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1); } struct PerPackage { size_t clusters = 0; size_t cores = 0; size_t max_lps_per_core = 0; }; // Returns per-package vector and assigns LP.package to an index within it. std::vector AssignPackageIndices(std::vector& lps) { size_t package_idx = 0; (void)ForEachSLPI( RelationProcessorPackage, [&lps, &package_idx](const SLPI& info) { const PROCESSOR_RELATIONSHIP& p = info.Processor; ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__, [&package_idx](size_t lp, std::vector& lps) { lps[lp].package = static_cast(package_idx); }); ++package_idx; }); return std::vector(package_idx); } // Sets LP.core and LP.smt and updates `PerPackage.cores/max_lps_per_core`. void AssignCoreSmtIndices(std::vector& lps, std::vector& per_package) { (void)ForEachSLPI( RelationProcessorCore, [&lps, &per_package](const SLPI& info) { const PROCESSOR_RELATIONSHIP& p = info.Processor; PerPackage* pp = nullptr; // Foreach LP in this core: assign its core and smt. size_t smt = 0; ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__, [&](size_t lp, std::vector& lps) { pp = &per_package[lps[lp].package]; lps[lp].core = static_cast(pp->cores); lps[lp].smt = static_cast(smt++); }); HWY_ASSERT(pp != nullptr); ++pp->cores; pp->max_lps_per_core = HWY_MAX(pp->max_lps_per_core, NumBits(p.GroupCount, p.GroupMask)); HWY_ASSERT(pp->max_lps_per_core != 0); }); } // Interprets cluster (typically a shared L3 cache) as a "processor die". Sets // LP.cluster and updates `PerPackage.clusters`. void AssignClusterIndices(std::vector& lps, std::vector& per_package) { // Shared between `foreach_die` and `foreach_l3`. Assigns all LPs to this // cluster and increments the cluster index. const auto foreach_cluster = [&](size_t num_groups, const GROUP_AFFINITY* groups) { PerPackage* pp = nullptr; ForeachBit(num_groups, groups, lps, __LINE__, [&per_package, &pp](size_t lp, std::vector& lps) { pp = &per_package[lps[lp].package]; lps[lp].cluster = static_cast(pp->clusters); }); if (pp != nullptr) { ++pp->clusters; } }; // Passes group bits to `foreach_cluster`, depending on relationship type. const auto foreach_die = [&foreach_cluster](const SLPI& info) { const PROCESSOR_RELATIONSHIP& p = info.Processor; foreach_cluster(p.GroupCount, p.GroupMask); }; const auto foreach_l3 = [&foreach_cluster](const SLPI& info) { const CACHE_RELATIONSHIP& cr = info.Cache; if (cr.Type != CacheUnified && cr.Type != CacheData) return; if (cr.Level != 3) return; foreach_cluster(GroupCount(cr), cr.GroupMasks); }; if (!ForEachSLPI(RelationProcessorDie, foreach_die)) { // Has been observed to fail; also check for shared L3 caches. (void)ForEachSLPI(RelationCache, foreach_l3); } // All packages should have the same number of clusters. for (size_t package_idx = 1; package_idx < per_package.size(); ++package_idx) { if (per_package[package_idx].clusters != per_package[0].clusters) { HWY_ABORT("pkg %zu has %zu clusters, expected %zu\n", package_idx, per_package[package_idx].clusters, per_package[0].clusters); } } if (per_package[0].clusters == 0) { HWY_WARN("No clusters found, assuming 1 cluster\n"); for (PerPackage& pp : per_package) { pp.clusters = 1; } for (Topology::LP& lp : lps) { lp.cluster = 0; } } } // Initializes `lps` and returns a `PackageSizes` vector (empty on failure) // indicating the number of clusters and cores per package. std::vector DetectPackages(std::vector& lps) { std::vector per_package = AssignPackageIndices(lps); if (per_package.empty()) return {}; AssignCoreSmtIndices(lps, per_package); AssignClusterIndices(lps, per_package); std::vector packages(per_package.size()); for (size_t package_idx = 0; package_idx < per_package.size(); ++package_idx) { packages[package_idx].num_clusters = per_package[package_idx].clusters; packages[package_idx].num_cores = per_package[package_idx].cores; } return packages; } // Sets LP.node for all `lps`. void SetNodes(std::vector& lps) { // Zero-initialize all nodes in case the below fails. for (size_t lp = 0; lp < lps.size(); ++lp) { lps[lp].node = 0; } // We want the full NUMA nodes, but Windows Server 2022 truncates the results // of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited // behavior without using the new `RelationNumaNodeEx` symbol, use the old // `RelationAll` and filter the SLPI we want. (void)ForEachSLPI(RelationAll, [&](const SLPI& info) { if (info.Relationship != RelationNumaNode) return; const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode; // This field was previously reserved/zero. There is at least one group. const size_t num_groups = HWY_MAX(1, GroupCount(nn)); const uint8_t node = static_cast(nn.NodeNumber); ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__, [node](size_t lp, std::vector& lps) { lps[lp].node = node; }); }); } #elif HWY_OS_APPLE // Initializes `lps` and returns a `PackageSizes` vector (empty on failure) // indicating the number of clusters and cores per package. std::vector DetectPackages(std::vector& lps) { int err; size_t total_cores = 0; if (!Sysctl("hw.perflevel0.physicalcpu", 1, err, &total_cores)) { HWY_WARN("Error %d detecting total_cores, assuming one per LP\n", err); total_cores = lps.size(); } if (lps.size() % total_cores != 0) { HWY_WARN("LPs %zu not a multiple of total_cores %zu\n", lps.size(), total_cores); } const size_t lp_per_core = DivCeil(lps.size(), total_cores); size_t cores_per_cluster = 0; if (!Sysctl("hw.perflevel0.cpusperl2", 1, err, &cores_per_cluster)) { HWY_WARN("Error %d detecting cores_per_cluster\n", err); cores_per_cluster = HWY_MIN(4, total_cores); } if (total_cores % cores_per_cluster != 0) { HWY_WARN("total_cores %zu not a multiple of cores_per_cluster %zu\n", total_cores, cores_per_cluster); } for (size_t lp = 0; lp < lps.size(); ++lp) { lps[lp].package = 0; // single package lps[lp].core = static_cast(lp / lp_per_core); lps[lp].smt = static_cast(lp % lp_per_core); lps[lp].cluster = static_cast(lps[lp].core / cores_per_cluster); } PackageSizes ps; ps.num_clusters = DivCeil(total_cores, cores_per_cluster); ps.num_cores = total_cores; return std::vector{ps}; } // Sets LP.node for all `lps`. void SetNodes(std::vector& lps) { for (size_t lp = 0; lp < lps.size(); ++lp) { lps[lp].node = 0; // no NUMA } } #endif // HWY_OS_* #if HWY_OS_WIN || HWY_OS_APPLE void SetClusterCacheSizes(std::vector& packages) { // Assumes clusters are homogeneous. Otherwise, we would have to scan // `RelationCache` again and find the corresponding package_idx. const Cache* caches = DataCaches(); const size_t private_kib = caches ? caches[2].size_kib : 0; const size_t shared_kib = caches ? caches[3].size_kib : 0; for (size_t ip = 0; ip < packages.size(); ++ip) { Topology::Package& p = packages[ip]; for (size_t ic = 0; ic < p.clusters.size(); ++ic) { Topology::Cluster& c = p.clusters[ic]; c.private_kib = private_kib; c.shared_kib = shared_kib; } } } #endif // HWY_OS_WIN || HWY_OS_APPLE } // namespace HWY_CONTRIB_DLLEXPORT Topology::Topology() { #if HWY_OS_LINUX || HWY_OS_WIN || HWY_OS_APPLE lps.resize(TotalLogicalProcessors()); const std::vector& package_sizes = DetectPackages(lps); if (package_sizes.empty()) return; SetNodes(lps); // Allocate per-package/cluster/core vectors. This indicates to callers that // detection succeeded. packages.resize(package_sizes.size()); for (size_t p = 0; p < packages.size(); ++p) { packages[p].clusters.resize(package_sizes[p].num_clusters); packages[p].cores.resize(package_sizes[p].num_cores); } // Populate the per-cluster/core sets of LP. for (size_t lp = 0; lp < lps.size(); ++lp) { Package& p = packages[lps[lp].package]; p.clusters[lps[lp].cluster].lps.Set(lp); p.cores[lps[lp].core].lps.Set(lp); } SetClusterCacheSizes(packages); #endif // HWY_OS_* } // ------------------------------ Cache detection namespace { using Caches = std::array; // We assume homogeneous caches across all clusters because some OS APIs return // a single value for a class of CPUs. #if HWY_OS_LINUX std::string ReadString(const char* name, size_t index) { // First CPU is usually a P core. const std::string path("/sys/devices/system/cpu/cpu0/cache/index%zu/"); char buf200[200]; size_t end = ReadSysfs((path + name).c_str(), index, buf200); // Remove trailing newline/null to simplify string comparison. for (; end != 0; --end) { if (buf200[end - 1] != '\0' && buf200[end - 1] != '\n') break; } return std::string(buf200, buf200 + end); } template bool WriteSysfs(const char* name, size_t index, T* out) { const std::string str = ReadString(name, index); // Do not call `ParseNumberWithOptionalSuffix` because it acts on the // K suffix in "size", but we actually want KiB. size_t pos = 0; size_t val; if (!ParseDigits(str.c_str(), str.length(), pos, &val)) return false; HWY_ASSERT(pos <= str.length()); *out = static_cast(val); return true; } // Reading from sysfs is preferred because sysconf returns L3 associativity = 0 // on some CPUs, and does not indicate sharing across cores. // https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu bool InitCachesSysfs(Caches& caches) { // For computing shared cache sizes. std::vector lps(TotalLogicalProcessors()); const std::vector package_sizes = DetectPackages(lps); // `package_sizes` is only used to check that `lps` were filled. if (package_sizes.empty()) { HWY_WARN("no packages, shared cache sizes may be incorrect\n"); return false; } for (size_t i = 0;; ++i) { const std::string type = ReadString("type", i); if (type.empty()) break; // done, no more entries if (type != "Data" && type != "Unified") continue; uint32_t level; if (!WriteSysfs("level", i, &level)) continue; if (level != 1 && level != 2 && level != 3) continue; Cache& c = caches[level]; // Check before overwriting any fields. if (c.size_kib != 0) { HWY_WARN("ignoring another L%u, first size %u\n", level, c.size_kib); continue; } const bool ok = WriteSysfs("size", i, &c.size_kib) && WriteSysfs("ways_of_associativity", i, &c.associativity) && WriteSysfs("number_of_sets", i, &c.sets); if (HWY_UNLIKELY(!ok)) { HWY_WARN("skipping partially-detected L%u, error %d\n", level, errno); c = Cache(); continue; } // Compute line size *before* adjusting the size for sharing. Note that // `coherency_line_size` exists, but we are not sure that is the line size. const size_t bytes = static_cast(c.size_kib) * 1024; const size_t lines = c.associativity * c.sets; c.bytes_per_line = static_cast(DivByFactor(bytes, lines)); // Divide by number of *cores* sharing the cache. const std::string shared_str = ReadString("shared_cpu_list", i); if (HWY_UNLIKELY(shared_str.empty())) { HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str()); c.cores_sharing = 1; } else { const std::vector shared_lps = ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1); size_t num_cores = 0; for (size_t lp : shared_lps) { if (HWY_LIKELY(lp < lps.size())) { num_cores += lps[lp].smt == 0; } else { HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(), shared_str.c_str()); } } if (num_cores == 0) { HWY_WARN("no cores sharing L%u %s, setting to 1\n", level, type.c_str()); num_cores = 1; } c.cores_sharing = static_cast(num_cores); // There exist CPUs for which L3 is not evenly divisible by `num_cores`, // hence do not use `DivByFactor`. It is safer to round down. c.size_kib = static_cast(c.size_kib / num_cores); c.sets = static_cast(c.sets / num_cores); } } // Require L1 and L2 cache. if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { // Don't complain on Android because this is known to happen there. We are // unaware of good alternatives: `getauxval(AT_L1D_CACHEGEOMETRY)` and // `sysconf(_SC_LEVEL1_DCACHE_SIZE)` are unreliable, detecting via timing seems // difficult to do reliably, and we do not want to maintain lists of known CPUs // and their properties. It's OK to return false; callers are responsible for // assuming reasonable defaults. #ifndef __ANDROID__ HWY_WARN("sysfs detected L1=%u L2=%u, err %d\n", caches[1].size_kib, caches[2].size_kib, errno); #endif return false; } // L3 is optional; if not found, its size is already zero from static init. return true; } #elif HWY_OS_WIN bool InitCachesWin(Caches& caches) { std::vector lps(TotalLogicalProcessors()); std::vector per_package = AssignPackageIndices(lps); if (per_package.empty()) return false; AssignCoreSmtIndices(lps, per_package); (void)ForEachSLPI(RelationCache, [&per_package, &caches](const SLPI& info) { const CACHE_RELATIONSHIP& cr = info.Cache; if (cr.Type != CacheUnified && cr.Type != CacheData) return; if (1 <= cr.Level && cr.Level <= 3) { Cache& c = caches[cr.Level]; // If the size is non-zero then we (probably) have already detected this // cache and can skip the CR. if (c.size_kib > 0) return; c.size_kib = static_cast(DivByFactor(cr.CacheSize, 1024)); c.bytes_per_line = static_cast(cr.LineSize); c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE) ? Cache::kMaxAssociativity : cr.Associativity; // How many cores share this cache? size_t shared_with = NumBits(GroupCount(cr), cr.GroupMasks); // Divide out hyperthreads. This core may have fewer than // `max_lps_per_core`, hence round up. shared_with = DivCeil(shared_with, per_package[0].max_lps_per_core); if (shared_with == 0) { HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level); shared_with = 1; } // Update `size_kib` to *per-core* portion. // There exist CPUs for which L3 is not evenly divisible by `shared_with`, // hence do not use `DivByFactor`. It is safer to round down. c.size_kib = static_cast(c.size_kib / shared_with); c.cores_sharing = static_cast(shared_with); } }); // Require L1 and L2 cache. if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { HWY_WARN("Windows detected L1=%u, L2=%u, err %lx\n", caches[1].size_kib, caches[2].size_kib, GetLastError()); return false; } // L3 is optional; if not found, its size is already zero from static init. return true; } #elif HWY_OS_APPLE bool InitCachesApple(Caches& caches) { int err = 0; Cache& L1 = caches[1]; Cache& L2 = caches[2]; Cache& L3 = caches[3]; // Total L1 and L2 size can be reliably queried, but prefer perflevel0 // (P-cores) because hw.l1dcachesize etc. are documented to describe the // "least performant core". bool ok = Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &L1.size_kib) || Sysctl("hw.l1dcachesize", 1024, err, &L1.size_kib); ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) || Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib); if (HWY_UNLIKELY(!ok)) { HWY_WARN("Apple cache detection failed, error %d\n", err); return false; } L1.cores_sharing = 1; if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) { // There exist CPUs for which L2 is not evenly divisible by `cores_sharing`, // hence do not use `DivByFactor`. It is safer to round down. L2.size_kib /= L2.cores_sharing; } else { L2.cores_sharing = 1; } // Other properties are not always reported. Set `associativity` and // `bytes_per_line` based on known models. char brand[128] = {0}; size_t size = sizeof(brand); if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) { if (strncmp(brand, "Apple ", 6) != 0) { // Unexpected, but we will continue check the string suffixes. HWY_WARN("unexpected Apple brand %s\n", brand); } if (brand[6] == 'M') { // https://dougallj.github.io/applecpu/firestorm.html, // https://www.7-cpu.com/cpu/Apple_M1.html: L1.bytes_per_line = 64; L1.associativity = 8; L2.bytes_per_line = 128; if (brand[7] == '1') { // M1 L2.associativity = 12; } else if ('2' <= brand[7] && brand[7] <= '4') { // M2/M3, maybe also M4 L2.associativity = 16; } else { L2.associativity = 0; // Unknown, set below via sysctl. } // Although Wikipedia lists SLC sizes per model, we do not know how it is // partitioned/allocated, so do not treat it as a reliable L3. } // M* } // brand string // This sysctl does not distinguish between L1 and L2 line sizes, so only use // it if we have not already set `bytes_per_line` above. uint16_t bytes_per_line; if (!Sysctl("hw.cachelinesize", 1, err, &bytes_per_line)) { bytes_per_line = static_cast(HWY_ALIGNMENT); // guess } for (size_t level = 1; level <= 3; ++level) { if (caches[level].bytes_per_line == 0) { caches[level].bytes_per_line = bytes_per_line; } } // Fill in associativity if not already set. Unfortunately this is only // reported on x86, not on M*. if (L1.associativity == 0 && !Sysctl("machdep.cpu.cache.L1_associativity", 1, err, &L1.associativity)) { L1.associativity = 8; // guess } if (L2.associativity == 0 && !Sysctl("machdep.cpu.cache.L2_associativity", 1, err, &L2.associativity)) { L2.associativity = 12; // guess } // There is no L3_associativity. if (L3.associativity == 0) { L3.associativity = 12; // guess } // Now attempt to query L3. Although this sysctl is documented, M3 does not // report an L3 cache. if (L3.size_kib == 0 && (Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) || Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) { // There exist CPUs for which L3 is not evenly divisible by `cores_sharing`, // hence do not use `DivByFactor`. It is safer to round down. if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) { L3.size_kib /= L3.cores_sharing; } else { L3.cores_sharing = 1; } } // If no L3 cache, reset all fields for consistency. if (L3.size_kib == 0) { L3 = Cache(); } // Are there other useful sysctls? hw.cacheconfig appears to be how many // cores share the memory and caches, though this is not documented, and // duplicates information in hw.perflevel0.cpusperl*. return true; } #endif // HWY_OS_* // Most APIs do not set the `sets` field, so compute it from the size and // associativity, and if a value is already set, ensure it matches. HWY_MAYBE_UNUSED void ComputeSets(Cache& c) { // If there is no such cache, avoid division by zero. if (HWY_UNLIKELY(c.size_kib == 0)) { c.sets = 0; return; } const size_t bytes = static_cast(c.size_kib) * 1024; // `size_kib` may have been rounded down, hence `lines` and `sets` are not // necessarily evenly divisible, so round down instead of `DivByFactor`. const size_t lines = bytes / c.bytes_per_line; const size_t sets = lines / c.associativity; if (c.sets == 0) { c.sets = static_cast(sets); } else { const size_t diff = c.sets - sets; if (diff > 1) { HWY_ABORT("Inconsistent cache sets %u != %zu\n", c.sets, sets); } } } const Cache* InitDataCaches() { alignas(64) static Caches caches; // On failure, return immediately because InitCaches*() already warn. #if HWY_OS_LINUX if (HWY_UNLIKELY(!InitCachesSysfs(caches))) return nullptr; #elif HWY_OS_WIN if (HWY_UNLIKELY(!InitCachesWin(caches))) return nullptr; #elif HWY_OS_APPLE if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr; #else HWY_WARN("Cache detection not implemented for this platform.\n"); (void)caches; return nullptr; #define HWY_NO_CACHE_DETECTION #endif // Prevents "code not reached" warnings on WASM. #ifndef HWY_NO_CACHE_DETECTION for (size_t level = 1; level <= 3; ++level) { ComputeSets(caches[level]); } // Heuristic to ignore SLCs such as on Ampere Altra, which should not be // treated as a reliable L3 because of their cache inclusion policy. // On Apple M*, these are not even reported as an L3. if (caches[3].cores_sharing >= 16 && caches[3].size_kib <= 512) { caches[3] = Cache(); } return &caches[0]; #endif // HWY_NO_CACHE_DETECTION } } // namespace HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() { static const Cache* caches = InitDataCaches(); return caches; } } // namespace hwy