From 3a4d533a1e2a179ad873c480dc4a42ea23681263 Mon Sep 17 00:00:00 2001 From: Mike Li Date: Wed, 17 Aug 2022 11:44:09 -0400 Subject: [PATCH 1/2] Check permission and handle PermissionError exception Signed-off-by: Mike Li Change-Id: If7cb8464d0b761e4be45c85eb7147ceed609da61 --- rocm_agent_enumerator | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rocm_agent_enumerator b/rocm_agent_enumerator index 6264a5f..ceb9e11 100755 --- a/rocm_agent_enumerator +++ b/rocm_agent_enumerator @@ -195,10 +195,15 @@ def readFromKFD(): node_path = os.path.join(topology_dir, node) if os.path.isdir(node_path): prop_path = node_path + '/properties' - if os.path.isfile(prop_path): + if os.path.isfile(prop_path) and os.access(prop_path, os.R_OK): target_search_term = re.compile("gfx_target_version.+") with open(prop_path) as f: - line = f.readline() + try: + line = f.readline() + except PermissionError: + # We may have a subsystem (e.g. scheduler) limiting device visibility which + # could cause a permission error. + line = '' while line != '' : search_result = target_search_term.search(line) if search_result is not None: From 94b4b3f0a66eb70912177ca7076b4267f8b5449b Mon Sep 17 00:00:00 2001 From: Johannes Dieterich Date: Mon, 21 Nov 2022 18:09:55 +0000 Subject: [PATCH 2/2] Fix rocminfo when run within docker environments Currently, rocminfo will fail when executed inside a docker container due to being unable to lsmod inside docker. This has impacts on rocprofiler use. Fix this behavior by querying initstate of the amdgpu module from /sys/module/amdgpu instead. If initstate is marked "live" everything if fine - error out with either "not loaded" (initstate file does not exist) or "not live" (initstate file does not contain "live" string). Change-Id: I6f2e9655942fd4cf840fd3f56b7d69e893fa84d7 --- rocminfo.cc | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/rocminfo.cc b/rocminfo.cc index 0842d57..8ed9111 100755 --- a/rocminfo.cc +++ b/rocminfo.cc @@ -51,6 +51,7 @@ #include #include +#include #include #include #include @@ -1039,16 +1040,33 @@ AcquireAndDisplayAgentInfo(hsa_agent_t agent, void* data) { int CheckInitialState(void) { // Check kernel module for ROCk is loaded - FILE *fd = popen("lsmod | grep amdgpu", "r"); - char buf[16]; - if (fread (buf, 1, sizeof (buf), fd) == 0) { + + std::ifstream amdgpu_initstate("/sys/module/amdgpu/initstate"); + if (amdgpu_initstate){ + std::stringstream buffer; + buffer << amdgpu_initstate.rdbuf(); + amdgpu_initstate.close(); + + std::string line; + bool is_live = false; + while (std::getline(buffer, line)){ + if (line.find( "live" ) != std::string::npos){ + is_live = true; + break; + } + } + if (is_live){ + printf("%sROCk module is loaded%s\n", COL_WHT, COL_RESET); + } else { + printf("%sROCk module is NOT live, possibly no GPU devices%s\n", + COL_RED, COL_RESET); + return -1; + } + } else { printf("%sROCk module is NOT loaded, possibly no GPU devices%s\n", COL_RED, COL_RESET); return -1; - } else { - printf("%sROCk module is loaded%s\n", COL_WHT, COL_RESET); } - pclose(fd); // Check if user belongs to the group for /dev/kfd (e.g. "video" or // "render")