50 | 50 |
/* Base sysfs directory from which the memory-related paths below are built. */
#define MEMORY_PATH_FMT "/sys/devices/system/memory"
|
51 | 51 |
#define MEMORY_HARD_OFFLINE_PATH_FMT MEMORY_PATH_FMT "/hard_offline_page"
|
52 | 52 |
/*
 * Path of the "probe" file directly under MEMORY_PATH_FMT. The probe code
 * in this file checks for it with access() and treats ENOENT as a
 * non-error, so the file is not guaranteed to exist.
 */
#define MEMORY_PROBE_PATH_FMT MEMORY_PATH_FMT "/probe"
|
53 | |
/*
 * printf-style path of one memory block directory; takes the block ID as a
 * single %d argument.
 * NOTE(review): MEMBLK_DIR_PATH_FMT is defined a second time further down
 * with a different token sequence (via MEMBLK_FILE_FMT). The two look like
 * the old and new revisions of the same line -- confirm only one is kept.
 */
#define MEMBLK_DIR_PATH_FMT MEMORY_PATH_FMT "/memory%d"
|
|
53 |
/*
 * printf-style name of one memory block entry ("memory<N>"); takes the
 * block ID as a single %d argument. It is spliced into both path macros
 * and log-message format strings in this file.
 */
#define MEMBLK_FILE_FMT "memory%d"
|
|
54 |
/*
 * printf-style path of one memory block directory, composed from
 * MEMORY_PATH_FMT and MEMBLK_FILE_FMT; takes the block ID as a single %d
 * argument.
 */
#define MEMBLK_DIR_PATH_FMT MEMORY_PATH_FMT "/" MEMBLK_FILE_FMT
|
54 | 55 |
#define MEMBLK_STATE_PATH_FMT MEMBLK_DIR_PATH_FMT "/state"
|
55 | 56 |
#define MEMBLK_VALID_ZONES_PATH_FMT MEMBLK_DIR_PATH_FMT "/valid_zones"
|
56 | 57 |
/*
 * printf-style path of one NUMA node's sysfs directory; takes the node ID
 * as a single %d argument.
 * NOTE(review): callers in this file pass a uint32_t for the %d -- confirm
 * node IDs always fit in an int.
 */
#define NID_PATH_FMT "/sys/devices/system/node/node%d"
|
|
327 | 328 |
return (sscanf(dirname, "memory%" PRIu32, block_id) == 1) ? 0 : -EINVAL;
|
328 | 329 |
}
|
329 | 330 |
|
330 | |
/*
|
331 | |
* Looks through NUMA nodes, finding the upper and lower bounds, and returns
|
332 | |
* those. The assumption is that the nodes are physically contiguous, so that
|
333 | |
* the intervening nodes do not need to be explicitly returned.
|
334 | |
*/
|
335 | |
static
|
336 | |
int gather_memblock_ids_for_node(uint32_t node_id, uint32_t *memblock_start_id,
|
337 | |
uint32_t *memblock_end_id)
|
338 | |
{
|
339 | |
DIR *dir_ptr;
|
340 | |
int status = 0;
|
341 | |
struct dirent *dir_entry;
|
342 | |
char numa_file_path[BUF_SIZE];
|
343 | |
uint32_t start_id = UINT32_MAX;
|
344 | |
uint32_t end_id = 0;
|
345 | |
|
346 | |
sprintf(numa_file_path, NID_PATH_FMT, node_id);
|
347 | |
|
348 | |
dir_ptr = opendir(numa_file_path);
|
349 | |
if (!dir_ptr) {
|
350 | |
syslog(LOG_ERR, "NUMA: Failed to open directory %s: %s\n",
|
351 | |
numa_file_path, strerror(errno));
|
352 | |
return -errno;
|
353 | |
}
|
354 | |
|
355 | |
/* Iterate through the node directory and get the memblock id */
|
356 | |
while ((dir_entry = readdir(dir_ptr)) != NULL) {
|
357 | |
uint32_t memblock_id = 0;
|
358 | |
|
359 | |
/* Skip entries that are not a memory node */
|
360 | |
if (get_memblock_id_from_dirname(dir_entry->d_name, &memblock_id) < 0) {
|
361 | |
continue;
|
362 | |
}
|
363 | |
|
364 | |
if (memblock_id == 0) {
|
365 | |
syslog(LOG_ERR,
|
366 | |
"NUMA: Failed to get memblock id while iterating through %s\n",
|
367 | |
numa_file_path);
|
368 | |
goto cleanup;
|
369 | |
}
|
370 | |
|
371 | |
SYSLOG_VERBOSE(LOG_DEBUG, "NUMA: Found memblock entry %"PRIu32"\n",
|
372 | |
memblock_id);
|
373 | |
|
374 | |
/* Record the smallest and largest assigned memblock IDs */
|
375 | |
start_id = (start_id < memblock_id) ? start_id : memblock_id;
|
376 | |
end_id = (end_id > memblock_id) ? end_id : memblock_id;
|
377 | |
}
|
378 | |
|
379 | |
/*
|
380 | |
* If the wrong directory was specified, readdir can return success,
|
381 | |
* even though it never iterated any files in the directory. Make that case
|
382 | |
* also an error, by verifying that start_id has been set.
|
383 | |
*/
|
384 | |
if (start_id == UINT32_MAX) {
|
385 | |
syslog(LOG_ERR, "NUMA: Failed to find any files in %s", numa_file_path);
|
386 | |
status = -ENOENT;
|
387 | |
goto cleanup;
|
388 | |
}
|
389 | |
|
390 | |
*memblock_start_id = start_id;
|
391 | |
*memblock_end_id = end_id;
|
392 | |
|
393 | |
SYSLOG_VERBOSE(LOG_DEBUG,
|
394 | |
"NUMA: Found memblock start id: %"PRIu32
|
395 | |
" and end id: %"PRIu32"\n",
|
396 | |
*memblock_start_id, *memblock_end_id);
|
397 | |
|
398 | |
cleanup:
|
399 | |
closedir(dir_ptr);
|
400 | |
return status;
|
401 | |
}
|
402 | |
|
403 | |
static
|
404 | |
int change_numa_node_state(uint32_t node_id, uint64_t region_gpu_size,
|
405 | |
uint64_t memblock_size, mem_state_t new_state)
|
|
331 |
static
|
|
332 |
int change_numa_node_state(uint32_t node_id,
|
|
333 |
uint64_t base_addr,
|
|
334 |
uint64_t region_gpu_size,
|
|
335 |
uint64_t memblock_size,
|
|
336 |
mem_state_t new_state)
|
406 | 337 |
{
|
407 | 338 |
uint32_t memblock_id;
|
408 | 339 |
int status = 0, err_status = 0;
|
409 | 340 |
uint64_t blocks_changed = 0;
|
410 | |
uint32_t memblock_start_id = 0;
|
411 | |
uint32_t memblock_end_id = 0;
|
412 | |
|
413 | |
status = gather_memblock_ids_for_node(node_id, &memblock_start_id,
|
414 | |
&memblock_end_id);
|
415 | |
if (status < 0) {
|
416 | |
syslog(LOG_ERR, "NUMA: Failed to get all memblock ID's for node%d\n",
|
417 | |
node_id);
|
418 | |
return status;
|
419 | |
}
|
420 | |
|
421 | |
if (memblock_start_id > memblock_end_id) {
|
422 | |
syslog(LOG_ERR, "NUMA: Invalid memblock IDs were found for node%d\n",
|
423 | |
node_id);
|
424 | |
return -EINVAL;
|
425 | |
}
|
|
341 |
uint32_t memblock_start_id = base_addr / memblock_size;
|
|
342 |
uint32_t memblock_end_id = (base_addr + region_gpu_size) / memblock_size - 1;
|
426 | 343 |
|
427 | 344 |
SYSLOG_VERBOSE(LOG_DEBUG,
|
428 | 345 |
"NUMA: memblock ID range: %"PRIu32"-%"PRIu32
|
|
531 | 448 |
syslog(LOG_ERR, "NUMA: Probe ranges not aligned to memblock size!\n");
|
532 | 449 |
return -EFAULT;
|
533 | 450 |
}
|
|
451 |
|
|
452 |
if (access(MEMORY_PROBE_PATH_FMT, F_OK) != 0 && errno == ENOENT)
|
|
453 |
/*
|
|
454 |
* It is not an error when the 'probe' file is not found, since this
|
|
455 |
* situation is normal for systems where the driver handles probe of
|
|
456 |
* the NUMA memory.
|
|
457 |
*/
|
|
458 |
goto done;
|
534 | 459 |
|
535 | 460 |
for (start_addr = probe_base_addr;
|
536 | 461 |
start_addr + memblock_size <= numa_end_addr;
|
|
618 | 543 |
}
|
619 | 544 |
|
620 | 545 |
status = change_numa_node_state(numa_info_params.nid,
|
|
546 |
numa_info_params.numa_mem_addr,
|
621 | 547 |
numa_info_params.numa_mem_size,
|
622 | 548 |
numa_info_params.memblock_size,
|
623 | 549 |
NV_IOCTL_NUMA_STATUS_OFFLINE);
|
|
649 | 575 |
}
|
650 | 576 |
|
651 | 577 |
#define MEMORY_AUTO_ONLINE_WARNING_FMT \
|
652 | |
"NUMA: %s state is online and the default zone is not movable (%s).\n" \
|
|
578 |
"NUMA: " MEMBLK_FILE_FMT " state is online and the default zone is not movable (%s).\n" \
|
653 | 579 |
"This likely means that some non-NVIDIA software has auto-onlined\n" \
|
654 | 580 |
"the device memory before nvidia-persistenced could. Please check\n" \
|
655 | 581 |
"if the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config option\n" \
|
|
657 | 583 |
"/lib/udev/rules.d/."
|
658 | 584 |
|
659 | 585 |
static
|
660 | |
int check_memory_auto_online(uint32_t node_id, NvCfgBool *auto_online_success)
|
|
586 |
int check_memory_auto_online(uint32_t node_id,
|
|
587 |
uint64_t base_addr,
|
|
588 |
uint64_t region_gpu_size,
|
|
589 |
uint64_t memblock_size,
|
|
590 |
NvCfgBool *auto_online_success)
|
661 | 591 |
{
|
662 | 592 |
DIR *dir_ptr;
|
663 | 593 |
int status = 0;
|
664 | |
struct dirent *dir_entry;
|
665 | 594 |
char read_buf[BUF_SIZE];
|
666 | 595 |
char numa_file_path[BUF_SIZE];
|
667 | 596 |
char memory_file_path[BUF_SIZE];
|
668 | 597 |
int num_memory_node_in_dir = 0;
|
669 | 598 |
int num_memory_online_movable = 0;
|
|
599 |
uint32_t block_id;
|
|
600 |
uint32_t memblock_start_id = base_addr / memblock_size;
|
|
601 |
uint32_t memblock_end_id = (base_addr + region_gpu_size) / memblock_size - 1;
|
670 | 602 |
|
671 | 603 |
*auto_online_success = NVCFG_FALSE;
|
672 | 604 |
|
|
679 | 611 |
return -errno;
|
680 | 612 |
}
|
681 | 613 |
|
682 | |
/* Iterate through the node directory */
|
683 | |
while ((dir_entry = readdir(dir_ptr)) != NULL) {
|
684 | |
uint32_t block_id;
|
685 | |
|
686 | |
/* Skip entries that are not a memory node */
|
687 | |
if (get_memblock_id_from_dirname(dir_entry->d_name, &block_id) < 0) {
|
688 | |
continue;
|
689 | |
}
|
|
614 |
/* Iterate through the blocks */
|
|
615 |
for (block_id = memblock_start_id; block_id <= memblock_end_id; block_id++) {
|
690 | 616 |
|
691 | 617 |
num_memory_node_in_dir++;
|
692 | 618 |
|
|
696 | 622 |
read_buf, sizeof(read_buf));
|
697 | 623 |
if (status < 0) {
|
698 | 624 |
syslog(LOG_ERR,
|
699 | |
"NUMA: Failed to read %s state\n", dir_entry->d_name);
|
|
625 |
"NUMA: Failed to read " MEMBLK_FILE_FMT " state\n", block_id);
|
700 | 626 |
goto cleanup;
|
701 | 627 |
}
|
702 | 628 |
|
|
712 | 638 |
read_buf, sizeof(read_buf));
|
713 | 639 |
if (status < 0) {
|
714 | 640 |
syslog(LOG_ERR,
|
715 | |
"NUMA: Failed to read %s valid_zones\n",
|
716 | |
dir_entry->d_name);
|
|
641 |
"NUMA: Failed to read " MEMBLK_FILE_FMT " valid_zones\n",
|
|
642 |
block_id);
|
717 | 643 |
goto cleanup;
|
718 | 644 |
}
|
719 | 645 |
|
720 | 646 |
/* If memory was auto-onlined, check if valid_zones is Movable */
|
721 | 647 |
if (strstr(read_buf, VALID_MOVABLE_STATE) != read_buf) {
|
722 | 648 |
syslog(LOG_NOTICE, MEMORY_AUTO_ONLINE_WARNING_FMT,
|
723 | |
dir_entry->d_name, read_buf);
|
|
649 |
block_id, read_buf);
|
724 | 650 |
status = -ENOTSUP;
|
725 | 651 |
break;
|
726 | 652 |
} else {
|
|
851 | 777 |
|
852 | 778 |
/* Check if probed memory has been auto-onlined */
|
853 | 779 |
status = check_memory_auto_online(numa_info_params.nid,
|
|
780 |
numa_info_params.numa_mem_addr,
|
|
781 |
numa_info_params.numa_mem_size,
|
|
782 |
numa_info_params.memblock_size,
|
854 | 783 |
&auto_online_success);
|
855 | 784 |
if (status < 0) {
|
856 | 785 |
if (status != -ENOTSUP) {
|
|
870 | 799 |
}
|
871 | 800 |
|
872 | 801 |
status = change_numa_node_state(numa_info_params.nid,
|
|
802 |
numa_info_params.numa_mem_addr,
|
873 | 803 |
numa_info_params.numa_mem_size,
|
874 | 804 |
numa_info_params.memblock_size,
|
875 | 805 |
NV_IOCTL_NUMA_STATUS_ONLINE);
|