【Python实现】实时监测GPU,空闲时自动执行脚本

发布于:2024-06-05 ⋅ 阅读:(127) ⋅ 点赞:(0)

文章目录


代码

# author: muzhan
# contact: levio.pku@gmail.com
import os
import sys
import time
# Shell command to execute once the GPU is idle.
cmd = 'nohup python -u train_post_2d_aut.py > output1.log &'
def gpu_info(gpu_index=2, power_all=250, memory_all=12288, raw_output=None):
    """Return the free power and free memory of one GPU.

    The values are read from the per-GPU status rows of ``nvidia-smi``
    (the rows containing a ``%`` sign), e.g.::

        | 30%   40C    P8    10W / 250W |   0MiB / 12288MiB |   0%   Default |

    Args:
        gpu_index: Zero-based position of the GPU's status row in the
            ``nvidia-smi`` table. Defaults to 2.
        power_all: Maximum power used as the reference for "free" power,
            in the unit nvidia-smi prints before ``W``. Defaults to 250.
        memory_all: Maximum memory used as the reference for "free"
            memory, in the unit nvidia-smi prints before ``MiB``.
            Defaults to 12288.
        raw_output: Optional captured ``nvidia-smi`` text to parse instead
            of running the command (useful for replaying saved output).

    Returns:
        A ``(power_free, memory_free)`` tuple of ints, computed as the
        given maxima minus the currently used power and memory.

    Raises:
        IndexError: If there is no status row for ``gpu_index``.
        ValueError: If the row does not contain integer power/memory
            usage values (for example when nvidia-smi prints ``N/A``).
    """
    if raw_output is None:
        # Close the pipe explicitly instead of leaving it to the garbage
        # collector.
        with os.popen('nvidia-smi | grep %') as pipe:
            raw_output = pipe.read()

    # Keep only the rows containing '%', one per GPU. This repeats the
    # grep filter so that unfiltered nvidia-smi text can be passed in too.
    gpu_rows = [row for row in raw_output.splitlines() if '%' in row]
    if not 0 <= gpu_index < len(gpu_rows):
        raise IndexError(
            'GPU %d not found: nvidia-smi reported %d GPU status row(s)'
            % (gpu_index, len(gpu_rows)))

    # A row splits into ['', <fan/temp/perf/power>, <memory>, <util>, ''].
    fields = gpu_rows[gpu_index].split('|')
    try:
        # Used power is the last whitespace-separated token before the
        # first 'W' on the left-hand side of "used / cap".
        gpu_power = int(fields[1].split('/')[0].split('W')[0].split()[-1])
        # Used memory is the number before "MiB" on the left of the '/'.
        gpu_memory = int(fields[2].split('/')[0].split('M')[0].strip())
    except (IndexError, ValueError):
        raise ValueError(
            'cannot parse power/memory usage from nvidia-smi row: %r'
            % gpu_rows[gpu_index])

    # Free capacity = reference maximum - current usage.
    power_free = power_all - gpu_power
    memory_free = memory_all - gpu_memory
    return power_free, memory_free
def narrow_setup(interval=2, memory_threshold=7000, power_threshold=175):
    """Poll the GPU until it is idle, then run the command stored in ``cmd``.

    The GPU counts as idle when both the free memory and the free power
    reported by ``gpu_info()`` are strictly greater than their thresholds.
    While waiting, a one-line status (free memory, free power and a small
    animated progress bar) is rewritten in place on stdout.

    Args:
        interval: Seconds to sleep between two checks. Defaults to 2.
        memory_threshold: Free memory (MiB) that must be exceeded.
            Defaults to 7000.
        power_threshold: Free power (W) that must be exceeded.
            Defaults to 175.
    """
    i = 0  # progress-bar position, cycles through 0..9
    while True:
        # Query once per iteration and launch immediately on an idle
        # reading, rather than sleeping on a reading that already passed.
        power_free, memory_free = gpu_info()
        if memory_free > memory_threshold and power_free > power_threshold:
            break

        symbol = 'monitoring: ' + '>' * i + ' ' * (10 - i - 1) + '|'
        gpu_power_str = 'gpu power_free:%d W |' % power_free
        gpu_memory_str = 'gpu memory_free:%d MiB |' % memory_free
        # '\r' returns to the start of the line so the status is
        # overwritten in place instead of scrolling.
        sys.stdout.write('\r' + gpu_memory_str + ' ' + gpu_power_str + ' ' + symbol)
        sys.stdout.flush()
        time.sleep(interval)
        i = (i + 1) % 10

    # Start on a fresh line so the status line is not overwritten.
    print('\n' + cmd)
    os.system(cmd)

# Start monitoring only when this file is run as a script, not on import.
if __name__ == '__main__':
    narrow_setup()

网站公告

今日签到

点亮在社区的每一天
去签到