Module pysimt.utils.device
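
Device selection helpers: DeviceManager validates a device request string
('cpu', 'gpu' or 'Ngpu'), checks the CUDA environment and exposes the chosen
torch.device. Constructing a manager also fills the module-level DEVICE and
DEVICE_IDS globals.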

Source code
import re
import os
import shutil
import subprocess

import torch

# Populated by DeviceManager.__init__() once a device is selected
DEVICE = None
DEVICE_IDS = None


class DeviceManager:
    __errors = {
        'BadDeviceFormat': 'Device can be cpu, gpu or [N]gpu, e.g. 2gpu',
        'NoDevFiles': 'Make sure you requested a GPU resource from your cluster.',
        'NoSMI': 'nvidia-smi is not installed. Are you on the correct node?',
        'EnvVar': 'Please set CUDA_VISIBLE_DEVICES explicitly.',
        'NoMultiGPU': 'Multi-GPU not supported for now.',
        'NotEnoughGPU': 'You requested {} GPUs while you have access to only {}.',
    }

    def __init__(self, dev):
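        """Parse the device request string and validate the CUDA setup."""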
        self.dev = dev.lower()
        self.pid = os.getpid()
        self.req_cpu = False
        self.req_gpu = False
        self.req_n_gpu = 0
        self.req_multi_gpu = False
        self.nvidia_smi = None  # will hold the path to nvidia-smi, if found
        self.cuda_dev_ids = None

        # Accept 'cpu', 'gpu' or 'Ngpu' where N is a single digit >= 1
        if not re.match(r'(cpu|[1-9]?gpu)$', self.dev):
            raise RuntimeError(self.__errors['BadDeviceFormat'])

        if self.dev == 'cpu':
            self.req_cpu = True
            self.dev = torch.device('cpu')
        else:
            self.req_gpu = True
            if self.dev == 'gpu':
                self.req_n_gpu = 1
            else:
                self.req_n_gpu = int(self.dev[0])

            self.req_multi_gpu = self.req_n_gpu > 1

            # What the environment actually provides
            self.nvidia_smi = shutil.which('nvidia-smi')
            self.cuda_dev_ids = os.environ.get('CUDA_VISIBLE_DEVICES', None)

            if self.nvidia_smi is None:
                raise RuntimeError(self.__errors['NoSMI'])
            # Some schedulers (e.g. SGE) export this literal value when no GPU was granted
            if self.cuda_dev_ids == "NoDevFiles":
                raise RuntimeError(self.__errors['NoDevFiles'])
            elif self.cuda_dev_ids is None:
                raise RuntimeError(self.__errors['EnvVar'])

            # How many GPUs do we have access to?
            self.cuda_dev_ids = [int(de) for de in self.cuda_dev_ids.split(',')]

            # FIXME: Remove this once DataParallel works.
            if self.req_n_gpu > 1 or len(self.cuda_dev_ids) > 1:
                raise RuntimeError(self.__errors['NoMultiGPU'])

            if self.req_n_gpu > len(self.cuda_dev_ids):
                raise RuntimeError(
                    self.__errors['NotEnoughGPU'].format(
                        self.req_n_gpu, len(self.cuda_dev_ids)))
            else:
                self.cuda_dev_ids = self.cuda_dev_ids[:self.req_n_gpu]

            # Set master device (is always cuda:0 since we force env.var
            # restriction)
            self.dev = torch.device('cuda:0')

            global DEVICE, DEVICE_IDS
            DEVICE = self.dev
            DEVICE_IDS = self.cuda_dev_ids

    def get_cuda_mem_usage(self, name=True):
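        """Return this process' GPU memory usage as reported by nvidia-smi."""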
        if self.req_cpu:
            return None

        pr = subprocess.run([
            self.nvidia_smi,
            "--query-compute-apps=pid,gpu_name,used_memory",
            "--format=csv,noheader"], stdout=subprocess.PIPE, universal_newlines=True)

        for line in pr.stdout.strip().split('\n'):
            # nvidia-smi prints nothing when no compute process is running
            if not line:
                continue
            pid, gpu_name, usage = line.split(',')
            if int(pid) == self.pid:
                if name:
                    return '{} -> {}'.format(gpu_name.strip(), usage.strip())
                return usage.strip()

        return 'N/A'

    def __repr__(self):
        if self.req_cpu:
            return "DeviceManager(dev='cpu')"
        return "DeviceManager({}, n_gpu={})".format(self.dev, self.req_n_gpu)

Classes

class DeviceManager (dev)
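Parses a device request ('cpu', 'gpu' or 'Ngpu' with a single-digit N), checks
that nvidia-smi and CUDA_VISIBLE_DEVICES are available for GPU requests, and
stores the selected torch.device in self.dev. Multi-GPU requests currently
raise RuntimeError. Constructing an instance also sets the module-level DEVICE
and DEVICE_IDS globals. See the full source code above.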

Methods

def get_cuda_mem_usage(self, name=True)
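Returns the GPU memory used by the calling process, as reported by nvidia-smi:
None when running on CPU, 'N/A' when the process does not appear in the
compute-apps list, otherwise the usage string, prefixed with the GPU name when
name=True (the default). See the source above.

A short example; the printed values are hypothetical:

dm = DeviceManager('gpu')
print(dm.get_cuda_mem_usage())            # e.g. 'Tesla V100 -> 1024 MiB'
print(dm.get_cuda_mem_usage(name=False))  # e.g. '1024 MiB'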