import os
import time
import signal
import shutil
import numpy as np
import pandas as pd
from collections import OrderedDict
from spearmint.ExperimentGrid import GridMap
from spearmint.chooser import GPEIOptChooser as module
from .bayopt_base import BayoptBase
grid_size = 20000
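# Number of candidate points in the unit-hypercube grid from which Spearmint's
# chooser picks the next configuration (see GridMap.hypercube_grid in _run below).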
def set_timeout(func):
def handle(signum, frame):
raise RuntimeError
def to_do(*args, **kwargs):
try:
signal.signal(signal.SIGALRM, handle)
            signal.alarm(int(args[0].time_out))  # signal.alarm only accepts whole seconds
r = func(*args, **kwargs)
signal.alarm(0)
return r
except RuntimeError:
            raise Exception("Time out in generating the next run.")
return to_do
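# Note: ``set_timeout`` is meant to decorate bound methods of objects exposing a
# ``time_out`` attribute (``args[0]`` is ``self``); see ``GPEIOPT._spmint_opt`` below.
# The SIGALRM-based alarm only works on Unix-like systems and in the main thread.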
class GPEIOPT(BayoptBase):
"""
    Interface of Gaussian Process - Expected Improvement (Bayesian Optimization), implemented via the Spearmint package.
Parameters
----------
:type para_space: dict or list of dictionaries
    :param para_space: The hyperparameter search space. Each parameter is specified by one of three types:
Continuous:
            Specify `Type` as `continuous`, and include the keys `Range` (a two-element list giving the lower and upper bounds) and
            `Wrapper`, a callable used to transform the sampled values.
Integer:
            Specify `Type` as `integer`, and include the key `Mapping` (a list with all the sorted integer elements).
Categorical:
            Specify `Type` as `categorical`, and include the key `Mapping` (a list with all the possible categories).
            A sketch mixing all three types is given at the end of the Examples section.
:type max_runs: int, optional, default=100
    :param max_runs: The maximum number of trials to be evaluated. When this value is reached,
        the algorithm stops.
:type time_out: float, optional, default=10
:param time_out: The time out threshold (in seconds) for generating the next run.
:type estimator: estimator object
:param estimator: This is assumed to implement the scikit-learn estimator interface.
:type cv: cross-validation method, an sklearn object.
    :param cv: e.g., `StratifiedKFold` or `KFold` can be used.
:type scoring: string, callable, list/tuple, dict or None, optional, default = None
    :param scoring: An sklearn-style scoring function.
If None, the estimator's default scorer (if available) is used. See the package `sklearn` for details.
:type refit: boolean, or string, optional, default=True
:param refit: It controls whether to refit an estimator using the best found parameters on the whole dataset.
:type random_state: int, optional, default=0
:param random_state: The random seed for optimization.
:type verbose: boolean, optional, default=False
:param verbose: It controls whether the searching history will be printed.
Examples
----------
>>> import numpy as np
>>> from sklearn import svm
>>> from sklearn import datasets
>>> from sequd import GPEIOPT
>>> from sklearn.model_selection import KFold
>>> iris = datasets.load_iris()
>>> ParaSpace = {'C':{'Type': 'continuous', 'Range': [-6, 16], 'Wrapper': np.exp2},
'gamma': {'Type': 'continuous', 'Range': [-16, 6], 'Wrapper': np.exp2}}
>>> estimator = svm.SVC()
>>> cv = KFold(n_splits=5, random_state=0, shuffle=True)
>>> clf = GPEIOPT(ParaSpace, max_runs=100, time_out=10,
estimator=estimator, cv=cv, scoring=None, refit=None, random_state=0, verbose=False)
>>> clf.fit(iris.data, iris.target)
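    A hypothetical parameter space mixing the three supported types (the names
    `max_depth` and `kernel` below are illustrative only) could look like:
    >>> ParaSpace = {'C': {'Type': 'continuous', 'Range': [-6, 16], 'Wrapper': np.exp2},
                     'max_depth': {'Type': 'integer', 'Mapping': [1, 2, 3, 4, 5]},
                     'kernel': {'Type': 'categorical', 'Mapping': ['linear', 'rbf']}}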
Attributes
----------
:vartype best_score\_: float
:ivar best_score\_: The best average cv score among the evaluated trials.
:vartype best_params\_: dict
    :ivar best_params\_: Parameters that achieve `best_score_`.
:vartype best_estimator\_: sklearn estimator
    :ivar best_estimator\_: The estimator refitted based on `best_params_`.
        Not available if `estimator=None` or `refit=False`.
:vartype search_time_consumed\_: float
    :ivar search_time_consumed\_: Seconds used for the whole searching procedure.
:vartype refit_time\_: float
:ivar refit_time\_: Seconds used for refitting the best model on the whole dataset.
        Not available if `estimator=None` or `refit=False`.
"""
def __init__(self, para_space, max_runs=100, time_out=10, estimator=None, cv=None,
scoring=None, refit=True, random_state=0, verbose=False):
super(GPEIOPT, self).__init__(para_space, max_runs, verbose)
self.cv = cv
self.refit = refit
self.scoring = scoring
self.estimator = estimator
self.random_state = random_state
self.time_out = time_out
self.method = "GPEI"
self.variables = {}
for item, values in self.para_space.items():
if (values['Type'] == "continuous"):
self.variables[item] = OrderedDict({'name': item,
'type': 'float',
'min': values['Range'][0],
'max': values['Range'][1],
'size': 1})
elif (values['Type'] == "integer"):
self.variables[item] = OrderedDict({'name': item,
'type': 'int',
'min': min(values['Mapping']),
'max': max(values['Mapping']),
'size': 1})
elif (values['Type'] == "categorical"):
self.variables[item] = OrderedDict({'name': item,
'type': 'enum',
'options': values['Mapping'],
'size': 1})
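        # For example, a para_space entry
        #     {'C': {'Type': 'continuous', 'Range': [-6, 16], 'Wrapper': np.exp2}}
        # is translated into the Spearmint variable description
        #     OrderedDict(name='C', type='float', min=-6, max=16, size=1).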
@set_timeout
def _spmint_opt(self, chooser, grid, values, grid_status):
        # The status of jobs: 0 - candidate, 1 - pending, 2 - complete.
        # Only statuses 0 and 2 occur here, as each evaluation finishes before the next candidate is requested.
job_id = chooser.next(grid, np.squeeze(values), [],
np.nonzero(grid_status == 0)[0],
np.nonzero(grid_status == 1)[0],
np.nonzero(grid_status == 2)[0])
return job_id
def obj_func(self, cfg):
next_params = pd.DataFrame(np.array([cfg]), columns=self.para_names)
parameters = {}
for item, values in self.para_space.items():
if (values['Type'] == "continuous"):
parameters[item] = values['Wrapper'](float(next_params[item].iloc[0]))
elif (values['Type'] == "integer"):
parameters[item] = int(next_params[item].iloc[0])
elif (values['Type'] == "categorical"):
                parameters[item] = next_params[item].iloc[0]
score = self.wrapper_func(parameters)
logs_aug = parameters
logs_aug.update({"score": score})
logs_aug = pd.DataFrame(logs_aug, index=[self.iteration])
self.logs = pd.concat([self.logs, logs_aug]).reset_index(drop=True)
if self.verbose:
self.pbar.update(1)
self.iteration += 1
self.pbar.set_description("Iteration %d:" % self.iteration)
self.pbar.set_postfix_str("Current Best Score = %.5f" % (self.logs.loc[:, "score"].max()))
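        # Spearmint's chooser minimizes the objective, so the (higher-is-better)
        # cross-validation score is negated before being returned.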
return -score
def _run(self, wrapper_func):
"""
Main loop for searching the best hyperparameters.
"""
file_dir = "./temp/" + str(time.time()) + str(np.random.rand(1)[0]) + "/"
os.makedirs(file_dir)
self.wrapper_func = wrapper_func
chooser = module.init(file_dir, "mcmc_iters=10")
vkeys = [k for k in self.variables]
gmap = GridMap([self.variables[k] for k in vkeys], grid_size)
grid = np.asarray(gmap.hypercube_grid(grid_size, 1))
values = np.zeros(grid_size) + np.nan
grid_status = np.zeros(grid.shape[0])
        for i in range(int(self.max_runs)):
try:
job_id = self._spmint_opt(chooser, grid, values, grid_status)
except Exception:
print('Spearmint Early Stop!')
break
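            # chooser.next may return a (job_id, candidate) tuple when it proposes
            # a point outside the pre-generated grid; the new candidate is then
            # appended to the grid and immediately marked as complete.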
if isinstance(job_id, tuple):
(job_id, candidate) = job_id
grid = np.vstack((grid, candidate))
grid_status = np.append(grid_status, 2)
values = np.append(values, np.zeros(1) + np.nan)
job_id = grid.shape[0] - 1
else:
candidate = grid[job_id, :]
grid_status[job_id] = 2
next_params = gmap.unit_to_list(np.clip(candidate, 0, 1))
values[job_id] = self.obj_func(next_params)
shutil.rmtree(file_dir)