Python Quick Start

Contents

Project

  • Create a working directory for the project.

  • Add a conf.py file that defines the configuration dictionary CNFG.

  • Pass CNFG to pycnfg.run().

Code

github repo

conf.py:

"""Configuration example.

Define 'sgd' and 'lgbm' pipelines, 'r2' and 'mse' metrics and 'train'/'test'
datasets, then run a workflow on the 'sgd' pipeline: fit, validate, optimize,
validate again, predict and dump.

The optimization step searches over ``hp_grid``:

* target transformer on/off.
* polynomial degree 1/2.

"""


import lightgbm
import mlshell
import pycnfg
import sklearn


# Candidate transformer for the regression target. It is one of the two
# options for 'estimate__transformer' in ``hp_grid`` below (the other is
# None, i.e. no target transformation).
target_transformer = sklearn.preprocessing.PowerTransformer(
    method='yeo-johnson', standardize=True, copy=True)

# Optimization hp ranges: maps a pipeline parameter name to the list of
# values to try. The same dictionary is passed to the workflow both as 'hp'
# (fit step) and as 'hp_grid' (optimize step).
# NOTE(review): keys look like scikit-learn style '<step>__<param>' paths
# into the pipeline built by mlshell - confirm against mlshell documentation.
hp_grid = {
    # Disabled examples of further tunable pre-processing parameters.
    # 'process_parallel__pipeline_numeric__transform_normal__skip': [False],
    # 'process_parallel__pipeline_numeric__scale_column_wise__quantile_range': [(1, 99)],
    # Polynomial feature degree: 1 or 2.
    'process_parallel__pipeline_numeric__add_polynomial__degree': [1, 2],
    # Target transformer off (None) / on (PowerTransformer defined above).
    'estimate__transformer': [None, target_transformer],

    # sgd
    # Disabled examples of regressor parameter ranges. Enabling them also
    # requires ``import numpy as np``, which this file does not import.
    # 'estimate__regressor__alpha': np.logspace(-2, -1, 10),
    # 'estimate__regressor__l1_ratio': np.linspace(0.1, 1, 10),
}

# Full configuration passed to ``pycnfg.run``. Layout, as used below:
#   {section_id: {configuration_id: {'priority', 'init', 'producer', 'steps'}}}
# Workflow steps refer to configured objects by the id
# '<section_id>__<configuration_id>' (e.g. 'pipeline__sgd', 'metric__r2').
# NOTE(review): 'priority' presumably sets the order in which sections are
# processed (path=1 ... workflow=6) - confirm against pycnfg documentation.
CNFG = {
    # Path section - specify project directory.
    'path': {
        'default': {
            'priority': 1,
            'init': pycnfg.find_path,
            'producer': pycnfg.Producer,
        }
    },
    # Logger section - create logger.
    'logger': {
        'default': {
            'priority': 2,
            'init': 'default',
            'producer': mlshell.LoggerProducer,
            'steps': [
                ('make',),
            ],
        }
    },
    # Pipeline section - specify pipelines creation/loading.
    'pipeline': {
        # Linear model with elastic-net penalty; the only pipeline used by
        # the workflow steps below.
        'sgd': {
            'priority': 3,
            'init': mlshell.Pipeline,
            'producer': mlshell.PipelineProducer,
            'steps': [
                ('make', {
                    'estimator_type': 'regressor',
                    'estimator': sklearn.linear_model.SGDRegressor(
                        penalty='elasticnet', l1_ratio=1, shuffle=False,
                        max_iter=1000, alpha=0.02, random_state=42),
                }),
            ],
        },
        # Gradient boosting alternative. It is configured here but no
        # workflow step below references 'pipeline__lgbm'.
        'lgbm': {
            'priority': 3,
            'init': mlshell.Pipeline,
            'producer': mlshell.PipelineProducer,
            'steps': [
                ('make', {
                    'estimator_type': 'regressor',
                    'estimator': lightgbm.LGBMRegressor(
                        num_leaves=2, min_data_in_leaf=60,
                        n_estimators=200, max_depth=-1, random_state=42),
                }),
            ],
        }
    },
    # Metric section - specify metric creation/loading.
    'metric': {
        'r2': {
            'priority': 4,
            'init': mlshell.Metric,
            'producer': mlshell.MetricProducer,
            'steps': [
                ('make', {
                    'score_func': sklearn.metrics.r2_score,
                    'greater_is_better': True,
                }),
            ],
        },
        'mse': {
            'priority': 4,
            'init': mlshell.Metric,
            'producer': mlshell.MetricProducer,
            'steps': [
                ('make', {
                    'score_func': sklearn.metrics.mean_squared_error,
                    # An error measure: lower values are better.
                    'greater_is_better': False,
                    # NOTE(review): presumably forwarded to ``score_func``.
                    # With squared=False scikit-learn returns the *root*
                    # mean squared error, so this metric would effectively
                    # be RMSE despite its 'mse' id. The ``squared`` argument
                    # is also deprecated in recent scikit-learn releases -
                    # confirm against the installed version.
                    'squared': False
                }),
            ],
        },
    },
    # Dataset section - specify dataset loading/preprocessing/splitting.
    'dataset': {
        # Training data: loaded, described, preprocessed and then split
        # into subsets (75% train, no shuffling).
        'train': {
            'priority': 5,
            'init': mlshell.Dataset,
            'producer': mlshell.DatasetProducer,
            'steps': [
                ('load', {'filepath': './data/train.csv'}),
                ('info',),
                ('preprocess', {'targets_names': ['wage'],
                                'categor_names': ['union', 'goodhlth',
                                                  'black', 'female',
                                                  'married', 'service']}),
                ('split', {'train_size': 0.75, 'shuffle': False, }),
            ],
        },
        # Test data: same preprocessing arguments as 'train', no 'split'
        # step.
        'test': {
            'priority': 5,
            'init': mlshell.Dataset,
            'producer': mlshell.DatasetProducer,
            'steps': [
                ('load', {'filepath': 'data/test.csv'}),
                ('info',),
                ('preprocess', {'categor_names': ['union', 'goodhlth',
                                                  'black', 'female',
                                                  'married', 'service'],
                                'targets_names': ['wage']}),
            ],
        },
    },
    # Workflow section - fit/predict pipelines on datasets, optimize/validate
    # metrics.
    'workflow': {
        'conf': {
            'priority': 6,
            'init': {},
            'producer': mlshell.Workflow,
            'steps': [
                # Fit the 'sgd' pipeline on the 'train' subset of the 'train'
                # dataset with the zero-position hp values from 'hp_grid'.
                ('fit', {
                    'pipeline_id': 'pipeline__sgd',
                    'dataset_id': 'dataset__train',
                    'subset_id': 'train',
                    'hp': hp_grid,
                }),
                # Validate the 'sgd' pipeline on the 'train' and 'test'
                # subsets of the 'train' dataset with the 'r2' and 'mse'
                # metrics (after fit).
                ('validate', {
                    'pipeline_id': 'pipeline__sgd',
                    'dataset_id': 'dataset__train',
                    'subset_id': ['train', 'test'],
                    'metric_id': ['metric__r2', 'metric__mse'],
                }),
                # Optimize the 'sgd' pipeline on the 'train' subset of the
                # 'train' dataset over the hp combinations from 'hp_grid'.
                # Score with the 'r2' and 'mse' metrics; refit on 'r2'.
                ('optimize', {
                    'pipeline_id': 'pipeline__sgd',
                    'dataset_id': 'dataset__train',
                    'subset_id': 'train',
                    'metric_id': ['metric__r2', 'metric__mse'],
                    'hp_grid': hp_grid,
                    # NOTE(review): presumably forwarded to the underlying
                    # scikit-learn search object - confirm in mlshell docs.
                    'gs_params': {
                        'n_iter': None,
                        'n_jobs': 1,
                        'refit': 'metric__r2',
                        # 3-fold cross-validation, shuffled reproducibly.
                        'cv': sklearn.model_selection.KFold(n_splits=3,
                                                            shuffle=True,
                                                            random_state=42),
                        'verbose': 1,
                        'pre_dispatch': 'n_jobs',
                        'return_train_score': True,
                    },
                }),
                # Validate the 'sgd' pipeline on the 'train' and 'test'
                # subsets of the 'train' dataset with the 'r2' and 'mse'
                # metrics (after optimization).
                ('validate', {
                    'pipeline_id': 'pipeline__sgd',
                    'dataset_id': 'dataset__train',
                    'subset_id': ['train', 'test'],
                    'metric_id': ['metric__r2', 'metric__mse'],
                }),
                # Predict with the 'sgd' pipeline on the whole 'test' dataset
                # (empty 'subset_id').
                ('predict', {
                    'pipeline_id': 'pipeline__sgd',
                    'dataset_id': 'dataset__test',
                    'subset_id': '',
                }),
                # Dump the 'sgd' pipeline to disk.
                # NOTE(review): 'dirpath': None presumably selects a default
                # output directory - confirm in mlshell docs.
                ('dump', {'pipeline_id': 'pipeline__sgd',
                          'dirpath': None}),
            ],
        },
    },
}


if __name__ == '__main__':
    # Execute the configuration only when conf.py is run as a script.
    # ``dcnfg`` is the default configuration; it is left empty here because
    # CNFG above spells out every section explicitly.
    objects = pycnfg.run(CNFG, dcnfg={})

Shortened version using pycnfg features (a default configuration and section-level 'global' keys):

# Shortened configuration. It reuses the imports and ``hp_grid`` from the
# full conf.py and lists only the values that differ from the defaults.
CNFG = {
    # ``pycnfg.run`` is called with :data:`mlshell.CNFG` as the default
    # configuration (see the call below). It pre-defines the path and logger
    # sections and the main sub-keys, so they are omitted here.
    'pipeline': {
        'sgd': {
            'estimator': sklearn.linear_model.SGDRegressor(
                penalty='elasticnet', l1_ratio=1, shuffle=False,
                max_iter=1000, alpha=0.02, random_state=42),
        },
        'lgbm': {
            'estimator': lightgbm.LGBMRegressor(
                num_leaves=2, min_data_in_leaf=60, n_estimators=200,
                max_depth=-1, random_state=42),
        }
    },
    'metric': {
        'r2': {
            'score_func': sklearn.metrics.r2_score,
            'kwargs': {'greater_is_better': True},
        },
        'mse': {
            'score_func': sklearn.metrics.mean_squared_error,
            # NOTE(review): with squared=False scikit-learn returns the
            # *root* mean squared error, so this metric would effectively be
            # RMSE despite its 'mse' id - confirm.
            'kwargs': {
                'greater_is_better': False,
                'squared': False
            },
        },
    },
    'dataset': {
        # Section-level 'global' key: kwargs shared by the 'train' and 'test'
        # configurations.
        'global': {'targets_names': ['wage'],
                   'categor_names': ['union', 'goodhlth', 'black', 'female',
                                     'married', 'service'],
                   },
        'train': {
            'filepath': './data/train.csv',
            'split__kwargs': {'train_size': 0.75, 'shuffle': False},
        },
        'test': {
            'filepath': 'data/test.csv',
            # NOTE(review): presumably keeps the whole dataset in a single
            # subset - confirm in mlshell docs.
            'split__kwargs': {'train_size': 1},
        },
    },
    'workflow': {
        'conf': {
            # Global values replace the kwargs of the corresponding default
            # steps, which makes it easy to switch, for example, between
            # pipelines (pycnfg moves unknown keys to 'global' by default).
            'pipeline_id': 'pipeline__sgd',
            'dataset_id': 'dataset__train',
            # Step-specific override: only 'predict' uses the test dataset.
            'predict__dataset_id': 'dataset__test',
            'hp': hp_grid,
            'hp_grid': hp_grid,
            # Id of the 'conf' configuration in the 'gs_params' section
            # defined at the end of this dictionary.
            'gs_params': 'gs_params__conf',
            'metric_id': ['metric__r2', 'metric__mse'],
            'steps': [
                ('fit',),
                ('validate',),
                ('optimize',),
                ('validate',),
                ('predict',),
                ('dump',),
            ],
        },
    },
    # Separate section for the 'gs_params' kwarg of the optimize step.
    'gs_params': {
        'conf': {
            'priority': 3,
            'init': {
                'n_iter': None,
                'n_jobs': 1,
                'refit': 'metric__r2',
                # 3-fold cross-validation, shuffled reproducibly.
                'cv': sklearn.model_selection.KFold(n_splits=3,
                                                    shuffle=True,
                                                    random_state=42),
                'verbose': 1,
                'pre_dispatch': 'n_jobs',
                'return_train_score': True,
            },
        },
    },
}


if __name__ == '__main__':
    # Execute the configuration only when conf.py is run as a script.
    # ``mlshell.CNFG`` is passed as the default configuration; the shortened
    # CNFG relies on it for every section and sub-key it omits.
    objects = pycnfg.run(CNFG, dcnfg=mlshell.CNFG)