# Source code for themis_ml.datasets.datasets

"""Datasets for Fairness-aware Analysis or Modeling."""

import pandas as pd

from pathlib2 import Path
from os.path import dirname

from .german_credit_data_map import (
    german_credit_variable_map, preprocess_german_credit_data)
from .census_income_data_map import (
    preprocess_census_income_data, census_income_variable_map)
from .data_types import VariableType


def _data_path():
    """Return the path of the ``data`` directory located next to this module."""
    module_dir = Path(dirname(__file__))
    return module_dir.joinpath("data")


def _map_transformer(series, variable_map):
    """Translate the coded values of one column into human-readable ones.

    The entry of ``variable_map`` keyed by ``series.name`` supplies a
    ``transformer`` attribute, which is handled as follows:

    - ``None``: the series is returned as-is (same object).
    - subscriptable (e.g. a dict): each value is looked up with
      ``transformer[value]``, so a value missing from the lookup raises
      instead of silently becoming NaN.
    - anything whose subscript attempt raises ``TypeError`` (e.g. a plain
      function): it is handed directly to ``series.map``.

    :param series: pandas Series whose ``name`` is a key of ``variable_map``.
    :param variable_map: mapping from column name to an object exposing a
        ``transformer`` attribute.
    :returns: the original series or a new, mapped series.
    """
    transformer = variable_map[series.name].transformer
    if transformer is None:
        return series

    def _strict_lookup(code):
        # Item access (rather than passing the lookup to ``map``) keeps
        # unknown codes loud.
        return transformer[code]

    try:
        mapped = series.map(_strict_lookup)
    except TypeError:
        # The transformer could not be subscripted; let ``map`` use it
        # directly.
        mapped = series.map(transformer)
    return mapped


def _apply_data_map(df, variable_map):
    """Run every column of ``df`` through ``_map_transformer``.

    :param df: pandas DataFrame whose column names are keys of
        ``variable_map``.
    :param variable_map: mapping from column name to variable definition,
        forwarded unchanged to ``_map_transformer``.
    :returns: the result of ``df.apply`` over the transformed columns.
    """
    def _transform_column(column):
        return _map_transformer(column, variable_map=variable_map)

    return df.apply(_transform_column)


def german_credit(raw=False):
    """Load German Credit Dataset.

    The target variable is "credit_risk", where 0 = bad and 1 = good.

    The CSV is always passed through ``_apply_data_map`` with the German
    credit variable map, so the "raw" data has already had that map applied.

    :param bool raw: If True, return raw data, otherwise return model-ready
        data. The model-ready data has columns arranged in the order of:

        - numeric features.
        - ordered categorical features.
        - binary features.
        - non-ordered categorical features.
        - target.

        Note: Raw data does not have this ordering, nor does it have
        dummified categorical variables.
    :returns: DataFrame of raw or model-ready data.
    """
    # ``str(...)`` keeps the call working with path objects that
    # ``pd.read_csv`` may not accept directly.
    csv_file = str(_data_path() / "german_credit.csv")
    out = _apply_data_map(
        pd.read_csv(csv_file),
        german_credit_variable_map.variable_map)
    if raw:
        return out
    return preprocess_german_credit_data(out)
def census_income(raw=False):
    """Load Census Income Data from 1994 - 1995.

    The target variable is "income_gt_50k" (income above $50,000), where 0 is
    below and 1 is above.

    The training and test CSV files are both read with the column names from
    the census income variable map, passed through ``_apply_data_map``, and
    concatenated (training rows first). A ``dataset_partition`` column marks
    each row as ``"training_set"`` or ``"test_set"``.

    :param bool raw: if True, return raw data, otherwise return model-ready
        data. The model-ready data has columns arranged in the order of:

        - numeric features.
        - ordered categorical features.
        - binary features.
        - non-ordered categorical features.
        - target.

    :returns: DataFrame of raw or model-ready data.
    """
    # (file name, partition label) pairs; order determines the row order of
    # the concatenated frame.
    sources = (
        ("census_income_1994_1995_train.csv", "training_set"),
        ("census_income_1994_1995_test.csv", "test_set"),
    )
    partitions = []
    for file_name, partition_label in sources:
        frame = pd.read_csv(
            str(_data_path() / file_name),
            names=census_income_variable_map.all_variables)
        frame = frame.pipe(
            _apply_data_map, census_income_variable_map.variable_map)
        partitions.append(frame.assign(dataset_partition=partition_label))
    out = pd.concat(partitions)
    if raw:
        return out
    return preprocess_census_income_data(out)