#!/usr/bin/env python
import pandas as pd
import numpy as np
from typing import List
class DataSet(pd.DataFrame):
    """A representation of a dataset.

    This is basically a pandas dataframe whose columns are a MultiIndex with
    a ``NAME`` level, a ``TYPE`` level (``"DATA"`` or ``"METADATA"``) and an
    optional ``UNITS`` level.  Columns typed ``"METADATA"`` are removed when
    the data is converted to a numpy array with :meth:`data_to_numpy`.

    Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). Arithmetic operations
    align on both row and column labels. Can be thought of as a dict-like
    container for Series objects. The primary pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, or list-like objects.
        When a dict is passed, every key must be a ``(name, type)`` tuple
        where ``type`` is ``"DATA"`` or ``"METADATA"``; ``columns`` and
        ``metadata_columns`` are then derived from the keys.
    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided.
    columns : Index or array-like
        Column labels to use for resulting frame. A ``pandas.MultiIndex``
        is used as-is; any other sequence is treated as the column names
        and expanded into the NAME/TYPE(/UNITS) MultiIndex.
    metadata_columns : str or array-like, optional
        Names of the metadata columns that are already contained in the
        ``columns`` parameter. A single name may be passed as a string.
    units : array-like, optional
        One unit per column. When given, a third ``UNITS`` column level is
        added.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : boolean, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input.

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    DataFrame.from_items : From sequence of (key, value) pairs
        pandas.read_csv, pandas.read_table, pandas.read_clipboard.

    Examples
    --------
    >>> data_columns = ["tau", "equiv_pldn", "conc_dfnb", "temperature"]
    >>> metadata_columns = ["strategy"]
    >>> columns = data_columns + metadata_columns
    >>> values = [[1.5, 0.5, 0.1, 30.0, "test"]]
    >>> ds = DataSet(values, columns=columns, metadata_columns="strategy")
    >>> values = {
    ...     ("tau", "DATA"): [1.5, 10.0],
    ...     ("equiv_pldn", "DATA"): [0.5, 3.0],
    ...     ("conc_dfnb", "DATA"): [0.1, 4.0],
    ...     ("temperature", "DATA"): [30.0, 100.0],
    ...     ("strategy", "METADATA"): ["test", "test"],
    ... }
    >>> ds = DataSet(values)

    Notes
    -----
    Based on https://notes.mikejarrett.ca/storing-metadata-in-pandas-dataframes/
    """

    def __init__(
        self,
        data=None,
        index=None,
        columns=None,
        metadata_columns=None,
        units=None,
        dtype=None,
        copy=False,
    ):
        if isinstance(data, dict):
            # Column names and types come from the (name, type) dictionary
            # keys; anything passed via `columns`/`metadata_columns` is
            # replaced.
            # NOTE(review): combining dict data with `units` produces 3-level
            # columns while the dict keys stay 2-tuples - confirm that
            # combination is supported by callers before relying on it.
            columns = []
            metadata_columns = []
            for column_tuple in data:
                if (
                    not isinstance(column_tuple, (list, tuple))
                    or len(column_tuple) < 2
                ):
                    raise ValueError(
                        "Dictionary keys must have the column name and column type as a tuple"
                    )
                if column_tuple[1] not in ("DATA", "METADATA"):
                    raise ValueError(
                        f"{column_tuple} must be either a DATA or METADATA column"
                    )
                columns.append(column_tuple[0])
                if column_tuple[1] == "METADATA":
                    metadata_columns.append(column_tuple[0])

        # A ready-made MultiIndex is trusted as-is; plain labels are expanded
        # into the NAME/TYPE(/UNITS) MultiIndex.
        if columns is not None and not isinstance(columns, pd.MultiIndex):
            columns = DataSet._make_columns(columns, metadata_columns, units)

        super().__init__(
            data=data, index=index, columns=columns, dtype=dtype, copy=copy
        )

    @staticmethod
    def _make_columns(column_names, metadata_columns=None, units=None):
        """Build the NAME/TYPE(/UNITS) column MultiIndex from plain labels."""
        if metadata_columns is None:
            metadata = []
        elif isinstance(metadata_columns, str):
            # A bare string would otherwise be matched by substring.
            metadata = [metadata_columns]
        else:
            metadata = list(metadata_columns)

        names = list(column_names)
        types = ["METADATA" if name in metadata else "DATA" for name in names]
        arrays = [names, types]
        level_names = ["NAME", "TYPE"]

        if units is not None and len(units) > 0:
            unit_list = list(units)
            if len(unit_list) != len(names):
                # zip() below would silently drop columns otherwise.
                raise ValueError(
                    "units must contain exactly one entry per column"
                )
            arrays.append(unit_list)
            level_names.append("UNITS")

        tuples = list(zip(*arrays))
        if not tuples:
            # from_tuples cannot infer the number of levels from an empty list.
            return pd.MultiIndex.from_arrays(
                [[] for _ in level_names], names=level_names
            )
        return pd.MultiIndex.from_tuples(tuples, names=level_names)

    @staticmethod
    def from_df(df: pd.DataFrame, metadata_columns: List = None, units: List = None):
        """Create Dataset from a pandas dataframe

        Arguments
        ----------
        df: pandas.DataFrame
            Dataframe to be converted to a DataSet
        metadata_columns: list, optional
            names of the columns in the dataframe that are metadata columns
        units: list, optional
            A list of objects representing the units of the columns

        Returns
        -------
        DataSet
            A new DataSet holding the values and index of ``df``.
        """
        columns = DataSet._make_columns(
            df.columns.to_numpy(), metadata_columns, units
        )
        return DataSet(df.to_numpy(), columns=columns, index=df.index)

    @staticmethod
    def read_csv(filepath_or_buffer, **kwargs):
        """Create a DataSet from a csv

        ``header`` defaults to ``[0, 1]`` and ``index_col`` to ``0``; every
        keyword argument is forwarded to ``pandas.read_csv``.
        """
        kwargs.setdefault("header", [0, 1])
        kwargs.setdefault("index_col", 0)
        df = pd.read_csv(filepath_or_buffer, **kwargs)
        return DataSet(df.to_numpy(), columns=df.columns, index=df.index)

    def to_dict(self, **kwargs):
        """Convert to a dictionary, using ``orient="split"`` by default.

        All keyword arguments are forwarded to ``DataFrame.to_dict``.
        """
        kwargs.setdefault("orient", "split")
        return super().to_dict(**kwargs)

    @classmethod
    def from_dict(cls, d):
        """Rebuild a DataSet from a dictionary with ``columns``, ``index``
        and ``data`` entries, as produced by :meth:`to_dict`.

        Each entry of ``d["columns"]`` is read as ``(name, type)`` or
        ``(name, type, units)``.
        """
        column_entries = list(d["columns"])
        columns = [c[0] for c in column_entries]
        metadata_columns = [c[0] for c in column_entries if c[1] == "METADATA"]
        # Restore the UNITS level only when every column carries one.
        units = None
        if column_entries and all(len(c) > 2 for c in column_entries):
            units = [c[2] for c in column_entries]
        return cls(
            d["data"],
            index=d["index"],
            columns=columns,
            metadata_columns=metadata_columns,
            units=units,
        )

    def zero_to_one(self, small_tol=1.0e-5, return_min_max=False) -> np.ndarray:
        """Scale the data columns between zero and one

        Each of the data columns is scaled between zero and one
        based on the maximum and minimum values of each column

        Arguments
        ---------
        small_tol: float, optional
            Scaled values whose absolute value is below this tolerance are
            set to exactly zero. This is used to prevent very small values
            that will cause issues in later calculations. Defaults to 1e-5.
        return_min_max: bool, optional
            Also return the per-column minimums and maximums.

        Returns
        -------
        scaled: numpy.ndarray
            A numpy array with the scaled data columns
            if return_min_max true returns a tuple of scaled, mins, maxes

        Notes
        -----
        This method does not change the internal values of the data columns in place.
        A column whose values are all equal is scaled to zeros.
        """
        values = self.data_to_numpy().astype(np.float64)
        maxes = np.max(values, axis=0)
        mins = np.min(values, axis=0)
        ranges = maxes - mins
        # Constant columns have a zero range; divide by one instead so they
        # scale to 0.0 rather than NaN.
        safe_ranges = np.where(ranges == 0, 1.0, ranges)
        scaled = (values - mins) / safe_ranges
        scaled[np.abs(scaled) < small_tol] = 0.0
        if return_min_max:
            return scaled, mins, maxes
        return scaled

    def standardize(
        self, small_tol=1.0e-5, return_mean=False, return_std=False, **kwargs
    ) -> np.ndarray:
        """Standardize data columns by removing the mean and scaling to unit variance

        The standard score of each data column is calculated as:
            z = (x - u) / s
        where `u` is the mean of the columns and `s` is the standard deviation of
        each data column

        Parameters
        ----------
        small_tol: float, optional
            Standardized values whose absolute value is below this tolerance
            are set to exactly zero. This is used to prevent very small
            values that will cause issues in later calculations.
            Defaults to 1e-5.
        return_mean: bool, optional
            Return an array with the mean of each column in the DataSet
        return_std: bool, optional
            Return an array with the standard deviation of each column
            in the DataSet
        mean: array, optional
            Pass a precalculated array of means for the columns
        std: array, optional
            Pass a precalculated array of standard deviations
            for the columns

        Returns
        -------
        standard: np.ndarray
            Numpy array of the standardized data columns. When
            ``return_mean`` and/or ``return_std`` are set, a tuple of
            ``(standard, mean, std)``, ``(standard, mean)`` or
            ``(standard, std)`` is returned instead.

        Notes
        -----
        This method does not change the internal values of the data columns in place.
        A column with zero standard deviation is standardized to zeros.
        """
        values = self.data_to_numpy().astype(np.float64)

        # Only compute the statistics the caller did not supply.
        mean = kwargs.get("mean")
        if mean is None:
            mean = np.mean(values, axis=0)
        sigma = kwargs.get("std")
        if sigma is None:
            sigma = np.std(values, axis=0)

        # Guard against division by zero for constant columns.
        divisor = np.asarray(sigma, dtype=np.float64)
        divisor = np.where(divisor == 0, 1.0, divisor)

        standard = (values - mean) / divisor
        standard[np.abs(standard) < small_tol] = 0.0

        if return_mean and return_std:
            return standard, mean, sigma
        if return_mean:
            return standard, mean
        if return_std:
            return standard, sigma
        return standard

    @property
    def _constructor(self):
        # Make pandas operations that build new frames return DataSet objects.
        return DataSet

    def __getitem__(self, key):
        """Allow ``ds["name"]`` to select a column by its NAME level alone."""
        columns = self.columns
        if (
            isinstance(key, str)
            and isinstance(columns, pd.MultiIndex)
            and "NAME" in columns.names
        ):
            matches = [column for column in columns if column[0] == key]
            if len(matches) > 1:
                raise ValueError("NAME level column labels must be unique")
            if matches:
                key = matches[0]
        return super().__getitem__(key)

    def _name_level_view(self, keep_level_name=True):
        """Return a copy whose columns are only the NAME level, or None when
        the columns are not a MultiIndex with a NAME level."""
        columns = self.columns
        if not (isinstance(columns, pd.MultiIndex) and "NAME" in columns.names):
            return None
        names = columns.get_level_values("NAME")
        view = self.copy()
        view.columns = names if keep_level_name else names.to_numpy()
        return view

    def __repr__(self):
        """Text representation showing only the NAME level of the columns."""
        view = self._name_level_view()
        if view is not None:
            # The view has flat columns, so this call ends in the parent repr.
            return view.__repr__()
        return super().__repr__()

    def __unicode__(self):
        # Kept for callers (and, presumably, older pandas releases) that
        # still use __unicode__ for text rendering.
        view = self._name_level_view()
        if view is not None:
            return view.__unicode__()
        parent_unicode = getattr(super(), "__unicode__", None)
        if parent_unicode is None:
            # The parent class has no __unicode__; fall back to its repr.
            return super().__repr__()
        return parent_unicode()

    def _repr_html_(self):
        """HTML representation showing only the NAME level of the columns."""
        view = self._name_level_view(keep_level_name=False)
        if view is not None:
            return view._repr_html_()
        return super()._repr_html_()

    def data_to_numpy(self) -> np.ndarray:
        """Return the values as a numpy array with the metadata columns removed"""
        result = super().to_numpy()
        keep = np.array(
            [column[1] != "METADATA" for column in self.columns], dtype=bool
        )
        return result[:, keep]

    @property
    def metadata_columns(self):
        """Names of the metadata columns"""
        return [column[0] for column in self.columns if column[1] == "METADATA"]

    @property
    def data_columns(self):
        """Names of the data columns"""
        return [column[0] for column in self.columns if column[1] == "DATA"]

    def insert(
        self, loc, column, value, type="DATA", units=None, allow_duplicates=False
    ):
        """Insert a column at position ``loc``.

        The column is labelled ``(column, type)``. When ``units`` is given
        and the columns carry a ``UNITS`` level, the label is
        ``(column, type, units)`` instead.
        """
        key = (column, type)
        columns = self.columns
        if (
            units is not None
            and isinstance(columns, pd.MultiIndex)
            and "UNITS" in columns.names
        ):
            key = (column, type, units)
        super().insert(loc, key, value, allow_duplicates=allow_duplicates)