Source code for rsmtool.container

"""
Class to encapsulate data contained in multiple pandas DataFrames.

It represents each of the multiple data sources as a "dataset". Each
dataset has three properties:
- "name" : the name of the dataset
- "frame" : the pandas DataFrame that contains the actual data
- "path" : the path to the file on disk from which the data was read

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

from __future__ import annotations

import warnings
from copy import copy, deepcopy
from typing import Dict, Generator, List, Optional, Tuple, TypedDict

import pandas as pd


class DatasetDict(TypedDict, total=False):
    """Type definition for a dataset dictionary."""

    name: str
    frame: pd.DataFrame
    path: Optional[str]
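
# NOTE: illustrative sketch only, not part of the original module. At runtime a
# ``DatasetDict`` is just a plain dictionary, for example:
#
#     {"name": "train", "frame": pd.DataFrame({"score": [1, 2]}), "path": "train.csv"}
#
# Because the TypedDict is declared with ``total=False``, keys are optional for
# the type checker; ``add_dataset()`` still expects "name" and "frame" at runtime
# and reads "path" with ``.get()``.
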

class DataContainer:
    """Class to encapsulate datasets."""

    def __init__(self, datasets: Optional[List[DatasetDict]] = None):
        """
        Initialize a DataContainer object.

        Parameters
        ----------
        datasets : Optional[List[DatasetDict]]
            A list of dataset dictionaries. Each dict should have the
            following keys: "name" containing the name of the dataset,
            "frame" containing the dataframe object representing the
            dataset, and "path" containing the path to the file from
            which the frame was read.
        """
        self._names: List[str] = []
        self._dataframes: Dict[str, pd.DataFrame] = {}
        self._data_paths: Dict[str, Optional[str]] = {}

        if datasets is not None:
            for dataset_dict in datasets:
                self.add_dataset(dataset_dict, update=False)

    def __contains__(self, name: str) -> bool:
        """
        Check if the container object contains a dataset with a given name.

        Parameters
        ----------
        name : str
            The name to check in the container object.

        Returns
        -------
        key_check : bool
            ``True`` if a dataset with this name exists in the container
            object, else ``False``.
        """
        return name in self._names

    def __getitem__(self, name: str) -> pd.DataFrame:
        """
        Get the data frame for the dataset with the given name.

        Parameters
        ----------
        name : str
            The name for the dataset.

        Returns
        -------
        frame : pandas.DataFrame
            The data frame for the dataset with the given name.

        Raises
        ------
        KeyError
            If the name does not exist in the container.
        """
        return self.get_frame(name)

    def __len__(self) -> int:
        """
        Return the number of datasets in the container.

        Returns
        -------
        length : int
            The size of the container (i.e., the number of datasets).
        """
        return len(self._names)

    def __str__(self) -> str:
        """
        Return a string representation of the container.

        Returns
        -------
        container_names : str
            A comma-separated list of dataset names from the container.
        """
        return ", ".join(self._names)

    def __add__(self, other: DataContainer) -> DataContainer:
        """
        Add another container object to this instance.

        Return a new container object with datasets included in either
        of the two containers.

        Parameters
        ----------
        other : DataContainer
            The container object to add.

        Returns
        -------
        output : DataContainer
            New container object containing datasets included in this
            instance and the other instance.

        Raises
        ------
        KeyError
            If there are duplicate keys in the two containers.
        ValueError
            If the object being added is not a container.
        """
        if not isinstance(other, DataContainer):
            raise ValueError(f"Object must be a `DataContainer`, not {type(other)}.")

        # Make sure there are no duplicate keys
        common_keys = set(other._names).intersection(self._names)
        if common_keys:
            raise KeyError(
                f"The key(s) `{', '.join(common_keys)}` already exist in the container."
            )

        dicts = DataContainer.to_datasets(self)
        dicts.extend(DataContainer.to_datasets(other))
        return DataContainer(dicts)

    def __iter__(self) -> Generator[str, None, None]:
        """
        Iterate through the container keys (dataset names).

        Yields
        ------
        key
            A key (name) in the container dictionary.
        """
        for key in self.keys():
            yield key

    @staticmethod
    def to_datasets(data_container: DataContainer) -> List[DatasetDict]:
        """
        Convert container object to a list of dataset dictionaries.

        Each dictionary will contain the "name", "frame", and "path" keys.

        Parameters
        ----------
        data_container : DataContainer
            The container object to convert.

        Returns
        -------
        dataset_dicts : List[DatasetDict]
            A list of dataset dictionaries.
        """
        dataset_dicts: List[DatasetDict] = []
        for name in data_container.keys():
            dataset_dict = DatasetDict(
                {
                    "name": name,
                    "path": data_container.get_path(name),
                    "frame": data_container.get_frame(name),
                }
            )
            dataset_dicts.append(dataset_dict)
        return dataset_dicts

    def add_dataset(self, dataset_dict: DatasetDict, update: bool = False) -> None:
        """
        Add a new dataset (or update an existing one).

        Parameters
        ----------
        dataset_dict : DatasetDict
            The dataset dictionary to add or update with the "name",
            "frame", and "path" keys.
        update : bool
            Update an existing DataFrame, if ``True``.
            Defaults to ``False``.

        Raises
        ------
        KeyError
            If ``update`` is ``False`` and a dataset with the given name
            already exists in the container.
        """
        name = dataset_dict["name"]
        data_frame = dataset_dict["frame"]
        path = dataset_dict.get("path")

        if not update and name in self._names:
            raise KeyError(f"The name {name} already exists in the container dictionary.")

        if name not in self._names:
            self._names.append(name)

        self._dataframes[name] = data_frame
        self._data_paths[name] = path

        self.__setattr__(name, data_frame)

    def get_path(self, name: str, default: Optional[str] = None) -> Optional[str]:
        """
        Get the path for the dataset given the name.

        Parameters
        ----------
        name : str
            The name for the dataset.
        default : Optional[str]
            The default path to return if the named dataset does not exist.
            Defaults to ``None``.

        Returns
        -------
        path : Optional[str]
            The path for the named dataset.
        """
        if name not in self._names:
            return default
        return self._data_paths[name]

    def get_frame(
        self, name: str, default: Optional[pd.DataFrame] = None
    ) -> Optional[pd.DataFrame]:
        """
        Get the data frame given the dataset name.

        Parameters
        ----------
        name : str
            The name for the dataset.
        default : Optional[pandas.DataFrame]
            The default value to return if the named dataset does not exist.
            Defaults to ``None``.

        Returns
        -------
        frame : Optional[pandas.DataFrame]
            The data frame for the named dataset.
        """
        if name not in self._names:
            return default
        return self._dataframes[name]

    def get_frames(
        self, prefix: Optional[str] = None, suffix: Optional[str] = None
    ) -> Dict[str, pd.DataFrame]:
        """
        Get all data frames with a given prefix or suffix in their name.

        Note that the dataset names are lowercased before matching, so the
        prefix and suffix should be given in lowercase for the selection
        to be case-insensitive.

        Parameters
        ----------
        prefix : Optional[str]
            Only return frames with the given prefix. If ``None``, then
            do not exclude any frames based on their prefix.
            Defaults to ``None``.
        suffix : Optional[str]
            Only return frames with the given suffix. If ``None``, then
            do not exclude any frames based on their suffix.
            Defaults to ``None``.

        Returns
        -------
        frames : Dict[str, pandas.DataFrame]
            A dictionary with the data frames that contain the specified
            prefix and/or suffix in their corresponding names. The names
            are the keys and the frames are the values.
        """
        if prefix is None:
            prefix = ""

        if suffix is None:
            suffix = ""

        names = [
            name
            for name in self._names
            if name.lower().startswith(prefix) and name.lower().endswith(suffix)
        ]

        frames = {}
        for name in names:
            frames[name] = self._dataframes[name]
        return frames
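
    # Illustrative sketch (not part of the original module): given a container
    # holding hypothetical datasets named "train_features" and "test_features",
    #
    #     container.get_frames(prefix="train")      # -> {"train_features": <frame>}
    #     container.get_frames(suffix="features")   # -> both datasets
    #
    # returns only the frames whose lowercased names match the given lowercase
    # prefix and/or suffix.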

    def keys(self) -> List[str]:
        """
        Return the container keys (dataset names) as a list.

        Returns
        -------
        keys : List[str]
            A list of keys (names) in the container object.
        """
        return self._names

    def values(self) -> List[pd.DataFrame]:
        """
        Return all data frames as a list.

        Returns
        -------
        values : List[pandas.DataFrame]
            A list of all data frames in the container object.
        """
        return [self._dataframes[name] for name in self._names]

    def items(self) -> List[Tuple[str, pd.DataFrame]]:
        """
        Return the container items as a list of (name, frame) tuples.

        Returns
        -------
        items : List[Tuple[str, pandas.DataFrame]]
            A list of (name, frame) tuples in the container object.
        """
        return [(name, self._dataframes[name]) for name in self._names]

    def drop(self, name: str) -> DataContainer:
        """
        Drop a given dataset from the container and return instance.

        Parameters
        ----------
        name : str
            The name of the dataset to drop.

        Returns
        -------
        data_container : DataContainer
            The input container object with the dataset dropped.
        """
        if name not in self:
            warnings.warn(
                f"The name `{name}` is not in the container. "
                "No datasets will be dropped."
            )
        else:
            self._names.remove(name)
            self._dataframes.pop(name)
            self._data_paths.pop(name)
        return self

    def rename(self, name: str, new_name: str) -> DataContainer:
        """
        Rename a given dataset in the container and return instance.

        Parameters
        ----------
        name : str
            The name of the current dataset in the container object.
        new_name : str
            The new name for the dataset in the container object.

        Returns
        -------
        data_container : DataContainer
            The input container object with the dataset renamed.
        """
        if name not in self:
            warnings.warn(f"The name `{name}` is not in the container and cannot be renamed.")
        else:
            frame = self._dataframes[name]
            path = self._data_paths[name]
            self.add_dataset({"name": new_name, "frame": frame, "path": path}, update=True)
            self.drop(name)
        return self

    def copy(self, deep: bool = True) -> DataContainer:
        """
        Return a copy of the container object.

        Parameters
        ----------
        deep : bool
            If ``True``, create a deep copy of the underlying data frames.
            Defaults to ``True``.

        Returns
        -------
        data_container : DataContainer
            A copy of the input container object.
        """
        if deep:
            return deepcopy(self)
        return copy(self)
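

# Minimal usage sketch (illustrative only, not part of the original module).
# It builds a container from two hypothetical frames, looks datasets up by
# name, and merges two containers with ``+``.
if __name__ == "__main__":
    df_train = pd.DataFrame({"id": [1, 2], "score": [3, 4]})
    df_test = pd.DataFrame({"id": [3], "score": [2]})

    container = DataContainer(
        [
            {"name": "train", "frame": df_train, "path": "train.csv"},
            {"name": "test", "frame": df_test, "path": "test.csv"},
        ]
    )

    print(container)                    # "train, test"
    print("train" in container)         # True
    print(container["train"])           # the training frame
    print(container.get_path("test"))   # "test.csv"

    # Merging requires that the two containers have no dataset names in common.
    extra = DataContainer([{"name": "validation", "frame": df_test.copy(), "path": None}])
    merged = container + extra
    print(len(merged))                  # 3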