# Source code for rsmtool.container

"""
Class to encapsulate data contained in multiple pandas DataFrames.

It represents each of the multiple data sources as a "dataset". Each
dataset is represented by three properties:
- "name" : the name of the data set
- "frame" : the pandas DataFrame that contains the actual data
- "path" : the path to the file on disk from which the data was read

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import warnings
from copy import copy, deepcopy


class DataContainer:
    """
    Ordered collection of named datasets.

    Every dataset has a unique name, a frame holding the data, and an
    optional path recording where the frame was read from. Names are
    kept in insertion order. Each frame is additionally exposed as an
    instance attribute named after its dataset.
    """

    def __init__(self, datasets=None):
        """
        Initialize a DataContainer object.

        Parameters
        ----------
        datasets : list of dicts, optional
            A list of dataset dictionaries. Each dict should have the
            following keys: "name" containing the name of the dataset,
            "frame" containing the dataframe object representing the
            dataset, and "path" containing the path to the file from
            which the frame was read.

        Raises
        ------
        KeyError
            If two of the given datasets share the same name.
        """
        self._names = []
        self._dataframes = {}
        self._data_paths = {}

        for dataset_dict in datasets or []:
            self.add_dataset(dataset_dict, update=False)

    def __contains__(self, name):
        """
        Check if the container object contains a dataset with a given name.

        Parameters
        ----------
        name : str
            The name to check in the container object.

        Returns
        -------
        key_check : bool
            ``True`` if a dataset with this name exists in the container
            object, else ``False``.
        """
        return name in self._names

    def __getitem__(self, name):
        """
        Get the data frame for the dataset with the given name.

        This delegates to ``get_frame()`` with its default, so a missing
        name yields ``None`` instead of raising.

        Parameters
        ----------
        name : str
            The name for the dataset.

        Returns
        -------
        frame : pandas DataFrame or None
            The data frame for the dataset with the given name, or
            ``None`` if no dataset with that name exists.
        """
        return self.get_frame(name)

    def __len__(self):
        """
        Return the number of datasets in the container.

        Returns
        -------
        length : int
            The size of the container (i.e. number of datasets).
        """
        return len(self._names)

    def __str__(self):
        """
        Return a string representation of the container.

        Returns
        -------
        container_names : str
            A comma-separated list of dataset names from the container.
        """
        return ", ".join(self._names)

    def __add__(self, other):
        """
        Add another container object to instance.

        Return a new container object with datasets included
        in either of the two containers. The frames themselves are
        shared with the operands, not copied.

        Parameters
        ----------
        other : DataContainer
            The container object to add.

        Returns
        -------
        output : DataContainer
            New container object containing datasets
            included in this instance and the other instance.

        Raises
        ------
        KeyError
            If there are duplicate keys in the two containers.
        ValueError
            If the object being added is not a container.

        """
        if not isinstance(other, DataContainer):
            raise ValueError(f"Object must be a `DataContainer`, not {type(other)}.")

        # Make sure there are no duplicate keys; sort them so that the
        # error message does not depend on set iteration order.
        common_keys = set(other._names).intersection(self._names)
        if common_keys:
            shared = ", ".join(sorted(map(str, common_keys)))
            raise KeyError(f"The key(s) `{shared}` already exist in the container.")

        dicts = DataContainer.to_datasets(self)
        dicts.extend(DataContainer.to_datasets(other))
        return DataContainer(dicts)

    def __iter__(self):
        """
        Iterate through the container keys (dataset names).

        Yields
        ------
        key
            A key (name) in the container dictionary.
        """
        # ``keys()`` hands back a snapshot, so datasets may be added or
        # dropped while iterating without disturbing the loop.
        yield from self.keys()

    @staticmethod
    def to_datasets(data_container):
        """
        Convert container object to a list of dataset dictionaries.

        Each dictionary will contain the "name", "frame", and
        "path" keys.

        Parameters
        ----------
        data_container : DataContainer
            The container object to convert.

        Returns
        -------
        datasets_dict : list of dicts
            A list of dataset dictionaries.
        """
        return [
            {
                "name": name,
                "path": data_container.get_path(name),
                "frame": data_container.get_frame(name),
            }
            for name in data_container.keys()
        ]

    def add_dataset(self, dataset_dict, update=False):
        """
        Add a new dataset (or update an existing one).

        The frame is also stored as an instance attribute named after
        the dataset, so a dataset whose name matches a method of this
        class shadows that method on this instance.

        Parameters
        ----------
        dataset_dict : dict
            The dataset dictionary to add or update
            with the "name", "frame", and "path" keys.
            The "path" key may be omitted, in which case the path
            is stored as ``None``.
        update : bool, optional
            Update an existing DataFrame, if ``True``.
            Defaults to ``False``.

        Raises
        ------
        KeyError
            If ``update`` is ``False`` and a dataset with the same
            name already exists, or if "name" or "frame" is missing
            from ``dataset_dict``.
        """
        name = dataset_dict["name"]
        data_frame = dataset_dict["frame"]
        path = dataset_dict.get("path")

        already_present = name in self._names
        if already_present and not update:
            raise KeyError(f"The name {name} already exists in the container dictionary.")

        # An updated dataset keeps its original position in the ordering.
        if not already_present:
            self._names.append(name)

        self._dataframes[name] = data_frame
        self._data_paths[name] = path

        setattr(self, name, data_frame)

    def get_path(self, name, default=None):
        """
        Get the path for the dataset given the name.

        Parameters
        ----------
        name : str
            The name for the dataset.
        default : str, optional
            The default path to return if the named dataset does not exist.
            Defaults to ``None``.

        Returns
        -------
        path : str
            The path for the named dataset.
        """
        if name not in self._names:
            return default
        return self._data_paths[name]

    def get_frame(self, name, default=None):
        """
        Get the data frame given the dataset name.

        Parameters
        ----------
        name : str
            The name for the dataset.
        default : pandas DataFrame, optional
            The default value to return if the named dataset does not exist.
            Defaults to ``None``.

        Returns
        -------
        frame : pandas DataFrame
            The data frame for the named dataset.
        """
        if name not in self._names:
            return default
        return self._dataframes[name]

    def get_frames(self, prefix=None, suffix=None):
        """
        Get all data frames with a given prefix or suffix in their name.

        Note that the selection by prefix or suffix is case-insensitive.

        Parameters
        ----------
        prefix : str, optional
            Only return frames with the given prefix. If ``None``, then
            do not exclude any frames based on their prefix.
            Defaults to ``None``.
        suffix : str, optional
            Only return frames with the given suffix. If ``None``, then
            do not exclude any frames based on their suffix.
            Defaults to ``None``.

        Returns
        -------
        frames : dict
            A dictionary with the data frames that contain the specified
            prefix and/or suffix in their corresponding names. The names
            are the keys and the frames are the values.
        """
        # Lowercase both sides of the comparison; lowering only the dataset
        # name would make any pattern containing uppercase never match.
        prefix = "" if prefix is None else prefix.lower()
        suffix = "" if suffix is None else suffix.lower()

        frames = {}
        for name in self._names:
            lowered = name.lower()
            if lowered.startswith(prefix) and lowered.endswith(suffix):
                frames[name] = self._dataframes[name]
        return frames

    def keys(self):  # noqa: D402
        """
        Return the container keys (dataset names) as a list.

        Returns
        -------
        keys : list
            A new list of keys (names) in the container object. Changing
            it does not affect the container.
        """
        return list(self._names)

    def values(self):
        """
        Return all data frames as a list.

        Returns
        -------
        values : list
            A list of all data frames in the container object.
        """
        return [self._dataframes[name] for name in self._names]

    def items(self):
        """
        Return the container items as a list of (name, frame) tuples.

        Returns
        -------
        items : list of tuples
            A list of (name, frame) tuples in the container object.
        """
        return [(name, self._dataframes[name]) for name in self._names]

    def drop(self, name):
        """
        Drop a given dataset from the container and return instance.

        A warning is issued, and nothing is changed, if no dataset with
        the given name exists.

        Parameters
        ----------
        name : str
            The name of the dataset to drop.

        Returns
        -------
        self
        """
        if name not in self:
            warnings.warn(
                f"The name `{name}` is not in the container. " f"No datasets will be dropped."
            )
            return self

        self._names.remove(name)
        frame = self._dataframes.pop(name)
        self._data_paths.pop(name)

        # Also remove the attribute created by ``add_dataset()`` so that the
        # dropped frame is not still reachable as ``container.<name>``. The
        # identity check avoids deleting an unrelated attribute.
        attributes = vars(self)
        if name in attributes and attributes[name] is frame:
            del attributes[name]
        return self

    def rename(self, name, new_name):
        """
        Rename a given dataset in the container and return instance.

        The renamed dataset moves to the end of the ordering. If a
        dataset called ``new_name`` already exists, it is overwritten.
        Renaming a dataset to its current name is a no-op. A warning is
        issued if no dataset called ``name`` exists.

        Parameters
        ----------
        name : str
            The name of the current dataset in the container object.
        new_name : str
            The new name for the dataset in the container object.

        Returns
        -------
        self
        """
        if name not in self:
            warnings.warn(f"The name `{name}` is not in the container and cannot be renamed.")
        elif new_name != name:
            # Without this guard the dataset would be re-added under the
            # same name and then dropped, i.e. silently deleted.
            frame = self._dataframes[name]
            path = self._data_paths[name]
            self.add_dataset({"name": new_name, "frame": frame, "path": path}, update=True)
            self.drop(name)
        return self

    def copy(self, deep=True):
        """
        Return a copy of the container object.

        Parameters
        ----------
        deep : bool, optional
            If ``True``, create a deep copy of the underlying data frames.
            If ``False``, the new container shares the frames with this
            one but has its own bookkeeping, so adding or dropping
            datasets on one container does not affect the other.
            Defaults to ``True``.

        Returns
        -------
        container : DataContainer
            The copied container object.
        """
        if deep:
            return deepcopy(self)

        duplicate = copy(self)
        # ``copy()`` only duplicates the attribute table, so the name list
        # and the two lookup dicts would still be shared with ``self``.
        duplicate._names = list(self._names)
        duplicate._dataframes = dict(self._dataframes)
        duplicate._data_paths = dict(self._data_paths)
        return duplicate