# Source code for rsmtool.writer

"""
Class for writing DataContainer frames to disk.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

from os import makedirs
from os.path import join
from typing import Dict, List, Optional, Union

import pandas as pd
from wandb.sdk.lib import RunDisabled
from wandb.wandb_run import Run

from rsmtool.container import DataContainer

from .utils.wandb import log_dataframe_to_wandb



# [docs]
class DataWriter:
    """Class to write out DataContainer objects."""

    def __init__(
        self,
        experiment_id: Optional[str] = None,
        context: Optional[str] = None,
        wandb_run: Union[Run, RunDisabled, None] = None,
    ):
        """
        Initialize the DataWriter object.

        Parameters
        ----------
        experiment_id : Optional[str]
            The experiment name to be used in the output file names.
            Defaults to ``None``.
        context : Optional[str]
            The context in which this writer is used.
            Defaults to ``None``.
        wandb_run : Union[wandb.wandb_run.Run, wandb.sdk.lib.RunDisabled, None]
            The wandb run object if wandb is enabled, None otherwise.
            If enabled, all the output data frames will be logged to
            this run as tables.
            Defaults to ``None``.
        """
        self._id = experiment_id
        self.context = context
        self.wandb_run = wandb_run


    # [docs]
    @staticmethod
    def write_frame_to_file(
        df: pd.DataFrame, name_prefix: str, file_format: str = "csv", index: bool = False, **kwargs
    ) -> None:
        """
        Write given data frame to disk with given name and file format.

        Parameters
        ----------
        df : pandas.DataFrame
            Data frame to write to disk
        name_prefix : str
            The complete prefix for the file to be written to disk.
            This includes everything except the extension.
        file_format : str
            The file format (extension) for the file to be written to disk.
            One of {``"csv"``, ``"xlsx"``, ``"tsv"``}.
            Defaults to ``"csv"``.
        index : bool
            Whether to include the index in the output file.
            Defaults to ``False``.

        Raises
        ------
        KeyError
            If ``file_format`` is not valid.
        """
        file_format = file_format.lower()

        if file_format == "csv":
            name_prefix += ".csv"
            df.to_csv(name_prefix, index=index, **kwargs)

        elif file_format == "tsv":
            name_prefix += ".tsv"
            df.to_csv(name_prefix, index=index, sep="\t", **kwargs)

        # Added jsonlines for experimental purposes, but leaving
        # this out of the documentation at this stage
        elif file_format == "jsonlines":
            name_prefix += ".jsonlines"
            df.to_json(name_prefix, orient="records", lines=True, **kwargs)

        elif file_format == "xlsx":
            name_prefix += ".xlsx"
            df.to_excel(name_prefix, index=index, **kwargs)

        else:
            raise KeyError(
                "Please make sure that the `file_format` specified "
                "is one of the following:\n{`csv`, `tsv`, `xlsx`}.\n"
                f"You specified {file_format}."
            )



    # [docs]
    def write_experiment_output(
        self,
        csvdir: str,
        container_or_dict: Union[DataContainer, Dict[str, pd.DataFrame]],
        dataframe_names: Optional[List[str]] = None,
        new_names_dict: Optional[Dict[str, str]] = None,
        include_experiment_id: bool = True,
        reset_index: bool = False,
        file_format: str = "csv",
        index: bool = False,
        **kwargs,
    ) -> None:
        """
        Write out each of the named frames to disk.

        This function writes out each of the given list of data frames as a
        ".csv", ".tsv", or ``.xlsx`` file in the given directory. Each data
        frame was generated as part of running an RSMTool experiment. All files
        are prefixed with the given experiment ID and suffixed with either the
        name of the data frame in the DataContainer (or dict) object, or a new
        name if ``new_names_dict`` is specified. Additionally, the indexes in
        the data frames are reset if so specified.

        Parameters
        ----------
        csvdir : str
            Path to the output experiment sub-directory that will
            contain the CSV files corresponding to each of the data frames.
        container_or_dict : Union[container.DataContainer, Dict[str, pd.DataFrame]]
            A DataContainer object or dict, where keys are data frame
            names and values are pandas.DataFrame objects.
        dataframe_names : Optional[List[str]]
            List of data frame names, one for each of the data frames.
            Defaults to ``None``.
        new_names_dict : Optional[Dict[str, str]]
            New dictionary with new names for the data frames, if desired.
            Defaults to ``None``.
        include_experiment_id : bool
            Whether to include the experiment ID in the file name.
            Defaults to ``True``.
        reset_index : bool
            Whether to reset the index of each data frame
            before writing to disk.
            Defaults to ``False``.
        file_format : str
            The file format in which to output the data.
            One of {``"csv"``, ``"xlsx"``, ``"tsv"``}.
            Defaults to ``"csv"``.
        index : bool
            Whether to include the index in the output file.
            Defaults to ``False``.

        Raises
        ------
        KeyError
            If ``file_format`` is not valid, or a named data frame
            is not present in ``container_or_dict``.
        """
        container_or_dict = container_or_dict.copy()

        # If no `dataframe_names` specified, use all names
        if dataframe_names is None:
            dataframe_names = list(container_or_dict.keys())

        # Otherwise, check to make sure all specified names
        # are actually in the DataContainer
        else:
            for name in dataframe_names:
                if name not in container_or_dict:
                    raise KeyError(f"The name `{name}` is not in the container or dictionary.")

        # Loop through DataFrames, and save
        # output in specified format
        for dataframe_name in dataframe_names:
            df = container_or_dict[dataframe_name]
            if df is None:
                raise KeyError(f"The DataFrame `{dataframe_name}` does not exist.")

            # If the DataFrame is empty, skip it
            if df.empty:
                continue

            # If there is a desire to rename the DataFrame,
            # get the new name
            if new_names_dict is not None:
                if dataframe_name in new_names_dict:
                    dataframe_name = new_names_dict[dataframe_name]

            # Reset the index, if desired
            if reset_index:
                df.index.name = ""
                df.reset_index(inplace=True)

            # If include_experiment_id is True, and the experiment_id exists
            # include it in the file name; otherwise, do not include it.
            if include_experiment_id and self._id is not None:
                outfile = join(csvdir, f"{self._id}_{dataframe_name}")
            else:
                outfile = join(csvdir, dataframe_name)

            # write out the frame to disk in the given file
            self.write_frame_to_file(df, outfile, file_format=file_format, index=index, **kwargs)
            log_dataframe_to_wandb(self.wandb_run, df, dataframe_name, self.context)



    # [docs]
    def write_feature_csv(
        self,
        featuredir: str,
        data_container: DataContainer,
        selected_features: List[str],
        include_experiment_id: bool = True,
        file_format: str = "csv",
    ) -> None:
        """
        Write the specifications of the selected features to disk.

        The ``feature_specs`` frame in ``data_container`` is restricted to
        the rows whose ``feature`` value is in ``selected_features``, stored
        back into the container under the same name, and then written to
        ``featuredir`` under the name ``selected``.

        Note that ``data_container`` itself is updated by this method.

        Parameters
        ----------
        featuredir : str
            Path to the experiment output directory where the
            feature file will be saved. It is created if it does
            not already exist.
        data_container : DataContainer
            A data container object with a ``feature_specs`` frame
            that has a ``feature`` column.
        selected_features : List[str]
            List of features that were selected for model building.
        include_experiment_id : bool
            Whether to include the experiment ID in the file name.
            Defaults to ``True``.
        file_format : str
            The file format in which to output the data. One of {``"csv"``, ``"tsv"``,
            ``"xlsx"``}.
            Defaults to ``"csv"``.
        """
        # Keep only the rows for the features used in training
        all_specs = data_container["feature_specs"]
        is_selected = all_specs["feature"].isin(selected_features)
        selected_specs = all_specs[is_selected]

        # Overwrite the container's `feature_specs` with the filtered frame
        data_container.add_dataset({"frame": selected_specs, "name": "feature_specs"}, update=True)

        # Make sure the output directory exists before writing into it
        makedirs(featuredir, exist_ok=True)

        # Write only `feature_specs`, renamed to `selected` in the output
        frames_to_write = ["feature_specs"]
        output_names = {"feature_specs": "selected"}
        self.write_experiment_output(
            featuredir,
            data_container,
            frames_to_write,
            output_names,
            include_experiment_id=include_experiment_id,
            file_format=file_format,
        )