Source code for rsmtool.reader

"""
Classes for reading data files (or dictionaries) into DataContainer objects.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import warnings
from functools import partial
from os.path import abspath, exists, join, splitext
from typing import Any, Dict, List, Optional, Union

import pandas as pd

from .container import DataContainer, DatasetDict

# allow older versions of pandas to work
try:
    from pandas.io.common import DtypeWarning
except ImportError:
    from pandas.errors import DtypeWarning


def read_jsonlines(filename: str, converters: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """
    Read a data file in .jsonlines format into a data frame.

    Normalize nested JSON objects with up to one level of nesting.

    Parameters
    ----------
    filename : str
        Name of file to read.
    converters : Optional[Dict[str, Any]]
        A dictionary specifying how the types of the columns in the file
        should be converted. Specified in the same format as for
        ``pandas.read_csv()``.
        Defaults to ``None``.

    Returns
    -------
    df : pandas.DataFrame
        Data frame containing the data in the given file.
    """
    try:
        df = pd.read_json(filename, orient="records", lines=True, dtype=converters)
    except ValueError:
        raise ValueError(
            "The jsonlines file is not formatted correctly. "
            "Please check that it's not a plain JSON file, that "
            "each line is a self-contained JSON object with no "
            "trailing comma, and that all quotes match."
        )

    dfs = []
    for column in df:
        # let's try to normalize this column
        try:
            df_column = pd.json_normalize(df[column])
            # Starting with Pandas v1.3, we get an empty data frame
            # if the column does not contain a nested json.
            # If this is the case, we simply copy the column.
            if df_column.empty:
                df_column = df[column].copy()
        # Pandas <v1.3 will raise an attribute error instead,
        # so we'll catch that too
        except AttributeError:
            df_column = df[column].copy()
        dfs.append(df_column)

    df = pd.concat(dfs, axis=1)
    return df
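
# Example usage of ``read_jsonlines`` (a minimal sketch; the file name and
# record layout below are hypothetical, not part of rsmtool). Given a
# "responses.jsonlines" file where each line looks like
#     {"id": "r1", "features": {"length": 120, "errors": 3}}
# the nested "features" object is flattened into top-level columns:
#
#     df = read_jsonlines("responses.jsonlines")
#     # df.columns is roughly ["id", "length", "errors"]
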
def try_to_load_file(
    filename: str,
    converters: Optional[Dict[str, Any]] = None,
    raise_error: bool = False,
    raise_warning: bool = False,
    **kwargs,
) -> Optional[pd.DataFrame]:
    """
    Read a single file, if it exists.

    Optionally raise an error or warning if the file cannot be found.
    Otherwise, return ``None``.

    Parameters
    ----------
    filename : str
        Name of file to read.
    converters : Optional[Dict[str, Any]]
        A dictionary specifying how the types of the columns in the file
        should be converted. Specified in the same format as for
        ``pandas.read_csv()``.
        Defaults to ``None``.
    raise_error : bool
        Raise an error if the file cannot be located.
        Defaults to ``False``.
    raise_warning : bool
        Raise a warning if the file cannot be located.
        Defaults to ``False``.

    Returns
    -------
    df : Optional[pandas.DataFrame]
        Data frame containing the data in the given file,
        or ``None`` if the file does not exist.

    Raises
    ------
    FileNotFoundError
        If ``raise_error`` is ``True`` and the file cannot be located.
    """
    if not exists(filename):
        message = f"The file '{filename}' could not be located."
        if raise_error:
            raise FileNotFoundError(message)
        if raise_warning:
            warnings.warn(message)
        return None
    else:
        return DataReader.read_from_file(filename, converters, **kwargs)
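
# Example usage of ``try_to_load_file`` (a minimal sketch; the path is
# hypothetical). With ``raise_warning=True``, a missing file produces a
# warning and a ``None`` return value instead of an exception:
#
#     df_train = try_to_load_file("train.csv", raise_warning=True)
#     if df_train is None:
#         # fall back to whatever default behavior is appropriate
#         ...
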
class DataReader:
    """Class to generate DataContainer objects."""

    def __init__(
        self,
        filepaths: List[str],
        framenames: List[str],
        file_converters: Optional[Dict[str, Dict[str, Any]]] = None,
    ):
        """
        Initialize a DataReader object.

        Parameters
        ----------
        filepaths : List[str]
            A list of paths to files that are to be read in. Some of the
            paths can be empty strings.
        framenames : List[str]
            A list of names for the data sets to be included in the container.
        file_converters : Optional[Dict[str, Dict[str, Any]]]
            A dictionary of file converter dicts. The keys are the data set
            names and the values are the converter dictionaries to be applied
            to the corresponding data set.
            Defaults to ``None``.

        Raises
        ------
        AssertionError
            If ``len(filepaths)`` does not equal ``len(framenames)``.
        ValueError
            If ``file_converters`` is not a dictionary or if any of its
            values is not a dictionary.
        NameError
            If a key in ``file_converters`` does not exist in ``framenames``.
        ValueError
            If any of the specified file paths is an empty string.
        """
        # Default datasets list
        self.datasets: List[DatasetDict] = []

        # Make sure filepaths length matches framenames length
        assert len(filepaths) == len(framenames)

        # Make sure that there are no empty strings in the filepaths
        frames_with_no_path = [
            framenames[i] for i in range(len(framenames)) if filepaths[i] == ""
        ]
        if frames_with_no_path:
            raise ValueError(f"No path specified for {', '.join(frames_with_no_path)}")

        # Assign names and paths lists
        self.dataset_names = framenames
        self.dataset_paths = filepaths

        # If `file_converters` exists, make sure it is a dictionary,
        # that each of its keys matches one of the data set names,
        # and that each of its values is itself a dictionary
        if file_converters is not None:
            if not isinstance(file_converters, dict):
                raise ValueError(
                    f"The 'file_converters' argument must be a `dict`, "
                    f"not `{type(file_converters)}`."
                )

            for file_converter_name in file_converters:
                # Make sure the file converter name is in `dataset_names`
                if file_converter_name not in self.dataset_names:
                    raise NameError(
                        f"The file converter name ``{file_converter_name}`` "
                        f"does not exist in the dataset names that you passed."
                    )

                # Make sure the file converter is a `dict`
                file_converter = file_converters[file_converter_name]
                if not isinstance(file_converter, dict):
                    raise ValueError(
                        f"Value for {file_converter_name} must be a ``dict``, "
                        f"not {type(file_converter)}."
                    )

        # Default file_converters dict
        self.file_converters = {} if file_converters is None else file_converters

    @staticmethod
    def read_from_file(
        filename: str, converters: Optional[Dict[str, Any]] = None, **kwargs
    ) -> pd.DataFrame:
        """
        Read a CSV/TSV/XLSX/JSONLINES/SAS7BDAT file and return a data frame.

        Parameters
        ----------
        filename : str
            Name of file to read.
        converters : Optional[Dict[str, Any]]
            A dictionary specifying how the types of the columns in the file
            should be converted. Specified in the same format as for
            ``pandas.read_csv()``.
            Defaults to ``None``.

        Returns
        -------
        df : pandas.DataFrame
            Data frame containing the data in the given file.

        Raises
        ------
        ValueError
            If the file has an unsupported extension.
        pandas.errors.ParserError
            If the file is badly formatted or corrupt.

        Note
        ----
        Any additional keyword arguments are passed to the underlying
        pandas IO reader function.
        """
        file_extension = splitext(filename)[1].lower()

        if file_extension in [".csv", ".tsv"]:
            sep = "\t" if file_extension == ".tsv" else ","
            do_read = partial(pd.read_csv, sep=sep, converters=converters)
        elif file_extension == ".xlsx":
            do_read = partial(pd.read_excel, converters=converters)
        elif file_extension == ".sas7bdat":
            if "encoding" not in kwargs:
                encoding = "latin-1"
            else:
                encoding = kwargs.pop("encoding")
            do_read = partial(pd.read_sas, encoding=encoding)
        elif file_extension in [".jsonlines"]:
            do_read = partial(read_jsonlines, converters=converters)
        else:
            raise ValueError(
                f"RSMTool only supports files in .csv, .tsv, .xlsx, "
                f".jsonlines, and .sas7bdat formats. Input files should "
                f"have one of these extensions. The file you passed is: "
                f"{filename}."
            )

        # ignore warnings about mixed data types for large files
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DtypeWarning)
            try:
                df = do_read(filename, **kwargs)
            except pd.errors.ParserError:
                raise pd.errors.ParserError(
                    f"Cannot read {filename}. Please check "
                    f"that it is not corrupt or in an incompatible "
                    f"format. (Try running dos2unix?)"
                )
        return df
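
    # Example usage of ``read_from_file`` (a minimal sketch; the file names
    # and the "spkitemid" column are hypothetical). The reader is chosen by
    # file extension, and extra keyword arguments are forwarded to the
    # underlying pandas function:
    #
    #     # force the "spkitemid" column to be read as a string
    #     df = DataReader.read_from_file("train.csv", converters={"spkitemid": str})
    #
    #     # skip the first row of an Excel workbook
    #     df = DataReader.read_from_file("train.xlsx", skiprows=1)
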
    @staticmethod
    def locate_files(filepaths: Union[str, List[str]], configdir: str) -> List[str]:
        """
        Locate an experiment file, or a list of experiment files.

        If a given path does not exist as is, try interpreting it as
        relative to the directory of the configuration file. If that does
        not exist either, the corresponding entry in the returned list is
        an empty string.

        Parameters
        ----------
        filepaths : Union[str, List[str]]
            Name(s) of the experiment file(s) we want to locate.
        configdir : str
            Path to the reference configuration directory
            (usually the directory of the config file).

        Returns
        -------
        retval : List[str]
            List of absolute paths to the located files. If a file
            does not exist, the corresponding element in the list
            is an empty string.

        Raises
        ------
        ValueError
            If ``filepaths`` is not a string or a list.
        """
        # the feature config file can be in the 'feature' directory
        # at the same level as the main config file
        if not isinstance(filepaths, (str, list)):
            raise ValueError(
                f"The 'filepaths' argument must be a string or a list, "
                f"not {type(filepaths)}."
            )

        if isinstance(filepaths, str):
            filepaths = [filepaths]

        located_paths = []
        for filepath in filepaths:
            retval = ""
            alternate_path = abspath(join(configdir, filepath))

            # if the given path exists as is, convert
            # it to an absolute path and use that
            if exists(filepath):
                retval = abspath(filepath)

            # otherwise check if it exists relative
            # to the reference directory
            elif exists(alternate_path):
                retval = alternate_path

            located_paths.append(retval)

        return located_paths
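
    # Example usage of ``locate_files`` (a minimal sketch; the paths are
    # hypothetical). A path that exists as given is returned as an absolute
    # path; otherwise it is resolved against the configuration directory;
    # if both checks fail, its slot in the result is an empty string:
    #
    #     paths = DataReader.locate_files(
    #         ["features.csv", "missing.csv"], "/home/user/experiment"
    #     )
    #     # e.g. ["/home/user/experiment/features.csv", ""]
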
    def read(self, kwargs_dict: Optional[Dict[str, Dict[str, Any]]] = None) -> DataContainer:
        """
        Read all files contained in ``self.dataset_paths``.

        Parameters
        ----------
        kwargs_dict : Optional[Dict[str, Dict[str, Any]]]
            Dictionary with the names of the datasets as keys and
            dictionaries of keyword arguments to pass to the pandas
            reader for each dataset as values. The keys in those
            dictionaries are the names of the keyword arguments and
            the values are the values of the keyword arguments.
            Defaults to ``None``.

        Returns
        -------
        datacontainer : DataContainer
            A data container object.

        Raises
        ------
        FileNotFoundError
            If any of the files in ``self.dataset_paths`` does not exist.
        """
        for idx, set_path in enumerate(self.dataset_paths):
            name = self.dataset_names[idx]
            converter = self.file_converters.get(name, None)

            if not set_path or not exists(set_path):
                raise FileNotFoundError(f"The file {set_path} does not exist")

            if kwargs_dict is not None:
                kwargs = kwargs_dict.get(name, {})
            else:
                kwargs = {}

            dataframe = self.read_from_file(set_path, converter, **kwargs)

            # Add to list of datasets
            self.datasets.append({"name": name.strip(), "path": set_path, "frame": dataframe})

        return DataContainer(self.datasets)
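
# Example of the end-to-end workflow (a minimal sketch; the file names,
# frame names, and converter dictionary are hypothetical):
#
#     filepaths = ["train.csv", "test.csv"]
#     framenames = ["train", "test"]
#     converters = {"train": {"spkitemid": str}, "test": {"spkitemid": str}}
#     reader = DataReader(filepaths, framenames, converters)
#     container = reader.read()
#     # ``container`` is a DataContainer holding the "train" and "test" frames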