# Source code for rsmtool.reader

"""
Classes for reading data files (or dictionaries) into DataContainer objects.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import warnings
from functools import partial
from os.path import abspath, exists, join, splitext

import pandas as pd

from .container import DataContainer

# allow older versions of pandas to work
try:
    from pandas.io.common import DtypeWarning
except ImportError:
    from pandas.errors import DtypeWarning


def read_jsonlines(filename, converters=None):
    """
    Read a data file in .jsonlines format into a data frame.

    Normalize nested jsons with up to one level of nesting: every column
    is passed through ``pandas.json_normalize()`` and the resulting pieces
    are concatenated side by side. Columns that do not hold nested jsons
    are copied over unchanged.

    Parameters
    ----------
    filename: str
        Name of file to read.
    converters : dict, optional
        A dictionary specifying how the types of the columns
        in the file should be converted. Specified in the same
        format as for ``pandas.read_csv()``. It is handed to
        ``pandas.read_json()`` as its ``dtype`` argument.
        Defaults to ``None``.

    Returns
    -------
    df : pandas DataFrame
         Data frame containing the data in the given file.

    Raises
    ------
    ValueError
        If ``pandas.read_json()`` raises a ``ValueError`` while reading
        the file, or if the resulting data frame has a ``RangeIndex`` as
        its columns (taken to mean the file is a plain json file rather
        than a jsonlines file).
    """
    try:
        df = pd.read_json(filename, orient="records", lines=True, dtype=converters)
    except ValueError as error:
        # keep the pandas error attached as the cause so that the
        # underlying parsing problem is still visible in the traceback
        raise ValueError(
            "The jsonlines file is not formatted correctly. "
            "Please check that each line ends with a comma, "
            "there is no comma at the end of the last line, "
            "and that all quotes match."
        ) from error

    # make sure we didn't get a plain json: records without named keys
    # end up with automatically generated integer column labels
    if isinstance(df.columns, pd.RangeIndex):
        raise ValueError(
            f"It looks like {filename} is a simple json file. Please "
            f"check documentation for the expected file format."
        )

    dfs = []
    for column in df:
        # let's try to normalize this column
        try:
            df_column = pd.json_normalize(df[column])

            # Starting with Pandas v1.3, we get an empty data frame
            # if the column does not contain a nested json.
            # If this is the case, we simply copy the column.
            if df_column.empty:
                df_column = df[column].copy()

        # Pandas <v1.3 will raise an attribute error instead,
        # so we'll catch that too
        except AttributeError:
            df_column = df[column].copy()

        dfs.append(df_column)

    return pd.concat(dfs, axis=1)


def try_to_load_file(filename, converters=None, raise_error=False, raise_warning=False, **kwargs):
    """
    Read a single file, if it exists.

    Optionally raises an error or warning if the file cannot be found.
    Otherwise, returns ``None``.

    Parameters
    ----------
    filename : str
        Name of file to read.
    converters : dict, optional
        A dictionary specifying how the types of the columns
        in the file should be converted. Specified in the same
        format as for ``pandas.read_csv()``.
        Defaults to ``None``.
    raise_error : bool, optional
        Raise an error if the file cannot be located.
        Takes precedence over ``raise_warning``.
        Defaults to ``False``.
    raise_warning : bool, optional
        Raise a warning if the file cannot be located.
        Defaults to ``False``.
    **kwargs
        Any additional keyword arguments are forwarded to
        ``DataReader.read_from_file()``.

    Returns
    -------
    df : pandas DataFrame or ``None``
        DataFrame containing the data in the given file,
        or ``None`` if the file does not exist.

    Raises
    ------
    FileNotFoundError
        If ``raise_error`` is ``True`` and the file cannot be located.
    """
    if exists(filename):
        return DataReader.read_from_file(filename, converters, **kwargs)

    message = f"The file '{filename}' could not be located."
    if raise_error:
        raise FileNotFoundError(message)

    if raise_warning:
        # stacklevel=2 attributes the warning to the code that asked
        # for the file rather than to this helper
        warnings.warn(message, stacklevel=2)

    return None


class DataReader:
    """
    Class to generate DataContainer objects.

    A reader is configured with parallel lists of file paths and data set
    names (plus optional per-data-set converters); calling ``read()`` loads
    every file into a data frame and wraps the results in a
    ``DataContainer``.
    """

    def __init__(self, filepaths, framenames, file_converters=None):
        """
        Initialize a DataReader object.

        Parameters
        ----------
        filepaths : list of str
            A list of paths to files that are to be read in.
        framenames : list of str
            A list of names for the data sets to be included in the container.
        file_converters : dict of dicts, optional
            A dictionary of file converter dicts. The keys are the data set
            names and the values are the converter dictionaries to be applied
            to the corresponding data set.
            Defaults to ``None``.

        Raises
        ------
        AssertionError
            If ``len(filepaths)`` does not equal ``len(framenames)``.
        ValueError
            If ``file_converters`` is not a dictionary or if any of its
            values is not a dictionary.
        NameError
            If a key in ``file_converters`` does not exist in ``framenames``.
        ValueError
            If any of the specified file paths is ``None``.
        """
        # Default datasets list
        self.datasets = []

        # Make sure filepaths length matches frame names length. This is
        # raised explicitly (instead of via an `assert` statement) so that
        # the check is still performed when Python runs with `-O`.
        if len(filepaths) != len(framenames):
            raise AssertionError(
                f"The number of file paths ({len(filepaths)}) does not match "
                f"the number of frame names ({len(framenames)})."
            )

        # Make sure that there are no Nones in the filepaths
        frames_with_no_path = [
            name for name, path in zip(framenames, filepaths) if path is None
        ]
        if frames_with_no_path:
            raise ValueError(f"No path specified for {', '.join(frames_with_no_path)}")

        # Assign names and paths lists
        self.dataset_names = framenames
        self.dataset_paths = filepaths

        # If `file_converters` was given, make sure that it is a dictionary
        # whose keys are known data set names and whose values are
        # themselves dictionaries
        if file_converters is not None:
            if not isinstance(file_converters, dict):
                raise ValueError(
                    f"The 'file_converters' argument must be a `dict`, "
                    f"not `{type(file_converters)}`."
                )

            for file_converter_name, file_converter in file_converters.items():
                # Make sure file_converter name is in `dataset_names`
                if file_converter_name not in self.dataset_names:
                    raise NameError(
                        f"The file converter name ``{file_converter_name}`` "
                        f"does not exist in the dataset names that you passed."
                    )

                # Make sure file converter is a `dict`
                if not isinstance(file_converter, dict):
                    raise ValueError(
                        f"Value for {file_converter_name} must be ``dict`` "
                        f"not {type(file_converter)}"
                    )

        # Default file_converters dict
        self.file_converters = {} if file_converters is None else file_converters

    @staticmethod
    def read_from_file(filename, converters=None, **kwargs):
        """
        Read a CSV/TSV/XLSX/JSONLINES/SAS7BDAT file and return a data frame.

        The reader is chosen from the (case-insensitive) file extension.

        Parameters
        ----------
        filename : str
            Name of file to read.
        converters : dict, optional
            A dictionary specifying how the types of the columns
            in the file should be converted. Specified in the same
            format as for ``pandas.read_csv()``. Not used for
            ``.sas7bdat`` files.
            Defaults to ``None``.

        Returns
        -------
        df : pandas DataFrame
            Data frame containing the data in the given file.

        Raises
        ------
        ValueError
            If the file has an unsupported extension.
        pandas.errors.ParserError
            If the file is badly formatted or corrupt.

        Note
        ----
        Any additional keyword arguments are passed to the underlying
        pandas IO reader function. For ``.sas7bdat`` files, the
        ``encoding`` keyword defaults to ``"latin-1"`` when not given.
        """
        file_extension = splitext(filename)[1].lower()

        if file_extension in [".csv", ".tsv"]:
            sep = "\t" if file_extension == ".tsv" else ","
            do_read = partial(pd.read_csv, sep=sep, converters=converters)
        elif file_extension == ".xlsx":
            do_read = partial(pd.read_excel, converters=converters)
        elif file_extension == ".sas7bdat":
            # remove `encoding` from the keyword arguments so that it is
            # not passed to the reader twice
            encoding = kwargs.pop("encoding", "latin-1")
            do_read = partial(pd.read_sas, encoding=encoding)
        elif file_extension == ".jsonlines":
            do_read = partial(read_jsonlines, converters=converters)
        else:
            raise ValueError(
                f"RSMTool only supports files in .csv, .tsv, .xlsx, .jsonlines, "
                f"and .sas7bdat formats. Input files should have one "
                f"of these extensions. The file you passed is: {filename}."
            )

        # ignore warnings about mixed data types for large files
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DtypeWarning)
            try:
                df = do_read(filename, **kwargs)
            except pd.errors.ParserError as error:
                # re-raise with a friendlier message but keep the original
                # parser error attached as the cause
                raise pd.errors.ParserError(
                    f"Cannot read {filename}. Please check "
                    f"that it is not corrupt or in an incompatible "
                    f"format. (Try running dos2unix?)"
                ) from error
        return df

    @staticmethod
    def locate_files(filepaths, configdir):
        """
        Locate an experiment file, or a list of experiment files.

        If the given path doesn't exist, then maybe the path is relative
        to the path of the config file. If neither exists, then return
        ``None``.

        Parameters
        ----------
        filepaths : str or list
            Name(s) of the experiment file we want to locate.
        configdir : str
            Path to the reference configuration directory
            (usually the directory of the config file)

        Returns
        -------
        retval :  str or list
            Absolute path to the experiment file or ``None``
            if the file could not be located. If ``filepaths``
            was a string, this method will return a string.
            Otherwise, it will return a list.

        Raises
        ------
        ValueError
            If ``filepaths`` is not a string or a list.
        """
        if not isinstance(filepaths, (str, list)):
            raise ValueError(
                f"The 'filepaths' argument must be a string or a list, " f"not {type(filepaths)}."
            )

        # remember whether we were given a single path so that we can
        # return a single value instead of a list
        return_string = isinstance(filepaths, str)
        if return_string:
            filepaths = [filepaths]

        located_paths = []
        for filepath in filepaths:

            retval = None
            alternate_path = abspath(join(configdir, filepath))

            # if the given path exists as is, convert
            # that to an absolute path and return
            if exists(filepath):
                retval = abspath(filepath)

            # otherwise check if it exists relative
            # to the reference directory
            elif exists(alternate_path):
                retval = alternate_path

            located_paths.append(retval)

        if return_string:
            return located_paths[0]

        return located_paths

    def read(self, kwargs_dict=None):
        """
        Read all files contained in ``self.dataset_paths``.

        Each call rebuilds ``self.datasets`` from scratch, so calling this
        method more than once does not accumulate duplicate entries.

        Parameters
        ----------
        kwargs_dict : dict of dicts, optional
            Any additional keyword arguments to pass to a particular DataFrame.
            The keys are data set names and the values are the keyword
            arguments to use when reading that data set.
            These arguments will be passed to the pandas IO reader function.
            Defaults to ``None``.

        Returns
        -------
        datacontainer : container.DataContainer
            A data container object.

        Raises
        ------
        FileNotFoundError
            If any of the files in ``self.dataset_paths`` does not exist.
        """
        kwargs_dict = {} if kwargs_dict is None else kwargs_dict

        datasets = []
        for name, set_path in zip(self.dataset_names, self.dataset_paths):

            converter = self.file_converters.get(name, None)

            if not exists(set_path):
                raise FileNotFoundError(f"The file {set_path} does not exist")

            kwargs = kwargs_dict.get(name, {})
            dataframe = self.read_from_file(set_path, converter, **kwargs)

            # Add to list of datasets; the converter and keyword arguments
            # are looked up under the name as given, but the data set is
            # stored under the name with surrounding whitespace removed
            datasets.append({"name": name.strip(), "path": set_path, "frame": dataframe})

        # only replace the stored data sets once every file has been read
        self.datasets = datasets

        return DataContainer(self.datasets)