Source code for rsmtool.utils.files

"""
Utility classes and functions for RSMTool file management.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import json
import re
from glob import escape, glob
from os.path import join
from pathlib import Path
from typing import Any, Dict, Union

from .constants import POSSIBLE_EXTENSIONS


# Matches, at each position, the first of: a complete JSON string literal,
# a ``/* ... */`` block comment, or a ``//`` line comment. String literals
# are matched only so that comment markers inside them can be left untouched.
_JSON_STRING_OR_COMMENT_RE = re.compile(
    r'"(?:\\.|[^"\\\n])*"'  # double-quoted string, honoring backslash escapes
    r"|/\*[\s\S]*?\*/"  # multi-line comment (non-greedy, may span lines)
    r"|//[^\n]*"  # single-line comment, up to but excluding the newline
)


def _strip_json_comments(text: str) -> str:
    """Return ``text`` with ``//`` and ``/* */`` comments outside strings removed."""

    def _keep_strings(match):
        token = match.group(0)
        # String literals are returned verbatim; comments are dropped.
        return token if token.startswith('"') else ""

    return _JSON_STRING_OR_COMMENT_RE.sub(_keep_strings, text)


def parse_json_with_comments(pathlike: Union[str, Path]) -> Dict[str, Any]:
    """
    Parse a JSON file after removing any comments.

    Comments can use either ``//`` for single-line comments or
    ``/* ... */`` for multi-line comments. Comment markers that appear
    inside a JSON string value (for example the ``//`` in
    ``"s3://bucket/key"`` or ``"http://host/a//b"``) are part of the
    value and are left untouched. The input filepath can be a string
    or ``pathlib.Path``.

    Parameters
    ----------
    pathlike : Union[str, Path]
        Path to the input JSON file either as a string
        or as a ``pathlib.Path`` object.

    Returns
    -------
    obj : Dict[str, Any]
        JSON object representing the input file.

    Raises
    ------
    json.JSONDecodeError
        If the content is not valid JSON once the comments are removed.

    Note
    ----
    This code was originally adapted from:
    https://web.archive.org/web/20150520154859/http://www.lifl.fr/~riquetd/parse-a-json-file-with-comments.html
    """
    # ``Path`` accepts both strings and existing ``Path`` objects
    with open(Path(pathlike), "r") as file_buff:
        content = file_buff.read()

    # Remove every comment in a single pass, then hand off to the JSON parser
    return json.loads(_strip_json_comments(content))


def has_files_with_extension(directory: str, ext: str) -> bool:
    """
    Check if the directory has any files with the given extension.

    Parameters
    ----------
    directory : str
        The path to the directory where output is located.
    ext : str
        The extension to look for, without the leading period
        (e.g., ``"csv"``).

    Returns
    -------
    ans : bool
        ``True`` if directory contains files with given extension,
        else ``False``.
    """
    # Escape the directory so that glob metacharacters in its name
    # (``[``, ``*``, ``?``) are matched literally; only the ``*.<ext>``
    # part is meant to act as a pattern.
    return bool(glob(join(escape(directory), f"*.{ext}")))


def get_output_directory_extension(directory: str, experiment_id: str) -> str:
    """
    Check output directory to determine what file extensions exist.

    If more than one extension (in the possible list of extensions)
    exists, then raise a ``ValueError``. Otherwise, return the one file
    extension. If no extensions can be found, then "csv" will be returned
    by default.

    Possible extensions include: "csv", "tsv", and "xlsx". Files in the
    directory with none of these extensions are ignored.

    Parameters
    ----------
    directory : str
        The path to the directory where output is located.
    experiment_id : str
        The ID of the experiment. Only used in the error message.

    Returns
    -------
    extension : str
        The extension that output files in this directory end with.
        One of {``"csv"``, ``"tsv"``, ``"xlsx"``}.

    Raises
    ------
    ValueError
        If the directory contains files with more than one of the
        possible extensions (e.g., both ``"csv"`` and ``"tsv"`` files).
    """
    extensions_identified = {
        ext for ext in POSSIBLE_EXTENSIONS if has_files_with_extension(directory, ext)
    }

    if len(extensions_identified) > 1:
        # Sort so that the message is deterministic (set order is arbitrary)
        raise ValueError(
            f"Some of the files in the experiment output directory (`{directory}`) "
            f"for `{experiment_id}` have different extensions. All files in this "
            f"directory must have the same extension. The following extensions "
            f"were identified : {', '.join(sorted(extensions_identified))}"
        )

    # Either exactly one extension was found, or none (fall back to "csv")
    return next(iter(extensions_identified), "csv")