Source code for chemfit.data_utils

from __future__ import annotations

from collections.abc import Sequence
from pathlib import Path

import pandas as pd

# Slice that selects every element of a sequence (equivalent to `[:]`); used
# as the default `index` argument of the CSV loaders in this module.
DEFAULT_SLICE = slice(None)


def process_csv(
    paths_to_csv: Path | Sequence[Path],
    index: slice | Sequence[slice] = DEFAULT_SLICE,
) -> tuple[list[Path], list[str], list[float]]:
    """
    Load one or more dataset CSVs and extract file paths, tags, and reference energies.

    Every CSV is forwarded, together with its slice, to `process_single_csv`
    and the per-file results are concatenated in the order the CSVs were given.

    Args:
        paths_to_csv (Union[Path, Sequence[Path]]): Either a single path to a CSV
            or a sequence of paths.
        index (Union[slice, Sequence[slice]]): Either a single slice, which is
            applied to the data read from every CSV, or a sequence holding
            exactly one slice per CSV.

    Returns:
        tuple[list[Path], list[str], list[float]]:
            - **paths**: List of `Path` objects to each data file.
            - **tags**: List of dataset tag strings.
            - **energies**: List of reference energies as floats.

    Raises:
        ValueError: If `index` is a sequence whose length differs from the
            number of CSV files.
    """
    # Treat a lone path as a one-element list so that `index` is honoured for
    # it as well. `str` has to be checked explicitly: it is itself a sequence
    # and would otherwise be iterated character by character.
    if isinstance(paths_to_csv, (str, Path)):
        csv_paths = [Path(paths_to_csv)]
    else:
        csv_paths = [Path(p) for p in paths_to_csv]

    if isinstance(index, slice):
        # A single slice is broadcast to every CSV
        slices = [index] * len(csv_paths)
    else:
        slices = list(index)
        # `zip` would silently drop the surplus CSVs or slices, so fail loudly
        if len(slices) != len(csv_paths):
            msg = (
                f"Got {len(slices)} slices for {len(csv_paths)} CSV files. "
                "Pass either a single slice or exactly one slice per CSV."
            )
            raise ValueError(msg)

    paths: list[Path] = []
    tags: list[str] = []
    energies: list[float] = []

    for csv_slice, path_to_csv in zip(slices, csv_paths):
        p, t, e = process_single_csv(path_to_csv, csv_slice)
        paths += p
        tags += t
        energies += e

    return paths, tags, energies
def process_single_csv(
    path_to_csv: Path, index: slice = DEFAULT_SLICE
) -> tuple[list[Path], list[str], list[float]]:
    """
    Load a dataset CSV and extract file paths, tags, and reference energies.

    The CSV must include the following columns:

    - Either `path` or `file`:
        * If `path` is present, each entry is used as given, so it may be
          absolute or relative to the current working directory.
        * Otherwise, `file` entries are taken as relative to the CSV's
          (resolved) parent directory.
        * If both are present, `path` takes precedence.
    - `tag`: A short string label for each dataset. Values are converted with
      `str`, so labels that pandas parsed as numbers are accepted too.
    - `reference_energy`: A numeric reference energy for each dataset.

    Additional columns are permitted and ignored. A CSV that only contains the
    header row yields three empty lists.

    Args:
        path_to_csv (Path): Path to the CSV file describing the datasets.
        index (slice): A slice which is applied to the data read from the CSV.
            Defaults to `DEFAULT_SLICE`, which keeps every row.

    Returns:
        tuple[list[Path], list[str], list[float]]:
            - **paths**: List of `Path` objects to each data file.
            - **tags**: List of dataset tag strings.
            - **energies**: List of reference energies as floats.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If neither `path` nor `file`, or if `tag` or `reference_energy`
            columns are missing.
        ValueError: If any `reference_energy` value cannot be converted to float.
    """
    df = pd.read_csv(path_to_csv)

    if "path" in df.columns:
        paths = [Path(p) for p in df["path"]]
    elif "file" in df.columns:
        # `file` entries are anchored at the directory that contains the CSV
        base = Path(path_to_csv).parent.resolve()
        paths = [base / Path(fname) for fname in df["file"]]
    else:
        msg = f"Error while processing {path_to_csv}. CSV must contain either a 'path' or 'file' column."
        raise KeyError(msg)

    if "tag" not in df.columns or "reference_energy" not in df.columns:
        msg = f"Error while processing {path_to_csv}. CSV must contain 'tag' and 'reference_energy' columns."
        raise KeyError(msg)

    # Convert instead of asserting on the first element: an assert is stripped
    # under `-O`, breaks on a CSV without data rows, and rejects labels such as
    # `1` that pandas reads as integers.
    tags: list[str] = [str(t) for t in df["tag"]]

    try:
        energies = [float(e) for e in df["reference_energy"]]
    except (TypeError, ValueError) as err:
        msg = f"Error while processing {path_to_csv}. All 'reference_energy' entries must be numeric."
        raise ValueError(msg) from err

    return paths[index], tags[index], energies[index]