# Source code for chemfit.data_utils

from __future__ import annotations

from collections.abc import Sequence
from pathlib import Path

import pandas as pd

DEFAULT_SLICE = slice(None, None, None)



# [docs]
def process_csv(
    paths_to_csv: Path | Sequence[Path],
    index: slice | Sequence[slice] = DEFAULT_SLICE,
) -> tuple[list[Path], list[str], list[float]]:
    """
    Load one or more dataset CSVs and extract file paths, tags, and reference energies.

    Every CSV is forwarded, together with its slice, to `process_single_csv` and the
    results are concatenated in the order in which the CSVs were given.

    Args:
        paths_to_csv (Union[Path, Sequence[Path]]): Either a single path to a CSV or a list of paths
        index (Union[slice, Sequence[slice]]): Either a single slice, which is applied to the data
            read from every CSV, or a list with exactly one slice per CSV

    Returns:
        tuple[list[Path], list[str], list[float]]:
        - **paths**: List of `Path` objects to each data file.
        - **tags**: List of dataset tag strings.
        - **energies**: List of reference energies as floats.

    Raises:
        ValueError: If `index` is a sequence whose length differs from the number of CSVs.

    """
    # A lone path is handled like a one-element list. A plain string is a
    # Sequence as well, so it has to be caught here; otherwise it would be
    # iterated character by character.
    if isinstance(paths_to_csv, (str, Path)):
        csv_paths = [Path(paths_to_csv)]
    else:
        csv_paths = [Path(p) for p in paths_to_csv]

    if isinstance(index, Sequence):
        slices = list(index)
        # zip() would silently drop the surplus CSVs (or slices), so a length
        # mismatch is reported instead of returning an incomplete dataset.
        if len(slices) != len(csv_paths):
            msg = (
                f"Got {len(slices)} slices for {len(csv_paths)} CSV files. "
                "Pass either a single slice or exactly one slice per CSV."
            )
            raise ValueError(msg)
    else:
        # A single slice is applied to every CSV.
        slices = [index] * len(csv_paths)

    paths: list[Path] = []
    tags: list[str] = []
    energies: list[float] = []

    for csv_slice, csv_path in zip(slices, csv_paths):
        p, t, e = process_single_csv(csv_path, csv_slice)
        paths.extend(p)
        tags.extend(t)
        energies.extend(e)

    return paths, tags, energies




# [docs]
def process_single_csv(
    path_to_csv: Path, index: slice = DEFAULT_SLICE
) -> tuple[list[Path], list[str], list[float]]:
    """
    Load a dataset CSV and extract file paths, tags, and reference energies.

    The CSV must include the following columns:
      - Either `path` or `file`:
          * If `path` is present, each entry is used as given (absolute, or relative to the current working directory).
          * Otherwise, `file` entries are taken as relative to the CSV's parent directory.
          * If both are present, `path` takes precedence.
      - `tag`: A short string label for each dataset.
      - `reference_energy`: A numeric reference energy for each dataset.

    Additional columns are permitted and ignored.

    Args:
        path_to_csv (Path): Path to the CSV file describing the datasets.
        index (slice): A slice which is applied to the data read from the CSV.
            Defaults to `slice(None, None, None)`, i.e. every row is kept.

    Returns:
        tuple[list[Path], list[str], list[float]]:
        - **paths**: List of `Path` objects to each data file. Entries taken from
          `file` are joined onto the resolved parent directory of the CSV.
        - **tags**: List of dataset tag strings.
        - **energies**: List of reference energies as floats.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If neither `path` nor `file`, or if `tag` or `reference_energy` columns are missing.
        TypeError: If the first `tag` entry is not a string.
        ValueError: If any `reference_energy` value cannot be converted to float.

    """
    df = pd.read_csv(path_to_csv)

    if "path" in df.columns:
        paths = [Path(p) for p in df["path"]]
    elif "file" in df.columns:
        base = Path(path_to_csv).parent.resolve()
        paths = [base / Path(fname) for fname in df["file"]]
    else:
        msg = f"Error while processing {path_to_csv}. CSV must contain either a 'path' or 'file' column."
        raise KeyError(msg)

    if "tag" not in df.columns or "reference_energy" not in df.columns:
        msg = f"Error while processing {path_to_csv}. CSV must contain 'tag' and 'reference_energy' columns."
        raise KeyError(msg)

    tags: list[str] = list(df["tag"])
    # Cheap sanity check on the first entry, which catches a tag column that
    # pandas parsed as numbers. It is raised explicitly (an `assert` would be
    # stripped under `python -O`) and skipped for a CSV without data rows.
    if tags and not isinstance(tags[0], str):
        msg = f"Error while processing {path_to_csv}. Entries of the 'tag' column must be strings."
        raise TypeError(msg)

    try:
        energies = [float(e) for e in df["reference_energy"]]
    except (TypeError, ValueError, OverflowError) as err:
        msg = f"Error while processing {path_to_csv}. All 'reference_energy' entries must be numeric."
        raise ValueError(msg) from err

    return paths[index], tags[index], energies[index]