Source code for metatrain.utils.data.readers.readers

import importlib
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from metatensor.torch import TensorMap
from metatomic.torch import System
from omegaconf import DictConfig

from ..target_info import TargetInfo


# Reader libraries that are advertised as valid choices in the error messages raised
# when a requested reader cannot be used.
AVAILABLE_READERS = ["ase", "metatensor"]
""":py:class:`list`: list containing all implemented reader libraries"""

# Fallback used when no reader is requested explicitly: the file suffix selects the
# reader library.
DEFAULT_READER = {
    ".xyz": "ase",
    ".extxyz": "ase",
    ".mts": "metatensor",
}
""":py:class:`dict`: dictionary mapping file extensions to a default reader"""



[docs]
def read_systems(
    filename: str,
    reader: Optional[str] = None,
) -> List[System]:
    """Read system information from a file.

    :param filename: name of the file to read
    :param reader: reader library for parsing the file. If :py:obj:`None`, the
        library is determined from the file extension.
    :returns: list of systems stored in double precision
    :raises ValueError: if ``reader`` is :py:obj:`None` and the file extension is not
        linked to a default reader, if the reader library cannot be imported, if the
        reader library does not provide a ``read_systems`` function, or if the loaded
        systems are not in double precision.
    """
    if reader is None:
        file_suffix = Path(filename).suffix
        try:
            reader = DEFAULT_READER[file_suffix]
        except KeyError:
            # The KeyError carries no information beyond the suffix, which is
            # already part of the message, so it is suppressed from the traceback.
            raise ValueError(
                f"File extension {file_suffix!r} is not linked to a default reader "
                "library. You can try reading it by setting a specific 'reader' from "
                f"the known ones: {', '.join(AVAILABLE_READERS)} "
            ) from None

    try:
        reader_mod = importlib.import_module(
            name=f".{reader}", package="metatrain.utils.data.readers"
        )
    except ImportError as err:
        # Keep the original ImportError as the cause: it tells apart an unknown
        # reader name from a known reader whose own imports failed.
        raise ValueError(
            f"Reader library {reader!r} not supported. Choose from "
            f"{', '.join(AVAILABLE_READERS)}"
        ) from err

    try:
        reader_met = reader_mod.read_systems
    except AttributeError as err:
        raise ValueError(
            f"Reader library {reader!r} cannot read systems. "
            f"You can try with other readers: {AVAILABLE_READERS}"
        ) from err

    systems = reader_met(filename)

    # elements in data are `torch.ScriptObject`s and their `dtype` is an integer.
    # A C++ double/torch.float64 is `7` according to
    # https://github.com/pytorch/pytorch/blob/207564bab1c4fe42750931765734ee604032fb69/c10/core/ScalarType.h#L54-L93
    if not all(s.dtype == 7 for s in systems):
        raise ValueError("The loaded systems are not in double precision.")

    return systems




[docs]
def read_targets(
    conf: DictConfig,
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """Read all target information from a fully expanded config.

    To get such a config you can use :func:`expand_dataset_config
    <metatrain.utils.omegaconf.expand_dataset_config>`. All targets are stored in double
    precision.

    For every target section, the reader library is taken from the ``reader`` entry
    or, if that is :py:obj:`None`, determined from the extension of the ``read_from``
    file. The section is then handed to the ``read_energy`` function of that library
    if the target has the ``energy`` quantity, is not per-atom, has a single
    subtarget and is of ``scalar`` type; every other target is handed to
    ``read_generic``.

    Gradients of the energy such as ``forces``, ``stress`` or ``virial`` are expected
    to be sub-sections of an energy target. A warning is emitted for every top-level
    target whose name contains ``force``, ``virial`` or ``stress``.

    :param conf: config containing the keys for what should be read.
    :returns: Dictionary containing a list of TensorMaps for each target section in the
        config as well as a ``Dict[str, TargetInfo]`` object containing the metadata of
        the targets.

    :raises ValueError: if the target name is not valid. Valid target names are those
        that either start with ``mtt::`` or those that are in the list of standard
        outputs of ``metatomic`` (see
        https://docs.metatensor.org/metatomic/latest/outputs/). The names ``force``,
        ``forces``, ``virial`` and ``stress`` (case-insensitive) only trigger a
        warning instead.
    :raises ValueError: if no reader is given and the file extension is not linked to
        a default reader, if the reader library cannot be imported, or if it does not
        provide the required reading function.
    :raises ValueError: if the loaded targets are not in double precision.
    """
    target_dictionary = {}
    target_info_dictionary = {}
    standard_outputs_list = [
        "energy",
        "non_conservative_forces",
        "non_conservative_stress",
    ]

    for target_key, target in conf.items():
        is_standard_target = target_key in standard_outputs_list
        lowered_key = target_key.lower()

        if not is_standard_target and not target_key.startswith("mtt::"):
            if lowered_key in ["force", "forces", "virial", "stress"]:
                warnings.warn(
                    f"{target_key!r} should not be its own top-level target, "
                    "but rather a sub-section of the 'energy' target",
                    stacklevel=2,
                )
            else:
                raise ValueError(
                    f"Target name ({target_key}) must either be one of "
                    f"{standard_outputs_list} or start with `mtt::`."
                )

        # This check is independent of the one above, so a bare gradient name such
        # as "forces" triggers both warnings.
        if any(word in lowered_key for word in ("force", "virial", "stress")):
            warnings.warn(
                f"the name of {target_key!r} resembles to a gradient of "
                "energies; it should probably not be its own top-level target, "
                "but rather a gradient sub-section of a target with the "
                "`energy` quantity",
                stacklevel=2,
            )

        is_energy = (
            (target["quantity"] == "energy")
            and (not target["per_atom"])
            and target["num_subtargets"] == 1
            and target["type"] == "scalar"
        )
        energy_or_generic = "energy" if is_energy else "generic"

        reader = target["reader"]
        filename = target["read_from"]

        if reader is None:
            file_suffix = Path(filename).suffix
            try:
                reader = DEFAULT_READER[file_suffix]
            except KeyError:
                # The KeyError carries no information beyond the suffix, which is
                # already part of the message, so it is suppressed from the traceback.
                raise ValueError(
                    f"File extension {file_suffix!r} is not linked to a default reader "
                    "library. You can try reading it by setting a specific 'reader' "
                    f"from the known ones: {', '.join(AVAILABLE_READERS)} "
                ) from None

        try:
            reader_mod = importlib.import_module(
                name=f".{reader}", package="metatrain.utils.data.readers"
            )
        except ImportError as err:
            # Keep the original ImportError as the cause: it tells apart an unknown
            # reader name from a known reader whose own imports failed.
            raise ValueError(
                f"Reader library {reader!r} not supported. Choose from "
                f"{', '.join(AVAILABLE_READERS)}"
            ) from err

        try:
            reader_met = getattr(reader_mod, f"read_{energy_or_generic}")
        except AttributeError as err:
            raise ValueError(
                f"Reader library {reader!r} cannot read {target!r}. "
                f"You can try with other readers: {AVAILABLE_READERS}"
            ) from err

        targets_as_list_of_tensor_maps, target_info = reader_met(target)

        # elements in data are `torch.ScriptObject`s and their `dtype` is an integer.
        # A C++ double/torch.float64 is `7` according to
        # https://github.com/pytorch/pytorch/blob/207564bab1c4fe42750931765734ee604032fb69/c10/core/ScalarType.h#L54-L93
        if not all(t.dtype == 7 for t in targets_as_list_of_tensor_maps):
            raise ValueError("The loaded targets are not in double precision.")

        target_dictionary[target_key] = targets_as_list_of_tensor_maps
        target_info_dictionary[target_key] = target_info

    return target_dictionary, target_info_dictionary