# Source code for nfflr.data.datasets.mlearn

__all__ = ()

from pathlib import Path
from typing import Optional

import torch
import jarvis
import numpy as np
import pandas as pd
from cached_path import cached_path

import nfflr


def pmg_to_nfflr(atoms: dict):
    """Build an ``nfflr.Atoms`` from a pymatgen Structure dict, with no pymatgen dependency.

    Reads the lattice matrix, per-site cartesian coordinates (``xyz``), and the
    first species entry of each site — partially occupied sites are ignored.
    """
    sites = atoms["sites"]
    lattice = atoms["lattice"]["matrix"]

    positions = torch.tensor([site["xyz"] for site in sites])

    # map element symbol -> atomic number via the jarvis chemistry table
    atomic_numbers = []
    for site in sites:
        symbol = site["species"][0]["element"]
        atomic_numbers.append(jarvis.core.specie.chem_data[symbol]["Z"])

    return nfflr.Atoms(lattice, positions, atomic_numbers)


def _mlearn_dataset(datafile: Path):
    """Load one pymatgen-format mlearn JSON file as an ``nfflr.AtomsDataset``."""
    frame = pd.read_json(datafile)

    # convert serialized structures and unpack the per-configuration targets
    frame["atoms"] = frame.structure.apply(pmg_to_nfflr)
    frame["energy"] = frame.outputs.apply(lambda rec: rec["energy"])
    frame["forces"] = frame.outputs.apply(lambda rec: rec["forces"])
    frame["stress"] = frame.outputs.apply(lambda rec: rec["virial_stress"])

    # use the row index as the identifier column expected by AtomsDataset
    frame["jid"] = frame.index

    return nfflr.AtomsDataset(frame, target="energy_and_forces", energy_units="eV")


def mlearn_dataset(
    elements: str | list[str] = "Si",
    transform: Optional[torch.nn.Module] = None,
    diskcache: bool = False,
):
    """Construct mlearn dataset with standard splits.

    Downloads and caches json datafiles from
    github.com/materialsvirtuallab/mlearn to nfflr.CACHE directory,
    which respects `XDG_CACHE_HOME`.

    Parameters
    ----------
    elements : str | list[str]
        element(s) whose training/test files to load (default ``"Si"``).
    transform : Optional[torch.nn.Module]
        optional transform forwarded to ``nfflr.AtomsDataset``.
    diskcache : bool
        forwarded to ``nfflr.AtomsDataset``.
    """
    mlearn_base = "https://github.com/materialsvirtuallab/mlearn/raw/master/data"

    if isinstance(elements, str):
        elements = [elements]

    # both the training and test file for each requested element
    datafiles = []
    for element in elements:
        datafiles.append(f"{mlearn_base}/{element}/training.json")
        datafiles.append(f"{mlearn_base}/{element}/test.json")

    dfs = [
        pd.read_json(cached_path(datafile, cache_dir=nfflr.CACHE))
        for datafile in datafiles
    ]
    df = pd.concat(dfs, ignore_index=True)

    df["atoms"] = df.structure.apply(pmg_to_nfflr)
    df["energy"] = df.outputs.apply(lambda x: x["energy"])
    df["forces"] = df.outputs.apply(lambda x: x["forces"])
    df["stress"] = df.outputs.apply(lambda x: x["virial_stress"])

    # TODO: fix this index
    ids = [f"{row.element}-{idx}" for idx, row in df.iterrows()]
    df["jid"] = ids

    dataset = nfflr.AtomsDataset(
        df,
        target="energy_and_forces",
        energy_units="eV",
        transform=transform,
        diskcache=diskcache,
    )

    # use the standard split: override random splits.
    # the upstream "test" tag is used as the validation split here,
    # leaving the test split empty.
    (id_train,) = np.where(df.tag == "train")
    (id_val,) = np.where(df.tag == "test")
    id_test = np.array([])
    dataset.split = dict(train=id_train, val=id_val, test=id_test)

    # redo target standardization since it requires the correct train split
    if dataset.standardize:
        dataset.setup_target_standardization()

    return dataset