Source code for arpes.io

"""Provides the core IO facilities supported by PyARPES.

The most important items here are the data loading functions (load_data, load_example_data)
and the pickling utilities.

Heavy lifting is actually performed by the plugin definitions which know how to ingest
different data formats into the PyARPES data model.

TODO: The example data could be improved by serving it over a network,
if someone were willing to host a few larger pieces of data.
"""

import pickle
import warnings

from typing import Any, List, Union, Optional
from dataclasses import dataclass

from pathlib import Path

import pandas as pd
import xarray as xr

import arpes.config
from arpes.endstations import load_scan
from arpes.typing import DataType

__all__ = (
    "load_data",
    "load_example_data",
    "easy_pickle",
    "list_pickles",
    "stitch",
)


def load_data(
    file: Union[str, Path, int], location: Optional[Union[str, type]] = None, **kwargs
) -> xr.Dataset:
    """Loads a piece of data using the available plugins. This is the user-facing API for data loading.

    Args:
        file: An identifier for the file which should be loaded. If this is a number or
          can be coerced to one, data will be loaded from the workspace data folder if a
          matching unique file can be found for the number. If the value is a relative
          path, locations relative to the cwd and the workspace data folder will be
          checked. Absolute paths can also be used in a pinch.
        location: The name of the endstation/plugin to use. You should try to provide one.
          If None is provided, the loader will try to find an appropriate one based on the
          file extension and brute force. This will be slower and can be error prone in
          certain circumstances. Optionally, you can pass a loading plugin (the class)
          through this kwarg to directly specify the class to be used.

    Returns:
        The loaded data. Ideally, data which is loaded through the plugin system should be
        highly compliant with the PyARPES data model and should work seamlessly with
        PyARPES analysis code.
    """
    try:
        file = int(str(file))
    except ValueError:
        file = str(Path(file).absolute())

    desc = {
        "file": file,
        "location": location,
    }

    if location is None:
        desc.pop("location")
        warnings.warn(
            "You should provide a location indicating the endstation or instrument used "
            "directly when loading data without a dataset. We are going to do our best "
            "but no guarantees."
        )

    return load_scan(desc, **kwargs)
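
# A minimal usage sketch for `load_data` (the path and scan number below are
# hypothetical; "ALG-MC" is one of the plugin names used in DATA_EXAMPLES):
#
#     from arpes.io import load_data
#
#     # Explicit path plus an endstation plugin (recommended):
#     scan = load_data("scans/scan_0001.fits", location="ALG-MC")
#
#     # By scan number, resolved against the workspace data folder, letting
#     # the loader guess the plugin (slower and more error prone):
#     scan = load_data(1)

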
DATA_EXAMPLES = {
    "cut": ("ALG-MC", "cut.fits"),
    "map": ("example_data", "fermi_surface.nc"),
    "photon_energy": ("example_data", "photon_energy.nc"),
    "nano_xps": ("example_data", "nano_xps.nc"),
    "temperature_dependence": ("example_data", "temperature_dependence.nc"),
}

def load_example_data(example_name="cut") -> xr.Dataset:
    """Provides sample data for executable documentation."""
    if example_name not in DATA_EXAMPLES:
        # raising here avoids warning and then crashing with an opaque KeyError below
        raise ValueError(
            f"Could not find requested example_name: {example_name}. "
            f"Please provide one of {list(DATA_EXAMPLES.keys())}"
        )

    location, example = DATA_EXAMPLES[example_name]
    file = Path(__file__).parent / "example_data" / example
    return load_data(file=file, location=location)
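
# Usage sketch: each key of DATA_EXAMPLES names a bundled example dataset.
#
#     from arpes.io import load_example_data
#
#     cut = load_example_data("cut")  # small ALG-MC cut
#     fermi_surface = load_example_data("map")
#
# The `example_data` accessor defined below exposes the same datasets as
# properties (`example_data.cut`, `example_data.map`, ...).

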
@dataclass
class ExampleData:
    @property
    def cut(self) -> xr.Dataset:
        return load_example_data("cut")

    @property
    def map(self) -> xr.Dataset:
        return load_example_data("map")

    @property
    def photon_energy(self) -> xr.Dataset:
        return load_example_data("photon_energy")

    @property
    def nano_xps(self) -> xr.Dataset:
        return load_example_data("nano_xps")

    @property
    def temperature_dependence(self) -> xr.Dataset:
        return load_example_data("temperature_dependence")


example_data = ExampleData()


def stitch(
    df_or_list: Union[List[str], pd.DataFrame],
    attr_or_axis: Union[str, List[Any]],
    built_axis_name: Optional[str] = None,
    sort: bool = True,
) -> DataType:
    """Stitches together a sequence of scans or a DataFrame.

    Args:
        df_or_list: The list of files to load, or a DataFrame whose index lists them.
        attr_or_axis: The coordinate or attribute to promote to an index. For example,
          if 't_a' is specified, we will create a new axis corresponding to the
          temperature and concatenate the data along this axis.
        built_axis_name: The name of the concatenated output dimension.
        sort: Whether to sort the inputs to the concatenation according to their
          `attr_or_axis` value.

    Returns:
        The concatenated data.
    """
    list_of_files = None
    if isinstance(df_or_list, (pd.DataFrame,)):
        list_of_files = list(df_or_list.index)
    else:
        if not isinstance(df_or_list, (list, tuple)):
            raise TypeError("Expected an iterable for a list of the scans to stitch together")

        list_of_files = list(df_or_list)

    if built_axis_name is None:
        built_axis_name = attr_or_axis

    if not list_of_files:
        raise ValueError("Must supply at least one file to stitch")

    loaded = [
        f if isinstance(f, (xr.DataArray, xr.Dataset)) else load_data(f) for f in list_of_files
    ]

    for i, loaded_file in enumerate(loaded):
        value = None
        if isinstance(attr_or_axis, (list, tuple)):
            value = attr_or_axis[i]
        elif attr_or_axis in loaded_file.attrs:
            value = loaded_file.attrs[attr_or_axis]
        elif attr_or_axis in loaded_file.coords:
            value = loaded_file.coords[attr_or_axis]

        # write the result back into the list; reassigning only the loop
        # variable would discard the output of `assign_coords`
        loaded[i] = loaded_file.assign_coords({built_axis_name: value})

    if sort:
        loaded.sort(key=lambda x: x.coords[built_axis_name])

    concatenated = xr.concat(loaded, dim=built_axis_name)
    if "id" in concatenated.attrs:
        del concatenated.attrs["id"]

    from arpes.provenance import provenance_multiple_parents

    provenance_multiple_parents(
        concatenated,
        loaded,
        {
            "what": "Stitched together separate datasets",
            "by": "stitch",
            "dim": built_axis_name,
        },
    )

    return concatenated


def file_for_pickle(name):
    here = Path(".")
    from arpes.config import CONFIG

    if CONFIG["WORKSPACE"]:
        here = Path(CONFIG["WORKSPACE"]["path"])

    path = here / "picklejar" / "{}.pickle".format(name)
    path.parent.mkdir(exist_ok=True)
    return str(path)


def load_pickle(name: str) -> Any:
    """Loads a workspace local pickle. Inverse to `save_pickle`."""
    with open(file_for_pickle(name), "rb") as file:
        return pickle.load(file)


def save_pickle(data: Any, name: str):
    """Saves a workspace local pickle. Inverse to `load_pickle`."""
    # use a context manager so the file handle is closed promptly
    with open(file_for_pickle(name), "wb") as file:
        pickle.dump(data, file)
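
# A sketch of how `stitch` (defined above) is meant to be used, assuming a set
# of scans sharing a 't_a' temperature attribute (the file names here are
# hypothetical):
#
#     from arpes.io import stitch
#
#     combined = stitch(
#         ["scan_0001.fits", "scan_0002.fits", "scan_0003.fits"],
#         attr_or_axis="t_a",
#         built_axis_name="temperature",
#     )
#     # `combined` gains a new "temperature" dimension, sorted by 't_a'.

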
def easy_pickle(data_or_str: Any, name=None) -> Any:
    """A convenience function around pickling.

    Provides a workspace scoped associative set of named pickles which can be used for:

    Examples:
        Retaining analysis results between sessions.
        Sharing results between workspaces.
        Caching expensive or interim work.

    For reproducibility reasons, you should generally prefer to duplicate analysis
    results using common code to prevent stale data dependencies, but there are good
    reasons to use pickling as well.

    This function knows whether we are pickling or unpickling depending on whether
    one or two arguments are provided.

    Args:
        data_or_str: If saving, the data to be pickled. If loading, the name of the pickle to load.
        name: If saving (non-None value), the name to associate. Defaults to None.

    Returns:
        None if name is not None, which indicates that we are saving data.
        Otherwise, returns the unpickled value associated to `name`.
    """
    # we are loading data
    if isinstance(data_or_str, str) or name is None:
        return load_pickle(data_or_str)

    # we are saving data
    assert isinstance(name, str)
    save_pickle(data_or_str, name)


def list_pickles() -> List[str]:
    """Generates a summary list of (workspace-local) pickled results and data.

    Returns:
        A list of the named pickles, suitable for passing to `easy_pickle`.
    """
    return [str(s.stem) for s in Path(file_for_pickle("just-a-pickle")).parent.glob("*.pickle")]
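
# Round-trip sketch for the pickling helpers (the names below are arbitrary;
# pickles land in the workspace-local "picklejar" directory):
#
#     from arpes.io import easy_pickle, list_pickles
#
#     easy_pickle(fit_results, "sample_a_fits")   # two arguments: save
#     list_pickles()                              # -> ["sample_a_fits", ...]
#     results = easy_pickle("sample_a_fits")      # one argument: load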