Source code for gpm.dataset.datatree

# -----------------------------------------------------------------------------.
# MIT License

# Copyright (c) 2024 GPM-API developers
#
# This file is part of GPM-API.

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# -----------------------------------------------------------------------------.
"""This module contains functions to read a GPM granule into a DataTree object."""
import os

import xarray as xr

import gpm
from gpm.dataset.attrs import decode_string
from gpm.dataset.dimensions import rename_datatree_dimensions


def open_raw_datatree(filepath, chunks={}, decode_cf=False, use_api_defaults=True, **kwargs):  # noqa: B006
    """Open a GPM HDF5 file into a xarray.DataTree object with intuitive dimensions names.

    Parameters
    ----------
    filepath : str
        Path of the GPM HDF5 granule to open.
    chunks : int, dict, str or None, optional
        Chunk size for dask array:

        - ``chunks=-1`` loads the dataset with dask using a single chunk for each granule arrays.
        - ``chunks={}`` loads the dataset with dask using the file chunks.
        - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the file chunks.

        If you want to load data in memory directly, specify ``chunks=None``.
        The default is ``{}``.

        Hint: xarray's lazy loading of remote or on-disk datasets is often but not always desirable.
        Before performing computationally intense operations, load the dataset
        entirely into memory by invoking ``ds.compute()``.
    decode_cf: bool, optional
        Whether to decode the dataset. The default is ``False``.
    use_api_defaults : bool, optional
        Forwarded as-is to ``rename_datatree_dimensions``. The default is ``True``.
    **kwargs : dict
        Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.

    Returns
    -------
    xarray.DataTree

    Raises
    ------
    FileNotFoundError
        If opening fails and ``filepath`` does not exist.
    ValueError
        If the granule is flagged as empty or cannot be opened. The original
        exception is attached as ``__cause__``.

    """
    # NOTE: the ``{}`` default is never mutated here; it is kept (instead of ``None``)
    # because ``None`` and ``{}`` are documented above as different chunking modes.
    dt = None
    try:
        dt = xr.open_datatree(
            filepath,
            engine="netcdf4",
            chunks=chunks,
            decode_cf=decode_cf,
            decode_times=False,
            **kwargs,
        )
        closer = dt._close
        check_non_empty_granule(dt, filepath)
    except Exception as e:
        # The tree may have been opened successfully and then rejected
        # (e.g. empty granule): release it so the file is not left open.
        if dt is not None:
            dt.close()
        # Raise a more explanatory error when the cause can be identified
        check_valid_granule(filepath)
        raise ValueError(e) from e
    # Assign dimension names
    dt = rename_datatree_dimensions(dt, use_api_defaults=use_api_defaults)
    # Re-attach the closer of the originally opened tree to the renamed tree
    dt.set_close(closer)
    return dt
def check_non_empty_granule(dt, filepath):
    """Raise an error if the granule header flags the granule as empty.

    Parameters
    ----------
    dt : object
        Opened datatree (or dataset) whose ``attrs`` contain a ``"FileHeader"`` entry.
    filepath : str
        Path of the granule. Only used to build the error message.

    Raises
    ------
    ValueError
        If the ``EmptyGranule`` field of the decoded file header is present
        and differs from ``"NOT_EMPTY"``.

    """
    # The decoded header is queried with ``.get``; a missing ``EmptyGranule``
    # field is treated as a non-empty granule.
    file_header = decode_string(dt.attrs["FileHeader"])
    if file_header.get("EmptyGranule", "NOT_EMPTY") == "NOT_EMPTY":
        return
    raise ValueError(f"{filepath} is an EMPTY granule !")
def check_valid_granule(filepath):
    """Diagnose why a GPM granule cannot be read and raise an explanatory error.

    Parameters
    ----------
    filepath : str
        Path of the granule to check.

    Raises
    ------
    FileNotFoundError
        If ``filepath`` does not exist.
    ValueError
        If the granule is empty, or if opening its root group fails
        (the message is built by ``_identify_error``).

    """
    # A missing file is reported first
    file_exists = os.path.exists(filepath)
    if not file_exists:
        raise FileNotFoundError(f"The filepath {filepath} does not exist.")

    # Try to open the root group only, to find out why the file is unreadable
    try:
        with xr.open_dataset(filepath, engine="netcdf4", group="") as root_ds:
            check_non_empty_granule(root_ds, filepath)
    except Exception as err:
        # The empty-granule error is already explanatory: propagate it unchanged
        empty_granule = "an EMPTY granule" in str(err)
        if empty_granule:
            raise err
        _identify_error(err, filepath)
def _identify_error(e, filepath):
    """Translate an error raised when opening an HDF file into an explanatory ``ValueError``.

    The text of ``e`` is matched against known failure signatures. This function
    always raises.

    Parameters
    ----------
    e : Exception
        The error raised while opening the file.
    filepath : str
        Path of the file that could not be opened.

    Raises
    ------
    ValueError
        Always. The message depends on the signature found in ``str(e)``;
        ``e`` is attached as ``__cause__`` when it is an exception instance.

    Notes
    -----
    When the HDF error signature is found and the ``remove_corrupted_files``
    configuration option is truthy, the file is deleted from disk before raising.

    """
    error_str = str(e)
    if "[Errno -101] NetCDF: HDF error" in error_str:
        info = ""
        if gpm.config.get("remove_corrupted_files"):  # default False
            info = " and is being removed"
            os.remove(filepath)
        msg = f"The file {filepath} is corrupted{info}. It must be redownload."
    elif "[Errno -51] NetCDF: Unknown file format" in error_str:
        msg = f"The GPM-API is not currently able to read the file format of {filepath}. Report the issue please."
    elif "lock" in error_str:
        msg = (
            "Unfortunately, HDF locking is occurring. "
            "Export the environment variable HDF5_USE_FILE_LOCKING = 'FALSE' "
            "into your environment (i.e. in the .bashrc).\n"
            f"The error is: '{error_str}'."
        )
    else:
        # Name the file explicitly: the message announces "the following file"
        msg = f"The following file is corrupted: {filepath}. Error is {e}. Redownload the file."
    # ``raise ... from`` only accepts an exception (or None) as cause
    cause = e if isinstance(e, BaseException) else None
    raise ValueError(msg) from cause