# Source code for gpm.io.local

# -----------------------------------------------------------------------------.
# MIT License

# Copyright (c) 2024 GPM-API developers
#
# This file is part of GPM-API.

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# -----------------------------------------------------------------------------.
"""This module contains functions defining where to download GPM data on the local machine."""
import os
import re

from gpm.configs import get_base_dir
from gpm.io.checks import check_base_dir
from gpm.io.products import get_product_category
from gpm.utils.directories import search_leaf_files

####--------------------------------------------------------------------------.
#####################
#### Directories ####
#####################


def get_time_tree(date):
    """Build the relative ``<YYYY>/<MM>/<DD>`` directory tree of a date.

    Parameters
    ----------
    date : datetime.date
        Date to convert. Only its ``strftime`` method is used.

    Returns
    -------
    str
        Year, month and day components joined with the OS path separator.

    """
    components = [date.strftime(code) for code in ("%Y", "%m", "%d")]
    return os.path.join(*components)
def _get_local_dir_pattern(product, product_type, version):
    """Build the relative base directory pattern of a product on the local disk.

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    version : int
        GPM version of the data. Only used when ``product_type`` is not ``"NRT"``.

    Returns
    -------
    pattern : str
        Relative directory pattern:

        - If ``product_type == "NRT"``: ``GPM/NRT/<product_category>/<product>``
        - Otherwise (``RS``): ``GPM/RS/V0<version>/<product_category>/<product>``

        Valid `product_category` are ``RADAR``, ``PMW``, ``CMB``, ``IMERG``.

    """
    category = get_product_category(product)
    if product_type == "NRT":
        # NRT data are not organized by version.
        components = ["GPM", product_type, category, product]
    else:  # product_type == "RS"
        components = ["GPM", product_type, f"V0{int(version)}", category, product]
    return os.path.join(*components)


def _get_local_product_base_directory(base_dir, product, product_type, version):
    """Return the local directory under which all the files of a product are stored.

    Parameters
    ----------
    base_dir : str
        The base directory where to store GPM data. It is passed through
        ``check_base_dir`` before being used.
    product : str
        GPM product name. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.

    Returns
    -------
    product_dir : str
        Checked base directory joined with the product directory pattern.

    """
    return os.path.join(
        check_base_dir(base_dir),
        _get_local_dir_pattern(product, product_type, version),
    )


def _get_local_directory_tree(product, product_type, date, version):
    """Return the relative local directory tree of a product for a given date.

    The directory tree structure for ``product_type``:

    - ``RS`` is ``GPM/RS/V0<version>/<product_category>/<product>/<YYYY>/<MM>/<DD>``
    - ``NRT`` is ``GPM/NRT/<product_category>/<product>/<YYYY>/<MM>/<DD>``

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve.

    Returns
    -------
    directory_tree : str
        Directory tree, relative to the local base directory, where the
        data of the given date are stored.

    """
    # Product pattern first, then the <YYYY>/<MM>/<DD> time tree.
    return os.path.join(
        _get_local_dir_pattern(product, product_type, version),
        get_time_tree(date),
    )
def get_local_product_directory(base_dir, product, product_type, version, date):
    """Return the local directory where the daily GPM data of a product are stored or saved.

    Parameters
    ----------
    base_dir : str
        The base directory where to store GPM data. It is used as given.
    product : str
        GPM product name. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.
    date : datetime.date
        Single date for which to retrieve the data.

    Returns
    -------
    product_dir_path : str
        Directory path where daily GPM data are located.

        - If ``product_type == "RS"``:
          ``<base_dir>/GPM/RS/V0<version>/<product_category>/<product>/<YYYY>/<MM>/<DD>``
        - If ``product_type == "NRT"``:
          ``<base_dir>/GPM/NRT/<product_category>/<product>/<YYYY>/<MM>/<DD>``

        Valid `product_category` are ``RADAR``, ``PMW``, ``CMB``, ``IMERG``.

    """
    relative_tree = _get_local_directory_tree(product, product_type, date, version)
    return os.path.join(base_dir, relative_tree)
####--------------------------------------------------------------------------.
############################
#### Filepath retrieval ####
############################
def get_local_daily_filepaths(product, product_type, date, version, base_dir=None):
    """Retrieve GPM data filepaths on the local disk directory of a specific day and product.

    Parameters
    ----------
    product : str
        GPM product acronym. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.
    base_dir : str, optional
        The base directory where GPM data are stored. The value is forwarded
        to ``get_base_dir`` and then to ``check_base_dir``.
        The default is ``None``.

    Returns
    -------
    filepaths : list of str
        Paths of the entries found in the daily product directory, sorted by
        name. An empty list is returned if the directory does not exist, is
        not a directory, or is empty.

    """
    # Retrieve the local GPM base directory
    base_dir = get_base_dir(base_dir=base_dir)
    base_dir = check_base_dir(base_dir)

    # Retrieve the directory on disk where the data are stored
    dir_path = get_local_product_directory(
        base_dir=base_dir,
        product=product,
        product_type=product_type,
        date=date,
        version=version,
    )

    # Use isdir rather than exists: a path that exists but is a regular file
    # would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(dir_path):
        return []

    # Retrieve the sorted filepaths (empty list if the directory is empty)
    return [os.path.join(dir_path, filename) for filename in sorted(os.listdir(dir_path))]
def define_local_filepath(product, product_type, date, version, filename, base_dir=None):
    """Define the local path of a GPM file.

    This function is called by get_filepath_from_filename(filename, storage, product_type).

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    date : datetime.date
        Date used to build the ``<YYYY>/<MM>/<DD>`` part of the path.
    version : int
        GPM version of the data if ``product_type = "RS"``.
    filename : str
        Name of the file, appended to the daily product directory.
    base_dir : str, optional
        Base directory, forwarded to ``get_base_dir`` and ``check_base_dir``.
        The default is ``None``.

    Returns
    -------
    str
        Daily product directory joined with ``filename``.

    """
    # Resolve and check the local GPM base directory
    resolved_base_dir = check_base_dir(get_base_dir(base_dir=base_dir))

    # Daily directory of the product on disk
    daily_dir = get_local_product_directory(
        base_dir=resolved_base_dir,
        product=product,
        product_type=product_type,
        version=version,
        date=date,
    )
    return os.path.join(daily_dir, filename)
def get_local_dir_tree_from_filename(filepath, product_type="RS", base_dir=None):
    """Return the local daily product directory corresponding to a GPM filename or filepath.

    Parameters
    ----------
    filepath : str
        GPM filename or filepath, parsed with ``gpm.io.info.get_info_from_filepath``.
    product_type : str, optional
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
        The default is ``"RS"``.
    base_dir : str, optional
        Base directory, forwarded to ``get_base_dir`` and ``check_base_dir``.
        The default is ``None``.

    Returns
    -------
    str
        Directory path built by ``get_local_product_directory`` from the
        product, version and start date found in the file information.

    """
    from gpm.io.info import get_info_from_filepath

    resolved_base_dir = check_base_dir(get_base_dir(base_dir=base_dir))

    # Parse the file name; the version number is the first run of digits
    # of the "version" entry, the date is taken from "start_time".
    file_info = get_info_from_filepath(filepath)
    return get_local_product_directory(
        base_dir=resolved_base_dir,
        product=file_info["product"],
        product_type=product_type,
        version=int(re.findall("\\d+", file_info["version"])[0]),
        date=file_info["start_time"].date(),
    )
def get_local_filepath_from_filename(filepath, product_type="RS", base_dir=None):
    """Return the local filepath corresponding to a GPM filename or filepath.

    Parameters
    ----------
    filepath : str
        GPM filename or filepath. Only its last path component is kept as
        file name.
    product_type : str, optional
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
        The default is ``"RS"``.
    base_dir : str, optional
        Base directory, forwarded to ``get_local_dir_tree_from_filename``.
        The default is ``None``.

    Returns
    -------
    str
        Local daily product directory joined with the file name.

    """
    _, filename = os.path.split(filepath)
    local_dir = get_local_dir_tree_from_filename(
        filepath,
        product_type=product_type,
        base_dir=base_dir,
    )
    return os.path.join(local_dir, filename)
####--------------------------------------------------------------------------.
#################
#### Utility ####
#################
def get_local_filepaths(product, version=7, product_type="RS", base_dir=None, groups=None):
    """Retrieve all GPM filepaths on the local disk directory for a specific product.

    Parameters
    ----------
    product : str
        GPM product acronym. See ``gpm.available_products()``.
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.
        The default is version ``7``.
    product_type : str, optional
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    base_dir : str, optional
        Base directory, forwarded to ``get_base_dir`` and ``check_base_dir``.
        The default is ``None``.
    groups: list or str, optional
        Whether to group the filepaths in a dictionary by a custom selection of keys.
        Valid group keys are `product_level`, `satellite`, `sensor`, `algorithm`,
        `start_time`, `end_time`, `granule_id`, `version`, `product_type`, `product`,
        `data_format`, `year`, `month`, `day`, `doy`, `dow`, `hour`, `minute`,
        `second`, `month_name`, `quarter`, `season`.
        The time components are extracted from `start_time` !
        If groups is ``None`` returns the filepaths list.
        The default is ``None``.

    Returns
    -------
    list or dict
        Result of ``gpm.io.info.group_filepaths`` applied to the sorted
        filepaths. An empty list is returned if the product directory does
        not exist.

    """
    from gpm.io.info import group_filepaths

    # Resolve and check the local GPM base directory
    resolved_base_dir = check_base_dir(get_base_dir(base_dir=base_dir))

    # Directory holding all the files of the product
    product_dir = _get_local_product_base_directory(
        base_dir=resolved_base_dir,
        product=product,
        product_type=product_type,
        version=version,
    )

    # NOTE(review): an empty list is returned here even when ``groups`` is
    # specified - confirm callers do not expect a dictionary in that case.
    if not os.path.exists(product_dir):
        return []

    # Search, sort and (optionally) group the filepaths
    sorted_filepaths = sorted(search_leaf_files(base_dir=product_dir, parallel=True))
    return group_filepaths(sorted_filepaths, groups=groups)