# -----------------------------------------------------------------------------.
# MIT License
# Copyright (c) 2024 GPM-API developers
#
# This file is part of GPM-API.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# -----------------------------------------------------------------------------.
"""This module contains the routines required to search data on the NASA PPS servers."""
import datetime
import subprocess
from dateutil.relativedelta import relativedelta
from gpm.configs import get_password_pps, get_username_pps
from gpm.io.checks import (
check_product_type,
check_product_validity,
check_product_version,
)
from gpm.io.products import available_products, get_product_info
####--------------------------------------------------------------------------.
#####################
#### Directories ####
#####################
def _get_pps_text_server(product_type):
"""Return the url to the PPS text servers."""
if product_type == "NRT":
url_text_server = "https://jsimpsonhttps.pps.eosdis.nasa.gov/text"
else:
url_text_server = "https://arthurhouhttps.pps.eosdis.nasa.gov/text"
return url_text_server
def _get_pps_data_server(product_type):
"""Return the url to the PPS data servers."""
if product_type == "NRT":
url_data_server = "ftps://jsimpsonftps.pps.eosdis.nasa.gov/data"
else:
url_data_server = "ftps://arthurhouftps.pps.eosdis.nasa.gov"
return url_data_server
def _get_pps_nrt_product_folder_name(product):
    """Return the ``pps_nrt_dir`` entry of the product info.

    Raises
    ------
    ValueError
        If the product info has no ``pps_nrt_dir`` entry (or it is ``None``).
    """
    product_info = get_product_info(product)
    nrt_dir = product_info.get("pps_nrt_dir", None)
    if nrt_dir is not None:
        return nrt_dir
    raise ValueError(
        f"The pps_nrt_dir key of the {product} product is not specified in the config files.",
    )
def _get_pps_rs_product_folder_name(product):
    """Return the ``pps_rs_dir`` entry of the product info.

    Raises
    ------
    ValueError
        If the product info has no ``pps_rs_dir`` entry (or it is ``None``).
    """
    product_info = get_product_info(product)
    rs_dir = product_info.get("pps_rs_dir", None)
    if rs_dir is not None:
        return rs_dir
    raise ValueError(
        f"The pps_rs_dir key of the {product} product is not specified in the config files.",
    )
def _get_pps_nrt_product_dir(product, date):
    """Build the PPS server directory tree in which NRT data of a product are stored.

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    date : datetime.date
        Single date for which to retrieve the data.
        It is only used for NRT IMERG products, whose directory gets a ``YYYYMM`` sub-folder.

    Returns
    -------
    str
        ``<pps_nrt_dir>/YYYYMM`` for NRT IMERG products, ``<pps_nrt_dir>`` otherwise.
    """
    nrt_dir = _get_pps_nrt_product_folder_name(product)
    imerg_nrt_products = available_products(product_types="NRT", product_categories="IMERG")
    # Only NRT IMERG products are organized in monthly sub-folders
    if product not in imerg_nrt_products:
        return nrt_dir
    year_month = datetime.datetime.strftime(date, "%Y%m")
    return f"{nrt_dir}/{year_month}"
def _get_pps_rs_product_dir(product, date, version):
    """Build the PPS server directory tree in which RS data of a product are stored.

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve.

    Returns
    -------
    str
        ``gpmdata/YYYY/MM/DD/<pps_rs_dir>`` for version 7,
        ``gpmallversions/V0<version>/YYYY/MM/DD/<pps_rs_dir>`` for any other version.
    """
    # Validate the inputs (version first, then product)
    version = check_product_version(version, product)
    product = check_product_validity(product, product_type="RS")
    rs_dir = _get_pps_rs_product_folder_name(product)
    # Version 7 lives under 'gpmdata'; other versions under 'gpmallversions/V0<version>'
    if version == 7:
        root_parts = ["gpmdata"]
    else:
        root_parts = ["gpmallversions", "V0" + str(int(version))]
    day_path = datetime.datetime.strftime(date, "%Y/%m/%d")
    return "/".join([*root_parts, day_path, rs_dir])
def _get_pps_directory_tree(product, product_type, date, version):
    """Retrieve the NASA PPS server directory tree where the GPM data are stored.

    The directory tree structure for ``product_type="RS"`` is:

    - Version 7: ``gpmdata/YYYY/MM/DD/<pps_rs_dir>``
    - Other versions: ``gpmallversions/V0<version>/YYYY/MM/DD/<pps_rs_dir>``
    - The L3 monthly products are saved in the ``YYYY/MM/01`` directory

    The directory tree structure for ``product_type="NRT"`` is:

    - NRT IMERG products: ``<pps_nrt_dir>/YYYYMM``
    - Otherwise ``<pps_nrt_dir>``

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.
        It is not used for ``product_type = "NRT"``.

    Returns
    -------
    directory_tree : str
        Directory tree on the NASA PPS server where the data are stored.

    """
    product_type = check_product_type(product_type)
    if product_type == "NRT":
        return _get_pps_nrt_product_dir(product, date)
    # product_type == "RS"
    return _get_pps_rs_product_dir(product, date, version)
[docs]
def get_pps_product_directory(product, product_type, date, version, server_type):
    """Retrieve the NASA PPS server product directory path at specific date.

    The text server (https) is used to list the available data.
    The data server (ftps) is used to retrieve the data.

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.
    server_type : str
        Either ``text`` or ``data``.
        Any value other than ``text`` selects the data server.

    Returns
    -------
    url_product_dir : str
        URL of the product directory on the selected NASA PPS server.

    """
    # Select the server base URL
    if server_type == "text":
        server_url = _get_pps_text_server(product_type)
    else:
        server_url = _get_pps_data_server(product_type)
    # Build the directory tree below the server root
    directory_tree = _get_pps_directory_tree(
        product=product,
        product_type=product_type,
        date=date,
        version=version,
    )
    return f"{server_url}/{directory_tree}"
####--------------------------------------------------------------------------.
############################
#### Filepath retrieval ####
############################
def _try_get_pps_file_list(url_product_dir):
    """Query a PPS product directory with ``curl`` and return the listed entries.

    Parameters
    ----------
    url_product_dir : str
        URL of the PPS product directory to list.
        A trailing ``/`` is appended if missing.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the server response.

    Raises
    ------
    ValueError
        If ``curl`` returns an empty output (server considered unavailable)
        or if the output starts with ``<`` (no data found).

    """
    # Retrieve GPM-API configs
    username = get_username_pps()
    password = get_password_pps()
    # Ensure the directory url ends with "/"
    if not url_product_dir.endswith("/"):
        url_product_dir = url_product_dir + "/"
    # Define curl command
    # - The argument list is built directly (instead of splitting a command string)
    #   so that credentials containing whitespace are passed as a single argument.
    # - -k is required with curl > 7.71 otherwise results in "unauthorized access".
    # NOTE(review): -k disables TLS certificate verification.
    args = ["curl", "-k", "--user", f"{username}:{password}", url_product_dir]
    # Run command and wait for completion (stderr is captured and ignored)
    process = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
    stdout = process.stdout.decode()
    # Check if server is available
    if stdout == "":
        raise ValueError("The PPS server is currently unavailable. Sorry for the inconvenience.")
    # Check if data are available
    # - A response starting with "<" is presumably a markup error page rather than a file list
    if stdout.startswith("<"):
        raise ValueError("No data found on PPS.")
    # Retrieve filepaths
    return stdout.split()
def _get_pps_file_list(url_product_dir, product, date, version, verbose=True):
    """Retrieve the filepaths of the files available on the NASA PPS server for a specific day.

    The query is done using https !

    The function does not return the full PPS server url, but the filepath
    from the server root: i.e: ``'/gpmdata/2020/07/05/radar/<...>.HDF5'``

    The returned filepaths can include more than one product !!!

    Parameters
    ----------
    url_product_dir : str
        The PPS product directory url.
    product : str
        GPM product acronym. See ``gpm.available_products()``.
        Only used in the message printed when no data are found.
    date : datetime
        Single date for which to retrieve the data.
        Only used in the message printed when no data are found.
    version : int
        GPM version of the data.
        Only used in the message printed when no data are found.
    verbose : bool, optional
        Whether to print a message when data are not available for a specific date.
        The default is ``True``.

    Returns
    -------
    list of str
        The filepaths listed by the server. An empty list if no data are found.

    Raises
    ------
    ValueError
        If the PPS server is unavailable, or if any other error occurs during the query.

    """
    try:
        return _try_get_pps_file_list(url_product_dir)
    except Exception as e:
        error_msg = str(e)
        # If the server is unavailable, propagate the original error (traceback preserved)
        if "The PPS server is currently unavailable." in error_msg:
            raise
        # Any other unexpected failure is wrapped, keeping the original error as the cause
        if "No data found on PPS." not in error_msg:
            raise ValueError(f"Undefined error. The error is {e}.") from e
        # If no filepath (empty directory), print message if verbose=True
        if verbose:
            version_str = str(int(version))
            msg = f"No data found on PPS on date {date} for product {product} (V0{version_str})"
            print(msg)
        return []
[docs]
def get_pps_daily_filepaths(product, product_type, date, version, verbose=True):
    """Retrieve the complete url to the files available on the NASA PPS server for a specific day and product.

    Parameters
    ----------
    product : str
        GPM product acronym. See ``gpm.available_products()``.
    product_type : str
        GPM product type. Either ``RS`` (Research) or ``NRT`` (Near-Real-Time).
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.
    verbose : bool, optional
        Whether to specify when data are not available for a specific date.
        The default is ``True``.

    Returns
    -------
    list of str
        The data-server urls of the listed files. Empty if no file is listed.

    """
    # The file listing is queried on the text server
    text_dir_url = get_pps_product_directory(
        product=product,
        product_type=product_type,
        date=date,
        version=version,
        server_type="text",
    )
    # Filepaths are relative to the server root and start with "/"
    # - Example: '/gpmdata/2020/07/05/radar/<...>.HDF5'
    # - The list is empty if nothing is found
    server_filepaths = _get_pps_file_list(
        url_product_dir=text_dir_url,
        product=product,
        date=date,
        version=version,
        verbose=verbose,
    )
    # Prefix each filepath with the data server url
    data_server_url = _get_pps_data_server(product_type)
    return [f"{data_server_url}{server_filepath}" for server_filepath in server_filepaths]
[docs]
def define_pps_filepath(product, product_type, date, version, filename):
    """Define the PPS data-server filepath of a file from its filename.

    The filename is appended to the product directory url of the data server.

    This function is called by ``get_filepath_from_filename(filename, storage, product_type)``.
    """
    data_dir_url = get_pps_product_directory(
        product=product,
        product_type=product_type,
        date=date,
        version=version,
        server_type="data",
    )
    return f"{data_dir_url}/{filename}"
####--------------------------------------------------------------------------.
#################
#### Utility ####
#################
[docs]
def find_first_pps_granule_filepath(product: str, product_type: str, version: int) -> str:
    """Return the PPS filepath of the first available granule.

    The search covers the day following the product ``start_time`` reported
    by the product info, and the first filepath in sorted order is returned.

    Raises
    ------
    ValueError
        If the product info has no ``start_time`` or if no file is found.
    """
    # Imported here rather than at module level (presumably to avoid a circular import)
    from gpm.io.find import find_filepaths

    # Retrieve product start_time from product.yaml file.
    product_start_time = get_product_info(product).get("start_time", None)
    if product_start_time is None:
        raise ValueError(f"{product} product start_time is not provided in the product.yaml file.")

    # Search the files within one day from the product start_time
    search_end_time = product_start_time + relativedelta(days=1)
    candidate_filepaths = find_filepaths(
        storage="PPS",
        product=product,
        start_time=product_start_time,
        end_time=search_end_time,
        version=version,
        product_type=product_type,
    )
    if len(candidate_filepaths) == 0:
        raise ValueError(f"No PPS files found for {product} product around {product_start_time}.")
    return sorted(candidate_filepaths)[0]